RTL code refactoring

This commit is contained in:
Blaise Tine
2020-04-19 03:38:00 -04:00
parent 460aabf6b1
commit 9b476f1e17
97 changed files with 3127 additions and 18563 deletions

View File

@@ -23,6 +23,8 @@ SRCS = vortex.cpp ../common/vx_utils.cpp ../../hw/simulate/simulator.cpp
RTL_INCLUDE = -I../../hw/rtl -I../../hw/rtl/interfaces -I../../hw/rtl/cache -I../../hw/rtl/generic_cache -I../../hw/rtl/shared_memory -I../../hw/rtl/pipe_regs -I../../hw/rtl/compat
VL_FLAGS += --assert -Wall -Wpedantic
# Enable Verilator multithreaded simulation
#THREADS ?= $(shell python3 -c 'import multiprocessing as mp; print(max(1, mp.cpu_count() // 2))')
#VL_FLAGS += --threads $(THREADS)

View File

@@ -10,6 +10,8 @@ EXE += --exe ./simulate/testbench.cpp ./simulate/simulator.cpp
VF += -compiler gcc --language 1800-2009
VF += --assert -Wall -Wpedantic
# LIB=-LDFLAGS '-L/usr/local/systemc/'
LIB +=

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -1,4 +1,3 @@
`include "../VX_define.vh"
//`define NUM_BANKS 8
@@ -118,7 +117,7 @@ reg[31:0] io_data;
.i_m_readdata_i (i_m_readdata_i),
.i_m_ready_i (i_m_ready_i),
.out_ebreak (out_ebreak)
);
);
always @(negedge clk) begin
ibus_driver(clk, o_m_read_addr_i, o_m_evict_addr_i, o_m_valid_i, o_m_writedata_i, o_m_read_or_write_i, `ICACHE_BANKS, `ICACHE_NUM_WORDS_PER_BLOCK, i_m_readdata_i, i_m_ready_i);
@@ -138,14 +137,13 @@ reg[31:0] io_data;
cycle_num = cycle_num + 1;
end
// Testbench clock/reset driver. Two sensitivity-list headers appear here: the
// first also triggers on posedge reset, the second only on changes of clk.
// While reset is high the block clears both reset and clk; on every pass it
// then schedules an inversion of clk after a delay of 5 time units.
always @(clk, posedge reset) begin
always @(clk) begin
if (reset) begin
reset = 0;
clk = 0;
end
#5 clk <= ~clk;
end
endmodule

View File

@@ -92,7 +92,7 @@ vortex_afu.json
../rtl/interfaces/VX_gpu_dcache_dram_req_inter.v
../rtl/interfaces/VX_csr_req_inter.v
../rtl/interfaces/VX_icache_request_inter.v
../rtl/interfaces/VX_gpu_dcache_res_inter.v
../rtl/interfaces/VX_gpu_dcache_rsp_inter.v
../rtl/interfaces/VX_frE_to_bckE_req_inter.v
../rtl/interfaces/VX_dram_req_rsp_inter.v
../rtl/interfaces/VX_dcache_request_inter.v
@@ -113,7 +113,7 @@ vortex_afu.json
../rtl/interfaces/VX_jal_response_inter.v
../rtl/interfaces/VX_warp_ctl_inter.v
../rtl/interfaces/VX_gpu_dcache_snp_req_inter.v
../rtl/interfaces/VX_gpu_dcache_dram_res_inter.v
../rtl/interfaces/VX_gpu_dcache_dram_rsp_inter.v
../rtl/interfaces/VX_inst_mem_wb_inter.v
ccip_interface_reg.sv

View File

@@ -70,16 +70,16 @@ logic vx_dram_req_read;
logic vx_dram_req_write;
logic [31:0] vx_dram_req_addr;
logic [31:0] vx_dram_req_data[15:0];
logic vx_dram_req_delay;
logic vx_dram_req_full;
logic vx_dram_fill_accept;
logic vx_dram_fill_rsp;
logic [31:0] vx_dram_fill_rsp_addr;
logic [31:0] vx_dram_fill_rsp_data[15:0];
logic vx_dram_rsp_ready;
logic vx_dram_rsp_valid;
logic [31:0] vx_dram_rsp_addr;
logic [31:0] vx_dram_rsp_data[15:0];
logic vx_snp_req;
logic [31:0] vx_snp_req_addr;
logic vx_snp_req_delay;
logic vx_snp_req_full;
logic vx_ebreak;
@@ -316,7 +316,7 @@ begin
STATE_RUN, STATE_CLFLUSH: begin
if (vx_dram_req_read
&& !vx_dram_req_delay)
&& !vx_dram_req_full)
begin
avs_address <= (vx_dram_req_addr >> 6);
avs_read <= 1;
@@ -324,7 +324,7 @@ begin
end
if (vx_dram_req_write
&& !vx_dram_req_delay)
&& !vx_dram_req_full)
begin
avs_writedata <= {>>{vx_dram_req_data}};
avs_address <= (vx_dram_req_addr >> 6);
@@ -348,16 +348,16 @@ logic vortex_enabled;
// Derives the Vortex run gate and the DRAM request back-pressure flag.
always_comb
begin
// Vortex counts as enabled only in the RUN and CLFLUSH states.
vortex_enabled = (STATE_RUN == state) || (STATE_CLFLUSH == state);
// Requests are held off when Vortex is disabled, the AVS bus asserts
// waitrequest, or either AVS read queue (raq / rdq) is full.
// vx_dram_req_delay and vx_dram_req_full are driven by the same expression.
vx_dram_req_delay = !vortex_enabled || avs_waitrequest || avs_raq_full || avs_rdq_full;
vx_dram_req_full = !vortex_enabled || avs_waitrequest || avs_raq_full || avs_rdq_full;
end
// Vortex DRAM fill response
// Drives the DRAM response presented to Vortex from the heads of the AVS
// read-address queue (avs_raq_dout) and read-data queue (avs_rdq_dout).
// The same logic appears under two signal-name sets: vx_dram_fill_* and
// vx_dram_rsp_*.
always_comb
begin
// A response is signalled only while Vortex is enabled, read data is queued,
// and the accept/ready input from Vortex is high.
vx_dram_fill_rsp = vortex_enabled && !avs_rdq_empty && vx_dram_fill_accept;
// The queued address is shifted left by 6; the request path stores
// (vx_dram_req_addr >> 6) in avs_address, which feeds this queue.
vx_dram_fill_rsp_addr = (avs_raq_dout << 6);
// Streaming assignment spreads the queued data over the 32-bit word array.
{>>{vx_dram_fill_rsp_data}} = avs_rdq_dout;
vx_dram_rsp_valid = vortex_enabled && !avs_rdq_empty && vx_dram_rsp_ready;
vx_dram_rsp_addr = (avs_raq_dout << 6);
{>>{vx_dram_rsp_data}} = avs_rdq_dout;
end
// AVS address read request queue /////////////////////////////////////////////
@@ -366,7 +366,7 @@ logic cci_write_req;
// Push/pop control for the AVS read-address queue (raq).
always_comb
begin
// Pop an address when a Vortex DRAM response is signalled or a CCI write
// request is raised. Both assignments target avs_raq_pop; they differ only
// in the name of the Vortex response signal.
avs_raq_pop = vx_dram_fill_rsp || cci_write_req;
avs_raq_pop = vx_dram_rsp_valid || cci_write_req;
// Push the current AVS address whenever avs_read is asserted.
avs_raq_din = avs_address;
avs_raq_push = avs_read;
end
@@ -531,7 +531,7 @@ begin
if ((STATE_CLFLUSH == state)
&& vx_snoop_ctr < csr_data_size
&& !vx_snp_req_delay)
&& !vx_snp_req_full)
begin
vx_snp_req_addr <= (csr_mem_addr + vx_snoop_ctr) << 6;
vx_snp_req <= 1;
@@ -548,29 +548,29 @@ end
// Vortex binding /////////////////////////////////////////////////////////////
// Instantiates the Vortex socket and binds it to the AFU-side vx_* signals.
// Connections appear under two port-naming schemes: an `out_dram_*` /
// `*_delay` set and a `dram_*` / `*_full` set (see the duplicated clk/reset).
// The socket is reset by either SoftReset or vx_reset.
// NOTE(review): in the second scheme the response ports mix prefixed and
// unprefixed names (`out_dram_rsp_ready`, `dram_rsp_valid`,
// `out_dram_rsp_addr`, `out_dram_rsp_data`) while the request ports drop the
// `out_` prefix entirely - confirm against the Vortex_Socket port list.
Vortex_Socket #() vx_socket (
.clk (clk),
.reset (SoftReset || vx_reset),
.clk (clk),
.reset (SoftReset || vx_reset),
// DRAM Req
.out_dram_req_write (vx_dram_req_write),
.out_dram_req_read (vx_dram_req_read),
.out_dram_req_addr (vx_dram_req_addr),
.out_dram_req_data (vx_dram_req_data),
.out_dram_req_delay (vx_dram_req_delay),
.dram_req_write (vx_dram_req_write),
.dram_req_read (vx_dram_req_read),
.dram_req_addr (vx_dram_req_addr),
.dram_req_data (vx_dram_req_data),
.dram_req_full (vx_dram_req_full),
// DRAM Rsp
.out_dram_fill_accept (vx_dram_fill_accept),
.out_dram_fill_rsp (vx_dram_fill_rsp),
.out_dram_fill_rsp_addr (vx_dram_fill_rsp_addr),
.out_dram_fill_rsp_data (vx_dram_fill_rsp_data),
.out_dram_rsp_ready (vx_dram_rsp_ready),
.dram_rsp_valid (vx_dram_rsp_valid),
.out_dram_rsp_addr (vx_dram_rsp_addr),
.out_dram_rsp_data (vx_dram_rsp_data),
// Cache Snooping Req
.llc_snp_req (vx_snp_req),
.llc_snp_req_addr (vx_snp_req_addr),
.llc_snp_req_delay (vx_snp_req_delay),
.llc_snp_req_valid (vx_snp_req),
.llc_snp_req_addr (vx_snp_req_addr),
.llc_snp_req_full (vx_snp_req_full),
// program exit signal
.out_ebreak (vx_ebreak)
.out_ebreak (vx_ebreak)
);
endmodule

View File

@@ -1,69 +0,0 @@
onerror {resume}
quietly WaveActivateNextPane {} 0
add wave -noupdate -label clk /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/clk
add wave -noupdate -label reset /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/SoftReset
add wave -noupdate -label state /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/state
add wave -noupdate -label cci_write_pending /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/cci_write_pending
add wave -noupdate -label cci_write_ctr -radix decimal -radixshowbase 0 /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/cci_write_ctr
add wave -noupdate -label csr_data_size -radix decimal -radixshowbase 0 /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/csr_data_size
add wave -noupdate -label avs_read_ctr -radix decimal -radixshowbase 0 /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/avs_read_ctr
add wave -noupdate -label avs_waitrequest /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/avs_waitrequest
add wave -noupdate -label avs_address -radix hexadecimal -radixshowbase 0 /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/avs_address
add wave -noupdate -label avs_readdata -radix hexadecimal /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/avs_readdata
add wave -noupdate -label avs_writedata -radix hexadecimal /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/avs_writedata
add wave -noupdate -label avs_write /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/avs_write
add wave -noupdate -label avs_read /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/avs_read
add wave -noupdate -label avs_readdatavalid /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/avs_readdatavalid
add wave -noupdate -label sRx.c0.rspValid /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/cp2af_sRxPort.c0.rspValid
add wave -noupdate -label sRx.c1.rspValid /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/cp2af_sRxPort.c1.rspValid
add wave -noupdate -label sTx.c0.valid /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/af2cp_sTxPort.c0.valid
add wave -noupdate -label sTx.c1.valid /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/af2cp_sTxPort.c1.valid
add wave -noupdate -label cci_write_req /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/cci_write_req
add wave -noupdate -label avs_raq_push /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/avs_raq_push
add wave -noupdate -label avs_rdq_push /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/avs_rdq_push
add wave -noupdate -label avs_raq_pop /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/avs_raq_pop
add wave -noupdate -label avs_rdq_pop /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/avs_rdq_pop
add wave -noupdate -label avs_raq_full /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/avs_raq_full
add wave -noupdate -label avs_rdq_full /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/avs_rdq_full
add wave -noupdate -label avs_raq_empty /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/avs_raq_empty
add wave -noupdate -label avs_rdq_empty /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/avs_rdq_empty
add wave -noupdate -label vortex_enabled /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/vortex_enabled
add wave -noupdate -label vx_reset /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/vx_soc/reset
add wave -noupdate -label vx_dram_req_read /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/vx_dram_req_read
add wave -noupdate -label vx_dram_req_write /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/vx_dram_req_write
add wave -noupdate -label vx_dram_req_delay /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/vx_dram_req_delay
add wave -noupdate -label vx_dram_req_addr -radix hexadecimal /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/vx_dram_req_addr
add wave -noupdate -label vx_draw_req_data -radix hexadecimal /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/vx_dram_req_data
add wave -noupdate -label out_dram_fill_rsp /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/vx_soc/out_dram_fill_rsp
add wave -noupdate -label out_dram_fill_accept /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/vx_soc/out_dram_fill_accept
add wave -noupdate -label vx_draw_fill_rsp_data -radix hexadecimal /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/vx_dram_fill_rsp_data
add wave -noupdate -label vx_dram_fill_rsp_addr -radix hexadecimal /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/vx_dram_fill_rsp_addr
add wave -noupdate -label llc_snp_req /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/vx_soc/llc_snp_req
add wave -noupdate -label llc_snp_req_delay /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/vx_soc/llc_snp_req_delay
add wave -noupdate -label out_break /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/vx_soc/out_ebreak
add wave -noupdate -label warp_pc -radix hexadecimal -radixshowbase 0 {/ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/vx_soc/genblk1/Vortex_Cluster/genblk1[0]/vortex_core/vx_front_end/vx_fetch/warp_pc}
add wave -noupdate -label scheduled_warp {/ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/vx_soc/genblk1/Vortex_Cluster/genblk1[0]/vortex_core/vx_front_end/vx_fetch/scheduled_warp}
add wave -noupdate -label thread_mask {/ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/vx_soc/genblk1/Vortex_Cluster/genblk1[0]/vortex_core/vx_front_end/vx_fetch/thread_mask}
add wave -noupdate -label warp_num {/ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/vx_soc/genblk1/Vortex_Cluster/genblk1[0]/vortex_core/vx_front_end/vx_fetch/warp_num}
add wave -noupdate -label warp_active {/ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/vx_soc/genblk1/Vortex_Cluster/genblk1[0]/vortex_core/vx_front_end/vx_fetch/warp_scheduler/warp_active}
add wave -noupdate -label warp_stalled {/ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/vx_soc/genblk1/Vortex_Cluster/genblk1[0]/vortex_core/vx_front_end/vx_fetch/warp_scheduler/warp_stalled}
add wave -noupdate -label warp_lock {/ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/vx_soc/genblk1/Vortex_Cluster/genblk1[0]/vortex_core/vx_front_end/vx_fetch/warp_scheduler/warp_lock}
add wave -noupdate -label use_active {/ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/vx_soc/genblk1/Vortex_Cluster/genblk1[0]/vortex_core/vx_front_end/vx_fetch/warp_scheduler/use_active}
TreeUpdate [SetDefaultTree]
WaveRestoreCursors {{Cursor 2} {360293 ps} 0}
quietly wave cursor active 1
configure wave -namecolwidth 195
configure wave -valuecolwidth 100
configure wave -justifyvalue left
configure wave -signalnamewidth 0
configure wave -snapdistance 10
configure wave -datasetprefix 0
configure wave -rowmargin 4
configure wave -childrowmargin 2
configure wave -gridoffset 0
configure wave -gridperiod 1
configure wave -griddelta 40
configure wave -timeline 0
configure wave -timelineunits ps
update
WaveRestoreZoom {346453 ps} {711141 ps}

View File

@@ -14,196 +14,198 @@ module VX_alu(
output reg out_alu_stall
);
localparam div_pipeline_len = 20;
localparam mul_pipeline_len = 8;
localparam div_pipeline_len = 20;
localparam mul_pipeline_len = 8;
wire[31:0] unsigned_div_result;
wire[31:0] unsigned_rem_result;
wire[31:0] signed_div_result;
wire[31:0] signed_rem_result;
wire[31:0] unsigned_div_result;
wire[31:0] unsigned_rem_result;
wire[31:0] signed_div_result;
wire[31:0] signed_rem_result;
wire[63:0] mul_data_a, mul_data_b;
wire[63:0] mul_result;
wire[63:0] mul_data_a, mul_data_b;
wire[63:0] mul_result;
wire[31:0] ALU_in1;
wire[31:0] ALU_in2;
wire[31:0] ALU_in1;
wire[31:0] ALU_in2;
VX_divide #(
.WIDTHN(32),
.WIDTHD(32),
.SPEED("HIGHEST"),
.PIPELINE(div_pipeline_len)
) unsigned_div (
.clock(clk),
.aclr(1'b0),
.clken(1'b1), // TODO this could be disabled on inactive instructions
.numer(ALU_in1),
.denom(ALU_in2),
.quotient(unsigned_div_result),
.remainder(unsigned_rem_result)
);
VX_divide #(
.WIDTHN(32),
.WIDTHD(32),
.SPEED("HIGHEST"),
.PIPELINE(div_pipeline_len)
) unsigned_div (
.clock(clk),
.aclr(1'b0),
.clken(1'b1), // TODO this could be disabled on inactive instructions
.numer(ALU_in1),
.denom(ALU_in2),
.quotient(unsigned_div_result),
.remainder(unsigned_rem_result)
);
VX_divide #(
.WIDTHN(32),
.WIDTHD(32),
.NREP("SIGNED"),
.DREP("SIGNED"),
.SPEED("HIGHEST"),
.PIPELINE(div_pipeline_len)
) signed_div (
.clock(clk),
.aclr(1'b0),
.clken(1'b1), // TODO this could be disabled on inactive instructions
.numer(ALU_in1),
.denom(ALU_in2),
.quotient(signed_div_result),
.remainder(signed_rem_result)
);
VX_divide #(
.WIDTHN(32),
.WIDTHD(32),
.NREP("SIGNED"),
.DREP("SIGNED"),
.SPEED("HIGHEST"),
.PIPELINE(div_pipeline_len)
) signed_div (
.clock(clk),
.aclr(1'b0),
.clken(1'b1), // TODO this could be disabled on inactive instructions
.numer(ALU_in1),
.denom(ALU_in2),
.quotient(signed_div_result),
.remainder(signed_rem_result)
);
VX_mult #(
.WIDTHA(64),
.WIDTHB(64),
.WIDTHP(64),
.SPEED("HIGHEST"),
.FORCE_LE("YES"),
.PIPELINE(mul_pipeline_len)
) multiplier (
.clock(clk),
.aclr(1'b0),
.clken(1'b1), // TODO this could be disabled on inactive instructions
.dataa(mul_data_a),
.datab(mul_data_b),
.result(mul_result)
);
VX_mult #(
.WIDTHA(64),
.WIDTHB(64),
.WIDTHP(64),
.SPEED("HIGHEST"),
.FORCE_LE("YES"),
.PIPELINE(mul_pipeline_len)
) multiplier (
.clock(clk),
.aclr(1'b0),
.clken(1'b1), // TODO this could be disabled on inactive instructions
.dataa(mul_data_a),
.datab(mul_data_b),
.result(mul_result)
);
// MUL, MULH (signed*signed), MULHSU (signed*unsigned), MULHU (unsigned*unsigned)
wire[63:0] alu_in1_signed = {{32{ALU_in1[31]}}, ALU_in1};
wire[63:0] alu_in2_signed = {{32{ALU_in2[31]}}, ALU_in2};
assign mul_data_a = (in_alu_op == `MULHU) ? {32'b0, ALU_in1} : alu_in1_signed;
assign mul_data_b = (in_alu_op == `MULHU || in_alu_op == `MULHSU) ? {32'b0, ALU_in2} : alu_in2_signed;
// MUL, MULH (signed*signed), MULHSU (signed*unsigned), MULHU (unsigned*unsigned)
wire[63:0] alu_in1_signed = {{32{ALU_in1[31]}}, ALU_in1};
wire[63:0] alu_in2_signed = {{32{ALU_in2[31]}}, ALU_in2};
assign mul_data_a = (in_alu_op == `MULHU) ? {32'b0, ALU_in1} : alu_in1_signed;
assign mul_data_b = (in_alu_op == `MULHU || in_alu_op == `MULHSU) ? {32'b0, ALU_in2} : alu_in2_signed;
reg [15:0] curr_inst_delay;
reg [15:0] inst_delay;
reg inst_was_stalling;
reg [15:0] curr_inst_delay;
reg [15:0] inst_delay;
reg inst_was_stalling;
wire inst_delay_stall = inst_was_stalling ? inst_delay != 0 : curr_inst_delay != 0;
assign out_alu_stall = inst_delay_stall;
wire inst_delay_stall = inst_was_stalling ? inst_delay != 0 : curr_inst_delay != 0;
assign out_alu_stall = inst_delay_stall;
// Combinationally selects the stall length for the current ALU op:
// divide/remainder ops use div_pipeline_len, multiply ops use
// mul_pipeline_len, and every other op needs no delay.
always @(*) begin
case(in_alu_op)
`DIV,
`DIVU,
`REM,
`REMU: curr_inst_delay = div_pipeline_len;
`MUL,
`MULH,
`MULHSU,
`MULHU: curr_inst_delay = mul_pipeline_len;
default: curr_inst_delay = 0;
endcase // in_alu_op
end
// Combinationally selects the stall length for the current ALU op:
// divide/remainder ops use div_pipeline_len, multiply ops use
// mul_pipeline_len, and every other op needs no delay.
always @(*) begin
case(in_alu_op)
`DIV,
`DIVU,
`REM,
`REMU: curr_inst_delay = div_pipeline_len;
`MUL,
`MULH,
`MULHSU,
`MULHU: curr_inst_delay = mul_pipeline_len;
default: curr_inst_delay = 0;
endcase // in_alu_op
end
// Stall countdown for multi-cycle ALU ops; this block resets asynchronously
// on posedge reset. inst_delay_stall (declared above) reads inst_delay once
// inst_was_stalling is set and curr_inst_delay before that.
always @(posedge clk or posedge reset) begin
if (reset) begin
inst_delay <= 0;
inst_was_stalling <= 0;
end
else if (inst_delay_stall) begin
if (inst_was_stalling) begin
// Already counting: decrement until the counter reaches zero.
if (inst_delay > 0)
inst_delay <= inst_delay - 1;
end
else begin
// First stalled cycle: mark the stall and load curr_inst_delay - 1.
inst_was_stalling <= 1;
inst_delay <= curr_inst_delay - 1;
end
end
else begin
// No stall requested: clear the flag so the stall test goes back to
// using curr_inst_delay.
inst_was_stalling <= 0;
end
end
`ifdef SYN_FUNC
wire which_in2;
wire[31:0] upper_immed;
assign which_in2 = in_rs2_src == `RS2_IMMED;
assign ALU_in1 = in_1;
assign ALU_in2 = which_in2 ? in_itype_immed : in_2;
assign upper_immed = {in_upper_immed, {12{1'b0}}};
always @(*) begin
case(in_alu_op)
`ADD: out_alu_result = $signed(ALU_in1) + $signed(ALU_in2);
`SUB: out_alu_result = $signed(ALU_in1) - $signed(ALU_in2);
`SLLA: out_alu_result = ALU_in1 << ALU_in2[4:0];
`SLT: out_alu_result = ($signed(ALU_in1) < $signed(ALU_in2)) ? 32'h1 : 32'h0;
`SLTU: out_alu_result = ALU_in1 < ALU_in2 ? 32'h1 : 32'h0;
`XOR: out_alu_result = ALU_in1 ^ ALU_in2;
`SRL: out_alu_result = ALU_in1 >> ALU_in2[4:0];
`SRA: out_alu_result = $signed(ALU_in1) >>> ALU_in2[4:0];
`OR: out_alu_result = ALU_in1 | ALU_in2;
`AND: out_alu_result = ALU_in2 & ALU_in1;
`SUBU: out_alu_result = (ALU_in1 >= ALU_in2) ? 32'h0 : 32'hffffffff;
`LUI_ALU: out_alu_result = upper_immed;
`AUIPC_ALU: out_alu_result = $signed(in_curr_PC) + $signed(upper_immed);
// TODO profitable to roll these exceptional cases into inst_delay to avoid pipeline when possible?
`MUL: out_alu_result = mul_result[31:0];
`MULH: out_alu_result = mul_result[63:32];
`MULHSU: out_alu_result = mul_result[63:32];
`MULHU: out_alu_result = mul_result[63:32];
`DIV: out_alu_result = (ALU_in2 == 0) ? 32'hffffffff : signed_div_result;
`DIVU: out_alu_result = (ALU_in2 == 0) ? 32'hffffffff : unsigned_div_result;
`REM: out_alu_result = (ALU_in2 == 0) ? ALU_in1 : signed_rem_result;
`REMU: out_alu_result = (ALU_in2 == 0) ? ALU_in1 : unsigned_rem_result;
default: out_alu_result = 32'h0;
endcase // in_alu_op
always @(posedge clk) begin
if (reset) begin
inst_delay <= 0;
inst_was_stalling <= 0;
end
`else
wire which_in2;
wire[31:0] upper_immed;
assign which_in2 = in_rs2_src == `RS2_IMMED;
assign ALU_in1 = in_1;
assign ALU_in2 = which_in2 ? in_itype_immed : in_2;
assign upper_immed = {in_upper_immed, {12{1'b0}}};
always @(*) begin
case(in_alu_op)
`ADD: out_alu_result = $signed(ALU_in1) + $signed(ALU_in2);
`SUB: out_alu_result = $signed(ALU_in1) - $signed(ALU_in2);
`SLLA: out_alu_result = ALU_in1 << ALU_in2[4:0];
`SLT: out_alu_result = ($signed(ALU_in1) < $signed(ALU_in2)) ? 32'h1 : 32'h0;
`SLTU: out_alu_result = ALU_in1 < ALU_in2 ? 32'h1 : 32'h0;
`XOR: out_alu_result = ALU_in1 ^ ALU_in2;
`SRL: out_alu_result = ALU_in1 >> ALU_in2[4:0];
`SRA: out_alu_result = $signed(ALU_in1) >>> ALU_in2[4:0];
`OR: out_alu_result = ALU_in1 | ALU_in2;
`AND: out_alu_result = ALU_in2 & ALU_in1;
`SUBU: out_alu_result = (ALU_in1 >= ALU_in2) ? 32'h0 : 32'hffffffff;
`LUI_ALU: out_alu_result = upper_immed;
`AUIPC_ALU: out_alu_result = $signed(in_curr_PC) + $signed(upper_immed);
// TODO profitable to roll these exceptional cases into inst_delay to avoid pipeline when possible?
`MUL: out_alu_result = mul_result[31:0];
`MULH: out_alu_result = mul_result[63:32];
`MULHSU: out_alu_result = mul_result[63:32];
`MULHU: out_alu_result = mul_result[63:32];
`DIV: out_alu_result = (ALU_in2 == 0) ? 32'hffffffff : signed_div_result;
`DIVU: out_alu_result = (ALU_in2 == 0) ? 32'hffffffff : unsigned_div_result;
`REM: out_alu_result = (ALU_in2 == 0) ? ALU_in1 : signed_rem_result;
`REMU: out_alu_result = (ALU_in2 == 0) ? ALU_in1 : unsigned_rem_result;
default: out_alu_result = 32'h0;
endcase // in_alu_op
else if (inst_delay_stall) begin
if (inst_was_stalling) begin
if (inst_delay > 0)
inst_delay <= inst_delay - 1;
end
else begin
inst_was_stalling <= 1;
inst_delay <= curr_inst_delay - 1;
end
end
`endif
else begin
inst_was_stalling <= 0;
end
end
`ifdef SYN_FUNC
wire which_in2;
wire[31:0] upper_immed;
assign which_in2 = in_rs2_src == `RS2_IMMED;
assign ALU_in1 = in_1;
assign ALU_in2 = which_in2 ? in_itype_immed : in_2;
assign upper_immed = {in_upper_immed, {12{1'b0}}};
// Combinational result mux (SYN_FUNC branch): drives out_alu_result from
// ALU_in1 / ALU_in2 according to in_alu_op. Unlisted ops yield zero.
always @(*) begin
case(in_alu_op)
`ADD: out_alu_result = $signed(ALU_in1) + $signed(ALU_in2);
`SUB: out_alu_result = $signed(ALU_in1) - $signed(ALU_in2);
// Shifts use only the low 5 bits of the second operand as the amount.
`SLLA: out_alu_result = ALU_in1 << ALU_in2[4:0];
`SLT: out_alu_result = ($signed(ALU_in1) < $signed(ALU_in2)) ? 32'h1 : 32'h0;
`SLTU: out_alu_result = ALU_in1 < ALU_in2 ? 32'h1 : 32'h0;
`XOR: out_alu_result = ALU_in1 ^ ALU_in2;
`SRL: out_alu_result = ALU_in1 >> ALU_in2[4:0];
`SRA: out_alu_result = $signed(ALU_in1) >>> ALU_in2[4:0];
`OR: out_alu_result = ALU_in1 | ALU_in2;
`AND: out_alu_result = ALU_in2 & ALU_in1;
// Unsigned compare: zero when ALU_in1 >= ALU_in2, all ones otherwise.
`SUBU: out_alu_result = (ALU_in1 >= ALU_in2) ? 32'h0 : 32'hffffffff;
`LUI_ALU: out_alu_result = upper_immed;
`AUIPC_ALU: out_alu_result = $signed(in_curr_PC) + $signed(upper_immed);
// TODO profitable to roll these exceptional cases into inst_delay to avoid pipeline when possible?
// Multiplier output: low word for MUL, high word for the MULH* variants.
`MUL: out_alu_result = mul_result[31:0];
`MULH: out_alu_result = mul_result[63:32];
`MULHSU: out_alu_result = mul_result[63:32];
`MULHU: out_alu_result = mul_result[63:32];
// A zero divisor bypasses the divider outputs: the quotient becomes all
// ones and the remainder becomes the dividend (ALU_in1).
`DIV: out_alu_result = (ALU_in2 == 0) ? 32'hffffffff : signed_div_result;
`DIVU: out_alu_result = (ALU_in2 == 0) ? 32'hffffffff : unsigned_div_result;
`REM: out_alu_result = (ALU_in2 == 0) ? ALU_in1 : signed_rem_result;
`REMU: out_alu_result = (ALU_in2 == 0) ? ALU_in1 : unsigned_rem_result;
default: out_alu_result = 32'h0;
endcase // in_alu_op
end
`else
wire which_in2;
wire[31:0] upper_immed;
assign which_in2 = in_rs2_src == `RS2_IMMED;
assign ALU_in1 = in_1;
assign ALU_in2 = which_in2 ? in_itype_immed : in_2;
assign upper_immed = {in_upper_immed, {12{1'b0}}};
// Combinational result mux (non-SYN_FUNC branch): drives out_alu_result from
// ALU_in1 / ALU_in2 according to in_alu_op. Unlisted ops yield zero.
always @(*) begin
case(in_alu_op)
`ADD: out_alu_result = $signed(ALU_in1) + $signed(ALU_in2);
`SUB: out_alu_result = $signed(ALU_in1) - $signed(ALU_in2);
// Shifts use only the low 5 bits of the second operand as the amount.
`SLLA: out_alu_result = ALU_in1 << ALU_in2[4:0];
`SLT: out_alu_result = ($signed(ALU_in1) < $signed(ALU_in2)) ? 32'h1 : 32'h0;
`SLTU: out_alu_result = ALU_in1 < ALU_in2 ? 32'h1 : 32'h0;
`XOR: out_alu_result = ALU_in1 ^ ALU_in2;
`SRL: out_alu_result = ALU_in1 >> ALU_in2[4:0];
`SRA: out_alu_result = $signed(ALU_in1) >>> ALU_in2[4:0];
`OR: out_alu_result = ALU_in1 | ALU_in2;
`AND: out_alu_result = ALU_in2 & ALU_in1;
// Unsigned compare: zero when ALU_in1 >= ALU_in2, all ones otherwise.
`SUBU: out_alu_result = (ALU_in1 >= ALU_in2) ? 32'h0 : 32'hffffffff;
`LUI_ALU: out_alu_result = upper_immed;
`AUIPC_ALU: out_alu_result = $signed(in_curr_PC) + $signed(upper_immed);
// TODO profitable to roll these exceptional cases into inst_delay to avoid pipeline when possible?
// Multiplier output: low word for MUL, high word for the MULH* variants.
`MUL: out_alu_result = mul_result[31:0];
`MULH: out_alu_result = mul_result[63:32];
`MULHSU: out_alu_result = mul_result[63:32];
`MULHU: out_alu_result = mul_result[63:32];
// A zero divisor bypasses the divider outputs: the quotient becomes all
// ones and the remainder becomes the dividend (ALU_in1).
`DIV: out_alu_result = (ALU_in2 == 0) ? 32'hffffffff : signed_div_result;
`DIVU: out_alu_result = (ALU_in2 == 0) ? 32'hffffffff : unsigned_div_result;
`REM: out_alu_result = (ALU_in2 == 0) ? ALU_in1 : signed_rem_result;
`REMU: out_alu_result = (ALU_in2 == 0) ? ALU_in1 : unsigned_rem_result;
default: out_alu_result = 32'h0;
endcase // in_alu_op
end
`endif
endmodule : VX_alu

View File

@@ -9,71 +9,63 @@ module VX_back_end
input wire reset,
input wire schedule_delay,
VX_gpu_dcache_res_inter VX_dcache_rsp,
VX_gpu_dcache_req_inter VX_dcache_req,
VX_gpu_dcache_rsp_inter vx_dcache_rsp,
VX_gpu_dcache_req_inter vx_dcache_req,
output wire out_mem_delay,
output wire out_exec_delay,
output wire gpr_stage_delay,
VX_jal_response_inter VX_jal_rsp,
VX_branch_response_inter VX_branch_rsp,
VX_jal_response_inter vx_jal_rsp,
VX_branch_response_inter vx_branch_rsp,
VX_frE_to_bckE_req_inter VX_bckE_req,
VX_wb_inter VX_writeback_inter,
VX_frE_to_bckE_req_inter vx_bckE_req,
VX_wb_inter vx_writeback_inter,
VX_warp_ctl_inter VX_warp_ctl
VX_warp_ctl_inter vx_warp_ctl
);
VX_wb_inter VX_writeback_temp();
assign VX_writeback_inter.wb = VX_writeback_temp.wb;
assign VX_writeback_inter.rd = VX_writeback_temp.rd;
assign VX_writeback_inter.write_data = VX_writeback_temp.write_data;
assign VX_writeback_inter.wb_valid = VX_writeback_temp.wb_valid;
assign VX_writeback_inter.wb_warp_num = VX_writeback_temp.wb_warp_num;
assign VX_writeback_inter.wb_pc = VX_writeback_temp.wb_pc;
VX_wb_inter vx_writeback_temp();
assign vx_writeback_inter.wb = vx_writeback_temp.wb;
assign vx_writeback_inter.rd = vx_writeback_temp.rd;
assign vx_writeback_inter.write_data = vx_writeback_temp.write_data;
assign vx_writeback_inter.wb_valid = vx_writeback_temp.wb_valid;
assign vx_writeback_inter.wb_warp_num = vx_writeback_temp.wb_warp_num;
assign vx_writeback_inter.wb_pc = vx_writeback_temp.wb_pc;
// assign VX_writeback_inter(VX_writeback_temp);
// assign VX_writeback_inter(vx_writeback_temp);
VX_mw_wb_inter VX_mw_wb();
wire no_slot_mem;
wire no_slot_exec;
VX_mem_req_inter VX_exe_mem_req();
VX_mem_req_inter VX_mem_req();
// LSU input + output
VX_lsu_req_inter VX_lsu_req();
VX_inst_mem_wb_inter VX_mem_wb();
VX_lsu_req_inter vx_lsu_req();
VX_inst_mem_wb_inter vx_mem_wb();
// Exec unit input + output
VX_exec_unit_req_inter VX_exec_unit_req();
VX_inst_exec_wb_inter VX_inst_exec_wb();
VX_exec_unit_req_inter vx_exec_unit_req();
VX_inst_exec_wb_inter vx_inst_exec_wb();
// GPU unit input
VX_gpu_inst_req_inter VX_gpu_inst_req();
VX_gpu_inst_req_inter vx_gpu_inst_req();
// CSR unit inputs
VX_csr_req_inter VX_csr_req();
VX_csr_wb_inter VX_csr_wb();
VX_csr_req_inter vx_csr_req();
VX_csr_wb_inter vx_csr_wb();
wire no_slot_csr;
wire stall_gpr_csr;
VX_gpr_stage VX_gpr_stage(
VX_gpr_stage vx_gpr_stage(
.clk (clk),
.reset (reset),
.schedule_delay (schedule_delay),
.VX_writeback_inter(VX_writeback_temp),
.VX_bckE_req (VX_bckE_req),
.vx_writeback_inter(vx_writeback_temp),
.vx_bckE_req (vx_bckE_req),
// New
.VX_exec_unit_req(VX_exec_unit_req),
.VX_lsu_req (VX_lsu_req),
.VX_gpu_inst_req (VX_gpu_inst_req),
.VX_csr_req (VX_csr_req),
.vx_exec_unit_req(vx_exec_unit_req),
.vx_lsu_req (vx_lsu_req),
.vx_gpu_inst_req (vx_gpu_inst_req),
.vx_csr_req (vx_csr_req),
.stall_gpr_csr (stall_gpr_csr),
// End new
.memory_delay (out_mem_delay),
@@ -81,62 +73,61 @@ VX_gpr_stage VX_gpr_stage(
.gpr_stage_delay (gpr_stage_delay)
);
VX_lsu load_store_unit(
VX_lsu load_store_unit (
.clk (clk),
.reset (reset),
.VX_lsu_req (VX_lsu_req),
.VX_mem_wb (VX_mem_wb),
.VX_dcache_rsp(VX_dcache_rsp),
.VX_dcache_req(VX_dcache_req),
.vx_lsu_req (vx_lsu_req),
.vx_mem_wb (vx_mem_wb),
.vx_dcache_rsp(vx_dcache_rsp),
.vx_dcache_req(vx_dcache_req),
.out_delay (out_mem_delay),
.no_slot_mem (no_slot_mem)
);
);
VX_execute_unit VX_execUnit(
VX_execute_unit vx_execUnit (
.clk (clk),
.reset (reset),
.VX_exec_unit_req(VX_exec_unit_req),
.VX_inst_exec_wb (VX_inst_exec_wb),
.VX_jal_rsp (VX_jal_rsp),
.VX_branch_rsp (VX_branch_rsp),
.vx_exec_unit_req(vx_exec_unit_req),
.vx_inst_exec_wb (vx_inst_exec_wb),
.vx_jal_rsp (vx_jal_rsp),
.vx_branch_rsp (vx_branch_rsp),
.out_delay (out_exec_delay),
.no_slot_exec (no_slot_exec)
);
);
VX_gpgpu_inst vx_gpgpu_inst (
.vx_gpu_inst_req(vx_gpu_inst_req),
.vx_warp_ctl (vx_warp_ctl)
);
VX_gpgpu_inst VX_gpgpu_inst(
.VX_gpu_inst_req(VX_gpu_inst_req),
.VX_warp_ctl (VX_warp_ctl)
);
// VX_csr_wrapper VX_csr_wrapper(
// .VX_csr_req(VX_csr_req),
// .VX_csr_wb (VX_csr_wb)
// VX_csr_wrapper vx_csr_wrapper(
// .vx_csr_req(vx_csr_req),
// .vx_csr_wb (vx_csr_wb)
// );
VX_csr_pipe #(.CORE_ID(CORE_ID)) VX_csr_pipe(
VX_csr_pipe #(
.CORE_ID(CORE_ID)
) vx_csr_pipe (
.clk (clk),
.reset (reset),
.no_slot_csr (no_slot_csr),
.VX_csr_req (VX_csr_req),
.VX_writeback(VX_writeback_temp),
.VX_csr_wb (VX_csr_wb),
.vx_csr_req (vx_csr_req),
.vx_writeback(vx_writeback_temp),
.vx_csr_wb (vx_csr_wb),
.stall_gpr_csr(stall_gpr_csr)
);
);
VX_writeback VX_wb(
VX_writeback vx_wb (
.clk (clk),
.reset (reset),
.VX_mem_wb (VX_mem_wb),
.VX_inst_exec_wb (VX_inst_exec_wb),
.VX_csr_wb (VX_csr_wb),
.vx_mem_wb (vx_mem_wb),
.vx_inst_exec_wb (vx_inst_exec_wb),
.vx_csr_wb (vx_csr_wb),
.VX_writeback_inter(VX_writeback_temp),
.vx_writeback_inter(vx_writeback_temp),
.no_slot_mem (no_slot_mem),
.no_slot_exec (no_slot_exec),
.no_slot_csr (no_slot_csr)
);
);
endmodule

View File

@@ -4,21 +4,20 @@ module VX_csr_data (
input wire clk, // Clock
input wire reset,
input wire[11:0] in_read_csr_address,
input wire[`CSR_ADDR_SIZE-1:0] in_read_csr_address,
input wire in_write_valid,
input wire[`CSR_WIDTH-1:0] in_write_csr_data,
input wire in_write_valid,
input wire[31:0] in_write_csr_data,
input wire[11:0] in_write_csr_address,
/* verilator lint_off UNUSED */
// We use a smaller storage for CSRs than the full 4096-entry (12-bit) CSR address space defined by RISC-V
input wire[`CSR_ADDR_SIZE-1:0] in_write_csr_address,
/* verilator lint_on UNUSED */
output wire[31:0] out_read_csr_data,
// For instruction retire counting
input wire in_writeback_valid
);
/* verilator lint_off WIDTH */
// wire[`NUM_THREADS-1:0][31:0] thread_ids;
// wire[`NUM_THREADS-1:0][31:0] warp_ids;
@@ -32,45 +31,44 @@ module VX_csr_data (
// assign warp_ids[cur_tw] = {{(31-`NW_BITS-1){1'b0}}, in_read_warp_num};
// end
reg[11:0] csr[1023:0];
reg[63:0] cycle;
reg[63:0] instret;
reg [`CSR_WIDTH-1:0] csr[`NUM_CSRS-1:0];
reg [63:0] cycle;
reg [63:0] instret;
wire read_cycle;
wire read_cycleh;
wire read_instret;
wire read_instreth;
assign read_cycle = in_read_csr_address == 12'hC00;
assign read_cycleh = in_read_csr_address == 12'hC80;
assign read_instret = in_read_csr_address == 12'hC02;
assign read_instreth = in_read_csr_address == 12'hC82;
assign read_cycle = in_read_csr_address == `CSR_CYCL_L;
assign read_cycleh = in_read_csr_address == `CSR_CYCL_H;
assign read_instret = in_read_csr_address == `CSR_INST_L;
assign read_instreth = in_read_csr_address == `CSR_INST_H;
wire [$clog2(`NUM_CSRS)-1:0] read_addr, write_addr;
// cast address to physical CSR range
assign read_addr = $size(read_addr)'(in_read_csr_address);
assign write_addr = $size(write_addr)'(in_write_csr_address);
// wire thread_select = in_read_csr_address == 12'h20;
// wire warp_select = in_read_csr_address == 12'h21;
// assign out_read_csr_data = thread_select ? thread_ids :
// assign out_read_csr_data = thread_select ? thread_ids :
// warp_select ? warp_ids :
// 0;
integer curr_e;
always @(posedge clk or posedge reset) begin
genvar curr_e;
always @(posedge clk) begin
if (reset) begin
for (curr_e = 0; curr_e < 1024; curr_e=curr_e+1) begin
`ifdef VERILATOR
// - Verilator does not support delayed assignment in loops.
csr[curr_e] = 0;
`else
csr[curr_e] <= 0;
`endif
end
cycle <= 0;
instret <= 0;
end else begin
cycle <= cycle + 1;
if (in_write_valid) begin
csr[in_write_csr_address] <= in_write_csr_data[11:0];
csr[write_addr] <= in_write_csr_data;
end
if (in_writeback_valid) begin
instret <= instret + 1;
@@ -78,12 +76,9 @@ module VX_csr_data (
end
end
assign out_read_csr_data = read_cycle ? cycle[31:0] :
read_cycleh ? cycle[63:32] :
read_instret ? instret[31:0] :
read_instreth ? instret[63:32] :
{{20{1'b0}}, csr[in_read_csr_address]};
/* verilator lint_on WIDTH */
// CSR read mux. The cycle and retired-instruction counters are selected first
// (low word, then high word); any other address reads the generic csr[] storage.
// Stored entries are `CSR_WIDTH bits wide and are zero-extended to the 32-bit
// read port. The pad width is derived from `CSR_WIDTH instead of being
// hard-coded so the concatenation stays exactly 32 bits if the width changes.
assign out_read_csr_data = read_cycle ? cycle[31:0] :
read_cycleh ? cycle[63:32] :
read_instret ? instret[31:0] :
read_instreth ? instret[63:32] :
{{(32-`CSR_WIDTH){1'b0}}, csr[read_addr]};
endmodule : VX_csr_data

View File

@@ -1,78 +1,63 @@
module VX_csr_handler (
input wire clk,
input wire[11:0] in_decode_csr_address, // done
VX_csr_write_request_inter VX_csr_w_req,
input wire in_wb_valid,
output wire[31:0] out_decode_csr_data // done
);
input wire clk,
input wire[`CSR_ADDR_SIZE-1:0] in_decode_csr_address, // done
VX_csr_write_request_inter vx_csr_w_req,
input wire in_wb_valid,
output wire[31:0] out_decode_csr_data // done
);
wire in_mem_is_csr;
wire[`CSR_ADDR_SIZE-1:0] in_mem_csr_address;
wire[31:0] in_mem_csr_result;
wire in_mem_is_csr;
wire[11:0] in_mem_csr_address;
/* verilator lint_off UNUSED */
wire[31:0] in_mem_csr_result;
/* verilator lint_on UNUSED */
assign in_mem_is_csr = vx_csr_w_req.is_csr;
assign in_mem_csr_address = vx_csr_w_req.csr_address;
assign in_mem_csr_result = vx_csr_w_req.csr_result;
reg [`CSR_WIDTH-1:0] csr [`NUM_CSRS-1:0];
reg [63:0] cycle;
reg [63:0] instret;
reg [`CSR_ADDR_SIZE-1:0] decode_csr_address;
assign in_mem_is_csr = VX_csr_w_req.is_csr;
assign in_mem_csr_address = VX_csr_w_req.csr_address;
assign in_mem_csr_result = VX_csr_w_req.csr_result;
wire read_cycle;
wire read_cycleh;
wire read_instret;
wire read_instreth;
initial begin
cycle = 0;
instret = 0;
decode_csr_address = 0;
end
reg[1024:0][11:0] csr;
reg[63:0] cycle;
reg[63:0] instret;
reg[11:0] decode_csr_address;
wire read_cycle;
wire read_cycleh;
wire read_instret;
wire read_instreth;
initial begin
cycle = 0;
instret = 0;
decode_csr_address = 0;
always @(posedge clk) begin
cycle <= cycle + 1;
decode_csr_address <= in_decode_csr_address;
if (in_wb_valid) begin
instret <= instret + 1;
end
end
reg[`CSR_WIDTH-1:0] data_read;
always @(posedge clk) begin
cycle <= cycle + 1;
decode_csr_address <= in_decode_csr_address;
if (in_wb_valid) begin
instret <= instret + 1;
end
// CSR write port: on a clock edge with in_mem_is_csr set, store the low
// `CSR_WIDTH bits of the result into the addressed entry. The slice follows
// `CSR_WIDTH so it always matches the declared width of csr[].
// NOTE(review): in_mem_csr_address is `CSR_ADDR_SIZE bits wide while csr[] has
// `NUM_CSRS entries; if 2**`CSR_ADDR_SIZE exceeds `NUM_CSRS the index can fall
// outside the array - confirm whether it should be truncated as in VX_csr_data.
always @(posedge clk) begin
if (in_mem_is_csr) begin
csr[in_mem_csr_address] <= in_mem_csr_result[`CSR_WIDTH-1:0];
end
end
reg[11:0] data_read;
always @(posedge clk) begin
if(in_mem_is_csr) begin
csr[in_mem_csr_address] <= in_mem_csr_result[11:0];
end
end
assign data_read = csr[decode_csr_address];
assign read_cycle = decode_csr_address == 12'hC00;
assign read_cycleh = decode_csr_address == 12'hC80;
assign read_instret = decode_csr_address == 12'hC02;
assign read_instreth = decode_csr_address == 12'hC82;
/* verilator lint_off WIDTH */
assign out_decode_csr_data = read_cycle ? cycle[31:0] :
read_cycleh ? cycle[63:32] :
read_instret ? instret[31:0] :
read_instreth ? instret[63:32] :
{{20{1'b0}}, data_read};
/* verilator lint_on WIDTH */
assign data_read = csr[decode_csr_address];
assign read_cycle = decode_csr_address == `CSR_CYCL_L;
assign read_cycleh = decode_csr_address == `CSR_CYCL_H;
assign read_instret = decode_csr_address == `CSR_INST_L;
assign read_instreth = decode_csr_address == `CSR_INST_H;
// Decode-stage CSR read mux. The cycle and retired-instruction counters are
// selected first (low word, then high word); any other address returns
// data_read. data_read is `CSR_WIDTH bits wide and is zero-extended to 32 bits;
// the pad width is derived from `CSR_WIDTH instead of being hard-coded so the
// concatenation stays exactly 32 bits if the width changes.
assign out_decode_csr_data = read_cycle ? cycle[31:0] :
read_cycleh ? cycle[63:32] :
read_instret ? instret[31:0] :
read_instreth ? instret[63:32] :
{{(32-`CSR_WIDTH){1'b0}}, data_read};
endmodule // VX_csr_handler

View File

@@ -1,16 +1,14 @@
`include "VX_define.vh"
module VX_csr_pipe
#(
parameter CORE_ID = 0
)
(
module VX_csr_pipe #(
parameter CORE_ID = 0
) (
input wire clk, // Clock
input wire reset,
input wire no_slot_csr,
VX_csr_req_inter VX_csr_req,
VX_wb_inter VX_writeback,
VX_csr_wb_inter VX_csr_wb,
VX_csr_req_inter vx_csr_req,
VX_wb_inter vx_writeback,
VX_csr_wb_inter vx_csr_wb,
output wire stall_gpr_csr
);
@@ -18,64 +16,61 @@ module VX_csr_pipe
wire[`NW_BITS-1:0] warp_num_s2;
wire[4:0] rd_s2;
wire[1:0] wb_s2;
wire[4:0] alu_op_s2;
wire is_csr_s2;
wire[11:0] csr_address_s2;
wire[`CSR_ADDR_SIZE-1:0] csr_address_s2;
wire[31:0] csr_read_data_s2;
wire[31:0] csr_updated_data_s2;
wire[31:0] csr_read_data_unqual;
wire[31:0] csr_read_data;
assign stall_gpr_csr = no_slot_csr && VX_csr_req.is_csr && |(VX_csr_req.valid);
assign stall_gpr_csr = no_slot_csr && vx_csr_req.is_csr && |(vx_csr_req.valid);
assign csr_read_data = (csr_address_s2 == VX_csr_req.csr_address) ? csr_updated_data_s2 : csr_read_data_unqual;
assign csr_read_data = (csr_address_s2 == vx_csr_req.csr_address) ? csr_updated_data_s2 : csr_read_data_unqual;
wire writeback = |VX_writeback.wb_valid;
VX_csr_data VX_csr_data(
wire writeback = |vx_writeback.wb_valid;
VX_csr_data vx_csr_data(
.clk (clk),
.reset (reset),
.in_read_csr_address (VX_csr_req.csr_address),
.in_read_csr_address (vx_csr_req.csr_address),
.in_write_valid (is_csr_s2),
.in_write_csr_data (csr_updated_data_s2),
.in_write_csr_data (csr_updated_data_s2[`CSR_WIDTH-1:0]),
.in_write_csr_address(csr_address_s2),
.out_read_csr_data (csr_read_data_unqual),
.in_writeback_valid (writeback)
);
);
reg [31:0] csr_updated_data;
reg[31:0] csr_updated_data;
always @(*) begin
case(VX_csr_req.alu_op)
`CSR_ALU_RW: csr_updated_data = VX_csr_req.csr_mask;
`CSR_ALU_RS: csr_updated_data = csr_read_data | VX_csr_req.csr_mask;
`CSR_ALU_RC: csr_updated_data = csr_read_data & (32'hFFFFFFFF - VX_csr_req.csr_mask);
case (vx_csr_req.alu_op)
`CSR_ALU_RW: csr_updated_data = vx_csr_req.csr_mask;
`CSR_ALU_RS: csr_updated_data = csr_read_data | vx_csr_req.csr_mask;
`CSR_ALU_RC: csr_updated_data = csr_read_data & (32'hFFFFFFFF - vx_csr_req.csr_mask);
default: csr_updated_data = 32'hdeadbeef;
endcase
end
wire zero = 0;
VX_generic_register #(.N(32 + 32 + 12 + 1 + 2 + 5 + (`NW_BITS-1+1) + `NUM_THREADS)) csr_reg_s2 (
VX_generic_register #(
.N(32 + 32 + 12 + 1 + 2 + 5 + (`NW_BITS-1+1) + `NUM_THREADS)
) csr_reg_s2 (
.clk (clk),
.reset(reset),
.stall(no_slot_csr),
.flush(zero),
.in ({VX_csr_req.valid, VX_csr_req.warp_num, VX_csr_req.rd, VX_csr_req.wb, VX_csr_req.is_csr, VX_csr_req.csr_address, csr_read_data , csr_updated_data }),
.in ({vx_csr_req.valid, vx_csr_req.warp_num, vx_csr_req.rd, vx_csr_req.wb, vx_csr_req.is_csr, vx_csr_req.csr_address, csr_read_data , csr_updated_data }),
.out ({valid_s2 , warp_num_s2 , rd_s2 , wb_s2 , is_csr_s2 , csr_address_s2 , csr_read_data_s2, csr_updated_data_s2})
);
);
wire [`NUM_THREADS-1:0][31:0] final_csr_data;
wire[`NUM_THREADS-1:0][31:0] final_csr_data;
wire[`NUM_THREADS-1:0][31:0] thread_ids;
wire[`NUM_THREADS-1:0][31:0] warp_ids;
wire[`NUM_THREADS-1:0][31:0] warp_idz;
wire[`NUM_THREADS-1:0][31:0] csr_vec_read_data_s2;
wire [`NUM_THREADS-1:0][31:0] thread_ids;
wire [`NUM_THREADS-1:0][31:0] warp_ids;
wire [`NUM_THREADS-1:0][31:0] warp_idz;
wire [`NUM_THREADS-1:0][31:0] csr_vec_read_data_s2;
genvar cur_t;
for (cur_t = 0; cur_t < `NUM_THREADS; cur_t = cur_t + 1) begin
@@ -102,10 +97,10 @@ module VX_csr_pipe
warp_id_select ? warp_idz :
csr_vec_read_data_s2;
assign VX_csr_wb.valid = valid_s2;
assign VX_csr_wb.warp_num = warp_num_s2;
assign VX_csr_wb.rd = rd_s2;
assign VX_csr_wb.wb = wb_s2;
assign VX_csr_wb.csr_result = final_csr_data;
assign vx_csr_wb.valid = valid_s2;
assign vx_csr_wb.warp_num = warp_num_s2;
assign vx_csr_wb.rd = rd_s2;
assign vx_csr_wb.wb = wb_s2;
assign vx_csr_wb.csr_result = final_csr_data;
endmodule

View File

@@ -2,9 +2,8 @@
`include "VX_define.vh"
module VX_csr_wrapper (
VX_csr_req_inter VX_csr_req,
VX_csr_wb_inter VX_csr_wb
VX_csr_req_inter vx_csr_req,
VX_csr_wb_inter vx_csr_wb
);
@@ -18,21 +17,21 @@ module VX_csr_wrapper (
end
for (cur_tw = 0; cur_tw < `NUM_THREADS; cur_tw = cur_tw + 1) begin : warp_ids_init
assign warp_ids[cur_tw] = {{(31-`NW_BITS-1){1'b0}}, VX_csr_req.warp_num};
assign warp_ids[cur_tw] = {{(31-`NW_BITS-1){1'b0}}, vx_csr_req.warp_num};
end
endgenerate
assign VX_csr_wb.valid = VX_csr_req.valid;
assign VX_csr_wb.warp_num = VX_csr_req.warp_num;
assign VX_csr_wb.rd = VX_csr_req.rd;
assign VX_csr_wb.wb = VX_csr_req.wb;
assign vx_csr_wb.valid = vx_csr_req.valid;
assign vx_csr_wb.warp_num = vx_csr_req.warp_num;
assign vx_csr_wb.rd = vx_csr_req.rd;
assign vx_csr_wb.wb = vx_csr_req.wb;
wire thread_select = VX_csr_req.csr_address == 12'h20;
wire warp_select = VX_csr_req.csr_address == 12'h21;
wire thread_select = vx_csr_req.csr_address == 12'h20;
wire warp_select = vx_csr_req.csr_address == 12'h21;
assign VX_csr_wb.csr_result = thread_select ? thread_ids :
assign vx_csr_wb.csr_result = thread_select ? thread_ids :
warp_select ? warp_ids :
0;

View File

@@ -6,349 +6,338 @@ module VX_decode(
VX_inst_meta_inter fd_inst_meta_de,
// Outputs
VX_frE_to_bckE_req_inter VX_frE_to_bckE_req,
VX_wstall_inter VX_wstall,
VX_join_inter VX_join,
VX_frE_to_bckE_req_inter vx_frE_to_bckE_req,
VX_wstall_inter vx_wstall,
VX_join_inter vx_join,
output wire terminate_sim
);
wire[31:0] in_instruction = fd_inst_meta_de.instruction;
wire[31:0] in_curr_PC = fd_inst_meta_de.inst_pc;
wire[`NW_BITS-1:0] in_warp_num = fd_inst_meta_de.warp_num;
wire[31:0] in_instruction = fd_inst_meta_de.instruction;
wire[31:0] in_curr_PC = fd_inst_meta_de.inst_pc;
wire[`NW_BITS-1:0] in_warp_num = fd_inst_meta_de.warp_num;
assign VX_frE_to_bckE_req.curr_PC = in_curr_PC;
assign vx_frE_to_bckE_req.curr_PC = in_curr_PC;
wire[`NUM_THREADS-1:0] in_valid = fd_inst_meta_de.valid;
wire[`NUM_THREADS-1:0] in_valid = fd_inst_meta_de.valid;
wire[6:0] curr_opcode;
wire[6:0] curr_opcode;
wire is_itype;
wire is_rtype;
wire is_stype;
wire is_btype;
wire is_linst;
wire is_jal;
wire is_jalr;
wire is_lui;
wire is_auipc;
wire is_csr;
wire is_csr_immed;
wire is_e_inst;
wire is_itype;
wire is_rtype;
wire is_stype;
wire is_btype;
wire is_linst;
wire is_jal;
wire is_jalr;
wire is_lui;
wire is_auipc;
wire is_csr;
wire is_csr_immed;
wire is_e_inst;
wire is_gpgpu;
wire is_wspawn;
wire is_tmc;
wire is_split;
wire is_join;
wire is_barrier;
wire is_gpgpu;
wire is_wspawn;
wire is_tmc;
wire is_split;
wire is_join;
wire is_barrier;
wire[2:0] func3;
wire[6:0] func7;
wire[11:0] u_12;
wire[2:0] func3;
wire[6:0] func7;
wire[11:0] u_12;
wire[7:0] jal_b_19_to_12;
wire jal_b_11;
wire[9:0] jal_b_10_to_1;
wire jal_b_20;
wire jal_b_0;
wire[20:0] jal_unsigned_offset;
wire[31:0] jal_1_offset;
wire[7:0] jal_b_19_to_12;
wire jal_b_11;
wire[9:0] jal_b_10_to_1;
wire jal_b_20;
wire jal_b_0;
wire[20:0] jal_unsigned_offset;
wire[31:0] jal_1_offset;
wire[11:0] jalr_immed;
wire[31:0] jal_2_offset;
wire[11:0] jalr_immed;
wire[31:0] jal_2_offset;
wire jal_sys_cond1;
wire jal_sys_cond2;
wire jal_sys_jal;
wire[31:0] jal_sys_off;
wire jal_sys_cond1;
wire jal_sys_cond2;
wire jal_sys_jal;
wire[31:0] jal_sys_off;
wire csr_cond1;
wire csr_cond2;
wire csr_cond1;
wire csr_cond2;
wire[11:0] alu_tempp;
wire alu_shift_i;
wire[11:0] alu_shift_i_immed;
wire[11:0] alu_tempp;
wire alu_shift_i;
wire[11:0] alu_shift_i_immed;
wire[1:0] csr_type;
wire[1:0] csr_type;
reg[4:0] csr_alu;
reg[4:0] alu_op;
reg[4:0] mul_alu;
reg[19:0] temp_upper_immed;
reg temp_jal;
reg[31:0] temp_jal_offset;
reg[31:0] temp_itype_immed;
reg[2:0] temp_branch_type;
reg temp_branch_stall;
reg[4:0] csr_alu;
reg[4:0] alu_op;
reg[4:0] mul_alu;
reg[19:0] temp_upper_immed;
reg temp_jal;
reg[31:0] temp_jal_offset;
reg[31:0] temp_itype_immed;
reg[2:0] temp_branch_type;
reg temp_branch_stall;
// always @(posedge reset) begin
// end
assign vx_frE_to_bckE_req.valid = fd_inst_meta_de.valid;
assign VX_frE_to_bckE_req.valid = fd_inst_meta_de.valid;
assign vx_frE_to_bckE_req.warp_num = in_warp_num;
assign VX_frE_to_bckE_req.warp_num = in_warp_num;
assign curr_opcode = in_instruction[6:0];
assign vx_frE_to_bckE_req.rd = in_instruction[11:7];
assign vx_frE_to_bckE_req.rs1 = in_instruction[19:15];
assign vx_frE_to_bckE_req.rs2 = in_instruction[24:20];
assign func3 = in_instruction[14:12];
assign func7 = in_instruction[31:25];
assign u_12 = in_instruction[31:20];
assign vx_frE_to_bckE_req.PC_next = in_curr_PC + 32'h4;
// Write Back signal
assign is_rtype = (curr_opcode == `R_INST);
assign is_linst = (curr_opcode == `L_INST);
assign is_itype = (curr_opcode == `ALU_INST) || is_linst;
assign is_stype = (curr_opcode == `S_INST);
assign is_btype = (curr_opcode == `B_INST);
assign is_jal = (curr_opcode == `JAL_INST);
assign is_jalr = (curr_opcode == `JALR_INST);
assign is_lui = (curr_opcode == `LUI_INST);
assign is_auipc = (curr_opcode == `AUIPC_INST);
assign is_csr = (curr_opcode == `SYS_INST) && (func3 != 0);
assign is_csr_immed = (is_csr) && (func3[2] == 1);
// assign is_e_inst = (curr_opcode == `SYS_INST) && (func3 == 0);
assign is_e_inst = in_instruction == 32'h00000073;
assign is_gpgpu = (curr_opcode == `GPGPU_INST);
assign is_tmc = is_gpgpu && (func3 == 0); // Goes to BE
assign is_wspawn = is_gpgpu && (func3 == 1); // Goes to BE
assign is_barrier = is_gpgpu && (func3 == 4); // Goes to BE
assign is_split = is_gpgpu && (func3 == 2); // Goes to BE
assign is_join = is_gpgpu && (func3 == 3); // Doesn't go to BE
assign curr_opcode = in_instruction[6:0];
assign vx_join.is_join = is_join;
assign vx_join.join_warp_num = in_warp_num;
assign VX_frE_to_bckE_req.rd = in_instruction[11:7];
assign VX_frE_to_bckE_req.rs1 = in_instruction[19:15];
assign VX_frE_to_bckE_req.rs2 = in_instruction[24:20];
assign func3 = in_instruction[14:12];
assign func7 = in_instruction[31:25];
assign u_12 = in_instruction[31:20];
assign VX_frE_to_bckE_req.PC_next = in_curr_PC + 32'h4;
// Write Back signal
assign is_rtype = (curr_opcode == `R_INST);
assign is_linst = (curr_opcode == `L_INST);
assign is_itype = (curr_opcode == `ALU_INST) || is_linst;
assign is_stype = (curr_opcode == `S_INST);
assign is_btype = (curr_opcode == `B_INST);
assign is_jal = (curr_opcode == `JAL_INST);
assign is_jalr = (curr_opcode == `JALR_INST);
assign is_lui = (curr_opcode == `LUI_INST);
assign is_auipc = (curr_opcode == `AUIPC_INST);
assign is_csr = (curr_opcode == `SYS_INST) && (func3 != 0);
assign is_csr_immed = (is_csr) && (func3[2] == 1);
// assign is_e_inst = (curr_opcode == `SYS_INST) && (func3 == 0);
assign is_e_inst = in_instruction == 32'h00000073;
assign is_gpgpu = (curr_opcode == `GPGPU_INST);
assign is_tmc = is_gpgpu && (func3 == 0); // Goes to BE
assign is_wspawn = is_gpgpu && (func3 == 1); // Goes to BE
assign is_barrier = is_gpgpu && (func3 == 4); // Goes to BE
assign is_split = is_gpgpu && (func3 == 2); // Goes to BE
assign is_join = is_gpgpu && (func3 == 3); // Doesn't go to BE
assign VX_join.is_join = is_join;
assign VX_join.join_warp_num = in_warp_num;
assign VX_frE_to_bckE_req.is_wspawn = is_wspawn;
assign VX_frE_to_bckE_req.is_tmc = is_tmc;
assign VX_frE_to_bckE_req.is_split = is_split;
assign VX_frE_to_bckE_req.is_barrier = is_barrier;
assign vx_frE_to_bckE_req.is_wspawn = is_wspawn;
assign vx_frE_to_bckE_req.is_tmc = is_tmc;
assign vx_frE_to_bckE_req.is_split = is_split;
assign vx_frE_to_bckE_req.is_barrier = is_barrier;
assign VX_frE_to_bckE_req.csr_immed = is_csr_immed;
assign VX_frE_to_bckE_req.is_csr = is_csr;
assign vx_frE_to_bckE_req.csr_immed = is_csr_immed;
assign vx_frE_to_bckE_req.is_csr = is_csr;
assign VX_frE_to_bckE_req.wb = (is_jal || is_jalr || is_e_inst) ? `WB_JAL :
is_linst ? `WB_MEM :
(is_itype || is_rtype || is_lui || is_auipc || is_csr) ? `WB_ALU :
`NO_WB;
assign vx_frE_to_bckE_req.wb = (is_jal || is_jalr || is_e_inst) ? `WB_JAL :
is_linst ? `WB_MEM :
(is_itype || is_rtype || is_lui || is_auipc || is_csr) ? `WB_ALU :
`NO_WB;
assign VX_frE_to_bckE_req.rs2_src = (is_itype || is_stype) ? `RS2_IMMED : `RS2_REG;
assign vx_frE_to_bckE_req.rs2_src = (is_itype || is_stype) ? `RS2_IMMED : `RS2_REG;
// MEM signals
assign VX_frE_to_bckE_req.mem_read = (is_linst) ? func3 : `NO_MEM_READ;
assign VX_frE_to_bckE_req.mem_write = (is_stype) ? func3 : `NO_MEM_WRITE;
// MEM signals
assign vx_frE_to_bckE_req.mem_read = (is_linst) ? func3 : `NO_MEM_READ;
assign vx_frE_to_bckE_req.mem_write = (is_stype) ? func3 : `NO_MEM_WRITE;
// UPPER IMMEDIATE
always @(*) begin
case(curr_opcode)
`LUI_INST: temp_upper_immed = {func7, VX_frE_to_bckE_req.rs2, VX_frE_to_bckE_req.rs1, func3};
`AUIPC_INST: temp_upper_immed = {func7, VX_frE_to_bckE_req.rs2, VX_frE_to_bckE_req.rs1, func3};
default: temp_upper_immed = 20'h0;
endcase // curr_opcode
end
// UPPER IMMEDIATE
always @(*) begin
case(curr_opcode)
`LUI_INST: temp_upper_immed = {func7, vx_frE_to_bckE_req.rs2, vx_frE_to_bckE_req.rs1, func3};
`AUIPC_INST: temp_upper_immed = {func7, vx_frE_to_bckE_req.rs2, vx_frE_to_bckE_req.rs1, func3};
default: temp_upper_immed = 20'h0;
endcase // curr_opcode
end
assign VX_frE_to_bckE_req.upper_immed = temp_upper_immed;
assign vx_frE_to_bckE_req.upper_immed = temp_upper_immed;
assign jal_b_19_to_12 = in_instruction[19:12];
assign jal_b_11 = in_instruction[20];
assign jal_b_10_to_1 = in_instruction[30:21];
assign jal_b_20 = in_instruction[31];
assign jal_b_0 = 1'b0;
assign jal_unsigned_offset = {jal_b_20, jal_b_19_to_12, jal_b_11, jal_b_10_to_1, jal_b_0};
assign jal_1_offset = {{11{jal_b_20}}, jal_unsigned_offset};
assign jal_b_19_to_12 = in_instruction[19:12];
assign jal_b_11 = in_instruction[20];
assign jal_b_10_to_1 = in_instruction[30:21];
assign jal_b_20 = in_instruction[31];
assign jal_b_0 = 1'b0;
assign jal_unsigned_offset = {jal_b_20, jal_b_19_to_12, jal_b_11, jal_b_10_to_1, jal_b_0};
assign jal_1_offset = {{11{jal_b_20}}, jal_unsigned_offset};
assign jalr_immed = {func7, VX_frE_to_bckE_req.rs2};
assign jal_2_offset = {{20{jalr_immed[11]}}, jalr_immed};
assign jalr_immed = {func7, vx_frE_to_bckE_req.rs2};
assign jal_2_offset = {{20{jalr_immed[11]}}, jalr_immed};
assign jal_sys_cond1 = func3 == 3'h0;
assign jal_sys_cond2 = u_12 < 12'h2;
assign jal_sys_cond1 = func3 == 3'h0;
assign jal_sys_cond2 = u_12 < 12'h2;
assign jal_sys_jal = (jal_sys_cond1 && jal_sys_cond2) ? 1'b1 : 1'b0;
assign jal_sys_off = (jal_sys_cond1 && jal_sys_cond2) ? 32'hb0000000 : 32'hdeadbeef;
// System-instruction jump qualifier: set when both jal_sys_cond1 and
// jal_sys_cond2 hold.
assign jal_sys_jal = jal_sys_cond1 && jal_sys_cond2;
// Offset paired with that qualifier; 32'hdeadbeef is driven when it is not set.
assign jal_sys_off = jal_sys_jal ? 32'hb0000000 : 32'hdeadbeef;
// JAL
always @(*) begin
case(curr_opcode)
`JAL_INST:
begin
temp_jal = 1'b1 && (|in_valid);
temp_jal_offset = jal_1_offset;
end
`JALR_INST:
begin
temp_jal = 1'b1 && (|in_valid);
temp_jal_offset = jal_2_offset;
end
`SYS_INST:
begin
// $display("SYS EBREAK %h", (jal_sys_jal && (|in_valid)) );
temp_jal = jal_sys_jal && (|in_valid);
temp_jal_offset = jal_sys_off;
end
default:
begin
temp_jal = 1'b0 && (|in_valid);
temp_jal_offset = 32'hdeadbeef;
end
// JAL
always @(*) begin
case(curr_opcode)
`JAL_INST:
begin
temp_jal = 1'b1 && (|in_valid);
temp_jal_offset = jal_1_offset;
end
`JALR_INST:
begin
temp_jal = 1'b1 && (|in_valid);
temp_jal_offset = jal_2_offset;
end
`SYS_INST:
begin
// $display("SYS EBREAK %h", (jal_sys_jal && (|in_valid)) );
temp_jal = jal_sys_jal && (|in_valid);
temp_jal_offset = jal_sys_off;
end
default:
begin
temp_jal = 1'b0 && (|in_valid);
temp_jal_offset = 32'hdeadbeef;
end
endcase
end
assign vx_frE_to_bckE_req.jalQual = is_jal;
assign vx_frE_to_bckE_req.jal = temp_jal;
assign vx_frE_to_bckE_req.jal_offset = temp_jal_offset;
// wire is_ebreak;
// assign is_ebreak = is_e_inst;
wire ebreak = (curr_opcode == `SYS_INST) && (jal_sys_jal && (|in_valid));
assign vx_frE_to_bckE_req.ebreak = ebreak;
assign terminate_sim = is_e_inst;
// CSR
assign csr_cond1 = func3 != 3'h0;
assign csr_cond2 = u_12 >= 12'h2;
assign vx_frE_to_bckE_req.csr_address = (csr_cond1 && csr_cond2) ? u_12 : 12'h55;
// ITYPE IMMED
assign alu_shift_i = (func3 == 3'h1) || (func3 == 3'h5);
assign alu_shift_i_immed = {{7{1'b0}}, vx_frE_to_bckE_req.rs2};
assign alu_tempp = alu_shift_i ? alu_shift_i_immed : u_12;
always @(*) begin
case(curr_opcode)
`ALU_INST: temp_itype_immed = {{20{alu_tempp[11]}}, alu_tempp};
`S_INST: temp_itype_immed = {{20{func7[6]}}, func7, vx_frE_to_bckE_req.rd};
`L_INST: temp_itype_immed = {{20{u_12[11]}}, u_12};
`B_INST: temp_itype_immed = {{20{in_instruction[31]}}, in_instruction[31], in_instruction[7], in_instruction[30:25], in_instruction[11:8]};
default: temp_itype_immed = 32'hdeadbeef;
endcase
end
end
assign VX_frE_to_bckE_req.jalQual = is_jal;
assign VX_frE_to_bckE_req.jal = temp_jal;
assign VX_frE_to_bckE_req.jal_offset = temp_jal_offset;
assign vx_frE_to_bckE_req.itype_immed = temp_itype_immed;
// wire is_ebreak;
always @(*) begin
case(curr_opcode)
`B_INST:
begin
// $display("BRANCH IN DECODE");
temp_branch_stall = 1'b1 && (|in_valid);
case(func3)
3'h0: temp_branch_type = `BEQ;
3'h1: temp_branch_type = `BNE;
3'h4: temp_branch_type = `BLT;
3'h5: temp_branch_type = `BGT;
3'h6: temp_branch_type = `BLTU;
3'h7: temp_branch_type = `BGTU;
default: temp_branch_type = `NO_BRANCH;
endcase
end
`JAL_INST:
begin
temp_branch_type = `NO_BRANCH;
temp_branch_stall = 1'b1 && (|in_valid);
end
`JALR_INST:
begin
temp_branch_type = `NO_BRANCH;
temp_branch_stall = 1'b1 && (|in_valid);
end
default:
begin
temp_branch_type = `NO_BRANCH;
temp_branch_stall = 1'b0 && (|in_valid);
end
endcase
end
// assign is_ebreak = is_e_inst;
wire ebreak = (curr_opcode == `SYS_INST) && (jal_sys_jal && (|in_valid));
assign VX_frE_to_bckE_req.ebreak = ebreak;
wire out_ebreak = ebreak;
assign terminate_sim = is_e_inst;
assign vx_frE_to_bckE_req.branch_type = temp_branch_type;
// Warp stall request sent with the decoded instruction. It is raised when the
// instruction is a branch/jump (temp_branch_stall) or one of the GPGPU
// tmc / split / barrier operations, and at least one thread of the warp is valid.
assign vx_wstall.wstall = (temp_branch_stall || is_tmc || is_split || is_barrier) && (|in_valid);
// The stall applies to the warp the instruction was fetched for.
assign vx_wstall.warp_num = in_warp_num;
// CSR
always @(*) begin
// ALU OP
case(func3)
3'h0: alu_op = (curr_opcode == `ALU_INST) ? `ADD : (func7 == 7'h0 ? `ADD : `SUB);
3'h1: alu_op = `SLLA;
3'h2: alu_op = `SLT;
3'h3: alu_op = `SLTU;
3'h4: alu_op = `XOR;
3'h5: alu_op = (func7 == 7'h0) ? `SRL : `SRA;
3'h6: alu_op = `OR;
3'h7: alu_op = `AND;
default: alu_op = `NO_ALU;
endcase
end
assign csr_cond1 = func3 != 3'h0;
assign csr_cond2 = u_12 >= 12'h2;
always @(*) begin
// ALU OP
case(func3)
3'h0: mul_alu = `MUL;
3'h1: mul_alu = `MULH;
3'h2: mul_alu = `MULHSU;
3'h3: mul_alu = `MULHU;
3'h4: mul_alu = `DIV;
3'h5: mul_alu = `DIVU;
3'h6: mul_alu = `REM;
3'h7: mul_alu = `REMU;
default: mul_alu = `NO_ALU;
endcase
end
assign VX_frE_to_bckE_req.csr_address = (csr_cond1 && csr_cond2) ? u_12 : 12'h55;
assign csr_type = func3[1:0];
always @(*) begin
case(csr_type)
2'h1: csr_alu = `CSR_ALU_RW;
2'h2: csr_alu = `CSR_ALU_RS;
2'h3: csr_alu = `CSR_ALU_RC;
default: csr_alu = `NO_ALU;
endcase
end
// ITYPE IMMED
assign alu_shift_i = (func3 == 3'h1) || (func3 == 3'h5);
assign alu_shift_i_immed = {{7{1'b0}}, VX_frE_to_bckE_req.rs2};
assign alu_tempp = alu_shift_i ? alu_shift_i_immed : u_12;
// ALU operation for everything except the multiply/divide group, chosen by
// instruction class in priority order:
//   branch       -> `SUB when branch_type is below `BLTU, otherwise `SUBU
//   LUI / AUIPC  -> their dedicated ALU ops
//   CSR          -> csr_alu (decoded from func3[1:0])
//   store / load -> `ADD
//   anything else falls back to alu_op (decoded from func3 / func7).
wire[4:0] temp_final_alu;
assign temp_final_alu = is_btype ? ((vx_frE_to_bckE_req.branch_type < `BLTU) ? `SUB : `SUBU) :
is_lui ? `LUI_ALU :
is_auipc ? `AUIPC_ALU :
is_csr ? csr_alu :
(is_stype || is_linst) ? `ADD :
alu_op;
always @(*) begin
case(curr_opcode)
`ALU_INST: temp_itype_immed = {{20{alu_tempp[11]}}, alu_tempp};
`S_INST: temp_itype_immed = {{20{func7[6]}}, func7, VX_frE_to_bckE_req.rd};
`L_INST: temp_itype_immed = {{20{u_12[11]}}, u_12};
`B_INST: temp_itype_immed = {{20{in_instruction[31]}}, in_instruction[31], in_instruction[7], in_instruction[30:25], in_instruction[11:8]};
default: temp_itype_immed = 32'hdeadbeef;
endcase
end
assign VX_frE_to_bckE_req.itype_immed = temp_itype_immed;
always @(*) begin
case(curr_opcode)
`B_INST:
begin
// $display("BRANCH IN DECODE");
temp_branch_stall = 1'b1 && (|in_valid);
case(func3)
3'h0: temp_branch_type = `BEQ;
3'h1: temp_branch_type = `BNE;
3'h4: temp_branch_type = `BLT;
3'h5: temp_branch_type = `BGT;
3'h6: temp_branch_type = `BLTU;
3'h7: temp_branch_type = `BGTU;
default: temp_branch_type = `NO_BRANCH;
endcase
end
`JAL_INST:
begin
temp_branch_type = `NO_BRANCH;
temp_branch_stall = 1'b1 && (|in_valid);
end
`JALR_INST:
begin
temp_branch_type = `NO_BRANCH;
temp_branch_stall = 1'b1 && (|in_valid);
end
default:
begin
temp_branch_type = `NO_BRANCH;
temp_branch_stall = 1'b0 && (|in_valid);
end
endcase
end
assign VX_frE_to_bckE_req.branch_type = temp_branch_type;
assign VX_wstall.wstall = (temp_branch_stall || is_tmc || is_split || is_barrier) && (|in_valid);
assign VX_wstall.warp_num = in_warp_num;
always @(*) begin
// ALU OP
case(func3)
3'h0: alu_op = (curr_opcode == `ALU_INST) ? `ADD : (func7 == 7'h0 ? `ADD : `SUB);
3'h1: alu_op = `SLLA;
3'h2: alu_op = `SLT;
3'h3: alu_op = `SLTU;
3'h4: alu_op = `XOR;
3'h5: alu_op = (func7 == 7'h0) ? `SRL : `SRA;
3'h6: alu_op = `OR;
3'h7: alu_op = `AND;
default: alu_op = `NO_ALU;
endcase
end
always @(*) begin
// ALU OP
case(func3)
3'h0: mul_alu = `MUL;
3'h1: mul_alu = `MULH;
3'h2: mul_alu = `MULHSU;
3'h3: mul_alu = `MULHU;
3'h4: mul_alu = `DIV;
3'h5: mul_alu = `DIVU;
3'h6: mul_alu = `REM;
3'h7: mul_alu = `REMU;
default: mul_alu = `NO_ALU;
endcase
end
assign csr_type = func3[1:0];
always @(*) begin
case(csr_type)
2'h1: csr_alu = `CSR_ALU_RW;
2'h2: csr_alu = `CSR_ALU_RS;
2'h3: csr_alu = `CSR_ALU_RC;
default: csr_alu = `NO_ALU;
endcase
end
wire[4:0] temp_final_alu;
assign temp_final_alu = is_btype ? ((VX_frE_to_bckE_req.branch_type < `BLTU) ? `SUB : `SUBU) :
is_lui ? `LUI_ALU :
is_auipc ? `AUIPC_ALU :
is_csr ? csr_alu :
(is_stype || is_linst) ? `ADD :
alu_op;
assign VX_frE_to_bckE_req.alu_op = ((func7[0] == 1'b1) && is_rtype) ? mul_alu : temp_final_alu;
assign vx_frE_to_bckE_req.alu_op = ((func7[0] == 1'b1) && is_rtype) ? mul_alu : temp_final_alu;
endmodule

View File

@@ -9,15 +9,35 @@
// `define ASIC 1
// `define SYN_FUNC 1
`define STRINGIFY(x) `"x`"
`define STATIC_ASSERT(cond, msg) \
generate \
if (!(cond)) $error(msg); \
endgenerate
// Evaluates to $clog2(x) when x is greater than 1 and to 1 otherwise.
// The argument is parenthesised so that an expression argument containing
// lower-precedence operators (for example ?:) is compared as a whole.
`define LOG2UP(x) (((x) > 1) ? $clog2(x) : 1)
`define NUM_CORES_PER_CLUSTER (`NUM_CORES / `NUM_CLUSTERS)
`define NW_BITS `LOG2UP(`NUM_WARPS)
`define NW_BITS (`LOG2UP(`NUM_WARPS))
`define NT_BITS `LOG2UP(`NUM_THREADS)
`define NT_BITS (`LOG2UP(`NUM_THREADS))
`define NC_BITS `LOG2UP(`NUM_CORES)
`define NC_BITS (`LOG2UP(`NUM_CORES))
`define NUM_GPRS 32
// Width in bits of a CSR address as carried on module ports.
`define CSR_ADDR_SIZE 12

// Number of entries in the CSR storage array.
`define NUM_CSRS 1024

// Width in bits of each stored CSR entry.
`define CSR_WIDTH 12

// Addresses of the read-only counters: low/high words of the cycle counter and
// of the retired-instruction counter. The values carry no trailing semicolon so
// the macros can be used inside any expression; each use site supplies its own
// statement terminator.
`define CSR_CYCL_L 12'hC00
`define CSR_CYCL_H 12'hC80
`define CSR_INST_L 12'hC02
`define CSR_INST_H 12'hC82
`define R_INST 7'd51
`define L_INST 7'd3
@@ -115,6 +135,9 @@
// Bank Number of words in a line
`define DBANK_LINE_WORDS (`DBANK_LINE_SIZE_BYTES / `DWORD_SIZE_BYTES)
// Word size in bits
`define DWORD_SIZE_BITS (`DWORD_SIZE_BYTES * 8)
// ======================= Icache Configurable Knobs ==========================
// Function ID

View File

@@ -5,78 +5,69 @@ module VX_dmem_controller (
input wire reset,
// Dram <-> Dcache
VX_gpu_dcache_dram_req_inter VX_gpu_dcache_dram_req,
VX_gpu_dcache_dram_res_inter VX_gpu_dcache_dram_res,
VX_gpu_snp_req_rsp VX_gpu_dcache_snp_req,
VX_gpu_dcache_dram_req_inter vx_gpu_dcache_dram_req,
VX_gpu_dcache_dram_rsp_inter vx_gpu_dcache_dram_res,
VX_gpu_snp_req_rsp vx_gpu_dcache_snp_req,
// Dram <-> Icache
VX_gpu_dcache_dram_req_inter VX_gpu_icache_dram_req,
VX_gpu_dcache_dram_res_inter VX_gpu_icache_dram_res,
VX_gpu_snp_req_rsp VX_gpu_icache_snp_req,
VX_gpu_dcache_dram_req_inter vx_gpu_icache_dram_req,
VX_gpu_dcache_dram_rsp_inter vx_gpu_icache_dram_res,
VX_gpu_snp_req_rsp vx_gpu_icache_snp_req,
// Core <-> Dcache
VX_gpu_dcache_res_inter VX_dcache_rsp,
VX_gpu_dcache_req_inter VX_dcache_req,
VX_gpu_dcache_rsp_inter vx_dcache_rsp,
VX_gpu_dcache_req_inter vx_dcache_req,
// Core <-> Icache
VX_gpu_dcache_res_inter VX_icache_rsp,
VX_gpu_dcache_req_inter VX_icache_req
VX_gpu_dcache_rsp_inter vx_icache_rsp,
VX_gpu_dcache_req_inter vx_icache_req
);
VX_gpu_dcache_rsp_inter #(.NUM_REQUESTS(`DNUM_REQUESTS)) vx_dcache_rsp_smem();
VX_gpu_dcache_req_inter #(.NUM_REQUESTS(`DNUM_REQUESTS)) vx_dcache_req_smem();
VX_gpu_dcache_res_inter #(.NUM_REQUESTS(`DNUM_REQUESTS)) VX_dcache_rsp_smem();
VX_gpu_dcache_req_inter #(.NUM_REQUESTS(`DNUM_REQUESTS)) VX_dcache_req_smem();
VX_gpu_dcache_rsp_inter #(.NUM_REQUESTS(`DNUM_REQUESTS)) vx_dcache_rsp_dcache();
VX_gpu_dcache_req_inter #(.NUM_REQUESTS(`DNUM_REQUESTS)) vx_dcache_req_dcache();
VX_gpu_dcache_res_inter #(.NUM_REQUESTS(`DNUM_REQUESTS)) VX_dcache_rsp_dcache();
VX_gpu_dcache_req_inter #(.NUM_REQUESTS(`DNUM_REQUESTS)) VX_dcache_req_dcache();
wire to_shm = VX_dcache_req.core_req_addr[0][31:24] == 8'hFF;
wire dcache_wants_wb = (|VX_dcache_rsp_dcache.core_wb_valid);
wire to_shm = vx_dcache_req.core_req_addr[0][31:24] == 8'hFF;
wire dcache_wants_wb = (|vx_dcache_rsp_dcache.core_wb_valid);
// Dcache Request
assign VX_dcache_req_dcache.core_req_valid = VX_dcache_req.core_req_valid & {`NUM_THREADS{~to_shm}};
assign VX_dcache_req_dcache.core_req_addr = VX_dcache_req.core_req_addr;
assign VX_dcache_req_dcache.core_req_writedata = VX_dcache_req.core_req_writedata;
assign VX_dcache_req_dcache.core_req_mem_read = VX_dcache_req.core_req_mem_read;
assign VX_dcache_req_dcache.core_req_mem_write = VX_dcache_req.core_req_mem_write;
assign VX_dcache_req_dcache.core_req_rd = VX_dcache_req.core_req_rd;
assign VX_dcache_req_dcache.core_req_wb = VX_dcache_req.core_req_wb;
assign VX_dcache_req_dcache.core_req_warp_num = VX_dcache_req.core_req_warp_num;
assign VX_dcache_req_dcache.core_req_pc = VX_dcache_req.core_req_pc;
assign VX_dcache_req_dcache.core_no_wb_slot = VX_dcache_req.core_no_wb_slot;
assign vx_dcache_req_dcache.core_req_valid = vx_dcache_req.core_req_valid & {`NUM_THREADS{~to_shm}};
assign vx_dcache_req_dcache.core_req_addr = vx_dcache_req.core_req_addr;
assign vx_dcache_req_dcache.core_req_writedata = vx_dcache_req.core_req_writedata;
assign vx_dcache_req_dcache.core_req_mem_read = vx_dcache_req.core_req_mem_read;
assign vx_dcache_req_dcache.core_req_mem_write = vx_dcache_req.core_req_mem_write;
assign vx_dcache_req_dcache.core_req_rd = vx_dcache_req.core_req_rd;
assign vx_dcache_req_dcache.core_req_wb = vx_dcache_req.core_req_wb;
assign vx_dcache_req_dcache.core_req_warp_num = vx_dcache_req.core_req_warp_num;
assign vx_dcache_req_dcache.core_req_pc = vx_dcache_req.core_req_pc;
assign vx_dcache_req_dcache.core_no_wb_slot = vx_dcache_req.core_no_wb_slot;
// Shared Memory Request
assign VX_dcache_req_smem.core_req_valid = VX_dcache_req.core_req_valid & {`NUM_THREADS{to_shm}};
assign VX_dcache_req_smem.core_req_addr = VX_dcache_req.core_req_addr;
assign VX_dcache_req_smem.core_req_writedata = VX_dcache_req.core_req_writedata;
assign VX_dcache_req_smem.core_req_mem_read = VX_dcache_req.core_req_mem_read;
assign VX_dcache_req_smem.core_req_mem_write = VX_dcache_req.core_req_mem_write;
assign VX_dcache_req_smem.core_req_rd = VX_dcache_req.core_req_rd;
assign VX_dcache_req_smem.core_req_wb = VX_dcache_req.core_req_wb;
assign VX_dcache_req_smem.core_req_warp_num = VX_dcache_req.core_req_warp_num;
assign VX_dcache_req_smem.core_req_pc = VX_dcache_req.core_req_pc;
assign VX_dcache_req_smem.core_no_wb_slot = VX_dcache_req.core_no_wb_slot || dcache_wants_wb;
assign vx_dcache_req_smem.core_req_valid = vx_dcache_req.core_req_valid & {`NUM_THREADS{to_shm}};
assign vx_dcache_req_smem.core_req_addr = vx_dcache_req.core_req_addr;
assign vx_dcache_req_smem.core_req_writedata = vx_dcache_req.core_req_writedata;
assign vx_dcache_req_smem.core_req_mem_read = vx_dcache_req.core_req_mem_read;
assign vx_dcache_req_smem.core_req_mem_write = vx_dcache_req.core_req_mem_write;
assign vx_dcache_req_smem.core_req_rd = vx_dcache_req.core_req_rd;
assign vx_dcache_req_smem.core_req_wb = vx_dcache_req.core_req_wb;
assign vx_dcache_req_smem.core_req_warp_num = vx_dcache_req.core_req_warp_num;
assign vx_dcache_req_smem.core_req_pc = vx_dcache_req.core_req_pc;
assign vx_dcache_req_smem.core_no_wb_slot = vx_dcache_req.core_no_wb_slot || dcache_wants_wb;
// Dcache Response
assign VX_dcache_rsp.core_wb_valid = dcache_wants_wb ? VX_dcache_rsp_dcache.core_wb_valid : VX_dcache_rsp_smem.core_wb_valid;
assign VX_dcache_rsp.core_wb_req_rd = dcache_wants_wb ? VX_dcache_rsp_dcache.core_wb_req_rd : VX_dcache_rsp_smem.core_wb_req_rd;
assign VX_dcache_rsp.core_wb_req_wb = dcache_wants_wb ? VX_dcache_rsp_dcache.core_wb_req_wb : VX_dcache_rsp_smem.core_wb_req_wb;
assign VX_dcache_rsp.core_wb_warp_num = dcache_wants_wb ? VX_dcache_rsp_dcache.core_wb_warp_num : VX_dcache_rsp_smem.core_wb_warp_num;
assign VX_dcache_rsp.core_wb_readdata = dcache_wants_wb ? VX_dcache_rsp_dcache.core_wb_readdata : VX_dcache_rsp_smem.core_wb_readdata;
assign VX_dcache_rsp.core_wb_pc = dcache_wants_wb ? VX_dcache_rsp_dcache.core_wb_pc : VX_dcache_rsp_smem.core_wb_pc;
assign VX_dcache_rsp.delay_req = to_shm ? VX_dcache_rsp_smem.delay_req : VX_dcache_rsp_dcache.delay_req;
VX_gpu_dcache_dram_req_inter #(.BANK_LINE_WORDS(`DBANK_LINE_WORDS)) VX_gpu_smem_dram_req();
VX_gpu_dcache_dram_res_inter #(.BANK_LINE_WORDS(`DBANK_LINE_WORDS)) VX_gpu_smem_dram_res();
assign vx_dcache_rsp.core_wb_valid = dcache_wants_wb ? vx_dcache_rsp_dcache.core_wb_valid : vx_dcache_rsp_smem.core_wb_valid;
assign vx_dcache_rsp.core_wb_req_rd = dcache_wants_wb ? vx_dcache_rsp_dcache.core_wb_req_rd : vx_dcache_rsp_smem.core_wb_req_rd;
assign vx_dcache_rsp.core_wb_req_wb = dcache_wants_wb ? vx_dcache_rsp_dcache.core_wb_req_wb : vx_dcache_rsp_smem.core_wb_req_wb;
assign vx_dcache_rsp.core_wb_warp_num = dcache_wants_wb ? vx_dcache_rsp_dcache.core_wb_warp_num : vx_dcache_rsp_smem.core_wb_warp_num;
assign vx_dcache_rsp.core_wb_readdata = dcache_wants_wb ? vx_dcache_rsp_dcache.core_wb_readdata : vx_dcache_rsp_smem.core_wb_readdata;
assign vx_dcache_rsp.core_wb_pc = dcache_wants_wb ? vx_dcache_rsp_dcache.core_wb_pc : vx_dcache_rsp_smem.core_wb_pc;
assign vx_dcache_rsp.delay_req = to_shm ? vx_dcache_rsp_smem.delay_req : vx_dcache_rsp_dcache.delay_req;
VX_gpu_dcache_dram_req_inter #(.BANK_LINE_WORDS(`DBANK_LINE_WORDS)) vx_gpu_smem_dram_req();
VX_gpu_dcache_dram_rsp_inter #(.BANK_LINE_WORDS(`DBANK_LINE_WORDS)) vx_gpu_smem_dram_res();
VX_cache #(
.CACHE_SIZE_BYTES (`SCACHE_SIZE_BYTES),
@@ -99,69 +90,67 @@ module VX_dmem_controller (
.PRFQ_STRIDE (`SPRFQ_STRIDE),
.FILL_INVALIDAOR_SIZE (`SFILL_INVALIDAOR_SIZE),
.SIMULATED_DRAM_LATENCY_CYCLES(`SSIMULATED_DRAM_LATENCY_CYCLES)
)
gpu_smem
(
) gpu_smem (
.clk (clk),
.reset (reset),
// Core req
.core_req_valid (VX_dcache_req_smem.core_req_valid),
.core_req_addr (VX_dcache_req_smem.core_req_addr),
.core_req_writedata(VX_dcache_req_smem.core_req_writedata),
.core_req_mem_read (VX_dcache_req_smem.core_req_mem_read),
.core_req_mem_write(VX_dcache_req_smem.core_req_mem_write),
.core_req_rd (VX_dcache_req_smem.core_req_rd),
.core_req_wb (VX_dcache_req_smem.core_req_wb),
.core_req_warp_num (VX_dcache_req_smem.core_req_warp_num),
.core_req_pc (VX_dcache_req_smem.core_req_pc),
.core_req_valid (vx_dcache_req_smem.core_req_valid),
.core_req_mem_read (vx_dcache_req_smem.core_req_mem_read),
.core_req_mem_write(vx_dcache_req_smem.core_req_mem_write),
.core_req_addr (vx_dcache_req_smem.core_req_addr),
.core_req_writedata(vx_dcache_req_smem.core_req_writedata),
.core_req_rd (vx_dcache_req_smem.core_req_rd),
.core_req_wb (vx_dcache_req_smem.core_req_wb),
.core_req_warp_num (vx_dcache_req_smem.core_req_warp_num),
.core_req_pc (vx_dcache_req_smem.core_req_pc),
// Delay Core Req
.delay_req (VX_dcache_rsp_smem.delay_req),
.delay_req (vx_dcache_rsp_smem.delay_req),
// Core Cache Can't WB
.core_no_wb_slot (VX_dcache_req_smem.core_no_wb_slot),
.core_no_wb_slot (vx_dcache_req_smem.core_no_wb_slot),
// Cache CWB
.core_wb_valid (VX_dcache_rsp_smem.core_wb_valid),
.core_wb_req_rd (VX_dcache_rsp_smem.core_wb_req_rd),
.core_wb_req_wb (VX_dcache_rsp_smem.core_wb_req_wb),
.core_wb_warp_num (VX_dcache_rsp_smem.core_wb_warp_num),
.core_wb_readdata (VX_dcache_rsp_smem.core_wb_readdata),
.core_wb_pc (VX_dcache_rsp_smem.core_wb_pc),
.core_wb_address (),
.core_wb_valid (vx_dcache_rsp_smem.core_wb_valid),
.core_wb_req_rd (vx_dcache_rsp_smem.core_wb_req_rd),
.core_wb_req_wb (vx_dcache_rsp_smem.core_wb_req_wb),
.core_wb_warp_num (vx_dcache_rsp_smem.core_wb_warp_num),
.core_wb_readdata (vx_dcache_rsp_smem.core_wb_readdata),
.core_wb_pc (vx_dcache_rsp_smem.core_wb_pc),
/* verilator lint_off PINCONNECTEMPTY */
.core_wb_address (),
/* verilator lint_on PINCONNECTEMPTY */
// DRAM response
.dram_fill_rsp (VX_gpu_smem_dram_res.dram_fill_rsp),
.dram_fill_rsp_addr(VX_gpu_smem_dram_res.dram_fill_rsp_addr),
.dram_fill_rsp_data(VX_gpu_smem_dram_res.dram_fill_rsp_data),
.dram_rsp_valid (vx_gpu_smem_dram_res.dram_rsp_valid),
.dram_rsp_addr (vx_gpu_smem_dram_res.dram_rsp_addr),
.dram_rsp_data (vx_gpu_smem_dram_res.dram_rsp_data),
// DRAM accept response
.dram_fill_accept (VX_gpu_smem_dram_req.dram_fill_accept),
.dram_rsp_ready (vx_gpu_smem_dram_req.dram_rsp_ready),
// DRAM Req
.dram_req (VX_gpu_smem_dram_req.dram_req),
.dram_req_write (VX_gpu_smem_dram_req.dram_req_write),
.dram_req_read (VX_gpu_smem_dram_req.dram_req_read),
.dram_req_addr (VX_gpu_smem_dram_req.dram_req_addr),
.dram_req_size (VX_gpu_smem_dram_req.dram_req_size),
.dram_req_data (VX_gpu_smem_dram_req.dram_req_data),
.dram_req_delay (1),
// Snoop Response
.dram_req_because_of_wb(VX_gpu_smem_dram_req.dram_because_of_snp),
.dram_snp_full (VX_gpu_smem_dram_req.dram_snp_full),
.dram_req_read (vx_gpu_smem_dram_req.dram_req_read),
.dram_req_write (vx_gpu_smem_dram_req.dram_req_write),
.dram_req_addr (vx_gpu_smem_dram_req.dram_req_addr),
.dram_req_data (vx_gpu_smem_dram_req.dram_req_data),
.dram_req_full (1),
// Snoop Request
.snp_req (0),
.snp_req_addr (0),
.snp_req_delay (),
.snp_req_valid (0),
.snp_req_addr (0),
/* verilator lint_off PINCONNECTEMPTY */
.snp_req_full (),
/* verilator lint_on PINCONNECTEMPTY */
// Snoop Forward
.snp_fwd (),
.snp_fwd_addr (),
.snp_fwd_delay (0)
);
/* verilator lint_off PINCONNECTEMPTY */
.snp_fwd_valid (),
.snp_fwd_addr (),
/* verilator lint_on PINCONNECTEMPTY */
.snp_fwd_full (0)
);
VX_cache #(
.CACHE_SIZE_BYTES (`DCACHE_SIZE_BYTES),
@@ -184,72 +173,65 @@ module VX_dmem_controller (
.PRFQ_STRIDE (`DPRFQ_STRIDE),
.FILL_INVALIDAOR_SIZE (`DFILL_INVALIDAOR_SIZE),
.SIMULATED_DRAM_LATENCY_CYCLES(`DSIMULATED_DRAM_LATENCY_CYCLES)
)
gpu_dcache
(
) gpu_dcache (
.clk (clk),
.reset (reset),
// Core req
.core_req_valid (VX_dcache_req_dcache.core_req_valid),
.core_req_addr (VX_dcache_req_dcache.core_req_addr),
.core_req_writedata(VX_dcache_req_dcache.core_req_writedata),
.core_req_mem_read (VX_dcache_req_dcache.core_req_mem_read),
.core_req_mem_write(VX_dcache_req_dcache.core_req_mem_write),
.core_req_rd (VX_dcache_req_dcache.core_req_rd),
.core_req_wb (VX_dcache_req_dcache.core_req_wb),
.core_req_warp_num (VX_dcache_req_dcache.core_req_warp_num),
.core_req_pc (VX_dcache_req_dcache.core_req_pc),
.core_req_valid (vx_dcache_req_dcache.core_req_valid),
.core_req_mem_read (vx_dcache_req_dcache.core_req_mem_read),
.core_req_mem_write(vx_dcache_req_dcache.core_req_mem_write),
.core_req_addr (vx_dcache_req_dcache.core_req_addr),
.core_req_writedata(vx_dcache_req_dcache.core_req_writedata),
.core_req_rd (vx_dcache_req_dcache.core_req_rd),
.core_req_wb (vx_dcache_req_dcache.core_req_wb),
.core_req_warp_num (vx_dcache_req_dcache.core_req_warp_num),
.core_req_pc (vx_dcache_req_dcache.core_req_pc),
// Delay Core Req
.delay_req (VX_dcache_rsp_dcache.delay_req),
.delay_req (vx_dcache_rsp_dcache.delay_req),
// Core Cache Can't WB
.core_no_wb_slot (VX_dcache_req_dcache.core_no_wb_slot),
.core_no_wb_slot (vx_dcache_req_dcache.core_no_wb_slot),
// Cache CWB
.core_wb_valid (VX_dcache_rsp_dcache.core_wb_valid),
.core_wb_req_rd (VX_dcache_rsp_dcache.core_wb_req_rd),
.core_wb_req_wb (VX_dcache_rsp_dcache.core_wb_req_wb),
.core_wb_warp_num (VX_dcache_rsp_dcache.core_wb_warp_num),
.core_wb_readdata (VX_dcache_rsp_dcache.core_wb_readdata),
.core_wb_pc (VX_dcache_rsp_dcache.core_wb_pc),
.core_wb_valid (vx_dcache_rsp_dcache.core_wb_valid),
.core_wb_req_rd (vx_dcache_rsp_dcache.core_wb_req_rd),
.core_wb_req_wb (vx_dcache_rsp_dcache.core_wb_req_wb),
.core_wb_warp_num (vx_dcache_rsp_dcache.core_wb_warp_num),
.core_wb_readdata (vx_dcache_rsp_dcache.core_wb_readdata),
.core_wb_pc (vx_dcache_rsp_dcache.core_wb_pc),
/* verilator lint_off PINCONNECTEMPTY */
.core_wb_address (),
/* verilator lint_on PINCONNECTEMPTY */
// DRAM response
.dram_fill_rsp (VX_gpu_dcache_dram_res.dram_fill_rsp),
.dram_fill_rsp_addr(VX_gpu_dcache_dram_res.dram_fill_rsp_addr),
.dram_fill_rsp_data(VX_gpu_dcache_dram_res.dram_fill_rsp_data),
.dram_rsp_valid (vx_gpu_dcache_dram_res.dram_rsp_valid),
.dram_rsp_addr (vx_gpu_dcache_dram_res.dram_rsp_addr),
.dram_rsp_data (vx_gpu_dcache_dram_res.dram_rsp_data),
// DRAM accept response
.dram_fill_accept (VX_gpu_dcache_dram_req.dram_fill_accept),
.dram_rsp_ready (vx_gpu_dcache_dram_req.dram_rsp_ready),
// DRAM Req
.dram_req (VX_gpu_dcache_dram_req.dram_req),
.dram_req_write (VX_gpu_dcache_dram_req.dram_req_write),
.dram_req_read (VX_gpu_dcache_dram_req.dram_req_read),
.dram_req_addr (VX_gpu_dcache_dram_req.dram_req_addr),
.dram_req_size (VX_gpu_dcache_dram_req.dram_req_size),
.dram_req_data (VX_gpu_dcache_dram_req.dram_req_data),
.dram_req_delay (VX_gpu_dcache_dram_req.dram_req_delay),
// Snoop Response
.dram_req_because_of_wb(VX_gpu_dcache_dram_req.dram_because_of_snp),
.dram_snp_full (VX_gpu_dcache_dram_req.dram_snp_full),
.dram_req_read (vx_gpu_dcache_dram_req.dram_req_read),
.dram_req_write (vx_gpu_dcache_dram_req.dram_req_write),
.dram_req_addr (vx_gpu_dcache_dram_req.dram_req_addr),
.dram_req_data (vx_gpu_dcache_dram_req.dram_req_data),
.dram_req_full (vx_gpu_dcache_dram_req.dram_req_full),
// Snoop Request
.snp_req (VX_gpu_dcache_snp_req.snp_req),
.snp_req_addr (VX_gpu_dcache_snp_req.snp_req_addr),
.snp_req_delay (VX_gpu_dcache_snp_req.snp_delay),
.snp_req_valid (vx_gpu_dcache_snp_req.snp_req_valid),
.snp_req_addr (vx_gpu_dcache_snp_req.snp_req_addr),
.snp_req_full (vx_gpu_dcache_snp_req.snp_req_full),
// Snoop Forward
.snp_fwd (),
.snp_fwd_addr (),
.snp_fwd_delay (0)
);
/* verilator lint_off PINCONNECTEMPTY */
.snp_fwd_valid (),
.snp_fwd_addr (),
/* verilator lint_on PINCONNECTEMPTY */
.snp_fwd_full (0)
);
VX_cache #(
.CACHE_SIZE_BYTES (`ICACHE_SIZE_BYTES),
@@ -272,71 +254,64 @@ module VX_dmem_controller (
.PRFQ_STRIDE (`IPRFQ_STRIDE),
.FILL_INVALIDAOR_SIZE (`IFILL_INVALIDAOR_SIZE),
.SIMULATED_DRAM_LATENCY_CYCLES(`ISIMULATED_DRAM_LATENCY_CYCLES)
)
gpu_icache
(
.clk (clk),
.reset (reset),
) gpu_icache (
.clk (clk),
.reset (reset),
// Core req
.core_req_valid (VX_icache_req.core_req_valid),
.core_req_addr (VX_icache_req.core_req_addr),
.core_req_writedata(VX_icache_req.core_req_writedata),
.core_req_mem_read (VX_icache_req.core_req_mem_read),
.core_req_mem_write(VX_icache_req.core_req_mem_write),
.core_req_rd (VX_icache_req.core_req_rd),
.core_req_wb (VX_icache_req.core_req_wb),
.core_req_warp_num (VX_icache_req.core_req_warp_num),
.core_req_pc (VX_icache_req.core_req_pc),
.core_req_valid (vx_icache_req.core_req_valid),
.core_req_mem_read (vx_icache_req.core_req_mem_read),
.core_req_mem_write (vx_icache_req.core_req_mem_write),
.core_req_addr (vx_icache_req.core_req_addr),
.core_req_writedata (vx_icache_req.core_req_writedata),
.core_req_rd (vx_icache_req.core_req_rd),
.core_req_wb (vx_icache_req.core_req_wb),
.core_req_warp_num (vx_icache_req.core_req_warp_num),
.core_req_pc (vx_icache_req.core_req_pc),
// Delay Core Req
.delay_req (VX_icache_rsp.delay_req),
.delay_req (vx_icache_rsp.delay_req),
// Core Cache Can't WB
.core_no_wb_slot (VX_icache_req.core_no_wb_slot),
.core_no_wb_slot (vx_icache_req.core_no_wb_slot),
// Cache CWB
.core_wb_valid (VX_icache_rsp.core_wb_valid),
.core_wb_req_rd (VX_icache_rsp.core_wb_req_rd),
.core_wb_req_wb (VX_icache_rsp.core_wb_req_wb),
.core_wb_warp_num (VX_icache_rsp.core_wb_warp_num),
.core_wb_readdata (VX_icache_rsp.core_wb_readdata),
.core_wb_pc (VX_icache_rsp.core_wb_pc),
.core_wb_address (),
.core_wb_valid (vx_icache_rsp.core_wb_valid),
.core_wb_req_rd (vx_icache_rsp.core_wb_req_rd),
.core_wb_req_wb (vx_icache_rsp.core_wb_req_wb),
.core_wb_warp_num (vx_icache_rsp.core_wb_warp_num),
.core_wb_readdata (vx_icache_rsp.core_wb_readdata),
.core_wb_pc (vx_icache_rsp.core_wb_pc),
/* verilator lint_off PINCONNECTEMPTY */
.core_wb_address (),
/* verilator lint_on PINCONNECTEMPTY */
// DRAM response
.dram_fill_rsp (VX_gpu_icache_dram_res.dram_fill_rsp),
.dram_fill_rsp_addr(VX_gpu_icache_dram_res.dram_fill_rsp_addr),
.dram_fill_rsp_data(VX_gpu_icache_dram_res.dram_fill_rsp_data),
.dram_rsp_valid (vx_gpu_icache_dram_res.dram_rsp_valid),
.dram_rsp_addr (vx_gpu_icache_dram_res.dram_rsp_addr),
.dram_rsp_data (vx_gpu_icache_dram_res.dram_rsp_data),
// DRAM accept response
.dram_fill_accept (VX_gpu_icache_dram_req.dram_fill_accept),
.dram_rsp_ready (vx_gpu_icache_dram_req.dram_rsp_ready),
// DRAM Req
.dram_req (VX_gpu_icache_dram_req.dram_req),
.dram_req_write (VX_gpu_icache_dram_req.dram_req_write),
.dram_req_read (VX_gpu_icache_dram_req.dram_req_read),
.dram_req_addr (VX_gpu_icache_dram_req.dram_req_addr),
.dram_req_size (VX_gpu_icache_dram_req.dram_req_size),
.dram_req_data (VX_gpu_icache_dram_req.dram_req_data),
.dram_req_delay (VX_gpu_icache_dram_req.dram_req_delay),
// Snoop Response
.dram_req_because_of_wb(VX_gpu_icache_dram_req.dram_because_of_snp),
.dram_snp_full (VX_gpu_icache_dram_req.dram_snp_full),
.dram_req_read (vx_gpu_icache_dram_req.dram_req_read),
.dram_req_write (vx_gpu_icache_dram_req.dram_req_write),
.dram_req_addr (vx_gpu_icache_dram_req.dram_req_addr),
.dram_req_data (vx_gpu_icache_dram_req.dram_req_data),
.dram_req_full (vx_gpu_icache_dram_req.dram_req_full),
// Snoop Request
.snp_req (VX_gpu_icache_snp_req.snp_req),
.snp_req_addr (VX_gpu_icache_snp_req.snp_req_addr),
.snp_req_delay (VX_gpu_icache_snp_req.snp_delay),
.snp_req_valid (vx_gpu_icache_snp_req.snp_req_valid),
.snp_req_addr (vx_gpu_icache_snp_req.snp_req_addr),
.snp_req_full (vx_gpu_icache_snp_req.snp_req_full),
// Snoop Forward
.snp_fwd (),
.snp_fwd_addr (),
.snp_fwd_delay (0)
);
/* verilator lint_off PINCONNECTEMPTY */
.snp_fwd_valid (),
.snp_fwd_addr (),
/* verilator lint_on PINCONNECTEMPTY */
.snp_fwd_full (0)
);
endmodule

View File

// VX_execute_unit: takes a request on the exec-unit request interface and
// drives three outputs: the ALU writeback (combinational, with wb_valid gated
// by internal_stall) and the JAL and branch responses (each passed through a
// VX_generic_register). out_delay is raised when no_slot_exec or
// internal_stall is set.
// The VX_* / vx_* pairs below appear to be the same statements before and
// after the interface-instance renaming shown by this listing.
@@ -4,15 +4,15 @@ module VX_execute_unit (
input wire clk,
input wire reset,
// Request
VX_exec_unit_req_inter VX_exec_unit_req,
VX_exec_unit_req_inter vx_exec_unit_req,
// Output
// Writeback
VX_inst_exec_wb_inter VX_inst_exec_wb,
VX_inst_exec_wb_inter vx_inst_exec_wb,
// JAL Response
VX_jal_response_inter VX_jal_rsp,
VX_jal_response_inter vx_jal_rsp,
// Branch Response
VX_branch_response_inter VX_branch_rsp,
VX_branch_response_inter vx_branch_rsp,
input wire no_slot_exec,
output wire out_delay
@@ -23,23 +23,24 @@ module VX_execute_unit (
wire[4:0] in_alu_op;
wire in_rs2_src;
wire[31:0] in_itype_immed;
/* verilator lint_off UNUSED */
wire[2:0] in_branch_type;
/* verilator lint_on UNUSED */
wire[19:0] in_upper_immed;
wire in_jal;
wire[31:0] in_jal_offset;
wire[31:0] in_curr_PC;
// Local copies of the request fields used by the logic below.
assign in_a_reg_data = VX_exec_unit_req.a_reg_data;
assign in_b_reg_data = VX_exec_unit_req.b_reg_data;
assign in_alu_op = VX_exec_unit_req.alu_op;
assign in_rs2_src = VX_exec_unit_req.rs2_src;
assign in_itype_immed = VX_exec_unit_req.itype_immed;
assign in_branch_type = VX_exec_unit_req.branch_type;
assign in_upper_immed = VX_exec_unit_req.upper_immed;
assign in_jal = VX_exec_unit_req.jal;
assign in_jal_offset = VX_exec_unit_req.jal_offset;
assign in_curr_PC = VX_exec_unit_req.curr_PC;
assign in_a_reg_data = vx_exec_unit_req.a_reg_data;
assign in_b_reg_data = vx_exec_unit_req.b_reg_data;
assign in_alu_op = vx_exec_unit_req.alu_op;
assign in_rs2_src = vx_exec_unit_req.rs2_src;
assign in_itype_immed = vx_exec_unit_req.itype_immed;
assign in_branch_type = vx_exec_unit_req.branch_type;
assign in_upper_immed = vx_exec_unit_req.upper_immed;
assign in_jal = vx_exec_unit_req.jal;
assign in_jal_offset = vx_exec_unit_req.jal_offset;
assign in_curr_PC = vx_exec_unit_req.curr_PC;
wire[`NUM_THREADS-1:0][31:0] alu_result;
wire[`NUM_THREADS-1:0] alu_stall;
@@ -68,11 +69,15 @@ module VX_execute_unit (
assign out_delay = no_slot_exec || internal_stall;
// Picks one thread index from the request valid mask; that index selects the
// a_reg_data entry used for the JAL destination further down.
// (presumably the first valid thread - confirm against
// VX_generic_priority_encoder)
/* verilator lint_off UNUSED */
wire [$clog2(`NUM_THREADS)-1:0] jal_branch_use_index;
wire jal_branch_found_valid;
VX_generic_priority_encoder #(.N(`NUM_THREADS)) choose_alu_result(
.valids(VX_exec_unit_req.valid),
wire jal_branch_found_valid;
/* verilator lint_on UNUSED */
VX_generic_priority_encoder #(
.N(`NUM_THREADS)
) choose_alu_result (
.valids(vx_exec_unit_req.valid),
.index (jal_branch_use_index),
.found (jal_branch_found_valid)
);
@@ -82,7 +87,7 @@ module VX_execute_unit (
// Branch direction is decoded from branch_use_alu_result according to the
// requested branch type.
reg temp_branch_dir;
always @(*)
begin
case(VX_exec_unit_req.branch_type)
case (vx_exec_unit_req.branch_type)
`BEQ: temp_branch_dir = (branch_use_alu_result == 0) ? `TAKEN : `NOT_TAKEN;
`BNE: temp_branch_dir = (branch_use_alu_result == 0) ? `NOT_TAKEN : `TAKEN;
`BLT: temp_branch_dir = (branch_use_alu_result[31] == 0) ? `NOT_TAKEN : `TAKEN;
@@ -99,35 +104,35 @@ module VX_execute_unit (
// Replicate PC_next once per thread so it can stand in for alu_result when
// the request is a JAL (see the alu_result writeback assignment below).
genvar i;
generate
for (i = 0; i < `NUM_THREADS; i=i+1) begin : pc_data_setup
assign duplicate_PC_data[i] = VX_exec_unit_req.PC_next;
assign duplicate_PC_data[i] = vx_exec_unit_req.PC_next;
end
endgenerate
// VX_inst_exec_wb_inter VX_inst_exec_wb_temp();
// VX_inst_exec_wb_inter vx_inst_exec_wb_temp();
// JAL Response
VX_jal_response_inter VX_jal_rsp_temp();
VX_jal_response_inter vx_jal_rsp_temp();
// Branch Response
VX_branch_response_inter VX_branch_rsp_temp();
VX_branch_response_inter vx_branch_rsp_temp();
// Actual Writeback
assign VX_inst_exec_wb.rd = VX_exec_unit_req.rd;
assign VX_inst_exec_wb.wb = VX_exec_unit_req.wb;
assign VX_inst_exec_wb.wb_valid = VX_exec_unit_req.valid & {`NUM_THREADS{!internal_stall}};
assign VX_inst_exec_wb.wb_warp_num = VX_exec_unit_req.warp_num;
assign VX_inst_exec_wb.alu_result = VX_exec_unit_req.jal ? duplicate_PC_data : alu_result;
assign vx_inst_exec_wb.rd = vx_exec_unit_req.rd;
assign vx_inst_exec_wb.wb = vx_exec_unit_req.wb;
assign vx_inst_exec_wb.wb_valid = vx_exec_unit_req.valid & {`NUM_THREADS{!internal_stall}};
assign vx_inst_exec_wb.wb_warp_num = vx_exec_unit_req.warp_num;
assign vx_inst_exec_wb.alu_result = vx_exec_unit_req.jal ? duplicate_PC_data : alu_result;
assign VX_inst_exec_wb.exec_wb_pc = in_curr_PC;
assign vx_inst_exec_wb.exec_wb_pc = in_curr_PC;
// Jal rsp
assign VX_jal_rsp_temp.jal = in_jal;
assign VX_jal_rsp_temp.jal_dest = $signed(in_a_reg_data[jal_branch_use_index]) + $signed(in_jal_offset);
assign VX_jal_rsp_temp.jal_warp_num = VX_exec_unit_req.warp_num;
assign vx_jal_rsp_temp.jal = in_jal;
assign vx_jal_rsp_temp.jal_dest = $signed(in_a_reg_data[jal_branch_use_index]) + $signed(in_jal_offset);
assign vx_jal_rsp_temp.jal_warp_num = vx_exec_unit_req.warp_num;
// Branch rsp
assign VX_branch_rsp_temp.valid_branch = (VX_exec_unit_req.branch_type != `NO_BRANCH) && (|VX_exec_unit_req.valid);
assign VX_branch_rsp_temp.branch_dir = temp_branch_dir;
assign VX_branch_rsp_temp.branch_warp_num = VX_exec_unit_req.warp_num;
assign VX_branch_rsp_temp.branch_dest = $signed(VX_exec_unit_req.curr_PC) + ($signed(VX_exec_unit_req.itype_immed) << 1); // itype_immed = branch_offset
assign vx_branch_rsp_temp.valid_branch = (vx_exec_unit_req.branch_type != `NO_BRANCH) && (|vx_exec_unit_req.valid);
assign vx_branch_rsp_temp.branch_dir = temp_branch_dir;
assign vx_branch_rsp_temp.branch_warp_num = vx_exec_unit_req.warp_num;
assign vx_branch_rsp_temp.branch_dest = $signed(vx_exec_unit_req.curr_PC) + ($signed(vx_exec_unit_req.itype_immed) << 1); // itype_immed = branch_offset
// Constant 0 used to tie off stall and flush on the response registers below.
wire zero = 0;
@@ -137,27 +142,31 @@ module VX_execute_unit (
// .reset(reset),
// .stall(zero),
// .flush(zero),
// .in ({VX_inst_exec_wb_temp.rd, VX_inst_exec_wb_temp.wb, VX_inst_exec_wb_temp.wb_valid, VX_inst_exec_wb_temp.wb_warp_num, VX_inst_exec_wb_temp.alu_result, VX_inst_exec_wb_temp.exec_wb_pc}),
// .out ({VX_inst_exec_wb.rd , VX_inst_exec_wb.wb , VX_inst_exec_wb.wb_valid , VX_inst_exec_wb.wb_warp_num , VX_inst_exec_wb.alu_result , VX_inst_exec_wb.exec_wb_pc })
// .in ({vx_inst_exec_wb_temp.rd, vx_inst_exec_wb_temp.wb, vx_inst_exec_wb_temp.wb_valid, vx_inst_exec_wb_temp.wb_warp_num, vx_inst_exec_wb_temp.alu_result, vx_inst_exec_wb_temp.exec_wb_pc}),
// .out ({vx_inst_exec_wb.rd , vx_inst_exec_wb.wb , vx_inst_exec_wb.wb_valid , vx_inst_exec_wb.wb_warp_num , vx_inst_exec_wb.alu_result , vx_inst_exec_wb.exec_wb_pc })
// );
// Registers the JAL response {jal, jal_dest, jal_warp_num}. The width
// expression reduces to 33 + NW_BITS, which assumes a 1-bit jal, a 32-bit
// jal_dest and an NW_BITS-wide warp number - TODO confirm against
// VX_jal_response_inter.
VX_generic_register #(.N(33 + `NW_BITS-1 + 1)) jal_reg(
VX_generic_register #(
.N(33 + `NW_BITS-1 + 1)
) jal_reg (
.clk (clk),
.reset(reset),
.stall(zero),
.flush(zero),
.in ({VX_jal_rsp_temp.jal, VX_jal_rsp_temp.jal_dest, VX_jal_rsp_temp.jal_warp_num}),
.out ({VX_jal_rsp.jal , VX_jal_rsp.jal_dest , VX_jal_rsp.jal_warp_num})
);
.in ({vx_jal_rsp_temp.jal, vx_jal_rsp_temp.jal_dest, vx_jal_rsp_temp.jal_warp_num}),
.out ({vx_jal_rsp.jal , vx_jal_rsp.jal_dest , vx_jal_rsp.jal_warp_num})
);
// Registers the branch response {valid_branch, branch_dir, branch_warp_num,
// branch_dest}. The width expression reduces to 34 + NW_BITS, which assumes
// two 1-bit flags, an NW_BITS-wide warp number and a 32-bit branch_dest -
// TODO confirm against VX_branch_response_inter.
VX_generic_register #(.N(34 + `NW_BITS-1 + 1)) branch_reg(
VX_generic_register #(
.N(34 + `NW_BITS-1 + 1)
) branch_reg (
.clk (clk),
.reset(reset),
.stall(zero),
.flush(zero),
.in ({VX_branch_rsp_temp.valid_branch, VX_branch_rsp_temp.branch_dir, VX_branch_rsp_temp.branch_warp_num, VX_branch_rsp_temp.branch_dest}),
.out ({VX_branch_rsp.valid_branch , VX_branch_rsp.branch_dir , VX_branch_rsp.branch_warp_num , VX_branch_rsp.branch_dest })
);
.in ({vx_branch_rsp_temp.valid_branch, vx_branch_rsp_temp.branch_dir, vx_branch_rsp_temp.branch_warp_num, vx_branch_rsp_temp.branch_dest}),
.out ({vx_branch_rsp.valid_branch , vx_branch_rsp.branch_dir , vx_branch_rsp.branch_warp_num , vx_branch_rsp.branch_dest })
);
// always @(*) begin
// case(in_alu_op)
@@ -169,8 +178,7 @@ module VX_execute_unit (
// end
// assign out_is_csr = VX_exec_unit_req.is_csr;
// assign out_csr_address = VX_exec_unit_req.csr_address;
// assign out_is_csr = vx_exec_unit_req.is_csr;
// assign out_csr_address = vx_exec_unit_req.csr_address;
endmodule : VX_execute_unit

View File

@@ -3,103 +3,103 @@
// VX_fetch: fetch stage. Instantiates VX_warp_scheduler, wiring in warp
// control (barrier, wspawn, thread-mask change, halt, split), warp stall,
// join, JAL and branch responses, and publishes the scheduled warp on
// fe_inst_meta_fi (warp number, thread mask as valid, PC; the instruction
// field is driven to zero here).
// The scheduler is stalled while schedule_delay or icache_stage_delay is set.
// NOTE(review): the scheduler port name wsapwn_pc looks like a misspelling of
// wspawn_pc; it has to match the VX_warp_scheduler declaration, which is not
// visible here, so it is left untouched.
// The VX_* / vx_* pairs below appear to be the same statements before and
// after the interface-instance renaming shown by this listing.
module VX_fetch (
input wire clk,
input wire reset,
VX_wstall_inter VX_wstall,
VX_join_inter VX_join,
VX_wstall_inter vx_wstall,
VX_join_inter vx_join,
input wire schedule_delay,
input wire icache_stage_delay,
input wire[`NW_BITS-1:0] icache_stage_wid,
input wire[`NUM_THREADS-1:0] icache_stage_valids,
output wire out_ebreak,
VX_jal_response_inter VX_jal_rsp,
VX_branch_response_inter VX_branch_rsp,
VX_jal_response_inter vx_jal_rsp,
VX_branch_response_inter vx_branch_rsp,
VX_inst_meta_inter fe_inst_meta_fi,
VX_warp_ctl_inter VX_warp_ctl
VX_warp_ctl_inter vx_warp_ctl
);
wire[`NUM_THREADS-1:0] thread_mask;
wire[`NW_BITS-1:0] warp_num;
wire[31:0] warp_pc;
wire scheduled_warp;
wire[`NUM_THREADS-1:0] thread_mask;
wire[`NW_BITS-1:0] warp_num;
wire[31:0] warp_pc;
wire scheduled_warp;
wire pipe_stall;
wire pipe_stall;
// Only reason this is there is because there is a hidden assumption that decode is exactly after fetch
// Only reason this is there is because there is a hidden assumption that decode is exactly after fetch
// Locals
// Locals
assign pipe_stall = schedule_delay || icache_stage_delay;
assign pipe_stall = schedule_delay || icache_stage_delay;
VX_warp_scheduler warp_scheduler(
.clk (clk),
.reset (reset),
.stall (pipe_stall),
VX_warp_scheduler warp_scheduler(
.clk (clk),
.reset (reset),
.stall (pipe_stall),
.is_barrier (VX_warp_ctl.is_barrier),
.barrier_id (VX_warp_ctl.barrier_id),
.num_warps (VX_warp_ctl.num_warps),
.barrier_warp_num (VX_warp_ctl.warp_num),
.is_barrier (vx_warp_ctl.is_barrier),
.barrier_id (vx_warp_ctl.barrier_id),
.num_warps (vx_warp_ctl.num_warps),
.barrier_warp_num (vx_warp_ctl.warp_num),
// Wspawn
.wspawn (VX_warp_ctl.wspawn),
.wsapwn_pc (VX_warp_ctl.wspawn_pc),
.wspawn_new_active(VX_warp_ctl.wspawn_new_active),
// CTM
.ctm (VX_warp_ctl.change_mask),
.ctm_mask (VX_warp_ctl.thread_mask),
.ctm_warp_num (VX_warp_ctl.warp_num),
// WHALT
.whalt (VX_warp_ctl.ebreak),
.whalt_warp_num (VX_warp_ctl.warp_num),
// Wstall
.wstall (VX_wstall.wstall),
.wstall_warp_num (VX_wstall.warp_num),
// Wspawn
.wspawn (vx_warp_ctl.wspawn),
.wsapwn_pc (vx_warp_ctl.wspawn_pc),
.wspawn_new_active(vx_warp_ctl.wspawn_new_active),
// CTM
.ctm (vx_warp_ctl.change_mask),
.ctm_mask (vx_warp_ctl.thread_mask),
.ctm_warp_num (vx_warp_ctl.warp_num),
// WHALT
.whalt (vx_warp_ctl.ebreak),
.whalt_warp_num (vx_warp_ctl.warp_num),
// Wstall
.wstall (vx_wstall.wstall),
.wstall_warp_num (vx_wstall.warp_num),
// Lock/release Stuff
.icache_stage_valids(icache_stage_valids),
.icache_stage_wid (icache_stage_wid),
// Lock/release Stuff
.icache_stage_valids(icache_stage_valids),
.icache_stage_wid (icache_stage_wid),
// Join
.is_join (VX_join.is_join),
.join_warp_num (VX_join.join_warp_num),
// Join
.is_join (vx_join.is_join),
.join_warp_num (vx_join.join_warp_num),
// Split
.is_split (VX_warp_ctl.is_split),
.dont_split (VX_warp_ctl.dont_split),
.split_new_mask (VX_warp_ctl.split_new_mask),
.split_later_mask (VX_warp_ctl.split_later_mask),
.split_save_pc (VX_warp_ctl.split_save_pc),
.split_warp_num (VX_warp_ctl.warp_num),
// Split
.is_split (vx_warp_ctl.is_split),
.dont_split (vx_warp_ctl.dont_split),
.split_new_mask (vx_warp_ctl.split_new_mask),
.split_later_mask (vx_warp_ctl.split_later_mask),
.split_save_pc (vx_warp_ctl.split_save_pc),
.split_warp_num (vx_warp_ctl.warp_num),
// JAL
.jal (VX_jal_rsp.jal),
.jal_dest (VX_jal_rsp.jal_dest),
.jal_warp_num (VX_jal_rsp.jal_warp_num),
// JAL
.jal (vx_jal_rsp.jal),
.jal_dest (vx_jal_rsp.jal_dest),
.jal_warp_num (vx_jal_rsp.jal_warp_num),
// Branch
.branch_valid (VX_branch_rsp.valid_branch),
.branch_dir (VX_branch_rsp.branch_dir),
.branch_dest (VX_branch_rsp.branch_dest),
.branch_warp_num (VX_branch_rsp.branch_warp_num),
// Branch
.branch_valid (vx_branch_rsp.valid_branch),
.branch_dir (vx_branch_rsp.branch_dir),
.branch_dest (vx_branch_rsp.branch_dest),
.branch_warp_num (vx_branch_rsp.branch_warp_num),
// Outputs
.thread_mask (thread_mask),
.warp_num (warp_num),
.warp_pc (warp_pc),
.out_ebreak (out_ebreak),
.scheduled_warp (scheduled_warp)
);
assign fe_inst_meta_fi.warp_num = warp_num;
assign fe_inst_meta_fi.valid = thread_mask;
assign fe_inst_meta_fi.instruction = 32'h0;
assign fe_inst_meta_fi.inst_pc = warp_pc;
wire start_mat_add = scheduled_warp && (warp_pc == 32'h80000ed8) && (warp_num == 0);
wire end_mat_add = scheduled_warp && (warp_pc == 32'h80000fbc) && (warp_num == 0);
// Outputs
.thread_mask (thread_mask),
.warp_num (warp_num),
.warp_pc (warp_pc),
.out_ebreak (out_ebreak),
.scheduled_warp (scheduled_warp)
);
assign fe_inst_meta_fi.warp_num = warp_num;
assign fe_inst_meta_fi.valid = thread_mask;
assign fe_inst_meta_fi.instruction = 32'h0;
assign fe_inst_meta_fi.inst_pc = warp_pc;
// start_mat_add / end_mat_add fire when warp 0 is scheduled at one of two
// hard-coded PCs. They have no visible consumers in this module (hence the
// UNUSED lint waiver) - presumably simulation/debug markers; confirm.
/* verilator lint_off UNUSED */
wire start_mat_add = scheduled_warp && (warp_pc == 32'h80000ed8) && (warp_num == 0);
wire end_mat_add = scheduled_warp && (warp_pc == 32'h80000fbc) && (warp_num == 0);
/* verilator lint_on UNUSED */
endmodule

View File

// VX_front_end: chains the front-end stages. From the connections visible
// here the instruction metadata flows vx_fetch -> vx_f_i_reg ->
// vx_icache_stage -> vx_i_d_reg -> vx_decode -> vx_d_e_reg, ending on the
// back-end request interface. total_freeze mirrors schedule_delay, and
// fetch_ebreak is raised by vortex_ebreak, terminate_sim or old_ebreak.
// The VX_* / vx_* pairs below appear to be the same statements before and
// after the interface-instance renaming shown by this listing.
@@ -6,15 +6,15 @@ module VX_front_end (
input wire schedule_delay,
VX_warp_ctl_inter VX_warp_ctl,
VX_warp_ctl_inter vx_warp_ctl,
VX_gpu_dcache_res_inter VX_icache_rsp,
VX_gpu_dcache_req_inter VX_icache_req,
VX_gpu_dcache_rsp_inter vx_icache_rsp,
VX_gpu_dcache_req_inter vx_icache_req,
VX_jal_response_inter VX_jal_rsp,
VX_branch_response_inter VX_branch_rsp,
VX_jal_response_inter vx_jal_rsp,
VX_branch_response_inter vx_branch_rsp,
VX_frE_to_bckE_req_inter VX_bckE_req,
VX_frE_to_bckE_req_inter vx_bckE_req,
output wire fetch_ebreak
);
@@ -24,8 +24,8 @@ VX_inst_meta_inter fe_inst_meta_fi();
VX_inst_meta_inter fe_inst_meta_fi2();
VX_inst_meta_inter fe_inst_meta_id();
VX_frE_to_bckE_req_inter VX_frE_to_bckE_req();
VX_inst_meta_inter fd_inst_meta_de();
VX_frE_to_bckE_req_inter vx_frE_to_bckE_req();
VX_inst_meta_inter fd_inst_meta_de();
wire total_freeze = schedule_delay;
wire icache_stage_delay;
@@ -52,21 +52,21 @@ end
assign fetch_ebreak = vortex_ebreak || terminate_sim || old_ebreak;
// Warp-stall and join interfaces are driven by decode and consumed by fetch
// (both instances below connect to them).
VX_wstall_inter VX_wstall();
VX_join_inter VX_join();
VX_wstall_inter vx_wstall();
VX_join_inter vx_join();
VX_fetch vx_fetch(
.clk (clk),
.reset (reset),
.icache_stage_wid (icache_stage_wid),
.icache_stage_valids(icache_stage_valids),
.VX_wstall (VX_wstall),
.VX_join (VX_join),
.vx_wstall (vx_wstall),
.vx_join (vx_join),
.schedule_delay (schedule_delay),
.VX_jal_rsp (VX_jal_rsp),
.VX_warp_ctl (VX_warp_ctl),
.vx_jal_rsp (vx_jal_rsp),
.vx_warp_ctl (vx_warp_ctl),
.icache_stage_delay (icache_stage_delay),
.VX_branch_rsp (VX_branch_rsp),
.vx_branch_rsp (vx_branch_rsp),
.out_ebreak (vortex_ebreak), // fetch_ebreak
.fe_inst_meta_fi (fe_inst_meta_fi)
);
@@ -84,7 +84,7 @@ VX_f_d_reg vx_f_i_reg(
.fd_inst_meta_de(fe_inst_meta_fi2)
);
VX_icache_stage VX_icache_stage(
VX_icache_stage vx_icache_stage(
.clk (clk),
.reset (reset),
.total_freeze (total_freeze),
@@ -93,8 +93,8 @@ VX_icache_stage VX_icache_stage(
.icache_stage_wid (icache_stage_wid),
.fe_inst_meta_fi (fe_inst_meta_fi2),
.fe_inst_meta_id (fe_inst_meta_id),
.VX_icache_rsp (VX_icache_rsp),
.VX_icache_req (VX_icache_req)
.vx_icache_rsp (vx_icache_rsp),
.vx_icache_req (vx_icache_req)
);
@@ -109,9 +109,9 @@ VX_i_d_reg vx_i_d_reg(
VX_decode vx_decode(
.fd_inst_meta_de (fd_inst_meta_de),
.VX_frE_to_bckE_req(VX_frE_to_bckE_req),
.VX_wstall (VX_wstall),
.VX_join (VX_join),
.vx_frE_to_bckE_req(vx_frE_to_bckE_req),
.vx_wstall (vx_wstall),
.vx_join (vx_join),
.terminate_sim (terminate_sim)
);
@@ -122,8 +122,8 @@ VX_d_e_reg vx_d_e_reg(
.reset (reset),
.in_branch_stall(no_br_stall),
.in_freeze (total_freeze),
.VX_frE_to_bckE_req(VX_frE_to_bckE_req),
.VX_bckE_req (VX_bckE_req)
.vx_frE_to_bckE_req(vx_frE_to_bckE_req),
.vx_bckE_req (vx_bckE_req)
);
endmodule

View File

@@ -1,10 +1,7 @@
// VX_generic_queue: FIFO of DATAW-bit entries held in a SIZE-entry array and
// tracked by head/tail pointers. empty is head == tail; full is
// head == tail+1. A push (when not full) stores in_data at data[tail] and
// advances tail; a pop (when not empty) advances head. Reset clears both
// pointers.
// This listing shows two spellings of several lines (for example the $clog2
// and LOG2UP pointer widths); they appear to be the before/after forms of
// the same statements.
module VX_generic_queue
#(
parameter DATAW = 4,
parameter SIZE = 277
)
(
module VX_generic_queue #(
parameter DATAW = 4,
parameter SIZE = 277
) (
input wire clk,
input wire reset,
input wire push,
@@ -16,31 +13,26 @@ module VX_generic_queue
output wire full
);
reg[DATAW-1:0] data[SIZE-1:0];
reg[$clog2(SIZE)-1:0] head;
reg[$clog2(SIZE)-1:0] tail;
reg [DATAW-1:0] data [SIZE-1:0];
reg [`LOG2UP(SIZE)-1:0] head;
reg [`LOG2UP(SIZE)-1:0] tail;
// NOTE(review): the literal 1 is unsized, so tail+1 appears to be evaluated
// wider than the pointers and would not wrap; full may then never assert when
// tail holds its maximum value. Confirm against the expression sizing rules.
// NOTE(review): the pointers are incremented without a modulo, so with the
// default SIZE of 277 (not a power of two) they can presumably run past
// SIZE-1 when indexing data. Confirm the intended usage.
assign empty = head == tail;
assign full = head == (tail+1);
assign empty = (head == tail);
assign full = (head == (tail+1));
integer i;
always @(posedge clk) begin
if (reset) begin
head <= 0;
tail <= 0;
for (i = 0; i < SIZE; i=i+1) begin
data[i] <= 0;
end
end else begin
// Push and pop are handled independently, so both can happen in one cycle.
if (push && !full) begin
data[tail] <= in_data;
tail <= tail+1;
end
if (pop && !empty) begin
head <= head + 1;
end
end
end

View File

@@ -1,40 +1,36 @@
// VX_generic_queue_ll: parameterized queue of DATAW-bit entries with push/pop
// inputs and empty/full flags. The implementation is chosen at elaboration
// time from SIZE: a storage-free variant for SIZE == 0 and array-backed
// variants otherwise (the SIZE > 1 variant keeps a write counter, a read
// pointer, a next-read pointer and a bypass flag).
// This listing shows two spellings of several lines (parameter defaults, port
// order, $clog2 vs LOG2UP widths); they appear to be the before/after forms
// of the same statements.
module VX_generic_queue_ll
#(
parameter DATAW = 4,
parameter SIZE = 277
)
(
module VX_generic_queue_ll #(
parameter DATAW,
parameter SIZE = 16
) (
/* verilator lint_off UNUSED */
input wire clk,
input wire reset,
input wire push,
input wire [DATAW-1:0] in_data,
input wire pop,
output wire [DATAW-1:0] out_data,
input wire pop,
output wire empty,
output wire full
output wire full,
/* verilator lint_on UNUSED */
input wire [DATAW-1:0] in_data,
output wire [DATAW-1:0] out_data
);
/* verilator lint_off WIDTH */
// SIZE == 0: no storage; the queue always reports empty and never full.
// The two out_data assignments appear to be the two sides of this change
// (constant 0 versus forwarding in_data).
if (SIZE == 0) begin
assign empty = 1;
assign out_data = 0;
assign out_data = in_data;
assign full = 0;
end else begin // (SIZE > 0)
`ifdef QUEUE_FORCE_MLAB
(* syn_ramstyle = "mlab" *) reg[DATAW-1:0] data[SIZE-1:0];
(* syn_ramstyle = "mlab" *) reg [DATAW-1:0] data [SIZE-1:0];
`else
reg[ DATAW-1:0] data[SIZE-1:0];
reg [DATAW-1:0] data [SIZE-1:0];
`endif
reg [DATAW-1:0] head_r;
reg [$clog2(SIZE+1)-1:0] size_r;
wire reading;
wire writing;
reg [DATAW-1:0] head_r;
reg [`LOG2UP(SIZE+1)-1:0] size_r;
wire reading;
wire writing;
// A pop only takes effect when not empty, a push only when not full.
assign reading = pop && !empty;
assign writing = push && !full;
@@ -65,9 +61,9 @@ module VX_generic_queue_ll
end else begin // (SIZE > 1)
reg [DATAW-1:0] curr_r;
reg [$clog2(SIZE)-1:0] wr_ctr_r;
reg [$clog2(SIZE)-1:0] rd_ptr_r;
reg [$clog2(SIZE)-1:0] rd_next_ptr_r;
reg [`LOG2UP(SIZE)-1:0] wr_ctr_r;
reg [`LOG2UP(SIZE)-1:0] rd_ptr_r;
reg [`LOG2UP(SIZE)-1:0] rd_next_ptr_r;
reg empty_r;
reg full_r;
reg bypass_r;
@@ -106,7 +102,7 @@ module VX_generic_queue_ll
data[wr_ctr_r] <= in_data;
end
end
always @(posedge clk) begin
if (reset) begin
curr_r <= 0;
@@ -135,7 +131,5 @@ module VX_generic_queue_ll
assign full = full_r;
end
end
/* verilator lint_on WIDTH */
endmodule

View File

@@ -1,24 +1,24 @@
module VX_generic_register #(
parameter N,
parameter PassThru = 0
) (
/* verilator lint_off UNUSED */
input wire clk,
input wire reset,
input wire stall,
input wire flush,
/* verilator lint_on UNUSED */
input wire[N-1:0] in,
output wire[N-1:0] out
);
module VX_generic_register
#( parameter N = 1, parameter Valid = 1)
(
input wire clk,
input wire reset,
input wire stall,
input wire flush,
input wire[(N-1):0] in,
output wire[(N-1):0] out
);
if (Valid == 0) begin
if (PassThru) begin
assign out = in;
end else begin
reg[(N-1):0] value;
reg [(N-1):0] value;
always @(posedge clk or posedge reset) begin
always @(posedge clk) begin
if (reset) begin
value <= 0;
end else if (flush) begin
@@ -29,7 +29,6 @@ module VX_generic_register
end
assign out = value;
end
endmodule

View File

@@ -2,56 +2,57 @@
module VX_gpgpu_inst (
// Input
VX_gpu_inst_req_inter VX_gpu_inst_req,
VX_gpu_inst_req_inter vx_gpu_inst_req,
// Output
VX_warp_ctl_inter VX_warp_ctl
VX_warp_ctl_inter vx_warp_ctl
);
wire[`NUM_THREADS-1:0] curr_valids = VX_gpu_inst_req.valid;
wire is_split = (VX_gpu_inst_req.is_split);
wire[`NUM_THREADS-1:0] curr_valids = vx_gpu_inst_req.valid;
wire is_split = (vx_gpu_inst_req.is_split);
wire[`NUM_THREADS-1:0] tmc_new_mask;
wire all_threads = `NUM_THREADS < VX_gpu_inst_req.a_reg_data[0];
wire all_threads = `NUM_THREADS < vx_gpu_inst_req.a_reg_data[0];
genvar curr_t;
generate
for (curr_t = 0; curr_t < `NUM_THREADS; curr_t=curr_t+1) begin : tmc_new_mask_init
assign tmc_new_mask[curr_t] = all_threads ? 1 : curr_t < VX_gpu_inst_req.a_reg_data[0];
assign tmc_new_mask[curr_t] = all_threads ? 1 : curr_t < vx_gpu_inst_req.a_reg_data[0];
end
endgenerate
wire valid_inst = (|curr_valids);
assign VX_warp_ctl.warp_num = VX_gpu_inst_req.warp_num;
assign VX_warp_ctl.change_mask = (VX_gpu_inst_req.is_tmc) && valid_inst;
assign VX_warp_ctl.thread_mask = VX_gpu_inst_req.is_tmc ? tmc_new_mask : 0;
assign vx_warp_ctl.warp_num = vx_gpu_inst_req.warp_num;
assign vx_warp_ctl.change_mask = (vx_gpu_inst_req.is_tmc) && valid_inst;
assign vx_warp_ctl.thread_mask = vx_gpu_inst_req.is_tmc ? tmc_new_mask : 0;
// assign VX_warp_ctl.ebreak = (VX_gpu_inst_req.a_reg_data[0] == 0) && valid_inst;
assign VX_warp_ctl.ebreak = VX_warp_ctl.change_mask && (VX_warp_ctl.thread_mask == 0);
// assign vx_warp_ctl.ebreak = (vx_gpu_inst_req.a_reg_data[0] == 0) && valid_inst;
assign vx_warp_ctl.ebreak = vx_warp_ctl.change_mask && (vx_warp_ctl.thread_mask == 0);
wire wspawn = VX_gpu_inst_req.is_wspawn;
wire[31:0] wspawn_pc = VX_gpu_inst_req.rd2;
wire all_active = `NUM_WARPS < VX_gpu_inst_req.a_reg_data[0];
wire wspawn = vx_gpu_inst_req.is_wspawn;
wire[31:0] wspawn_pc = vx_gpu_inst_req.rd2;
wire all_active = `NUM_WARPS < vx_gpu_inst_req.a_reg_data[0];
wire[`NUM_WARPS-1:0] wspawn_new_active;
genvar curr_w;
generate
for (curr_w = 0; curr_w < `NUM_WARPS; curr_w=curr_w+1) begin : wspawn_new_active_init
assign wspawn_new_active[curr_w] = all_active ? 1 : curr_w < VX_gpu_inst_req.a_reg_data[0];
assign wspawn_new_active[curr_w] = all_active ? 1 : curr_w < vx_gpu_inst_req.a_reg_data[0];
end
endgenerate
assign vx_warp_ctl.is_barrier = vx_gpu_inst_req.is_barrier && valid_inst;
assign vx_warp_ctl.barrier_id = vx_gpu_inst_req.a_reg_data[0];
assign VX_warp_ctl.is_barrier = VX_gpu_inst_req.is_barrier && valid_inst;
assign VX_warp_ctl.barrier_id = VX_gpu_inst_req.a_reg_data[0];
/* verilator lint_off UNUSED */
wire[31:0] num_warps_m1 = vx_gpu_inst_req.rd2 - 1;
/* verilator lint_on UNUSED */
wire[31:0] num_warps_m1 = VX_gpu_inst_req.rd2 - 1;
assign VX_warp_ctl.num_warps = num_warps_m1[$clog2(`NUM_WARPS):0];
assign vx_warp_ctl.num_warps = num_warps_m1[$clog2(`NUM_WARPS):0];
assign VX_warp_ctl.wspawn = wspawn;
assign VX_warp_ctl.wspawn_pc = wspawn_pc;
assign VX_warp_ctl.wspawn_new_active = wspawn_new_active;
assign vx_warp_ctl.wspawn = wspawn;
assign vx_warp_ctl.wspawn_pc = wspawn_pc;
assign vx_warp_ctl.wspawn_new_active = wspawn_new_active;
wire[`NUM_THREADS-1:0] split_new_use_mask;
wire[`NUM_THREADS-1:0] split_new_later_mask;
@@ -60,7 +61,7 @@ module VX_gpgpu_inst (
genvar curr_s_t;
generate
for (curr_s_t = 0; curr_s_t < `NUM_THREADS; curr_s_t=curr_s_t+1) begin : masks_init
wire curr_bool = (VX_gpu_inst_req.a_reg_data[curr_s_t] == 32'b1);
wire curr_bool = (vx_gpu_inst_req.a_reg_data[curr_s_t] == 32'b1);
assign split_new_use_mask[curr_s_t] = curr_valids[curr_s_t] & (curr_bool);
assign split_new_later_mask[curr_s_t] = curr_valids[curr_s_t] & (!curr_bool);
@@ -69,23 +70,24 @@ module VX_gpgpu_inst (
wire[$clog2(`NUM_THREADS):0] num_valids;
VX_countones #(.N(`NUM_THREADS)) valids_counter (
VX_countones #(
.N(`NUM_THREADS)
) valids_counter (
.valids(curr_valids),
.count (num_valids)
);
);
// wire[`NW_BITS-1:0] num_valids = $countones(curr_valids);
assign VX_warp_ctl.is_split = is_split && (num_valids > 1);
assign VX_warp_ctl.dont_split = VX_warp_ctl.is_split && ((split_new_use_mask == 0) || (split_new_use_mask == {`NUM_THREADS{1'b1}}));
assign VX_warp_ctl.split_new_mask = split_new_use_mask;
assign VX_warp_ctl.split_later_mask = split_new_later_mask;
assign VX_warp_ctl.split_save_pc = VX_gpu_inst_req.pc_next;
assign VX_warp_ctl.split_warp_num = VX_gpu_inst_req.warp_num;
assign vx_warp_ctl.is_split = is_split && (num_valids > 1);
assign vx_warp_ctl.dont_split = vx_warp_ctl.is_split && ((split_new_use_mask == 0) || (split_new_use_mask == {`NUM_THREADS{1'b1}}));
assign vx_warp_ctl.split_new_mask = split_new_use_mask;
assign vx_warp_ctl.split_later_mask = split_new_later_mask;
assign vx_warp_ctl.split_save_pc = vx_gpu_inst_req.pc_next;
assign vx_warp_ctl.split_warp_num = vx_gpu_inst_req.warp_num;
// VX_gpu_inst_req.is_wspawn
// VX_gpu_inst_req.is_split
// VX_gpu_inst_req.is_barrier
// vx_gpu_inst_req.is_wspawn
// vx_gpu_inst_req.is_split
// vx_gpu_inst_req.is_barrier
endmodule

View File

@@ -1,168 +1,150 @@
`include "VX_define.vh"
module VX_gpr (
input wire clk,
input wire reset,
input wire valid_write_request,
VX_gpr_read_inter VX_gpr_read,
VX_wb_inter VX_writeback_inter,
input wire clk,
input wire reset,
input wire valid_write_request,
VX_gpr_read_inter vx_gpr_read,
VX_wb_inter vx_writeback_inter,
output reg[`NUM_THREADS-1:0][31:0] out_a_reg_data,
output reg[`NUM_THREADS-1:0][31:0] out_b_reg_data
output reg[`NUM_THREADS-1:0][`NUM_GPRS-1:0] out_a_reg_data,
output reg[`NUM_THREADS-1:0][`NUM_GPRS-1:0] out_b_reg_data
);
wire write_enable;
`ifndef ASIC
assign write_enable = valid_write_request && ((VX_writeback_inter.wb != 0)) && (VX_writeback_inter.rd != 0);
assign write_enable = valid_write_request && ((vx_writeback_inter.wb != 0)) && (vx_writeback_inter.rd != 0);
byte_enabled_simple_dual_port_ram first_ram(
.we (write_enable),
.clk (clk),
.reset (reset),
.waddr (VX_writeback_inter.rd),
.raddr1(VX_gpr_read.rs1),
.raddr2(VX_gpr_read.rs2),
.be (VX_writeback_inter.wb_valid),
.wdata (VX_writeback_inter.write_data),
.waddr (vx_writeback_inter.rd),
.raddr1(vx_gpr_read.rs1),
.raddr2(vx_gpr_read.rs2),
.be (vx_writeback_inter.wb_valid),
.wdata (vx_writeback_inter.write_data),
.q1 (out_a_reg_data),
.q2 (out_b_reg_data)
);
`else
assign write_enable = valid_write_request && ((VX_writeback_inter.wb != 0));
wire going_to_write = write_enable & (|VX_writeback_inter.wb_valid);
wire[`NUM_THREADS-1:0][31:0] write_bit_mask;
assign write_enable = valid_write_request && ((vx_writeback_inter.wb != 0));
wire going_to_write = write_enable & (|vx_writeback_inter.wb_valid);
wire[`NUM_THREADS-1:0][`NUM_GPRS-1:0] write_bit_mask;
genvar curr_t;
for (curr_t = 0; curr_t < `NUM_THREADS; curr_t=curr_t+1) begin
wire local_write = write_enable & VX_writeback_inter.wb_valid[curr_t];
assign write_bit_mask[curr_t] = {32{~local_write}};
wire local_write = write_enable & vx_writeback_inter.wb_valid[curr_t];
assign write_bit_mask[curr_t] = {`NUM_GPRS{~local_write}};
end
// wire cenb = !going_to_write;
wire cenb = 0;
// wire cena_1 = (VX_gpr_read.rs1 == 0);
// wire cena_2 = (VX_gpr_read.rs2 == 0);
// wire cena_1 = (vx_gpr_read.rs1 == 0);
// wire cena_2 = (vx_gpr_read.rs2 == 0);
wire cena_1 = 0;
wire cena_2 = 0;
wire[`NUM_THREADS-1:0][31:0] temp_a;
wire[`NUM_THREADS-1:0][31:0] temp_b;
wire[`NUM_THREADS-1:0][`NUM_GPRS-1:0] temp_a;
wire[`NUM_THREADS-1:0][`NUM_GPRS-1:0] temp_b;
`ifndef SYN
genvar thread;
genvar curr_bit;
for (thread = 0; thread < `NUM_THREADS; thread = thread + 1)
`ifndef SYN
genvar thread;
genvar curr_bit;
for (thread = 0; thread < `NUM_THREADS; thread = thread + 1)
begin
for (curr_bit = 0; curr_bit < `NUM_GPRS; curr_bit=curr_bit+1)
begin
for (curr_bit = 0; curr_bit < 32; curr_bit=curr_bit+1)
begin
assign out_a_reg_data[thread][curr_bit] = ((temp_a[thread][curr_bit] === 1'dx) || cena_1 )? 1'b0 : temp_a[thread][curr_bit];
assign out_b_reg_data[thread][curr_bit] = ((temp_b[thread][curr_bit] === 1'dx) || cena_2) ? 1'b0 : temp_b[thread][curr_bit];
end
assign out_a_reg_data[thread][curr_bit] = ((temp_a[thread][curr_bit] === 1'dx) || cena_1 )? 1'b0 : temp_a[thread][curr_bit];
assign out_b_reg_data[thread][curr_bit] = ((temp_b[thread][curr_bit] === 1'dx) || cena_2) ? 1'b0 : temp_b[thread][curr_bit];
end
`else
end
`else
assign out_a_reg_data = temp_a;
assign out_b_reg_data = temp_b;
`endif
`endif
wire[`NUM_THREADS-1:0][31:0] to_write = (VX_writeback_inter.rd != 0) ? VX_writeback_inter.write_data : 0;
wire[`NUM_THREADS-1:0][`NUM_GPRS-1:0] to_write = (vx_writeback_inter.rd != 0) ? vx_writeback_inter.write_data : 0;
genvar curr_base_thread;
for (curr_base_thread = 0; curr_base_thread < 'NT; curr_base_thread=curr_base_thread+4)
begin
/* verilator lint_off PINCONNECTEMPTY */
rf2_32x128_wm1 first_ram (
.CENYA(),
.AYA(),
.CENYB(),
.WENYB(),
.AYB(),
.QA(temp_a[(curr_base_thread+3):(curr_base_thread)]),
.SOA(),
.SOB(),
.CLKA(clk),
.CENA(cena_1),
.AA(VX_gpr_read.rs1[(curr_base_thread+3):(curr_base_thread)]),
.CLKB(clk),
.CENB(cenb),
.WENB(write_bit_mask[(curr_base_thread+3):(curr_base_thread)]),
.AB(VX_writeback_inter.rd[(curr_base_thread+3):(curr_base_thread)]),
.DB(to_write[(curr_base_thread+3):(curr_base_thread)]),
.EMAA(3'b011),
.EMASA(1'b0),
.EMAB(3'b011),
.TENA(1'b1),
.TCENA(1'b0),
.TAA(5'b0),
.TENB(1'b1),
.TCENB(1'b0),
.TWENB(128'b0),
.TAB(5'b0),
.TDB(128'b0),
.RET1N(1'b1),
.SIA(2'b0),
.SEA(1'b0),
.DFTRAMBYP(1'b0),
.SIB(2'b0),
.SEB(1'b0),
.COLLDISN(1'b1)
.CENYA(),
.AYA(),
.CENYB(),
.WENYB(),
.AYB(),
.QA(temp_a[(curr_base_thread+3):(curr_base_thread)]),
.SOA(),
.SOB(),
.CLKA(clk),
.CENA(cena_1),
.AA(vx_gpr_read.rs1[(curr_base_thread+3):(curr_base_thread)]),
.CLKB(clk),
.CENB(cenb),
.WENB(write_bit_mask[(curr_base_thread+3):(curr_base_thread)]),
.AB(vx_writeback_inter.rd[(curr_base_thread+3):(curr_base_thread)]),
.DB(to_write[(curr_base_thread+3):(curr_base_thread)]),
.EMAA(3'b011),
.EMASA(1'b0),
.EMAB(3'b011),
.TENA(1'b1),
.TCENA(1'b0),
.TAA(5'b0),
.TENB(1'b1),
.TCENB(1'b0),
.TWENB(128'b0),
.TAB(5'b0),
.TDB(128'b0),
.RET1N(1'b1),
.SIA(2'b0),
.SEA(1'b0),
.DFTRAMBYP(1'b0),
.SIB(2'b0),
.SEB(1'b0),
.COLLDISN(1'b1)
);
/* verilator lint_on PINCONNECTEMPTY */
/* verilator lint_off PINCONNECTEMPTY */
rf2_32x128_wm1 second_ram (
.CENYA(),
.AYA(),
.CENYB(),
.WENYB(),
.AYB(),
.QA(temp_b[(curr_base_thread+3):(curr_base_thread)]),
.SOA(),
.SOB(),
.CLKA(clk),
.CENA(cena_2),
.AA(VX_gpr_read.rs2[(curr_base_thread+3):(curr_base_thread)]),
.CLKB(clk),
.CENB(cenb),
.WENB(write_bit_mask[(curr_base_thread+3):(curr_base_thread)]),
.AB(VX_writeback_inter.rd[(curr_base_thread+3):(curr_base_thread)]),
.DB(to_write[(curr_base_thread+3):(curr_base_thread)]),
.EMAA(3'b011),
.EMASA(1'b0),
.EMAB(3'b011),
.TENA(1'b1),
.TCENA(1'b0),
.TAA(5'b0),
.TENB(1'b1),
.TCENB(1'b0),
.TWENB(128'b0),
.TAB(5'b0),
.TDB(128'b0),
.RET1N(1'b1),
.SIA(2'b0),
.SEA(1'b0),
.DFTRAMBYP(1'b0),
.SIB(2'b0),
.SEB(1'b0),
.COLLDISN(1'b1)
rf2_`NUM_GPRSx128_wm1 second_ram (
.CENYA(),
.AYA(),
.CENYB(),
.WENYB(),
.AYB(),
.QA(temp_b[(curr_base_thread+3):(curr_base_thread)]),
.SOA(),
.SOB(),
.CLKA(clk),
.CENA(cena_2),
.AA(vx_gpr_read.rs2[(curr_base_thread+3):(curr_base_thread)]),
.CLKB(clk),
.CENB(cenb),
.WENB(write_bit_mask[(curr_base_thread+3):(curr_base_thread)]),
.AB(vx_writeback_inter.rd[(curr_base_thread+3):(curr_base_thread)]),
.DB(to_write[(curr_base_thread+3):(curr_base_thread)]),
.EMAA(3'b011),
.EMASA(1'b0),
.EMAB(3'b011),
.TENA(1'b1),
.TCENA(1'b0),
.TAA(5'b0),
.TENB(1'b1),
.TCENB(1'b0),
.TWENB(128'b0),
.TAB(5'b0),
.TDB(128'b0),
.RET1N(1'b1),
.SIA(2'b0),
.SEA(1'b0),
.DFTRAMBYP(1'b0),
.SIB(2'b0),
.SEB(1'b0),
.COLLDISN(1'b1)
);
/* verilator lint_on PINCONNECTEMPTY */
end

View File

@@ -11,108 +11,100 @@ module VX_gpr_stage (
output wire gpr_stage_delay,
// inputs
// Instruction Information
VX_frE_to_bckE_req_inter VX_bckE_req,
// WriteBack inputs
VX_wb_inter VX_writeback_inter,
// Instruction Information
VX_frE_to_bckE_req_inter vx_bckE_req,
// WriteBack inputs
VX_wb_inter vx_writeback_inter,
// Outputs
VX_exec_unit_req_inter VX_exec_unit_req,
VX_lsu_req_inter VX_lsu_req,
VX_gpu_inst_req_inter VX_gpu_inst_req,
VX_csr_req_inter VX_csr_req
VX_exec_unit_req_inter vx_exec_unit_req,
VX_lsu_req_inter vx_lsu_req,
VX_gpu_inst_req_inter vx_gpu_inst_req,
VX_csr_req_inter vx_csr_req
);
/* verilator lint_off UNUSED */
wire[31:0] curr_PC = vx_bckE_req.curr_PC;
wire[2:0] branchType = vx_bckE_req.branch_type;
wire is_store = (vx_bckE_req.mem_write != `NO_MEM_WRITE);
wire is_load = (vx_bckE_req.mem_read != `NO_MEM_READ);
wire jalQual = vx_bckE_req.jalQual;
/* verilator lint_on UNUSED */
VX_gpr_read_inter vx_gpr_read();
assign vx_gpr_read.rs1 = vx_bckE_req.rs1;
assign vx_gpr_read.rs2 = vx_bckE_req.rs2;
assign vx_gpr_read.warp_num = vx_bckE_req.warp_num;
wire[31:0] curr_PC = VX_bckE_req.curr_PC;
wire[2:0] branchType = VX_bckE_req.branch_type;
`ifndef ASIC
VX_gpr_jal_inter vx_gpr_jal();
assign vx_gpr_jal.is_jal = vx_bckE_req.jalQual;
assign vx_gpr_jal.curr_PC = vx_bckE_req.curr_PC;
`else
VX_gpr_jal_inter vx_gpr_jal();
assign vx_gpr_jal.is_jal = vx_exec_unit_req.jalQual;
assign vx_gpr_jal.curr_PC = vx_exec_unit_req.curr_PC;
`endif
wire is_store = (VX_bckE_req.mem_write != `NO_MEM_WRITE);
wire is_load = (VX_bckE_req.mem_read != `NO_MEM_READ);
VX_gpr_data_inter vx_gpr_datf();
VX_gpr_wrapper vx_grp_wrapper (
.clk (clk),
.reset (reset),
.vx_writeback_inter(vx_writeback_inter),
.vx_gpr_read (vx_gpr_read),
.vx_gpr_jal (vx_gpr_jal),
wire jalQual = VX_bckE_req.jalQual;
.out_a_reg_data (vx_gpr_datf.a_reg_data),
.out_b_reg_data (vx_gpr_datf.b_reg_data)
);
VX_gpr_read_inter VX_gpr_read();
assign VX_gpr_read.rs1 = VX_bckE_req.rs1;
assign VX_gpr_read.rs2 = VX_bckE_req.rs2;
assign VX_gpr_read.warp_num = VX_bckE_req.warp_num;
`ifndef ASIC
VX_gpr_jal_inter VX_gpr_jal();
assign VX_gpr_jal.is_jal = VX_bckE_req.jalQual;
assign VX_gpr_jal.curr_PC = VX_bckE_req.curr_PC;
`else
VX_gpr_jal_inter VX_gpr_jal();
assign VX_gpr_jal.is_jal = VX_exec_unit_req.jalQual;
assign VX_gpr_jal.curr_PC = VX_exec_unit_req.curr_PC;
`endif
VX_gpr_data_inter VX_gpr_datf();
VX_gpr_wrapper vx_grp_wrapper(
.clk (clk),
.reset (reset),
.VX_writeback_inter(VX_writeback_inter),
.VX_gpr_read (VX_gpr_read),
.VX_gpr_jal (VX_gpr_jal),
.out_a_reg_data (VX_gpr_datf.a_reg_data),
.out_b_reg_data (VX_gpr_datf.b_reg_data)
);
// assign VX_bckE_req.is_csr = is_csr;
// assign VX_bckE_req_out.csr_mask = (VX_bckE_req.sr_immed == 1'b1) ? {27'h0, VX_bckE_req.rs1} : VX_gpr_data.a_reg_data[0];
// assign vx_bckE_req.is_csr = is_csr;
// assign vx_bckE_req_out.csr_mask = (vx_bckE_req.sr_immed == 1'b1) ? {27'h0, vx_bckE_req.rs1} : vx_gpr_data.a_reg_data[0];
// Outputs
VX_exec_unit_req_inter VX_exec_unit_req_temp();
VX_lsu_req_inter VX_lsu_req_temp();
VX_gpu_inst_req_inter VX_gpu_inst_req_temp();
VX_csr_req_inter VX_csr_req_temp();
VX_inst_multiplex VX_inst_mult(
.VX_bckE_req (VX_bckE_req),
.VX_gpr_data (VX_gpr_datf),
.VX_exec_unit_req(VX_exec_unit_req_temp),
.VX_lsu_req (VX_lsu_req_temp),
.VX_gpu_inst_req (VX_gpu_inst_req_temp),
.VX_csr_req (VX_csr_req_temp)
);
wire is_lsu = (|VX_lsu_req_temp.valid);
VX_exec_unit_req_inter vx_exec_unit_req_temp();
VX_lsu_req_inter vx_lsu_req_temp();
VX_gpu_inst_req_inter vx_gpu_inst_req_temp();
VX_csr_req_inter vx_csr_req_temp();
VX_inst_multiplex vx_inst_mult(
.vx_bckE_req (vx_bckE_req),
.vx_gpr_data (vx_gpr_datf),
.vx_exec_unit_req(vx_exec_unit_req_temp),
.vx_lsu_req (vx_lsu_req_temp),
.vx_gpu_inst_req (vx_gpu_inst_req_temp),
.vx_csr_req (vx_csr_req_temp)
);
/* verilator lint_off UNUSED */
wire is_lsu = (|vx_lsu_req_temp.valid);
/* verilator lint_on UNUSED */
wire stall_rest = 0;
wire flush_rest = schedule_delay;
wire stall_lsu = memory_delay;
wire flush_lsu = schedule_delay && !stall_lsu;
wire stall_exec = exec_delay;
wire flush_exec = schedule_delay && !stall_exec;
wire stall_csr = stall_gpr_csr && VX_bckE_req.is_csr && (|VX_bckE_req.valid);
wire stall_csr = stall_gpr_csr && vx_bckE_req.is_csr && (|vx_bckE_req.valid);
assign gpr_stage_delay = stall_lsu || stall_exec || stall_csr;
`ifdef ASIC
`ifdef ASIC
wire delayed_lsu_last_cycle;
VX_generic_register #(.N(1)) delayed_reg (
VX_generic_register #(
.N(1)
) delayed_reg (
.clk (clk),
.reset(reset),
.stall(stall_rest),
.flush(stall_rest),
.in (stall_lsu),
.out (delayed_lsu_last_cycle)
);
);
wire[`NUM_THREADS-1:0][31:0] temp_store_data;
wire[`NUM_THREADS-1:0][31:0] temp_base_address; // A reg data
@@ -122,107 +114,120 @@ module VX_gpr_stage (
wire store_curr_real = !delayed_lsu_last_cycle && stall_lsu;
VX_generic_register #(.N(`NUM_THREADS*32*2)) lsu_data(
VX_generic_register #(
.N(`NUM_THREADS*32*2)
) lsu_data (
.clk (clk),
.reset(reset),
.stall(!store_curr_real),
.flush(stall_rest),
.in ({real_store_data, real_base_address}),
.out ({temp_store_data, temp_base_address})
);
);
assign real_store_data = VX_lsu_req_temp.store_data;
assign real_base_address = VX_lsu_req_temp.base_address;
assign real_store_data = vx_lsu_req_temp.store_data;
assign real_base_address = vx_lsu_req_temp.base_address;
assign vx_lsu_req.store_data = (delayed_lsu_last_cycle) ? temp_store_data : real_store_data;
assign vx_lsu_req.base_address = (delayed_lsu_last_cycle) ? temp_base_address : real_base_address;
assign VX_lsu_req.store_data = (delayed_lsu_last_cycle) ? temp_store_data : real_store_data;
assign VX_lsu_req.base_address = (delayed_lsu_last_cycle) ? temp_base_address : real_base_address;
VX_generic_register #(.N(77 + `NW_BITS-1 + 1 + (`NUM_THREADS))) lsu_reg(
VX_generic_register #(
.N(77 + `NW_BITS-1 + 1 + (`NUM_THREADS))
) lsu_reg (
.clk (clk),
.reset(reset),
.stall(stall_lsu),
.flush(flush_lsu),
.in ({VX_lsu_req_temp.valid, VX_lsu_req_temp.lsu_pc, VX_lsu_req_temp.warp_num, VX_lsu_req_temp.offset, VX_lsu_req_temp.mem_read, VX_lsu_req_temp.mem_write, VX_lsu_req_temp.rd, VX_lsu_req_temp.wb}),
.out ({VX_lsu_req.valid , VX_lsu_req.lsu_pc ,VX_lsu_req.warp_num , VX_lsu_req.offset , VX_lsu_req.mem_read , VX_lsu_req.mem_write , VX_lsu_req.rd , VX_lsu_req.wb })
);
.in ({vx_lsu_req_temp.valid, vx_lsu_req_temp.lsu_pc, vx_lsu_req_temp.warp_num, vx_lsu_req_temp.offset, vx_lsu_req_temp.mem_read, vx_lsu_req_temp.mem_write, vx_lsu_req_temp.rd, vx_lsu_req_temp.wb}),
.out ({vx_lsu_req.valid , vx_lsu_req.lsu_pc ,vx_lsu_req.warp_num , vx_lsu_req.offset , vx_lsu_req.mem_read , vx_lsu_req.mem_write , vx_lsu_req.rd , vx_lsu_req.wb })
);
VX_generic_register #(.N(224 + `NW_BITS-1 + 1 + (`NUM_THREADS))) exec_unit_reg(
VX_generic_register #(
.N(224 + `NW_BITS-1 + 1 + (`NUM_THREADS))
) exec_unit_reg (
.clk (clk),
.reset(reset),
.stall(stall_exec),
.flush(flush_exec),
.in ({VX_exec_unit_req_temp.valid, VX_exec_unit_req_temp.warp_num, VX_exec_unit_req_temp.curr_PC, VX_exec_unit_req_temp.PC_next, VX_exec_unit_req_temp.rd, VX_exec_unit_req_temp.wb, VX_exec_unit_req_temp.alu_op, VX_exec_unit_req_temp.rs1, VX_exec_unit_req_temp.rs2, VX_exec_unit_req_temp.rs2_src, VX_exec_unit_req_temp.itype_immed, VX_exec_unit_req_temp.upper_immed, VX_exec_unit_req_temp.branch_type, VX_exec_unit_req_temp.jalQual, VX_exec_unit_req_temp.jal, VX_exec_unit_req_temp.jal_offset, VX_exec_unit_req_temp.ebreak, VX_exec_unit_req_temp.wspawn, VX_exec_unit_req_temp.is_csr, VX_exec_unit_req_temp.csr_address, VX_exec_unit_req_temp.csr_immed, VX_exec_unit_req_temp.csr_mask}),
.out ({VX_exec_unit_req.valid , VX_exec_unit_req.warp_num , VX_exec_unit_req.curr_PC , VX_exec_unit_req.PC_next , VX_exec_unit_req.rd , VX_exec_unit_req.wb , VX_exec_unit_req.alu_op , VX_exec_unit_req.rs1 , VX_exec_unit_req.rs2 , VX_exec_unit_req.rs2_src , VX_exec_unit_req.itype_immed , VX_exec_unit_req.upper_immed , VX_exec_unit_req.branch_type , VX_exec_unit_req.jalQual , VX_exec_unit_req.jal , VX_exec_unit_req.jal_offset , VX_exec_unit_req.ebreak , VX_exec_unit_req.wspawn , VX_exec_unit_req.is_csr , VX_exec_unit_req.csr_address , VX_exec_unit_req.csr_immed , VX_exec_unit_req.csr_mask })
);
.in ({vx_exec_unit_req_temp.valid, vx_exec_unit_req_temp.warp_num, vx_exec_unit_req_temp.curr_PC, vx_exec_unit_req_temp.PC_next, vx_exec_unit_req_temp.rd, vx_exec_unit_req_temp.wb, vx_exec_unit_req_temp.alu_op, vx_exec_unit_req_temp.rs1, vx_exec_unit_req_temp.rs2, vx_exec_unit_req_temp.rs2_src, vx_exec_unit_req_temp.itype_immed, vx_exec_unit_req_temp.upper_immed, vx_exec_unit_req_temp.branch_type, vx_exec_unit_req_temp.jalQual, vx_exec_unit_req_temp.jal, vx_exec_unit_req_temp.jal_offset, vx_exec_unit_req_temp.ebreak, vx_exec_unit_req_temp.wspawn, vx_exec_unit_req_temp.is_csr, vx_exec_unit_req_temp.csr_address, vx_exec_unit_req_temp.csr_immed, vx_exec_unit_req_temp.csr_mask}),
.out ({vx_exec_unit_req.valid , vx_exec_unit_req.warp_num , vx_exec_unit_req.curr_PC , vx_exec_unit_req.PC_next , vx_exec_unit_req.rd , vx_exec_unit_req.wb , vx_exec_unit_req.alu_op , vx_exec_unit_req.rs1 , vx_exec_unit_req.rs2 , vx_exec_unit_req.rs2_src , vx_exec_unit_req.itype_immed , vx_exec_unit_req.upper_immed , vx_exec_unit_req.branch_type , vx_exec_unit_req.jalQual , vx_exec_unit_req.jal , vx_exec_unit_req.jal_offset , vx_exec_unit_req.ebreak , vx_exec_unit_req.wspawn , vx_exec_unit_req.is_csr , vx_exec_unit_req.csr_address , vx_exec_unit_req.csr_immed , vx_exec_unit_req.csr_mask })
);
assign VX_exec_unit_req.a_reg_data = real_base_address;
assign VX_exec_unit_req.b_reg_data = real_store_data;
assign vx_exec_unit_req.a_reg_data = real_base_address;
assign vx_exec_unit_req.b_reg_data = real_store_data;
VX_generic_register #(.N(36 + `NW_BITS-1 + 1 + (`NUM_THREADS))) gpu_inst_reg(
VX_generic_register #(
.N(36 + `NW_BITS-1 + 1 + (`NUM_THREADS))
) gpu_inst_reg (
.clk (clk),
.reset(reset),
.stall(stall_rest),
.flush(flush_rest),
.in ({VX_gpu_inst_req_temp.valid, VX_gpu_inst_req_temp.warp_num, VX_gpu_inst_req_temp.is_wspawn, VX_gpu_inst_req_temp.is_tmc, VX_gpu_inst_req_temp.is_split, VX_gpu_inst_req_temp.is_barrier, VX_gpu_inst_req_temp.pc_next}),
.out ({VX_gpu_inst_req.valid , VX_gpu_inst_req.warp_num , VX_gpu_inst_req.is_wspawn , VX_gpu_inst_req.is_tmc , VX_gpu_inst_req.is_split , VX_gpu_inst_req.is_barrier , VX_gpu_inst_req.pc_next })
);
.in ({vx_gpu_inst_req_temp.valid, vx_gpu_inst_req_temp.warp_num, vx_gpu_inst_req_temp.is_wspawn, vx_gpu_inst_req_temp.is_tmc, vx_gpu_inst_req_temp.is_split, vx_gpu_inst_req_temp.is_barrier, vx_gpu_inst_req_temp.pc_next}),
.out ({vx_gpu_inst_req.valid , vx_gpu_inst_req.warp_num , vx_gpu_inst_req.is_wspawn , vx_gpu_inst_req.is_tmc , vx_gpu_inst_req.is_split , vx_gpu_inst_req.is_barrier , vx_gpu_inst_req.pc_next })
);
assign VX_gpu_inst_req.a_reg_data = real_base_address;
assign VX_gpu_inst_req.rd2 = real_store_data;
assign vx_gpu_inst_req.a_reg_data = real_base_address;
assign vx_gpu_inst_req.rd2 = real_store_data;
VX_generic_register #(.N(`NW_BITS-1 + 1 + `NUM_THREADS + 58)) csr_reg(
VX_generic_register #(
.N(`NW_BITS-1 + 1 + `NUM_THREADS + 58)
) csr_reg (
.clk (clk),
.reset(reset),
.stall(stall_gpr_csr),
.flush(flush_rest),
.in ({VX_csr_req_temp.valid, VX_csr_req_temp.warp_num, VX_csr_req_temp.rd, VX_csr_req_temp.wb, VX_csr_req_temp.alu_op, VX_csr_req_temp.is_csr, VX_csr_req_temp.csr_address, VX_csr_req_temp.csr_immed, VX_csr_req_temp.csr_mask}),
.out ({VX_csr_req.valid , VX_csr_req.warp_num , VX_csr_req.rd , VX_csr_req.wb , VX_csr_req.alu_op , VX_csr_req.is_csr , VX_csr_req.csr_address , VX_csr_req.csr_immed , VX_csr_req.csr_mask })
);
.in ({vx_csr_req_temp.valid, vx_csr_req_temp.warp_num, vx_csr_req_temp.rd, vx_csr_req_temp.wb, vx_csr_req_temp.alu_op, vx_csr_req_temp.is_csr, vx_csr_req_temp.csr_address, vx_csr_req_temp.csr_immed, vx_csr_req_temp.csr_mask}),
.out ({vx_csr_req.valid , vx_csr_req.warp_num , vx_csr_req.rd , vx_csr_req.wb , vx_csr_req.alu_op , vx_csr_req.is_csr , vx_csr_req.csr_address , vx_csr_req.csr_immed , vx_csr_req.csr_mask })
);
// assign
`else
`else
// 341
VX_generic_register #(.N(77 + `NW_BITS-1 + 1 + 65*(`NUM_THREADS))) lsu_reg(
VX_generic_register #(
.N(77 + `NW_BITS-1 + 1 + 65*(`NUM_THREADS))
) lsu_reg (
.clk (clk),
.reset(reset),
.stall(stall_lsu),
.flush(flush_lsu),
.in ({VX_lsu_req_temp.valid, VX_lsu_req_temp.lsu_pc, VX_lsu_req_temp.warp_num, VX_lsu_req_temp.store_data, VX_lsu_req_temp.base_address, VX_lsu_req_temp.offset, VX_lsu_req_temp.mem_read, VX_lsu_req_temp.mem_write, VX_lsu_req_temp.rd, VX_lsu_req_temp.wb}),
.out ({VX_lsu_req.valid , VX_lsu_req.lsu_pc , VX_lsu_req.warp_num , VX_lsu_req.store_data , VX_lsu_req.base_address , VX_lsu_req.offset , VX_lsu_req.mem_read , VX_lsu_req.mem_write , VX_lsu_req.rd , VX_lsu_req.wb })
);
.in ({vx_lsu_req_temp.valid, vx_lsu_req_temp.lsu_pc, vx_lsu_req_temp.warp_num, vx_lsu_req_temp.store_data, vx_lsu_req_temp.base_address, vx_lsu_req_temp.offset, vx_lsu_req_temp.mem_read, vx_lsu_req_temp.mem_write, vx_lsu_req_temp.rd, vx_lsu_req_temp.wb}),
.out ({vx_lsu_req.valid , vx_lsu_req.lsu_pc , vx_lsu_req.warp_num , vx_lsu_req.store_data , vx_lsu_req.base_address , vx_lsu_req.offset , vx_lsu_req.mem_read , vx_lsu_req.mem_write , vx_lsu_req.rd , vx_lsu_req.wb })
);
VX_generic_register #(.N(224 + `NW_BITS-1 + 1 + 65*(`NUM_THREADS))) exec_unit_reg(
VX_generic_register #(
.N(224 + `NW_BITS-1 + 1 + 65*(`NUM_THREADS))
) exec_unit_reg (
.clk (clk),
.reset(reset),
.stall(stall_exec),
.flush(flush_exec),
.in ({VX_exec_unit_req_temp.valid, VX_exec_unit_req_temp.warp_num, VX_exec_unit_req_temp.curr_PC, VX_exec_unit_req_temp.PC_next, VX_exec_unit_req_temp.rd, VX_exec_unit_req_temp.wb, VX_exec_unit_req_temp.a_reg_data, VX_exec_unit_req_temp.b_reg_data, VX_exec_unit_req_temp.alu_op, VX_exec_unit_req_temp.rs1, VX_exec_unit_req_temp.rs2, VX_exec_unit_req_temp.rs2_src, VX_exec_unit_req_temp.itype_immed, VX_exec_unit_req_temp.upper_immed, VX_exec_unit_req_temp.branch_type, VX_exec_unit_req_temp.jalQual, VX_exec_unit_req_temp.jal, VX_exec_unit_req_temp.jal_offset, VX_exec_unit_req_temp.ebreak, VX_exec_unit_req_temp.wspawn, VX_exec_unit_req_temp.is_csr, VX_exec_unit_req_temp.csr_address, VX_exec_unit_req_temp.csr_immed, VX_exec_unit_req_temp.csr_mask}),
.out ({VX_exec_unit_req.valid , VX_exec_unit_req.warp_num , VX_exec_unit_req.curr_PC , VX_exec_unit_req.PC_next , VX_exec_unit_req.rd , VX_exec_unit_req.wb , VX_exec_unit_req.a_reg_data , VX_exec_unit_req.b_reg_data , VX_exec_unit_req.alu_op , VX_exec_unit_req.rs1 , VX_exec_unit_req.rs2 , VX_exec_unit_req.rs2_src , VX_exec_unit_req.itype_immed , VX_exec_unit_req.upper_immed , VX_exec_unit_req.branch_type , VX_exec_unit_req.jalQual , VX_exec_unit_req.jal , VX_exec_unit_req.jal_offset , VX_exec_unit_req.ebreak , VX_exec_unit_req.wspawn , VX_exec_unit_req.is_csr , VX_exec_unit_req.csr_address , VX_exec_unit_req.csr_immed , VX_exec_unit_req.csr_mask })
);
.in ({vx_exec_unit_req_temp.valid, vx_exec_unit_req_temp.warp_num, vx_exec_unit_req_temp.curr_PC, vx_exec_unit_req_temp.PC_next, vx_exec_unit_req_temp.rd, vx_exec_unit_req_temp.wb, vx_exec_unit_req_temp.a_reg_data, vx_exec_unit_req_temp.b_reg_data, vx_exec_unit_req_temp.alu_op, vx_exec_unit_req_temp.rs1, vx_exec_unit_req_temp.rs2, vx_exec_unit_req_temp.rs2_src, vx_exec_unit_req_temp.itype_immed, vx_exec_unit_req_temp.upper_immed, vx_exec_unit_req_temp.branch_type, vx_exec_unit_req_temp.jalQual, vx_exec_unit_req_temp.jal, vx_exec_unit_req_temp.jal_offset, vx_exec_unit_req_temp.ebreak, vx_exec_unit_req_temp.wspawn, vx_exec_unit_req_temp.is_csr, vx_exec_unit_req_temp.csr_address, vx_exec_unit_req_temp.csr_immed, vx_exec_unit_req_temp.csr_mask}),
.out ({vx_exec_unit_req.valid , vx_exec_unit_req.warp_num , vx_exec_unit_req.curr_PC , vx_exec_unit_req.PC_next , vx_exec_unit_req.rd , vx_exec_unit_req.wb , vx_exec_unit_req.a_reg_data , vx_exec_unit_req.b_reg_data , vx_exec_unit_req.alu_op , vx_exec_unit_req.rs1 , vx_exec_unit_req.rs2 , vx_exec_unit_req.rs2_src , vx_exec_unit_req.itype_immed , vx_exec_unit_req.upper_immed , vx_exec_unit_req.branch_type , vx_exec_unit_req.jalQual , vx_exec_unit_req.jal , vx_exec_unit_req.jal_offset , vx_exec_unit_req.ebreak , vx_exec_unit_req.wspawn , vx_exec_unit_req.is_csr , vx_exec_unit_req.csr_address , vx_exec_unit_req.csr_immed , vx_exec_unit_req.csr_mask })
);
VX_generic_register #(.N(68 + `NW_BITS-1 + 1 + 33*(`NUM_THREADS))) gpu_inst_reg(
VX_generic_register #(
.N(68 + `NW_BITS-1 + 1 + 33*(`NUM_THREADS))
) gpu_inst_reg (
.clk (clk),
.reset(reset),
.stall(stall_rest),
.flush(flush_rest),
.in ({VX_gpu_inst_req_temp.valid, VX_gpu_inst_req_temp.warp_num, VX_gpu_inst_req_temp.is_wspawn, VX_gpu_inst_req_temp.is_tmc, VX_gpu_inst_req_temp.is_split, VX_gpu_inst_req_temp.is_barrier, VX_gpu_inst_req_temp.pc_next, VX_gpu_inst_req_temp.a_reg_data, VX_gpu_inst_req_temp.rd2}),
.out ({VX_gpu_inst_req.valid , VX_gpu_inst_req.warp_num , VX_gpu_inst_req.is_wspawn , VX_gpu_inst_req.is_tmc , VX_gpu_inst_req.is_split , VX_gpu_inst_req.is_barrier , VX_gpu_inst_req.pc_next , VX_gpu_inst_req.a_reg_data , VX_gpu_inst_req.rd2 })
);
.in ({vx_gpu_inst_req_temp.valid, vx_gpu_inst_req_temp.warp_num, vx_gpu_inst_req_temp.is_wspawn, vx_gpu_inst_req_temp.is_tmc, vx_gpu_inst_req_temp.is_split, vx_gpu_inst_req_temp.is_barrier, vx_gpu_inst_req_temp.pc_next, vx_gpu_inst_req_temp.a_reg_data, vx_gpu_inst_req_temp.rd2}),
.out ({vx_gpu_inst_req.valid , vx_gpu_inst_req.warp_num , vx_gpu_inst_req.is_wspawn , vx_gpu_inst_req.is_tmc , vx_gpu_inst_req.is_split , vx_gpu_inst_req.is_barrier , vx_gpu_inst_req.pc_next , vx_gpu_inst_req.a_reg_data , vx_gpu_inst_req.rd2 })
);
VX_generic_register #(.N(`NW_BITS-1 + 1 + `NUM_THREADS + 58)) csr_reg(
VX_generic_register #(
.N(`NW_BITS-1 + 1 + `NUM_THREADS + 58)
) csr_reg (
.clk (clk),
.reset(reset),
.stall(stall_gpr_csr),
.flush(flush_rest),
.in ({VX_csr_req_temp.valid, VX_csr_req_temp.warp_num, VX_csr_req_temp.rd, VX_csr_req_temp.wb, VX_csr_req_temp.alu_op, VX_csr_req_temp.is_csr, VX_csr_req_temp.csr_address, VX_csr_req_temp.csr_immed, VX_csr_req_temp.csr_mask}),
.out ({VX_csr_req.valid , VX_csr_req.warp_num , VX_csr_req.rd , VX_csr_req.wb , VX_csr_req.alu_op , VX_csr_req.is_csr , VX_csr_req.csr_address , VX_csr_req.csr_immed , VX_csr_req.csr_mask })
);
.in ({vx_csr_req_temp.valid, vx_csr_req_temp.warp_num, vx_csr_req_temp.rd, vx_csr_req_temp.wb, vx_csr_req_temp.alu_op, vx_csr_req_temp.is_csr, vx_csr_req_temp.csr_address, vx_csr_req_temp.csr_immed, vx_csr_req_temp.csr_mask}),
.out ({vx_csr_req.valid , vx_csr_req.warp_num , vx_csr_req.rd , vx_csr_req.wb , vx_csr_req.alu_op , vx_csr_req.is_csr , vx_csr_req.csr_address , vx_csr_req.csr_immed , vx_csr_req.csr_mask })
);
`endif
`endif
endmodule : VX_gpr_stage

View File

@@ -3,9 +3,9 @@
// VX_gpr_wrapper: instantiates one VX_gpr register file per warp and returns
// the A/B read data of the selected warp. When is_jal is set, operand A is
// replaced by the current PC replicated across all thread lanes.
module VX_gpr_wrapper (
input wire clk,
input wire reset,
VX_gpr_read_inter VX_gpr_read,
VX_wb_inter VX_writeback_inter,
VX_gpr_jal_inter VX_gpr_jal,
VX_gpr_read_inter vx_gpr_read,
VX_wb_inter vx_writeback_inter,
VX_gpr_jal_inter vx_gpr_jal,
output wire[`NUM_THREADS-1:0][31:0] out_a_reg_data,
output wire[`NUM_THREADS-1:0][31:0] out_b_reg_data
@@ -19,28 +19,30 @@ module VX_gpr_wrapper (
// Copy the current PC into every thread lane; selected as operand A on JAL.
genvar index;
generate
for (index = 0; index < `NUM_THREADS; index = index + 1) begin : jal_data_assign
assign jal_data[index] = VX_gpr_jal.curr_PC;
assign jal_data[index] = vx_gpr_jal.curr_PC;
end
endgenerate
// Non-ASIC build: index the per-warp read data directly with the warp number
// of the current read request.
`ifndef ASIC
assign out_a_reg_data = (VX_gpr_jal.is_jal ? jal_data : (temp_a_reg_data[VX_gpr_read.warp_num]));
assign out_b_reg_data = (temp_b_reg_data[VX_gpr_read.warp_num]);
assign out_a_reg_data = (vx_gpr_jal.is_jal ? jal_data : (temp_a_reg_data[vx_gpr_read.warp_num]));
assign out_b_reg_data = (temp_b_reg_data[vx_gpr_read.warp_num]);
`else
// ASIC build: index with old_warp_num, the warp number passed through the
// store_wn register (never stalled or flushed). Presumably this matches a
// one-cycle read latency of the ASIC register file - TODO confirm.
wire zer = 0;
wire[`NW_BITS-1:0] old_warp_num;
VX_generic_register #(`NW_BITS-1+1) store_wn(
VX_generic_register #(
.N(`NW_BITS-1+1)
) store_wn (
.clk (clk),
.reset(reset),
.stall(zer),
.flush(zer),
.in (VX_gpr_read.warp_num),
.in (vx_gpr_read.warp_num),
.out (old_warp_num)
);
);
assign out_a_reg_data = (VX_gpr_jal.is_jal ? jal_data : (temp_a_reg_data[old_warp_num]));
assign out_a_reg_data = (vx_gpr_jal.is_jal ? jal_data : (temp_a_reg_data[old_warp_num]));
assign out_b_reg_data = (temp_b_reg_data[old_warp_num]);
`endif
@@ -50,13 +52,13 @@ module VX_gpr_wrapper (
// One register file per warp. valid_write_request is asserted only for the
// instance whose index equals the writeback warp number.
for (warp_index = 0; warp_index < `NUM_WARPS; warp_index = warp_index + 1) begin : warp_gprs
wire valid_write_request = warp_index == VX_writeback_inter.wb_warp_num;
wire valid_write_request = warp_index == vx_writeback_inter.wb_warp_num;
VX_gpr vx_gpr(
.clk (clk),
.reset (reset),
.valid_write_request(valid_write_request),
.VX_gpr_read (VX_gpr_read),
.VX_writeback_inter (VX_writeback_inter),
.vx_gpr_read (vx_gpr_read),
.vx_writeback_inter (vx_writeback_inter),
.out_a_reg_data (temp_a_reg_data[warp_index]),
.out_b_reg_data (temp_b_reg_data[warp_index])
);
@@ -65,7 +67,6 @@ module VX_gpr_wrapper (
endgenerate
endmodule

View File

@@ -5,61 +5,56 @@ module VX_icache_stage (
// VX_icache_stage: turns the fetch metadata in fe_inst_meta_fi into an
// instruction-cache read request and turns the cache response into the
// decode-stage metadata fe_inst_meta_id. threads_active remembers, per warp,
// the thread mask that was valid when the request was accepted.
input wire reset,
input wire total_freeze,
output wire icache_stage_delay,
output wire[`NW_BITS-1:0] icache_stage_wid,
output wire[`NUM_THREADS-1:0] icache_stage_valids,
output wire[`NW_BITS-1:0] icache_stage_wid,
output wire[`NUM_THREADS-1:0] icache_stage_valids,
VX_inst_meta_inter fe_inst_meta_fi,
VX_inst_meta_inter fe_inst_meta_id,
VX_gpu_dcache_res_inter VX_icache_rsp,
VX_gpu_dcache_req_inter VX_icache_req
VX_gpu_dcache_rsp_inter vx_icache_rsp,
VX_gpu_dcache_req_inter vx_icache_req
);
// Thread mask recorded for each warp at request time (written in the always
// block below, read when the response for that warp returns).
reg[`NUM_THREADS-1:0] threads_active[`NUM_WARPS-1:0];
reg[`NUM_THREADS-1:0] threads_active[`NUM_WARPS-1:0];
// A fetch is present when at least one thread of the incoming warp is valid.
wire valid_inst = (|fe_inst_meta_fi.valid);
wire valid_inst = (|fe_inst_meta_fi.valid);
// Icache Request
// Always a read (LW_MEM_READ / NO_MEM_WRITE) of the fetch PC; suppressed
// while total_freeze is asserted. {1{2'b1}} is a single copy of 2'b01.
assign VX_icache_req.core_req_valid = valid_inst && !total_freeze;
assign VX_icache_req.core_req_addr = fe_inst_meta_fi.inst_pc;
assign VX_icache_req.core_req_writedata = 32'b0;
assign VX_icache_req.core_req_mem_read = `LW_MEM_READ;
assign VX_icache_req.core_req_mem_write = `NO_MEM_WRITE;
assign VX_icache_req.core_req_rd = 5'b0;
assign VX_icache_req.core_req_wb = {1{2'b1}};
assign VX_icache_req.core_req_warp_num = fe_inst_meta_fi.warp_num;
assign VX_icache_req.core_req_pc = fe_inst_meta_fi.inst_pc;
// Icache Request
assign vx_icache_req.core_req_valid = valid_inst && !total_freeze;
assign vx_icache_req.core_req_addr = fe_inst_meta_fi.inst_pc;
assign vx_icache_req.core_req_writedata = 32'b0;
assign vx_icache_req.core_req_mem_read = `LW_MEM_READ;
assign vx_icache_req.core_req_mem_write = `NO_MEM_WRITE;
assign vx_icache_req.core_req_rd = 5'b0;
assign vx_icache_req.core_req_wb = {1{2'b1}};
assign vx_icache_req.core_req_warp_num = fe_inst_meta_fi.warp_num;
assign vx_icache_req.core_req_pc = fe_inst_meta_fi.inst_pc;
// Icache response -> decode metadata. Only entry 0 of the response data/PC
// arrays is used; the valid mask is the one recorded for the responding warp,
// or all zeros when the response is not valid.
assign fe_inst_meta_id.instruction = vx_icache_rsp.core_wb_readdata[0][31:0];
assign fe_inst_meta_id.inst_pc = vx_icache_rsp.core_wb_pc[0];
assign fe_inst_meta_id.warp_num = vx_icache_rsp.core_wb_warp_num;
assign fe_inst_meta_id.valid = vx_icache_rsp.core_wb_valid ? threads_active[vx_icache_rsp.core_wb_warp_num] : 0;
assign fe_inst_meta_id.instruction = VX_icache_rsp.core_wb_readdata[0][31:0];
assign fe_inst_meta_id.inst_pc = VX_icache_rsp.core_wb_pc[0];
assign fe_inst_meta_id.warp_num = VX_icache_rsp.core_wb_warp_num;
/* verilator lint_off WIDTH */
assign fe_inst_meta_id.valid = VX_icache_rsp.core_wb_valid ? threads_active[VX_icache_rsp.core_wb_warp_num] : 0;
/* verilator lint_off WIDTH */
// Warp id / thread mask reported for the response; the mask is forced to zero
// while the cache is signalling delay.
assign icache_stage_wid = fe_inst_meta_id.warp_num;
assign icache_stage_valids = fe_inst_meta_id.valid & {`NUM_THREADS{!icache_stage_delay}};
assign icache_stage_wid = fe_inst_meta_id.warp_num;
assign icache_stage_valids = fe_inst_meta_id.valid & {`NUM_THREADS{!icache_stage_delay}};
// Cache can't accept request
assign icache_stage_delay = vx_icache_rsp.delay_req;
// Cache can't accept request
assign icache_stage_delay = VX_icache_rsp.delay_req;
// Core can't accept response
assign vx_icache_req.core_no_wb_slot = total_freeze;
// Core can't accept response
assign VX_icache_req.core_no_wb_slot = total_freeze;
// Clear every warp's recorded mask on reset; otherwise record the incoming
// thread mask whenever a fetch is present and the cache is not delaying.
integer curr_w;
always @(posedge clk) begin
if (reset) begin
for (curr_w = 0; curr_w < `NUM_WARPS; curr_w=curr_w+1) threads_active[curr_w] <= 0;
end else begin
if (valid_inst && !icache_stage_delay) begin
/* verilator lint_off WIDTH */
threads_active[fe_inst_meta_fi.warp_num] <= fe_inst_meta_fi.valid;
/* verilator lint_on WIDTH */
end
integer curr_w;
always @(posedge clk) begin
if (reset) begin
for (curr_w = 0; curr_w < `NUM_WARPS; curr_w=curr_w+1) begin
threads_active[curr_w] <= 0;
end
end else begin
if (valid_inst && !icache_stage_delay) begin
threads_active[fe_inst_meta_fi.warp_num] <= fe_inst_meta_fi.valid;
end
end
end
endmodule

View File

@@ -2,23 +2,23 @@
// VX_inst_multiplex: purely combinational fan-out of one back-end request
// (plus its GPR operand data) to the LSU, execute unit, GPU-instruction unit
// and CSR unit. Every unit receives the request fields it needs; only the
// per-thread valid mask differs, so at most one unit sees the request as valid.
module VX_inst_multiplex (
// Inputs
VX_frE_to_bckE_req_inter VX_bckE_req,
VX_gpr_data_inter VX_gpr_data,
VX_frE_to_bckE_req_inter vx_bckE_req,
VX_gpr_data_inter vx_gpr_data,
// Outputs
VX_exec_unit_req_inter VX_exec_unit_req,
VX_lsu_req_inter VX_lsu_req,
VX_gpu_inst_req_inter VX_gpu_inst_req,
VX_csr_req_inter VX_csr_req
VX_exec_unit_req_inter vx_exec_unit_req,
VX_lsu_req_inter vx_lsu_req,
VX_gpu_inst_req_inter vx_gpu_inst_req,
VX_csr_req_inter vx_csr_req
);
// Per-thread copies of the class flags below. The generate loop that drives
// them is not shown in this hunk - presumably each bit replicates the
// corresponding scalar flag; verify against the full file.
wire[`NUM_THREADS-1:0] is_mem_mask;
wire[`NUM_THREADS-1:0] is_gpu_mask;
wire[`NUM_THREADS-1:0] is_csr_mask;
// Instruction class: memory if it reads or writes memory, GPU if it is one of
// wspawn/tmc/barrier/split, CSR if flagged as such by the request.
wire is_mem = (VX_bckE_req.mem_write != `NO_MEM_WRITE) || (VX_bckE_req.mem_read != `NO_MEM_READ);
wire is_gpu = (VX_bckE_req.is_wspawn || VX_bckE_req.is_tmc || VX_bckE_req.is_barrier || VX_bckE_req.is_split);
wire is_csr = VX_bckE_req.is_csr;
wire is_mem = (vx_bckE_req.mem_write != `NO_MEM_WRITE) || (vx_bckE_req.mem_read != `NO_MEM_READ);
wire is_gpu = (vx_bckE_req.is_wspawn || vx_bckE_req.is_tmc || vx_bckE_req.is_barrier || vx_bckE_req.is_split);
wire is_csr = vx_bckE_req.is_csr;
// wire is_gpu = 0;
genvar currT;
@@ -31,64 +31,64 @@ module VX_inst_multiplex (
endgenerate
// LSU Unit
// Valid only for memory instructions; operand A is the base address, operand
// B the store data, and the I-type immediate the address offset.
assign VX_lsu_req.valid = VX_bckE_req.valid & is_mem_mask;
assign VX_lsu_req.warp_num = VX_bckE_req.warp_num;
assign VX_lsu_req.base_address = VX_gpr_data.a_reg_data;
assign VX_lsu_req.store_data = VX_gpr_data.b_reg_data;
assign vx_lsu_req.valid = vx_bckE_req.valid & is_mem_mask;
assign vx_lsu_req.warp_num = vx_bckE_req.warp_num;
assign vx_lsu_req.base_address = vx_gpr_data.a_reg_data;
assign vx_lsu_req.store_data = vx_gpr_data.b_reg_data;
assign VX_lsu_req.offset = VX_bckE_req.itype_immed;
assign vx_lsu_req.offset = vx_bckE_req.itype_immed;
assign VX_lsu_req.mem_read = VX_bckE_req.mem_read;
assign VX_lsu_req.mem_write = VX_bckE_req.mem_write;
assign VX_lsu_req.rd = VX_bckE_req.rd;
assign VX_lsu_req.wb = VX_bckE_req.wb;
assign VX_lsu_req.lsu_pc = VX_bckE_req.curr_PC;
assign vx_lsu_req.mem_read = vx_bckE_req.mem_read;
assign vx_lsu_req.mem_write = vx_bckE_req.mem_write;
assign vx_lsu_req.rd = vx_bckE_req.rd;
assign vx_lsu_req.wb = vx_bckE_req.wb;
assign vx_lsu_req.lsu_pc = vx_bckE_req.curr_PC;
// Execute Unit
// Valid for every thread whose instruction is none of memory / GPU / CSR.
assign VX_exec_unit_req.valid = VX_bckE_req.valid & (~is_mem_mask & ~is_gpu_mask & ~is_csr_mask);
assign VX_exec_unit_req.warp_num = VX_bckE_req.warp_num;
assign VX_exec_unit_req.curr_PC = VX_bckE_req.curr_PC;
assign VX_exec_unit_req.PC_next = VX_bckE_req.PC_next;
assign VX_exec_unit_req.rd = VX_bckE_req.rd;
assign VX_exec_unit_req.wb = VX_bckE_req.wb;
assign VX_exec_unit_req.a_reg_data = VX_gpr_data.a_reg_data;
assign VX_exec_unit_req.b_reg_data = VX_gpr_data.b_reg_data;
assign VX_exec_unit_req.alu_op = VX_bckE_req.alu_op;
assign VX_exec_unit_req.rs1 = VX_bckE_req.rs1;
assign VX_exec_unit_req.rs2 = VX_bckE_req.rs2;
assign VX_exec_unit_req.rs2_src = VX_bckE_req.rs2_src;
assign VX_exec_unit_req.itype_immed = VX_bckE_req.itype_immed;
assign VX_exec_unit_req.upper_immed = VX_bckE_req.upper_immed;
assign VX_exec_unit_req.branch_type = VX_bckE_req.branch_type;
assign VX_exec_unit_req.jalQual = VX_bckE_req.jalQual;
assign VX_exec_unit_req.jal = VX_bckE_req.jal;
assign VX_exec_unit_req.jal_offset = VX_bckE_req.jal_offset;
assign VX_exec_unit_req.ebreak = VX_bckE_req.ebreak;
assign vx_exec_unit_req.valid = vx_bckE_req.valid & (~is_mem_mask & ~is_gpu_mask & ~is_csr_mask);
assign vx_exec_unit_req.warp_num = vx_bckE_req.warp_num;
assign vx_exec_unit_req.curr_PC = vx_bckE_req.curr_PC;
assign vx_exec_unit_req.PC_next = vx_bckE_req.PC_next;
assign vx_exec_unit_req.rd = vx_bckE_req.rd;
assign vx_exec_unit_req.wb = vx_bckE_req.wb;
assign vx_exec_unit_req.a_reg_data = vx_gpr_data.a_reg_data;
assign vx_exec_unit_req.b_reg_data = vx_gpr_data.b_reg_data;
assign vx_exec_unit_req.alu_op = vx_bckE_req.alu_op;
assign vx_exec_unit_req.rs1 = vx_bckE_req.rs1;
assign vx_exec_unit_req.rs2 = vx_bckE_req.rs2;
assign vx_exec_unit_req.rs2_src = vx_bckE_req.rs2_src;
assign vx_exec_unit_req.itype_immed = vx_bckE_req.itype_immed;
assign vx_exec_unit_req.upper_immed = vx_bckE_req.upper_immed;
assign vx_exec_unit_req.branch_type = vx_bckE_req.branch_type;
assign vx_exec_unit_req.jalQual = vx_bckE_req.jalQual;
assign vx_exec_unit_req.jal = vx_bckE_req.jal;
assign vx_exec_unit_req.jal_offset = vx_bckE_req.jal_offset;
assign vx_exec_unit_req.ebreak = vx_bckE_req.ebreak;
// GPR Req
// NOTE: despite the label above, this section drives the GPU-instruction
// request (wspawn / tmc / split / barrier). rd2 carries operand B of thread 0
// only.
assign VX_gpu_inst_req.valid = VX_bckE_req.valid & is_gpu_mask;
assign VX_gpu_inst_req.warp_num = VX_bckE_req.warp_num;
assign VX_gpu_inst_req.is_wspawn = VX_bckE_req.is_wspawn;
assign VX_gpu_inst_req.is_tmc = VX_bckE_req.is_tmc;
assign VX_gpu_inst_req.is_split = VX_bckE_req.is_split;
assign VX_gpu_inst_req.is_barrier = VX_bckE_req.is_barrier;
assign VX_gpu_inst_req.a_reg_data = VX_gpr_data.a_reg_data;
assign VX_gpu_inst_req.rd2 = VX_gpr_data.b_reg_data[0];
assign VX_gpu_inst_req.pc_next = VX_bckE_req.PC_next;
assign vx_gpu_inst_req.valid = vx_bckE_req.valid & is_gpu_mask;
assign vx_gpu_inst_req.warp_num = vx_bckE_req.warp_num;
assign vx_gpu_inst_req.is_wspawn = vx_bckE_req.is_wspawn;
assign vx_gpu_inst_req.is_tmc = vx_bckE_req.is_tmc;
assign vx_gpu_inst_req.is_split = vx_bckE_req.is_split;
assign vx_gpu_inst_req.is_barrier = vx_bckE_req.is_barrier;
assign vx_gpu_inst_req.a_reg_data = vx_gpr_data.a_reg_data;
assign vx_gpu_inst_req.rd2 = vx_gpr_data.b_reg_data[0];
assign vx_gpu_inst_req.pc_next = vx_bckE_req.PC_next;
// CSR Req
// Valid only for CSR instructions; no GPR operand data is forwarded here.
assign VX_csr_req.valid = VX_bckE_req.valid & is_csr_mask;
assign VX_csr_req.warp_num = VX_bckE_req.warp_num;
assign VX_csr_req.rd = VX_bckE_req.rd;
assign VX_csr_req.wb = VX_bckE_req.wb;
assign VX_csr_req.alu_op = VX_bckE_req.alu_op;
assign VX_csr_req.is_csr = VX_bckE_req.is_csr;
assign VX_csr_req.csr_address = VX_bckE_req.csr_address;
assign VX_csr_req.csr_immed = VX_bckE_req.csr_immed;
assign VX_csr_req.csr_mask = VX_bckE_req.csr_mask;
assign vx_csr_req.valid = vx_bckE_req.valid & is_csr_mask;
assign vx_csr_req.warp_num = vx_bckE_req.warp_num;
assign vx_csr_req.rd = vx_bckE_req.rd;
assign vx_csr_req.wb = vx_bckE_req.wb;
assign vx_csr_req.alu_op = vx_bckE_req.alu_op;
assign vx_csr_req.is_csr = vx_bckE_req.is_csr;
assign vx_csr_req.csr_address = vx_bckE_req.csr_address;
assign vx_csr_req.csr_immed = vx_bckE_req.csr_immed;
assign vx_csr_req.csr_mask = vx_bckE_req.csr_mask;
endmodule

View File

@@ -1,89 +1,87 @@
`include "VX_define.vh"
// VX_lsu: load/store unit. Computes per-thread addresses from the LSU
// request, passes the request through the lsu_buffer register (held while
// the data cache signals delay), drives the data-cache request interface from
// the register outputs, and forwards the data-cache response to the
// memory-writeback interface.
module VX_lsu (
input wire clk,
input wire reset,
input wire no_slot_mem,
VX_lsu_req_inter VX_lsu_req,
input wire clk,
input wire reset,
input wire no_slot_mem,
VX_lsu_req_inter vx_lsu_req,
// Write back to GPR
VX_inst_mem_wb_inter VX_mem_wb,
VX_gpu_dcache_res_inter VX_dcache_rsp,
VX_gpu_dcache_req_inter VX_dcache_req,
output wire out_delay
);
// Write back to GPR
VX_inst_mem_wb_inter vx_mem_wb,
VX_gpu_dcache_rsp_inter vx_dcache_rsp,
VX_gpu_dcache_req_inter vx_dcache_req,
output wire out_delay
);
// Generate Addresses
wire[`NUM_THREADS-1:0][31:0] address;
VX_lsu_addr_gen VX_lsu_addr_gen
(
.base_address(VX_lsu_req.base_address),
.offset (VX_lsu_req.offset),
.address (address)
VX_lsu_addr_gen VX_lsu_addr_gen (
.base_address (vx_lsu_req.base_address),
.offset (vx_lsu_req.offset),
.address (address)
);
// Outputs of lsu_buffer: the request fields as presented to the data cache.
wire[`NUM_THREADS-1:0][31:0] use_address;
wire[`NUM_THREADS-1:0][31:0] use_store_data;
wire[`NUM_THREADS-1:0] use_valid;
wire[`NUM_THREADS-1:0][31:0] use_address;
wire[`NUM_THREADS-1:0][31:0] use_store_data;
wire[`NUM_THREADS-1:0] use_valid;
wire[2:0] use_mem_read;
wire[2:0] use_mem_write;
wire[4:0] use_rd;
wire[`NW_BITS-1:0] use_warp_num;
wire[`NW_BITS-1:0] use_warp_num;
wire[1:0] use_wb;
wire[31:0] use_pc;
wire zero = 0;
// Register width: per thread 32 (address) + 32 (store data) + 1 (valid) = 65
// bits, plus 3 + 3 + 5 + 2 + 32 = 45 bits of mem_read/mem_write/rd/wb/pc and
// NW_BITS of warp number. Stalled by out_delay, never flushed.
VX_generic_register #(.N(45 + `NW_BITS-1 + 1 + `NUM_THREADS*65)) lsu_buffer(
VX_generic_register #(
.N(45 + `NW_BITS-1 + 1 + `NUM_THREADS*65)
) lsu_buffer(
.clk (clk),
.reset(reset),
.stall(out_delay),
.flush(zero),
.in ({address , VX_lsu_req.store_data, VX_lsu_req.valid, VX_lsu_req.mem_read, VX_lsu_req.mem_write, VX_lsu_req.rd, VX_lsu_req.warp_num, VX_lsu_req.wb, VX_lsu_req.lsu_pc}),
.in ({address , vx_lsu_req.store_data, vx_lsu_req.valid, vx_lsu_req.mem_read, vx_lsu_req.mem_write, vx_lsu_req.rd, vx_lsu_req.warp_num, vx_lsu_req.wb, vx_lsu_req.lsu_pc}),
.out ({use_address, use_store_data , use_valid , use_mem_read , use_mem_write , use_rd , use_warp_num , use_wb , use_pc })
);
);
// Core Request
// mem_read / mem_write / wb are scalar in the LSU request and are replicated
// once per thread for the cache interface.
assign VX_dcache_req.core_req_valid = use_valid;
assign VX_dcache_req.core_req_addr = use_address;
assign VX_dcache_req.core_req_writedata = use_store_data;
assign VX_dcache_req.core_req_mem_read = {`NUM_THREADS{use_mem_read}};
assign VX_dcache_req.core_req_mem_write = {`NUM_THREADS{use_mem_write}};
assign VX_dcache_req.core_req_rd = use_rd;
assign VX_dcache_req.core_req_wb = {`NUM_THREADS{use_wb}};
assign VX_dcache_req.core_req_warp_num = use_warp_num;
assign VX_dcache_req.core_req_pc = use_pc;
assign vx_dcache_req.core_req_valid = use_valid;
assign vx_dcache_req.core_req_addr = use_address;
assign vx_dcache_req.core_req_writedata = use_store_data;
assign vx_dcache_req.core_req_mem_read = {`NUM_THREADS{use_mem_read}};
assign vx_dcache_req.core_req_mem_write = {`NUM_THREADS{use_mem_write}};
assign vx_dcache_req.core_req_rd = use_rd;
assign vx_dcache_req.core_req_wb = {`NUM_THREADS{use_wb}};
assign vx_dcache_req.core_req_warp_num = use_warp_num;
assign vx_dcache_req.core_req_pc = use_pc;
// Core can't accept response
assign VX_dcache_req.core_no_wb_slot = no_slot_mem;
assign vx_dcache_req.core_no_wb_slot = no_slot_mem;
// Cache can't accept request
assign out_delay = VX_dcache_rsp.delay_req;
assign out_delay = vx_dcache_rsp.delay_req;
// Core Response
assign VX_mem_wb.rd = VX_dcache_rsp.core_wb_req_rd;
assign VX_mem_wb.wb = VX_dcache_rsp.core_wb_req_wb;
assign VX_mem_wb.wb_valid = VX_dcache_rsp.core_wb_valid;
assign VX_mem_wb.wb_warp_num = VX_dcache_rsp.core_wb_warp_num;
assign VX_mem_wb.loaded_data = VX_dcache_rsp.core_wb_readdata;
assign vx_mem_wb.rd = vx_dcache_rsp.core_wb_req_rd;
assign vx_mem_wb.wb = vx_dcache_rsp.core_wb_req_wb;
assign vx_mem_wb.wb_valid = vx_dcache_rsp.core_wb_valid;
assign vx_mem_wb.wb_warp_num = vx_dcache_rsp.core_wb_warp_num;
assign vx_mem_wb.loaded_data = vx_dcache_rsp.core_wb_readdata;
// The writeback PC is taken from the response lane chosen by the priority
// encoder over the response valid bits (presumably the first valid lane, per
// the instance name - confirm in VX_generic_priority_encoder). `found` is
// intentionally unused.
wire[(`LOG2UP(`NUM_THREADS))-1:0] use_pc_index;
/* verilator lint_off UNUSED */
wire found;
/* verilator lint_on UNUSED */
VX_generic_priority_encoder #(.N(`NUM_THREADS)) pick_first_pc(
.valids(VX_dcache_rsp.core_wb_valid),
.valids(vx_dcache_rsp.core_wb_valid),
.index (use_pc_index),
.found (found)
);
assign VX_mem_wb.mem_wb_pc = VX_dcache_rsp.core_wb_pc[use_pc_index];
assign vx_mem_wb.mem_wb_pc = vx_dcache_rsp.core_wb_pc[use_pc_index];
endmodule // Memory

View File

@@ -6,73 +6,75 @@ module VX_scheduler (
// VX_scheduler: issue-side hazard check. rename_table holds, per warp and per
// architectural register, the mask of threads that still have a write to that
// register in flight (set at issue, cleared by writeback). schedule_delay
// stalls the incoming request when it touches a register with a pending write
// or when the unit it targets is signalling delay. count_valid counts the
// in-flight register writes; is_empty is high when that count is zero.
input wire memory_delay,
input wire exec_delay,
input wire gpr_stage_delay,
VX_frE_to_bckE_req_inter VX_bckE_req,
VX_wb_inter VX_writeback_inter,
VX_frE_to_bckE_req_inter vx_bckE_req,
VX_wb_inter vx_writeback_inter,
output wire schedule_delay,
output wire is_empty
);
/* verilator lint_off WIDTH */
reg[31:0] count_valid;
assign is_empty = count_valid == 0;
reg[31:0][`NUM_THREADS-1:0] rename_table[`NUM_WARPS-1:0];
// valid_wb: a real register write is retiring this cycle (x0 excluded).
// wb_inc : the incoming request will write a register (x0 excluded).
wire valid_wb = (VX_writeback_inter.wb != 0) && (|VX_writeback_inter.wb_valid) && (VX_writeback_inter.rd != 0);
wire wb_inc = (VX_bckE_req.wb != 0) && (VX_bckE_req.rd != 0);
wire valid_wb = (vx_writeback_inter.wb != 0) && (|vx_writeback_inter.wb_valid) && (vx_writeback_inter.rd != 0);
wire wb_inc = (vx_bckE_req.wb != 0) && (vx_bckE_req.rd != 0);
// Non-zero table entry => some thread still has a pending write to that register.
wire rs1_rename = rename_table[VX_bckE_req.warp_num][VX_bckE_req.rs1] != 0;
wire rs2_rename = rename_table[VX_bckE_req.warp_num][VX_bckE_req.rs2] != 0;
wire rd_rename = rename_table[VX_bckE_req.warp_num][VX_bckE_req.rd ] != 0;
wire rs1_rename = rename_table[vx_bckE_req.warp_num][vx_bckE_req.rs1] != 0;
wire rs2_rename = rename_table[vx_bckE_req.warp_num][vx_bckE_req.rs2] != 0;
wire rd_rename = rename_table[vx_bckE_req.warp_num][vx_bckE_req.rd ] != 0;
wire is_store = (VX_bckE_req.mem_write != `NO_MEM_WRITE);
wire is_load = (VX_bckE_req.mem_read != `NO_MEM_READ);
wire is_store = (vx_bckE_req.mem_write != `NO_MEM_WRITE);
wire is_load = (vx_bckE_req.mem_read != `NO_MEM_READ);
// classify our next instruction.
wire is_mem = is_store || is_load;
wire is_gpu = (VX_bckE_req.is_wspawn || VX_bckE_req.is_tmc || VX_bckE_req.is_barrier || VX_bckE_req.is_split);
wire is_csr = VX_bckE_req.is_csr;
wire is_gpu = (vx_bckE_req.is_wspawn || vx_bckE_req.is_tmc || vx_bckE_req.is_barrier || vx_bckE_req.is_split);
wire is_csr = vx_bckE_req.is_csr;
wire is_exec = !is_mem && !is_gpu && !is_csr;
// rs2 only creates a dependency when the instruction actually reads it.
wire using_rs2 = (vx_bckE_req.rs2_src == `RS2_REG) || is_store || vx_bckE_req.is_barrier || vx_bckE_req.is_wspawn;
wire using_rs2 = (VX_bckE_req.rs2_src == `RS2_REG) || is_store || VX_bckE_req.is_barrier || VX_bckE_req.is_wspawn;
// Register x0 never causes a hazard.
wire rs1_rename_qual = ((rs1_rename) && (VX_bckE_req.rs1 != 0));
wire rs2_rename_qual = ((rs2_rename) && (VX_bckE_req.rs2 != 0 && using_rs2));
wire rd_rename_qual = ((rd_rename ) && (VX_bckE_req.rd != 0));
wire rs1_rename_qual = ((rs1_rename) && (vx_bckE_req.rs1 != 0));
wire rs2_rename_qual = ((rs2_rename) && (vx_bckE_req.rs2 != 0 && using_rs2));
wire rd_rename_qual = ((rd_rename ) && (vx_bckE_req.rd != 0));
wire rename_valid = rs1_rename_qual || rs2_rename_qual || rd_rename_qual;
// Stall on a register hazard of a valid request, or when the targeted unit
// (or the GPR stage feeding the memory/execute units) cannot accept it.
assign schedule_delay = ((rename_valid) && (|VX_bckE_req.valid))
|| (memory_delay && is_mem)
|| (gpr_stage_delay && (is_mem || is_exec))
|| (exec_delay && is_exec);
assign schedule_delay = ((rename_valid) && (|vx_bckE_req.valid))
|| (memory_delay && is_mem)
|| (gpr_stage_delay && (is_mem || is_exec))
|| (exec_delay && is_exec);
integer i;
integer w;
always @(posedge clk or posedge reset) begin
always @(posedge clk) begin
if (reset) begin
for (w = 0; w < `NUM_WARPS; w=w+1)
begin
for (i = 0; i < 32; i = i + 1)
begin
rename_table[w][i] <= 0;
for (w = 0; w < `NUM_WARPS; w=w+1) begin
for (i = 0; i < 32; i = i + 1) begin
rename_table[w][i] <= 0;
end
end
// Start with no writes in flight so is_empty is well defined after reset.
count_valid <= 0;
end else begin
if (valid_wb ) rename_table[VX_writeback_inter.wb_warp_num][VX_writeback_inter.rd] <= rename_table[VX_writeback_inter.wb_warp_num][VX_writeback_inter.rd] & (~VX_writeback_inter.wb_valid);
if (!schedule_delay && wb_inc) rename_table[VX_bckE_req.warp_num ][VX_bckE_req.rd ] <= VX_bckE_req.valid;
// Writeback clears the retiring threads' pending bits; issue sets the pending
// mask of the destination register (issue is written last, so it wins if both
// target the same entry in one cycle).
if (valid_wb) begin
rename_table[vx_writeback_inter.wb_warp_num][vx_writeback_inter.rd] <= rename_table[vx_writeback_inter.wb_warp_num][vx_writeback_inter.rd] & (~vx_writeback_inter.wb_valid);
end
if (!schedule_delay && wb_inc) begin
rename_table[vx_bckE_req.warp_num][vx_bckE_req.rd] <= vx_bckE_req.valid;
end
if (valid_wb && ((rename_table[VX_writeback_inter.wb_warp_num][VX_writeback_inter.rd] & (~VX_writeback_inter.wb_valid)) == 0)) count_valid = count_valid - 1;
if (!schedule_delay && wb_inc) count_valid = count_valid + 1;
// count_valid: +1 when a register-writing request issues, -1 when a writeback
// clears the last pending thread of its register. Both can happen in the same
// cycle. With non-blocking assignments, two independent updates would let the
// later one override the earlier (the decrement would be lost), so the two
// events are resolved in a single if/else chain: issue and retire together
// leave the count unchanged.
if (!schedule_delay && wb_inc) begin
if (!(valid_wb
&& (0 == (rename_table[vx_writeback_inter.wb_warp_num][vx_writeback_inter.rd] & ~vx_writeback_inter.wb_valid)))) begin
count_valid <= count_valid + 1;
end
end else if (valid_wb
&& (0 == (rename_table[vx_writeback_inter.wb_warp_num][vx_writeback_inter.rd] & ~vx_writeback_inter.wb_valid))) begin
count_valid <= count_valid - 1;
end
end
end
/* verilator lint_on WIDTH */
endmodule

View File

@@ -38,7 +38,7 @@ module VX_warp (
end
always @(posedge clk, posedge reset) begin
always @(posedge clk) begin
if (remove) begin
valid <= valid_zero;
end else if (in_change_mask) begin
@@ -69,7 +69,7 @@ module VX_warp (
assign use_PC = temp_PC;
assign out_PC = temp_PC;
always @(posedge clk or posedge reset) begin
always @(posedge clk) begin
if (reset) begin
real_PC <= 0;
end else if (in_wspawn == 1'b1) begin

View File

@@ -19,7 +19,9 @@ module VX_warp_scheduler (
input wire[`NW_BITS-1:0] whalt_warp_num,
input wire is_barrier,
/* verilator lint_off UNUSED */
input wire[31:0] barrier_id,
/* verilator lint_on UNUSED */
input wire[$clog2(`NUM_WARPS):0] num_warps,
input wire[`NW_BITS-1:0] barrier_warp_num,
@@ -60,10 +62,7 @@ module VX_warp_scheduler (
input wire[`NUM_THREADS-1:0] icache_stage_valids
);
/* verilator lint_off WIDTH */
wire update_use_wspawn;
wire update_visible_active;
wire[(1+32+`NUM_THREADS-1):0] d[`NUM_WARPS-1:0];
@@ -72,10 +71,12 @@ module VX_warp_scheduler (
wire[31:0] join_pc;
wire[`NUM_THREADS-1:0] join_tm;
/* verilator lint_off UNUSED */
wire in_wspawn = wspawn;
wire in_ctm = ctm;
wire in_whalt = whalt;
wire in_wstall = wstall;
/* verilator lint_on UNUSED */
reg[`NUM_WARPS-1:0] warp_active;
reg[`NUM_WARPS-1:0] warp_stalled;
@@ -114,13 +115,12 @@ module VX_warp_scheduler (
reg didnt_split;
/* verilator lint_off UNUSED */
// wire[$clog2(`NUM_WARPS):0] num_active;
/* verilator lint_on UNUSED */
integer curr_w_help;
integer curr_barrier;
always @(posedge clk or posedge reset) begin
always @(posedge clk) begin
if (reset) begin
for (curr_barrier = 0; curr_barrier < `NUM_BARRIERS; curr_barrier=curr_barrier+1) begin
barrier_stall_mask[curr_barrier] <= 0;

View File

@@ -4,61 +4,61 @@ module VX_writeback (
// VX_writeback: arbitrates the three writeback sources (execute unit, CSR
// unit, memory) onto the single GPR writeback interface with fixed priority
// exec > csr > mem, and registers the selected writeback before driving
// vx_writeback_inter.
input wire clk,
input wire reset,
// Mem WB info
VX_inst_mem_wb_inter VX_mem_wb,
VX_inst_mem_wb_inter vx_mem_wb,
// EXEC Unit WB info
VX_inst_exec_wb_inter VX_inst_exec_wb,
VX_inst_exec_wb_inter vx_inst_exec_wb,
// CSR Unit WB info
VX_csr_wb_inter VX_csr_wb,
VX_csr_wb_inter vx_csr_wb,
// Actual WB to GPR
VX_wb_inter VX_writeback_inter,
VX_wb_inter vx_writeback_inter,
output wire no_slot_mem,
output wire no_slot_exec,
output wire no_slot_csr
);
// Combinational (pre-register) view of the selected writeback.
VX_wb_inter VX_writeback_tempp();
VX_wb_inter vx_writeback_tempp();
// A source requests the slot when it has a non-zero wb type and at least one
// valid thread.
wire exec_wb = (VX_inst_exec_wb.wb != 0) && (|VX_inst_exec_wb.wb_valid);
wire mem_wb = (VX_mem_wb.wb != 0) && (|VX_mem_wb.wb_valid);
wire csr_wb = (VX_csr_wb.wb != 0) && (|VX_csr_wb.valid);
wire exec_wb = (vx_inst_exec_wb.wb != 0) && (|vx_inst_exec_wb.wb_valid);
wire mem_wb = (vx_mem_wb.wb != 0) && (|vx_mem_wb.wb_valid);
wire csr_wb = (vx_csr_wb.wb != 0) && (|vx_csr_wb.valid);
// A source is told "no slot" when a higher-priority source is also requesting.
// The execute unit has top priority and is never refused.
assign no_slot_mem = mem_wb && (exec_wb || csr_wb);
assign no_slot_csr = csr_wb && (exec_wb);
assign no_slot_exec = 0;
// Priority muxes: exec, then csr, then mem; zero when nothing is requesting.
assign VX_writeback_tempp.write_data = exec_wb ? VX_inst_exec_wb.alu_result :
csr_wb ? VX_csr_wb.csr_result :
mem_wb ? VX_mem_wb.loaded_data :
assign vx_writeback_tempp.write_data = exec_wb ? vx_inst_exec_wb.alu_result :
csr_wb ? vx_csr_wb.csr_result :
mem_wb ? vx_mem_wb.loaded_data :
0;
assign VX_writeback_tempp.wb_valid = exec_wb ? VX_inst_exec_wb.wb_valid :
csr_wb ? VX_csr_wb.valid :
mem_wb ? VX_mem_wb.wb_valid :
assign vx_writeback_tempp.wb_valid = exec_wb ? vx_inst_exec_wb.wb_valid :
csr_wb ? vx_csr_wb.valid :
mem_wb ? vx_mem_wb.wb_valid :
0;
assign VX_writeback_tempp.rd = exec_wb ? VX_inst_exec_wb.rd :
csr_wb ? VX_csr_wb.rd :
mem_wb ? VX_mem_wb.rd :
assign vx_writeback_tempp.rd = exec_wb ? vx_inst_exec_wb.rd :
csr_wb ? vx_csr_wb.rd :
mem_wb ? vx_mem_wb.rd :
0;
assign VX_writeback_tempp.wb = exec_wb ? VX_inst_exec_wb.wb :
csr_wb ? VX_csr_wb.wb :
mem_wb ? VX_mem_wb.wb :
assign vx_writeback_tempp.wb = exec_wb ? vx_inst_exec_wb.wb :
csr_wb ? vx_csr_wb.wb :
mem_wb ? vx_mem_wb.wb :
0;
assign VX_writeback_tempp.wb_warp_num = exec_wb ? VX_inst_exec_wb.wb_warp_num :
csr_wb ? VX_csr_wb.warp_num :
mem_wb ? VX_mem_wb.wb_warp_num :
assign vx_writeback_tempp.wb_warp_num = exec_wb ? vx_inst_exec_wb.wb_warp_num :
csr_wb ? vx_csr_wb.warp_num :
mem_wb ? vx_mem_wb.wb_warp_num :
0;
// The CSR path supplies no PC, so 32'hdeadbeef is used as a placeholder for
// CSR writebacks and for the idle case.
assign VX_writeback_tempp.wb_pc = exec_wb ? VX_inst_exec_wb.exec_wb_pc :
assign vx_writeback_tempp.wb_pc = exec_wb ? vx_inst_exec_wb.exec_wb_pc :
csr_wb ? 32'hdeadbeef :
mem_wb ? VX_mem_wb.mem_wb_pc :
mem_wb ? vx_mem_wb.mem_wb_pc :
32'hdeadbeef;
@@ -71,19 +71,19 @@ module VX_writeback (
.reset(reset),
.stall(zero),
.flush(zero),
.in ({VX_writeback_tempp.write_data, VX_writeback_tempp.wb_valid, VX_writeback_tempp.rd, VX_writeback_tempp.wb, VX_writeback_tempp.wb_warp_num, VX_writeback_tempp.wb_pc}),
.out ({use_wb_data , VX_writeback_inter.wb_valid, VX_writeback_inter.rd, VX_writeback_inter.wb, VX_writeback_inter.wb_warp_num, VX_writeback_inter.wb_pc})
.in ({vx_writeback_tempp.write_data, vx_writeback_tempp.wb_valid, vx_writeback_tempp.rd, vx_writeback_tempp.wb, vx_writeback_tempp.wb_warp_num, vx_writeback_tempp.wb_pc}),
.out ({use_wb_data , vx_writeback_inter.wb_valid, vx_writeback_inter.rd, vx_writeback_inter.wb, vx_writeback_inter.wb_warp_num, vx_writeback_inter.wb_pc})
);
// Debug/testbench hook (exposed via "verilator public"): latches thread 0's
// data of the most recent valid write to register 28.
reg[31:0] last_data_wb /* verilator public */ ;
always @(posedge clk) begin
if ((|VX_writeback_inter.wb_valid) && (VX_writeback_inter.wb != 0) && (VX_writeback_inter.rd == 28)) begin
if ((|vx_writeback_inter.wb_valid) && (vx_writeback_inter.wb != 0) && (vx_writeback_inter.rd == 28)) begin
last_data_wb <= use_wb_data[0];
end
end
assign VX_writeback_inter.write_data = use_wb_data;
assign vx_writeback_inter.write_data = use_wb_data;
endmodule : VX_writeback

View File

@@ -16,105 +16,82 @@ module Vortex
output wire [31:0] io_data,
// DRAM Dcache Req
output wire dram_req,
output wire dram_req_write,
output wire dram_req_read,
output wire dram_req_write,
output wire [31:0] dram_req_addr,
output wire [31:0] dram_req_size,
output wire [`DBANK_LINE_SIZE-1:0] dram_req_data,
output wire [31:0] dram_expected_lat,
input wire dram_req_full,
input wire dram_req_delay,
// DRAM Dcache Res
output wire dram_fill_accept,
input wire dram_fill_rsp,
input wire [31:0] dram_fill_rsp_addr,
input wire [`DBANK_LINE_SIZE-1:0] dram_fill_rsp_data,
// DRAM Dcache Rsp
input wire dram_rsp_valid,
input wire [31:0] dram_rsp_addr,
input wire [`DBANK_LINE_SIZE-1:0] dram_rsp_data,
output wire dram_rsp_ready,
// DRAM Icache Req
output wire I_dram_req,
output wire I_dram_req_write,
output wire I_dram_req_read,
output wire [31:0] I_dram_req_addr,
output wire [31:0] I_dram_req_size,
output wire [`IBANK_LINE_SIZE-1:0] I_dram_req_data,
output wire [31:0] I_dram_expected_lat,
output wire I_dram_req_read,
output wire I_dram_req_write,
output wire [31:0] I_dram_req_addr,
output wire [`IBANK_LINE_SIZE-1:0] I_dram_req_data,
input wire I_dram_req_full,
// DRAM Icache Res
output wire I_dram_fill_accept,
input wire I_dram_fill_rsp,
input wire [31:0] I_dram_fill_rsp_addr,
input wire [`IBANK_LINE_SIZE-1:0] I_dram_fill_rsp_data,
// DRAM Icache Rsp
input wire I_dram_rsp_valid,
input wire [31:0] I_dram_rsp_addr,
input wire [`IBANK_LINE_SIZE-1:0] I_dram_rsp_data,
output wire I_dram_rsp_ready,
// LLC Snooping
input wire snp_req,
input wire [31:0] snp_req_addr,
output wire snp_req_delay,
input wire snp_req_valid,
input wire [31:0] snp_req_addr,
output wire snp_req_full,
input wire I_snp_req,
input wire [31:0] I_snp_req_addr,
output wire I_snp_req_delay,
output wire out_ebreak
output wire out_ebreak
`else
input wire clk,
input wire reset,
input wire clk,
input wire reset,
// IO
output wire io_valid,
output wire[31:0] io_data,
output wire io_valid,
output wire[31:0] io_data,
// DRAM Dcache Req
output wire dram_req,
output wire dram_req_write,
output wire dram_req_read,
output wire [31:0] dram_req_addr,
output wire [31:0] dram_req_size,
output wire [`DBANK_LINE_SIZE-1:0] dram_req_data,
output wire [31:0] dram_expected_lat,
// DRAM Dcache Res
output wire dram_fill_accept,
input wire dram_fill_rsp,
input wire [31:0] dram_fill_rsp_addr,
input wire [`DBANK_LINE_SIZE-1:0] dram_fill_rsp_data,
output wire dram_req_read,
output wire dram_req_write,
output wire [31:0] dram_req_addr,
output wire [`DBANK_LINE_SIZE-1:0] dram_req_data,
input wire dram_req_full,
// DRAM Dcache Rsp
input wire dram_rsp_valid,
input wire [31:0] dram_rsp_addr,
input wire [`DBANK_LINE_SIZE-1:0] dram_rsp_data,
output wire dram_rsp_ready,
// DRAM Icache Req
output wire I_dram_req,
output wire I_dram_req_write,
output wire I_dram_req_read,
output wire [31:0] I_dram_req_addr,
output wire [31:0] I_dram_req_size,
output wire [`IBANK_LINE_SIZE-1:0] I_dram_req_data,
output wire [31:0] I_dram_expected_lat,
output wire I_dram_req_read,
output wire I_dram_req_write,
output wire [31:0] I_dram_req_addr,
output wire [`IBANK_LINE_SIZE-1:0] I_dram_req_data,
input wire I_dram_req_full,
// DRAM Icache Res
output wire I_dram_fill_accept,
input wire I_dram_fill_rsp,
input wire [31:0] I_dram_fill_rsp_addr,
input wire [`IBANK_LINE_SIZE-1:0] I_dram_fill_rsp_data,
// DRAM Icache Rsp
output wire I_dram_rsp_ready,
input wire I_dram_rsp_valid,
input wire [31:0] I_dram_rsp_addr,
input wire [`IBANK_LINE_SIZE-1:0] I_dram_rsp_data,
input wire dram_req_delay,
input wire snp_req_valid,
input wire [31:0] snp_req_addr,
output wire snp_req_full,
input wire snp_req,
input wire [31:0] snp_req_addr,
output wire snp_req_delay,
input wire I_snp_req,
input wire [31:0] I_snp_req_addr,
output wire I_snp_req_delay,
output wire out_ebreak
output wire out_ebreak
`endif
);
/* verilator lint_off UNUSED */
wire scheduler_empty;
wire out_ebreak_unqual;
// assign out_ebreak = out_ebreak_unqual && (scheduler_empty && 1);
assign out_ebreak = out_ebreak_unqual;
/* verilator lint_on UNUSED */
wire memory_delay;
wire exec_delay;
@@ -122,184 +99,165 @@ module Vortex
wire schedule_delay;
// Dcache Interface
VX_gpu_dcache_res_inter #(.NUM_REQUESTS(`DNUM_REQUESTS)) VX_dcache_rsp();
VX_gpu_dcache_req_inter #(.NUM_REQUESTS(`DNUM_REQUESTS)) VX_dcache_req();
VX_gpu_dcache_req_inter #(.NUM_REQUESTS(`DNUM_REQUESTS)) VX_dcache_req_qual();
VX_gpu_dcache_rsp_inter #(.NUM_REQUESTS(`DNUM_REQUESTS)) vx_dcache_rsp();
VX_gpu_dcache_req_inter #(.NUM_REQUESTS(`DNUM_REQUESTS)) vx_dcache_req();
VX_gpu_dcache_req_inter #(.NUM_REQUESTS(`DNUM_REQUESTS)) vx_dcache_req_qual();
VX_gpu_dcache_dram_req_inter #(.BANK_LINE_WORDS(`DBANK_LINE_WORDS)) VX_gpu_dcache_dram_req();
VX_gpu_dcache_dram_res_inter #(.BANK_LINE_WORDS(`DBANK_LINE_WORDS)) VX_gpu_dcache_dram_res();
VX_gpu_dcache_dram_req_inter #(.BANK_LINE_WORDS(`DBANK_LINE_WORDS)) vx_gpu_dcache_dram_req();
VX_gpu_dcache_dram_rsp_inter #(.BANK_LINE_WORDS(`DBANK_LINE_WORDS)) vx_gpu_dcache_dram_res();
assign vx_gpu_dcache_dram_res.dram_rsp_valid = dram_rsp_valid;
assign vx_gpu_dcache_dram_res.dram_rsp_addr = dram_rsp_addr;
assign VX_gpu_dcache_dram_res.dram_fill_rsp = dram_fill_rsp;
assign VX_gpu_dcache_dram_res.dram_fill_rsp_addr = dram_fill_rsp_addr;
assign dram_req_write = vx_gpu_dcache_dram_req.dram_req_write;
assign dram_req_read = vx_gpu_dcache_dram_req.dram_req_read;
assign dram_req_addr = vx_gpu_dcache_dram_req.dram_req_addr;
assign dram_rsp_ready = vx_gpu_dcache_dram_req.dram_rsp_ready;
assign dram_req = VX_gpu_dcache_dram_req.dram_req;
assign dram_req_write = VX_gpu_dcache_dram_req.dram_req_write;
assign dram_req_read = VX_gpu_dcache_dram_req.dram_req_read;
assign dram_req_addr = VX_gpu_dcache_dram_req.dram_req_addr;
assign dram_req_size = VX_gpu_dcache_dram_req.dram_req_size;
assign dram_expected_lat = `DSIMULATED_DRAM_LATENCY_CYCLES;
assign dram_fill_accept = VX_gpu_dcache_dram_req.dram_fill_accept;
assign VX_gpu_dcache_dram_req.dram_req_delay = dram_req_delay;
assign vx_gpu_dcache_dram_req.dram_req_full = dram_req_full;
genvar i;
generate
for (i = 0; i < `DBANK_LINE_WORDS; i=i+1) begin
assign VX_gpu_dcache_dram_res.dram_fill_rsp_data[i] = dram_fill_rsp_data[i * 32 +: 32];
assign dram_req_data[i * 32 +: 32] = VX_gpu_dcache_dram_req.dram_req_data[i];
assign vx_gpu_dcache_dram_res.dram_rsp_data[i] = dram_rsp_data[i * 32 +: 32];
assign dram_req_data[i * 32 +: 32] = vx_gpu_dcache_dram_req.dram_req_data[i];
end
endgenerate
wire temp_io_valid = (!memory_delay)
&& (|VX_dcache_req.core_req_valid)
&& (VX_dcache_req.core_req_mem_write[0] != `NO_MEM_WRITE)
&& (VX_dcache_req.core_req_addr[0] == 32'h00010000);
&& (|vx_dcache_req.core_req_valid)
&& (vx_dcache_req.core_req_mem_write[0] != `NO_MEM_WRITE)
&& (vx_dcache_req.core_req_addr[0] == 32'h00010000);
wire[31:0] temp_io_data = VX_dcache_req.core_req_writedata[0];
wire[31:0] temp_io_data = vx_dcache_req.core_req_writedata[0];
assign io_valid = temp_io_valid;
assign io_data = temp_io_data;
assign VX_dcache_req_qual.core_req_valid = VX_dcache_req.core_req_valid & {`NUM_THREADS{~io_valid}};
assign VX_dcache_req_qual.core_req_addr = VX_dcache_req.core_req_addr;
assign VX_dcache_req_qual.core_req_writedata = VX_dcache_req.core_req_writedata;
assign VX_dcache_req_qual.core_req_mem_read = VX_dcache_req.core_req_mem_read;
assign VX_dcache_req_qual.core_req_mem_write = VX_dcache_req.core_req_mem_write;
assign VX_dcache_req_qual.core_req_rd = VX_dcache_req.core_req_rd;
assign VX_dcache_req_qual.core_req_wb = VX_dcache_req.core_req_wb;
assign VX_dcache_req_qual.core_req_warp_num = VX_dcache_req.core_req_warp_num;
assign VX_dcache_req_qual.core_req_pc = VX_dcache_req.core_req_pc;
assign VX_dcache_req_qual.core_no_wb_slot = VX_dcache_req.core_no_wb_slot;
assign vx_dcache_req_qual.core_req_valid = vx_dcache_req.core_req_valid & {`NUM_THREADS{~io_valid}};
assign vx_dcache_req_qual.core_req_addr = vx_dcache_req.core_req_addr;
assign vx_dcache_req_qual.core_req_writedata = vx_dcache_req.core_req_writedata;
assign vx_dcache_req_qual.core_req_mem_read = vx_dcache_req.core_req_mem_read;
assign vx_dcache_req_qual.core_req_mem_write = vx_dcache_req.core_req_mem_write;
assign vx_dcache_req_qual.core_req_rd = vx_dcache_req.core_req_rd;
assign vx_dcache_req_qual.core_req_wb = vx_dcache_req.core_req_wb;
assign vx_dcache_req_qual.core_req_warp_num = vx_dcache_req.core_req_warp_num;
assign vx_dcache_req_qual.core_req_pc = vx_dcache_req.core_req_pc;
assign vx_dcache_req_qual.core_no_wb_slot = vx_dcache_req.core_no_wb_slot;
VX_gpu_dcache_rsp_inter #(.NUM_REQUESTS(`INUM_REQUESTS)) vx_icache_rsp();
VX_gpu_dcache_req_inter #(.NUM_REQUESTS(`INUM_REQUESTS)) vx_icache_req();
VX_gpu_dcache_res_inter #(.NUM_REQUESTS(`INUM_REQUESTS)) VX_icache_rsp();
VX_gpu_dcache_req_inter #(.NUM_REQUESTS(`INUM_REQUESTS)) VX_icache_req();
VX_gpu_dcache_dram_req_inter #(.BANK_LINE_WORDS(`IBANK_LINE_WORDS)) vx_gpu_icache_dram_req();
VX_gpu_dcache_dram_rsp_inter #(.BANK_LINE_WORDS(`IBANK_LINE_WORDS)) vx_gpu_icache_dram_res();
VX_gpu_dcache_dram_req_inter #(.BANK_LINE_WORDS(`IBANK_LINE_WORDS)) VX_gpu_icache_dram_req();
VX_gpu_dcache_dram_res_inter #(.BANK_LINE_WORDS(`IBANK_LINE_WORDS)) VX_gpu_icache_dram_res();
assign vx_gpu_icache_dram_res.dram_rsp_valid = I_dram_rsp_valid;
assign vx_gpu_icache_dram_res.dram_rsp_addr = I_dram_rsp_addr;
assign I_dram_req_write = vx_gpu_icache_dram_req.dram_req_write;
assign I_dram_req_read = vx_gpu_icache_dram_req.dram_req_read;
assign I_dram_req_addr = vx_gpu_icache_dram_req.dram_req_addr;
assign I_dram_rsp_ready = vx_gpu_icache_dram_req.dram_rsp_ready;
assign VX_gpu_icache_dram_res.dram_fill_rsp = I_dram_fill_rsp;
assign VX_gpu_icache_dram_res.dram_fill_rsp_addr = I_dram_fill_rsp_addr;
assign I_dram_req = VX_gpu_icache_dram_req.dram_req;
assign I_dram_req_write = VX_gpu_icache_dram_req.dram_req_write;
assign I_dram_req_read = VX_gpu_icache_dram_req.dram_req_read;
assign I_dram_req_addr = VX_gpu_icache_dram_req.dram_req_addr;
assign I_dram_req_size = VX_gpu_icache_dram_req.dram_req_size;
assign I_dram_expected_lat = `ISIMULATED_DRAM_LATENCY_CYCLES;
assign I_dram_fill_accept = VX_gpu_icache_dram_req.dram_fill_accept;
assign VX_gpu_icache_dram_req.dram_req_delay = dram_req_delay;
assign vx_gpu_icache_dram_req.dram_req_full = I_dram_req_full;
genvar j;
generate
for (j = 0; j < `IBANK_LINE_WORDS; j = j + 1) begin
assign VX_gpu_icache_dram_res.dram_fill_rsp_data[j] = I_dram_fill_rsp_data[j * 32 +: 32];
assign I_dram_req_data[j * 32 +: 32] = VX_gpu_icache_dram_req.dram_req_data[j];
assign vx_gpu_icache_dram_res.dram_rsp_data[j] = I_dram_rsp_data[j * 32 +: 32];
assign I_dram_req_data[j * 32 +: 32] = vx_gpu_icache_dram_req.dram_req_data[j];
end
endgenerate
/////////////////////////////////////////////////////////////////////////
// Front-end to Back-end
VX_frE_to_bckE_req_inter VX_bckE_req(); // New instruction request to EXE/MEM
VX_frE_to_bckE_req_inter vx_bckE_req(); // New instruction request to EXE/MEM
// Back-end to Front-end
VX_wb_inter VX_writeback_inter(); // Writeback to GPRs
VX_branch_response_inter VX_branch_rsp(); // Branch Resolution to Fetch
VX_jal_response_inter VX_jal_rsp(); // Jump resolution to Fetch
VX_wb_inter vx_writeback_inter(); // Writeback to GPRs
VX_branch_response_inter vx_branch_rsp(); // Branch Resolution to Fetch
VX_jal_response_inter vx_jal_rsp(); // Jump resolution to Fetch
// CSR Buses
// VX_csr_write_request_inter VX_csr_w_req();
// VX_csr_write_request_inter vx_csr_w_req();
VX_warp_ctl_inter vx_warp_ctl();
VX_gpu_snp_req_rsp vx_gpu_icache_snp_req();
VX_gpu_snp_req_rsp vx_gpu_dcache_snp_req();
VX_warp_ctl_inter VX_warp_ctl();
VX_gpu_snp_req_rsp VX_gpu_icache_snp_req();
VX_gpu_snp_req_rsp VX_gpu_dcache_snp_req();
assign VX_gpu_icache_snp_req.snp_req = I_snp_req;
assign VX_gpu_icache_snp_req.snp_req_addr = I_snp_req_addr;
assign I_snp_req_delay = VX_gpu_icache_snp_req.snp_delay;
assign VX_gpu_dcache_snp_req.snp_req = snp_req;
assign VX_gpu_dcache_snp_req.snp_req_addr = snp_req_addr;
assign snp_req_delay = VX_gpu_dcache_snp_req.snp_delay;
assign vx_gpu_dcache_snp_req.snp_req_valid = snp_req_valid;
assign vx_gpu_dcache_snp_req.snp_req_addr = snp_req_addr;
assign snp_req_full = vx_gpu_dcache_snp_req.snp_req_full;
VX_front_end vx_front_end(
.clk (clk),
.reset (reset),
.VX_warp_ctl (VX_warp_ctl),
.VX_bckE_req (VX_bckE_req),
.vx_warp_ctl (vx_warp_ctl),
.vx_bckE_req (vx_bckE_req),
.schedule_delay (schedule_delay),
.VX_icache_rsp (VX_icache_rsp),
.VX_icache_req (VX_icache_req),
.VX_jal_rsp (VX_jal_rsp),
.VX_branch_rsp (VX_branch_rsp),
.fetch_ebreak (out_ebreak_unqual)
.vx_icache_rsp (vx_icache_rsp),
.vx_icache_req (vx_icache_req),
.vx_jal_rsp (vx_jal_rsp),
.vx_branch_rsp (vx_branch_rsp),
.fetch_ebreak (out_ebreak)
);
VX_scheduler schedule(
.clk (clk),
.reset (reset),
.memory_delay (memory_delay),
.exec_delay (exec_delay),
.gpr_stage_delay (gpr_stage_delay),
.VX_bckE_req (VX_bckE_req),
.VX_writeback_inter(VX_writeback_inter),
.schedule_delay (schedule_delay),
.is_empty (scheduler_empty)
.clk (clk),
.reset (reset),
.memory_delay (memory_delay),
.exec_delay (exec_delay),
.gpr_stage_delay (gpr_stage_delay),
.vx_bckE_req (vx_bckE_req),
.vx_writeback_inter (vx_writeback_inter),
.schedule_delay (schedule_delay),
.is_empty (scheduler_empty)
);
VX_back_end #(.CORE_ID(CORE_ID)) vx_back_end(
.clk (clk),
.reset (reset),
.schedule_delay (schedule_delay),
.VX_warp_ctl (VX_warp_ctl),
.VX_bckE_req (VX_bckE_req),
.VX_jal_rsp (VX_jal_rsp),
.VX_branch_rsp (VX_branch_rsp),
.VX_dcache_rsp (VX_dcache_rsp),
.VX_dcache_req (VX_dcache_req),
.VX_writeback_inter (VX_writeback_inter),
.vx_warp_ctl (vx_warp_ctl),
.vx_bckE_req (vx_bckE_req),
.vx_jal_rsp (vx_jal_rsp),
.vx_branch_rsp (vx_branch_rsp),
.vx_dcache_rsp (vx_dcache_rsp),
.vx_dcache_req (vx_dcache_req),
.vx_writeback_inter (vx_writeback_inter),
.out_mem_delay (memory_delay),
.out_exec_delay (exec_delay),
.gpr_stage_delay (gpr_stage_delay)
);
VX_dmem_controller VX_dmem_controller(
VX_dmem_controller vx_dmem_controller(
.clk (clk),
.reset (reset),
// Dram <-> Dcache
.VX_gpu_dcache_dram_req (VX_gpu_dcache_dram_req),
.VX_gpu_dcache_dram_res (VX_gpu_dcache_dram_res),
.VX_gpu_dcache_snp_req (VX_gpu_dcache_snp_req),
.vx_gpu_dcache_dram_req (vx_gpu_dcache_dram_req),
.vx_gpu_dcache_dram_res (vx_gpu_dcache_dram_res),
.vx_gpu_dcache_snp_req (vx_gpu_dcache_snp_req),
// Dram <-> Icache
.VX_gpu_icache_dram_req (VX_gpu_icache_dram_req),
.VX_gpu_icache_dram_res (VX_gpu_icache_dram_res),
.VX_gpu_icache_snp_req (VX_gpu_icache_snp_req),
.vx_gpu_icache_dram_req (vx_gpu_icache_dram_req),
.vx_gpu_icache_dram_res (vx_gpu_icache_dram_res),
.vx_gpu_icache_snp_req (vx_gpu_icache_snp_req),
// Core <-> Icache
.VX_icache_req (VX_icache_req),
.VX_icache_rsp (VX_icache_rsp),
.vx_icache_req (vx_icache_req),
.vx_icache_rsp (vx_icache_rsp),
// Core <-> Dcache
.VX_dcache_req (VX_dcache_req_qual),
.VX_dcache_rsp (VX_dcache_rsp)
.vx_dcache_req (vx_dcache_req_qual),
.vx_dcache_rsp (vx_dcache_rsp)
);
// VX_csr_handler vx_csr_handler(
// .clk (clk),
// .in_decode_csr_address(decode_csr_address),
// .VX_csr_w_req (VX_csr_w_req),
// .in_wb_valid (VX_writeback_inter.wb_valid[0]),
// .vx_csr_w_req (vx_csr_w_req),
// .in_wb_valid (vx_writeback_inter.wb_valid[0]),
// .out_decode_csr_data (csr_decode_csr_data)
// );

View File

@@ -15,57 +15,48 @@ module Vortex_Cluster
output wire[`NUM_CORES_PER_CLUSTER-1:0][31:0] io_data,
// DRAM Req
output wire out_dram_req,
output wire out_dram_req_write,
output wire out_dram_req_read,
output wire [31:0] out_dram_req_addr,
output wire [31:0] out_dram_req_size,
output wire [31:0] out_dram_req_data[`DBANK_LINE_WORDS-1:0],
output wire [31:0] out_dram_expected_lat,
input wire out_dram_req_delay,
output wire dram_req_read,
output wire dram_req_write,
output wire [31:0] dram_req_addr,
output wire [`DBANK_LINE_SIZE-1:0] dram_req_data,
input wire dram_req_full,
// DRAM Res
output wire out_dram_fill_accept,
input wire out_dram_fill_rsp,
input wire [31:0] out_dram_fill_rsp_addr,
input wire [31:0] out_dram_fill_rsp_data[`DBANK_LINE_WORDS-1:0],
// DRAM Rsp
input wire dram_rsp_valid,
input wire [31:0] dram_rsp_addr,
input wire [`DBANK_LINE_SIZE-1:0] dram_rsp_data,
output wire dram_rsp_ready,
// LLC Snooping
input wire llc_snp_req,
input wire llc_snp_req_valid,
input wire[31:0] llc_snp_req_addr,
output wire llc_snp_req_delay,
output wire llc_snp_req_full,
output wire out_ebreak
);
// DRAM Dcache Req
wire[`NUM_CORES_PER_CLUSTER-1:0] per_core_dram_req;
wire[`NUM_CORES_PER_CLUSTER-1:0] per_core_dram_req_write;
wire[`NUM_CORES_PER_CLUSTER-1:0] per_core_dram_req_read;
wire[`NUM_CORES_PER_CLUSTER-1:0] per_core_dram_req_write;
wire[`NUM_CORES_PER_CLUSTER-1:0] [31:0] per_core_dram_req_addr;
wire[`NUM_CORES_PER_CLUSTER-1:0] [31:0] per_core_dram_req_size;
wire[`NUM_CORES_PER_CLUSTER-1:0][`DBANK_LINE_WORDS-1:0][31:0] per_core_dram_req_data;
wire[`NUM_CORES_PER_CLUSTER-1:0] [31:0] per_core_dram_expected_lat;
// DRAM Dcache Res
wire[`NUM_CORES_PER_CLUSTER-1:0] per_core_dram_fill_accept;
wire[`NUM_CORES_PER_CLUSTER-1:0] per_core_dram_fill_rsp;
wire[`NUM_CORES_PER_CLUSTER-1:0] [31:0] per_core_dram_fill_rsp_addr;
wire[`NUM_CORES_PER_CLUSTER-1:0][`DBANK_LINE_WORDS-1:0][31:0] per_core_dram_fill_rsp_data;
// DRAM Dcache Rsp
wire[`NUM_CORES_PER_CLUSTER-1:0] per_core_dram_rsp_valid;
wire[`NUM_CORES_PER_CLUSTER-1:0] [31:0] per_core_dram_rsp_addr;
wire[`NUM_CORES_PER_CLUSTER-1:0][`DBANK_LINE_WORDS-1:0][31:0] per_core_dram_rsp_data;
wire[`NUM_CORES_PER_CLUSTER-1:0] per_core_dram_rsp_ready;
// DRAM Icache Req
wire[`NUM_CORES_PER_CLUSTER-1:0] per_core_I_dram_req;
wire[`NUM_CORES_PER_CLUSTER-1:0] per_core_I_dram_req_write;
wire[`NUM_CORES_PER_CLUSTER-1:0] per_core_I_dram_req_read;
wire[`NUM_CORES_PER_CLUSTER-1:0] per_core_I_dram_req_write;
wire[`NUM_CORES_PER_CLUSTER-1:0] [31:0] per_core_I_dram_req_addr;
wire[`NUM_CORES_PER_CLUSTER-1:0] [31:0] per_core_I_dram_req_size;
wire[`NUM_CORES_PER_CLUSTER-1:0][`IBANK_LINE_WORDS-1:0][31:0] per_core_I_dram_req_data;
wire[`NUM_CORES_PER_CLUSTER-1:0] [31:0] per_core_I_dram_expected_lat;
// DRAM Icache Res
wire[`NUM_CORES_PER_CLUSTER-1:0] per_core_I_dram_fill_accept;
wire[`NUM_CORES_PER_CLUSTER-1:0] per_core_I_dram_fill_rsp;
wire[`NUM_CORES_PER_CLUSTER-1:0] [31:0] per_core_I_dram_fill_rsp_addr;
wire[`NUM_CORES_PER_CLUSTER-1:0][`IBANK_LINE_WORDS-1:0][31:0] per_core_I_dram_fill_rsp_data;
// DRAM Icache Rsp
wire[`NUM_CORES_PER_CLUSTER-1:0] per_core_I_dram_rsp_valid;
wire[`NUM_CORES_PER_CLUSTER-1:0] [31:0] per_core_I_dram_rsp_addr;
wire[`NUM_CORES_PER_CLUSTER-1:0][`IBANK_LINE_WORDS-1:0][31:0] per_core_I_dram_rsp_data;
wire[`NUM_CORES_PER_CLUSTER-1:0] per_core_I_dram_rsp_ready;
// Out ebreak
wire[`NUM_CORES_PER_CLUSTER-1:0] per_core_out_ebreak;
@@ -75,9 +66,9 @@ module Vortex_Cluster
wire l2c_core_accept;
wire snp_fwd;
wire snp_fwd_valid;
wire[31:0] snp_fwd_addr;
wire[`NUM_CORES_PER_CLUSTER-1:0] snp_fwd_delay;
wire[`NUM_CORES_PER_CLUSTER-1:0] snp_fwd_full;
assign out_ebreak = (&per_core_out_ebreak);
@@ -99,36 +90,28 @@ module Vortex_Cluster
.reset (reset),
.io_valid (per_core_io_valid [curr_core]),
.io_data (per_core_io_data [curr_core]),
.dram_req (per_core_dram_req [curr_core]),
.dram_req_write (per_core_dram_req_write [curr_core]),
.dram_req_read (per_core_dram_req_read [curr_core]),
.dram_req_write (per_core_dram_req_write [curr_core]),
.dram_req_addr (per_core_dram_req_addr [curr_core]),
.dram_req_size (per_core_dram_req_size [curr_core]),
.dram_req_data (curr_core_dram_req_data ),
.dram_expected_lat (per_core_dram_expected_lat [curr_core]),
.dram_fill_accept (per_core_dram_fill_accept [curr_core]),
.dram_fill_rsp (per_core_dram_fill_rsp [curr_core]),
.dram_fill_rsp_addr (per_core_dram_fill_rsp_addr [curr_core]),
.dram_fill_rsp_data (per_core_dram_fill_rsp_data [curr_core]),
.I_dram_req (per_core_I_dram_req [curr_core]),
.I_dram_req_write (per_core_I_dram_req_write [curr_core]),
.dram_req_full (l2c_core_accept ),
.dram_rsp_valid (per_core_dram_rsp_valid [curr_core]),
.dram_rsp_addr (per_core_dram_rsp_addr [curr_core]),
.dram_rsp_data (per_core_dram_rsp_data [curr_core]),
.dram_rsp_ready (per_core_dram_rsp_ready [curr_core]),
.I_dram_req_read (per_core_I_dram_req_read [curr_core]),
.I_dram_req_addr (per_core_I_dram_req_addr [curr_core]),
.I_dram_req_size (per_core_I_dram_req_size [curr_core]),
.I_dram_req_write (per_core_I_dram_req_write [curr_core]),
.I_dram_req_addr (per_core_I_dram_req_addr [curr_core]),
.I_dram_req_data (curr_core_I_dram_req_data ),
.I_dram_expected_lat (per_core_I_dram_expected_lat [curr_core]),
.I_dram_fill_accept (per_core_I_dram_fill_accept [curr_core]),
.I_dram_fill_rsp (per_core_I_dram_fill_rsp [curr_core]),
.I_dram_fill_rsp_addr (per_core_I_dram_fill_rsp_addr[curr_core]),
.I_dram_fill_rsp_data (per_core_I_dram_fill_rsp_data[curr_core]),
.dram_req_delay (l2c_core_accept ),
.out_ebreak (per_core_out_ebreak [curr_core]),
.snp_req (snp_fwd),
.I_dram_req_full (l2c_core_accept ),
.I_dram_rsp_valid (per_core_I_dram_rsp_valid [curr_core]),
.I_dram_rsp_addr (per_core_I_dram_rsp_addr [curr_core]),
.I_dram_rsp_data (per_core_I_dram_rsp_data [curr_core]),
.I_dram_rsp_ready (per_core_I_dram_rsp_ready [curr_core]),
.snp_req_valid (snp_fwd_valid),
.snp_req_addr (snp_fwd_addr),
.snp_req_delay (snp_fwd_delay[curr_core]),
.I_snp_req (0),
.I_snp_req_addr (),
.I_snp_req_delay ()
.snp_req_full (snp_fwd_full [curr_core]),
.out_ebreak (per_core_out_ebreak [curr_core])
);
assign per_core_dram_req_data [curr_core] = curr_core_dram_req_data;
@@ -137,27 +120,28 @@ module Vortex_Cluster
endgenerate
//////////////////// L2 Cache ////////////////////
wire[`L2NUM_REQUESTS-1:0] l2c_core_req;
wire[`L2NUM_REQUESTS-1:0][2:0] l2c_core_req_mem_write;
wire[`L2NUM_REQUESTS-1:0][2:0] l2c_core_req_mem_read;
wire[`L2NUM_REQUESTS-1:0][31:0] l2c_core_req_addr;
wire[`L2NUM_REQUESTS-1:0][`IBANK_LINE_WORDS-1:0][31:0] l2c_core_req_data;
wire[`L2NUM_REQUESTS-1:0][1:0] l2c_core_req_wb;
wire[`L2NUM_REQUESTS-1:0] l2c_core_req_valid;
wire[`L2NUM_REQUESTS-1:0][2:0] l2c_core_req_mem_write;
wire[`L2NUM_REQUESTS-1:0][2:0] l2c_core_req_mem_read;
wire[`L2NUM_REQUESTS-1:0][31:0] l2c_core_req_addr;
wire[`L2NUM_REQUESTS-1:0][`IBANK_LINE_WORDS-1:0][31:0] l2c_core_req_data;
wire[`L2NUM_REQUESTS-1:0][1:0] l2c_core_req_wb;
wire[`L2NUM_REQUESTS-1:0] l2c_core_no_wb_slot;
wire[`L2NUM_REQUESTS-1:0] l2c_core_no_wb_slot;
wire[`L2NUM_REQUESTS-1:0] l2c_wb;
wire[`L2NUM_REQUESTS-1:0] [31:0] l2c_wb_addr;
wire[`L2NUM_REQUESTS-1:0][`IBANK_LINE_WORDS-1:0][31:0] l2c_wb_data;
wire[`L2NUM_REQUESTS-1:0] l2c_wb;
wire[`L2NUM_REQUESTS-1:0] [31:0] l2c_wb_addr;
wire[`L2NUM_REQUESTS-1:0][`IBANK_LINE_WORDS-1:0][31:0] l2c_wb_data;
wire[`DBANK_LINE_WORDS-1:0][31:0] dram_req_data_port;
wire[`DBANK_LINE_WORDS-1:0][31:0] dram_fill_rsp_data_port;
wire[`DBANK_LINE_WORDS-1:0][31:0] dram_req_data_port;
wire[`DBANK_LINE_WORDS-1:0][31:0] dram_rsp_data_port;
genvar llb_index;
generate
for (llb_index = 0; llb_index < `DBANK_LINE_WORDS; llb_index=llb_index+1) begin
assign out_dram_req_data [llb_index] = dram_req_data_port[llb_index];
assign dram_fill_rsp_data_port[llb_index] = out_dram_fill_rsp_data[llb_index];
assign dram_req_data [llb_index * `DWORD_SIZE_BITS +: `DWORD_SIZE_BITS] = dram_req_data_port[llb_index];
assign dram_rsp_data_port [llb_index] = dram_rsp_data[llb_index * `DWORD_SIZE_BITS +: `DWORD_SIZE_BITS];
end
endgenerate
@@ -165,9 +149,9 @@ module Vortex_Cluster
generate
for (l2c_curr_core = 0; l2c_curr_core < `L2NUM_REQUESTS; l2c_curr_core=l2c_curr_core+2) begin
// Core Request
assign l2c_core_req [l2c_curr_core] = per_core_dram_req [(l2c_curr_core/2)];
assign l2c_core_req [l2c_curr_core+1] = per_core_I_dram_req[(l2c_curr_core/2)];
assign l2c_core_req_valid [l2c_curr_core] = (per_core_dram_req_read[(l2c_curr_core/2)] | per_core_dram_req_write[(l2c_curr_core/2)]);
assign l2c_core_req_valid [l2c_curr_core+1] = (per_core_I_dram_req_read[(l2c_curr_core/2)] | per_core_I_dram_req_write[(l2c_curr_core/2)]);
assign l2c_core_req_mem_write [l2c_curr_core] = per_core_dram_req_write[(l2c_curr_core/2)] ? `SW_MEM_WRITE : `NO_MEM_WRITE;
assign l2c_core_req_mem_write [l2c_curr_core+1] = `NO_MEM_WRITE; // I caches don't write
@@ -184,23 +168,21 @@ module Vortex_Cluster
assign l2c_core_req_data [l2c_curr_core+1] = per_core_I_dram_req_data[(l2c_curr_core/2)];
// Core can't accept Response
assign l2c_core_no_wb_slot [l2c_curr_core] = ~per_core_dram_fill_accept [(l2c_curr_core/2)];
assign l2c_core_no_wb_slot [l2c_curr_core+1] = ~per_core_I_dram_fill_accept[(l2c_curr_core/2)];
assign l2c_core_no_wb_slot [l2c_curr_core] = ~per_core_dram_rsp_ready [(l2c_curr_core/2)];
assign l2c_core_no_wb_slot [l2c_curr_core+1] = ~per_core_I_dram_rsp_ready[(l2c_curr_core/2)];
// Cache Fill Response
assign per_core_dram_fill_rsp [(l2c_curr_core/2)] = l2c_wb[l2c_curr_core];
assign per_core_I_dram_fill_rsp [(l2c_curr_core/2)] = l2c_wb[l2c_curr_core+1];
assign per_core_dram_rsp_valid [(l2c_curr_core/2)] = l2c_wb[l2c_curr_core];
assign per_core_I_dram_rsp_valid [(l2c_curr_core/2)] = l2c_wb[l2c_curr_core+1];
assign per_core_dram_fill_rsp_data[(l2c_curr_core/2)] = l2c_wb_data[l2c_curr_core];
assign per_core_I_dram_fill_rsp_data[(l2c_curr_core/2)] = l2c_wb_data[l2c_curr_core+1];
assign per_core_dram_rsp_data [(l2c_curr_core/2)] = l2c_wb_data[l2c_curr_core];
assign per_core_I_dram_rsp_data [(l2c_curr_core/2)] = l2c_wb_data[l2c_curr_core+1];
assign per_core_dram_fill_rsp_addr[(l2c_curr_core/2)] = l2c_wb_addr[l2c_curr_core];
assign per_core_I_dram_fill_rsp_addr[(l2c_curr_core/2)] = l2c_wb_addr[l2c_curr_core+1];
assign per_core_dram_rsp_addr [(l2c_curr_core/2)] = l2c_wb_addr[l2c_curr_core];
assign per_core_I_dram_rsp_addr [(l2c_curr_core/2)] = l2c_wb_addr[l2c_curr_core+1];
end
endgenerate
wire dram_snp_full;
wire dram_req_because_of_wb;
VX_cache #(
.CACHE_SIZE_BYTES (`L2CACHE_SIZE_BYTES),
.BANK_LINE_SIZE_BYTES (`L2BANK_LINE_SIZE_BYTES),
@@ -223,64 +205,60 @@ module Vortex_Cluster
.FILL_INVALIDAOR_SIZE (`L2FILL_INVALIDAOR_SIZE),
.SIMULATED_DRAM_LATENCY_CYCLES(`L2SIMULATED_DRAM_LATENCY_CYCLES)
) gpu_l2cache (
.clk (clk),
.reset (reset),
.clk (clk),
.reset (reset),
// Core Req (DRAM Fills/WB) To L2 Request
.core_req_valid (l2c_core_req),
.core_req_addr (l2c_core_req_addr),
.core_req_writedata({l2c_core_req_data}),
.core_req_mem_read (l2c_core_req_mem_read),
.core_req_mem_write(l2c_core_req_mem_write),
.core_req_rd (0),
.core_req_wb (l2c_core_req_wb),
.core_req_warp_num (0),
.core_req_pc (0),
.core_req_valid (l2c_core_req_valid),
.core_req_mem_read (l2c_core_req_mem_read),
.core_req_mem_write (l2c_core_req_mem_write),
.core_req_addr (l2c_core_req_addr),
.core_req_writedata ({l2c_core_req_data}),
.core_req_rd (0),
.core_req_wb (l2c_core_req_wb),
.core_req_warp_num (0),
.core_req_pc (0),
// L2 can't accept Core Request
.delay_req (l2c_core_accept),
.delay_req (l2c_core_accept),
// Core can't accept L2 Request
.core_no_wb_slot (|l2c_core_no_wb_slot),
.core_no_wb_slot (|l2c_core_no_wb_slot),
// Core Writeback
.core_wb_valid (l2c_wb),
.core_wb_req_rd (),
.core_wb_req_wb (),
.core_wb_warp_num (),
.core_wb_readdata ({l2c_wb_data}),
.core_wb_address (l2c_wb_addr),
.core_wb_pc (),
.core_wb_valid (l2c_wb),
/* verilator lint_off PINCONNECTEMPTY */
.core_wb_req_rd (),
.core_wb_req_wb (),
.core_wb_warp_num (),
.core_wb_pc (),
/* verilator lint_on PINCONNECTEMPTY */
.core_wb_readdata ({l2c_wb_data}),
.core_wb_address (l2c_wb_addr),
// L2 Cache DRAM Fill response
.dram_fill_rsp (out_dram_fill_rsp),
.dram_fill_rsp_addr(out_dram_fill_rsp_addr),
.dram_fill_rsp_data({dram_fill_rsp_data_port}),
.dram_rsp_valid (dram_rsp_valid),
.dram_rsp_addr (dram_rsp_addr),
.dram_rsp_data ({dram_rsp_data_port}),
// L2 Cache can't accept Fill Response
.dram_fill_accept (out_dram_fill_accept),
.dram_rsp_ready (dram_rsp_ready),
// L2 Cache DRAM Fill Request
.dram_req (out_dram_req),
.dram_req_write (out_dram_req_write),
.dram_req_read (out_dram_req_read),
.dram_req_addr (out_dram_req_addr),
.dram_req_size (out_dram_req_size),
.dram_req_data ({dram_req_data_port}),
.dram_req_delay (out_dram_req_delay),
// Snoop Response
.dram_req_because_of_wb(dram_req_because_of_wb),
.dram_snp_full (dram_snp_full),
.dram_req_read (dram_req_read),
.dram_req_write (dram_req_write),
.dram_req_addr (dram_req_addr),
.dram_req_data ({dram_req_data_port}),
.dram_req_full (dram_req_full),
// Snoop Request
.snp_req (llc_snp_req),
.snp_req_addr (llc_snp_req_addr),
.snp_req_delay (llc_snp_req_delay),
.snp_req_valid (llc_snp_req_valid),
.snp_req_addr (llc_snp_req_addr),
.snp_req_full (llc_snp_req_full),
.snp_fwd (snp_fwd),
.snp_fwd_addr (snp_fwd_addr),
.snp_fwd_delay (|snp_fwd_delay)
.snp_fwd_valid (snp_fwd_valid),
.snp_fwd_addr (snp_fwd_addr),
.snp_fwd_full (|snp_fwd_full)
);
endmodule

View File

@@ -11,33 +11,26 @@ module Vortex_Socket (
output wire io_valid[`NUM_CORES-1:0],
output wire[31:0] io_data [`NUM_CORES-1:0],
output wire[31:0] number_cores,
// DRAM Req
output wire out_dram_req,
output wire out_dram_req_write,
output wire out_dram_req_read,
output wire [31:0] out_dram_req_addr,
output wire [31:0] out_dram_req_size,
output wire [31:0] out_dram_req_data[`DBANK_LINE_WORDS-1:0],
output wire [31:0] out_dram_expected_lat,
input wire out_dram_req_delay,
output wire dram_req_read,
output wire dram_req_write,
output wire [31:0] dram_req_addr,
output wire [`DBANK_LINE_SIZE-1:0] dram_req_data,
input wire dram_req_full,
// DRAM Res
output wire out_dram_fill_accept,
input wire out_dram_fill_rsp,
input wire [31:0] out_dram_fill_rsp_addr,
input wire [31:0] out_dram_fill_rsp_data[`DBANK_LINE_WORDS-1:0],
// DRAM Rsp
input wire dram_rsp_valid,
input wire [31:0] dram_rsp_addr,
input wire [`DBANK_LINE_SIZE-1:0] dram_rsp_data,
output wire dram_rsp_ready,
// LLC Snooping
input wire llc_snp_req,
input wire llc_snp_req_valid,
input wire[31:0] llc_snp_req_addr,
output wire llc_snp_req_delay,
output wire llc_snp_req_full,
output wire out_ebreak
);
assign number_cores = `NUM_CORES;
if (`NUM_CLUSTERS == 1) begin
wire[`NUM_CORES-1:0] cluster_io_valid;
@@ -51,59 +44,55 @@ module Vortex_Socket (
end
Vortex_Cluster #(.CLUSTER_ID(0)) Vortex_Cluster(
.clk (clk),
.reset (reset),
.io_valid (cluster_io_valid),
.io_data (cluster_io_data),
.clk (clk),
.reset (reset),
.io_valid (cluster_io_valid),
.io_data (cluster_io_data),
.out_dram_req (out_dram_req),
.out_dram_req_write (out_dram_req_write),
.out_dram_req_read (out_dram_req_read),
.out_dram_req_addr (out_dram_req_addr),
.out_dram_req_size (out_dram_req_size),
.out_dram_req_data (out_dram_req_data),
.out_dram_expected_lat (out_dram_expected_lat),
.out_dram_req_delay (out_dram_req_delay),
.dram_req_read (dram_req_read),
.dram_req_write (dram_req_write),
.dram_req_addr (dram_req_addr),
.dram_req_data (dram_req_data),
.dram_req_full (dram_req_full),
.out_dram_fill_accept (out_dram_fill_accept),
.out_dram_fill_rsp (out_dram_fill_rsp),
.out_dram_fill_rsp_addr(out_dram_fill_rsp_addr),
.out_dram_fill_rsp_data(out_dram_fill_rsp_data),
.dram_rsp_valid (dram_rsp_valid),
.dram_rsp_addr (dram_rsp_addr),
.dram_rsp_data (dram_rsp_data),
.dram_rsp_ready (dram_rsp_ready),
.llc_snp_req (llc_snp_req),
.llc_snp_req_addr (llc_snp_req_addr),
.llc_snp_req_delay (llc_snp_req_delay),
.out_ebreak (out_ebreak)
.llc_snp_req_valid (llc_snp_req_valid),
.llc_snp_req_addr (llc_snp_req_addr),
.llc_snp_req_full (llc_snp_req_full),
.out_ebreak (out_ebreak)
);
end else begin
wire snp_fwd;
wire[31:0] snp_fwd_addr;
wire[`NUM_CLUSTERS-1:0] snp_fwd_delay;
wire snp_fwd_valid;
wire[31:0] snp_fwd_addr;
wire[`NUM_CLUSTERS-1:0] snp_fwd_full;
wire[`NUM_CLUSTERS-1:0] per_cluster_out_ebreak;
assign out_ebreak = (&per_cluster_out_ebreak);
// // DRAM Dcache Req
wire[`NUM_CLUSTERS-1:0] per_cluster_dram_req;
wire[`NUM_CLUSTERS-1:0] per_cluster_dram_req_valid;
wire[`NUM_CLUSTERS-1:0] per_cluster_dram_req_write;
wire[`NUM_CLUSTERS-1:0] per_cluster_dram_req_read;
wire[`NUM_CLUSTERS-1:0] [31:0] per_cluster_dram_req_addr;
wire[`NUM_CLUSTERS-1:0] [31:0] per_cluster_dram_req_size;
wire[`NUM_CLUSTERS-1:0] [31:0] per_cluster_dram_expected_lat;
wire[`NUM_CLUSTERS-1:0][`DBANK_LINE_WORDS-1:0][31:0] per_cluster_dram_req_data;
wire[31:0] per_cluster_dram_req_data_up[`NUM_CLUSTERS-1:0][`DBANK_LINE_WORDS-1:0];
wire l3c_core_accept;
wire l3c_core_req_full;
// // DRAM Dcache Res
wire[`NUM_CLUSTERS-1:0] per_cluster_dram_fill_accept;
wire[`NUM_CLUSTERS-1:0] per_cluster_dram_fill_rsp;
wire[`NUM_CLUSTERS-1:0] [31:0] per_cluster_dram_fill_rsp_addr;
wire[`NUM_CLUSTERS-1:0][`DBANK_LINE_WORDS-1:0][31:0] per_cluster_dram_fill_rsp_data;
wire[31:0] per_cluster_dram_fill_rsp_data_up[`NUM_CLUSTERS-1:0][`DBANK_LINE_WORDS-1:0];
// // DRAM Dcache Rsp
wire[`NUM_CLUSTERS-1:0] per_cluster_dram_rsp_ready;
wire[`NUM_CLUSTERS-1:0] per_cluster_dram_rsp_valid;
wire[`NUM_CLUSTERS-1:0] [31:0] per_cluster_dram_rsp_addr;
wire[`NUM_CLUSTERS-1:0][`DBANK_LINE_WORDS-1:0][31:0] per_cluster_dram_rsp_data;
wire[31:0] per_cluster_dram_rsp_data_up[`NUM_CLUSTERS-1:0][`DBANK_LINE_WORDS-1:0];
wire[`NUM_CLUSTERS-1:0][`NUM_CORES_PER_CLUSTER-1:0] per_cluster_io_valid;
wire[`NUM_CLUSTERS-1:0][`NUM_CORES_PER_CLUSTER-1:0][31:0] per_cluster_io_data;
@@ -115,96 +104,83 @@ module Vortex_Socket (
assign io_data [curr_cc+(curr_c*`NUM_CORES_PER_CLUSTER)] = per_cluster_io_data [curr_c][curr_cc];
end
for (curr_word = 0; curr_word < `DBANK_LINE_WORDS; curr_word = curr_word+1) begin
assign per_cluster_dram_req_data [curr_c][curr_word] = per_cluster_dram_req_data_up [curr_c][curr_word];
assign per_cluster_dram_fill_rsp_data_up[curr_c][curr_word] = per_cluster_dram_fill_rsp_data[curr_c][curr_word];
assign per_cluster_dram_rsp_data_up[curr_c][curr_word] = per_cluster_dram_rsp_data[curr_c][curr_word];
end
end
genvar curr_cluster;
for (curr_cluster = 0; curr_cluster < `NUM_CLUSTERS; curr_cluster=curr_cluster+1) begin
Vortex_Cluster #(.CLUSTER_ID(curr_cluster)) Vortex_Cluster(
.clk (clk),
.reset (reset),
.io_valid (per_cluster_io_valid [curr_cluster]),
.io_data (per_cluster_io_data [curr_cluster]),
.clk (clk),
.reset (reset),
.io_valid (per_cluster_io_valid [curr_cluster]),
.io_data (per_cluster_io_data [curr_cluster]),
.out_dram_req (per_cluster_dram_req [curr_cluster]),
.out_dram_req_write (per_cluster_dram_req_write [curr_cluster]),
.out_dram_req_read (per_cluster_dram_req_read [curr_cluster]),
.out_dram_req_addr (per_cluster_dram_req_addr [curr_cluster]),
.out_dram_req_size (per_cluster_dram_req_size [curr_cluster]),
.out_dram_req_data (per_cluster_dram_req_data_up [curr_cluster]),
.out_dram_expected_lat (per_cluster_dram_expected_lat [curr_cluster]),
.out_dram_req_delay (l3c_core_accept),
.dram_req_write (per_cluster_dram_req_write [curr_cluster]),
.dram_req_read (per_cluster_dram_req_read [curr_cluster]),
.dram_req_addr (per_cluster_dram_req_addr [curr_cluster]),
.dram_req_data (per_cluster_dram_req_data_up [curr_cluster]),
.dram_req_full (l3c_core_req_full),
.out_dram_fill_accept (per_cluster_dram_fill_accept [curr_cluster]),
.out_dram_fill_rsp (per_cluster_dram_fill_rsp [curr_cluster]),
.out_dram_fill_rsp_addr(per_cluster_dram_fill_rsp_addr [curr_cluster]),
.out_dram_fill_rsp_data(per_cluster_dram_fill_rsp_data_up[curr_cluster]),
.dram_rsp_valid (per_cluster_dram_rsp_valid [curr_cluster]),
.dram_rsp_addr (per_cluster_dram_rsp_addr [curr_cluster]),
.dram_rsp_data (per_cluster_dram_rsp_data_up [curr_cluster]),
.dram_rsp_ready (per_cluster_dram_rsp_ready [curr_cluster]),
.llc_snp_req (snp_fwd),
.llc_snp_req_addr (snp_fwd_addr),
.llc_snp_req_delay (snp_fwd_delay[curr_cluster]),
.llc_snp_req_valid (snp_fwd_valid),
.llc_snp_req_addr (snp_fwd_addr),
.llc_snp_req_full (snp_fwd_full[curr_cluster]),
.out_ebreak (per_cluster_out_ebreak [curr_cluster])
.out_ebreak (per_cluster_out_ebreak [curr_cluster])
);
end
//////////////////// L3 Cache ////////////////////
wire[`L3NUM_REQUESTS-1:0] l3c_core_req;
wire[`L3NUM_REQUESTS-1:0][2:0] l3c_core_req_mem_write;
wire[`L3NUM_REQUESTS-1:0][2:0] l3c_core_req_mem_read;
wire[`L3NUM_REQUESTS-1:0][31:0] l3c_core_req_addr;
wire[`L3NUM_REQUESTS-1:0][`IBANK_LINE_WORDS-1:0][31:0] l3c_core_req_data;
wire[`L3NUM_REQUESTS-1:0][1:0] l3c_core_req_wb;
wire[`L3NUM_REQUESTS-1:0] l3c_core_req_valid;
wire[`L3NUM_REQUESTS-1:0][2:0] l3c_core_req_mem_write;
wire[`L3NUM_REQUESTS-1:0][2:0] l3c_core_req_mem_read;
wire[`L3NUM_REQUESTS-1:0][31:0] l3c_core_req_addr;
wire[`L3NUM_REQUESTS-1:0][`IBANK_LINE_WORDS-1:0][31:0] l3c_core_req_data;
wire[`L3NUM_REQUESTS-1:0][1:0] l3c_core_req_wb;
wire[`L3NUM_REQUESTS-1:0] l3c_core_no_wb_slot;
wire[`L3NUM_REQUESTS-1:0] l3c_core_no_wb_slot;
wire[`L3NUM_REQUESTS-1:0] l3c_wb;
wire[`L3NUM_REQUESTS-1:0] [31:0] l3c_wb_addr;
wire[`L3NUM_REQUESTS-1:0][`IBANK_LINE_WORDS-1:0][31:0] l3c_wb_data;
wire[`L3NUM_REQUESTS-1:0] l3c_wb;
wire[`L3NUM_REQUESTS-1:0] [31:0] l3c_wb_addr;
wire[`L3NUM_REQUESTS-1:0][`IBANK_LINE_WORDS-1:0][31:0] l3c_wb_data;
wire[`DBANK_LINE_WORDS-1:0][31:0] dram_req_data_port;
wire[`DBANK_LINE_WORDS-1:0][31:0] dram_fill_rsp_data_port;
wire[`DBANK_LINE_WORDS-1:0][31:0] dram_req_data_port;
wire[`DBANK_LINE_WORDS-1:0][31:0] dram_rsp_data_port;
// Element-by-element bridge between the socket's DRAM data ports and the
// internal *_port buses, one assignment pair per index in
// [0, `DBANK_LINE_WORDS): request data is driven out from dram_req_data_port,
// and incoming fill/response data is copied into the matching *_port bus.
// NOTE(review): two loops over the same genvar are listed, one using the
// out_dram_* / dram_fill_rsp_* names and one using the dram_req_* / dram_rsp_*
// names — presumably old and new revisions; confirm only one is live.
genvar llb_index;
for (llb_index = 0; llb_index < `DBANK_LINE_WORDS; llb_index=llb_index+1) begin
assign out_dram_req_data [llb_index] = dram_req_data_port[llb_index];
assign dram_fill_rsp_data_port[llb_index] = out_dram_fill_rsp_data[llb_index];
end
for (llb_index = 0; llb_index < `DBANK_LINE_WORDS; llb_index=llb_index+1) begin
assign dram_req_data [llb_index] = dram_req_data_port[llb_index];
assign dram_rsp_data_port[llb_index] = dram_rsp_data[llb_index];
end
//
// Per-requester glue between the clusters and the L3 cache, one iteration per
// index in [0, `L3NUM_REQUESTS):
//   - cluster DRAM requests are translated into l3c_core_req_* signals,
//   - cluster back-pressure is inverted into l3c_core_no_wb_slot,
//   - l3c_wb* outputs are routed back to the cluster as fill/response signals.
// NOTE(review): some targets are assigned under two naming schemes
// (l3c_core_req vs l3c_core_req_valid, per_cluster_dram_fill_rsp* vs
// per_cluster_dram_rsp_*) and l3c_core_req_mem_write / l3c_core_no_wb_slot are
// assigned twice — presumably old and new revisions overlaid; confirm.
genvar l3c_curr_cluster;
for (l3c_curr_cluster = 0; l3c_curr_cluster < `L3NUM_REQUESTS; l3c_curr_cluster=l3c_curr_cluster+1) begin
// Core Request
assign l3c_core_req [l3c_curr_cluster] = per_cluster_dram_req [l3c_curr_cluster];
// A 1-bit write flag is expanded to the cache's mem_write encoding
assign l3c_core_req_mem_write [l3c_curr_cluster] = per_cluster_dram_req_write[l3c_curr_cluster] ? `SW_MEM_WRITE : `NO_MEM_WRITE;
assign l3c_core_req_valid [l3c_curr_cluster] = per_cluster_dram_req_valid[l3c_curr_cluster];
// A 1-bit read flag is expanded to the cache's mem_read encoding
assign l3c_core_req_mem_read [l3c_curr_cluster] = per_cluster_dram_req_read [l3c_curr_cluster] ? `LW_MEM_READ : `NO_MEM_READ;
assign l3c_core_req_mem_write [l3c_curr_cluster] = per_cluster_dram_req_write[l3c_curr_cluster] ? `SW_MEM_WRITE : `NO_MEM_WRITE;
// wb is 1 for reads and 0 otherwise
assign l3c_core_req_wb [l3c_curr_cluster] = per_cluster_dram_req_read [l3c_curr_cluster] ? 1 : 0;
assign l3c_core_req_addr [l3c_curr_cluster] = per_cluster_dram_req_addr [l3c_curr_cluster];
assign l3c_core_req_data [l3c_curr_cluster] = per_cluster_dram_req_data [l3c_curr_cluster];
// Core can't accept Response
assign l3c_core_no_wb_slot [l3c_curr_cluster] = ~per_cluster_dram_fill_accept[l3c_curr_cluster];
assign l3c_core_no_wb_slot [l3c_curr_cluster] = ~per_cluster_dram_rsp_ready[l3c_curr_cluster];
// Cache Fill Response
assign per_cluster_dram_fill_rsp [l3c_curr_cluster] = l3c_wb [l3c_curr_cluster];
assign per_cluster_dram_fill_rsp_data[l3c_curr_cluster] = l3c_wb_data[l3c_curr_cluster];
assign per_cluster_dram_fill_rsp_addr[l3c_curr_cluster] = l3c_wb_addr[l3c_curr_cluster];
assign per_cluster_dram_rsp_valid [l3c_curr_cluster] = l3c_wb [l3c_curr_cluster];
assign per_cluster_dram_rsp_data [l3c_curr_cluster] = l3c_wb_data [l3c_curr_cluster];
assign per_cluster_dram_rsp_addr [l3c_curr_cluster] = l3c_wb_addr [l3c_curr_cluster];
end
wire dram_snp_full;
wire dram_req_because_of_wb;
VX_cache #(
.CACHE_SIZE_BYTES (`L3CACHE_SIZE_BYTES),
.BANK_LINE_SIZE_BYTES (`L3BANK_LINE_SIZE_BYTES),
@@ -230,62 +206,58 @@ module Vortex_Socket (
.clk (clk),
.reset (reset),
// Core Req (DRAM Fills/WB) To L3 Request
.core_req_valid (l3c_core_req),
.core_req_addr (l3c_core_req_addr),
.core_req_writedata({l3c_core_req_data}),
.core_req_mem_read (l3c_core_req_mem_read),
.core_req_mem_write(l3c_core_req_mem_write),
.core_req_rd (0),
.core_req_wb (l3c_core_req_wb),
.core_req_warp_num (0),
.core_req_pc (0),
// Core Req (DRAM Fills/WB) To L3 Request
.core_req_valid (l3c_core_req_valid),
.core_req_mem_read (l3c_core_req_mem_read),
.core_req_mem_write (l3c_core_req_mem_write),
.core_req_addr (l3c_core_req_addr),
.core_req_writedata ({l3c_core_req_data}),
.core_req_rd (0),
.core_req_wb (l3c_core_req_wb),
.core_req_warp_num (0),
.core_req_pc (0),
// L3 can't accept Core Request
.delay_req (l3c_core_accept),
.delay_req (l3c_core_req_full),
// Core can't accept L3 Request
.core_no_wb_slot (|l3c_core_no_wb_slot),
.core_no_wb_slot (|l3c_core_no_wb_slot),
// Core Writeback
.core_wb_valid (l3c_wb),
.core_wb_req_rd (),
.core_wb_req_wb (),
.core_wb_warp_num (),
.core_wb_readdata ({l3c_wb_data}),
.core_wb_address (l3c_wb_addr),
.core_wb_pc (),
.core_wb_valid (l3c_wb),
/* verilator lint_off PINCONNECTEMPTY */
.core_wb_req_rd (),
.core_wb_req_wb (),
.core_wb_warp_num (),
.core_wb_pc (),
/* verilator lint_on PINCONNECTEMPTY */
.core_wb_readdata ({l3c_wb_data}),
.core_wb_address (l3c_wb_addr),
// L3 Cache DRAM Fill response
.dram_fill_rsp (out_dram_fill_rsp),
.dram_fill_rsp_addr(out_dram_fill_rsp_addr),
.dram_fill_rsp_data({dram_fill_rsp_data_port}),
.dram_rsp_valid (dram_rsp_valid),
.dram_rsp_addr (dram_rsp_addr),
.dram_rsp_data ({dram_rsp_data_port}),
// L3 Cache can't accept Fill Response
.dram_fill_accept (out_dram_fill_accept),
.dram_rsp_ready (dram_rsp_ready),
// L3 Cache DRAM Fill Request
.dram_req (out_dram_req),
.dram_req_write (out_dram_req_write),
.dram_req_read (out_dram_req_read),
.dram_req_addr (out_dram_req_addr),
.dram_req_size (out_dram_req_size),
.dram_req_data ({dram_req_data_port}),
.dram_req_delay (out_dram_req_delay),
// Snoop Response
.dram_req_because_of_wb(dram_req_because_of_wb),
.dram_snp_full (dram_snp_full),
.dram_req_write (dram_req_write),
.dram_req_read (dram_req_read),
.dram_req_addr (dram_req_addr),
.dram_req_data ({dram_req_data_port}),
.dram_req_full (dram_req_full),
// Snoop Request
.snp_req (llc_snp_req),
.snp_req_addr (llc_snp_req_addr),
.snp_req_delay (llc_snp_req_delay),
.snp_req_valid (llc_snp_req_valid),
.snp_req_addr (llc_snp_req_addr),
.snp_req_full (llc_snp_req_full),
// Snoop Forward
.snp_fwd (snp_fwd),
.snp_fwd_addr (snp_fwd_addr),
.snp_fwd_delay (|snp_fwd_delay)
.snp_fwd_valid (snp_fwd_valid),
.snp_fwd_addr (snp_fwd_addr),
.snp_fwd_full (|snp_fwd_full)
);
end

View File

@@ -17,29 +17,28 @@ module byte_enabled_simple_dual_port_ram
// Thread Byte Bit
logic [`NUM_THREADS-1:0][3:0][7:0] GPR[31:0];
// initial begin
// for (ini = 0; ini < 32; ini = ini + 1) GPR[ini] = 0;
// end
integer ini;
always @(posedge clk) begin
if (we) begin
integer thread_ind;
for (thread_ind = 0; thread_ind < `NUM_THREADS; thread_ind = thread_ind + 1) begin
if (be[thread_ind]) begin
GPR[waddr][thread_ind][0] <= wdata[thread_ind][7:0];
GPR[waddr][thread_ind][1] <= wdata[thread_ind][15:8];
GPR[waddr][thread_ind][2] <= wdata[thread_ind][23:16];
GPR[waddr][thread_ind][3] <= wdata[thread_ind][31:24];
if (reset) begin
//--
end else begin
if (we) begin
integer thread_ind;
for (thread_ind = 0; thread_ind < `NUM_THREADS; thread_ind = thread_ind + 1) begin
if (be[thread_ind]) begin
GPR[waddr][thread_ind][0] <= wdata[thread_ind][7:0];
GPR[waddr][thread_ind][1] <= wdata[thread_ind][15:8];
GPR[waddr][thread_ind][2] <= wdata[thread_ind][23:16];
GPR[waddr][thread_ind][3] <= wdata[thread_ind][31:24];
end
end
end
end
// $display("^^^^^^^^^^^^^^^^^^^^^^^");
// for (regi = 0; regi <= 31; regi = regi + 1) begin
// for (threadi = 0; threadi < `NUM_THREADS; threadi = threadi + 1) begin
// if (GPR[regi][threadi] != 0) $display("$%d: %h",regi, GPR[regi][threadi]);
// end
// end
end
// $display("^^^^^^^^^^^^^^^^^^^^^^^");
// for (regi = 0; regi <= 31; regi = regi + 1) begin
// for (threadi = 0; threadi < `NUM_THREADS; threadi = threadi + 1) begin
// if (GPR[regi][threadi] != 0) $display("$%d: %h",regi, GPR[regi][threadi]);
// end
// end
end
end
assign q1 = GPR[raddr1];

View File

@@ -1,51 +1,46 @@
`include "VX_define.vh"
module VX_cache_data
#(
parameter NUM_IND = 8,
parameter NUM_WORDS_PER_BLOCK = 4,
parameter TAG_SIZE_START = 0,
parameter TAG_SIZE_END = 16,
parameter IND_SIZE_START = 0,
parameter IND_SIZE_END = 7
)
(
module VX_cache_data #(
parameter NUM_IND = 8,
parameter NUM_WORDS_PER_BLOCK = 4,
parameter TAG_SIZE_START = 0,
parameter TAG_SIZE_END = 16,
parameter IND_SIZE_START = 0,
parameter IND_SIZE_END = 7
) (
input wire clk, rst, // Clock
// `ifdef PARAM
// Addr
input wire[IND_SIZE_END:IND_SIZE_START] addr,
// WE
input wire[NUM_WORDS_PER_BLOCK-1:0][3:0] we,
input wire evict,
// Data
input wire[NUM_WORDS_PER_BLOCK-1:0][31:0] data_write,
input wire[TAG_SIZE_END:TAG_SIZE_START] tag_write,
// `ifdef PARAM
// Addr
input wire[IND_SIZE_END:IND_SIZE_START] addr,
// WE
input wire[NUM_WORDS_PER_BLOCK-1:0][3:0] we,
input wire evict,
// Data
input wire[NUM_WORDS_PER_BLOCK-1:0][31:0] data_write,
input wire[TAG_SIZE_END:TAG_SIZE_START] tag_write,
output wire[TAG_SIZE_END:TAG_SIZE_START] tag_use,
output wire[NUM_WORDS_PER_BLOCK-1:0][31:0] data_use,
output wire valid_use,
output wire dirty_use
// `else
// // Addr
// input wire[7:0] addr,
// // WE
// input wire[NUM_WORDS_PER_BLOCK-1:0][3:0] we,
// input wire evict,
// // Data
// input wire[NUM_WORDS_PER_BLOCK-1:0][31:0] data_write, // Update Data
// input wire[16:0] tag_write,
output wire[TAG_SIZE_END:TAG_SIZE_START] tag_use,
output wire[NUM_WORDS_PER_BLOCK-1:0][31:0] data_use,
output wire valid_use,
output wire dirty_use
// `else
// // Addr
// input wire[7:0] addr,
// // WE
// input wire[NUM_WORDS_PER_BLOCK-1:0][3:0] we,
// input wire evict,
// // Data
// input wire[NUM_WORDS_PER_BLOCK-1:0][31:0] data_write, // Update Data
// input wire[16:0] tag_write,
// output wire[16:0] tag_use,
// output wire[NUM_WORDS_PER_BLOCK-1:0][31:0] data_use,
// output wire valid_use,
// output wire dirty_use
// `endif
// output wire[16:0] tag_use,
// output wire[NUM_WORDS_PER_BLOCK-1:0][31:0] data_use,
// output wire valid_use,
// output wire dirty_use
// `endif
);
//localparam NUM_BANKS = CACHE_BANKS;
//localparam CACHE_BLOCK_PER_BANK = (CACHE_BLOCK / CACHE_BANKS);
// localparam NUM_WORDS_PER_BLOCK = CACHE_BLOCK / (CACHE_BANKS*4);
@@ -53,179 +48,165 @@ module VX_cache_data
wire currently_writing = (|we);
wire update_dirty = ((!dirty_use) && currently_writing) || (evict);
wire dirt_new = evict ? 0 : (|we);
`ifndef SYN
// (3:0) 4 bytes
reg[NUM_WORDS_PER_BLOCK-1:0][3:0][7:0] data[NUM_IND-1:0]; // Actual Data
reg[TAG_SIZE_END:TAG_SIZE_START] tag[NUM_IND-1:0];
reg valid[NUM_IND-1:0];
reg dirty[NUM_IND-1:0];
`ifndef SYN
// 16 bytes
assign data_use = data[addr]; // Read Port
assign tag_use = tag[addr];
assign valid_use = valid[addr];
assign dirty_use = dirty[addr];
// (3:0) 4 bytes
reg[NUM_WORDS_PER_BLOCK-1:0][3:0][7:0] data[NUM_IND-1:0]; // Actual Data
reg[TAG_SIZE_END:TAG_SIZE_START] tag[NUM_IND-1:0];
reg valid[NUM_IND-1:0];
reg dirty[NUM_IND-1:0];
integer f;
integer ini_ind;
always @(posedge clk, posedge rst) begin : update_all
if (rst) begin
for (ini_ind = 0; ini_ind < NUM_IND; ini_ind=ini_ind+1) begin
//data[ini_ind] <= 0;
//tag[ini_ind] <= 0;
valid[ini_ind] <= 0;
//dirty[ini_ind] <= 0;
end
end else begin
if (update_dirty) dirty[addr] <= dirt_new; // WRite Port
if (evict) tag[addr] <= tag_write;
if (evict) valid[addr] <= 1;
// 16 bytes
assign data_use = data[addr]; // Read Port
assign tag_use = tag[addr];
assign valid_use = valid[addr];
assign dirty_use = dirty[addr];
integer f;
integer ini_ind;
always @(posedge clk, posedge rst) begin : update_all
if (rst) begin
for (ini_ind = 0; ini_ind < NUM_IND; ini_ind=ini_ind+1) begin
//data[ini_ind] <= 0;
//tag[ini_ind] <= 0;
valid[ini_ind] <= 0;
//dirty[ini_ind] <= 0;
for (f = 0; f < NUM_WORDS_PER_BLOCK; f = f + 1) begin
if (we[f][0]) data[addr][f][0] <= data_write[f][7 :0 ];
if (we[f][1]) data[addr][f][1] <= data_write[f][15:8 ];
if (we[f][2]) data[addr][f][2] <= data_write[f][23:16];
if (we[f][3]) data[addr][f][3] <= data_write[f][31:24];
end
end else begin
if (update_dirty) dirty[addr] <= dirt_new; // WRite Port
if (evict) tag[addr] <= tag_write;
if (evict) valid[addr] <= 1;
for (f = 0; f < NUM_WORDS_PER_BLOCK; f = f + 1) begin
if (we[f][0]) data[addr][f][0] <= data_write[f][7 :0 ];
if (we[f][1]) data[addr][f][1] <= data_write[f][15:8 ];
if (we[f][2]) data[addr][f][2] <= data_write[f][23:16];
if (we[f][3]) data[addr][f][3] <= data_write[f][31:24];
end
end
end
end
`else
`else
wire[IND_SIZE_END:IND_SIZE_START] use_addr = addr;
wire[IND_SIZE_END:IND_SIZE_START] use_addr = addr;
wire cena = 1;
wire cena = 1;
wire cenb_d = (|we);
wire[NUM_WORDS_PER_BLOCK-1:0][31:0] wdata_d = data_write;
wire[NUM_WORDS_PER_BLOCK-1:0][31:0] write_bit_mask_d;
wire[NUM_WORDS_PER_BLOCK-1:0][31:0] data_out_d;
genvar cur_b;
for (cur_b = 0; cur_b < NUM_WORDS_PER_BLOCK; cur_b=cur_b+1) begin
assign write_bit_mask_d[cur_b] = {32{~we[cur_b]}};
end
assign data_use = data_out_d;
wire cenb_d = (|we);
wire[NUM_WORDS_PER_BLOCK-1:0][31:0] wdata_d = data_write;
wire[NUM_WORDS_PER_BLOCK-1:0][31:0] write_bit_mask_d;
wire[NUM_WORDS_PER_BLOCK-1:0][31:0] data_out_d;
genvar cur_b;
for (cur_b = 0; cur_b < NUM_WORDS_PER_BLOCK; cur_b=cur_b+1) begin
assign write_bit_mask_d[cur_b] = {32{~we[cur_b]}};
end
assign data_use = data_out_d;
// Using ASIC MEM
/* verilator lint_off PINCONNECTEMPTY */
rf2_32x128_wm1 data (
.CENYA(),
.AYA(),
.CENYB(),
.WENYB(),
.AYB(),
.QA(data_out_d),
.SOA(),
.SOB(),
.CLKA(clk),
.CENA(cena),
.AA(use_addr),
.CLKB(clk),
.CENB(cenb_d),
.WENB(write_bit_mask_d),
.AB(use_addr),
.DB(wdata_d),
.EMAA(3'b011),
.EMASA(1'b0),
.EMAB(3'b011),
.TENA(1'b1),
.TCENA(1'b0),
.TAA(5'b0),
.TENB(1'b1),
.TCENB(1'b0),
.TWENB(128'b0),
.TAB(5'b0),
.TDB(128'b0),
.RET1N(1'b1),
.SIA(2'b0),
.SEA(1'b0),
.DFTRAMBYP(1'b0),
.SIB(2'b0),
.SEB(1'b0),
.COLLDISN(1'b1)
);
/* verilator lint_on PINCONNECTEMPTY */
wire[16:0] old_tag;
wire old_valid;
wire old_dirty;
wire[16:0] new_tag = evict ? tag_write : old_tag;
wire new_valid = evict ? 1 : old_valid;
wire new_dirty = update_dirty ? dirt_new : old_dirty;
wire cenb_m = (evict || update_dirty);
wire[19-1:0][31:0] write_bit_mask_m = cenb_m ? 19'b0 : 19'b1;
// Try to fix the error in memory connection, modified by Lingjun Zhu on Oct. 28 2019
// wire[NUM_WORDS_PER_BLOCK-1:0][31:0] wdata_m = {new_tag, new_dirty, new_valid};
// wire[NUM_WORDS_PER_BLOCK-1:0][31:0] data_out_m;
wire[19-1:0] wdata_m = {new_tag, new_dirty, new_valid};
wire[19-1:0] data_out_m;
assign {old_tag, old_dirty, old_valid} = data_out_m;
// Using ASIC MEM
/* verilator lint_off PINCONNECTEMPTY */
rf2_32x128_wm1 data (
.CENYA(),
.AYA(),
.CENYB(),
.WENYB(),
.AYB(),
.QA(data_out_d),
.SOA(),
.SOB(),
.CLKA(clk),
.CENA(cena),
.AA(use_addr),
.CLKB(clk),
.CENB(cenb_d),
.WENB(write_bit_mask_d),
.AB(use_addr),
.DB(wdata_d),
.EMAA(3'b011),
.EMASA(1'b0),
.EMAB(3'b011),
.TENA(1'b1),
.TCENA(1'b0),
.TAA(5'b0),
.TENB(1'b1),
.TCENB(1'b0),
.TWENB(128'b0),
.TAB(5'b0),
.TDB(128'b0),
.RET1N(1'b1),
.SIA(2'b0),
.SEA(1'b0),
.DFTRAMBYP(1'b0),
.SIB(2'b0),
.SEB(1'b0),
.COLLDISN(1'b1)
);
/* verilator lint_on PINCONNECTEMPTY */
assign dirty_use = old_dirty;
assign valid_use = old_valid;
assign tag_use = old_tag;
wire[16:0] old_tag;
wire old_valid;
wire old_dirty;
wire[16:0] new_tag = evict ? tag_write : old_tag;
wire new_valid = evict ? 1 : old_valid;
wire new_dirty = update_dirty ? dirt_new : old_dirty;
wire cenb_m = (evict || update_dirty);
wire[19-1:0][31:0] write_bit_mask_m = cenb_m ? 19'b0 : 19'b1;
// Try to fix the error in memory connection, modified by Lingjun Zhu on Oct. 28 2019
// wire[NUM_WORDS_PER_BLOCK-1:0][31:0] wdata_m = {new_tag, new_dirty, new_valid};
// wire[NUM_WORDS_PER_BLOCK-1:0][31:0] data_out_m;
wire[19-1:0] wdata_m = {new_tag, new_dirty, new_valid};
wire[19-1:0] data_out_m;
assign {old_tag, old_dirty, old_valid} = data_out_m;
assign dirty_use = old_dirty;
assign valid_use = old_valid;
assign tag_use = old_tag;
/* verilator lint_off PINCONNECTEMPTY */
rf2_32x19_wm0 meta (
.CENYA(),
.AYA(),
.CENYB(),
// .WENYB(),
.AYB(),
.QA(data_out_m),
.SOA(),
.SOB(),
.CLKA(clk),
.CENA(cena),
.AA(use_addr),
.CLKB(clk),
.CENB(cenb_m),
// .WENB(write_bit_mask_m),
.AB(use_addr),
.DB(wdata_m),
.EMAA(3'b011),
.EMASA(1'b0),
.EMAB(3'b011),
.TENA(1'b1),
.TCENA(1'b0),
.TAA(5'b0),
.TENB(1'b1),
.TCENB(1'b0),
// .TWENB(128'b0),
.TAB(5'b0),
.TDB(19'b0),
.RET1N(1'b1),
.SIA(2'b0),
.SEA(1'b0),
.DFTRAMBYP(1'b0),
.SIB(2'b0),
.SEB(1'b0),
.COLLDISN(1'b1)
);
/* verilator lint_on PINCONNECTEMPTY */
`endif
/* verilator lint_off PINCONNECTEMPTY */
rf2_32x19_wm0 meta (
.CENYA(),
.AYA(),
.CENYB(),
// .WENYB(),
.AYB(),
.QA(data_out_m),
.SOA(),
.SOB(),
.CLKA(clk),
.CENA(cena),
.AA(use_addr),
.CLKB(clk),
.CENB(cenb_m),
// .WENB(write_bit_mask_m),
.AB(use_addr),
.DB(wdata_m),
.EMAA(3'b011),
.EMASA(1'b0),
.EMAB(3'b011),
.TENA(1'b1),
.TCENA(1'b0),
.TAA(5'b0),
.TENB(1'b1),
.TCENB(1'b0),
// .TWENB(128'b0),
.TAB(5'b0),
.TDB(19'b0),
.RET1N(1'b1),
.SIA(2'b0),
.SEA(1'b0),
.DFTRAMBYP(1'b0),
.SIB(2'b0),
.SEB(1'b0),
.COLLDISN(1'b1)
);
/* verilator lint_on PINCONNECTEMPTY */
`endif
endmodule

View File

@@ -1,21 +1,19 @@
module VX_divide
#(
parameter WIDTHN=1,
parameter WIDTHD=1,
parameter NREP="UNSIGNED",
parameter DREP="UNSIGNED",
parameter SPEED="MIXED", // "MIXED" or "HIGHEST"
parameter PIPELINE=0
)
(
input clock, aclr, clken,
module VX_divide #(
parameter WIDTHN=1,
parameter WIDTHD=1,
parameter NREP="UNSIGNED",
parameter DREP="UNSIGNED",
parameter SPEED="MIXED", // "MIXED" or "HIGHEST"
parameter PIPELINE=0
) (
input clock, aclr, clken,
input [WIDTHN-1:0] numer,
input [WIDTHD-1:0] denom,
input [WIDTHN-1:0] numer,
input [WIDTHD-1:0] denom,
output reg [WIDTHN-1:0] quotient,
output reg [WIDTHD-1:0] remainder
);
output reg [WIDTHN-1:0] quotient,
output reg [WIDTHD-1:0] remainder
);
// synthesis read_comments_as_HDL on
// localparam IMPL = "quartus";
@@ -27,14 +25,16 @@ module VX_divide
generate
if (NREP != DREP) begin
/* verilator lint_off DECLFILENAME */
different_nrep_drep_not_yet_supported non_existing_module();
/* verilator lint_on DECLFILENAME */
end
if (IMPL == "quartus") begin
localparam lpm_speed=SPEED == "HIGHEST" ? 9:5;
lpm_divide#(
lpm_divide #(
.LPM_WIDTHN(WIDTHN),
.LPM_WIDTHD(WIDTHD),
.LPM_NREPRESENTATION(NREP),
@@ -42,7 +42,7 @@ module VX_divide
.LPM_PIPELINE(PIPELINE),
.LPM_REMAINDERPOSITIVE("FALSE"), // emulate verilog % operator
.MAXIMIZE_SPEED(lpm_speed)
) quartus_divider(
) quartus_divider (
.clock(clock),
.aclr(aclr),
.clken(clken),
@@ -51,7 +51,6 @@ module VX_divide
.quotient(quotient),
.remain(remainder)
);
end
else begin

View File

@@ -1,21 +1,19 @@
module VX_mult
#(
parameter WIDTHA=1,
parameter WIDTHB=1,
parameter WIDTHP=1,
parameter REP="UNSIGNED",
parameter SPEED="MIXED", // "MIXED" or "HIGHEST"
parameter PIPELINE=0,
parameter FORCE_LE="NO"
)
(
input clock, aclr, clken,
module VX_mult #(
parameter WIDTHA=1,
parameter WIDTHB=1,
parameter WIDTHP=1,
parameter REP="UNSIGNED",
parameter SPEED="MIXED", // "MIXED" or "HIGHEST"
parameter PIPELINE=0,
parameter FORCE_LE="NO"
) (
input clock, aclr, clken,
input [WIDTHA-1:0] dataa,
input [WIDTHB-1:0] datab,
input [WIDTHA-1:0] dataa,
input [WIDTHB-1:0] datab,
output reg [WIDTHP-1:0] result
);
output reg [WIDTHP-1:0] result
);
// synthesis read_comments_as_HDL on
// localparam IMPL = "quartus";
@@ -29,10 +27,11 @@ module VX_mult
if (IMPL == "quartus") begin
localparam lpm_speed=SPEED == "HIGHEST" ? 10:5;
localparam lpm_speed = (SPEED == "HIGHEST") ? 10 : 5;
if (FORCE_LE == "YES") begin
lpm_mult#(
/* verilator lint_off DECLFILENAME */
lpm_mult #(
.LPM_WIDTHA(WIDTHA),
.LPM_WIDTHB(WIDTHB),
.LPM_WIDTHP(WIDTHP),
@@ -40,7 +39,7 @@ module VX_mult
.LPM_PIPELINE(PIPELINE),
.DSP_BLOCK_BALANCING("LOGIC ELEMENTS"),
.MAXIMIZE_SPEED(lpm_speed)
) quartus_mult(
) quartus_mult (
.clock(clock),
.aclr(aclr),
.clken(clken),
@@ -48,6 +47,7 @@ module VX_mult
.datab(datab),
.result(result)
);
/* verilator lint_on DECLFILENAME */
end
else begin
lpm_mult#(

View File

@@ -1,7 +1,6 @@
`include "VX_cache_config.vh"
`include "VX_define.vh"
module VX_bank
#(
module VX_bank #(
// Size of cache in bytes
parameter CACHE_SIZE_BYTES = 1024,
// Size of line inside a bank in bytes
@@ -17,8 +16,7 @@ module VX_bank
// Function ID, {Dcache=0, Icache=1, Sharedmemory=2}
parameter FUNC_ID = 0,
// Queues feeding into banks Knobs {1, 2, 4, 8, ...}
// Queues feeding into banks Knobs {1, 2, 4, 8, ...}
// Core Request Queue Size
parameter REQQ_SIZE = 8,
// Miss Reserv Queue Knob
@@ -28,7 +26,7 @@ module VX_bank
// Snoop Req Queue
parameter SNRQ_SIZE = 8,
// Queues for writebacks Knobs {1, 2, 4, 8, ...}
// Queues for writebacks Knobs {1, 2, 4, 8, ...}
// Core Writeback Queue Size
parameter CWBQ_SIZE = 8,
// Dram Writeback Queue Size
@@ -43,12 +41,9 @@ module VX_bank
// Fill Invalidator Size {Fill invalidator must be active}
parameter FILL_INVALIDAOR_SIZE = 16,
// Dram knobs
// Dram knobs
parameter SIMULATED_DRAM_LATENCY_CYCLES = 10
)
(
) (
input wire clk,
input wire reset,
@@ -77,30 +72,29 @@ module VX_bank
output wire [31:0] bank_wb_address,
// Dram Fill Requests
output wire dram_fill_req,
output wire dram_fill_req_valid,
output wire[31:0] dram_fill_req_addr,
output wire dram_because_of_snp,
output wire dram_snp_full,
output wire dram_fill_req_is_snp,
input wire dram_fill_req_queue_full,
// Dram Fill Response
input wire dram_fill_rsp,
input wire [31:0] dram_fill_addr,
input wire[`BANK_LINE_WORDS-1:0][`WORD_SIZE-1:0] dram_fill_rsp_data,
output wire dram_fill_accept,
input wire dram_rsp_valid,
input wire [31:0] dram_rsp_addr,
input wire[`BANK_LINE_WORDS-1:0][`WORD_SIZE-1:0] dram_rsp_data,
output wire dram_rsp_ready,
// Dram WB Requests
input wire dram_wb_queue_pop,
output wire dram_wb_req,
output wire dram_wb_req_valid,
output wire[31:0] dram_wb_req_addr,
output wire[`BANK_LINE_WORDS-1:0][`WORD_SIZE-1:0] dram_wb_req_data,
// Snp Request
input wire snp_req,
input wire snp_req_valid,
input wire[31:0] snp_req_addr,
output wire snrq_full,
output wire snp_req_full,
output wire snp_fwd,
output wire snp_fwd_valid,
output wire[31:0] snp_fwd_addr,
input wire snp_fwd_pop
);
@@ -111,7 +105,7 @@ module VX_bank
if (reset) begin
snoop_state <= 0;
end else begin
snoop_state <= (snoop_state | snp_req) && ((FUNC_ID == `L2FUNC_ID) || (FUNC_ID == `L3FUNC_ID));
snoop_state <= (snoop_state | snp_req_valid) && ((FUNC_ID == `L2FUNC_ID) || (FUNC_ID == `L3FUNC_ID));
end
end
@@ -123,16 +117,20 @@ module VX_bank
wire[31:0] snrq_addr_st0;
assign snrq_valid_st0 = !snrq_empty;
VX_generic_queue_ll #(.DATAW(32), .SIZE(SNRQ_SIZE)) snr_queue(
VX_generic_queue_ll #(
.DATAW(32),
.SIZE(SNRQ_SIZE)
) snr_queue (
.clk (clk),
.reset (reset),
.push (snp_req),
.push (snp_req_valid),
.in_data (snp_req_addr),
.pop (snrq_pop),
.out_data(snrq_addr_st0),
.empty (snrq_empty),
.full (snrq_full)
);
.full (snp_req_full)
);
wire dfpq_pop;
wire dfpq_empty;
@@ -140,13 +138,16 @@ module VX_bank
wire[31:0] dfpq_addr_st0;
wire[`BANK_LINE_WORDS-1:0][`WORD_SIZE-1:0] dfpq_filldata_st0;
assign dram_fill_accept = !dfpq_full;
assign dram_rsp_ready = !dfpq_full;
VX_generic_queue_ll #(.DATAW(32+(`BANK_LINE_WORDS*`WORD_SIZE)), .SIZE(DFPQ_SIZE)) dfp_queue(
VX_generic_queue_ll #(
.DATAW(32+(`BANK_LINE_WORDS*`WORD_SIZE)),
.SIZE(DFPQ_SIZE)
) dfp_queue (
.clk (clk),
.reset (reset),
.push (dram_fill_rsp),
.in_data ({dram_fill_addr, dram_fill_rsp_data}),
.push (dram_rsp_valid),
.in_data ({dram_rsp_addr, dram_rsp_data}),
.pop (dfpq_pop),
.out_data({dfpq_addr_st0, dfpq_filldata_st0}),
.empty (dfpq_empty),
@@ -186,9 +187,7 @@ module VX_bank
.LLVQ_SIZE (LLVQ_SIZE),
.FILL_INVALIDAOR_SIZE (FILL_INVALIDAOR_SIZE),
.SIMULATED_DRAM_LATENCY_CYCLES(SIMULATED_DRAM_LATENCY_CYCLES)
)
req_queue
(
) req_queue (
.clk (clk),
.reset (reset),
// Enqueue
@@ -217,7 +216,7 @@ module VX_bank
.reqq_req_pc_st0 (reqq_req_pc_st0),
.reqq_empty (reqq_empty),
.reqq_full (reqq_full)
);
);
wire mrvq_pop;
wire mrvq_full;
@@ -265,9 +264,7 @@ module VX_bank
.LLVQ_SIZE (LLVQ_SIZE),
.FILL_INVALIDAOR_SIZE (FILL_INVALIDAOR_SIZE),
.SIMULATED_DRAM_LATENCY_CYCLES(SIMULATED_DRAM_LATENCY_CYCLES)
)
mrvq_queue
(
) mrvq_queue (
.clk (clk),
.reset (reset),
// Enqueue
@@ -300,35 +297,39 @@ module VX_bank
.miss_resrv_warp_num_st0 (mrvq_warp_num_st0),
.miss_resrv_mem_read_st0 (mrvq_mem_read_st0),
.miss_resrv_mem_write_st0(mrvq_mem_write_st0)
);
);
wire stall_bank_pipe;
reg is_fill_in_pipe;
wire valid_st1 [STAGE_1_CYCLES-1:0];
wire is_fill_st1 [STAGE_1_CYCLES-1:0];
/* verilator lint_off UNUSED */
wire going_to_write_st1[STAGE_1_CYCLES-1:0];
/* verilator lint_on UNUSED */
wire [31:0] addr_st1 [STAGE_1_CYCLES-1:0];
integer p_stage;
// Combinational flag: is_fill_in_pipe is 1 when any stage-1 slot
// (is_fill_st1[0 .. STAGE_1_CYCLES-1]) or stage 2 (is_fill_st2) currently
// holds a fill, and 0 otherwise. The default assignment at the top keeps the
// block latch-free.
// NOTE(review): each check appears twice (single-statement form and
// begin/end form); the two forms are equivalent — presumably old and new
// revisions of the same line.
always @(*) begin
is_fill_in_pipe = 0;
for (p_stage = 0; p_stage < STAGE_1_CYCLES; p_stage=p_stage+1) begin
if (is_fill_st1[p_stage]) is_fill_in_pipe = 1;
if (is_fill_st1[p_stage]) begin
is_fill_in_pipe = 1;
end
end
if (is_fill_st2) is_fill_in_pipe = 1;
if (is_fill_st2) begin
is_fill_in_pipe = 1;
end
end
// assign is_fill_in_pipe = (|is_fill_st1) || is_fill_st2;
// assign is_fill_in_pipe = (|is_fill_st1) || is_fill_st2;
// Bank input arbitration. The four pop signals are mutually exclusive (each
// one is gated by the negation of the higher-priority pops) and none fires
// while stall_bank_pipe is asserted. Priority, highest first:
//   1. miss-reserve queue   2. DRAM fill queue   3. core request queue   4. snoop queue
assign mrvq_pop = mrvq_valid_st0 && !stall_bank_pipe;
// DRAM fill entries are taken only when the miss-reserve queue is not popping.
assign dfpq_pop = !mrvq_pop && !dfpq_empty && !stall_bank_pipe;
// Core requests additionally wait on mrvq_stop and on any fill still in the pipeline.
// NOTE(review): is_fill_in_pipe is derived from is_fill_st1[*] and is_fill_st2,
// so the separate !is_fill_st1[0] term looks redundant — confirm.
assign reqq_pop = !mrvq_stop && !mrvq_pop && !dfpq_pop && !reqq_empty && reqq_req_st0 && !stall_bank_pipe && !is_fill_st1[0] && !is_fill_in_pipe;
// Snoop requests are served only when no other queue is popping.
// NOTE(review): !reqq_pop is listed twice below; harmless as written, but
// confirm whether a different term was intended for one of them.
assign snrq_pop = !reqq_pop && !reqq_pop && !mrvq_pop && !dfpq_pop && snrq_valid_st0 && !stall_bank_pipe;
integer st1_cycle;
wire qual_is_fill_st0;
wire qual_valid_st0;
wire [31:0] qual_addr_st0;
@@ -384,13 +385,15 @@ module VX_bank
reqq_pop ? reqq_req_writeword_st0 :
0;
VX_generic_register #(.N( 1 + 1 + 1 + `WORD_SIZE + 32 + `REQ_INST_META_SIZE + (`BANK_LINE_WORDS*`WORD_SIZE) + 1 + 32)) s0_1_c0 (
.clk (clk),
.reset(reset),
.stall(stall_bank_pipe),
.flush(0),
.in ({qual_is_snp , qual_going_to_write_st0, qual_valid_st0, qual_addr_st0, qual_writeword_st0, qual_inst_meta_st0, qual_is_fill_st0, qual_writedata_st0, qual_pc_st0 }),
.out ({is_snp_st1[0], going_to_write_st1[0] , valid_st1[0] , addr_st1[0] , writeword_st1[0] , inst_meta_st1[0] , is_fill_st1[0] , writedata_st1[0] , pc_st1[0]})
VX_generic_register #(
.N( 1 + 1 + 1 + `WORD_SIZE + 32 + `REQ_INST_META_SIZE + (`BANK_LINE_WORDS*`WORD_SIZE) + 1 + 32)
) s0_1_c0 (
.clk (clk),
.reset (reset),
.stall (stall_bank_pipe),
.flush (0),
.in ({qual_is_snp , qual_going_to_write_st0, qual_valid_st0, qual_addr_st0, qual_writeword_st0, qual_inst_meta_st0, qual_is_fill_st0, qual_writedata_st0, qual_pc_st0 }),
.out ({is_snp_st1[0], going_to_write_st1[0] , valid_st1[0] , addr_st1[0] , writeword_st1[0] , inst_meta_st1[0] , is_fill_st1[0] , writedata_st1[0] , pc_st1[0]})
);
genvar curr_stage;
@@ -414,14 +417,14 @@ module VX_bank
wire miss_st1e;
wire dirty_st1e;
wire[31:0] pc_st1e;
/* verilator lint_off UNUSED */
wire [4:0] rd_st1e;
wire [1:0] wb_st1e;
wire [`NW_BITS-1:0] warp_num_st1e;
wire [2:0] mem_read_st1e;
wire [2:0] mem_write_st1e;
wire [`LOG2UP(NUM_REQUESTS)-1:0] tid_st1e;
/* verilator lint_on UNUSED */
wire [2:0] mem_read_st1e;
wire [2:0] mem_write_st1e;
wire fill_saw_dirty_st1e;
wire is_snp_st1e;
@@ -429,7 +432,6 @@ module VX_bank
assign pc_st1e = pc_st1[STAGE_1_CYCLES-1];
assign {rd_st1e, wb_st1e, warp_num_st1e, mem_read_st1e, mem_write_st1e, tid_st1e} = inst_meta_st1[STAGE_1_CYCLES-1];
VX_tag_data_access #(
.CACHE_SIZE_BYTES (CACHE_SIZE_BYTES),
.BANK_LINE_SIZE_BYTES (BANK_LINE_SIZE_BYTES),
@@ -448,9 +450,7 @@ module VX_bank
.LLVQ_SIZE (LLVQ_SIZE),
.FILL_INVALIDAOR_SIZE (FILL_INVALIDAOR_SIZE),
.SIMULATED_DRAM_LATENCY_CYCLES(SIMULATED_DRAM_LATENCY_CYCLES)
)
VX_tag_data_access
(
) vx_tag_data_access (
.clk (clk),
.reset (reset),
.stall (stall_bank_pipe),
@@ -494,16 +494,16 @@ module VX_bank
wire is_snp_st2;
wire [31:0] pc_st2;
VX_generic_register #(.N( 1+1+1+1+32+`WORD_SIZE+`WORD_SIZE+(`BANK_LINE_WORDS * `WORD_SIZE) + `REQ_INST_META_SIZE + `TAG_SELECT_NUM_BITS + 32 + 2)) st_1e_2 (
VX_generic_register #(
.N( 1+1+1+1+32+`WORD_SIZE+`WORD_SIZE+(`BANK_LINE_WORDS * `WORD_SIZE) + `REQ_INST_META_SIZE + `TAG_SELECT_NUM_BITS + 32 + 2)
) st_1e_2 (
.clk (clk),
.reset(reset),
.stall(stall_bank_pipe),
.flush(0),
.in ({is_snp_st1e, fill_saw_dirty_st1e, is_fill_st1[STAGE_1_CYCLES-1] , qual_valid_st1e_2, addr_st1[STAGE_1_CYCLES-1], writeword_st1[STAGE_1_CYCLES-1], readword_st1e, readdata_st1e, readtag_st1e, miss_st1e, dirty_st1e, pc_st1e, inst_meta_st1[STAGE_1_CYCLES-1]}),
.out ({is_snp_st2 , fill_saw_dirty_st2 , is_fill_st2 , valid_st2 , addr_st2 , writeword_st2 , readword_st2 , readdata_st2 , readtag_st2 , miss_st2 , dirty_st2 , pc_st2 , inst_meta_st2 })
);
);
wire should_flush;
wire dwbq_push;
@@ -520,7 +520,6 @@ module VX_bank
assign miss_add_data = writeword_st2;
assign {miss_add_rd, miss_add_wb, miss_add_warp_num, miss_add_mem_read, miss_add_mem_write, miss_add_tid} = inst_meta_st2;
// Enqueue to CWB Queue
wire cwbq_push = (valid_st2 && !miss_st2) && !cwbq_full && !((FUNC_ID == `L2FUNC_ID) && (miss_add_wb == 0)) && !((is_snp_st2 && valid_st2 && ffsq_full) || (((valid_st2 && miss_st2 && dirty_st2) || fill_saw_dirty_st2) && dwbq_full) || (valid_st2 && miss_st2 && mrvq_full) || (valid_st2 && miss_st2 && !invalidate_fill && dram_fill_req_queue_full));
wire [`WORD_SIZE_RNG] cwbq_data = readword_st2;
@@ -532,7 +531,10 @@ module VX_bank
wire cwbq_empty;
assign bank_wb_valid = !cwbq_empty;
VX_generic_queue_ll #(.DATAW( `LOG2UP(NUM_REQUESTS) + 5 + 2 + (`NW_BITS-1+1) + `WORD_SIZE + 32 + 32), .SIZE(CWBQ_SIZE)) cwb_queue(
VX_generic_queue_ll #(
.DATAW( `LOG2UP(NUM_REQUESTS) + 5 + 2 + (`NW_BITS-1+1) + `WORD_SIZE + 32 + 32),
.SIZE(CWBQ_SIZE)
) cwb_queue(
.clk (clk),
.reset (reset),
@@ -543,13 +545,13 @@ module VX_bank
.out_data({bank_wb_tid, bank_wb_rd, bank_wb_wb, bank_wb_warp_num, bank_wb_data, bank_wb_pc, bank_wb_address}),
.empty (cwbq_empty),
.full (cwbq_full)
);
);
assign should_flush = snoop_state && valid_st2 && (miss_add_mem_write != `NO_MEM_WRITE) && !is_snp_st2 && !is_fill_st2;
// Enqueue to DWB Queue
assign dwbq_push = ((valid_st2 && miss_st2 && dirty_st2) || fill_saw_dirty_st2 || should_flush) && !dwbq_full && !((is_snp_st2 && valid_st2 && ffsq_full) ||((valid_st2 && !miss_st2) && cwbq_full) || (valid_st2 && miss_st2 && mrvq_full) || (valid_st2 && miss_st2 && !invalidate_fill && dram_fill_req_queue_full));
wire[31:0] dwbq_req_addr;
wire dwbq_empty;
assign dwbq_push = ((valid_st2 && miss_st2 && dirty_st2) || fill_saw_dirty_st2 || should_flush) && !dwbq_full && !((is_snp_st2 && valid_st2 && ffsq_full) ||((valid_st2 && !miss_st2) && cwbq_full) || (valid_st2 && miss_st2 && mrvq_full) || (valid_st2 && miss_st2 && !invalidate_fill && dram_fill_req_queue_full));
wire[31:0] dwbq_req_addr;
wire dwbq_empty;
wire[`BANK_LINE_WORDS-1:0][`WORD_SIZE-1:0] dwbq_req_data;
if ((FUNC_ID == `L2FUNC_ID) || (FUNC_ID == `L3FUNC_ID)) begin
@@ -560,10 +562,9 @@ module VX_bank
assign dwbq_req_addr = {readtag_st2, addr_st2[`LINE_SELECT_ADDR_END:0]} & `BASE_ADDR_MASK;
end
wire possible_fill = valid_st2 && miss_st2 && !dram_fill_req_queue_full && !is_snp_st2;
wire[31:0] fill_invalidator_addr = addr_st2 & `BASE_ADDR_MASK;
VX_fill_invalidator #(
.CACHE_SIZE_BYTES (CACHE_SIZE_BYTES),
.BANK_LINE_SIZE_BYTES (BANK_LINE_SIZE_BYTES),
@@ -581,9 +582,7 @@ module VX_bank
.LLVQ_SIZE (LLVQ_SIZE),
.FILL_INVALIDAOR_SIZE (FILL_INVALIDAOR_SIZE),
.SIMULATED_DRAM_LATENCY_CYCLES(SIMULATED_DRAM_LATENCY_CYCLES)
)
VX_fill_invalidator
(
) vx_fill_invalidator (
.clk (clk),
.reset (reset),
.possible_fill (possible_fill),
@@ -591,16 +590,19 @@ module VX_bank
.fill_addr (fill_invalidator_addr),
.invalidate_fill (invalidate_fill)
);
);
// Enqueu in dram_fill_req
assign dram_fill_req = possible_fill && !invalidate_fill;
assign dram_because_of_snp = is_snp_st2 && valid_st2 && miss_st2;
assign dram_snp_full = snrq_full && snp_req;
assign dram_fill_req_addr = addr_st2 & `BASE_ADDR_MASK;
// Enqueue in dram_fill_req
assign dram_fill_req_valid = possible_fill && !invalidate_fill;
assign dram_fill_req_is_snp = is_snp_st2 && valid_st2 && miss_st2;
assign dram_fill_req_addr = addr_st2 & `BASE_ADDR_MASK;
assign dram_wb_req = !dwbq_empty;
VX_generic_queue_ll #(.DATAW( 32 + (`BANK_LINE_WORDS * `WORD_SIZE)), .SIZE(DWBQ_SIZE)) dwb_queue(
assign dram_wb_req_valid = !dwbq_empty;
VX_generic_queue_ll #(
.DATAW( 32 + (`BANK_LINE_WORDS * `WORD_SIZE)),
.SIZE(DWBQ_SIZE)
) dwb_queue (
.clk (clk),
.reset (reset),
@@ -611,14 +613,18 @@ module VX_bank
.out_data({dram_wb_req_addr, dram_wb_req_data}),
.empty (dwbq_empty),
.full (dwbq_full)
);
);
wire snp_fwd_push;
wire ffsq_empty;
assign snp_fwd_push = is_snp_st2 && valid_st2 && !ffsq_full && !(((valid_st2 && !miss_st2) && cwbq_full) || (((valid_st2 && miss_st2 && dirty_st2) || fill_saw_dirty_st2) && dwbq_full) || (valid_st2 && miss_st2 && mrvq_full) || (valid_st2 && miss_st2 && !invalidate_fill && dram_fill_req_queue_full));
assign snp_fwd = !ffsq_empty;
VX_generic_queue_ll #(.DATAW(32), .SIZE(FFSQ_SIZE)) ffs_queue(
assign snp_fwd_push = is_snp_st2 && valid_st2 && !ffsq_full && !(((valid_st2 && !miss_st2) && cwbq_full) || (((valid_st2 && miss_st2 && dirty_st2) || fill_saw_dirty_st2) && dwbq_full) || (valid_st2 && miss_st2 && mrvq_full) || (valid_st2 && miss_st2 && !invalidate_fill && dram_fill_req_queue_full));
assign snp_fwd_valid = !ffsq_empty;
VX_generic_queue_ll #(
.DATAW(32),
.SIZE(FFSQ_SIZE)
) ffs_queue (
.clk (clk),
.reset (reset),
.push (snp_fwd_push),
@@ -627,7 +633,7 @@ module VX_bank
.out_data({snp_fwd_addr}),
.empty (ffsq_empty),
.full (ffsq_full)
);
);
// Stall the bank pipeline when the stage-2 request has nowhere to go. Any one of:
//   1. snoop in stage 2 while ffsq (the queue feeding snp_fwd_addr) is full;
//   2. valid hit (valid_st2 && !miss_st2) while cwbq is full;
//   3. valid dirty miss, or fill_saw_dirty_st2, while dwbq is full;
//   4. valid miss while mrvq is full;
//   5. valid miss whose fill is not invalidated while the DRAM fill request queue is full.
// The same terms appear (negated) in the dwbq_push and snp_fwd_push qualifiers.
assign stall_bank_pipe = (is_snp_st2 && valid_st2 && ffsq_full) || ((valid_st2 && !miss_st2) && cwbq_full) || (((valid_st2 && miss_st2 && dirty_st2) || fill_saw_dirty_st2) && dwbq_full) || (valid_st2 && miss_st2 && mrvq_full) || (valid_st2 && miss_st2 && !invalidate_fill && dram_fill_req_queue_full);

View File

@@ -1,7 +1,6 @@
`include "VX_cache_config.vh"
module VX_cache
#(
module VX_cache #(
// Size of cache in bytes
parameter CACHE_SIZE_BYTES = 1024,
// Size of line inside a bank in bytes
@@ -17,7 +16,7 @@ module VX_cache
// Function ID, {Dcache=0, Icache=1, Sharedmemory=2}
parameter FUNC_ID = 3,
// Queues feeding into banks Knobs {1, 2, 4, 8, ...}
// Queues feeding into banks Knobs {1, 2, 4, 8, ...}
// Core Request Queue Size
parameter REQQ_SIZE = 8,
@@ -28,7 +27,7 @@ module VX_cache
// Snoop Req Queue
parameter SNRQ_SIZE = 8,
// Queues for writebacks Knobs {1, 2, 4, 8, ...}
// Queues for writebacks Knobs {1, 2, 4, 8, ...}
// Core Writeback Queue Size
parameter CWBQ_SIZE = 8,
// Dram Writeback Queue Size
@@ -47,21 +46,18 @@ module VX_cache
parameter PRFQ_SIZE = 64,
parameter PRFQ_STRIDE = 0,
// Dram knobs
// Dram knobs
parameter SIMULATED_DRAM_LATENCY_CYCLES = 10
)
(
) (
input wire clk,
input wire reset,
// Req Info
// Req Info
input wire [NUM_REQUESTS-1:0] core_req_valid,
input wire [NUM_REQUESTS-1:0][31:0] core_req_addr,
input wire [NUM_REQUESTS-1:0][`WORD_SIZE_RNG] core_req_writedata,
input wire[NUM_REQUESTS-1:0][2:0] core_req_mem_read,
input wire[NUM_REQUESTS-1:0][2:0] core_req_mem_write,
input wire [NUM_REQUESTS-1:0][2:0] core_req_mem_read,
input wire [NUM_REQUESTS-1:0][2:0] core_req_mem_write,
// Req meta
input wire [4:0] core_req_rd,
@@ -80,39 +76,31 @@ module VX_cache
output wire [NUM_REQUESTS-1:0][31:0] core_wb_pc,
output wire [NUM_REQUESTS-1:0][31:0] core_wb_address,
// Dram Fill Response
input wire dram_fill_rsp,
input wire [31:0] dram_fill_rsp_addr,
input wire [`IBANK_LINE_WORDS-1:0][31:0] dram_fill_rsp_data,
output wire dram_fill_accept,
input wire dram_rsp_valid,
input wire [31:0] dram_rsp_addr,
input wire [`IBANK_LINE_WORDS-1:0][31:0] dram_rsp_data,
output wire dram_rsp_ready,
// Dram request
output wire dram_req,
output wire dram_req_write,
output wire dram_req_read,
output wire dram_req_write,
output wire [31:0] dram_req_addr,
output wire [31:0] dram_req_size,
output wire [`IBANK_LINE_WORDS-1:0][31:0] dram_req_data,
output wire dram_req_because_of_wb,
input wire dram_req_delay,
output wire dram_snp_full,
input wire dram_req_full,
// Snoop Req
input wire snp_req,
input wire[31:0] snp_req_addr,
output wire snp_req_delay,
input wire snp_req_valid,
input wire [31:0] snp_req_addr,
output wire snp_req_full,
// Snoop Forward
output wire snp_fwd,
output wire[31:0] snp_fwd_addr,
input wire snp_fwd_delay
output wire snp_fwd_valid,
output wire [31:0] snp_fwd_addr,
input wire snp_fwd_full
);
wire [NUM_BANKS-1:0][NUM_REQUESTS-1:0] per_bank_valids;
wire [NUM_BANKS-1:0] per_bank_wb_pop;
wire [NUM_BANKS-1:0] per_bank_wb_valid;
@@ -124,104 +112,90 @@ module VX_cache
wire [NUM_BANKS-1:0][31:0] per_bank_wb_pc;
wire [NUM_BANKS-1:0][31:0] per_bank_wb_address;
wire dfqq_full;
wire[NUM_BANKS-1:0] per_bank_dram_fill_req;
wire[NUM_BANKS-1:0][31:0] per_bank_dram_fill_req_addr;
wire[NUM_BANKS-1:0] per_bank_dram_fill_accept;
wire [NUM_BANKS-1:0] per_bank_dram_fill_req_valid;
wire [NUM_BANKS-1:0][31:0] per_bank_dram_fill_req_addr;
/* verilator lint_off UNUSED */
wire [NUM_BANKS-1:0] per_bank_dram_fill_req_is_snp;
/* verilator lint_on UNUSED */
wire [NUM_BANKS-1:0] per_bank_dram_rsp_ready;
wire[NUM_BANKS-1:0] per_bank_dram_wb_queue_pop;
wire[NUM_BANKS-1:0] per_bank_dram_wb_req;
wire[NUM_BANKS-1:0] per_bank_dram_because_of_snp;
wire[NUM_BANKS-1:0][31:0] per_bank_dram_wb_req_addr;
wire[NUM_BANKS-1:0][`BANK_LINE_WORDS-1:0][`WORD_SIZE-1:0] per_bank_dram_wb_req_data;
wire [NUM_BANKS-1:0] per_bank_dram_wb_queue_pop;
wire [NUM_BANKS-1:0] per_bank_dram_wb_req_valid;
wire [NUM_BANKS-1:0][31:0] per_bank_dram_wb_req_addr;
wire [NUM_BANKS-1:0][`BANK_LINE_WORDS-1:0][`WORD_SIZE-1:0] per_bank_dram_wb_req_data;
wire[NUM_BANKS-1:0] per_bank_reqq_full;
wire[NUM_BANKS-1:0] per_bank_snrq_full;
wire[NUM_BANKS-1:0] per_bank_snp_fwd;
wire[NUM_BANKS-1:0][31:0] per_bank_snp_fwd_addr;
wire[NUM_BANKS-1:0] per_bank_snp_fwd_pop;
wire [NUM_BANKS-1:0] per_bank_reqq_full;
wire [NUM_BANKS-1:0] per_bank_snrq_full;
wire [NUM_BANKS-1:0] per_bank_snp_fwd;
wire [NUM_BANKS-1:0][31:0] per_bank_snp_fwd_addr;
wire [NUM_BANKS-1:0] per_bank_snp_fwd_pop;
assign delay_req = (|per_bank_reqq_full);
assign snp_req_full = (|per_bank_snrq_full);
assign snp_req_delay = (|per_bank_snrq_full);
// assign dram_fill_accept = (NUM_BANKS == 1) ? per_bank_dram_fill_accept[0] : per_bank_dram_fill_accept[dram_fill_rsp_addr[`BANK_SELECT_ADDR_RNG]];
assign dram_fill_accept = (|per_bank_dram_fill_accept);
// assign dram_rsp_ready = (NUM_BANKS == 1) ? per_bank_dram_rsp_ready[0] : per_bank_dram_rsp_ready[dram_rsp_addr[`BANK_SELECT_ADDR_RNG]];
assign dram_rsp_ready = (|per_bank_dram_rsp_ready);
VX_cache_dram_req_arb #(
.CACHE_SIZE_BYTES (CACHE_SIZE_BYTES),
.BANK_LINE_SIZE_BYTES (BANK_LINE_SIZE_BYTES),
.NUM_BANKS (NUM_BANKS),
.WORD_SIZE_BYTES (WORD_SIZE_BYTES),
.NUM_REQUESTS (NUM_REQUESTS),
.STAGE_1_CYCLES (STAGE_1_CYCLES),
.REQQ_SIZE (REQQ_SIZE),
.MRVQ_SIZE (MRVQ_SIZE),
.DFPQ_SIZE (DFPQ_SIZE),
.SNRQ_SIZE (SNRQ_SIZE),
.CWBQ_SIZE (CWBQ_SIZE),
.DWBQ_SIZE (DWBQ_SIZE),
.DFQQ_SIZE (DFQQ_SIZE),
.LLVQ_SIZE (LLVQ_SIZE),
.FILL_INVALIDAOR_SIZE (FILL_INVALIDAOR_SIZE),
.PRFQ_SIZE (PRFQ_SIZE),
.PRFQ_STRIDE (PRFQ_STRIDE),
.SIMULATED_DRAM_LATENCY_CYCLES(SIMULATED_DRAM_LATENCY_CYCLES)
)
VX_cache_dram_req_arb
(
.clk (clk),
.reset (reset),
.dfqq_full (dfqq_full),
.per_bank_dram_fill_req (per_bank_dram_fill_req),
.per_bank_dram_fill_req_addr(per_bank_dram_fill_req_addr),
.per_bank_dram_wb_queue_pop (per_bank_dram_wb_queue_pop),
.per_bank_dram_wb_req (per_bank_dram_wb_req),
.per_bank_dram_because_of_snp(per_bank_dram_because_of_snp),
.per_bank_dram_wb_req_addr (per_bank_dram_wb_req_addr),
.per_bank_dram_wb_req_data (per_bank_dram_wb_req_data),
.dram_req (dram_req),
.dram_req_write (dram_req_write),
.dram_req_read (dram_req_read),
.dram_req_addr (dram_req_addr),
.dram_req_size (dram_req_size),
.dram_req_data (dram_req_data),
.dram_req_because_of_wb (dram_req_because_of_wb),
.dram_req_delay (dram_req_delay)
);
.CACHE_SIZE_BYTES (CACHE_SIZE_BYTES),
.BANK_LINE_SIZE_BYTES (BANK_LINE_SIZE_BYTES),
.NUM_BANKS (NUM_BANKS),
.WORD_SIZE_BYTES (WORD_SIZE_BYTES),
.NUM_REQUESTS (NUM_REQUESTS),
.STAGE_1_CYCLES (STAGE_1_CYCLES),
.REQQ_SIZE (REQQ_SIZE),
.MRVQ_SIZE (MRVQ_SIZE),
.DFPQ_SIZE (DFPQ_SIZE),
.SNRQ_SIZE (SNRQ_SIZE),
.CWBQ_SIZE (CWBQ_SIZE),
.DWBQ_SIZE (DWBQ_SIZE),
.DFQQ_SIZE (DFQQ_SIZE),
.LLVQ_SIZE (LLVQ_SIZE),
.FILL_INVALIDAOR_SIZE (FILL_INVALIDAOR_SIZE),
.PRFQ_SIZE (PRFQ_SIZE),
.PRFQ_STRIDE (PRFQ_STRIDE),
.SIMULATED_DRAM_LATENCY_CYCLES (SIMULATED_DRAM_LATENCY_CYCLES)
) vx_cache_dram_req_arb (
.clk (clk),
.reset (reset),
.dfqq_full (dfqq_full),
.per_bank_dram_fill_req_valid(per_bank_dram_fill_req_valid),
.per_bank_dram_fill_req_addr (per_bank_dram_fill_req_addr),
.per_bank_dram_wb_queue_pop (per_bank_dram_wb_queue_pop),
.per_bank_dram_wb_req_valid (per_bank_dram_wb_req_valid),
.per_bank_dram_wb_req_addr (per_bank_dram_wb_req_addr),
.per_bank_dram_wb_req_data (per_bank_dram_wb_req_data),
.dram_req_read (dram_req_read),
.dram_req_write (dram_req_write),
.dram_req_addr (dram_req_addr),
.dram_req_data (dram_req_data),
.dram_req_full (dram_req_full)
);
VX_cache_core_req_bank_sel #(
.CACHE_SIZE_BYTES (CACHE_SIZE_BYTES),
.BANK_LINE_SIZE_BYTES (BANK_LINE_SIZE_BYTES),
.NUM_BANKS (NUM_BANKS),
.WORD_SIZE_BYTES (WORD_SIZE_BYTES),
.NUM_REQUESTS (NUM_REQUESTS),
.STAGE_1_CYCLES (STAGE_1_CYCLES),
.REQQ_SIZE (REQQ_SIZE),
.MRVQ_SIZE (MRVQ_SIZE),
.DFPQ_SIZE (DFPQ_SIZE),
.SNRQ_SIZE (SNRQ_SIZE),
.CWBQ_SIZE (CWBQ_SIZE),
.DWBQ_SIZE (DWBQ_SIZE),
.DFQQ_SIZE (DFQQ_SIZE),
.LLVQ_SIZE (LLVQ_SIZE),
.FILL_INVALIDAOR_SIZE (FILL_INVALIDAOR_SIZE),
.SIMULATED_DRAM_LATENCY_CYCLES(SIMULATED_DRAM_LATENCY_CYCLES)
)
VX_cache_core_req_bank_sell
(
.core_req_valid (core_req_valid),
.core_req_addr (core_req_addr),
.per_bank_valids(per_bank_valids)
);
.CACHE_SIZE_BYTES (CACHE_SIZE_BYTES),
.BANK_LINE_SIZE_BYTES (BANK_LINE_SIZE_BYTES),
.NUM_BANKS (NUM_BANKS),
.WORD_SIZE_BYTES (WORD_SIZE_BYTES),
.NUM_REQUESTS (NUM_REQUESTS),
.STAGE_1_CYCLES (STAGE_1_CYCLES),
.REQQ_SIZE (REQQ_SIZE),
.MRVQ_SIZE (MRVQ_SIZE),
.DFPQ_SIZE (DFPQ_SIZE),
.SNRQ_SIZE (SNRQ_SIZE),
.CWBQ_SIZE (CWBQ_SIZE),
.DWBQ_SIZE (DWBQ_SIZE),
.DFQQ_SIZE (DFQQ_SIZE),
.LLVQ_SIZE (LLVQ_SIZE),
.FILL_INVALIDAOR_SIZE (FILL_INVALIDAOR_SIZE),
.SIMULATED_DRAM_LATENCY_CYCLES (SIMULATED_DRAM_LATENCY_CYCLES)
) vx_cache_core_req_bank_sell (
.core_req_valid (core_req_valid),
.core_req_addr (core_req_addr),
.per_bank_valids (per_bank_valids)
);
VX_cache_wb_sel_merge #(
.CACHE_SIZE_BYTES (CACHE_SIZE_BYTES),
@@ -241,9 +215,7 @@ module VX_cache
.LLVQ_SIZE (LLVQ_SIZE),
.FILL_INVALIDAOR_SIZE (FILL_INVALIDAOR_SIZE),
.SIMULATED_DRAM_LATENCY_CYCLES(SIMULATED_DRAM_LATENCY_CYCLES)
)
VX_cache_core_wb_sel_merge
(
) vx_cache_core_wb_sel_merge (
.per_bank_wb_valid (per_bank_wb_valid),
.per_bank_wb_tid (per_bank_wb_tid),
.per_bank_wb_rd (per_bank_wb_rd),
@@ -262,28 +234,27 @@ module VX_cache
.core_wb_readdata (core_wb_readdata),
.core_wb_address (core_wb_address),
.core_wb_pc (core_wb_pc)
);
);
// Snoop Forward Logic
VX_snp_fwd_arb #(.NUM_BANKS(NUM_BANKS)) VX_snp_fwd_arb(
VX_snp_fwd_arb #(
.NUM_BANKS(NUM_BANKS)
) vx_snp_fwd_arb(
.per_bank_snp_fwd (per_bank_snp_fwd),
.per_bank_snp_fwd_addr(per_bank_snp_fwd_addr),
.per_bank_snp_fwd_pop (per_bank_snp_fwd_pop),
.snp_fwd (snp_fwd),
.snp_fwd_valid (snp_fwd_valid),
.snp_fwd_addr (snp_fwd_addr),
.snp_fwd_delay (snp_fwd_delay)
);
.snp_fwd_full (snp_fwd_full)
);
// Snoop Forward Logic
genvar curr_bank;
generate
for (curr_bank = 0; curr_bank < NUM_BANKS; curr_bank=curr_bank+1) begin
wire [NUM_REQUESTS-1:0] curr_bank_valids;
wire [NUM_REQUESTS-1:0][31:0] curr_bank_addr;
wire [NUM_REQUESTS-1:0] curr_bank_valids;
wire [NUM_REQUESTS-1:0][31:0] curr_bank_addr;
wire [NUM_REQUESTS-1:0][`WORD_SIZE_RNG] curr_bank_writedata;
wire [4:0] curr_bank_rd;
wire [NUM_REQUESTS-1:0][1:0] curr_bank_wb;
@@ -294,7 +265,7 @@ module VX_cache
wire curr_bank_wb_pop;
wire curr_bank_wb_valid;
wire [`LOG2UP(NUM_REQUESTS)-1:0] curr_bank_wb_tid;
wire [`LOG2UP(NUM_REQUESTS)-1:0] curr_bank_wb_tid;
wire [31:0] curr_bank_wb_pc;
wire [4:0] curr_bank_wb_rd;
wire [1:0] curr_bank_wb_wb;
@@ -302,19 +273,18 @@ module VX_cache
wire [`WORD_SIZE_RNG] curr_bank_wb_data;
wire [31:0] curr_bank_wb_address;
wire curr_bank_dram_fill_rsp;
wire [31:0] curr_bank_dram_fill_rsp_addr;
wire [`BANK_LINE_WORDS-1:0][`WORD_SIZE-1:0] curr_bank_dram_fill_rsp_data;
wire curr_bank_dram_fill_accept;
wire curr_bank_dram_rsp_valid;
wire [31:0] curr_bank_dram_rsp_addr;
wire [`BANK_LINE_WORDS-1:0][`WORD_SIZE-1:0] curr_bank_dram_rsp_data;
wire curr_bank_dram_rsp_ready;
wire curr_bank_dfqq_full;
wire curr_bank_dram_fill_req;
wire curr_bank_dram_because_of_snp;
wire curr_bank_dram_snp_full;
wire curr_bank_dram_fill_req_valid;
wire curr_bank_dram_fill_req_is_snp;
wire[31:0] curr_bank_dram_fill_req_addr;
wire curr_bank_dram_wb_queue_pop;
wire curr_bank_dram_wb_req;
wire curr_bank_dram_wb_req_valid;
wire[31:0] curr_bank_dram_wb_req_addr;
wire[`BANK_LINE_WORDS-1:0][`WORD_SIZE-1:0] curr_bank_dram_wb_req_data;
@@ -326,9 +296,7 @@ module VX_cache
wire curr_bank_snp_fwd;
wire[31:0] curr_bank_snp_fwd_addr;
wire curr_bank_snp_fwd_pop;
wire curr_bank_snrq_full;
wire curr_bank_snp_req_full;
// Core Req
assign curr_bank_valids = per_bank_valids[curr_bank];
@@ -354,56 +322,53 @@ module VX_cache
assign per_bank_wb_address [curr_bank] = curr_bank_wb_address;
// Dram fill request
assign curr_bank_dfqq_full = dfqq_full;
assign per_bank_dram_fill_req[curr_bank] = curr_bank_dram_fill_req;
assign per_bank_dram_fill_req_addr[curr_bank] = curr_bank_dram_fill_req_addr;
assign curr_bank_dfqq_full = dfqq_full;
assign per_bank_dram_fill_req_valid[curr_bank] = curr_bank_dram_fill_req_valid;
assign per_bank_dram_fill_req_addr[curr_bank] = curr_bank_dram_fill_req_addr;
assign per_bank_dram_fill_req_is_snp[curr_bank] = curr_bank_dram_fill_req_is_snp;
// Dram fill response
assign curr_bank_dram_fill_rsp = (NUM_BANKS == 1) || (dram_fill_rsp && (curr_bank_dram_fill_rsp_addr[`BANK_SELECT_ADDR_RNG] == curr_bank));
assign curr_bank_dram_fill_rsp_addr = dram_fill_rsp_addr;
assign curr_bank_dram_fill_rsp_data = dram_fill_rsp_data;
assign per_bank_dram_fill_accept[curr_bank] = curr_bank_dram_fill_accept;
// Route the shared DRAM response to this bank: the response is flagged valid for
// the bank whose index matches the bank-select bits of the response address.
// NOTE(review): when NUM_BANKS == 1 the expression below is constant 1 and no
// longer depends on dram_rsp_valid. Confirm the bank tolerates that, or whether
// the single-bank case should still be qualified by dram_rsp_valid.
assign curr_bank_dram_rsp_valid = (NUM_BANKS == 1) || (dram_rsp_valid && (curr_bank_dram_rsp_addr[`BANK_SELECT_ADDR_RNG] == curr_bank));
// Address and data are broadcast unchanged to every bank.
assign curr_bank_dram_rsp_addr = dram_rsp_addr;
assign curr_bank_dram_rsp_data = dram_rsp_data;
// Collect this bank's ready; the per-bank vector is OR-reduced into dram_rsp_ready.
assign per_bank_dram_rsp_ready[curr_bank] = curr_bank_dram_rsp_ready;
// Dram writeback request
assign curr_bank_dram_wb_queue_pop = per_bank_dram_wb_queue_pop[curr_bank];
assign per_bank_dram_wb_req[curr_bank] = curr_bank_dram_wb_req;
assign per_bank_dram_because_of_snp[curr_bank] = curr_bank_dram_because_of_snp;
assign per_bank_dram_wb_req_valid[curr_bank] = curr_bank_dram_wb_req_valid;
assign per_bank_dram_wb_req_addr[curr_bank] = curr_bank_dram_wb_req_addr;
assign per_bank_dram_wb_req_data[curr_bank] = curr_bank_dram_wb_req_data;
// Snoop Request
assign curr_bank_snp_req = snp_req && (snp_req_addr[`BANK_SELECT_ADDR_RNG] == curr_bank);
assign curr_bank_snp_req_addr = snp_req_addr;
assign per_bank_snrq_full[curr_bank] = curr_bank_snrq_full;
assign curr_bank_snp_req = snp_req_valid && (snp_req_addr[`BANK_SELECT_ADDR_RNG] == curr_bank);
assign curr_bank_snp_req_addr = snp_req_addr;
assign per_bank_snrq_full[curr_bank] = curr_bank_snp_req_full;
// Snoop Fwd
assign curr_bank_snp_fwd_pop = per_bank_snp_fwd_pop[curr_bank];
assign per_bank_snp_fwd[curr_bank] = curr_bank_snp_fwd;
assign per_bank_snp_fwd_addr[curr_bank] = curr_bank_snp_fwd_addr;
VX_bank #(
.CACHE_SIZE_BYTES (CACHE_SIZE_BYTES),
.BANK_LINE_SIZE_BYTES (BANK_LINE_SIZE_BYTES),
.NUM_BANKS (NUM_BANKS),
.WORD_SIZE_BYTES (WORD_SIZE_BYTES),
.NUM_REQUESTS (NUM_REQUESTS),
.STAGE_1_CYCLES (STAGE_1_CYCLES),
.FUNC_ID (FUNC_ID),
.REQQ_SIZE (REQQ_SIZE),
.MRVQ_SIZE (MRVQ_SIZE),
.DFPQ_SIZE (DFPQ_SIZE),
.SNRQ_SIZE (SNRQ_SIZE),
.CWBQ_SIZE (CWBQ_SIZE),
.DWBQ_SIZE (DWBQ_SIZE),
.DFQQ_SIZE (DFQQ_SIZE),
.LLVQ_SIZE (LLVQ_SIZE),
.FFSQ_SIZE (FFSQ_SIZE),
.FILL_INVALIDAOR_SIZE (FILL_INVALIDAOR_SIZE),
.SIMULATED_DRAM_LATENCY_CYCLES(SIMULATED_DRAM_LATENCY_CYCLES)
)
bank
(
.CACHE_SIZE_BYTES (CACHE_SIZE_BYTES),
.BANK_LINE_SIZE_BYTES (BANK_LINE_SIZE_BYTES),
.NUM_BANKS (NUM_BANKS),
.WORD_SIZE_BYTES (WORD_SIZE_BYTES),
.NUM_REQUESTS (NUM_REQUESTS),
.STAGE_1_CYCLES (STAGE_1_CYCLES),
.FUNC_ID (FUNC_ID),
.REQQ_SIZE (REQQ_SIZE),
.MRVQ_SIZE (MRVQ_SIZE),
.DFPQ_SIZE (DFPQ_SIZE),
.SNRQ_SIZE (SNRQ_SIZE),
.CWBQ_SIZE (CWBQ_SIZE),
.DWBQ_SIZE (DWBQ_SIZE),
.DFQQ_SIZE (DFQQ_SIZE),
.LLVQ_SIZE (LLVQ_SIZE),
.FFSQ_SIZE (FFSQ_SIZE),
.FILL_INVALIDAOR_SIZE (FILL_INVALIDAOR_SIZE),
.SIMULATED_DRAM_LATENCY_CYCLES(SIMULATED_DRAM_LATENCY_CYCLES)
) bank (
.clk (clk),
.reset (reset),
// Core req
@@ -431,39 +396,35 @@ module VX_cache
.bank_wb_address (curr_bank_wb_address),
// Dram fill req
.dram_fill_req (curr_bank_dram_fill_req),
.dram_fill_req_valid (curr_bank_dram_fill_req_valid),
.dram_fill_req_addr (curr_bank_dram_fill_req_addr),
.dram_fill_req_is_snp (curr_bank_dram_fill_req_is_snp),
.dram_fill_req_queue_full(curr_bank_dfqq_full),
// Dram fill rsp
.dram_fill_rsp (curr_bank_dram_fill_rsp),
.dram_fill_addr (curr_bank_dram_fill_rsp_addr),
.dram_fill_rsp_data (curr_bank_dram_fill_rsp_data),
.dram_fill_accept (curr_bank_dram_fill_accept),
.dram_rsp_valid (curr_bank_dram_rsp_valid),
.dram_rsp_addr (curr_bank_dram_rsp_addr),
.dram_rsp_data (curr_bank_dram_rsp_data),
.dram_rsp_ready (curr_bank_dram_rsp_ready),
// Dram writeback
.dram_wb_queue_pop (curr_bank_dram_wb_queue_pop),
.dram_wb_req (curr_bank_dram_wb_req),
.dram_wb_req_valid (curr_bank_dram_wb_req_valid),
.dram_wb_req_addr (curr_bank_dram_wb_req_addr),
.dram_wb_req_data (curr_bank_dram_wb_req_data),
.dram_because_of_snp (curr_bank_dram_because_of_snp),
.dram_snp_full (curr_bank_dram_snp_full),
.dram_wb_req_data (curr_bank_dram_wb_req_data),
// Snoop Request
.snp_req (curr_bank_snp_req),
.snp_req_valid (curr_bank_snp_req),
.snp_req_addr (curr_bank_snp_req_addr),
.snrq_full (curr_bank_snrq_full),
.snp_req_full (curr_bank_snp_req_full),
// Snoop Fwd
.snp_fwd (curr_bank_snp_fwd),
.snp_fwd_valid (curr_bank_snp_fwd),
.snp_fwd_addr (curr_bank_snp_fwd_addr),
.snp_fwd_pop (curr_bank_snp_fwd_pop)
);
);
end
endgenerate
endmodule

View File

@@ -9,24 +9,24 @@
// 5 + 2 + 4 + 3 + 3 + 1
`define REQ_INST_META_SIZE (5 + 2 + (`NW_BITS-1+1) + 3 + 3 + `LOG2UP(NUM_REQUESTS))
`define WORD_SIZE (8*WORD_SIZE_BYTES)
`define WORD_SIZE (8 * WORD_SIZE_BYTES)
`define WORD_SIZE_RNG (`WORD_SIZE)-1:0
// 128
`define BANK_SIZE_BYTES CACHE_SIZE_BYTES/NUM_BANKS
`define BANK_SIZE_BYTES (CACHE_SIZE_BYTES / NUM_BANKS)
// 8
`define BANK_LINE_COUNT (`BANK_SIZE_BYTES/BANK_LINE_SIZE_BYTES)
`define BANK_LINE_COUNT (`BANK_SIZE_BYTES / BANK_LINE_SIZE_BYTES)
// 4
`define BANK_LINE_WORDS (BANK_LINE_SIZE_BYTES / WORD_SIZE_BYTES)
// Offset is fixed
`define OFFSET_ADDR_NUM_BITS 2
`define OFFSET_SIZE_END 1
`define OFFSET_ADDR_START 0
`define OFFSET_ADDR_END 1
`define OFFSET_ADDR_RNG `OFFSET_ADDR_END:`OFFSET_ADDR_START
`define OFFSET_SIZE_RNG `OFFSET_SIZE_END:0
`define OFFSET_ADDR_NUM_BITS 2
`define OFFSET_SIZE_END 1
`define OFFSET_ADDR_START 0
`define OFFSET_ADDR_END 1
`define OFFSET_ADDR_RNG `OFFSET_ADDR_END:`OFFSET_ADDR_START
`define OFFSET_SIZE_RNG `OFFSET_SIZE_END:0
// 2
`define WORD_SELECT_NUM_BITS (`LOG2UP(`BANK_LINE_WORDS))
@@ -55,17 +55,14 @@
// 3
`define LINE_SELECT_NUM_BITS (`LOG2UP(`BANK_LINE_COUNT))
// 3
`define LINE_SELECT_SIZE_END (`LINE_SELECT_NUM_BITS)
// 7
`define LINE_SELECT_ADDR_START (1+`BANK_SELECT_ADDR_END)
// 9
`define LINE_SELECT_ADDR_END (`LINE_SELECT_SIZE_END+`LINE_SELECT_ADDR_START-1)
`define LINE_SELECT_ADDR_END (`LINE_SELECT_NUM_BITS+`LINE_SELECT_ADDR_START-1)
// 9:7
`define LINE_SELECT_ADDR_RNG `LINE_SELECT_ADDR_END:`LINE_SELECT_ADDR_START
// 2:0
`define LINE_SELECT_SIZE_RNG `LINE_SELECT_SIZE_END-1:0
`define LINE_SELECT_SIZE_RNG `LINE_SELECT_NUM_BITS-1:0
// 10
`define TAG_SELECT_ADDR_START (1+`LINE_SELECT_ADDR_END)
@@ -76,9 +73,10 @@
// 22
`define TAG_SELECT_SIZE_END (`TAG_SELECT_NUM_BITS)
// 21:0
`define TAG_SELECT_SIZE_RNG `TAG_SELECT_SIZE_END-1:0
`define TAG_SELECT_SIZE_RNG `TAG_SELECT_NUM_BITS-1:0
// Combined width of the tag-select and line-select address fields.
`define TAG_LINE_SELECT_BITS (`TAG_SELECT_NUM_BITS+`LINE_SELECT_NUM_BITS)
// Mask whose bits [WORD_SELECT_ADDR_END:0] are zero and all higher bits are one;
// ANDing an address with it clears the offset and word-select bits.
`define BASE_ADDR_MASK (~((1<<(`WORD_SELECT_ADDR_END+1))-1))
`endif

View File

@@ -54,8 +54,6 @@ module VX_cache_core_req_bank_sel
output reg [NUM_BANKS-1:0][NUM_REQUESTS-1:0] per_bank_valids
);
wire[31:0] req_address;
generate
integer curr_req;
always @(*) begin

View File

@@ -1,7 +1,6 @@
`include "VX_cache_config.vh"
module VX_cache_dfq_queue
#(
module VX_cache_dfq_queue #(
// Size of cache in bytes
parameter CACHE_SIZE_BYTES = 1024,
// Size of line inside a bank in bytes
@@ -15,8 +14,7 @@ module VX_cache_dfq_queue
// Number of cycles to complete stage 1 (read from memory)
parameter STAGE_1_CYCLES = 2,
// Queues feeding into banks Knobs {1, 2, 4, 8, ...}
// Queues feeding into banks Knobs {1, 2, 4, 8, ...}
// Core Request Queue Size
parameter REQQ_SIZE = 8,
// Miss Reserv Queue Knob
@@ -26,7 +24,7 @@ module VX_cache_dfq_queue
// Snoop Req Queue
parameter SNRQ_SIZE = 8,
// Queues for writebacks Knobs {1, 2, 4, 8, ...}
// Queues for writebacks Knobs {1, 2, 4, 8, ...}
// Core Writeback Queue Size
parameter CWBQ_SIZE = 8,
// Dram Writeback Queue Size
@@ -39,16 +37,13 @@ module VX_cache_dfq_queue
// Fill Invalidator Size {Fill invalidator must be active}
parameter FILL_INVALIDAOR_SIZE = 16,
// Dram knobs
// Dram knobs
parameter SIMULATED_DRAM_LATENCY_CYCLES = 10
)
(
) (
input wire clk,
input wire reset,
input wire dfqq_push,
input wire[NUM_BANKS-1:0] per_bank_dram_fill_req,
input wire[NUM_BANKS-1:0] per_bank_dram_fill_req_valid,
input wire[NUM_BANKS-1:0][31:0] per_bank_dram_fill_req_addr,
input wire dfqq_pop,
@@ -61,17 +56,14 @@ module VX_cache_dfq_queue
wire[NUM_BANKS-1:0] out_per_bank_dram_fill_req;
wire[NUM_BANKS-1:0][31:0] out_per_bank_dram_fill_req_addr;
reg [NUM_BANKS-1:0] use_per_bank_dram_fill_req;
reg [NUM_BANKS-1:0][31:0] use_per_bank_dram_fill_req_addr;
wire[NUM_BANKS-1:0] qual_bank_dram_fill_req;
wire[NUM_BANKS-1:0][31:0] qual_bank_dram_fill_req_addr;
wire[NUM_BANKS-1:0] updated_bank_dram_fill_req;
wire o_empty;
wire use_empty = !(|use_per_bank_dram_fill_req);
@@ -79,27 +71,34 @@ module VX_cache_dfq_queue
wire push_qual = dfqq_push && !dfqq_full;
wire pop_qual = dfqq_pop && use_empty && !out_empty;
VX_generic_queue_ll #(.DATAW(NUM_BANKS * (1+32)), .SIZE(DFQQ_SIZE)) dfqq_queue(
VX_generic_queue_ll #(
.DATAW(NUM_BANKS * (1+32)),
.SIZE(DFQQ_SIZE)
) dfqq_queue (
.clk (clk),
.reset (reset),
.push (push_qual),
.in_data ({per_bank_dram_fill_req, per_bank_dram_fill_req_addr}),
.in_data ({per_bank_dram_fill_req_valid, per_bank_dram_fill_req_addr}),
.pop (pop_qual),
.out_data({out_per_bank_dram_fill_req, out_per_bank_dram_fill_req_addr}),
.empty (o_empty),
.full (dfqq_full)
);
);
// Select the per-bank fill requests to serve this cycle:
//  - if the local working copy (use_*) has no pending bits, take the queue head,
//    masked to all-zero when the queue is empty;
//  - otherwise keep serving the bits still pending in the working copy.
assign qual_bank_dram_fill_req = use_empty ? (out_per_bank_dram_fill_req & {NUM_BANKS{!o_empty}}) : (use_per_bank_dram_fill_req & {NUM_BANKS{!use_empty}});
// Addresses follow the same source selection as the request bits above.
assign qual_bank_dram_fill_req_addr = use_empty ? out_per_bank_dram_fill_req_addr : use_per_bank_dram_fill_req_addr;
wire[`LOG2UP(NUM_BANKS)-1:0] qual_request_index;
wire qual_has_request;
VX_generic_priority_encoder #(.N(NUM_BANKS)) VX_sel_bank(
wire qual_has_request;
VX_generic_priority_encoder #(
.N(NUM_BANKS)
) vx_sel_bank (
.valids(qual_bank_dram_fill_req),
.index (qual_request_index),
.found (qual_has_request)
);
);
assign dfqq_empty = !qual_has_request;
assign dfqq_req = qual_bank_dram_fill_req [qual_request_index];
@@ -119,5 +118,4 @@ module VX_cache_dfq_queue
end
end
endmodule

View File

@@ -1,7 +1,6 @@
`include "VX_cache_config.vh"
module VX_cache_dram_req_arb
#(
module VX_cache_dram_req_arb #(
// Size of cache in bytes
parameter CACHE_SIZE_BYTES = 1024,
// Size of line inside a bank in bytes
@@ -15,7 +14,7 @@ module VX_cache_dram_req_arb
// Number of cycles to complete stage 1 (read from memory)
parameter STAGE_1_CYCLES = 2,
// Queues feeding into banks Knobs {1, 2, 4, 8, ...}
// Queues feeding into banks Knobs {1, 2, 4, 8, ...}
// Core Request Queue Size
parameter REQQ_SIZE = 8,
@@ -26,7 +25,7 @@ module VX_cache_dram_req_arb
// Snoop Req Queue
parameter SNRQ_SIZE = 8,
// Queues for writebacks Knobs {1, 2, 4, 8, ...}
// Queues for writebacks Knobs {1, 2, 4, 8, ...}
// Core Writeback Queue Size
parameter CWBQ_SIZE = 8,
// Dram Writeback Queue Size
@@ -45,39 +44,29 @@ module VX_cache_dram_req_arb
// Dram knobs
parameter SIMULATED_DRAM_LATENCY_CYCLES = 10
)
(
) (
input wire clk,
input wire reset,
// Fill Request
output wire dfqq_full,
input wire[NUM_BANKS-1:0] per_bank_dram_fill_req,
input wire[NUM_BANKS-1:0][31:0] per_bank_dram_fill_req_addr,
// DFQ Request
output wire[NUM_BANKS-1:0] per_bank_dram_wb_queue_pop,
input wire[NUM_BANKS-1:0] per_bank_dram_wb_req,
input wire[NUM_BANKS-1:0][31:0] per_bank_dram_wb_req_addr,
input wire[NUM_BANKS-1:0][`BANK_LINE_WORDS-1:0][`WORD_SIZE-1:0] per_bank_dram_wb_req_data,
input wire[NUM_BANKS-1:0] per_bank_dram_because_of_snp,
// real Dram request
output wire dram_req,
output wire dram_req_write,
output wire dram_req_read,
output wire [31:0] dram_req_addr,
output wire [31:0] dram_req_size,
output wire [`IBANK_LINE_WORDS-1:0][31:0] dram_req_data,
output wire dram_req_because_of_wb,
input wire dram_req_delay
output wire dfqq_full,
input wire[NUM_BANKS-1:0] per_bank_dram_fill_req_valid,
input wire[NUM_BANKS-1:0][31:0] per_bank_dram_fill_req_addr,
);
// DFQ Request
output wire[NUM_BANKS-1:0] per_bank_dram_wb_queue_pop,
input wire[NUM_BANKS-1:0] per_bank_dram_wb_req_valid,
input wire[NUM_BANKS-1:0][31:0] per_bank_dram_wb_req_addr,
input wire[NUM_BANKS-1:0][`BANK_LINE_WORDS-1:0][`WORD_SIZE-1:0] per_bank_dram_wb_req_data,
// real Dram request
output wire dram_req_read,
output wire dram_req_write,
output wire [31:0] dram_req_addr,
output wire [`IBANK_LINE_WORDS-1:0][31:0] dram_req_data,
input wire dram_req_full
);
wire pref_pop;
wire pref_valid;
@@ -86,66 +75,62 @@ module VX_cache_dram_req_arb
wire dwb_valid;
wire dfqq_req;
assign pref_pop = !dwb_valid && !dfqq_req && !dram_req_delay && pref_valid;
assign pref_pop = !dwb_valid && !dfqq_req && !dram_req_full && pref_valid;
VX_prefetcher #(
.PRFQ_SIZE (PRFQ_SIZE),
.PRFQ_STRIDE (PRFQ_STRIDE),
.BANK_LINE_SIZE_BYTES(BANK_LINE_SIZE_BYTES),
.WORD_SIZE_BYTES (WORD_SIZE_BYTES)
)
prfqq
(
) prfqq (
.clk (clk),
.reset (reset),
.dram_req (dram_req && dram_req_read),
.dram_req (dram_req_read),
.dram_req_addr(dram_req_addr),
.pref_pop (pref_pop),
.pref_valid (pref_valid),
.pref_addr (pref_addr)
);
);
wire[31:0] dfqq_req_addr;
/* verilator lint_off UNUSED */
wire dfqq_empty;
wire dfqq_pop = !dwb_valid && dfqq_req && !dram_req_delay; // If no dwb, and dfqq has valids, then pop
wire dfqq_push = (|per_bank_dram_fill_req);
/* verilator lint_on UNUSED */
wire dfqq_pop = !dwb_valid && dfqq_req && !dram_req_full; // If no dwb, and dfqq has valids, then pop
wire dfqq_push = (|per_bank_dram_fill_req_valid);
VX_cache_dfq_queue VX_cache_dfq_queue(
.clk (clk),
.reset (reset),
.dfqq_push (dfqq_push),
.per_bank_dram_fill_req (per_bank_dram_fill_req),
.per_bank_dram_fill_req_addr(per_bank_dram_fill_req_addr),
.dfqq_pop (dfqq_pop),
.dfqq_req (dfqq_req),
.dfqq_req_addr (dfqq_req_addr),
.dfqq_empty (dfqq_empty),
.dfqq_full (dfqq_full)
);
VX_cache_dfq_queue vx_cache_dfq_queue(
.clk (clk),
.reset (reset),
.dfqq_push (dfqq_push),
.per_bank_dram_fill_req_valid (per_bank_dram_fill_req_valid),
.per_bank_dram_fill_req_addr (per_bank_dram_fill_req_addr),
.dfqq_pop (dfqq_pop),
.dfqq_req (dfqq_req),
.dfqq_req_addr (dfqq_req_addr),
.dfqq_empty (dfqq_empty),
.dfqq_full (dfqq_full)
);
wire[`LOG2UP(NUM_BANKS)-1:0] dwb_bank;
// wire[NUM_BANKS-1:0] use_wb_valid = per_bank_dram_wb_req | per_bank_dram_because_of_snp;
wire[NUM_BANKS-1:0] use_wb_valid = per_bank_dram_wb_req;
VX_generic_priority_encoder #(.N(NUM_BANKS)) VX_sel_dwb(
wire[NUM_BANKS-1:0] use_wb_valid = per_bank_dram_wb_req_valid;
VX_generic_priority_encoder #(
.N(NUM_BANKS)
) vx_sel_dwb (
.valids(use_wb_valid),
.index (dwb_bank),
.found (dwb_valid)
);
);
// Pop only the writeback queue of the bank chosen by the priority encoder
// (bit dwb_bank, and only if that bank actually has a valid writeback);
// pop nothing while the DRAM request path reports full.
assign per_bank_dram_wb_queue_pop = dram_req_full ? 0 : use_wb_valid & ((1 << dwb_bank));
assign per_bank_dram_wb_queue_pop = dram_req_delay ? 0 : use_wb_valid & ((1 << dwb_bank));
assign dram_req = dwb_valid || dfqq_req || pref_pop;
assign dram_req_write = dwb_valid && dram_req;
assign dram_req_read = ((dfqq_req && !dwb_valid) || pref_pop) && dram_req;
assign dram_req_addr = (dwb_valid ? per_bank_dram_wb_req_addr[dwb_bank] : (dfqq_req ? dfqq_req_addr : pref_addr)) & `BASE_ADDR_MASK;
assign dram_req_size = BANK_LINE_SIZE_BYTES;
assign {dram_req_data} = dwb_valid ? {per_bank_dram_wb_req_data[dwb_bank] }: 0;
// assign dram_req_because_of_wb = dwb_valid ? per_bank_dram_because_of_snp[dwb_bank] : 0;
assign dram_req_because_of_wb = 0;
// DRAM request arbitration. Fixed priority, highest first:
//   1. bank writeback (dwb_valid)  -> write request
//   2. fill request (dfqq_req)     -> read request
//   3. prefetch (pref_pop)         -> read request
// NOTE(review): pref_pop and the queue pops are gated by !dram_req_full, but the
// writeback and fill terms of dram_req_read/dram_req_write are not. Confirm the
// consumer ignores these strobes while it asserts dram_req_full.
wire dram_req = dwb_valid || dfqq_req || pref_pop;
assign dram_req_read = ((dfqq_req && !dwb_valid) || pref_pop) && dram_req;
assign dram_req_write = dwb_valid && dram_req;
// Address follows the same priority and is masked with BASE_ADDR_MASK.
assign dram_req_addr = (dwb_valid ? per_bank_dram_wb_req_addr[dwb_bank] : (dfqq_req ? dfqq_req_addr : pref_addr)) & `BASE_ADDR_MASK;
// Data is only meaningful for writebacks; reads drive zero.
assign {dram_req_data} = dwb_valid ? {per_bank_dram_wb_req_data[dwb_bank] }: 0;
endmodule

View File

@@ -1,8 +1,7 @@
`include "VX_cache_config.vh"
module VX_cache_miss_resrv
#(
module VX_cache_miss_resrv #(
// Size of cache in bytes
parameter CACHE_SIZE_BYTES = 1024,
// Size of line inside a bank in bytes
@@ -16,8 +15,7 @@ module VX_cache_miss_resrv
// Number of cycles to complete stage 1 (read from memory)
parameter STAGE_1_CYCLES = 2,
// Queues feeding into banks Knobs {1, 2, 4, 8, ...}
// Queues feeding into banks Knobs {1, 2, 4, 8, ...}
// Core Request Queue Size
parameter REQQ_SIZE = 8,
// Miss Reserv Queue Knob
@@ -27,7 +25,7 @@ module VX_cache_miss_resrv
// Snoop Req Queue
parameter SNRQ_SIZE = 8,
// Queues for writebacks Knobs {1, 2, 4, 8, ...}
// Queues for writebacks Knobs {1, 2, 4, 8, ...}
// Core Writeback Queue Size
parameter CWBQ_SIZE = 8,
// Dram Writeback Queue Size
@@ -40,12 +38,9 @@ module VX_cache_miss_resrv
// Fill Invalidator Size {Fill invalidator must be active}
parameter FILL_INVALIDAOR_SIZE = 16,
// Dram knobs
// Dram knobs
parameter SIMULATED_DRAM_LATENCY_CYCLES = 10
)
(
) (
input wire clk,
input wire reset,
@@ -65,7 +60,11 @@ module VX_cache_miss_resrv
// Broadcast Fill
input wire is_fill_st1,
/* verilator lint_off UNUSED */
// TODO: only fill_addr_st1[31:`LINE_SELECT_ADDR_START] is read (in the make_ready
// address compare), so the lower bits of this port are unused and need the lint
// waiver. Narrow the port, or pass the pre-sliced address, to remove the waiver.
input wire[31:0] fill_addr_st1,
/* verilator lint_on UNUSED */
// Miss dequeue
input wire miss_resrv_pop,
@@ -81,96 +80,91 @@ module VX_cache_miss_resrv
output wire[2:0] miss_resrv_mem_write_st0
);
// Size of metadata = 32 + `LOG2UP(NUM_REQUESTS) + 5 + 2 + (`NW_BITS-1 + 1)
reg [`MRVQ_METADATA_SIZE-1:0] metadata_table[MRVQ_SIZE-1:0];
reg [MRVQ_SIZE-1:0][31:0] addr_table;
reg [MRVQ_SIZE-1:0][31:0] pc_table;
reg [MRVQ_SIZE-1:0] valid_table;
reg [MRVQ_SIZE-1:0] ready_table;
reg [`LOG2UP(MRVQ_SIZE)-1:0] head_ptr;
reg [`LOG2UP(MRVQ_SIZE)-1:0] tail_ptr;
// Size of metadata = 32 + `LOG2UP(NUM_REQUESTS) + 5 + 2 + (`NW_BITS-1 + 1)
reg[`MRVQ_METADATA_SIZE-1:0] metadata_table[MRVQ_SIZE-1:0];
reg[MRVQ_SIZE-1:0][31:0] addr_table;
reg[MRVQ_SIZE-1:0][31:0] pc_table;
reg[MRVQ_SIZE-1:0] valid_table;
reg[MRVQ_SIZE-1:0] ready_table;
reg[`LOG2UP(MRVQ_SIZE)-1:0] head_ptr;
reg[`LOG2UP(MRVQ_SIZE)-1:0] tail_ptr;
reg [31:0] size;
reg[31:0] size;
// assign miss_resrv_full = (MRVQ_SIZE != 2) && (tail_ptr+1) == head_ptr;
assign miss_resrv_full = (MRVQ_SIZE != 2) && (size == MRVQ_SIZE );
assign miss_resrv_stop = (MRVQ_SIZE != 2) && (size > (MRVQ_SIZE-5));
wire enqueue_possible = !miss_resrv_full;
wire [`LOG2UP(MRVQ_SIZE)-1:0] enqueue_index = tail_ptr;
// assign miss_resrv_full = (MRVQ_SIZE != 2) && (tail_ptr+1) == head_ptr;
assign miss_resrv_full = (MRVQ_SIZE != 2) && (size == MRVQ_SIZE );
assign miss_resrv_stop = (MRVQ_SIZE != 2) && (size > (MRVQ_SIZE-5));
reg [MRVQ_SIZE-1:0] make_ready;
// Per-entry fill match. make_ready[e] is asserted while a fill is being
// broadcast (is_fill_st1), entry e is valid, and the address stored for entry e
// equals the fill address on bits [31:`LINE_SELECT_ADDR_START].
genvar curr_e;
generate
for (curr_e = 0; curr_e < MRVQ_SIZE; curr_e=curr_e+1) begin
// Address bits below `LINE_SELECT_ADDR_START take no part in the comparison.
assign make_ready[curr_e] = is_fill_st1 && valid_table[curr_e]
&& addr_table[curr_e][31:`LINE_SELECT_ADDR_START] == fill_addr_st1[31:`LINE_SELECT_ADDR_START];
end
endgenerate
wire enqueue_possible = !miss_resrv_full;
wire[`LOG2UP(MRVQ_SIZE)-1:0] enqueue_index = tail_ptr;
wire dequeue_possible = valid_table[head_ptr] && ready_table[head_ptr];
wire [`LOG2UP(MRVQ_SIZE)-1:0] dequeue_index = head_ptr;
reg[MRVQ_SIZE-1:0] make_ready;
genvar curr_e;
generate
for (curr_e = 0; curr_e < MRVQ_SIZE; curr_e=curr_e+1) begin
assign make_ready[curr_e] = is_fill_st1 && valid_table[curr_e]
&& addr_table[curr_e][31:`LINE_SELECT_ADDR_START] == fill_addr_st1[31:`LINE_SELECT_ADDR_START];
assign miss_resrv_valid_st0 = (MRVQ_SIZE != 2) && dequeue_possible;
assign miss_resrv_pc_st0 = pc_table[dequeue_index];
assign miss_resrv_addr_st0 = addr_table[dequeue_index];
assign {miss_resrv_data_st0, miss_resrv_tid_st0, miss_resrv_rd_st0, miss_resrv_wb_st0, miss_resrv_warp_num_st0, miss_resrv_mem_read_st0, miss_resrv_mem_write_st0} = metadata_table[dequeue_index];
wire mrvq_push = miss_add && enqueue_possible && (MRVQ_SIZE != 2);
wire mrvq_pop = miss_resrv_pop && dequeue_possible;
wire update_ready = (|make_ready);
integer i;
always @(posedge clk) begin
if (reset) begin
for (i = 0; i < MRVQ_SIZE; i=i+1) begin
metadata_table[i] <= 0;
end
valid_table <= 0;
ready_table <= 0;
addr_table <= 0;
pc_table <= 0;
size <= 0;
head_ptr <= 0;
tail_ptr <= 0;
end else begin
if (mrvq_push) begin
valid_table[enqueue_index] <= 1;
ready_table[enqueue_index] <= 0;
pc_table[enqueue_index] <= miss_add_pc;
addr_table[enqueue_index] <= miss_add_addr;
metadata_table[enqueue_index] <= {miss_add_data, miss_add_tid, miss_add_rd, miss_add_wb, miss_add_warp_num, miss_add_mem_read, miss_add_mem_write};
tail_ptr <= tail_ptr + 1;
end
endgenerate
if (update_ready) begin
ready_table <= ready_table | make_ready;
end
wire dequeue_possible = valid_table[head_ptr] && ready_table[head_ptr];
wire[`LOG2UP(MRVQ_SIZE)-1:0] dequeue_index = head_ptr;
if (mrvq_pop) begin
valid_table[dequeue_index] <= 0;
ready_table[dequeue_index] <= 0;
addr_table[dequeue_index] <= 0;
metadata_table[dequeue_index] <= 0;
pc_table[dequeue_index] <= 0;
head_ptr <= head_ptr + 1;
end
assign miss_resrv_valid_st0 = (MRVQ_SIZE != 2) && dequeue_possible;
assign miss_resrv_pc_st0 = pc_table[dequeue_index];
assign miss_resrv_addr_st0 = addr_table[dequeue_index];
assign {miss_resrv_data_st0, miss_resrv_tid_st0, miss_resrv_rd_st0, miss_resrv_wb_st0, miss_resrv_warp_num_st0, miss_resrv_mem_read_st0, miss_resrv_mem_write_st0} = metadata_table[dequeue_index];
wire mrvq_push = miss_add && enqueue_possible && (MRVQ_SIZE != 2);
wire mrvq_pop = miss_resrv_pop && dequeue_possible;
wire update_ready = (|make_ready);
integer i;
always @(posedge clk) begin
if (reset) begin
for (i = 0; i < MRVQ_SIZE; i=i+1) metadata_table[i] <= 0;
valid_table <= 0;
ready_table <= 0;
addr_table <= 0;
pc_table <= 0;
size <= 0;
head_ptr <= 0;
tail_ptr <= 0;
end else begin
if (!(mrvq_push && mrvq_pop)) begin
if (mrvq_push) begin
valid_table[enqueue_index] <= 1;
ready_table[enqueue_index] <= 0;
pc_table[enqueue_index] <= miss_add_pc;
addr_table[enqueue_index] <= miss_add_addr;
metadata_table[enqueue_index] <= {miss_add_data, miss_add_tid, miss_add_rd, miss_add_wb, miss_add_warp_num, miss_add_mem_read, miss_add_mem_write};
tail_ptr <= tail_ptr + 1;
end
if (update_ready) begin
ready_table <= ready_table | make_ready;
size <= size + 1;
end
if (mrvq_pop) begin
valid_table[dequeue_index] <= 0;
ready_table[dequeue_index] <= 0;
addr_table[dequeue_index] <= 0;
metadata_table[dequeue_index] <= 0;
pc_table[dequeue_index] <= 0;
head_ptr <= head_ptr + 1;
size <= size - 1;
end
if (!(mrvq_push && mrvq_pop)) begin
if (mrvq_push) begin
size <= size + 1;
end
if (mrvq_pop) begin
size <= size - 1;
end
end
end
end
end
endmodule

View File

@@ -1,7 +1,6 @@
`include "VX_cache_config.vh"
module VX_cache_req_queue
#(
module VX_cache_req_queue #(
// Size of cache in bytes
parameter CACHE_SIZE_BYTES = 1024,
// Size of line inside a bank in bytes
@@ -15,8 +14,7 @@ module VX_cache_req_queue
// Number of cycles to complete stage 1 (read from memory)
parameter STAGE_1_CYCLES = 2,
// Queues feeding into banks Knobs {1, 2, 4, 8, ...}
// Queues feeding into banks Knobs {1, 2, 4, 8, ...}
// Core Request Queue Size
parameter REQQ_SIZE = 8,
// Miss Reserv Queue Knob
@@ -26,7 +24,7 @@ module VX_cache_req_queue
// Snoop Req Queue
parameter SNRQ_SIZE = 8,
// Queues for writebacks Knobs {1, 2, 4, 8, ...}
// Queues for writebacks Knobs {1, 2, 4, 8, ...}
// Core Writeback Queue Size
parameter CWBQ_SIZE = 8,
// Dram Writeback Queue Size
@@ -39,12 +37,9 @@ module VX_cache_req_queue
// Fill Invalidator Size {Fill invalidator must be active}
parameter FILL_INVALIDAOR_SIZE = 16,
// Dram knobs
// Dram knobs
parameter SIMULATED_DRAM_LATENCY_CYCLES = 10
)
(
) (
input wire clk,
input wire reset,
@@ -88,7 +83,6 @@ module VX_cache_req_queue
wire [NUM_REQUESTS-1:0][2:0] out_per_mem_write;
wire [31:0] out_per_pc;
reg [NUM_REQUESTS-1:0] use_per_valids;
reg [NUM_REQUESTS-1:0][31:0] use_per_addr;
reg [NUM_REQUESTS-1:0][`WORD_SIZE_RNG] use_per_writedata;
@@ -99,7 +93,6 @@ module VX_cache_req_queue
reg [NUM_REQUESTS-1:0][2:0] use_per_mem_read;
reg [NUM_REQUESTS-1:0][2:0] use_per_mem_write;
wire [NUM_REQUESTS-1:0] qual_valids;
wire [NUM_REQUESTS-1:0][31:0] qual_addr;
wire [NUM_REQUESTS-1:0][`WORD_SIZE_RNG] qual_writedata;
@@ -110,7 +103,9 @@ module VX_cache_req_queue
wire [NUM_REQUESTS-1:0][2:0] qual_mem_write;
wire [31:0] qual_pc;
/* verilator lint_off UNUSED */
reg [NUM_REQUESTS-1:0] updated_valids;
/* verilator lint_on UNUSED */
wire o_empty;
@@ -120,17 +115,19 @@ module VX_cache_req_queue
wire push_qual = reqq_push && !reqq_full;
wire pop_qual = !out_empty && use_empty;
VX_generic_queue_ll #(.DATAW( (NUM_REQUESTS * (1+32+`WORD_SIZE)) + 5 + (NUM_REQUESTS*2) + (`NW_BITS-1+1) + (NUM_REQUESTS * (3 + 3)) + 32 ), .SIZE(REQQ_SIZE)) reqq_queue(
.clk (clk),
.reset (reset),
.push (push_qual),
.in_data ({bank_valids , bank_addr , bank_writedata , bank_rd , bank_wb , bank_warp_num , bank_mem_read , bank_mem_write , bank_pc}),
.pop (pop_qual),
.out_data({out_per_valids, out_per_addr, out_per_writedata, out_per_rd, out_per_wb, out_per_warp_num, out_per_mem_read, out_per_mem_write, out_per_pc}),
.empty (o_empty),
.full (reqq_full)
);
// Core request queue. One entry carries everything listed in the in_data
// concatenation below (valids, addresses, write data, rd, wb, warp number,
// mem_read, mem_write, pc); out_data unpacks the head entry in the same order.
VX_generic_queue_ll #(
// NOTE(review): DATAW is spelled as a sum of terms that presumably mirror the
// packed fields one-to-one (per-request valid+addr+data, rd, wb, warp number,
// per-request mem_read+mem_write, pc) - confirm against the signal declarations.
.DATAW( (NUM_REQUESTS * (1+32+`WORD_SIZE)) + 5 + (NUM_REQUESTS*2) + (`NW_BITS-1+1) + (NUM_REQUESTS * (3 + 3)) + 32 ),
.SIZE(REQQ_SIZE)
) reqq_queue (
.clk (clk),
.reset (reset),
// push_qual = reqq_push && !reqq_full: pushes are dropped while the queue reports full.
.push (push_qual),
.in_data ({bank_valids , bank_addr , bank_writedata , bank_rd , bank_wb , bank_warp_num , bank_mem_read , bank_mem_write , bank_pc}),
// pop_qual = !out_empty && use_empty.
.pop (pop_qual),
.out_data ({out_per_valids, out_per_addr, out_per_writedata, out_per_rd, out_per_wb, out_per_warp_num, out_per_mem_read, out_per_mem_write, out_per_pc}),
.empty (o_empty),
.full (reqq_full)
);
wire[NUM_REQUESTS-1:0] real_out_per_valids = out_per_valids & {NUM_REQUESTS{~out_empty}};
@@ -146,11 +143,13 @@ module VX_cache_req_queue
wire[`LOG2UP(NUM_REQUESTS)-1:0] qual_request_index;
wire qual_has_request;
VX_generic_priority_encoder #(.N(NUM_REQUESTS)) VX_sel_bank(
VX_generic_priority_encoder #(
.N(NUM_REQUESTS)
) vx_sel_bank (
.valids(qual_valids),
.index (qual_request_index),
.found (qual_has_request)
);
);
assign reqq_empty = !qual_has_request;
assign reqq_req_st0 = qual_has_request;
@@ -164,7 +163,6 @@ module VX_cache_req_queue
assign reqq_req_mem_write_st0 = qual_mem_write[qual_request_index];
assign reqq_req_pc_st0 = qual_pc;
always @(*) begin
updated_valids = qual_valids;
if (qual_has_request) begin
@@ -172,7 +170,6 @@ module VX_cache_req_queue
end
end
always @(posedge clk) begin
if (reset) begin
use_per_valids <= 0;
@@ -204,5 +201,4 @@ module VX_cache_req_queue
end
end
endmodule

View File

@@ -1,7 +1,6 @@
`include "VX_cache_config.vh"
module VX_cache_wb_sel_merge
#(
module VX_cache_wb_sel_merge #(
// Size of cache in bytes
parameter CACHE_SIZE_BYTES = 1024,
// Size of line inside a bank in bytes
@@ -17,8 +16,7 @@ module VX_cache_wb_sel_merge
// Function ID, {Dcache=0, Icache=1, Sharedmemory=2}
parameter FUNC_ID = 0,
// Queues feeding into banks Knobs {1, 2, 4, 8, ...}
// Queues feeding into banks Knobs {1, 2, 4, 8, ...}
// Core Request Queue Size
parameter REQQ_SIZE = 8,
// Miss Reserv Queue Knob
@@ -28,7 +26,7 @@ module VX_cache_wb_sel_merge
// Snoop Req Queue
parameter SNRQ_SIZE = 8,
// Queues for writebacks Knobs {1, 2, 4, 8, ...}
// Queues for writebacks Knobs {1, 2, 4, 8, ...}
// Core Writeback Queue Size
parameter CWBQ_SIZE = 8,
// Dram Writeback Queue Size
@@ -41,35 +39,29 @@ module VX_cache_wb_sel_merge
// Fill Invalidator Size {Fill invalidator must be active}
parameter FILL_INVALIDAOR_SIZE = 16,
// Dram knobs
// Dram knobs
parameter SIMULATED_DRAM_LATENCY_CYCLES = 10
)
(
) (
// Per Bank WB
input wire [NUM_BANKS-1:0] per_bank_wb_valid,
input wire [NUM_BANKS-1:0][`LOG2UP(NUM_REQUESTS)-1:0] per_bank_wb_tid,
input wire [NUM_BANKS-1:0][4:0] per_bank_wb_rd,
input wire [NUM_BANKS-1:0][1:0] per_bank_wb_wb,
input wire [NUM_BANKS-1:0][`NW_BITS-1:0] per_bank_wb_warp_num,
input wire [NUM_BANKS-1:0][`WORD_SIZE_RNG] per_bank_wb_data,
input wire [NUM_BANKS-1:0][31:0] per_bank_wb_pc,
input wire [NUM_BANKS-1:0][31:0] per_bank_wb_address,
output wire [NUM_BANKS-1:0] per_bank_wb_pop,
input wire [NUM_BANKS-1:0] per_bank_wb_valid,
input wire [NUM_BANKS-1:0][`LOG2UP(NUM_REQUESTS)-1:0] per_bank_wb_tid,
input wire [NUM_BANKS-1:0][4:0] per_bank_wb_rd,
input wire [NUM_BANKS-1:0][1:0] per_bank_wb_wb,
input wire [NUM_BANKS-1:0][`NW_BITS-1:0] per_bank_wb_warp_num,
input wire [NUM_BANKS-1:0][`WORD_SIZE_RNG] per_bank_wb_data,
input wire [NUM_BANKS-1:0][31:0] per_bank_wb_pc,
input wire [NUM_BANKS-1:0][31:0] per_bank_wb_address,
output wire [NUM_BANKS-1:0] per_bank_wb_pop,
// Core Writeback
input wire core_no_wb_slot,
output reg [NUM_REQUESTS-1:0] core_wb_valid,
output reg [NUM_REQUESTS-1:0][`WORD_SIZE_RNG] core_wb_readdata,
output reg [NUM_REQUESTS-1:0][31:0] core_wb_pc,
output wire [4:0] core_wb_req_rd,
output wire [1:0] core_wb_req_wb,
output wire [`NW_BITS-1:0] core_wb_warp_num,
output reg [NUM_REQUESTS-1:0][31:0] core_wb_address
input wire core_no_wb_slot,
output reg [NUM_REQUESTS-1:0] core_wb_valid,
output reg [NUM_REQUESTS-1:0][`WORD_SIZE_RNG] core_wb_readdata,
output reg [NUM_REQUESTS-1:0][31:0] core_wb_pc,
output wire [4:0] core_wb_req_rd,
output wire [1:0] core_wb_req_wb,
output wire [`NW_BITS-1:0] core_wb_warp_num,
output reg [NUM_REQUESTS-1:0][31:0] core_wb_address
);
reg [NUM_BANKS-1:0] per_bank_wb_pop_unqual;
@@ -83,15 +75,16 @@ module VX_cache_wb_sel_merge
// end
// endgenerate
wire [`LOG2UP(NUM_BANKS)-1:0] main_bank_index;
wire found_bank;
wire found_bank;
VX_generic_priority_encoder #(.N(NUM_BANKS)) VX_sel_bank(
VX_generic_priority_encoder #(
.N(NUM_BANKS)
) vx_sel_bank (
.valids(per_bank_wb_valid),
.index (main_bank_index),
.found (found_bank)
);
);
assign core_wb_req_rd = per_bank_wb_rd[main_bank_index];
assign core_wb_req_wb = per_bank_wb_wb[main_bank_index];
@@ -106,42 +99,36 @@ module VX_cache_wb_sel_merge
core_wb_address = 0;
for (this_bank = 0; this_bank < NUM_BANKS; this_bank = this_bank + 1) begin
if ((FUNC_ID == `L2FUNC_ID) || (FUNC_ID == `L3FUNC_ID)) begin
if (found_bank
&& !core_wb_valid[per_bank_wb_tid[this_bank]]
&& per_bank_wb_valid[this_bank]
&& ((main_bank_index == `LOG2UP(NUM_BANKS)'(this_bank))
|| (per_bank_wb_tid[this_bank] != per_bank_wb_tid[main_bank_index]))) begin
core_wb_valid[per_bank_wb_tid[this_bank]] = 1;
core_wb_readdata[per_bank_wb_tid[this_bank]] = per_bank_wb_data[this_bank];
core_wb_pc[per_bank_wb_tid[this_bank]] = per_bank_wb_pc[this_bank];
core_wb_address[per_bank_wb_tid[this_bank]] = per_bank_wb_address[this_bank];
per_bank_wb_pop_unqual[this_bank] = 1;
end else begin
per_bank_wb_pop_unqual[this_bank] = 0;
end
if (found_bank
&& !core_wb_valid[per_bank_wb_tid[this_bank]]
&& per_bank_wb_valid[this_bank]
&& ((main_bank_index == `LOG2UP(NUM_BANKS)'(this_bank))
|| (per_bank_wb_tid[this_bank] != per_bank_wb_tid[main_bank_index]))) begin
core_wb_valid[per_bank_wb_tid[this_bank]] = 1;
core_wb_readdata[per_bank_wb_tid[this_bank]] = per_bank_wb_data[this_bank];
core_wb_pc[per_bank_wb_tid[this_bank]] = per_bank_wb_pc[this_bank];
core_wb_address[per_bank_wb_tid[this_bank]] = per_bank_wb_address[this_bank];
per_bank_wb_pop_unqual[this_bank] = 1;
end else begin
per_bank_wb_pop_unqual[this_bank] = 0;
end
end else begin
if (((main_bank_index == `LOG2UP(NUM_BANKS)'(this_bank))
|| (per_bank_wb_tid[this_bank] != per_bank_wb_tid[main_bank_index]))
&& found_bank
&& !core_wb_valid[per_bank_wb_tid[this_bank]]
&& (per_bank_wb_valid[this_bank])
&& (per_bank_wb_rd[this_bank] == per_bank_wb_rd[main_bank_index])
&& (per_bank_wb_warp_num[this_bank] == per_bank_wb_warp_num[main_bank_index])) begin
core_wb_valid[per_bank_wb_tid[this_bank]] = 1;
core_wb_readdata[per_bank_wb_tid[this_bank]] = per_bank_wb_data[this_bank];
core_wb_pc[per_bank_wb_tid[this_bank]] = per_bank_wb_pc[this_bank];
core_wb_address[per_bank_wb_tid[this_bank]] = per_bank_wb_address[this_bank];
per_bank_wb_pop_unqual[this_bank] = 1;
end else begin
per_bank_wb_pop_unqual[this_bank] = 0;
end
if (((main_bank_index == `LOG2UP(NUM_BANKS)'(this_bank))
|| (per_bank_wb_tid[this_bank] != per_bank_wb_tid[main_bank_index]))
&& found_bank
&& !core_wb_valid[per_bank_wb_tid[this_bank]]
&& (per_bank_wb_valid[this_bank])
&& (per_bank_wb_rd[this_bank] == per_bank_wb_rd[main_bank_index])
&& (per_bank_wb_warp_num[this_bank] == per_bank_wb_warp_num[main_bank_index])) begin
core_wb_valid[per_bank_wb_tid[this_bank]] = 1;
core_wb_readdata[per_bank_wb_tid[this_bank]] = per_bank_wb_data[this_bank];
core_wb_pc[per_bank_wb_tid[this_bank]] = per_bank_wb_pc[this_bank];
core_wb_address[per_bank_wb_tid[this_bank]] = per_bank_wb_address[this_bank];
per_bank_wb_pop_unqual[this_bank] = 1;
end else begin
per_bank_wb_pop_unqual[this_bank] = 0;
end
end
end
end
endgenerate

View File

@@ -47,14 +47,14 @@ module VX_dcache_llv_resp_bank_sel
(
output reg [NUM_BANKS-1:0] per_bank_llvq_pop,
input wire[NUM_BANKS-1:0] per_bank_llvq_valid,
input wire[NUM_BANKS-1:0][31:0] per_bank_llvq_res_addr,
input wire[NUM_BANKS-1:0][`BANK_LINE_WORDS-1:0][31:0] per_bank_llvq_res_data,
input wire[NUM_BANKS-1:0][`LOG2UP(NUM_REQUESTS)-1:0] per_bank_llvq_res_tid,
input wire[NUM_BANKS-1:0][31:0] per_bank_llvq_rsp_addr,
input wire[NUM_BANKS-1:0][`BANK_LINE_WORDS-1:0][31:0] per_bank_llvq_rsp_data,
input wire[NUM_BANKS-1:0][`LOG2UP(NUM_REQUESTS)-1:0] per_bank_llvq_rsp_tid,
input wire llvq_pop,
output reg[NUM_REQUESTS-1:0] llvq_valid,
output reg[NUM_REQUESTS-1:0][31:0] llvq_res_addr,
output reg[NUM_REQUESTS-1:0][`BANK_LINE_WORDS-1:0][31:0] llvq_res_data
output reg[NUM_REQUESTS-1:0][31:0] llvq_rsp_addr,
output reg[NUM_REQUESTS-1:0][`BANK_LINE_WORDS-1:0][31:0] llvq_rsp_data
);
@@ -62,7 +62,7 @@ module VX_dcache_llv_resp_bank_sel
wire [(`LOG2UP(NUM_BANKS))-1:0] main_bank_index;
wire found_bank;
VX_generic_priority_encoder #(.N(NUM_BANKS)) VX_sel_bank(
VX_generic_priority_encoder #(.N(NUM_BANKS)) vx_sel_bank(
.valids(per_bank_llvq_valid),
.index (main_bank_index),
.found (found_bank)
@@ -71,13 +71,13 @@ module VX_dcache_llv_resp_bank_sel
always @(*) begin
llvq_valid = 0;
llvq_res_addr = 0;
llvq_res_data = 0;
llvq_rsp_addr = 0;
llvq_rsp_data = 0;
per_bank_llvq_pop = 0;
if (found_bank && llvq_pop) begin
llvq_valid [per_bank_llvq_res_tid[main_bank_index]] = 1'b1;
llvq_res_addr[per_bank_llvq_res_tid[main_bank_index]] = per_bank_llvq_res_addr[main_bank_index];
llvq_res_data[per_bank_llvq_res_tid[main_bank_index]] = per_bank_llvq_res_data[main_bank_index];
llvq_valid [per_bank_llvq_rsp_tid[main_bank_index]] = 1'b1;
llvq_rsp_addr[per_bank_llvq_rsp_tid[main_bank_index]] = per_bank_llvq_rsp_addr[main_bank_index];
llvq_rsp_data[per_bank_llvq_rsp_tid[main_bank_index]] = per_bank_llvq_rsp_data[main_bank_index];
per_bank_llvq_pop[main_bank_index] = 1'b1;
end
end

View File

@@ -82,17 +82,18 @@ module VX_fill_invalidator
wire [(`LOG2UP(FILL_INVALIDAOR_SIZE))-1:0] enqueue_index;
wire enqueue_found;
VX_generic_priority_encoder #(.N(FILL_INVALIDAOR_SIZE)) VX_sel_bank(
wire enqueue_found;
VX_generic_priority_encoder #(
.N(FILL_INVALIDAOR_SIZE)
) vx_sel_bank (
.valids(~fills_active),
.index (enqueue_index),
.found (enqueue_found)
);
);
assign invalidate_fill = possible_fill && matched;
always @(posedge clk) begin
if (reset) begin
fills_active <= 0;
@@ -109,7 +110,6 @@ module VX_fill_invalidator
end
end
// reg success_found;
// reg[(`LOG2UP(FILL_INVALIDAOR_SIZE))-1:0] success_index;
@@ -133,21 +133,15 @@ module VX_fill_invalidator
// end
// end
// wire [(`LOG2UP(FILL_INVALIDAOR_SIZE))-1:0] enqueue_index;
// wire enqueue_found;
// VX_generic_priority_encoder #(.N(FILL_INVALIDAOR_SIZE)) VX_sel_bank(
// VX_generic_priority_encoder #(.N(FILL_INVALIDAOR_SIZE)) vx_sel_bank(
// .valids(~fills_active),
// .index (enqueue_index),
// .found (enqueue_found)
// );
// always @(posedge clk) begin
// if (reset) begin
// fills_active <= 0;
@@ -165,8 +159,6 @@ module VX_fill_invalidator
// end
// end
end
endmodule

View File

@@ -1,15 +1,13 @@
`include "VX_cache_config.vh"
module VX_prefetcher
#(
parameter PRFQ_SIZE = 64,
parameter PRFQ_STRIDE = 2,
// Size of line inside a bank in bytes
parameter BANK_LINE_SIZE_BYTES = 16,
// Size of a word in bytes
parameter WORD_SIZE_BYTES = 4
)
(
module VX_prefetcher #(
parameter PRFQ_SIZE = 64,
parameter PRFQ_STRIDE = 2,
// Size of line inside a bank in bytes
parameter BANK_LINE_SIZE_BYTES = 16,
// Size of a word in bytes
parameter WORD_SIZE_BYTES = 4
) (
input wire clk,
input wire reset,
@@ -21,24 +19,23 @@ module VX_prefetcher
output wire[31:0] pref_addr
);
reg[`LOG2UP(PRFQ_STRIDE):0] use_valid;
reg[31:0] use_addr;
wire current_valid;
wire[31:0] current_addr;
wire current_full;
wire current_empty;
assign current_valid = ~current_empty;
wire update_use = ((use_valid == 0) || ((use_valid-1) == 0)) && current_valid;
VX_generic_queue_ll #(.DATAW(32), .SIZE(PRFQ_SIZE)) pfq_queue(
VX_generic_queue_ll #(
.DATAW(32),
.SIZE(PRFQ_SIZE)
) pfq_queue (
.clk (clk),
.reset (reset),
@@ -50,14 +47,11 @@ module VX_prefetcher
.empty (current_empty),
.full (current_full)
);
);
assign pref_valid = use_valid != 0;
assign pref_addr = use_addr;
always @(posedge clk) begin
if (reset) begin
use_valid <= 0;
@@ -70,7 +64,6 @@ module VX_prefetcher
use_valid <= use_valid - 1;
use_addr <= use_addr + BANK_LINE_SIZE_BYTES;
end
end
end

View File

@@ -5,28 +5,30 @@ module VX_snp_fwd_arb
parameter NUM_BANKS = 8
)
(
input wire[NUM_BANKS-1:0] per_bank_snp_fwd,
input wire[NUM_BANKS-1:0][31:0] per_bank_snp_fwd_addr,
output reg[NUM_BANKS-1:0] per_bank_snp_fwd_pop,
input wire [NUM_BANKS-1:0] per_bank_snp_fwd,
input wire [NUM_BANKS-1:0][31:0] per_bank_snp_fwd_addr,
output reg [NUM_BANKS-1:0] per_bank_snp_fwd_pop,
output wire snp_fwd,
output wire[31:0] snp_fwd_addr,
input wire snp_fwd_delay
output wire snp_fwd_valid,
output wire [31:0] snp_fwd_addr,
input wire snp_fwd_full
);
wire[NUM_BANKS-1:0] qual_per_bank_snp_fwd = per_bank_snp_fwd & {NUM_BANKS{!snp_fwd_delay}};
wire[NUM_BANKS-1:0] qual_per_bank_snp_fwd = per_bank_snp_fwd & {NUM_BANKS{!snp_fwd_full}};
wire[`LOG2UP(NUM_BANKS)-1:0] fsq_bank;
wire fsq_valid;
wire fsq_valid;
VX_generic_priority_encoder #(.N(NUM_BANKS)) VX_sel_ffsq(
VX_generic_priority_encoder #(
.N(NUM_BANKS)
) vx_sel_ffsq(
.valids(qual_per_bank_snp_fwd),
.index (fsq_bank),
.found (fsq_valid)
);
);
assign snp_fwd = fsq_valid;
assign snp_fwd_valid = fsq_valid;
assign snp_fwd_addr = per_bank_snp_fwd_addr[fsq_bank];
always @(*) begin

View File

@@ -1,7 +1,6 @@
`include "VX_cache_config.vh"
module VX_tag_data_access
#(
module VX_tag_data_access #(
// Size of cache in bytes
parameter CACHE_SIZE_BYTES = 1024,
// Size of line inside a bank in bytes
@@ -17,8 +16,7 @@ module VX_tag_data_access
// Function ID, {Dcache=0, Icache=1, Sharedmemory=2}
parameter FUNC_ID = 0,
// Queues feeding into banks Knobs {1, 2, 4, 8, ...}
// Queues feeding into banks Knobs {1, 2, 4, 8, ...}
// Core Request Queue Size
parameter REQQ_SIZE = 8,
// Miss Reserv Queue Knob
@@ -28,7 +26,7 @@ module VX_tag_data_access
// Snoop Req Queue
parameter SNRQ_SIZE = 8,
// Queues for writebacks Knobs {1, 2, 4, 8, ...}
// Queues for writebacks Knobs {1, 2, 4, 8, ...}
// Core Writeback Queue Size
parameter CWBQ_SIZE = 8,
// Dram Writeback Queue Size
@@ -41,24 +39,26 @@ module VX_tag_data_access
// Fill Invalidator Size {Fill invalidator must be active}
parameter FILL_INVALIDAOR_SIZE = 16,
// Dram knobs
// Dram knobs
parameter SIMULATED_DRAM_LATENCY_CYCLES = 10
)
(
) (
input wire clk,
input wire reset,
input wire stall,
input wire is_snp_st1e,
input wire stall_bank_pipe,
// Initial Reading
/* verilator lint_off UNUSED */
// TODO:
input wire[31:0] readaddr_st10,
/* verilator lint_on UNUSED */
// Write/Read Logic
input wire valid_req_st1e,
input wire writefill_st1e,
/* verilator lint_off UNUSED */
// TODO:
input wire[31:0] writeaddr_st1e,
/* verilator lint_on UNUSED */
input wire[`WORD_SIZE_RNG] writeword_st1e,
input wire[`DBANK_LINE_WORDS-1:0][31:0] writedata_st1e,
input wire[2:0] mem_write_st1e,
@@ -69,19 +69,14 @@ module VX_tag_data_access
output wire[`TAG_SELECT_SIZE_RNG] readtag_st1e,
output wire miss_st1e,
output wire dirty_st1e,
output wire fill_saw_dirty_st1e
output wire fill_saw_dirty_st1e
);
reg[`DBANK_LINE_WORDS-1:0][31:0] readdata_st[STAGE_1_CYCLES-1:0];
reg read_valid_st1c[STAGE_1_CYCLES-1:0];
reg read_dirty_st1c[STAGE_1_CYCLES-1:0];
reg[`TAG_SELECT_SIZE_RNG] read_tag_st1c [STAGE_1_CYCLES-1:0];
reg[`DBANK_LINE_WORDS-1:0][31:0] read_data_st1c [STAGE_1_CYCLES-1:0];
wire qual_read_valid_st1;
wire qual_read_dirty_st1;
wire[`TAG_SELECT_SIZE_RNG] qual_read_tag_st1;
@@ -98,9 +93,9 @@ module VX_tag_data_access
wire real_writefill = writefill_st1e && ((valid_req_st1e && !use_read_valid_st1e) || (valid_req_st1e && use_read_valid_st1e && (writeaddr_st1e[`TAG_SELECT_ADDR_RNG] != use_read_tag_st1e)));
wire fill_sent;
wire invalidate_line;
VX_tag_data_structure #(
.CACHE_SIZE_BYTES (CACHE_SIZE_BYTES),
.BANK_LINE_SIZE_BYTES (BANK_LINE_SIZE_BYTES),
@@ -119,14 +114,12 @@ module VX_tag_data_access
.LLVQ_SIZE (LLVQ_SIZE),
.FILL_INVALIDAOR_SIZE (FILL_INVALIDAOR_SIZE),
.SIMULATED_DRAM_LATENCY_CYCLES(SIMULATED_DRAM_LATENCY_CYCLES)
)
VX_tag_data_structure
(
) vx_tag_data_structure (
.clk (clk),
.reset (reset),
.stall_bank_pipe(stall_bank_pipe),
.read_addr (readaddr_st10),
.read_addr (readaddr_st10[`LINE_SELECT_ADDR_RNG]),
.read_valid (qual_read_valid_st1),
.read_dirty (qual_read_dirty_st1),
.read_tag (qual_read_tag_st1),
@@ -135,13 +128,17 @@ module VX_tag_data_access
.invalidate (invalidate_line),
.write_enable(use_write_enable),
.write_fill (real_writefill),
.write_addr (writeaddr_st1e),
.write_addr (writeaddr_st1e[`LINE_SELECT_ADDR_RNG]),
.tag_index (writeaddr_st1e[`TAG_SELECT_ADDR_RNG]),
.write_data (use_write_data),
.fill_sent (fill_sent)
);
);
// VX_generic_register #(.N( 1 + 1 + `TAG_SELECT_NUM_BITS + (`DBANK_LINE_WORDS*32) )) s0_1_c0 (
VX_generic_register #(.N( 1 + 1 + `TAG_SELECT_NUM_BITS + (`DBANK_LINE_WORDS*32) ), .Valid(0)) s0_1_c0 (
VX_generic_register #(
.N( 1 + 1 + `TAG_SELECT_NUM_BITS + (`DBANK_LINE_WORDS*32) ),
.PassThru(1)
) s0_1_c0 (
.clk (clk),
.reset(reset),
.stall(stall),
@@ -153,7 +150,9 @@ module VX_tag_data_access
genvar curr_stage;
generate
for (curr_stage = 1; curr_stage < STAGE_1_CYCLES-1; curr_stage = curr_stage + 1) begin
VX_generic_register #(.N( 1 + 1 + `TAG_SELECT_NUM_BITS + (`DBANK_LINE_WORDS*32) )) s0_1_cc (
VX_generic_register #(
.N( 1 + 1 + `TAG_SELECT_NUM_BITS + (`DBANK_LINE_WORDS*32))
) s0_1_cc (
.clk (clk),
.reset(reset),
.stall(stall),
@@ -164,7 +163,6 @@ module VX_tag_data_access
end
endgenerate
assign use_read_valid_st1e = read_valid_st1c[STAGE_1_CYCLES-1] || (FUNC_ID == `SFUNC_ID); // If shared memory, always valid
assign use_read_dirty_st1e = read_dirty_st1c[STAGE_1_CYCLES-1] && (FUNC_ID != `SFUNC_ID); // Dirty only applies in Dcache
assign use_read_tag_st1e = (FUNC_ID == `SFUNC_ID) ? writeaddr_st1e[`TAG_SELECT_ADDR_RNG] : read_tag_st1c [STAGE_1_CYCLES-1]; // Tag is always the same in SM
@@ -178,6 +176,7 @@ module VX_tag_data_access
wire[`OFFSET_SIZE_RNG] byte_select = writeaddr_st1e[`OFFSET_ADDR_RNG];
wire[`WORD_SELECT_SIZE_RNG] block_offset = writeaddr_st1e[`WORD_SELECT_ADDR_RNG];
/* verilator lint_off UNUSED */
wire lw = valid_req_st1e && (mem_read_st1e == `LW_MEM_READ);
wire lb = valid_req_st1e && (mem_read_st1e == `LB_MEM_READ);
wire lh = valid_req_st1e && (mem_read_st1e == `LH_MEM_READ);
@@ -187,49 +186,15 @@ module VX_tag_data_access
wire b0 = (byte_select == 0);
wire b1 = (byte_select == 1);
wire b2 = (byte_select == 2);
wire b3 = (byte_select == 3);
wire b3 = (byte_select == 3);
/* verilator lint_on UNUSED */
/* verilator lint_off UNUSED */
wire[31:0] w0 = read_data_st1c[STAGE_1_CYCLES-1][0][31:0];
wire[31:0] w1 = read_data_st1c[STAGE_1_CYCLES-1][1][31:0];
wire[31:0] w2 = read_data_st1c[STAGE_1_CYCLES-1][2][31:0];
wire[31:0] w3 = read_data_st1c[STAGE_1_CYCLES-1][3][31:0];
wire[31:0] data_unmod = read_data_st1c[STAGE_1_CYCLES-1][block_offset][31:0];
wire[31:0] data_unQual = (b0 || lw) ? (data_unmod) :
b1 ? (data_unmod >> 8) :
b2 ? (data_unmod >> 16) :
(data_unmod >> 24);
wire[31:0] lb_data = (data_unQual[7] ) ? (data_unQual | 32'hFFFFFF00) : (data_unQual & 32'hFF);
wire[31:0] lh_data = (data_unQual[15]) ? (data_unQual | 32'hFFFF0000) : (data_unQual & 32'hFFFF);
wire[31:0] lbu_data = (data_unQual & 32'hFF);
wire[31:0] lhu_data = (data_unQual & 32'hFFFF);
wire[31:0] lw_data = (data_unQual);
wire[31:0] sw_data = writeword_st1e[31:0];
wire[31:0] sb_data = b1 ? {{16{1'b0}}, writeword_st1e[7:0], { 8{1'b0}}} :
b2 ? {{ 8{1'b0}}, writeword_st1e[7:0], {16{1'b0}}} :
b3 ? {{ 0{1'b0}}, writeword_st1e[7:0], {24{1'b0}}} :
writeword_st1e[31:0];
wire[31:0] sh_data = b2 ? {writeword_st1e[15:0], {16{1'b0}}} : writeword_st1e[31:0];
wire[31:0] use_write_dat = sb ? sb_data :
sh ? sh_data :
sw_data;
wire[31:0] data_Qual = lb ? lb_data :
lh ? lh_data :
lhu ? lhu_data :
lbu ? lbu_data :
lw_data;
/* verilator lint_on UNUSED */
/////////////////////// STORE LOGIC ///////////////////
@@ -245,6 +210,7 @@ module VX_tag_data_access
wire[`DBANK_LINE_WORDS-1:0][3:0] we;
wire[`DBANK_LINE_WORDS-1:0][31:0] data_write;
genvar g;
generate
for (g = 0; g < `DBANK_LINE_WORDS; g = g + 1) begin : write_enables
@@ -257,9 +223,18 @@ module VX_tag_data_access
(normal_write && sh) ? sh_mask :
4'b0000;
if (!(FUNC_ID == `L2FUNC_ID)) assign data_write[g] = force_write ? writedata_st1e[g] : use_write_dat;
// Every function ID except `L2FUNC_ID: build the 32-bit value written to line
// word g. b1/b2/b3 are the byte_select == 1/2/3 decodes.
if (FUNC_ID != `L2FUNC_ID) begin
// sb_data: writeword_st1e[7:0] moved into byte lane 1, 2 or 3 (zeros elsewhere)
// when b1, b2 or b3 is set; otherwise the full word is passed through unchanged.
wire[31:0] sb_data = b1 ? {{16{1'b0}}, writeword_st1e[7:0], { 8{1'b0}}} :
b2 ? {{ 8{1'b0}}, writeword_st1e[7:0], {16{1'b0}}} :
b3 ? {{ 0{1'b0}}, writeword_st1e[7:0], {24{1'b0}}} :
writeword_st1e[31:0];
// sw_data: the word as supplied.
wire[31:0] sw_data = writeword_st1e[31:0];
// sh_data: writeword_st1e[15:0] moved into the upper half when b2 is set,
// otherwise the full word unchanged.
wire[31:0] sh_data = b2 ? {writeword_st1e[15:0], {16{1'b0}}} : writeword_st1e[31:0];
// Priority select: sb first, then sh, else sw_data.
// NOTE(review): sb/sh are presumably the store-byte/store-half decodes - their
// definitions are outside this view.
wire[31:0] use_write_dat = sb ? sb_data : sh ? sh_data : sw_data;
// force_write overrides the store-derived word with the matching word of writedata_st1e.
assign data_write[g] = force_write ? writedata_st1e[g] : use_write_dat;
end
end
if ((FUNC_ID == `L2FUNC_ID)) begin
if (FUNC_ID == `L2FUNC_ID) begin
assign data_write = force_write ? writedata_st1e : writeword_st1e;
end
endgenerate
@@ -268,13 +243,29 @@ module VX_tag_data_access
assign use_write_data = data_write;
///////////////////////
// Read-data output selection.
// `L2FUNC_ID: readword_st1e is driven directly from the last stage-1 read-data
// entry. Any other function ID: one 32-bit word is picked from that entry,
// shifted by the byte offset and extended according to the load type.
if (FUNC_ID == `L2FUNC_ID) begin
assign readword_st1e = read_data_st1c[STAGE_1_CYCLES-1];
end else begin
// Word of the line addressed by block_offset.
wire[31:0] data_unmod = read_data_st1c[STAGE_1_CYCLES-1][block_offset][31:0];
// Unshifted for byte offset 0 or when lw is set; otherwise shifted right by
// 8/16/24 bits for b1/b2/anything else, bringing the addressed byte to bit 0.
wire[31:0] data_unQual = (b0 || lw) ? (data_unmod) :
b1 ? (data_unmod >> 8) :
b2 ? (data_unmod >> 16) :
(data_unmod >> 24);
// lb_data / lh_data: sign-extend from bit 7 / bit 15.
wire[31:0] lb_data = (data_unQual[7] ) ? (data_unQual | 32'hFFFFFF00) : (data_unQual & 32'hFF);
wire[31:0] lh_data = (data_unQual[15]) ? (data_unQual | 32'hFFFF0000) : (data_unQual & 32'hFFFF);
// lbu_data / lhu_data: keep the low 8 / 16 bits, upper bits cleared.
wire[31:0] lbu_data = (data_unQual & 32'hFF);
wire[31:0] lhu_data = (data_unQual & 32'hFFFF);
wire[31:0] lw_data = (data_unQual);
// Priority select on the load-type decodes; lw_data is the default.
wire[31:0] data_Qual = lb ? lb_data :
lh ? lh_data :
lhu ? lhu_data :
lbu ? lbu_data :
lw_data;
assign readword_st1e = data_Qual;
end
wire[`TAG_SELECT_ADDR_RNG] writeaddr_tag = writeaddr_st1e[`TAG_SELECT_ADDR_RNG];
wire tags_mismatch = writeaddr_tag != use_read_tag_st1e;

View File

@@ -1,7 +1,6 @@
`include "VX_cache_config.vh"
module VX_tag_data_structure
#(
module VX_tag_data_structure #(
// Size of cache in bytes
parameter CACHE_SIZE_BYTES = 1024,
// Size of line inside a bank in bytes
@@ -17,8 +16,7 @@ module VX_tag_data_structure
// Function ID, {Dcache=0, Icache=1, Sharedmemory=2}
parameter FUNC_ID = 0,
// Queues feeding into banks Knobs {1, 2, 4, 8, ...}
// Queues feeding into banks Knobs {1, 2, 4, 8, ...}
// Core Request Queue Size
parameter REQQ_SIZE = 8,
// Miss Reserv Queue Knob
@@ -28,7 +26,7 @@ module VX_tag_data_structure
// Snoop Req Queue
parameter SNRQ_SIZE = 8,
// Queues for writebacks Knobs {1, 2, 4, 8, ...}
// Queues for writebacks Knobs {1, 2, 4, 8, ...}
// Core Writeback Queue Size
parameter CWBQ_SIZE = 8,
// Dram Writeback Queue Size
@@ -41,44 +39,37 @@ module VX_tag_data_structure
// Fill Invalidator Size {Fill invalidator must be active}
parameter FILL_INVALIDAOR_SIZE = 16,
// Dram knobs
// Dram knobs
parameter SIMULATED_DRAM_LATENCY_CYCLES = 10
) (
input wire clk,
input wire reset,
input wire stall_bank_pipe,
)
(
input wire clk,
input wire reset,
input wire stall_bank_pipe,
input wire[31:0] read_addr,
output wire read_valid,
output wire read_dirty,
output wire[`TAG_SELECT_SIZE_RNG] read_tag,
input wire[`LINE_SELECT_SIZE_RNG] read_addr,
output wire read_valid,
output wire read_dirty,
output wire[`TAG_SELECT_SIZE_RNG] read_tag,
output wire[`DBANK_LINE_WORDS-1:0][31:0] read_data,
input wire invalidate,
input wire[`DBANK_LINE_WORDS-1:0][3:0] write_enable,
input wire write_fill,
input wire[31:0] write_addr,
input wire invalidate,
input wire[`DBANK_LINE_WORDS-1:0][3:0] write_enable,
input wire write_fill,
input wire[`LINE_SELECT_SIZE_RNG] write_addr,
input wire[`TAG_SELECT_SIZE_RNG] tag_index,
input wire[`DBANK_LINE_WORDS-1:0][31:0] write_data,
input wire fill_sent
input wire fill_sent
);
reg[`DBANK_LINE_WORDS-1:0][3:0][7:0] data [`BANK_LINE_COUNT-1:0];
reg[`TAG_SELECT_SIZE_RNG] tag [`BANK_LINE_COUNT-1:0];
reg valid[`BANK_LINE_COUNT-1:0];
reg dirty[`BANK_LINE_COUNT-1:0];
reg [`DBANK_LINE_WORDS-1:0][3:0][7:0] data [`BANK_LINE_COUNT-1:0];
reg [`TAG_SELECT_SIZE_RNG] tag [`BANK_LINE_COUNT-1:0];
reg valid [`BANK_LINE_COUNT-1:0];
reg dirty [`BANK_LINE_COUNT-1:0];
wire[`TAG_SELECT_ADDR_RNG] curr_tag = write_addr[`TAG_SELECT_ADDR_RNG];
wire[`LINE_SELECT_ADDR_RNG] curr_inx = write_addr[`LINE_SELECT_ADDR_RNG];
assign read_valid = valid[read_addr[`LINE_SELECT_ADDR_RNG]];
assign read_dirty = dirty[read_addr[`LINE_SELECT_ADDR_RNG]];
assign read_tag = tag [read_addr[`LINE_SELECT_ADDR_RNG]];
assign read_data = data [read_addr[`LINE_SELECT_ADDR_RNG]];
assign read_valid = valid [read_addr];
assign read_dirty = dirty [read_addr];
assign read_tag = tag [read_addr];
assign read_data = data [read_addr];
wire going_to_write = (|write_enable);
@@ -94,27 +85,27 @@ module VX_tag_data_structure
end
end else if (!stall_bank_pipe) begin
if (going_to_write) begin
valid[write_addr[`LINE_SELECT_ADDR_RNG]] <= 1;
tag [write_addr[`LINE_SELECT_ADDR_RNG]] <= write_addr[`TAG_SELECT_ADDR_RNG];
valid[write_addr] <= 1;
tag [write_addr] <= tag_index;
if (write_fill) begin
dirty[write_addr[`LINE_SELECT_ADDR_RNG]] <= 0;
dirty[write_addr] <= 0;
end else begin
dirty[write_addr[`LINE_SELECT_ADDR_RNG]] <= 1;
dirty[write_addr] <= 1;
end
end else if (fill_sent) begin
dirty[write_addr[`LINE_SELECT_ADDR_RNG]] <= 0;
// valid[write_addr[`LINE_SELECT_ADDR_RNG]] <= 0;
dirty[write_addr] <= 0;
// valid[write_addr] <= 0;
end
if (invalidate) begin
valid[write_addr[`LINE_SELECT_ADDR_RNG]] <= 0;
valid[write_addr] <= 0;
end
for (f = 0; f < `DBANK_LINE_WORDS; f = f + 1) begin
if (write_enable[f][0]) data[write_addr[`LINE_SELECT_ADDR_RNG]][f][0] <= write_data[f][7 :0 ];
if (write_enable[f][1]) data[write_addr[`LINE_SELECT_ADDR_RNG]][f][1] <= write_data[f][15:8 ];
if (write_enable[f][2]) data[write_addr[`LINE_SELECT_ADDR_RNG]][f][2] <= write_data[f][23:16];
if (write_enable[f][3]) data[write_addr[`LINE_SELECT_ADDR_RNG]][f][3] <= write_data[f][31:24];
if (write_enable[f][0]) data[write_addr][f][0] <= write_data[f][7 :0 ];
if (write_enable[f][1]) data[write_addr][f][1] <= write_data[f][15:8 ];
if (write_enable[f][2]) data[write_addr][f][2] <= write_data[f][23:16];
if (write_enable[f][3]) data[write_addr][f][3] <= write_data[f][31:24];
end
end
end

View File

@@ -4,10 +4,11 @@
`include "../VX_define.vh"
// Branch response bundle: valid_branch and branch_dir are single-bit wires, branch_dest
// is 32 bits wide and branch_warp_num is `NW_BITS wide.
// NOTE(review): rendered diff hunk - the first four wires appear to be the pre-commit
// lines and the last four their re-spaced replacements; confirm against the repository.
interface VX_branch_response_inter ();
wire valid_branch;
wire branch_dir;
wire[31:0] branch_dest;
wire[`NW_BITS-1:0] branch_warp_num;
wire valid_branch;
wire branch_dir;
wire [31:0] branch_dest;
wire [`NW_BITS-1:0] branch_warp_num;
endinterface

View File

@@ -5,15 +5,15 @@
// CSR request bundle: a `NUM_THREADS-wide valid vector, a `NW_BITS-wide warp number,
// 5-bit rd, 2-bit wb, 5-bit alu_op, and the CSR fields (is_csr flag, 12-bit csr_address,
// csr_immed flag, 32-bit csr_mask).
// NOTE(review): rendered diff hunk - each signal is listed twice (old spacing `wire[`,
// then new spacing `wire [`); only one copy exists in either version of the file.
interface VX_csr_req_inter ();
wire[`NUM_THREADS-1:0] valid;
wire[`NW_BITS-1:0] warp_num;
wire[4:0] rd;
wire[1:0] wb;
wire[4:0] alu_op;
wire is_csr;
wire[11:0] csr_address;
wire csr_immed;
wire[31:0] csr_mask;
wire [`NUM_THREADS-1:0] valid;
wire [`NW_BITS-1:0] warp_num;
wire [4:0] rd;
wire [1:0] wb;
wire [4:0] alu_op;
wire is_csr;
wire [11:0] csr_address;
wire csr_immed;
wire [31:0] csr_mask;
endinterface

View File

@@ -5,12 +5,12 @@
// CSR write-back bundle: `NUM_THREADS-wide valid vector, `NW_BITS-wide warp number,
// 5-bit rd, 2-bit wb and one 32-bit csr_result word per `NUM_THREADS entry.
// NOTE(review): rendered diff hunk - every wire appears in both its old (`wire[`) and
// new (`wire [`) spelling.
interface VX_csr_wb_inter ();
wire[`NUM_THREADS-1:0] valid;
wire[`NW_BITS-1:0] warp_num;
wire[4:0] rd;
wire[1:0] wb;
wire [`NUM_THREADS-1:0] valid;
wire [`NW_BITS-1:0] warp_num;
wire [4:0] rd;
wire [1:0] wb;
wire[`NUM_THREADS-1:0][31:0] csr_result;
wire [`NUM_THREADS-1:0][31:0] csr_result;
endinterface

View File

@@ -5,11 +5,11 @@
// Data-cache request bundle: `NUM_THREADS x 32-bit address and data arrays, a
// `NUM_THREADS-wide valid vector, and 3-bit mem_read / mem_write selectors.
// NOTE(review): rendered diff hunk - the first five wires are the old spelling, the
// last five the re-spaced new spelling of the same signals.
interface VX_dcache_request_inter ();
wire[`NUM_THREADS-1:0][31:0] out_cache_driver_in_address;
wire[2:0] out_cache_driver_in_mem_read;
wire[2:0] out_cache_driver_in_mem_write;
wire[`NUM_THREADS-1:0] out_cache_driver_in_valid;
wire[`NUM_THREADS-1:0][31:0] out_cache_driver_in_data;
wire [`NUM_THREADS-1:0][31:0] out_cache_driver_in_address;
wire [2:0] out_cache_driver_in_mem_read;
wire [2:0] out_cache_driver_in_mem_write;
wire [`NUM_THREADS-1:0] out_cache_driver_in_valid;
wire [`NUM_THREADS-1:0][31:0] out_cache_driver_in_data;
endinterface

View File

@@ -5,8 +5,8 @@
// Data-cache response bundle: `NUM_THREADS x 32-bit read data plus a single-bit delay wire.
// NOTE(review): rendered diff hunk - both wires are shown in old and new spelling.
interface VX_dcache_response_inter ();
wire[`NUM_THREADS-1:0][31:0] in_cache_driver_out_data;
wire delay;
wire [`NUM_THREADS-1:0][31:0] in_cache_driver_out_data;
wire delay;
endinterface

View File

@@ -6,18 +6,19 @@
// DRAM request/response bundle, parameterised by NUM_BANKS (default 8) and
// NUM_WORDS_PER_BLOCK (default 4). Write and read data are
// NUM_BANKS x NUM_WORDS_PER_BLOCK x 32-bit arrays; both addresses are 32 bits.
// NOTE(review): rendered diff hunk - the parameter-list terminator and every wire are
// shown in both the pre-commit and post-commit form (the commit moves `) ();` onto its
// own line and re-spaces the wires).
interface VX_dram_req_rsp_inter #(
parameter NUM_BANKS = 8,
parameter NUM_WORDS_PER_BLOCK = 4) ();
parameter NUM_WORDS_PER_BLOCK = 4
) ();
// Req
wire [31:0] o_m_evict_addr;
wire [31:0] o_m_read_addr;
wire o_m_valid;
wire[NUM_BANKS - 1:0][NUM_WORDS_PER_BLOCK-1:0][31:0] o_m_writedata;
wire o_m_read_or_write;
wire [31:0] o_m_evict_addr;
wire [31:0] o_m_read_addr;
wire o_m_valid;
wire [NUM_BANKS - 1:0][NUM_WORDS_PER_BLOCK-1:0][31:0] o_m_writedata;
wire o_m_read_or_write;
// Rsp
wire[NUM_BANKS - 1:0][NUM_WORDS_PER_BLOCK-1:0][31:0] i_m_readdata;
wire i_m_ready;
wire [NUM_BANKS - 1:0][NUM_WORDS_PER_BLOCK-1:0][31:0] i_m_readdata;
wire i_m_ready;
endinterface

View File

@@ -6,43 +6,43 @@
// Execution-unit request bundle, grouped by the section comments below: meta
// (valid vector, warp number, current/next PC), write-back info (rd, wb), operand
// data and ALU op, branch type, JAL info, and CSR info.
// ebreak and wspawn are wrapped in Verilator lint_off/lint_on UNUSED pragmas, i.e. the
// UNUSED warning is suppressed for them.
// NOTE(review): rendered diff hunk - within each section the old lines (`wire[`) come
// first, followed by their re-spaced replacements (`wire [`).
interface VX_exec_unit_req_inter ();
// Meta
wire[`NUM_THREADS-1:0] valid;
wire[`NW_BITS-1:0] warp_num;
wire[31:0] curr_PC;
wire[31:0] PC_next;
wire [`NUM_THREADS-1:0] valid;
wire [`NW_BITS-1:0] warp_num;
wire [31:0] curr_PC;
wire [31:0] PC_next;
// Write Back Info
wire[4:0] rd;
wire[1:0] wb;
wire [4:0] rd;
wire [1:0] wb;
// Data and alu op
wire[`NUM_THREADS-1:0][31:0] a_reg_data;
wire[`NUM_THREADS-1:0][31:0] b_reg_data;
wire[4:0] alu_op;
wire[4:0] rs1;
wire[4:0] rs2;
wire rs2_src;
wire[31:0] itype_immed;
wire[19:0] upper_immed;
wire [`NUM_THREADS-1:0][31:0] a_reg_data;
wire [`NUM_THREADS-1:0][31:0] b_reg_data;
wire [4:0] alu_op;
wire [4:0] rs1;
wire [4:0] rs2;
wire rs2_src;
wire [31:0] itype_immed;
wire [19:0] upper_immed;
// Branch type
wire[2:0] branch_type;
wire [2:0] branch_type;
// Jal info
wire jalQual;
wire jal;
wire[31:0] jal_offset;
wire jalQual;
wire jal;
wire [31:0] jal_offset;
/* verilator lint_off UNUSED */
wire ebreak;
wire wspawn;
/* verilator lint_on UNUSED */
/* verilator lint_off UNUSED */
wire ebreak;
wire wspawn;
/* verilator lint_on UNUSED */
// CSR info
wire is_csr;
wire[11:0] csr_address;
wire csr_immed;
wire[31:0] csr_mask;
wire is_csr;
wire [11:0] csr_address;
wire csr_immed;
wire [31:0] csr_mask;
endinterface

View File

@@ -5,37 +5,37 @@
// Front-end to back-end request bundle. This is the interface type of both ports of
// VX_d_e_reg, which concatenates every field declared here into one register.
// Fixed-width fields add up to 233 bits (12+1+1+32 CSR, 4x5 rd/rs1/rs2/alu_op, 2 wb,
// 1 rs2_src, 32 itype_immed, 3x3 mem_read/mem_write/branch_type, 20 upper_immed,
// 32 curr_PC, 1 ebreak, 1 jalQual, 1 jal, 32 jal_offset, 32 PC_next, 4 GPGPU flags);
// valid adds `NUM_THREADS bits and warp_num adds `NW_BITS bits.
// ebreak is wrapped in Verilator lint_off/lint_on UNUSED pragmas.
// NOTE(review): rendered diff hunk - the first run of wires is the old spelling and the
// second run the re-spaced new spelling; the four GPGPU flags are likewise listed twice.
interface VX_frE_to_bckE_req_inter ();
wire[11:0] csr_address;
wire is_csr;
wire csr_immed;
wire[31:0] csr_mask;
wire[4:0] rd;
wire[4:0] rs1;
wire[4:0] rs2;
wire[4:0] alu_op;
wire[1:0] wb;
wire rs2_src;
wire[31:0] itype_immed;
wire[2:0] mem_read;
wire[2:0] mem_write;
wire[2:0] branch_type;
wire[19:0] upper_immed;
wire[31:0] curr_PC;
/* verilator lint_off UNUSED */
wire ebreak;
/* verilator lint_on UNUSED */
wire jalQual;
wire jal;
wire[31:0] jal_offset;
wire[31:0] PC_next;
wire[`NUM_THREADS-1:0] valid;
wire[`NW_BITS-1:0] warp_num;
wire [11:0] csr_address;
wire is_csr;
wire csr_immed;
wire [31:0] csr_mask;
wire [4:0] rd;
wire [4:0] rs1;
wire [4:0] rs2;
wire [4:0] alu_op;
wire [1:0] wb;
wire rs2_src;
wire [31:0] itype_immed;
wire [2:0] mem_read;
wire [2:0] mem_write;
wire [2:0] branch_type;
wire [19:0] upper_immed;
wire [31:0] curr_PC;
/* verilator lint_off UNUSED */
wire ebreak;
/* verilator lint_on UNUSED */
wire jalQual;
wire jal;
wire [31:0] jal_offset;
wire [31:0] PC_next;
wire [`NUM_THREADS-1:0] valid;
wire [`NW_BITS-1:0] warp_num;
// GPGPU stuff
wire is_wspawn;
wire is_tmc;
wire is_split;
wire is_barrier;
wire is_wspawn;
wire is_tmc;
wire is_split;
wire is_barrier;
endinterface

View File

@@ -5,12 +5,10 @@
`include "../VX_define.vh"
// GPR clone bundle: a single-bit is_clone flag and a `NW_BITS-wide warp number, both
// declared with the Verilator UNUSED warning suppressed.
// NOTE(review): rendered diff hunk - both wires appear twice; the visible text of the
// two copies is identical, so the change is presumably whitespace-only (confirm in repo).
interface VX_gpr_clone_inter ();
/* verilator lint_off UNUSED */
wire is_clone;
wire[`NW_BITS-1:0] warp_num;
wire is_clone;
wire[`NW_BITS-1:0] warp_num;
/* verilator lint_on UNUSED */
endinterface
`endif

View File

@@ -6,8 +6,8 @@
// GPR read-data bundle: two `NUM_THREADS x 32-bit operand arrays (a_reg_data, b_reg_data).
// NOTE(review): rendered diff hunk - each array is shown in old and new spelling.
interface VX_gpr_data_inter ();
wire[`NUM_THREADS-1:0][31:0] a_reg_data;
wire[`NUM_THREADS-1:0][31:0] b_reg_data;
wire [`NUM_THREADS-1:0][31:0] a_reg_data;
wire [`NUM_THREADS-1:0][31:0] b_reg_data;
endinterface

View File

@@ -5,9 +5,9 @@
// GPR read-request bundle: two 5-bit source register indices (rs1, rs2) and a
// `NW_BITS-wide warp number.
// NOTE(review): rendered diff hunk - each wire is shown in old and new spelling.
interface VX_gpr_read_inter ();
wire[4:0] rs1;
wire[4:0] rs2;
wire[`NW_BITS-1:0] warp_num;
wire [4:0] rs1;
wire [4:0] rs2;
wire [`NW_BITS-1:0] warp_num;
endinterface

View File

@@ -6,10 +6,9 @@
// GPR wspawn bundle: a single-bit is_wspawn flag and a `NW_BITS-wide which_wspawn
// selector, declared with the Verilator UNUSED warning suppressed. A warp_num wire is
// present only as commented-out code.
// NOTE(review): rendered diff hunk - which_wspawn is shown in old and new spelling.
interface VX_gpr_wspawn_inter ();
/* verilator lint_off UNUSED */
wire is_wspawn;
wire[`NW_BITS-1:0] which_wspawn;
wire [`NW_BITS-1:0] which_wspawn;
// wire[`NW_BITS-1:0] warp_num;
/* verilator lint_on UNUSED */
endinterface
`endif

View File

@@ -1,33 +1,20 @@
`ifndef VX_GPU_DRAM_DCACHE_REQ
`define VX_GPU_DRAM_DCACHE_REQ
`include "../generic_cache/VX_cache_config.vh"
// Cache-to-DRAM request bundle, parameterised by BANK_LINE_WORDS (default 2); request
// data is a BANK_LINE_WORDS x 32-bit array and the address is 32 bits.
// NOTE(review): rendered diff hunk (`@@ -1,33 +1,20 @@`) mixing removed and added lines:
// the interface header is shown in both its old five-line and new three-line form, and
// the signal list is a superset of both versions. The simulator hunks of the same
// commit switch from dram_req_delay to dram_req_full and from dram_fill_accept to
// dram_rsp_ready, and stop reading dram_req and dram_req_size, so those four older
// names are presumably the removed ones - confirm against the repository.
interface VX_gpu_dcache_dram_req_inter
#(
parameter BANK_LINE_WORDS = 2
)
();
interface VX_gpu_dcache_dram_req_inter #(
parameter BANK_LINE_WORDS = 2
) ();
// DRAM Request
wire dram_req;
wire dram_req_write;
wire dram_req_read;
wire [31:0] dram_req_addr;
wire [31:0] dram_req_size;
wire [BANK_LINE_WORDS-1:0][31:0] dram_req_data;
wire [BANK_LINE_WORDS-1:0][31:0] dram_req_data;
wire dram_req_full;
// Snoop
wire dram_because_of_snp;
wire dram_snp_full;
// DRAM Cache can't accept response
wire dram_fill_accept;
// DRAM Cache can't accept request
wire dram_req_delay;
wire dram_rsp_ready;
endinterface

View File

@@ -1,18 +0,0 @@
`ifndef VX_GPU_DRAM_DCACHE_RES
`define VX_GPU_DRAM_DCACHE_RES
`include "../generic_cache/VX_cache_config.vh"
// DRAM fill-response bundle, parameterised by BANK_LINE_WORDS (default 2): a single-bit
// dram_fill_rsp strobe, a 32-bit address and BANK_LINE_WORDS x 32-bit fill data.
// NOTE(review): the hunk header (`@@ -1,18 +0,0 @@`) marks this whole file as deleted
// by the commit.
interface VX_gpu_dcache_dram_res_inter
#(
parameter BANK_LINE_WORDS = 2
)
();
// DRAM Response
wire dram_fill_rsp;
wire [31:0] dram_fill_rsp_addr;
wire [BANK_LINE_WORDS-1:0][31:0] dram_fill_rsp_data;
endinterface
`endif

View File

@@ -0,0 +1,16 @@
`ifndef VX_GPU_DRAM_DCACHE_RSP
`define VX_GPU_DRAM_DCACHE_RSP
`include "../generic_cache/VX_cache_config.vh"
// DRAM response bundle, parameterised by BANK_LINE_WORDS (default 2).
// The hunk header (`@@ -0,0 +1,16 @@`) marks this file as newly added by the commit.
interface VX_gpu_dcache_dram_rsp_inter #(
parameter BANK_LINE_WORDS = 2
) ();
// DRAM Response
wire dram_rsp_valid; // single-bit response strobe
wire [31:0] dram_rsp_addr; // 32-bit address accompanying the response
wire [BANK_LINE_WORDS-1:0][31:0] dram_rsp_data; // BANK_LINE_WORDS 32-bit words of data
endinterface
`endif

View File

@@ -1,15 +1,11 @@
`ifndef VX_GPU_DCACHE_REQ
`define VX_GPU_DCACHE_REQ
`include "../generic_cache/VX_cache_config.vh"
interface VX_gpu_dcache_req_inter
#(
parameter NUM_REQUESTS = 32
)
();
interface VX_gpu_dcache_req_inter #(
parameter NUM_REQUESTS = 32
) ();
// Core Request
wire [NUM_REQUESTS-1:0] core_req_valid;

View File

@@ -1,17 +1,18 @@
`ifndef VX_GPU_DCACHE_RES
`define VX_GPU_DCACHE_RES
`ifndef VX_GPU_DCACHE_RSP
`define VX_GPU_DCACHE_RSP
`include "../generic_cache/VX_cache_config.vh"
interface VX_gpu_dcache_res_inter
#(
parameter NUM_REQUESTS = 32
) ();
interface VX_gpu_dcache_rsp_inter #(
parameter NUM_REQUESTS = 32
) ();
// Cache WB
wire [NUM_REQUESTS-1:0] core_wb_valid;
/* verilator lint_off UNUSED */
wire [4:0] core_wb_req_rd;
wire [1:0] core_wb_req_wb;
/* verilator lint_off UNUSED */
wire [`NW_BITS-1:0] core_wb_warp_num;
wire [NUM_REQUESTS-1:0][31:0] core_wb_readdata;
wire [NUM_REQUESTS-1:0][31:0] core_wb_pc;

View File

@@ -5,7 +5,7 @@
// Snoop request bundle: a single-bit request strobe and a 32-bit snoop address.
// NOTE(review): rendered diff hunk - snp_req and snp_req_valid are presumably the old
// and new name of the same wire (the simulator hunks of this commit make the same
// snp_req -> snp_req_valid substitution); confirm against the repository.
interface VX_gpu_dcache_snp_req_inter ();
// Snoop Req
wire snp_req;
wire snp_req_valid;
wire [31:0] snp_req_addr;
endinterface

View File

@@ -5,8 +5,8 @@
interface VX_gpu_inst_req_inter();
wire[`NUM_THREADS-1:0] valid;
wire[`NW_BITS-1:0] warp_num;
wire [`NUM_THREADS-1:0] valid;
wire [`NW_BITS-1:0] warp_num;
wire is_wspawn;
wire is_tmc;
wire is_split;
@@ -15,8 +15,8 @@ interface VX_gpu_inst_req_inter();
wire[31:0] pc_next;
wire[`NUM_THREADS-1:0][31:0] a_reg_data;
wire[31:0] rd2;
wire [`NUM_THREADS-1:0][31:0] a_reg_data;
wire [31:0] rd2;
endinterface

View File

@@ -6,11 +6,12 @@
// Snoop request/response bundle: request strobe, 32-bit snoop address and
// back-pressure wires.
// NOTE(review): rendered diff hunk (`@@ -6,11 +6,12 @@`) mixing removed and added lines.
// snp_req / snp_req_valid look like the old and new name of the request strobe, and
// snp_req_addr is shown in old and new spacing. Which of snp_req_full, snp_delay and
// the trailing TODO belong to which version cannot be determined from this listing.
interface VX_gpu_snp_req_rsp ();
// Snoop request
wire snp_req;
wire[31:0] snp_req_addr;
wire snp_req_valid;
wire [31:0] snp_req_addr;
wire snp_req_full;
// Snoop Response
wire snp_delay;
// TODO:
endinterface

View File

@@ -6,11 +6,11 @@
// Instruction-cache request bundle: 32-bit pc_address and data words, a single-bit
// valid wire, and 3-bit mem_read / mem_write selectors.
// NOTE(review): rendered diff hunk - the first five wires are the old spelling, the
// last five the re-spaced new spelling of the same signals.
interface VX_icache_request_inter ();
wire[31:0] pc_address;
wire[2:0] out_cache_driver_in_mem_read;
wire[2:0] out_cache_driver_in_mem_write;
wire out_cache_driver_in_valid;
wire[31:0] out_cache_driver_in_data;
wire [31:0] pc_address;
wire [2:0] out_cache_driver_in_mem_read;
wire [2:0] out_cache_driver_in_mem_write;
wire out_cache_driver_in_valid;
wire [31:0] out_cache_driver_in_data;
endinterface

View File

@@ -7,8 +7,8 @@ interface VX_icache_response_inter ();
// wire ready;
// wire stall;
wire[31:0] instruction;
wire delay;
wire [31:0] instruction;
wire delay;
endinterface

View File

@@ -6,12 +6,12 @@
// Execute-stage write-back bundle: `NUM_THREADS x 32-bit alu_result, a 32-bit PC,
// 5-bit rd, 2-bit wb, a `NUM_THREADS-wide wb_valid vector and a `NW_BITS-wide warp number.
// NOTE(review): rendered diff hunk - six old lines followed by their six re-spaced
// replacements.
interface VX_inst_exec_wb_inter ();
wire[`NUM_THREADS-1:0][31:0] alu_result;
wire[31:0] exec_wb_pc;
wire[4:0] rd;
wire[1:0] wb;
wire[`NUM_THREADS-1:0] wb_valid;
wire[`NW_BITS-1:0] wb_warp_num;
wire [`NUM_THREADS-1:0][31:0] alu_result;
wire [31:0] exec_wb_pc;
wire [4:0] rd;
wire [1:0] wb;
wire [`NUM_THREADS-1:0] wb_valid;
wire [`NW_BITS-1:0] wb_warp_num;
endinterface

View File

@@ -6,12 +6,12 @@
// Memory-stage write-back bundle: `NUM_THREADS x 32-bit loaded_data, a 32-bit PC,
// 5-bit rd, 2-bit wb, a `NUM_THREADS-wide wb_valid vector and a `NW_BITS-wide warp number.
// NOTE(review): rendered diff hunk - six old lines followed by their six re-spaced
// replacements.
interface VX_inst_mem_wb_inter ();
wire[`NUM_THREADS-1:0][31:0] loaded_data;
wire[31:0] mem_wb_pc;
wire[4:0] rd;
wire[1:0] wb;
wire[`NUM_THREADS-1:0] wb_valid;
wire[`NW_BITS-1:0] wb_warp_num;
wire [`NUM_THREADS-1:0][31:0] loaded_data;
wire [31:0] mem_wb_pc;
wire [4:0] rd;
wire [1:0] wb;
wire [`NUM_THREADS-1:0] wb_valid;
wire [`NW_BITS-1:0] wb_warp_num;
endinterface

View File

@@ -5,10 +5,10 @@
// Instruction metadata bundle: 32-bit instruction word, 32-bit inst_pc, a `NW_BITS-wide
// warp number and a `NUM_THREADS-wide valid vector.
// NOTE(review): rendered diff hunk - four old lines followed by their four re-spaced
// replacements.
interface VX_inst_meta_inter ();
wire[31:0] instruction;
wire[31:0] inst_pc;
wire[`NW_BITS-1:0] warp_num;
wire[`NUM_THREADS-1:0] valid;
wire [31:0] instruction;
wire [31:0] inst_pc;
wire [`NW_BITS-1:0] warp_num;
wire [`NUM_THREADS-1:0] valid;
endinterface

View File

@@ -7,8 +7,8 @@
// JAL response bundle: a single-bit jal flag, a 32-bit jal_dest and a `NW_BITS-wide
// warp number.
// NOTE(review): rendered diff hunk - jal_dest and jal_warp_num are shown in old and new
// spelling; jal appears once because it is an unchanged context line.
interface VX_jal_response_inter ();
wire jal;
wire[31:0] jal_dest;
wire[`NW_BITS-1:0] jal_warp_num;
wire [31:0] jal_dest;
wire [`NW_BITS-1:0] jal_warp_num;
endinterface

View File

@@ -6,8 +6,8 @@
// Join bundle: a single-bit is_join flag and a `NW_BITS-wide join_warp_num.
// NOTE(review): rendered diff hunk - both wires are shown in old and new spelling.
interface VX_join_inter ();
wire is_join;
wire[`NW_BITS-1:0] join_warp_num;
wire is_join;
wire [`NW_BITS-1:0] join_warp_num;
endinterface

View File

@@ -6,16 +6,16 @@
// Load/store-unit request bundle: `NUM_THREADS-wide valid vector, 32-bit lsu_pc,
// `NW_BITS-wide warp number, `NUM_THREADS x 32-bit store_data and base_address arrays,
// a 32-bit offset, 3-bit mem_read / mem_write selectors, 5-bit rd and 2-bit wb.
// The inline comments identify base_address as the A register data and offset as
// itype_immed.
// NOTE(review): rendered diff hunk - ten old lines followed by their ten re-spaced
// replacements.
interface VX_lsu_req_inter ();
wire[`NUM_THREADS-1:0] valid;
wire[31:0] lsu_pc;
wire[`NW_BITS-1:0] warp_num;
wire[`NUM_THREADS-1:0][31:0] store_data;
wire[`NUM_THREADS-1:0][31:0] base_address; // A reg data
wire[31:0] offset; // itype_immed
wire[2:0] mem_read;
wire[2:0] mem_write;
wire[4:0] rd;
wire[1:0] wb;
wire [`NUM_THREADS-1:0] valid;
wire [31:0] lsu_pc;
wire [`NW_BITS-1:0] warp_num;
wire [`NUM_THREADS-1:0][31:0] store_data;
wire [`NUM_THREADS-1:0][31:0] base_address; // A reg data
wire [31:0] offset; // itype_immed
wire [2:0] mem_read;
wire [2:0] mem_write;
wire [4:0] rd;
wire [1:0] wb;
endinterface

View File

@@ -5,20 +5,20 @@
// Memory-stage request bundle: `NUM_THREADS x 32-bit alu_result and rd2 arrays, 3-bit
// mem_read / mem_write / branch_type selectors, 5-bit rd/rs1/rs2, 2-bit wb, 32-bit
// PC_next / curr_PC / branch_offset, a `NUM_THREADS-wide valid vector and a
// `NW_BITS-wide warp number.
// NOTE(review): rendered diff hunk - fourteen old lines followed by their fourteen
// re-spaced replacements.
interface VX_mem_req_inter ();
wire[`NUM_THREADS-1:0][31:0] alu_result;
wire[2:0] mem_read;
wire[2:0] mem_write;
wire[4:0] rd;
wire[1:0] wb;
wire[4:0] rs1;
wire[4:0] rs2;
wire[`NUM_THREADS-1:0][31:0] rd2;
wire[31:0] PC_next;
wire[31:0] curr_PC;
wire[31:0] branch_offset;
wire[2:0] branch_type;
wire[`NUM_THREADS-1:0] valid;
wire[`NW_BITS-1:0] warp_num;
wire [`NUM_THREADS-1:0][31:0] alu_result;
wire [2:0] mem_read;
wire [2:0] mem_write;
wire [4:0] rd;
wire [1:0] wb;
wire [4:0] rs1;
wire [4:0] rs2;
wire [`NUM_THREADS-1:0][31:0] rd2;
wire [31:0] PC_next;
wire [31:0] curr_PC;
wire [31:0] branch_offset;
wire [2:0] branch_type;
wire [`NUM_THREADS-1:0] valid;
wire [`NW_BITS-1:0] warp_num;
endinterface

View File

@@ -6,13 +6,13 @@
// Memory-to-write-back bundle: `NUM_THREADS x 32-bit alu_result and mem_result arrays,
// 5-bit rd, 2-bit wb, 32-bit PC_next, a `NUM_THREADS-wide valid vector and a
// `NW_BITS-wide warp number.
// NOTE(review): rendered diff hunk - seven old lines followed by their seven re-spaced
// replacements (warp_num already used the `wire [` spelling before the commit).
interface VX_mw_wb_inter ();
wire[`NUM_THREADS-1:0][31:0] alu_result;
wire[`NUM_THREADS-1:0][31:0] mem_result;
wire[4:0] rd;
wire[1:0] wb;
wire[31:0] PC_next;
wire[`NUM_THREADS-1:0] valid;
wire [`NW_BITS-1:0] warp_num;
wire [`NUM_THREADS-1:0][31:0] alu_result;
wire [`NUM_THREADS-1:0][31:0] mem_result;
wire [4:0] rd;
wire [1:0] wb;
wire [31:0] PC_next;
wire [`NUM_THREADS-1:0] valid;
wire [`NW_BITS-1:0] warp_num;
endinterface

View File

@@ -6,27 +6,29 @@
// Warp-control bundle, in four groups: thread-mask change (warp_num, change_mask,
// `NUM_THREADS-wide thread_mask), warp spawn (wspawn, 32-bit wspawn_pc,
// `NUM_WARPS-wide wspawn_new_active), ebreak, barrier (is_barrier, 32-bit barrier_id,
// num_warps of $clog2(`NUM_WARPS)+1 bits) and split (is_split, dont_split,
// split_warp_num, two `NUM_THREADS-wide masks, 32-bit split_save_pc).
// NOTE(review): rendered diff hunk (`@@ -6,27 +6,29 @@`) - within each group the old
// lines come first and the re-spaced replacements second. The lint_off/lint_on UNUSED
// pragmas around split_warp_num exist only in the second (new) copy, which accounts
// for the two extra lines on the new side.
interface VX_warp_ctl_inter ();
wire[`NW_BITS-1:0] warp_num;
wire change_mask;
wire[`NUM_THREADS-1:0] thread_mask;
wire [`NW_BITS-1:0] warp_num;
wire change_mask;
wire [`NUM_THREADS-1:0] thread_mask;
wire wspawn;
wire[31:0] wspawn_pc;
wire[`NUM_WARPS-1:0] wspawn_new_active;
wire wspawn;
wire [31:0] wspawn_pc;
wire [`NUM_WARPS-1:0] wspawn_new_active;
wire ebreak;
wire ebreak;
// barrier
wire is_barrier;
wire[31:0] barrier_id;
wire[$clog2(`NUM_WARPS):0] num_warps;
wire is_barrier;
wire [31:0] barrier_id;
wire [$clog2(`NUM_WARPS):0] num_warps;
wire is_split;
wire dont_split;
wire[`NW_BITS-1:0] split_warp_num;
wire[`NUM_THREADS-1:0] split_new_mask;
wire[`NUM_THREADS-1:0] split_later_mask;
wire[31:0] split_save_pc;
wire is_split;
wire dont_split;
/* verilator lint_off UNUSED */
wire [`NW_BITS-1:0] split_warp_num;
/* verilator lint_on UNUSED */
wire [`NUM_THREADS-1:0] split_new_mask;
wire [`NUM_THREADS-1:0] split_later_mask;
wire [31:0] split_save_pc;
endinterface

View File

@@ -5,12 +5,12 @@
// Write-back bundle: `NUM_THREADS x 32-bit write_data, 32-bit wb_pc, 5-bit rd, 2-bit wb,
// a `NUM_THREADS-wide wb_valid vector and a `NW_BITS-wide warp number.
// NOTE(review): rendered diff hunk - six old lines followed by their six re-spaced
// replacements.
interface VX_wb_inter ();
wire[`NUM_THREADS-1:0][31:0] write_data;
wire[31:0] wb_pc;
wire[4:0] rd;
wire[1:0] wb;
wire[`NUM_THREADS-1:0] wb_valid;
wire[`NW_BITS-1:0] wb_warp_num;
wire [`NUM_THREADS-1:0][31:0] write_data;
wire [31:0] wb_pc;
wire [4:0] rd;
wire [1:0] wb;
wire [`NUM_THREADS-1:0] wb_valid;
wire [`NW_BITS-1:0] wb_warp_num;
endinterface

View File

@@ -5,8 +5,8 @@
// Warp-stall bundle: a single-bit wstall flag and a `NW_BITS-wide warp number.
// NOTE(review): rendered diff hunk - both wires are shown in old and new spelling.
interface VX_wstall_inter();
wire wstall;
wire[`NW_BITS-1:0] warp_num;
wire wstall;
wire [`NW_BITS-1:0] warp_num;
endinterface

View File

@@ -1,32 +1,28 @@
`include "../VX_define.vh"
// Pipeline register between the front-end and back-end request interfaces
// (name suggests decode -> execute; confirm). It concatenates every field of the
// input VX_frE_to_bckE_req_inter into one VX_generic_register and unpacks the
// registered value, in the same field order, onto the output interface.
//   stall : asserted while in_freeze is high
//   flush : asserted while in_branch_stall equals `STALL
//   N     : 233 + `NW_BITS-1 + 1 + `NUM_THREADS, i.e. 233 fixed bits plus warp_num
//           (`NW_BITS) plus valid (`NUM_THREADS); the .in and .out concatenations must
//           keep matching this width and each other's field order.
// NOTE(review): rendered diff hunk (`@@ -1,32 +1,28 @@`) mixing removed and added
// lines. The commit renames the interface ports from VX_frE_to_bckE_req / VX_bckE_req
// to vx_frE_to_bckE_req / vx_bckE_req and re-formats the instantiation, so the port
// list, the stall/flush wires and the VX_generic_register instance each appear twice
// (lower-case `vx_` copies are the new ones). Only one copy exists in the real file.
module VX_d_e_reg (
input wire clk,
input wire reset,
input wire in_branch_stall,
input wire in_freeze,
VX_frE_to_bckE_req_inter VX_frE_to_bckE_req,
input wire clk,
input wire reset,
input wire in_branch_stall,
input wire in_freeze,
VX_frE_to_bckE_req_inter vx_frE_to_bckE_req,
VX_frE_to_bckE_req_inter vx_bckE_req
);
wire stall = in_freeze;
wire flush = (in_branch_stall == `STALL);
VX_frE_to_bckE_req_inter VX_bckE_req
VX_generic_register #(
.N(233 + `NW_BITS-1 + 1 + `NUM_THREADS)
) d_e_reg (
.clk (clk),
.reset (reset),
.stall (stall),
.flush (flush),
.in ({vx_frE_to_bckE_req.csr_address, vx_frE_to_bckE_req.jalQual, vx_frE_to_bckE_req.ebreak, vx_frE_to_bckE_req.is_csr, vx_frE_to_bckE_req.csr_immed, vx_frE_to_bckE_req.csr_mask, vx_frE_to_bckE_req.rd, vx_frE_to_bckE_req.rs1, vx_frE_to_bckE_req.rs2, vx_frE_to_bckE_req.alu_op, vx_frE_to_bckE_req.wb, vx_frE_to_bckE_req.rs2_src, vx_frE_to_bckE_req.itype_immed, vx_frE_to_bckE_req.mem_read, vx_frE_to_bckE_req.mem_write, vx_frE_to_bckE_req.branch_type, vx_frE_to_bckE_req.upper_immed, vx_frE_to_bckE_req.curr_PC, vx_frE_to_bckE_req.jal, vx_frE_to_bckE_req.jal_offset, vx_frE_to_bckE_req.PC_next, vx_frE_to_bckE_req.valid, vx_frE_to_bckE_req.warp_num, vx_frE_to_bckE_req.is_wspawn, vx_frE_to_bckE_req.is_tmc, vx_frE_to_bckE_req.is_split, vx_frE_to_bckE_req.is_barrier}),
.out ({vx_bckE_req.csr_address , vx_bckE_req.jalQual , vx_bckE_req.ebreak ,vx_bckE_req.is_csr , vx_bckE_req.csr_immed , vx_bckE_req.csr_mask , vx_bckE_req.rd , vx_bckE_req.rs1 , vx_bckE_req.rs2 , vx_bckE_req.alu_op , vx_bckE_req.wb , vx_bckE_req.rs2_src , vx_bckE_req.itype_immed , vx_bckE_req.mem_read , vx_bckE_req.mem_write , vx_bckE_req.branch_type , vx_bckE_req.upper_immed , vx_bckE_req.curr_PC , vx_bckE_req.jal , vx_bckE_req.jal_offset , vx_bckE_req.PC_next , vx_bckE_req.valid , vx_bckE_req.warp_num , vx_bckE_req.is_wspawn , vx_bckE_req.is_tmc , vx_bckE_req.is_split , vx_bckE_req.is_barrier })
);
wire stall = in_freeze;
wire flush = (in_branch_stall == `STALL);
VX_generic_register #(.N(233 + `NW_BITS-1 + 1 + `NUM_THREADS)) d_e_reg
(
.clk (clk),
.reset(reset),
.stall(stall),
.flush(flush),
.in ({VX_frE_to_bckE_req.csr_address, VX_frE_to_bckE_req.jalQual, VX_frE_to_bckE_req.ebreak, VX_frE_to_bckE_req.is_csr, VX_frE_to_bckE_req.csr_immed, VX_frE_to_bckE_req.csr_mask, VX_frE_to_bckE_req.rd, VX_frE_to_bckE_req.rs1, VX_frE_to_bckE_req.rs2, VX_frE_to_bckE_req.alu_op, VX_frE_to_bckE_req.wb, VX_frE_to_bckE_req.rs2_src, VX_frE_to_bckE_req.itype_immed, VX_frE_to_bckE_req.mem_read, VX_frE_to_bckE_req.mem_write, VX_frE_to_bckE_req.branch_type, VX_frE_to_bckE_req.upper_immed, VX_frE_to_bckE_req.curr_PC, VX_frE_to_bckE_req.jal, VX_frE_to_bckE_req.jal_offset, VX_frE_to_bckE_req.PC_next, VX_frE_to_bckE_req.valid, VX_frE_to_bckE_req.warp_num, VX_frE_to_bckE_req.is_wspawn, VX_frE_to_bckE_req.is_tmc, VX_frE_to_bckE_req.is_split, VX_frE_to_bckE_req.is_barrier}),
.out ({VX_bckE_req.csr_address , VX_bckE_req.jalQual , VX_bckE_req.ebreak ,VX_bckE_req.is_csr , VX_bckE_req.csr_immed , VX_bckE_req.csr_mask , VX_bckE_req.rd , VX_bckE_req.rs1 , VX_bckE_req.rs2 , VX_bckE_req.alu_op , VX_bckE_req.wb , VX_bckE_req.rs2_src , VX_bckE_req.itype_immed , VX_bckE_req.mem_read , VX_bckE_req.mem_write , VX_bckE_req.branch_type , VX_bckE_req.upper_immed , VX_bckE_req.curr_PC , VX_bckE_req.jal , VX_bckE_req.jal_offset , VX_bckE_req.PC_next , VX_bckE_req.valid , VX_bckE_req.warp_num , VX_bckE_req.is_wspawn , VX_bckE_req.is_tmc , VX_bckE_req.is_split , VX_bckE_req.is_barrier })
);
endmodule

View File

@@ -102,7 +102,7 @@ module VX_priority_encoder_sm
// wire[`NUM_THREADS-1:0] new_left_requests = left_requests & ~(serviced_qual);
always @(posedge clk, posedge reset) begin
always @(posedge clk) begin
if (reset) begin
left_requests <= 0;
// serviced = 0;

View File

@@ -8,8 +8,8 @@ module VX_shared_memory_block
parameter BITS_PER_BANK = 3
)
(
input wire clk, // Clock
input wire reset,
input wire clk, // Clock
input wire reset,
//input wire[6:0] addr,
//input wire[3:0][31:0] wdata,
//input wire[1:0] we,
@@ -22,28 +22,16 @@ module VX_shared_memory_block
input wire shm_write,
output wire[SMB_WORDS_PER_READ-1:0][31:0] data_out
);
`ifndef SYN
reg[SMB_WORDS_PER_READ-1:0][3:0][7:0] shared_memory[SMB_HEIGHT-1:0];
wire [$clog2(SMB_HEIGHT) - 1:0]reg_addr;
reg [SMB_WORDS_PER_READ-1:0][3:0][7:0] shared_memory[SMB_HEIGHT-1:0];
wire [$clog2(SMB_HEIGHT) - 1:0] reg_addr;
//wire need_to_write = (|we);
integer curr_ind;
// initial begin
// for (curr_ind = 0; curr_ind < SMB_HEIGHT; curr_ind = curr_ind + 1)
// begin
// shared_memory[curr_ind] = 0;
// end
// end
always @(posedge clk, posedge reset) begin
always @(posedge clk) begin
if (reset) begin
//for (curr_ind = 0; curr_ind < 128; curr_ind = curr_ind + 1)
end else if(shm_write) begin
//--
end else if (shm_write) begin
if (we == 2'b00) shared_memory[reg_addr][0] <= wdata[0];
if (we == 2'b01) shared_memory[reg_addr][1] <= wdata[1];
if (we == 2'b10) shared_memory[reg_addr][2] <= wdata[2];
@@ -52,10 +40,6 @@ module VX_shared_memory_block
end
assign reg_addr = addr;
// always @(posedge clk)
// reg_addr <= addr;
assign data_out = shm_write ? 0 : shared_memory[reg_addr];
`else
@@ -69,6 +53,7 @@ module VX_shared_memory_block
//assign write_bit_mask[1] = (we == 2'b01) ? {32{1'b1}} : {32{1'b0}};
//assign write_bit_mask[2] = (we == 2'b10) ? {32{1'b1}} : {32{1'b0}};
//assign write_bit_mask[3] = (we == 2'b11) ? {32{1'b1}} : {32{1'b0}};
genvar curr_word;
for (curr_word = 0; curr_word < SMB_WORDS_PER_READ; curr_word = curr_word + 1)
begin
@@ -115,7 +100,6 @@ module VX_shared_memory_block
);
/* verilator lint_on PINCONNECTEMPTY */
`endif
endmodule

View File

@@ -51,18 +51,17 @@ void Simulator::ibus_driver() {
}
}
if (vortex_->I_dram_req && !I_dram_stalled_) {
if (!I_dram_stalled_) {
// std::cout << "Icache Dram Request received!\n";
if (vortex_->I_dram_req_read) {
// std::cout << "Icache Dram Request is read!\n";
// Need to add an element
dram_req_t dram_req;
dram_req.cycles_left = DRAM_LATENCY;
dram_req.data_length = vortex_->I_dram_req_size / 4;
dram_req.base_addr = vortex_->I_dram_req_addr;
dram_req.data = (unsigned *)malloc(dram_req.data_length * sizeof(unsigned));
dram_req.data = (unsigned *)malloc(GLOBAL_BLOCK_SIZE_BYTES);
for (int i = 0; i < dram_req.data_length; i++) {
for (int i = 0; i < (GLOBAL_BLOCK_SIZE_BYTES / 4); i++) {
unsigned curr_addr = dram_req.base_addr + (i * 4);
unsigned data_rd;
ram_->getWord(curr_addr, &data_rd);
@@ -74,9 +73,8 @@ void Simulator::ibus_driver() {
if (vortex_->I_dram_req_write) {
unsigned base_addr = vortex_->I_dram_req_addr;
unsigned data_length = vortex_->I_dram_req_size / 4;
for (int i = 0; i < data_length; i++) {
for (int i = 0; i < (GLOBAL_BLOCK_SIZE_BYTES / 4); i++) {
unsigned curr_addr = base_addr + (i * 4);
unsigned data_wr = vortex_->I_dram_req_data[i];
ram_->writeWord(curr_addr, &data_wr);
@@ -84,22 +82,22 @@ void Simulator::ibus_driver() {
}
}
if (vortex_->I_dram_fill_accept && dequeue_valid) {
if (vortex_->I_dram_rsp_ready && dequeue_valid) {
// std::cout << "Icache Dram Response Sending...!\n";
vortex_->I_dram_fill_rsp = 1;
vortex_->I_dram_fill_rsp_addr = I_dram_req_vec_[dequeue_index].base_addr;
vortex_->I_dram_rsp_valid = 1;
vortex_->I_dram_rsp_addr = I_dram_req_vec_[dequeue_index].base_addr;
// std::cout << "Fill Rsp -> Addr: " << std::hex << (I_dram_req_vec_[dequeue_index].base_addr) << std::dec << "\n";
for (int i = 0; i < I_dram_req_vec_[dequeue_index].data_length; i++) {
vortex_->I_dram_fill_rsp_data[i] = I_dram_req_vec_[dequeue_index].data[i];
for (int i = 0; i < (GLOBAL_BLOCK_SIZE_BYTES / 4); i++) {
vortex_->I_dram_rsp_data[i] = I_dram_req_vec_[dequeue_index].data[i];
}
free(I_dram_req_vec_[dequeue_index].data);
I_dram_req_vec_.erase(I_dram_req_vec_.begin() + dequeue_index);
} else {
vortex_->I_dram_fill_rsp = 0;
vortex_->I_dram_fill_rsp_addr = 0;
vortex_->I_dram_rsp_valid = 0;
vortex_->I_dram_rsp_addr = 0;
}
#ifdef ENABLE_DRAM_STALLS
@@ -112,7 +110,7 @@ void Simulator::ibus_driver() {
}
#endif
vortex_->dram_req_delay = I_dram_stalled_;
vortex_->dram_req_full = I_dram_stalled_;
}
#endif
@@ -144,63 +142,15 @@ void Simulator::dbus_driver() {
#ifdef USE_MULTICORE
if (vortex_->out_dram_req && !dram_stalled_) {
if (vortex_->out_dram_req_read) {
// Need to add an element
dram_req_t dram_req;
dram_req.cycles_left = DRAM_LATENCY;
dram_req.data_length = vortex_->out_dram_req_size / 4;
dram_req.base_addr = vortex_->out_dram_req_addr;
dram_req.data = (unsigned *)malloc(dram_req.data_length * sizeof(unsigned));
for (int i = 0; i < dram_req.data_length; i++) {
unsigned curr_addr = dram_req.base_addr + (i * 4);
unsigned data_rd;
ram_->getWord(curr_addr, &data_rd);
dram_req.data[i] = data_rd;
}
dram_req_vec_.push_back(dram_req);
}
if (vortex_->out_dram_req_write) {
unsigned base_addr = vortex_->out_dram_req_addr;
unsigned data_length = vortex_->out_dram_req_size / 4;
for (int i = 0; i < data_length; i++) {
unsigned curr_addr = base_addr + (i * 4);
unsigned data_wr = vortex_->out_dram_req_data[i];
ram_->writeWord(curr_addr, &data_wr);
}
}
}
if (vortex_->out_dram_fill_accept && dequeue_valid) {
vortex_->out_dram_fill_rsp = 1;
vortex_->out_dram_fill_rsp_addr = dram_req_vec_[dequeue_index].base_addr;
for (int i = 0; i < dram_req_vec_[dequeue_index].data_length; i++) {
vortex_->out_dram_fill_rsp_data[i] = dram_req_vec_[dequeue_index].data[i];
}
free(dram_req_vec_[dequeue_index].data);
dram_req_vec_.erase(dram_req_vec_.begin() + dequeue_index);
} else {
vortex_->out_dram_fill_rsp = 0;
vortex_->out_dram_fill_rsp_addr = 0;
}
#else
if (vortex_->dram_req && !dram_stalled_) {
if (!dram_stalled_) {
if (vortex_->dram_req_read) {
// Need to add an element
dram_req_t dram_req;
dram_req.cycles_left = DRAM_LATENCY;
dram_req.data_length = vortex_->dram_req_size / 4;
dram_req.base_addr = vortex_->dram_req_addr;
dram_req.data = (unsigned *)malloc(dram_req.data_length * sizeof(unsigned));
dram_req.data = (unsigned *)malloc(GLOBAL_BLOCK_SIZE_BYTES);
for (int i = 0; i < dram_req.data_length; i++) {
for (int i = 0; i < (GLOBAL_BLOCK_SIZE_BYTES / 4); i++) {
unsigned curr_addr = dram_req.base_addr + (i * 4);
unsigned data_rd;
ram_->getWord(curr_addr, &data_rd);
@@ -211,9 +161,8 @@ void Simulator::dbus_driver() {
if (vortex_->dram_req_write) {
unsigned base_addr = vortex_->dram_req_addr;
unsigned data_length = vortex_->dram_req_size / 4;
for (int i = 0; i < data_length; i++) {
for (int i = 0; i < (GLOBAL_BLOCK_SIZE_BYTES / 4); i++) {
unsigned curr_addr = base_addr + (i * 4);
unsigned data_wr = vortex_->dram_req_data[i];
ram_->writeWord(curr_addr, &data_wr);
@@ -221,34 +170,79 @@ void Simulator::dbus_driver() {
}
}
if (vortex_->dram_fill_accept && dequeue_valid) {
vortex_->dram_fill_rsp = 1;
vortex_->dram_fill_rsp_addr = dram_req_vec_[dequeue_index].base_addr;
if (vortex_->dram_rsp_ready && dequeue_valid) {
vortex_->dram_rsp_valid = 1;
vortex_->dram_rsp_addr = dram_req_vec_[dequeue_index].base_addr;
for (int i = 0; i < dram_req_vec_[dequeue_index].data_length; i++) {
vortex_->dram_fill_rsp_data[i] = dram_req_vec_[dequeue_index].data[i];
for (int i = 0; i < (GLOBAL_BLOCK_SIZE_BYTES / 4); i++) {
vortex_->dram_rsp_data[i] = dram_req_vec_[dequeue_index].data[i];
}
free(dram_req_vec_[dequeue_index].data);
dram_req_vec_.erase(dram_req_vec_.begin() + dequeue_index);
} else {
vortex_->dram_fill_rsp = 0;
vortex_->dram_fill_rsp_addr = 0;
vortex_->dram_rsp_valid = 0;
vortex_->dram_rsp_addr = 0;
}
#else
if (!dram_stalled_) {
if (vortex_->dram_req_read) {
// Need to add an element
dram_req_t dram_req;
dram_req.cycles_left = DRAM_LATENCY;
dram_req.base_addr = vortex_->dram_req_addr;
dram_req.data = (unsigned *)malloc(GLOBAL_BLOCK_SIZE_BYTES);
for (int i = 0; i < (GLOBAL_BLOCK_SIZE_BYTES / 4); i++) {
unsigned curr_addr = dram_req.base_addr + (i * 4);
unsigned data_rd;
ram_->getWord(curr_addr, &data_rd);
dram_req.data[i] = data_rd;
}
dram_req_vec_.push_back(dram_req);
}
if (vortex_->dram_req_write) {
unsigned base_addr = vortex_->dram_req_addr;
for (int i = 0; i < (GLOBAL_BLOCK_SIZE_BYTES / 4); i++) {
unsigned curr_addr = base_addr + (i * 4);
unsigned data_wr = vortex_->dram_req_data[i];
ram_->writeWord(curr_addr, &data_wr);
}
}
}
if (vortex_->dram_rsp_ready && dequeue_valid) {
vortex_->dram_rsp_valid = 1;
vortex_->dram_rsp_addr = dram_req_vec_[dequeue_index].base_addr;
for (int i = 0; i < (GLOBAL_BLOCK_SIZE_BYTES / 4); i++) {
vortex_->dram_rsp_data[i] = dram_req_vec_[dequeue_index].data[i];
}
free(dram_req_vec_[dequeue_index].data);
dram_req_vec_.erase(dram_req_vec_.begin() + dequeue_index);
} else {
vortex_->dram_rsp_valid = 0;
vortex_->dram_rsp_addr = 0;
}
#endif
#ifdef USE_MULTICORE
vortex_->out_dram_req_delay = dram_stalled_;
vortex_->dram_req_full = dram_stalled_;
#else
vortex_->dram_req_delay = dram_stalled_;
vortex_->dram_req_full = dram_stalled_;
#endif
}
void Simulator::io_handler() {
#ifdef USE_MULTICORE
bool io_valid = false;
for (int c = 0; c < vortex_->number_cores; c++) {
for (int c = 0; c < NUM_CORES; c++) {
if (vortex_->io_valid[c]) {
uint32_t data_write = (uint32_t)vortex_->io_data[c];
char c = (char)data_write;
@@ -318,33 +312,33 @@ void Simulator::send_snoops(uint32_t mem_addr, uint32_t size) {
#ifdef USE_MULTICORE
// submit snoop requests for the needed blocks
vortex_->llc_snp_req_addr = aligned_addr_start;
vortex_->llc_snp_req = false;
vortex_->llc_snp_req_valid = false;
for (;;) {
this->step();
if (vortex_->llc_snp_req) {
vortex_->llc_snp_req = false;
if (vortex_->llc_snp_req_valid) {
vortex_->llc_snp_req_valid = false;
if (vortex_->llc_snp_req_addr >= aligned_addr_end)
break;
vortex_->llc_snp_req_addr += GLOBAL_BLOCK_SIZE_BYTES;
}
if (!vortex_->llc_snp_req_delay) {
vortex_->llc_snp_req = true;
if (!vortex_->llc_snp_req_full) {
vortex_->llc_snp_req_valid = true;
}
}
#else
// submit snoop requests for the needed blocks
vortex_->snp_req_addr = aligned_addr_start;
vortex_->snp_req = false;
vortex_->snp_req_valid = false;
for (;;) {
this->step();
if (vortex_->snp_req) {
vortex_->snp_req = false;
if (vortex_->snp_req_valid) {
vortex_->snp_req_valid = false;
if (vortex_->snp_req_addr >= aligned_addr_end)
break;
vortex_->snp_req_addr += GLOBAL_BLOCK_SIZE_BYTES;
}
if (!vortex_->snp_req_delay) {
vortex_->snp_req = true;
if (!vortex_->snp_req_full) {
vortex_->snp_req_valid = true;
}
}
#endif
@@ -362,7 +356,6 @@ void Simulator::flush_caches(uint32_t mem_addr, uint32_t size) {
// this->send_snoops(mem_addr, size);
// this->wait(PIPELINE_FLUSH_LATENCY);
// #endif
}
bool Simulator::run() {
@@ -381,7 +374,7 @@ bool Simulator::run() {
int status = 0;
#else
// check riscv-tests PASSED/FAILED status
int status = (int)vortex_->Vortex->vx_back_end->VX_wb->last_data_wb & 0xf;
int status = (int)vortex_->Vortex->vx_back_end->vx_wb->last_data_wb & 0xf;
#endif
return (status == 1);

View File

@@ -27,7 +27,6 @@
// One in-flight simulated DRAM read request, queued by the bus drivers until its
// response is delivered to the design.
// NOTE(review): rendered diff hunk (`@@ -27,7 +27,6 @@`) - one line of this struct is
// removed by the commit. The updated drivers no longer assign data_length and size
// every transfer with GLOBAL_BLOCK_SIZE_BYTES instead, so data_length is presumably the
// removed field; confirm against the repository.
typedef struct {
int cycles_left; // set to DRAM_LATENCY when the request is queued
int data_length; // word count (request size / 4) in the pre-commit drivers
unsigned base_addr; // address of the first word, copied from the design's request address
unsigned *data; // malloc'd buffer holding the words read from RAM; freed after the response is sent
} dram_req_t;

View File

@@ -12,7 +12,7 @@ int main(int argc, char **argv)
Verilated::commandArgs(argc, argv);
//#define ALL_TESTS
#define ALL_TESTS
#ifdef ALL_TESTS
bool passed = true;