diff --git a/rtl/VX_cache/VX_bank.v b/rtl/VX_cache/VX_bank.v index 3167daf3..9896efe6 100644 --- a/rtl/VX_cache/VX_bank.v +++ b/rtl/VX_cache/VX_bank.v @@ -49,6 +49,7 @@ module VX_bank ( wire dfpq_full; wire[31:0] dfpq_addr_st0; wire[`BANK_LINE_SIZE_RNG][31:0] dfpq_filldata_st0; + reg dfpq_hazard_st0; assign dram_fill_accept = !dfpq_full; @@ -76,6 +77,7 @@ module VX_bank ( wire [`NW_M1:0] reqq_req_warp_num_st0; wire [2:0] reqq_req_mem_read_st0; wire [2:0] reqq_req_mem_write_st0; + reg reqq_hazard_st0; assign reqq_push = !delay_req && (|bank_valids); @@ -119,6 +121,7 @@ module VX_bank ( wire [`NW_M1:0] mrvq_warp_num_st0; wire [2:0] mrvq_mem_read_st0; wire [2:0] mrvq_mem_write_st0; + reg mrvq_hazard_st0; wire miss_add; wire[31:0] miss_add_addr; @@ -165,9 +168,27 @@ module VX_bank ( wire stall_bank_pipe; - assign dfpq_pop = !dfpq_empty && !stall_bank_pipe; - assign mrvq_pop = !dfpq_pop && mrvq_valid_st0 && !stall_bank_pipe; - assign reqq_pop = !mrvq_pop && reqq_req_st0 && !stall_bank_pipe && !is_fill_st1[0]; + assign dfpq_pop = !dfpq_empty && !stall_bank_pipe && !dfpq_hazard_st0; + assign mrvq_pop = !dfpq_pop && mrvq_valid_st0 && !stall_bank_pipe && !mrvq_hazard_st0; + assign reqq_pop = !mrvq_pop && reqq_req_st0 && !stall_bank_pipe && !is_fill_st1[0] && !reqq_hazard_st0; + + + integer st1_cycle; + + always @(*) begin + assign dfpq_hazard_st0 = 0; + assign mrvq_hazard_st0 = 0; + assign reqq_hazard_st0 = 0; + for (st1_cycle = 0; st1_cycle < `STAGE_1_CYCLES; st1_cycle = st1_cycle + 1) begin + if (valid_st1[st1_cycle] && going_to_write_st1[st1_cycle]) begin + if (dfpq_addr_st0 [31:`LINE_SELECT_ADDR_START] == addr_st1[st1_cycle][31:`LINE_SELECT_ADDR_START]) assign dfpq_hazard_st0 = 1; + if (mrvq_addr_st0 [31:`LINE_SELECT_ADDR_START] == addr_st1[st1_cycle][31:`LINE_SELECT_ADDR_START]) assign mrvq_hazard_st0 = 1; + if (reqq_req_addr_st0[31:`LINE_SELECT_ADDR_START] == addr_st1[st1_cycle][31:`LINE_SELECT_ADDR_START]) assign reqq_hazard_st0 = 1; + end + end + end + + wire 
qual_is_fill_st0; @@ -176,13 +197,15 @@ module VX_bank ( wire [31:0] qual_writeword_st0; wire [`BANK_LINE_SIZE_RNG][31:0] qual_writedata_st0; wire [`REQ_INST_META_SIZE-1:0] qual_inst_meta_st0; + wire qual_going_to_write_st0; - wire valid_st1[`STAGE_1_CYCLES-1:0]; - wire [31:0] addr_st1[`STAGE_1_CYCLES-1:0]; - wire [31:0] writeword_st1[`STAGE_1_CYCLES-1:0]; - wire [`REQ_INST_META_SIZE-1:0] inst_meta_st1[`STAGE_1_CYCLES-1:0]; - wire is_fill_st1[`STAGE_1_CYCLES-1:0]; - wire [`BANK_LINE_SIZE_RNG][31:0] writedata_st1[`STAGE_1_CYCLES-1:0]; + wire valid_st1 [`STAGE_1_CYCLES-1:0]; + wire going_to_write_st1[`STAGE_1_CYCLES-1:0]; + wire [31:0] addr_st1 [`STAGE_1_CYCLES-1:0]; + wire [31:0] writeword_st1 [`STAGE_1_CYCLES-1:0]; + wire [`REQ_INST_META_SIZE-1:0] inst_meta_st1 [`STAGE_1_CYCLES-1:0]; + wire is_fill_st1 [`STAGE_1_CYCLES-1:0]; + wire [`BANK_LINE_SIZE_RNG][31:0] writedata_st1 [`STAGE_1_CYCLES-1:0]; assign qual_is_fill_st0 = dfpq_pop; assign qual_valid_st0 = dfpq_pop || mrvq_pop || reqq_pop; @@ -202,25 +225,30 @@ module VX_bank ( reqq_pop ? {reqq_req_rd_st0, reqq_req_wb_st0, reqq_req_warp_num_st0, reqq_req_mem_read_st0, reqq_req_mem_write_st0, reqq_req_tid_st0} : 0; - VX_generic_register #(.N( 1 + 32 + 32 + `REQ_INST_META_SIZE + (`BANK_LINE_SIZE_WORDS*32) + 1)) s0_1_c0 ( + assign qual_going_to_write_st0 = dfpq_pop ? 1 : + (mrvq_pop && (mrvq_mem_write_st0 != `NO_MEM_WRITE)) ? 1 : + (reqq_pop && (reqq_req_mem_write_st0 != `NO_MEM_WRITE)) ? 
1 : + 0; + + VX_generic_register #(.N( 1 + 1 + 32 + 32 + `REQ_INST_META_SIZE + (`BANK_LINE_SIZE_WORDS*32) + 1)) s0_1_c0 ( .clk (clk), .reset(reset), .stall(stall_bank_pipe), .flush(0), - .in ({qual_valid_st0, qual_addr_st0, qual_writeword_st0, qual_inst_meta_st0, qual_is_fill_st0, qual_writedata_st0}), - .out ({valid_st1[0] , addr_st1[0] , writeword_st1[0] , inst_meta_st1[0] , is_fill_st1[0] , writedata_st1[0]}) + .in ({qual_going_to_write_st0, qual_valid_st0, qual_addr_st0, qual_writeword_st0, qual_inst_meta_st0, qual_is_fill_st0, qual_writedata_st0}), + .out ({going_to_write_st1[0] , valid_st1[0] , addr_st1[0] , writeword_st1[0] , inst_meta_st1[0] , is_fill_st1[0] , writedata_st1[0]}) ); genvar curr_stage; generate for (curr_stage = 1; curr_stage < `STAGE_1_CYCLES; curr_stage = curr_stage + 1) begin - VX_generic_register #(.N( 1 + 32 + 32 + `REQ_INST_META_SIZE + (`BANK_LINE_SIZE_WORDS*32) + 1)) s0_1_cc ( + VX_generic_register #(.N( 1 + 1 + 32 + 32 + `REQ_INST_META_SIZE + (`BANK_LINE_SIZE_WORDS*32) + 1)) s0_1_cc ( .clk (clk), .reset(reset), .stall(stall_bank_pipe), .flush(0), - .in ({valid_st1[curr_stage-1], addr_st1[curr_stage-1], writeword_st1[curr_stage-1], inst_meta_st1[curr_stage-1], is_fill_st1[curr_stage-1] , writedata_st1[curr_stage-1]}), - .out ({valid_st1[curr_stage] , addr_st1[curr_stage] , writeword_st1[curr_stage] , inst_meta_st1[curr_stage] , is_fill_st1[curr_stage] , writedata_st1[curr_stage] }) + .in ({going_to_write_st1[curr_stage-1], valid_st1[curr_stage-1], addr_st1[curr_stage-1], writeword_st1[curr_stage-1], inst_meta_st1[curr_stage-1], is_fill_st1[curr_stage-1] , writedata_st1[curr_stage-1]}), + .out ({going_to_write_st1[curr_stage] , valid_st1[curr_stage] , addr_st1[curr_stage] , writeword_st1[curr_stage] , inst_meta_st1[curr_stage] , is_fill_st1[curr_stage] , writedata_st1[curr_stage] }) ); end endgenerate @@ -239,6 +267,7 @@ module VX_bank ( wire [2:0] mem_read_st1e; wire [2:0] mem_write_st1e; wire [`vx_clog2(`NUMBER_REQUESTS)-1:0] 
tid_st1e; + wire fill_saw_dirty_st1e; assign {rd_st1e, wb_st1e, warp_num_st1e, mem_read_st1e, mem_write_st1e, tid_st1e} = inst_meta_st1[`STAGE_1_CYCLES-1]; @@ -266,7 +295,8 @@ module VX_bank ( .readdata_st1e (readdata_st1e), .readtag_st1e (readtag_st1e), .miss_st1e (miss_st1e), - .dirty_st1e (dirty_st1e) + .dirty_st1e (dirty_st1e), + .fill_saw_dirty_st1e(fill_saw_dirty_st1e) ); wire qual_valid_st1e_2 = valid_st1[`STAGE_1_CYCLES-1] && !is_fill_st1[`STAGE_1_CYCLES-1]; @@ -281,14 +311,15 @@ module VX_bank ( wire[`REQ_INST_META_SIZE-1:0] inst_meta_st2; wire[`TAG_SELECT_SIZE_RNG] readtag_st2; wire is_fill_st2; + wire fill_saw_dirty_st2; - VX_generic_register #(.N( 1 + 1 + 32 + 32 + 32 + (`BANK_LINE_SIZE_WORDS * 32) + 1 + 1 + `REQ_INST_META_SIZE + `TAG_SELECT_NUM_BITS)) st_1e_2 ( + VX_generic_register #(.N( 1 + 1 + 1 + 32 + 32 + 32 + (`BANK_LINE_SIZE_WORDS * 32) + 1 + 1 + `REQ_INST_META_SIZE + `TAG_SELECT_NUM_BITS)) st_1e_2 ( .clk (clk), .reset(reset), .stall(stall_bank_pipe), .flush(0), - .in ({is_fill_st1[`STAGE_1_CYCLES-1], qual_valid_st1e_2, addr_st1[`STAGE_1_CYCLES-1], writeword_st1[`STAGE_1_CYCLES-1], readword_st1e, readdata_st1e, readtag_st1e, miss_st1e, dirty_st1e, inst_meta_st1[`STAGE_1_CYCLES-1]}), - .out ({is_fill_st2 , valid_st2 , addr_st2 , writeword_st2 , readword_st2 , readdata_st2 , readtag_st2 , miss_st2 , dirty_st2 , inst_meta_st2 }) + .in ({fill_saw_dirty_st1e, is_fill_st1[`STAGE_1_CYCLES-1], qual_valid_st1e_2, addr_st1[`STAGE_1_CYCLES-1], writeword_st1[`STAGE_1_CYCLES-1], readword_st1e, readdata_st1e, readtag_st1e, miss_st1e, dirty_st1e, inst_meta_st1[`STAGE_1_CYCLES-1]}), + .out ({fill_saw_dirty_st2 , is_fill_st2 , valid_st2 , addr_st2 , writeword_st2 , readword_st2 , readdata_st2 , readtag_st2 , miss_st2 , dirty_st2 , inst_meta_st2 }) ); @@ -324,7 +355,7 @@ module VX_bank ( ); // Enqueue to DWB Queue - wire dwbq_push = valid_st2 && miss_st2 && dirty_st2; + wire dwbq_push = (valid_st2 && miss_st2 && dirty_st2) || fill_saw_dirty_st2; wire[31:0] 
dwbq_req_addr = {readtag_st2, addr_st2[`LINE_SELECT_ADDR_END:0]}; wire[`BANK_LINE_SIZE_RNG][31:0] dwbq_req_data = readdata_st2; wire dwbq_empty; diff --git a/rtl/VX_cache/VX_cache_config.v b/rtl/VX_cache/VX_cache_config.v index c1844dcc..44c993b0 100644 --- a/rtl/VX_cache/VX_cache_config.v +++ b/rtl/VX_cache/VX_cache_config.v @@ -39,7 +39,7 @@ `define DFQQ_SIZE `REQQ_SIZE // Dram knobs - `define SIMULATED_DRAM_LATENCY_CYCLES 50 + `define SIMULATED_DRAM_LATENCY_CYCLES 10 // ========================================= Configurable Knobs ========================================= @@ -89,8 +89,8 @@ `define BANK_SIZE_BYTES `CACHE_SIZE_BYTES/`NUMBER_BANKS -`define BANK_LINE_COUNT `BANK_SIZE_BYTES/`BANK_LINE_SIZE_BYTES -`define BANK_LINE_SIZE_WORDS `BANK_LINE_SIZE_BYTES / `WORD_SIZE_BYTES +`define BANK_LINE_COUNT (`BANK_SIZE_BYTES/`BANK_LINE_SIZE_BYTES) +`define BANK_LINE_SIZE_WORDS (`BANK_LINE_SIZE_BYTES / `WORD_SIZE_BYTES) `define BANK_LINE_SIZE_RNG `BANK_LINE_SIZE_WORDS-1:0 // Offset is fixed @@ -106,7 +106,7 @@ `define WORD_SELECT_ADDR_START 1+`OFFSET_ADDR_END `define WORD_SELECT_ADDR_END `WORD_SELECT_SIZE_END+`OFFSET_ADDR_END `define WORD_SELECT_ADDR_RNG `WORD_SELECT_ADDR_END:`WORD_SELECT_ADDR_START -`define WORD_SELECT_SIZE_RNG `WORD_SELECT_SIZE_END-1:`WORD_SELECT_SIZE_END +`define WORD_SELECT_SIZE_RNG `WORD_SELECT_SIZE_END-1:0 `define BANK_SELECT_NUM_BITS $clog2(`NUMBER_BANKS) `define BANK_SELECT_SIZE_END `BANK_SELECT_NUM_BITS diff --git a/rtl/VX_cache/VX_cache_req_queue.v b/rtl/VX_cache/VX_cache_req_queue.v index 5d2451ad..7fce8ac5 100644 --- a/rtl/VX_cache/VX_cache_req_queue.v +++ b/rtl/VX_cache/VX_cache_req_queue.v @@ -69,6 +69,7 @@ module VX_cache_req_queue ( wire push_qual = reqq_push && !reqq_full; wire pop_qual = reqq_pop && use_empty && !out_empty && !reqq_empty; + VX_generic_queue #(.DATAW( (`NUMBER_REQUESTS * (1+32+32)) + 5 + 2 + (`NW_M1+1) + 3 + 3 ), .SIZE(`REQQ_SIZE)) reqq_queue( .clk (clk), .reset (reset), @@ -81,15 +82,16 @@ module VX_cache_req_queue ( 
); + wire[`NUMBER_REQUESTS-1:0] real_out_per_valids = out_per_valids & {`NUMBER_REQUESTS{~reqq_empty}}; - assign qual_valids = use_empty ? out_per_valids : out_empty ? 0 : use_per_valids; - assign qual_addr = use_empty ? out_per_addr : use_per_addr; - assign qual_writedata = use_empty ? out_per_writedata : use_per_writedata; - assign qual_rd = use_empty ? out_per_rd : use_per_rd; - assign qual_wb = use_empty ? out_per_wb : use_per_wb; - assign qual_warp_num = use_empty ? out_per_warp_num : use_per_warp_num; - assign qual_mem_read = use_empty ? out_per_mem_read : use_per_mem_read; - assign qual_mem_write = use_empty ? out_per_mem_write : use_per_mem_write; + assign qual_valids = use_empty ? real_out_per_valids : out_empty ? 0 : use_per_valids; + assign qual_addr = use_empty ? out_per_addr : use_per_addr; + assign qual_writedata = use_empty ? out_per_writedata : use_per_writedata; + assign qual_rd = use_empty ? out_per_rd : use_per_rd; + assign qual_wb = use_empty ? out_per_wb : use_per_wb; + assign qual_warp_num = use_empty ? out_per_warp_num : use_per_warp_num; + assign qual_mem_read = use_empty ? out_per_mem_read : use_per_mem_read; + assign qual_mem_write = use_empty ? 
out_per_mem_write : use_per_mem_write; wire[`vx_clog2(`NUMBER_REQUESTS)-1:0] qual_request_index; wire qual_has_request; diff --git a/rtl/VX_cache/VX_cache_wb_sel_merge.v b/rtl/VX_cache/VX_cache_wb_sel_merge.v index b4806dd1..08d97350 100644 --- a/rtl/VX_cache/VX_cache_wb_sel_merge.v +++ b/rtl/VX_cache/VX_cache_wb_sel_merge.v @@ -24,7 +24,7 @@ module VX_cache_wb_sel_merge ( ); reg [`NUMBER_BANKS-1:0] per_bank_wb_pop_unqual; - assign per_bank_wb_pop = per_bank_wb_pop_unqual & {`NUMBER_BANKS{core_no_wb_slot}}; + assign per_bank_wb_pop = per_bank_wb_pop_unqual & {`NUMBER_BANKS{~core_no_wb_slot}}; wire[`NUMBER_BANKS-1:0] bank_wants_wb; genvar curr_bank; @@ -51,8 +51,10 @@ module VX_cache_wb_sel_merge ( genvar this_bank; generate always @(*) begin + assign core_wb_valid = 0; + assign core_wb_readdata = 0; for (this_bank = 0; this_bank < `NUMBER_BANKS; this_bank = this_bank + 1) begin - if (found_bank && (per_bank_wb_rd[this_bank] == per_bank_wb_rd[main_bank_index]) && (per_bank_wb_warp_num[this_bank] == per_bank_wb_warp_num[main_bank_index])) begin + if (found_bank && (per_bank_wb_valid[this_bank]) && (per_bank_wb_rd[this_bank] == per_bank_wb_rd[main_bank_index]) && (per_bank_wb_warp_num[this_bank] == per_bank_wb_warp_num[main_bank_index])) begin assign core_wb_valid[per_bank_wb_tid[this_bank]] = 1; assign core_wb_readdata[per_bank_wb_tid[this_bank]] = per_bank_wb_data[this_bank]; assign per_bank_wb_pop_unqual[this_bank] = 1; diff --git a/rtl/VX_cache/VX_tag_data_access.v b/rtl/VX_cache/VX_tag_data_access.v index e6452a57..2f342ab4 100644 --- a/rtl/VX_cache/VX_tag_data_access.v +++ b/rtl/VX_cache/VX_tag_data_access.v @@ -21,7 +21,8 @@ module VX_tag_data_access ( output wire[`BANK_LINE_SIZE_RNG][31:0] readdata_st1e, output wire[`TAG_SELECT_SIZE_RNG] readtag_st1e, output wire miss_st1e, - output wire dirty_st1e + output wire dirty_st1e, + output wire fill_saw_dirty_st1e ); @@ -46,6 +47,8 @@ module VX_tag_data_access ( wire[`BANK_LINE_SIZE_RNG][3:0] use_write_enable; 
wire[`BANK_LINE_SIZE_RNG][31:0] use_write_data; + + wire fill_sent; VX_tag_data_structure VX_tag_data_structure( .clk (clk), .reset (reset), @@ -59,7 +62,8 @@ module VX_tag_data_access ( .write_enable(use_write_enable), .write_fill (writefill_st1e), .write_addr (writeaddr_st1e), - .write_data (use_write_data) + .write_data (use_write_data), + .fill_sent (fill_sent) ); VX_generic_register #(.N( 1 + 1 + `TAG_SELECT_NUM_BITS + (`BANK_LINE_SIZE_WORDS*32) )) s0_1_c0 ( @@ -89,7 +93,10 @@ module VX_tag_data_access ( assign use_read_valid_st1e = read_valid_st1c[`STAGE_1_CYCLES-1]; assign use_read_dirty_st1e = read_dirty_st1c[`STAGE_1_CYCLES-1]; assign use_read_tag_st1e = read_tag_st1c [`STAGE_1_CYCLES-1]; - assign use_read_data_st1e = read_data_st1c [`STAGE_1_CYCLES-1]; + + genvar curr_w; + for (curr_w = 0; curr_w < `BANK_LINE_SIZE_WORDS; curr_w = curr_w+1) assign use_read_data_st1e[curr_w][31:0] = read_data_st1c[`STAGE_1_CYCLES-1][curr_w][31:0]; + // assign use_read_data_st1e = read_data_st1c [`STAGE_1_CYCLES-1]; /////////////////////// LOAD LOGIC /////////////////// @@ -107,10 +114,17 @@ module VX_tag_data_access ( wire b2 = (byte_select == 2); wire b3 = (byte_select == 3); - wire[31:0] data_unQual = (b0 || lw) ? (use_read_data_st1e[block_offset]) : - b1 ? (use_read_data_st1e[block_offset] >> 8) : - b2 ? (use_read_data_st1e[block_offset] >> 16) : - (use_read_data_st1e[block_offset] >> 24); + wire[31:0] w0 = read_data_st1c[`STAGE_1_CYCLES-1][0][31:0]; + wire[31:0] w1 = read_data_st1c[`STAGE_1_CYCLES-1][1][31:0]; + wire[31:0] w2 = read_data_st1c[`STAGE_1_CYCLES-1][2][31:0]; + wire[31:0] w3 = read_data_st1c[`STAGE_1_CYCLES-1][3][31:0]; + + wire[31:0] data_unmod = read_data_st1c[`STAGE_1_CYCLES-1][block_offset][31:0]; + + wire[31:0] data_unQual = (b0 || lw) ? (data_unmod) : + b1 ? (data_unmod >> 8) : + b2 ? (data_unmod >> 16) : + (data_unmod >> 24); wire[31:0] lb_data = (data_unQual[7] ) ? 
(data_unQual | 32'hFFFFFF00) : (data_unQual & 32'hFF); @@ -151,8 +165,8 @@ module VX_tag_data_access ( wire[3:0] sb_mask = (b0 ? 4'b0001 : (b1 ? 4'b0010 : (b2 ? 4'b0100 : 4'b1000))); wire[3:0] sh_mask = (b0 ? 4'b0011 : 4'b1100); - wire should_write = (sw || sb || sh) && valid_req_st1e && !miss_st1e; - wire force_write = writefill_st1e && valid_req_st1e; + wire should_write = (sw || sb || sh) && valid_req_st1e && use_read_valid_st1e && !miss_st1e; + wire force_write = writefill_st1e && valid_req_st1e && miss_st1e; wire[`BANK_LINE_SIZE_RNG][3:0] we; wire[`BANK_LINE_SIZE_RNG][31:0] data_write; @@ -161,13 +175,13 @@ module VX_tag_data_access ( for (g = 0; g < `BANK_LINE_SIZE_WORDS; g = g + 1) begin : write_enables wire normal_write = (block_offset == g) && should_write; - assign we[g] = (force_write) ? 4'b1111 : + assign we[g] = (force_write) ? 4'b1111 : (normal_write && sw) ? 4'b1111 : (normal_write && sb) ? sb_mask : (normal_write && sh) ? sh_mask : 4'b0000; - assign data_write[g] = force_write ? writedata_st1e : use_write_dat ; + assign data_write[g] = force_write ? 
writedata_st1e[g] : use_write_dat; end endgenerate @@ -181,6 +195,8 @@ module VX_tag_data_access ( assign dirty_st1e = valid_req_st1e && use_read_valid_st1e && use_read_dirty_st1e; assign readdata_st1e = use_read_data_st1e; assign readtag_st1e = use_read_tag_st1e; + assign fill_sent = miss_st1e; + assign fill_saw_dirty_st1e = force_write && dirty_st1e; endmodule diff --git a/rtl/VX_cache/VX_tag_data_structure.v b/rtl/VX_cache/VX_tag_data_structure.v index da3f8b4d..bad6f0ea 100644 --- a/rtl/VX_cache/VX_tag_data_structure.v +++ b/rtl/VX_cache/VX_tag_data_structure.v @@ -11,7 +11,8 @@ module VX_tag_data_structure ( input wire[`BANK_LINE_SIZE_RNG][3:0] write_enable, input wire write_fill, input wire[31:0] write_addr, - input wire[`BANK_LINE_SIZE_RNG][31:0] write_data + input wire[`BANK_LINE_SIZE_RNG][31:0] write_data, + input wire fill_sent ); @@ -38,7 +39,9 @@ module VX_tag_data_structure ( end else begin dirty[write_addr[`LINE_SELECT_ADDR_RNG]] <= 1; end - end + end else if (fill_sent) begin + dirty[write_addr[`LINE_SELECT_ADDR_RNG]] <= 0; + end for (f = 0; f < `BANK_LINE_SIZE_WORDS; f = f + 1) begin if (write_enable[f][0]) data[write_addr[`LINE_SELECT_ADDR_RNG]][f][0] <= write_data[f][7 :0 ]; diff --git a/rtl/VX_fetch.v b/rtl/VX_fetch.v index cde7d725..35278807 100644 --- a/rtl/VX_fetch.v +++ b/rtl/VX_fetch.v @@ -8,6 +8,8 @@ module VX_fetch ( VX_join_inter VX_join, input wire schedule_delay, input wire icache_stage_delay, + input wire[`NW_M1:0] icache_stage_wid, + input wire[`NT-1:0] icache_stage_valids, output wire out_ebreak, VX_jal_response_inter VX_jal_rsp, @@ -40,7 +42,7 @@ module VX_fetch ( // Locals - assign pipe_stall = schedule_delay || icache_stage_delay || stall_might_be_branch; + assign pipe_stall = schedule_delay || icache_stage_delay || (stall_might_be_branch && (icache_stage_wid == warp_num)) ; VX_warp_scheduler warp_scheduler( .clk (clk), diff --git a/rtl/VX_front_end.v b/rtl/VX_front_end.v index b8d90c15..0ab8288f 100644 --- a/rtl/VX_front_end.v 
+++ b/rtl/VX_front_end.v @@ -37,6 +37,9 @@ wire icache_stage_delay; wire vortex_ebreak; wire terminate_sim; +wire[`NW_M1:0] icache_stage_wid; +wire[`NT-1:0] icache_stage_valids; + assign fetch_ebreak = vortex_ebreak || terminate_sim; @@ -46,6 +49,8 @@ VX_join_inter VX_join(); VX_fetch vx_fetch( .clk (clk), .reset (reset), + .icache_stage_wid (icache_stage_wid), + .icache_stage_valids(icache_stage_valids), .VX_wstall (VX_wstall), .VX_join (VX_join), .schedule_delay (schedule_delay), @@ -74,6 +79,8 @@ VX_icache_stage VX_icache_stage( .clk (clk), .reset (reset), .icache_stage_delay(icache_stage_delay), + .icache_stage_valids(icache_stage_valids), + .icache_stage_wid (icache_stage_wid), .fe_inst_meta_fi (fe_inst_meta_fi2), .fe_inst_meta_id (fe_inst_meta_id), .icache_response (icache_response_fe), diff --git a/rtl/VX_generic_queue.v b/rtl/VX_generic_queue.v index fd3d77e7..f388c6d3 100644 --- a/rtl/VX_generic_queue.v +++ b/rtl/VX_generic_queue.v @@ -39,7 +39,7 @@ module VX_generic_queue tail <= tail+1; end - if (pop) begin + if (pop && !empty) begin head <= head + 1; end diff --git a/rtl/VX_icache_stage.v b/rtl/VX_icache_stage.v index daeaef5b..3c6b3c3d 100644 --- a/rtl/VX_icache_stage.v +++ b/rtl/VX_icache_stage.v @@ -4,6 +4,8 @@ module VX_icache_stage ( input wire clk, input wire reset, output wire icache_stage_delay, + output wire[`NW_M1:0] icache_stage_wid, + output wire[`NT-1:0] icache_stage_valids, VX_inst_meta_inter fe_inst_meta_fi, VX_inst_meta_inter fe_inst_meta_id, VX_icache_response_inter icache_response, @@ -27,5 +29,8 @@ module VX_icache_stage ( assign fe_inst_meta_id.warp_num = fe_inst_meta_fi.warp_num; assign fe_inst_meta_id.valid = fe_inst_meta_fi.valid & {`NT{!icache_stage_delay}}; + assign icache_stage_wid = fe_inst_meta_fi.warp_num; + assign icache_stage_valids = fe_inst_meta_fi.valid; + endmodule \ No newline at end of file diff --git a/rtl/simulate/test_bench.cpp b/rtl/simulate/test_bench.cpp index 8eb90aa4..8b2e7e9e 100644 --- 
a/rtl/simulate/test_bench.cpp +++ b/rtl/simulate/test_bench.cpp @@ -87,7 +87,9 @@ int main(int argc, char **argv) #else - char testing[] = "../../emulator/riscv_tests/rv32ui-p-auipc.hex"; + // char testing[] = "../../runtime/mains/simple/vx_simple_main.hex"; + // char testing[] = "../../emulator/riscv_tests/rv32ui-p-lw.hex"; + char testing[] = "../../emulator/riscv_tests/rv32ui-p-sw.hex"; Vortex v; // const char *testing; diff --git a/rtl/simulate/test_bench.h b/rtl/simulate/test_bench.h index 3a001377..72d26652 100644 --- a/rtl/simulate/test_bench.h +++ b/rtl/simulate/test_bench.h @@ -28,6 +28,14 @@ double sc_time_stamp() return time_stamp / 1000.0; } +typedef struct +{ + int cycles_left; + int data_length; + unsigned base_addr; + unsigned * data; +} dram_req_t; + class Vortex { public: @@ -69,6 +77,7 @@ class Vortex int debug_end_wait; int debug_debugAddr; double stats_sim_time; + std::vector<dram_req_t> dram_req_vec; #ifdef VCD_OUTPUT VerilatedVcdC *m_trace; #endif @@ -235,65 +244,77 @@ void Vortex::io_handler() bool Vortex::dbus_driver() { - vortex->i_m_ready_d = false; - + // Iterate through each element, and get pop index + int dequeue_index = -1; + bool dequeue_valid = false; + for (int i = 0; i < this->dram_req_vec.size(); i++) { - - // int dcache_num_words_per_block - - if (refill_d) + if (this->dram_req_vec[i].cycles_left > 0) { - refill_d = false; - vortex->i_m_ready_d = true; - - for (int curr_bank = 0; curr_bank < vortex->Vortex__DOT__dcache_banks; curr_bank++) - { - for (int curr_word = 0; curr_word < vortex->Vortex__DOT__dcache_num_words_per_block; curr_word++) - { - unsigned curr_index = (curr_word * vortex->Vortex__DOT__dcache_banks) + curr_bank; - unsigned curr_addr = refill_addr_d + (4*curr_index); - - unsigned curr_value; - ram.getWord(curr_addr, &curr_value); - - vortex->i_m_readdata_d[curr_bank][curr_word] = curr_value; - - } - } - } - else - { - if (vortex->o_m_valid_d) - { - - if (vortex->o_m_read_or_write_d) - { - // fprintf(stderr, 
"++++++++++++++++++++++++++++++++\n"); - unsigned base_addr = vortex->o_m_evict_addr_d; - - for (int curr_bank = 0; curr_bank < vortex->Vortex__DOT__dcache_banks; curr_bank++) - { - for (int curr_word = 0; curr_word < vortex->Vortex__DOT__dcache_num_words_per_block; curr_word++) - { - unsigned curr_index = (curr_word * vortex->Vortex__DOT__dcache_banks) + curr_bank; - unsigned curr_addr = base_addr + (4*curr_index); - - unsigned curr_value = vortex->o_m_writedata_d[curr_bank][curr_word]; - - ram.writeWord( curr_addr, &curr_value); - } - } - } - - // Respond next cycle - refill_d = true; - refill_addr_d = vortex->o_m_read_addr_d; - } + this->dram_req_vec[i].cycles_left -= 1; } + if ((this->dram_req_vec[i].cycles_left == 0) && (!dequeue_valid)) + { + dequeue_index = i; + dequeue_valid = true; + } } + if (vortex->dram_req) + { + if (vortex->dram_req_read) + { + // Need to add an element + dram_req_t dram_req; + dram_req.cycles_left = vortex->dram_expected_lat; + dram_req.data_length = vortex->dram_req_size / 4; + dram_req.base_addr = vortex->dram_req_addr; + dram_req.data = (unsigned *) malloc(dram_req.data_length * sizeof(unsigned)); + + for (int i = 0; i < dram_req.data_length; i++) + { + unsigned curr_addr = dram_req.base_addr + (i*4); + unsigned data_rd; + ram.getWord(curr_addr, &data_rd); + dram_req.data[i] = data_rd; + } + this->dram_req_vec.push_back(dram_req); + } + + if (vortex->dram_req_write) + { + unsigned base_addr = vortex->dram_req_addr; + unsigned data_length = vortex->dram_req_size / 4; + + for (int i = 0; i < data_length; i++) + { + unsigned curr_addr = base_addr + (i*4); + unsigned data_wr = vortex->dram_req_data[i]; + ram.writeWord(curr_addr, &data_wr); + } + } + } + + if (vortex->dram_fill_accept && dequeue_valid) + { + vortex->dram_fill_rsp = 1; + vortex->dram_fill_rsp_addr = this->dram_req_vec[dequeue_index].base_addr; + for (int i = 0; i < this->dram_req_vec[dequeue_index].data_length; i++) + { + vortex->dram_fill_rsp_data[i] = 
this->dram_req_vec[dequeue_index].data[i]; + } + free(this->dram_req_vec[dequeue_index].data); + + this->dram_req_vec.erase(this->dram_req_vec.begin() + dequeue_index); + } + else + { + vortex->dram_fill_rsp = 0; + vortex->dram_fill_rsp_addr = 0; + } + return false; } @@ -430,4 +451,4 @@ bool Vortex::simulate(std::string file_to_simulate) return (status == 1); // return (1 == 1); -} \ No newline at end of file +}