diff --git a/driver/tests/tex_demo/kernel.c b/driver/tests/tex_demo/kernel.c index 1e36b359..89693f2c 100644 --- a/driver/tests/tex_demo/kernel.c +++ b/driver/tests/tex_demo/kernel.c @@ -22,8 +22,8 @@ void kernel_body(int task_id, void* arg) { for (uint32_t y = 0; y < _arg->tile_height; ++y) { for (uint32_t x = 0; x < _arg->tile_width; ++x) { - int32_t u = (int32_t)(fu * (1<<28)); - int32_t v = (int32_t)(fv * (1<<28)); + int32_t u = (int32_t)(fu * (1<<20)); + int32_t v = (int32_t)(fv * (1<<20)); dst_ptr[x] = vx_tex(0, u, v, 0); fu += _arg->deltaX; } diff --git a/hw/rtl/VX_core.v b/hw/rtl/VX_core.v index b34b6788..735913f7 100644 --- a/hw/rtl/VX_core.v +++ b/hw/rtl/VX_core.v @@ -71,13 +71,13 @@ module VX_core #( //-- VX_dcache_core_req_if #( - .NUM_REQS(`DNUM_REQUESTS), + .LANES(`DNUM_REQUESTS), .WORD_SIZE(`DWORD_SIZE), .CORE_TAG_WIDTH(`DCORE_TAG_WIDTH) ) dcache_core_req_if(); VX_dcache_core_rsp_if #( - .NUM_REQS(`DNUM_REQUESTS), + .LANES(`DNUM_REQUESTS), .WORD_SIZE(`DWORD_SIZE), .CORE_TAG_WIDTH(`DCORE_TAG_WIDTH) ) dcache_core_rsp_if(); diff --git a/hw/rtl/VX_dcache_arb.v b/hw/rtl/VX_dcache_arb.v new file mode 100644 index 00000000..958146a2 --- /dev/null +++ b/hw/rtl/VX_dcache_arb.v @@ -0,0 +1,118 @@ +`include "VX_define.vh" + +module VX_mem_arb #( + parameter NUM_REQS = 1, + parameter DATA_WIDTH = 1, + parameter TAG_IN_WIDTH = 1, + parameter TAG_OUT_WIDTH = 1, + parameter BUFFERED_REQ = 0, + parameter BUFFERED_RSP = 0, + + parameter DATA_SIZE = (DATA_WIDTH / 8), + parameter ADDR_WIDTH = 32 - `CLOG2(DATA_SIZE), + parameter LOG_NUM_REQS = `CLOG2(NUM_REQS) +) ( + input wire clk, + input wire reset, + + // input requests + input wire [NUM_REQS-1:0] req_valid_in, + input wire [NUM_REQS-1:0][TAG_IN_WIDTH-1:0] req_tag_in, + input wire [NUM_REQS-1:0][ADDR_WIDTH-1:0] req_addr_in, + input wire [NUM_REQS-1:0] req_rw_in, + input wire [NUM_REQS-1:0][DATA_SIZE-1:0] req_byteen_in, + input wire [NUM_REQS-1:0][DATA_WIDTH-1:0] req_data_in, + output wire [NUM_REQS-1:0] req_ready_in, 
+ + // output request + output wire req_valid_out, + output wire [TAG_OUT_WIDTH-1:0] req_tag_out, + output wire [ADDR_WIDTH-1:0] req_addr_out, + output wire req_rw_out, + output wire [DATA_SIZE-1:0] req_byteen_out, + output wire [DATA_WIDTH-1:0] req_data_out, + input wire req_ready_out, + + // input response + input wire rsp_valid_in, + input wire [TAG_OUT_WIDTH-1:0] rsp_tag_in, + input wire [DATA_WIDTH-1:0] rsp_data_in, + output wire rsp_ready_in, + + // output responses + output wire [NUM_REQS-1:0] rsp_valid_out, + output wire [NUM_REQS-1:0][TAG_IN_WIDTH-1:0] rsp_tag_out, + output wire [NUM_REQS-1:0][DATA_WIDTH-1:0] rsp_data_out, + input wire [NUM_REQS-1:0] rsp_ready_out +); + localparam REQ_DATAW = TAG_OUT_WIDTH + ADDR_WIDTH + 1 + DATA_SIZE + DATA_WIDTH; + localparam RSP_DATAW = TAG_IN_WIDTH + DATA_WIDTH; + + if (NUM_REQS > 1) begin + + wire [NUM_REQS-1:0][REQ_DATAW-1:0] req_merged_data_in; + for (genvar i = 0; i < NUM_REQS; i++) begin + assign req_merged_data_in[i] = {{req_tag_in[i], LOG_NUM_REQS'(i)}, req_addr_in[i], req_rw_in[i], req_byteen_in[i], req_data_in[i]}; + end + + VX_stream_arbiter #( + .NUM_REQS (NUM_REQS), + .DATAW (REQ_DATAW), + .BUFFERED (BUFFERED_REQ) + ) req_arb ( + .clk (clk), + .reset (reset), + .valid_in (req_valid_in), + .data_in (req_merged_data_in), + .ready_in (req_ready_in), + .valid_out (req_valid_out), + .data_out ({req_tag_out, req_addr_out, req_rw_out, req_byteen_out, req_data_out}), + .ready_out (req_ready_out) + ); + + /////////////////////////////////////////////////////////////////////// + + wire [LOG_NUM_REQS-1:0] rsp_sel = rsp_tag_in [LOG_NUM_REQS-1:0]; + + wire [NUM_REQS-1:0][RSP_DATAW-1:0] rsp_merged_data_out; + for (genvar i = 0; i < NUM_REQS; i++) begin + assign {rsp_tag_out[i], rsp_data_out[i]} = rsp_merged_data_out[i]; + end + + VX_stream_demux #( + .NUM_REQS (NUM_REQS), + .DATAW (RSP_DATAW), + .BUFFERED (BUFFERED_RSP) + ) rsp_demux ( + .clk (clk), + .reset (reset), + .sel (rsp_sel), + .valid_in (rsp_valid_in), + 
.data_in ({rsp_tag_in[LOG_NUM_REQS +: TAG_IN_WIDTH], rsp_data_in}), + .ready_in (rsp_ready_in), + .valid_out (rsp_valid_out), + .data_out (rsp_merged_data_out), + .ready_out (rsp_ready_out) + ); + + end else begin + + `UNUSED_VAR (clk) + `UNUSED_VAR (reset) + + assign req_valid_out = req_valid_in; + assign req_tag_out = req_tag_in; + assign req_addr_out = req_addr_in; + assign req_rw_out = req_rw_in; + assign req_byteen_out = req_byteen_in; + assign req_data_out = req_data_in; + assign req_ready_in = req_ready_out; + + assign rsp_valid_out = rsp_valid_in; + assign rsp_tag_out = rsp_tag_in; + assign rsp_data_out = rsp_data_in; + assign rsp_ready_in = rsp_ready_out; + + end + +endmodule \ No newline at end of file diff --git a/hw/rtl/VX_define.vh b/hw/rtl/VX_define.vh index d70841a2..59855419 100644 --- a/hw/rtl/VX_define.vh +++ b/hw/rtl/VX_define.vh @@ -283,8 +283,13 @@ `define DCORE_TAG_ID_BITS `LOG2UP(`LSUQ_SIZE) // Core request tag bits +`ifdef EXT_TEX_ENABLE +`define LSU_TAG_WIDTH (`DBG_CACHE_REQ_MDATAW + `DCORE_TAG_ID_BITS) +`define DCORE_TAG_WIDTH (`LSU_TAG_WIDTH+1) +`else `define DCORE_TAG_WIDTH (`DBG_CACHE_REQ_MDATAW + `DCORE_TAG_ID_BITS) - +`endif + // DRAM request data bits `define DDRAM_LINE_WIDTH (`DCACHE_LINE_SIZE * 8) diff --git a/hw/rtl/VX_execute.v b/hw/rtl/VX_execute.v index 0964b415..015093cb 100644 --- a/hw/rtl/VX_execute.v +++ b/hw/rtl/VX_execute.v @@ -47,7 +47,74 @@ module VX_execute #( VX_fpu_to_csr_if fpu_to_csr_if(); `ifdef EXT_TEX_ENABLE + + VX_dcache_core_req_if #( + .LANES(`NUM_THREADS), + .WORD_SIZE(4), + .CORE_TAG_WIDTH(`LSU_TAG_WIDTH) + ) tex_dcache_req_if(); + + VX_dcache_core_rsp_if #( + .LANES(`NUM_THREADS), + .WORD_SIZE(4), + .CORE_TAG_WIDTH(`LSU_TAG_WIDTH) + ) tex_dcache_rsp_if(); + + VX_dcache_core_req_if #( + .LANES(`NUM_THREADS), + .WORD_SIZE(4), + .CORE_TAG_WIDTH(`LSU_TAG_WIDTH) + ) lsu_dcache_req_if(); + + VX_dcache_core_rsp_if #( + .LANES(`NUM_THREADS), + .WORD_SIZE(4), + .CORE_TAG_WIDTH(`LSU_TAG_WIDTH) + ) 
lsu_dcache_rsp_if(); + VX_tex_csr_if tex_csr_if(); + + VX_tex_lsu_arb #( + .NUM_REQS (2), + .LANES (`NUM_THREADS), + .WORD_SIZE (4), + .TAG_IN_WIDTH (`LSU_TAG_WIDTH), + .TAG_OUT_WIDTH (`DCORE_TAG_WIDTH) + ) tex_lsu_arb ( + .clk (clk), + .reset (reset), + + // Tex/LSU request + .req_valid_in ({tex_dcache_req_if.valid, lsu_dcache_req_if.valid}), + .req_rw_in ({tex_dcache_req_if.rw, lsu_dcache_req_if.rw}), + .req_byteen_in ({tex_dcache_req_if.byteen, lsu_dcache_req_if.byteen}), + .req_addr_in ({tex_dcache_req_if.addr, lsu_dcache_req_if.addr}), + .req_data_in ({tex_dcache_req_if.data, lsu_dcache_req_if.data}), + .req_tag_in ({tex_dcache_req_if.tag, lsu_dcache_req_if.tag}), + .req_ready_in ({tex_dcache_req_if.ready, lsu_dcache_req_if.ready}), + + // Dcache request + .req_valid_out (dcache_req_if.valid), + .req_rw_out (dcache_req_if.rw), + .req_byteen_out (dcache_req_if.byteen), + .req_addr_out (dcache_req_if.addr), + .req_data_out (dcache_req_if.data), + .req_tag_out (dcache_req_if.tag), + .req_ready_out (dcache_req_if.ready), + + // Tex/LSU response + .rsp_valid_out ({tex_dcache_rsp_if.valid, lsu_dcache_rsp_if.valid}), + .rsp_data_out ({tex_dcache_rsp_if.data, lsu_dcache_rsp_if.data}), + .rsp_tag_out ({tex_dcache_rsp_if.tag, lsu_dcache_rsp_if.tag}), + .rsp_ready_out ({tex_dcache_rsp_if.ready, lsu_dcache_rsp_if.ready}), + + // Dcache response + .rsp_valid_in (dcache_rsp_if.valid), + .rsp_tag_in (dcache_rsp_if.tag), + .rsp_data_in (dcache_rsp_if.data), + .rsp_ready_in (dcache_rsp_if.ready) + ); + `endif wire[`NUM_WARPS-1:0] csr_pending; @@ -63,105 +130,24 @@ module VX_execute #( .alu_commit_if (alu_commit_if) ); -`ifdef EXT_TEX_ENABLE - - VX_dcache_core_req_if #( - .NUM_REQS(`NUM_THREADS), - .WORD_SIZE(4), - .CORE_TAG_WIDTH(`DCORE_TAG_WIDTH) - ) tex_dcache_req_if(); - - VX_dcache_core_rsp_if #( - .NUM_REQS(`NUM_THREADS), - .WORD_SIZE(4), - .CORE_TAG_WIDTH(`DCORE_TAG_WIDTH) - ) tex_dcache_rsp_if(); - - VX_dcache_core_req_if #( - .NUM_REQS(`NUM_THREADS), - .WORD_SIZE(4), - 
.CORE_TAG_WIDTH(`DCORE_TAG_WIDTH) - ) lsu_dcache_req_if(); - - VX_dcache_core_rsp_if #( - .NUM_REQS(`NUM_THREADS), - .WORD_SIZE(4), - .CORE_TAG_WIDTH(`DCORE_TAG_WIDTH) - ) lsu_dcache_rsp_if(); - - VX_mem_arb #( - .NUM_REQS (2), - .DATA_WIDTH (`WORD_WIDTH), - .TAG_IN_WIDTH (`DCORE_TAG_WIDTH), - .TAG_OUT_WIDTH (`DCORE_TAG_WIDTH), - .BUFFERED_REQ (0), - .BUFFERED_RSP (0) - ) dcache_arb ( - .clk (clk), - .reset (reset), - - // Tex/LSU request - .req_valid_in ({tex_dcache_req_if.valid, lsu_dcache_req_if.valid}), - .req_rw_in ({tex_dcache_req_if.rw, lsu_dcache_req_if.rw}), - .req_byteen_in ({tex_dcache_req_if.byteen, lsu_dcache_req_if.byteen}), - .req_addr_in ({tex_dcache_req_if.addr, lsu_dcache_req_if.addr}), - .req_data_in ({tex_dcache_req_if.data, lsu_dcache_req_if.data}), - .req_tag_in ({tex_dcache_req_if.tag, lsu_dcache_req_if.tag}), - .req_ready_in ({tex_dcache_req_if.ready, lsu_dcache_req_if.ready}), - - // Dcache request - .req_valid_out (dcache_req_if.valid), - .req_rw_out (dcache_req_if.rw), - .req_byteen_out (dcache_req_if.byteen), - .req_addr_out (dcache_req_if.addr), - .req_data_out (dcache_req_if.data), - .req_tag_out (dcache_req_if.tag), - .req_ready_out (dcache_req_if.ready), - - // Tex/LSU response - .rsp_valid_out ({tex_dcache_rsp_if.valid, lsu_dcache_rsp_if.valid}), - .rsp_data_out ({tex_dcache_rsp_if.data, lsu_dcache_rsp_if.data}), - .rsp_tag_out ({tex_dcache_rsp_if.tag, lsu_dcache_rsp_if.tag}), - .rsp_ready_out ({tex_dcache_rsp_if.ready, lsu_dcache_rsp_if.ready}), - - // Dcache response - .rsp_valid_in (dcache_rsp_if.valid), - .rsp_tag_in (dcache_rsp_if.tag), - .rsp_data_in (dcache_rsp_if.data), - .rsp_ready_in (dcache_rsp_if.ready) - ); - - VX_lsu_unit #( .CORE_ID(CORE_ID) ) lsu_unit ( `SCOPE_BIND_VX_execute_lsu_unit .clk (clk), .reset (reset), + `ifdef EXT_TEX_ENABLE .dcache_req_if (lsu_dcache_req_if), .dcache_rsp_if (lsu_dcache_rsp_if), - .lsu_req_if (lsu_req_if), - .ld_commit_if (ld_commit_if), - .st_commit_if (st_commit_if) - ); - -`else - - 
VX_lsu_unit #( - .CORE_ID(CORE_ID) - ) lsu_unit ( - `SCOPE_BIND_VX_execute_lsu_unit - .clk (clk), - .reset (reset), + `else .dcache_req_if (dcache_req_if), .dcache_rsp_if (dcache_rsp_if), + `endif .lsu_req_if (lsu_req_if), .ld_commit_if (ld_commit_if), .st_commit_if (st_commit_if) ); -`endif - VX_csr_unit #( .CORE_ID(CORE_ID) ) csr_unit ( diff --git a/hw/rtl/VX_mem_unit.v b/hw/rtl/VX_mem_unit.v index 9e3b36d1..8b5c3f2d 100644 --- a/hw/rtl/VX_mem_unit.v +++ b/hw/rtl/VX_mem_unit.v @@ -41,30 +41,30 @@ module VX_mem_unit # ( ) dcache_dram_rsp_if(), icache_dram_rsp_if(); VX_dcache_core_req_if #( - .NUM_REQS (`DNUM_REQUESTS), + .LANES (`DNUM_REQUESTS), .WORD_SIZE (`DWORD_SIZE), .CORE_TAG_WIDTH (`DCORE_TAG_WIDTH) ) dcache_req_if(); VX_dcache_core_rsp_if #( - .NUM_REQS (`DNUM_REQUESTS), + .LANES (`DNUM_REQUESTS), .WORD_SIZE (`DWORD_SIZE), .CORE_TAG_WIDTH (`DCORE_TAG_WIDTH) ) dcache_rsp_if(); VX_dcache_core_req_if #( - .NUM_REQS (`DNUM_REQUESTS), + .LANES (`DNUM_REQUESTS), .WORD_SIZE (`DWORD_SIZE), .CORE_TAG_WIDTH (`DCORE_TAG_WIDTH) ) smem_req_if(); VX_dcache_core_rsp_if #( - .NUM_REQS (`DNUM_REQUESTS), + .LANES (`DNUM_REQUESTS), .WORD_SIZE (`DWORD_SIZE), .CORE_TAG_WIDTH (`DCORE_TAG_WIDTH) ) smem_rsp_if(); - VX_databus_arb databus_arb ( + VX_smem_arb databus_arb ( .clk (clk), .reset (reset), diff --git a/hw/rtl/VX_pipeline.v b/hw/rtl/VX_pipeline.v index 91305a4f..b6d6f0a2 100644 --- a/hw/rtl/VX_pipeline.v +++ b/hw/rtl/VX_pipeline.v @@ -61,7 +61,7 @@ module VX_pipeline #( // VX_dcache_core_req_if #( - .NUM_REQS(`NUM_THREADS), + .LANES(`NUM_THREADS), .WORD_SIZE(4), .CORE_TAG_WIDTH(`DCORE_TAG_WIDTH) ) dcache_core_req_if(); @@ -79,7 +79,7 @@ module VX_pipeline #( // VX_dcache_core_rsp_if #( - .NUM_REQS(`NUM_THREADS), + .LANES(`NUM_THREADS), .WORD_SIZE(4), .CORE_TAG_WIDTH(`DCORE_TAG_WIDTH) ) dcache_core_rsp_if(); diff --git a/hw/rtl/VX_smem_arb.v b/hw/rtl/VX_smem_arb.v new file mode 100644 index 00000000..f6e1e960 --- /dev/null +++ b/hw/rtl/VX_smem_arb.v @@ -0,0 +1,118 @@ 
+`include "VX_define.vh" + +module VX_smem_arb ( + input wire clk, + input wire reset, + + // input request + VX_dcache_core_req_if core_req_if, + + // output requests + VX_dcache_core_req_if cache_req_if, + VX_dcache_core_req_if smem_req_if, + + // input responses + VX_dcache_core_rsp_if cache_rsp_if, + VX_dcache_core_rsp_if smem_rsp_if, + + // output response + VX_dcache_core_rsp_if core_rsp_if +); + localparam SMEM_ASHIFT = `CLOG2(`SHARED_MEM_BASE_ADDR_ALIGN); + localparam REQ_ASHIFT = `CLOG2(`DWORD_SIZE); + localparam REQ_ADDRW = 32 - REQ_ASHIFT; + localparam REQ_DATAW = REQ_ADDRW + 1 + `DWORD_SIZE + (`DWORD_SIZE*8) + `DCORE_TAG_WIDTH; + localparam RSP_DATAW = `NUM_THREADS + `NUM_THREADS * (`DWORD_SIZE*8) + `DCORE_TAG_WIDTH; + + // + // handle requests + // + + for (genvar i = 0; i < `NUM_THREADS; ++i) begin + + wire cache_req_ready_in; + wire smem_req_ready_in; + + // select shared memory bus + wire is_smem_addr = core_req_if.valid[i] && `SM_ENABLE + && (core_req_if.addr[i][REQ_ADDRW-1:SMEM_ASHIFT-REQ_ASHIFT] >= (32-SMEM_ASHIFT)'((`SHARED_MEM_BASE_ADDR - `SMEM_SIZE) >> SMEM_ASHIFT)) + && (core_req_if.addr[i][REQ_ADDRW-1:SMEM_ASHIFT-REQ_ASHIFT] < (32-SMEM_ASHIFT)'(`SHARED_MEM_BASE_ADDR >> SMEM_ASHIFT)); + + VX_skid_buffer #( + .DATAW (REQ_DATAW) + ) cache_out_buffer ( + .clk (clk), + .reset (reset), + .valid_in (core_req_if.valid[i] && !is_smem_addr), + .data_in ({core_req_if.addr[i], core_req_if.rw[i], core_req_if.byteen[i], core_req_if.data[i], core_req_if.tag[i]}), + .ready_in (cache_req_ready_in), + .valid_out (cache_req_if.valid[i]), + .data_out ({cache_req_if.addr[i], cache_req_if.rw[i], cache_req_if.byteen[i], cache_req_if.data[i], cache_req_if.tag[i]}), + .ready_out (cache_req_if.ready[i]) + ); + + VX_skid_buffer #( + .DATAW (REQ_DATAW) + ) smem_out_buffer ( + .clk (clk), + .reset (reset), + .valid_in (core_req_if.valid[i] && is_smem_addr), + .data_in ({core_req_if.addr[i], core_req_if.rw[i], core_req_if.byteen[i], core_req_if.data[i], 
core_req_if.tag[i]}), + .ready_in (smem_req_ready_in), + .valid_out (smem_req_if.valid[i]), + .data_out ({smem_req_if.addr[i], smem_req_if.rw[i], smem_req_if.byteen[i], smem_req_if.data[i], smem_req_if.tag[i]}), + .ready_out (smem_req_if.ready[i]) + ); + + assign core_req_if.ready[i] = is_smem_addr ? smem_req_ready_in : cache_req_ready_in; + end + + // + // handle responses + // + + if (`SM_ENABLE ) begin + + wire [1:0][RSP_DATAW-1:0] rsp_data_in; + wire [1:0] rsp_valid_in; + wire [1:0] rsp_ready_in; + + wire core_rsp_valid; + wire [`NUM_THREADS-1:0] core_rsp_valid_tmask; + + assign rsp_data_in[0] = {cache_rsp_if.valid, cache_rsp_if.data, cache_rsp_if.tag}; + assign rsp_data_in[1] = {smem_rsp_if.valid, smem_rsp_if.data, smem_rsp_if.tag}; + + assign rsp_valid_in[0] = (| cache_rsp_if.valid); + assign rsp_valid_in[1] = (| smem_rsp_if.valid) & `SM_ENABLE; + + VX_stream_arbiter #( + .NUM_REQS (2), + .DATAW (RSP_DATAW), + .BUFFERED (0) + ) rsp_arb ( + .clk (clk), + .reset (reset), + .valid_in (rsp_valid_in), + .data_in (rsp_data_in), + .ready_in (rsp_ready_in), + .valid_out (core_rsp_valid), + .data_out ({core_rsp_valid_tmask, core_rsp_if.data, core_rsp_if.tag}), + .ready_out (core_rsp_if.ready) + ); + + assign cache_rsp_if.ready = rsp_ready_in[0]; + assign smem_rsp_if.ready = rsp_ready_in[1]; + + assign core_rsp_if.valid = {`NUM_THREADS{core_rsp_valid}} & core_rsp_valid_tmask; + + end else begin + + assign core_rsp_if.valid = cache_rsp_if.valid; + assign core_rsp_if.tag = cache_rsp_if.tag; + assign core_rsp_if.data = cache_rsp_if.data; + assign cache_rsp_if.ready = core_rsp_if.ready; + + end + +endmodule \ No newline at end of file diff --git a/hw/rtl/VX_tex_cache_arb.v b/hw/rtl/VX_tex_cache_arb.v new file mode 100644 index 00000000..4aa06b56 --- /dev/null +++ b/hw/rtl/VX_tex_cache_arb.v @@ -0,0 +1,136 @@ +`include "VX_define.vh" + +module VX_dcache_arb #( + parameter NUM_REQS = 1, + parameter LANES = 1, + parameter WORD_SIZE = 1, + parameter TAG_IN_WIDTH = 1, + 
parameter TAG_OUT_WIDTH = 1, + parameter LOG_NUM_REQS = `CLOG2(NUM_REQS) +) ( + input wire clk, + input wire reset, + + // input requests + input wire [NUM_REQS-1:0][LANES-1:0] req_valid_in, + input wire [NUM_REQS-1:0][LANES-1:0] req_rw_in, + input wire [NUM_REQS-1:0][LANES-1:0][WORD_SIZE-1:0] req_byteen_in, + input wire [NUM_REQS-1:0][LANES-1:0][`WORD_ADDR_WIDTH-1:0] req_addr_in, + input wire [NUM_REQS-1:0][LANES-1:0][`WORD_WIDTH-1:0] req_data_in, + input wire [NUM_REQS-1:0][LANES-1:0][TAG_IN_WIDTH-1:0] req_tag_in, + output wire [NUM_REQS-1:0][LANES-1:0] req_ready_in, + + // output request + output wire [LANES-1:0] req_valid_out, + output wire [LANES-1:0] req_rw_out, + output wire [LANES-1:0][WORD_SIZE-1:0] req_byteen_out, + output wire [LANES-1:0][`WORD_ADDR_WIDTH-1:0] req_addr_out, + output wire [LANES-1:0][`WORD_WIDTH-1:0] req_data_out, + output wire [LANES-1:0][TAG_OUT_WIDTH-1:0] req_tag_out, + input wire [LANES-1:0] req_ready_out, + + // input response + input wire [LANES-1:0] rsp_valid_in, + input wire [LANES-1:0][`WORD_WIDTH-1:0] rsp_data_in, + input wire [TAG_OUT_WIDTH-1:0] rsp_tag_in, + output wire rsp_ready_in, + + // output responses + output wire [NUM_REQS-1:0][LANES-1:0] rsp_valid_out, + output wire [NUM_REQS-1:0][LANES-1:0][`WORD_WIDTH-1:0] rsp_data_out, + output wire [NUM_REQS-1:0][TAG_IN_WIDTH-1:0] rsp_tag_out, + input wire [NUM_REQS-1:0] rsp_ready_out +); + localparam REQ_DATAW = LANES * (1 + TAG_IN_WIDTH + `WORD_ADDR_WIDTH + 1 + WORD_SIZE + `WORD_WIDTH); + localparam RSP_DATAW = LANES * `WORD_WIDTH + TAG_IN_WIDTH; + + if (NUM_REQS > 1) begin + + wire [NUM_REQS-1:0][REQ_DATAW-1:0] req_merged_data_in; + wire [NUM_REQS-1:0] req_valid_in_any; + + for (genvar i = 0; i < NUM_REQS; i++) begin + assign req_merged_data_in[i] = {req_valid_in[i], req_tag_in[i], req_addr_in[i], req_rw_in[i], req_byteen_in[i], req_data_in[i]}; + assign req_valid_in_any[i] = (| req_valid_in[i]); + end + + wire sel_valid; + wire [LOG_NUM_REQS-1:0] sel_idx; + wire [NUM_REQS-1:0] 
sel_1hot; + + wire sel_enable = (| req_ready_out); + + VX_rr_arbiter #( + .NUM_REQS(NUM_REQS), + .LOCK_ENABLE(1) + ) sel_arb ( + .clk (clk), + .reset (reset), + .requests (req_valid_in_any), + .enable (sel_enable), + .grant_valid (sel_valid), + .grant_index (sel_idx), + .grant_onehot (sel_1hot) + ); + + wire [LANES-1:0] req_valid_out_unqual; + wire [LANES-1:0][TAG_IN_WIDTH-1:0] req_tag_out_unqual; + + assign {req_valid_out_unqual, req_tag_out_unqual, req_addr_out, req_rw_out, req_byteen_out, req_data_out} = req_merged_data_in[sel_idx]; + + assign req_valid_out = req_valid_out_unqual & {LANES{sel_valid}}; + + for (genvar i = 0; i < LANES; i++) begin + assign req_tag_out[i] = {req_tag_out_unqual[i], sel_idx}; + end + + for (genvar i = 0; i < NUM_REQS; i++) begin + assign req_ready_in[i] = req_ready_out & {LANES{sel_1hot[i]}}; + end + + /////////////////////////////////////////////////////////////////////// + + wire [LOG_NUM_REQS-1:0] rsp_sel = rsp_tag_in [LOG_NUM_REQS-1:0]; + + wire [NUM_REQS-1:0][RSP_DATAW-1:0] rsp_merged_data_out; + for (genvar i = 0; i < NUM_REQS; i++) begin + assign {rsp_tag_out[i], rsp_data_out[i]} = rsp_merged_data_out[i]; + end + + VX_stream_demux #( + .NUM_REQS (NUM_REQS), + .DATAW (RSP_DATAW), + .BUFFERED (0) // literal: this module declares no BUFFERED_RSP parameter (0 is VX_mem_arb's default for it); NOTE(review): valid_in/valid_out below are per-lane vectors here but scalar/per-requester in VX_mem_arb - confirm against VX_stream_demux ports + ) rsp_demux ( + .clk (clk), + .reset (reset), + .sel (rsp_sel), + .valid_in (rsp_valid_in), + .data_in ({rsp_tag_in[LOG_NUM_REQS +: TAG_IN_WIDTH], rsp_data_in}), + .ready_in (rsp_ready_in), + .valid_out (rsp_valid_out), + .data_out (rsp_merged_data_out), + .ready_out (rsp_ready_out) + ); + + end else begin + + `UNUSED_VAR (clk) + `UNUSED_VAR (reset) + + assign req_valid_out = req_valid_in; + assign req_tag_out = req_tag_in; + assign req_addr_out = req_addr_in; + assign req_rw_out = req_rw_in; + assign req_byteen_out = req_byteen_in; + assign req_data_out = req_data_in; + assign req_ready_in = req_ready_out; + + assign rsp_valid_out = rsp_valid_in; + assign rsp_tag_out = rsp_tag_in; + assign rsp_data_out = rsp_data_in; + 
assign rsp_ready_in = rsp_ready_out; + + end + +endmodule \ No newline at end of file diff --git a/hw/rtl/VX_tex_lsu_arb.v b/hw/rtl/VX_tex_lsu_arb.v new file mode 100644 index 00000000..d35af222 --- /dev/null +++ b/hw/rtl/VX_tex_lsu_arb.v @@ -0,0 +1,128 @@ +`include "VX_define.vh" + +module VX_tex_lsu_arb #( + parameter NUM_REQS = 1, + parameter LANES = 1, + parameter WORD_SIZE = 1, + parameter TAG_IN_WIDTH = 1, + parameter TAG_OUT_WIDTH = 1, + parameter LOG_NUM_REQS = `CLOG2(NUM_REQS) +) ( + input wire clk, + input wire reset, + + // input requests + input wire [NUM_REQS-1:0][LANES-1:0] req_valid_in, + input wire [NUM_REQS-1:0][LANES-1:0] req_rw_in, + input wire [NUM_REQS-1:0][LANES-1:0][WORD_SIZE-1:0] req_byteen_in, + input wire [NUM_REQS-1:0][LANES-1:0][`WORD_ADDR_WIDTH-1:0] req_addr_in, + input wire [NUM_REQS-1:0][LANES-1:0][`WORD_WIDTH-1:0] req_data_in, + input wire [NUM_REQS-1:0][LANES-1:0][TAG_IN_WIDTH-1:0] req_tag_in, + output wire [NUM_REQS-1:0][LANES-1:0] req_ready_in, + + // output request + output wire [LANES-1:0] req_valid_out, + output wire [LANES-1:0] req_rw_out, + output wire [LANES-1:0][WORD_SIZE-1:0] req_byteen_out, + output wire [LANES-1:0][`WORD_ADDR_WIDTH-1:0] req_addr_out, + output wire [LANES-1:0][`WORD_WIDTH-1:0] req_data_out, + output wire [LANES-1:0][TAG_OUT_WIDTH-1:0] req_tag_out, + input wire [LANES-1:0] req_ready_out, + + // input response + input wire [LANES-1:0] rsp_valid_in, + input wire [LANES-1:0][`WORD_WIDTH-1:0] rsp_data_in, + input wire [TAG_OUT_WIDTH-1:0] rsp_tag_in, + output wire rsp_ready_in, + + // output responses + output wire [NUM_REQS-1:0][LANES-1:0] rsp_valid_out, + output wire [NUM_REQS-1:0][LANES-1:0][`WORD_WIDTH-1:0] rsp_data_out, + output wire [NUM_REQS-1:0][TAG_IN_WIDTH-1:0] rsp_tag_out, + input wire [NUM_REQS-1:0] rsp_ready_out +); + localparam REQ_DATAW = LANES * (1 + TAG_IN_WIDTH + `WORD_ADDR_WIDTH + 1 + WORD_SIZE + `WORD_WIDTH); + + if (NUM_REQS > 1) begin + + wire [NUM_REQS-1:0][REQ_DATAW-1:0] 
req_merged_data_in; + wire [NUM_REQS-1:0] req_valid_in_any; + + for (genvar i = 0; i < NUM_REQS; i++) begin + assign req_merged_data_in[i] = {req_valid_in[i], req_tag_in[i], req_addr_in[i], req_rw_in[i], req_byteen_in[i], req_data_in[i]}; + assign req_valid_in_any[i] = (| req_valid_in[i]); + end + + wire sel_valid; + wire [LOG_NUM_REQS-1:0] sel_idx; + wire [NUM_REQS-1:0] sel_1hot; + + wire sel_enable = (| req_ready_out); + + VX_rr_arbiter #( + .NUM_REQS(NUM_REQS), + .LOCK_ENABLE(1) + ) sel_arb ( + .clk (clk), + .reset (reset), + .requests (req_valid_in_any), + .enable (sel_enable), + .grant_valid (sel_valid), + .grant_index (sel_idx), + .grant_onehot (sel_1hot) + ); + + wire [LANES-1:0] req_valid_out_unqual; + wire [LANES-1:0][TAG_IN_WIDTH-1:0] req_tag_out_unqual; + + assign {req_valid_out_unqual, req_tag_out_unqual, req_addr_out, req_rw_out, req_byteen_out, req_data_out} = req_merged_data_in[sel_idx]; + + assign req_valid_out = req_valid_out_unqual & {LANES{sel_valid}}; + + for (genvar i = 0; i < LANES; i++) begin + assign req_tag_out[i] = {req_tag_out_unqual[i], sel_idx}; + end + + for (genvar i = 0; i < NUM_REQS; i++) begin + assign req_ready_in[i] = req_ready_out & {LANES{sel_1hot[i]}}; + end + + /////////////////////////////////////////////////////////////////////// + + wire [LOG_NUM_REQS-1:0] rsp_sel = rsp_tag_in[LOG_NUM_REQS-1:0]; + + reg [NUM_REQS-1:0][LANES-1:0] rsp_valid_out_unqual; + always @(*) begin + rsp_valid_out_unqual = '0; + rsp_valid_out_unqual[rsp_sel] = rsp_valid_in; + end + assign rsp_valid_out = rsp_valid_out_unqual; + + for (genvar i = 0; i < NUM_REQS; i++) begin + assign rsp_data_out[i] = rsp_data_in; + assign rsp_tag_out[i] = rsp_tag_in[LOG_NUM_REQS +: TAG_IN_WIDTH]; + end + + assign rsp_ready_in = rsp_ready_out[rsp_sel]; + + end else begin + + `UNUSED_VAR (clk) + `UNUSED_VAR (reset) + + assign req_valid_out = req_valid_in; + assign req_tag_out = req_tag_in; + assign req_addr_out = req_addr_in; + assign req_rw_out = req_rw_in; + assign 
req_byteen_out = req_byteen_in; + assign req_data_out = req_data_in; + assign req_ready_in = req_ready_out; + + assign rsp_valid_out = rsp_valid_in; + assign rsp_tag_out = rsp_tag_in; + assign rsp_data_out = rsp_data_in; + assign rsp_ready_in = rsp_ready_out; + + end + +endmodule \ No newline at end of file diff --git a/hw/rtl/interfaces/VX_dcache_core_req_if.v b/hw/rtl/interfaces/VX_dcache_core_req_if.v index 6fd95087..bfc640f2 100644 --- a/hw/rtl/interfaces/VX_dcache_core_req_if.v +++ b/hw/rtl/interfaces/VX_dcache_core_req_if.v @@ -4,18 +4,18 @@ `include "../cache/VX_cache_config.vh" interface VX_dcache_core_req_if #( - parameter NUM_REQS = 1, + parameter LANES = 1, parameter WORD_SIZE = 1, parameter CORE_TAG_WIDTH = 1 ) (); - wire [NUM_REQS-1:0] valid; - wire [NUM_REQS-1:0] rw; - wire [NUM_REQS-1:0][WORD_SIZE-1:0] byteen; - wire [NUM_REQS-1:0][`WORD_ADDR_WIDTH-1:0] addr; - wire [NUM_REQS-1:0][`WORD_WIDTH-1:0] data; - wire [NUM_REQS-1:0][CORE_TAG_WIDTH-1:0] tag; - wire [NUM_REQS-1:0] ready; + wire [LANES-1:0] valid; + wire [LANES-1:0] rw; + wire [LANES-1:0][WORD_SIZE-1:0] byteen; + wire [LANES-1:0][`WORD_ADDR_WIDTH-1:0] addr; + wire [LANES-1:0][`WORD_WIDTH-1:0] data; + wire [LANES-1:0][CORE_TAG_WIDTH-1:0] tag; + wire [LANES-1:0] ready; endinterface diff --git a/hw/rtl/interfaces/VX_dcache_core_rsp_if.v b/hw/rtl/interfaces/VX_dcache_core_rsp_if.v index 6732e455..bf6b3fc1 100644 --- a/hw/rtl/interfaces/VX_dcache_core_rsp_if.v +++ b/hw/rtl/interfaces/VX_dcache_core_rsp_if.v @@ -4,15 +4,15 @@ `include "../cache/VX_cache_config.vh" interface VX_dcache_core_rsp_if #( - parameter NUM_REQS = 1, + parameter LANES = 1, parameter WORD_SIZE = 1, parameter CORE_TAG_WIDTH = 1 ) (); - wire [NUM_REQS-1:0] valid; - wire [NUM_REQS-1:0][`WORD_WIDTH-1:0] data; - wire [CORE_TAG_WIDTH-1:0] tag; - wire ready; + wire [LANES-1:0] valid; + wire [LANES-1:0][`WORD_WIDTH-1:0]data; + wire [CORE_TAG_WIDTH-1:0] tag; + wire ready; endinterface diff --git a/hw/rtl/tex_unit/VX_tex_memory.v 
b/hw/rtl/tex_unit/VX_tex_memory.v index ad5a1601..dfeb4e39 100644 --- a/hw/rtl/tex_unit/VX_tex_memory.v +++ b/hw/rtl/tex_unit/VX_tex_memory.v @@ -198,7 +198,7 @@ module VX_tex_memory #( // send store commit - wire is_store_rsp = req_valid && ~req_wb && req_sent_all; + //wire is_store_rsp = req_valid && ~req_wb && req_sent_all; // assign st_commit_if.valid = is_store_rsp; // assign st_commit_if.wid = req_wid; diff --git a/hw/rtl/tex_unit/VX_tex_unit.v b/hw/rtl/tex_unit/VX_tex_unit.v index 3ac38183..fe46d4f2 100644 --- a/hw/rtl/tex_unit/VX_tex_unit.v +++ b/hw/rtl/tex_unit/VX_tex_unit.v @@ -12,12 +12,11 @@ module VX_tex_unit #( VX_tex_csr_if tex_csr_if, // Outputs - VX_tex_rsp_if tex_rsp_if + VX_tex_rsp_if tex_rsp_if, // Texture unit <-> Memory Unit VX_dcache_core_req_if dcache_req_if, VX_dcache_core_rsp_if dcache_rsp_if - ); `UNUSED_PARAM (CORE_ID) @@ -83,7 +82,8 @@ module VX_tex_unit #( // texture response `UNUSED_VAR (tex_req_if.u) `UNUSED_VAR (tex_req_if.v) - `UNUSED_VAR (tex_req_if.lod_t) + `UNUSED_VAR (tex_req_if.lod) + `UNUSED_VAR (tex_req_if.t) assign stall_in = stall_out; @@ -96,7 +96,7 @@ module VX_tex_unit #( assign rsp_data = {`NUM_THREADS{32'hFF0000FF}}; // dummy blue value - //point sampling texel address computation + /*//point sampling texel address computation for (genvar i = 0; i < `NUM_THREADS; i++) begin assign tex_req_if.u[i] = gpu_req_if.rs1_data[i]; assign tex_req_if.v[i] = gpu_req_if.rs2_data[i]; @@ -108,7 +108,7 @@ module VX_tex_unit #( .clk (clk), .reset (reset), ); - end + end*/ // fifo/wait buffer for fragments and also to dcache