diff --git a/driver/tests/tex_demo/common.h b/driver/tests/tex_demo/common.h index 9de20206..9ee0d230 100644 --- a/driver/tests/tex_demo/common.h +++ b/driver/tests/tex_demo/common.h @@ -4,11 +4,13 @@ struct kernel_arg_t { uint32_t num_tasks; uint32_t src_width; uint32_t src_height; + uint32_t src_stride; uint32_t src_pitch; + uint32_t src_ptr; uint32_t dst_width; uint32_t dst_height; + uint32_t dst_stride; uint32_t dst_pitch; - uint32_t src_ptr; uint32_t dst_ptr; }; diff --git a/driver/tests/tex_demo/demo b/driver/tests/tex_demo/demo index aee45cbc..9c1958de 100755 Binary files a/driver/tests/tex_demo/demo and b/driver/tests/tex_demo/demo differ diff --git a/driver/tests/tex_demo/demo.cpp b/driver/tests/tex_demo/demo.cpp index 4329c825..2c5db4cc 100644 --- a/driver/tests/tex_demo/demo.cpp +++ b/driver/tests/tex_demo/demo.cpp @@ -156,9 +156,11 @@ int main(int argc, char *argv[]) { kernel_arg.num_tasks = std::min(num_tasks, dst_height); kernel_arg.src_width = src_width; kernel_arg.src_height = src_height; + kernel_arg.src_stride = src_bpp; kernel_arg.src_pitch = src_bpp * src_width * src_height; kernel_arg.dst_width = dst_width; kernel_arg.dst_height = dst_height; + kernel_arg.dst_stride = dst_bpp; kernel_arg.dst_pitch = dst_bpp * dst_width * dst_height; kernel_arg.src_ptr = src_addr; kernel_arg.dst_ptr = dst_addr; diff --git a/driver/tests/tex_demo/kernel.bin b/driver/tests/tex_demo/kernel.bin index c992ca17..a3119cfc 100755 Binary files a/driver/tests/tex_demo/kernel.bin and b/driver/tests/tex_demo/kernel.bin differ diff --git a/driver/tests/tex_demo/kernel.c b/driver/tests/tex_demo/kernel.c index 89693f2c..cbcc591c 100644 --- a/driver/tests/tex_demo/kernel.c +++ b/driver/tests/tex_demo/kernel.c @@ -2,12 +2,16 @@ #include #include #include "common.h" + +uint32_t ilog2 (uint32_t value) { + return (uint32_t)(sizeof(uint32_t) * 8UL) - (uint32_t)__builtin_clzl((value << 1) - 1UL) - 1; +} struct tile_arg_t { - struct kernel_arg_t karg; - uint32_t tile_width; - uint32_t 
tile_height; - float deltaX; - float deltaY; + struct kernel_arg_t karg; + uint32_t tile_width; + uint32_t tile_height; + float deltaX; + float deltaY; }; void kernel_body(int task_id, void* arg) { @@ -36,15 +40,14 @@ int main() { struct kernel_arg_t* arg = (struct kernel_arg_t*)0x0; // configure texture unit - vx_csr_write(CSR_TEX0_ADDR, arg->src_ptr); - vx_csr_write(CSR_TEX0_FORMAT, 0); - vx_csr_write(CSR_TEX0_WIDTH, arg->src_width); - vx_csr_write(CSR_TEX0_HEIGHT, arg->src_height); - vx_csr_write(CSR_TEX0_PITCH, arg->src_pitch); - vx_csr_write(CSR_TEX0_WRAP_U, 0); - vx_csr_write(CSR_TEX0_WRAP_V, 0); - vx_csr_write(CSR_TEX0_MIN_FILTER, 0); - vx_csr_write(CSR_TEX0_MAX_FILTER, 0); + vx_csr_write(CSR_TEX_ADDR(0), arg->src_ptr); + vx_csr_write(CSR_TEX_FORMAT(0), 0); + vx_csr_write(CSR_TEX_WIDTH(0), ilog2(arg->src_width)); + vx_csr_write(CSR_TEX_HEIGHT(0), ilog2(arg->src_height)); + vx_csr_write(CSR_TEX_STRIDE(0), ilog2(arg->src_stride)); + vx_csr_write(CSR_TEX_WRAP_U(0), 0); + vx_csr_write(CSR_TEX_WRAP_V(0), 0); + vx_csr_write(CSR_TEX_FILTER(0), 0); struct tile_arg_t targ; targ.karg = *arg; diff --git a/driver/tests/tex_demo/kernel.dump b/driver/tests/tex_demo/kernel.dump index b9f88ac4..9760426f 100644 --- a/driver/tests/tex_demo/kernel.dump +++ b/driver/tests/tex_demo/kernel.dump @@ -29,7 +29,7 @@ Disassembly of section .init: Disassembly of section .text: 80000050
: -80000050: 01c02783 lw a5,28(zero) # 1c <__stack_usage+0x1c> +80000050: 01402783 lw a5,20(zero) # 14 <__stack_usage+0x14> 80000054: 00100073 ebreak 80000058 : diff --git a/driver/tests/tex_demo/kernel.elf b/driver/tests/tex_demo/kernel.elf index a928fbde..56270014 100755 Binary files a/driver/tests/tex_demo/kernel.elf and b/driver/tests/tex_demo/kernel.elf differ diff --git a/hw/rtl/VX_config.vh b/hw/rtl/VX_config.vh index 897e9f4b..8bc7a7f6 100644 --- a/hw/rtl/VX_config.vh +++ b/hw/rtl/VX_config.vh @@ -237,33 +237,21 @@ `define CSR_NW 12'hFC1 `define CSR_NC 12'hFC2 -////////// Texture Unit CSRs ///////////// +////////// Texture Units ////////////////////////////////////////////////////// -`define CSR_TEX_BEGIN 12'hFD0 +`define NUM_TEX_UNITS 2 -// Unit 1 -`define CSR_TEX0_ADDR `CSR_TEX_BEGIN -`define CSR_TEX0_FORMAT `CSR_TEX_BEGIN + 12'h1 -`define CSR_TEX0_WIDTH `CSR_TEX_BEGIN + 12'h2 -`define CSR_TEX0_HEIGHT `CSR_TEX_BEGIN + 12'h3 -`define CSR_TEX0_PITCH `CSR_TEX_BEGIN + 12'h4 -`define CSR_TEX0_WRAP_U `CSR_TEX_BEGIN + 12'h5 -`define CSR_TEX0_WRAP_V `CSR_TEX_BEGIN + 12'h6 -`define CSR_TEX0_MIN_FILTER `CSR_TEX_BEGIN + 12'h7 -`define CSR_TEX0_MAX_FILTER `CSR_TEX_BEGIN + 12'h8 +`define CSR_TEX_STATES 8 +`define CSR_TEX_BEGIN(x) (12'hFD0 + (x) * `CSR_TEX_STATES) -// Unit 2 -`define CSR_TEX1_ADDR `CSR_TEX_BEGIN + 12'h9 -`define CSR_TEX1_FORMAT `CSR_TEX_BEGIN + 12'hA -`define CSR_TEX1_WIDTH `CSR_TEX_BEGIN + 12'hB -`define CSR_TEX1_HEIGHT `CSR_TEX_BEGIN + 12'hC -`define CSR_TEX1_PITCH `CSR_TEX_BEGIN + 12'hD -`define CSR_TEX1_WRAP_U `CSR_TEX_BEGIN + 12'hE -`define CSR_TEX1_WRAP_V `CSR_TEX_BEGIN + 12'hF -`define CSR_TEX1_MIN_FILTER `CSR_TEX_BEGIN + 12'h10 -`define CSR_TEX1_MAX_FILTER `CSR_TEX_BEGIN + 12'h11 - -`define CSR_TEX_END `CSR_TEX1_MAX_FILTER +`define CSR_TEX_ADDR(x) (`CSR_TEX_BEGIN(x) + 12'h00) +`define CSR_TEX_FORMAT(x) (`CSR_TEX_BEGIN(x) + 12'h01) +`define CSR_TEX_WIDTH(x) (`CSR_TEX_BEGIN(x) + 12'h02) +`define CSR_TEX_HEIGHT(x) (`CSR_TEX_BEGIN(x) + 12'h03) 
+`define CSR_TEX_STRIDE(x) (`CSR_TEX_BEGIN(x) + 12'h04) +`define CSR_TEX_WRAP_U(x) (`CSR_TEX_BEGIN(x) + 12'h05) +`define CSR_TEX_WRAP_V(x) (`CSR_TEX_BEGIN(x) + 12'h06) +`define CSR_TEX_FILTER(x) (`CSR_TEX_BEGIN(x) + 12'h07) // Pipeline Queues //////////////////////////////////////////////////////////// diff --git a/hw/rtl/VX_csr_data.v b/hw/rtl/VX_csr_data.v index c9ac3357..a8302f0f 100644 --- a/hw/rtl/VX_csr_data.v +++ b/hw/rtl/VX_csr_data.v @@ -57,27 +57,26 @@ module VX_csr_data #( | fcsr[fpu_to_csr_if.write_wid][`FFG_BITS-1:0]; end - if (write_enable && (write_addr > `CSR_TEX_END || write_addr < `CSR_TEX_BEGIN)) begin + if (write_enable) begin case (write_addr) `CSR_FFLAGS: fcsr[write_wid][`FFG_BITS-1:0] <= write_data[`FFG_BITS-1:0]; `CSR_FRM: fcsr[write_wid][`FRM_BITS+`FFG_BITS-1:`FFG_BITS] <= write_data[`FRM_BITS-1:0]; `CSR_FCSR: fcsr[write_wid] <= write_data[`FFG_BITS+`FRM_BITS-1:0]; - `CSR_SATP: csr_satp <= write_data; - - `CSR_MSTATUS: csr_mstatus <= write_data; - `CSR_MEDELEG: csr_medeleg <= write_data; - `CSR_MIDELEG: csr_mideleg <= write_data; - `CSR_MIE: csr_mie <= write_data; - `CSR_MTVEC: csr_mtvec <= write_data; - - `CSR_MEPC: csr_mepc <= write_data; - + `CSR_SATP: csr_satp <= write_data; + `CSR_MSTATUS: csr_mstatus <= write_data; + `CSR_MEDELEG: csr_medeleg <= write_data; + `CSR_MIDELEG: csr_mideleg <= write_data; + `CSR_MIE: csr_mie <= write_data; + `CSR_MTVEC: csr_mtvec <= write_data; + `CSR_MEPC: csr_mepc <= write_data; `CSR_PMPCFG0: csr_pmpcfg[0] <= write_data; `CSR_PMPADDR0: csr_pmpaddr[0] <= write_data; default: begin - assert(~write_enable) else $error("%t: invalid CSR write address: %0h", $time, write_addr); + if (write_addr < `CSR_TEX_BEGIN(0) || write_addr >= `CSR_TEX_BEGIN(`NUM_TEX_UNITS)) begin // texture CSR window is [BEGIN(0), BEGIN(NUM_TEX_UNITS)); writes inside it are not flagged here + $error("%t: invalid CSR write address: %0h", $time, write_addr); + end end endcase end diff --git a/hw/rtl/VX_decode.v b/hw/rtl/VX_decode.v index d30ca443..2fc3ffcd 100644 --- a/hw/rtl/VX_decode.v +++ b/hw/rtl/VX_decode.v @@ -361,7 +361,8 @@ 
module VX_decode #( `ifdef EXT_TEX_ENABLE 3'h5: begin op_type = `OP_BITS'(`GPU_TEX); - use_rd = 1; + op_mod = instr[26:25]; + use_rd = 1; use_rs1 = 1; use_rs2 = 1; use_rs3 = 1; diff --git a/hw/rtl/VX_define.vh b/hw/rtl/VX_define.vh index 5c85cb5e..1bb0e1af 100644 --- a/hw/rtl/VX_define.vh +++ b/hw/rtl/VX_define.vh @@ -26,7 +26,7 @@ `define CSR_ADDR_BITS 12 -`define CSR_WIDTH 32 +`define CSR_WIDTH 12 /////////////////////////////////////////////////////////////////////////////// @@ -388,13 +388,17 @@ `define XDRAM_TAG_WIDTH (`DDRAM_TAG_WIDTH+`CLOG2(2)) ////////////////////////// Texture Unit Configurable Knobs ////////////////////////////// -`define NUM_TEX_UNITS 2 -`define MADDRW 8 -`define MAXWTW 8 -`define MAXHTW 8 -`define MAXFTW 8 -`define MAXFMW 8 -`define MAXAMW 8 + +`define NTEX_BITS `LOG2UP(`NUM_TEX_UNITS) + +`define TEX_ADDR_BITS 32 +`define TEX_FMT_BITS 3 +`define TEX_WRAP_BITS 2 +`define TEX_WIDTH_BITS 12 +`define TEX_HEIGHT_BITS 12 +`define TEX_STRIDE_BITS 12 +`define TEX_FILTER_BITS 1 + //////////////////////////////////////////////////////////////////////////////////////// `include "VX_types.vh" diff --git a/hw/rtl/VX_gpu_unit.v b/hw/rtl/VX_gpu_unit.v index dcbdab68..86d274e3 100644 --- a/hw/rtl/VX_gpu_unit.v +++ b/hw/rtl/VX_gpu_unit.v @@ -111,13 +111,11 @@ module VX_gpu_unit #( assign tex_req_if.PC = gpu_req_if.PC; assign tex_req_if.rd = gpu_req_if.rd; assign tex_req_if.wb = gpu_req_if.wb; - - for (genvar i = 0; i < `NUM_THREADS; i++) begin - assign tex_req_if.u[i] = gpu_req_if.rs1_data[i]; - assign tex_req_if.v[i] = gpu_req_if.rs2_data[i]; - assign tex_req_if.lod[i] = gpu_req_if.rs3_data[i][31:8]; - assign tex_req_if.t[i] = gpu_req_if.rs3_data[i][7:0]; - end + + assign tex_req_if.unit = gpu_req_if.op_mod[`NTEX_BITS-1:0]; + assign tex_req_if.u = gpu_req_if.rs1_data; + assign tex_req_if.v = gpu_req_if.rs2_data; + assign tex_req_if.lod = gpu_req_if.rs3_data; VX_tex_unit #( .CORE_ID(CORE_ID) @@ -159,6 +157,7 @@ module VX_gpu_unit #( assign rsp_wb = 0; 
assign rsp_data = warp_ctl_data; + `UNUSED_VAR (gpu_req_if.op_mod) `UNUSED_VAR (gpu_req_if.rs2_data) `UNUSED_VAR (gpu_req_if.rs3_data) `UNUSED_VAR (gpu_req_if.wb) diff --git a/hw/rtl/VX_instr_demux.v b/hw/rtl/VX_instr_demux.v index 68b5123d..d8f7e7bb 100644 --- a/hw/rtl/VX_instr_demux.v +++ b/hw/rtl/VX_instr_demux.v @@ -111,14 +111,14 @@ module VX_instr_demux ( wire gpu_req_valid = execute_if.valid && (execute_if.ex_type == `EX_GPU); VX_skid_buffer #( - .DATAW (`NW_BITS + `NUM_THREADS + 32 + 32 + `GPU_BITS + `NR_BITS + 1 + (3 * `NUM_THREADS * 32)) //update number of bits + .DATAW (`NW_BITS + `NUM_THREADS + 32 + 32 + `GPU_BITS + `MOD_BITS + `NR_BITS + 1 + (3 * `NUM_THREADS * 32)) //update number of bits ) gpu_buffer ( .clk (clk), .reset (reset), .valid_in (gpu_req_valid), .ready_in (gpu_req_ready), - .data_in ({execute_if.wid, execute_if.tmask, execute_if.PC, next_PC, `GPU_OP(execute_if.op_type), execute_if.rd, execute_if.wb, gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data, gpr_rsp_if.rs3_data}), - .data_out ({gpu_req_if.wid, gpu_req_if.tmask, gpu_req_if.PC, gpu_req_if.next_PC, gpu_req_if.op_type, gpu_req_if.rd, gpu_req_if.wb, gpu_req_if.rs1_data, gpu_req_if.rs2_data, gpu_req_if.rs3_data}), + .data_in ({execute_if.wid, execute_if.tmask, execute_if.PC, next_PC, `GPU_OP(execute_if.op_type), execute_if.op_mod, execute_if.rd, execute_if.wb, gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data, gpr_rsp_if.rs3_data}), + .data_out ({gpu_req_if.wid, gpu_req_if.tmask, gpu_req_if.PC, gpu_req_if.next_PC, gpu_req_if.op_type, gpu_req_if.op_mod, gpu_req_if.rd, gpu_req_if.wb, gpu_req_if.rs1_data, gpu_req_if.rs2_data, gpu_req_if.rs3_data}), .valid_out (gpu_req_if.valid), .ready_out (gpu_req_if.ready) ); diff --git a/hw/rtl/interfaces/VX_tex_req_if.v b/hw/rtl/interfaces/VX_tex_req_if.v index fffd308f..48555145 100644 --- a/hw/rtl/interfaces/VX_tex_req_if.v +++ b/hw/rtl/interfaces/VX_tex_req_if.v @@ -9,12 +9,14 @@ interface VX_tex_req_if (); wire [`NW_BITS-1:0] wid; wire [`NUM_THREADS-1:0] tmask; wire 
[31:0] PC; - wire [`NR_BITS-1:0] rd; + wire [`NR_BITS-1:0] rd; wire wb; + + wire [`NUM_THREADS-1:0][`NTEX_BITS-1:0] unit; wire [`NUM_THREADS-1:0][31:0] u; wire [`NUM_THREADS-1:0][31:0] v; - wire [`NUM_THREADS-1:0][23:0] lod; - wire [`NUM_THREADS-1:0][7:0] t; + wire [`NUM_THREADS-1:0][31:0] lod; + wire ready; endinterface diff --git a/hw/rtl/tex_unit/VX_tex_addr_gen.v b/hw/rtl/tex_unit/VX_tex_addr_gen.v new file mode 100644 index 00000000..62ab1431 --- /dev/null +++ b/hw/rtl/tex_unit/VX_tex_addr_gen.v @@ -0,0 +1,70 @@ +`include "VX_define.vh" + +module VX_tex_addr_gen #( + parameter CORE_ID = 0, + parameter REQ_TAG_WIDTH = 1, + parameter FRAC_BITS = 20, + parameter INT_BITS = 32 - FRAC_BITS +) ( + input wire clk, + input wire reset, + + // handshake + + input wire valid_in, + output wire ready_in, + + // inputs + + input wire [REQ_TAG_WIDTH-1:0] req_tag, + input wire [`TEX_FILTER_BITS-1:0] filter, + input wire [`TEX_WRAP_BITS-1:0] wrap_u, + input wire [`TEX_WRAP_BITS-1:0] wrap_v, + + input wire [`TEX_ADDR_BITS-1:0] base_addr, + input wire [1:0] log2_stride, + input wire [`TEX_WIDTH_BITS-1:0] log2_width, + input wire [`TEX_HEIGHT_BITS-1:0] log2_height, + input wire [3:0] lod, + + input wire [31:0] coord_u, + input wire [31:0] coord_v, + + // outputs + + output wire [3:0] mem_req_valid, + output wire [REQ_TAG_WIDTH-1:0] mem_req_tag, + output wire [3:0][31:0] mem_req_addr, + input wire mem_req_ready +); + + `UNUSED_VAR (filter) + `UNUSED_VAR (lod) + + wire [31:0] u, v; // NOTE(review): u/v have no driver in this module yet + wire [31:0] x_offset, y_offset; + wire [31:0] addr0; + + // addressing mode + + assign x_offset = u >> (5'(FRAC_BITS) - log2_width); + assign y_offset = v >> (5'(FRAC_BITS) - log2_height); + assign addr0 = base_addr + ((x_offset + (y_offset << log2_width)) << log2_stride); // '+' binds tighter than '<<': scale only the texel index, not the base address + + wire [3:0] req_valids = 4'(valid_in); + wire [3:0][31:0] req_address = {4{addr0}}; + + VX_pipe_register #( + .DATAW (4 + 4 * 32 + REQ_TAG_WIDTH), + .RESETW (1) + ) pipe_reg ( + .clk (clk), + .reset (reset), + .enable 
(~stall_out), + .data_in ({req_valids, req_address, req_tag}), + .data_out ({mem_req_valid, mem_req_addr, mem_req_tag}) + ); + + assign ready_in = ~stall_out; + +endmodule \ No newline at end of file diff --git a/hw/rtl/tex_unit/VX_tex_clamp.v b/hw/rtl/tex_unit/VX_tex_clamp.v new file mode 100644 index 00000000..84efd086 --- /dev/null +++ b/hw/rtl/tex_unit/VX_tex_clamp.v @@ -0,0 +1,22 @@ +`include "VX_define.vh" + +module VX_tex_addr_gen #( + parameter FRAC_BITS = 20, + parameter INT_BITS = 32 - FRAC_BITS +) ( + input wire [`TEX_WRAP_BITS-1:0] wrap_i; + input wire [31:0] coord_i, + input wire [31:0] coord_o +) + + always @(*) begin + case (wrap_i) + `ALU_AND: msc_result[i] = alu_in1[i] & alu_in2_imm[i]; + `ALU_OR: msc_result[i] = alu_in1[i] | alu_in2_imm[i]; + `ALU_XOR: msc_result[i] = alu_in1[i] ^ alu_in2_imm[i]; + //`ALU_SLL, + default: msc_result[i] = alu_in1[i] << alu_in2_imm[i][4:0]; + endcase + end + +endmodule \ No newline at end of file diff --git a/hw/rtl/tex_unit/VX_tex_format.v b/hw/rtl/tex_unit/VX_tex_format.v new file mode 100644 index 00000000..663cf8af --- /dev/null +++ b/hw/rtl/tex_unit/VX_tex_format.v @@ -0,0 +1,8 @@ +module VX_tex_format #( + parameter CORE_ID = 0 +) ( + // TODO +) + // TODO + +endmodule \ No newline at end of file diff --git a/hw/rtl/tex_unit/VX_tex_memory.v b/hw/rtl/tex_unit/VX_tex_memory.v index dfeb4e39..4b714565 100644 --- a/hw/rtl/tex_unit/VX_tex_memory.v +++ b/hw/rtl/tex_unit/VX_tex_memory.v @@ -1,23 +1,29 @@ `include "VX_define.vh" module VX_tex_memory #( - parameter CORE_ID = 0 + parameter CORE_ID = 0, + parameter TAG_IN_WIDTH = 1 ) ( `SCOPE_IO_VX_lsu_unit input wire clk, input wire reset, - // Dcache interface + // memory interface VX_dcache_core_req_if dcache_req_if, VX_dcache_core_rsp_if dcache_rsp_if, // inputs - VX_lsu_req_if lsu_req_if, + input wire [3:0] req_valid, + input wire [3:0][31:0] req_addr, + input wire [TAG_IN_WIDTH-1:0] req_tag, + output wire req_ready, // outputs - VX_commit_if ld_commit_if - // 
VX_commit_if st_commit_if + output wire rsp_valid, + output wire [3:0][31:0] rsp_data, + output wire [TAG_IN_WIDTH-1:0] rsp_tag, + input wire rsp_ready ); `UNUSED_PARAM (CORE_ID) diff --git a/hw/rtl/tex_unit/VX_tex_sampler.v b/hw/rtl/tex_unit/VX_tex_sampler.v new file mode 100644 index 00000000..06e2fde6 --- /dev/null +++ b/hw/rtl/tex_unit/VX_tex_sampler.v @@ -0,0 +1,12 @@ +`include "VX_define.vh" + +module VX_tex_sampler #( + parameter CORE_ID = 0 +) ( + input wire clk, + input wire reset +); + + // TODO + +endmodule \ No newline at end of file diff --git a/hw/rtl/tex_unit/VX_tex_unit.v b/hw/rtl/tex_unit/VX_tex_unit.v index 8131171b..543b3f31 100644 --- a/hw/rtl/tex_unit/VX_tex_unit.v +++ b/hw/rtl/tex_unit/VX_tex_unit.v @@ -5,20 +5,22 @@ module VX_tex_unit #( parameter CORE_ID = 0 ) ( input wire clk, - input wire reset, + input wire reset, + + // Texture unit <-> Memory Unit + VX_dcache_core_req_if dcache_req_if, + VX_dcache_core_rsp_if dcache_rsp_if, // Inputs VX_tex_req_if tex_req_if, VX_tex_csr_if tex_csr_if, // Outputs - VX_tex_rsp_if tex_rsp_if, - - // Texture unit <-> Memory Unit - VX_dcache_core_req_if dcache_req_if, - VX_dcache_core_rsp_if dcache_rsp_if + VX_tex_rsp_if tex_rsp_if ); + localparam MEM_REQ_TAGW = `NW_BITS + 32 + 1 + `NR_BITS + `NTEX_BITS; + `UNUSED_PARAM (CORE_ID) `UNUSED_VAR (reset) @@ -31,104 +33,131 @@ module VX_tex_unit #( wire [`NUM_THREADS-1:0][31:0] rsp_data; wire stall_in, stall_out; - reg [`CSR_WIDTH-1:0] tex_addr [`NUM_TEX_UNITS-1: 0]; - reg [`CSR_WIDTH-1:0] tex_format [`NUM_TEX_UNITS-1: 0]; - reg [`CSR_WIDTH-1:0] tex_width [`NUM_TEX_UNITS-1: 0]; - reg [`CSR_WIDTH-1:0] tex_height [`NUM_TEX_UNITS-1: 0]; - reg [`CSR_WIDTH-1:0] tex_stride [`NUM_TEX_UNITS-1: 0]; - reg [`CSR_WIDTH-1:0] tex_wrap_u [`NUM_TEX_UNITS-1: 0]; - reg [`CSR_WIDTH-1:0] tex_wrap_v [`NUM_TEX_UNITS-1: 0]; - reg [`CSR_WIDTH-1:0] tex_min_filter [`NUM_TEX_UNITS-1: 0]; - reg [`CSR_WIDTH-1:0] tex_max_filter [`NUM_TEX_UNITS-1: 0]; + reg [`TEX_ADDR_BITS-1:0] tex_addr 
[`NUM_TEX_UNITS-1: 0]; + reg [`TEX_FMT_BITS-1:0] tex_format [`NUM_TEX_UNITS-1: 0]; + reg [`TEX_WIDTH_BITS-1:0] tex_width [`NUM_TEX_UNITS-1: 0]; + reg [`TEX_HEIGHT_BITS-1:0] tex_height [`NUM_TEX_UNITS-1: 0]; + reg [`TEX_STRIDE_BITS-1:0] tex_stride [`NUM_TEX_UNITS-1: 0]; + reg [`TEX_WRAP_BITS-1:0] tex_wrap_u [`NUM_TEX_UNITS-1: 0]; + reg [`TEX_WRAP_BITS-1:0] tex_wrap_v [`NUM_TEX_UNITS-1: 0]; + reg [`TEX_FILTER_BITS-1:0] tex_filter [`NUM_TEX_UNITS-1: 0]; - `UNUSED_VAR (tex_format) - `UNUSED_VAR (tex_stride) - `UNUSED_VAR (tex_wrap_u) - `UNUSED_VAR (tex_wrap_v) - `UNUSED_VAR (tex_min_filter) - `UNUSED_VAR (tex_max_filter) + // CSRs programming - //tex csr programming, need to make make consistent with `NUM_TEX_UNITS - always @(posedge clk ) begin - if (tex_csr_if.write_enable) begin - case (tex_csr_if.write_addr) - `CSR_TEX0_ADDR : tex_addr[0] <= tex_csr_if.write_data; - `CSR_TEX0_FORMAT : tex_format[0] <= tex_csr_if.write_data; - `CSR_TEX0_WIDTH : tex_width[0] <= tex_csr_if.write_data; - `CSR_TEX0_HEIGHT : tex_height[0] <= tex_csr_if.write_data; - `CSR_TEX0_PITCH : tex_stride[0] <= tex_csr_if.write_data; - `CSR_TEX0_WRAP_U : tex_wrap_u[0] <= tex_csr_if.write_data; - `CSR_TEX0_WRAP_V : tex_wrap_v[0] <= tex_csr_if.write_data; - `CSR_TEX0_MIN_FILTER : tex_min_filter[0] <= tex_csr_if.write_data; - `CSR_TEX0_MAX_FILTER : tex_max_filter[0] <= tex_csr_if.write_data; - - `CSR_TEX1_ADDR : tex_addr[1] <= tex_csr_if.write_data; - `CSR_TEX1_FORMAT : tex_format[1] <= tex_csr_if.write_data; - `CSR_TEX1_WIDTH : tex_width[1] <= tex_csr_if.write_data; - `CSR_TEX1_HEIGHT : tex_height[1] <= tex_csr_if.write_data; - `CSR_TEX1_PITCH : tex_stride[1] <= tex_csr_if.write_data; - `CSR_TEX1_WRAP_U : tex_wrap_u[1] <= tex_csr_if.write_data; - `CSR_TEX1_WRAP_V : tex_wrap_v[1] <= tex_csr_if.write_data; - `CSR_TEX1_MIN_FILTER : tex_min_filter[1] <= tex_csr_if.write_data; - `CSR_TEX1_MAX_FILTER : tex_max_filter[1] <= tex_csr_if.write_data; - default:; - endcase + for (genvar i = 0; i < 
`NUM_TEX_UNITS; ++i) begin + always @(posedge clk ) begin + if (reset) begin + tex_addr[i] <= 0; + tex_format[i] <= 0; + tex_width[i] <= 0; + tex_height[i] <= 0; + tex_stride[i] <= 0; + tex_wrap_u[i] <= 0; + tex_wrap_v[i] <= 0; + tex_filter[i] <= 0; + end else begin + if (tex_csr_if.write_enable) begin + case (tex_csr_if.write_addr) + `CSR_TEX_ADDR(i) : tex_addr[i] <= tex_csr_if.write_data; + `CSR_TEX_FORMAT(i) : tex_format[i] <= tex_csr_if.write_data; + `CSR_TEX_WIDTH(i) : tex_width[i] <= tex_csr_if.write_data; + `CSR_TEX_HEIGHT(i) : tex_height[i] <= tex_csr_if.write_data; + `CSR_TEX_STRIDE(i) : tex_stride[i] <= tex_csr_if.write_data; + `CSR_TEX_WRAP_U(i) : tex_wrap_u[i] <= tex_csr_if.write_data; + `CSR_TEX_WRAP_V(i) : tex_wrap_v[i] <= tex_csr_if.write_data; + `CSR_TEX_FILTER(i) : tex_filter[i] <= tex_csr_if.write_data; + default: + assert(tex_csr_if.write_addr >= `CSR_TEX_BEGIN(0) + && tex_csr_if.write_addr < `CSR_TEX_BEGIN(`NUM_TEX_UNITS)); + endcase + end + end end end - // texture response - `UNUSED_VAR (tex_req_if.lod) + // address generation - // texture unit <-> dcache - VX_lsu_req_if lsu_req_if(); - VX_commit_if ld_commit_if(); + wire [3:0] mem_req_valid; + wire [3:0][31:0] mem_req_addr; + wire [MEM_REQ_TAGW-1:0] mem_req_tag; + wire mem_req_ready; - VX_tex_memory #( - .CORE_ID(CORE_ID) - ) tex_memory ( + wire mem_rsp_valid; + wire [3:0][31:0] mem_rsp_data; + wire [MEM_REQ_TAGW-1:0] mem_rsp_tag; + wire mem_rsp_ready; + + VX_tex_addr_gen #( + .FRAC_BITS(20) + ) tex_addr_gen ( .clk (clk), .reset (reset), - .dcache_req_if (dcache_req_if), - .dcache_rsp_if (dcache_rsp_if), - .lsu_req_if (lsu_req_if), - .ld_commit_if (ld_commit_if) + + .valid_in (tex_req_if.valid), + .ready_in (tex_req_if.ready), + + .req_tag ({tex_req_if.wid, tex_req_if.PC, tex_req_if.rd, tex_req_if.wb}), + .filter (tex_filter[tex_req_if.unit]), + .wrap_u (tex_wrap_u[tex_req_if.unit]), + .wrap_v (tex_wrap_v[tex_req_if.unit]), + + .base_addr (tex_addr[tex_req_if.unit]), + .log2_stride 
(tex_stride[tex_req_if.unit]), + .log2_width (tex_width[tex_req_if.unit]), + .log2_height (tex_height[tex_req_if.unit]), + + .coord_u (tex_req_if.u), + .coord_v (tex_req_if.v), + .lod (tex_req_if.lod), + + .mem_req_valid (mem_req_valid), + .mem_req_tag (mem_req_tag), + .mem_req_addr (mem_req_addr), + .mem_req_ready (mem_req_ready) ); - //point sampling - texel address computation - wire [`NUM_THREADS-1:0] pt_addr_valid; - wire [`NUM_THREADS-1:0] pt_addr_ready; + // retrieve texel values from memory + + VX_tex_memory #( + .CORE_ID (CORE_ID), + .TAG_IN_WIDTH (MEM_REQ_TAGW) + ) tex_memory ( + .clk (clk), + .reset (reset), - for (genvar i = 0; i < `NUM_THREADS; i++) begin - wire [`CSR_WIDTH-1:0] tex_addr_select; - wire [`CSR_WIDTH-1:0] tex_width_select; - wire [`CSR_WIDTH-1:0] tex_height_select; - - assign tex_addr_select = (tex_req_if.t[i] == 'b1) ? tex_addr[1] : tex_addr[0]; - assign tex_width_select = (tex_req_if.t[i] == 'b1) ? tex_width[1] : tex_width[0]; - assign tex_height_select = (tex_req_if.t[i] == 'b1) ? 
tex_height[1] : tex_height[0]; - - VX_tex_pt_addr #( - .FRAC_BITS(28) - ) tex_pt_addr ( - .clk (clk), - .reset (reset), + // memory interface + .dcache_req_if (dcache_req_if), + .dcache_rsp_if (dcache_rsp_if), - .valid_in (tex_req_if.valid), - .ready_out (pt_addr_ready[i]), + // inputs + .req_valid (mem_req_valid), + .req_addr (mem_req_addr), + .req_tag (mem_req_tag), + .req_ready (mem_req_ready), - .tex_addr (tex_addr_select), - .tex_width (tex_width_select), - .tex_height (tex_height_select), + // outputs + .rsp_valid (mem_rsp_valid), + .rsp_data (mem_rsp_data), + .rsp_tag (mem_rsp_tag), + .rsp_ready (mem_rsp_ready) + ); - .tex_u (tex_req_if.u[i]), - .tex_v (tex_req_if.v[i]), + // apply sampler - .pt_addr (lsu_req_if.base_addr[i]), + VX_tex_sampler #( + .CORE_ID (CORE_ID) + ) tex_sampler ( + .clk (clk), + .reset (reset) - .valid_out (pt_addr_valid[i]), - .ready_in (lsu_req_if.ready) - ); - end + // inputs + //.valid_in (mem_rsp_valid), + //.texel (mem_rsp_data), + //.req_wid (mem_rsp_tag), + //.req_PC (mem_rsp_tag), + //.format (mem_rsp_tag), + //.ready_in (mem_rsp_ready), + ); assign tex_req_if.ready = (& pt_addr_ready); @@ -176,8 +205,8 @@ module VX_tex_unit #( `ifdef DBG_PRINT_TEX always @(posedge clk) begin if (tex_csr_if.write_enable - && (tex_csr_if.write_addr <= `CSR_TEX_END - || tex_csr_if.write_addr >= `CSR_TEX_BEGIN)) begin + && (tex_csr_if.write_addr >= `CSR_TEX_BEGIN(0) + && tex_csr_if.write_addr < `CSR_TEX_BEGIN(`NUM_TEX_UNITS))) begin $display("%t: core%0d-tex_csr: csr_tex0_addr, csr_data=%0h", $time, CORE_ID, tex_addr[0]); $display("%t: core%0d-tex_csr: csr_tex0_format, csr_data=%0h", $time, CORE_ID, tex_format[0]); $display("%t: core%0d-tex_csr: csr_tex0_width, csr_data=%0h", $time, CORE_ID, tex_width[0]); diff --git a/hw/rtl/tex_unit/VX_tex_wrap.v b/hw/rtl/tex_unit/VX_tex_wrap.v new file mode 100644 index 00000000..01458269 --- /dev/null +++ b/hw/rtl/tex_unit/VX_tex_wrap.v @@ -0,0 +1,23 @@ +`include "VX_define.vh" + +module VX_tex_wrap #( + parameter 
CORE_ID = 0, + parameter FRAC_BITS = 20, + parameter INT_BITS = 32 - FRAC_BITS +) ( + input wire [`TEX_WRAP_BITS-1:0] wrap_i; + input wire [31:0] coord_i, + input wire [31:0] coord_o +) + + always @(*) begin + case (wrap_i) + `ALU_AND: msc_result[i] = alu_in1[i] & alu_in2_imm[i]; + `ALU_OR: msc_result[i] = alu_in1[i] | alu_in2_imm[i]; + `ALU_XOR: msc_result[i] = alu_in1[i] ^ alu_in2_imm[i]; + //`ALU_SLL, + default: msc_result[i] = alu_in1[i] << alu_in2_imm[i][4:0]; + endcase + end + +endmodule \ No newline at end of file diff --git a/runtime/include/vx_intrinsics.h b/runtime/include/vx_intrinsics.h index 67644ff0..9fad2c92 100644 --- a/runtime/include/vx_intrinsics.h +++ b/runtime/include/vx_intrinsics.h @@ -14,41 +14,41 @@ extern "C" { #endif #define vx_csr_swap(csr, val) ({ \ - unsigned long __v = (unsigned long)(val); \ + unsigned __v = (unsigned )(val); \ __asm__ __volatile__ ("csrrw %0, " __ASM_STR(csr) ", %1" : "=r" (__v) : "rK" (__v) : "memory"); \ __v; \ }) #define vx_csr_read(csr) ({ \ - register unsigned long __v; \ + register unsigned __v; \ __asm__ __volatile__ ("csrr %0, " __ASM_STR(csr) : "=r" (__v) :: "memory"); \ __v; \ }) #define vx_csr_write(csr, val) ({ \ - unsigned long __v = (unsigned long)(val); \ + unsigned __v = (unsigned )(val); \ __asm__ __volatile__ ("csrw " __ASM_STR(csr) ", %0" :: "rK" (__v) : "memory"); \ }) #define vx_csr_read_set(csr, val) ({ \ - unsigned long __v = (unsigned long)(val); \ + unsigned __v = (unsigned )(val); \ __asm__ __volatile__ ("csrrs %0, " __ASM_STR(csr) ", %1" : "=r" (__v) : "rK" (__v) : "memory"); \ __v; \ }) #define vx_csr_set(csr, val) ({ \ - unsigned long __v = (unsigned long)(val); \ + unsigned __v = (unsigned )(val); \ __asm__ __volatile__ ("csrs " __ASM_STR(csr) ", %0" :: "rK" (__v) : "memory"); \ }) #define vx_csr_read_clear(csr, val) ({ \ - unsigned long __v = (unsigned long)(val); \ + unsigned __v = (unsigned )(val); \ __asm__ __volatile__ ("csrrc %0, " __ASM_STR(csr) ", %1" : "=r" (__v) : "rK" (__v) : 
"memory"); \ __v; \ }) #define vx_csr_clear(csr, val) ({ \ - unsigned long __v = (unsigned long)(val); \ + unsigned __v = (unsigned )(val); \ __asm__ __volatile__ ("csrc " __ASM_STR(csr) ", %0" :: "rK" (__v) : "memory"); \ }) @@ -77,6 +77,13 @@ inline void vx_barrier(unsigned barried_id, unsigned num_warps) { asm volatile (".insn s 0x6b, 4, %1, 0cd (%0)" :: "r"(barried_id), "r"(num_warps)); } +// Texture load: `unit` is pasted into the instruction text via __ASM_STR, so it must be a compile-time constant; the value written to rd is returned +#define vx_tex_ld(unit, u, v, lod) ({ \ + register unsigned result; \ + asm volatile (".insn r4 0x6b, 5, " __ASM_STR(unit) ", %0, %1, %2, %3" : "=r"(result) : "r"(u), "r"(v), "r"(lod)); \ + result; \ +}) + // Return active warp's thread id inline int vx_thread_id() { int result; @@ -154,14 +161,6 @@ inline int vx_num_instrs() { return result; } -// Texture load instruction -inline int vx_tex_ld(unsigned t, unsigned u, unsigned v, unsigned lod_t) { - lod_t = (lod_t << 8) | t; - int result; - asm volatile (".insn r4 0x6b, 5, 1, %0, %1, %2, %3" :: "r"(result), "r"(u), "r"(v), "r"(lod_t)); - return result; -} - #define __if(b) vx_split(b); \ if (b)