diff --git a/driver/tests/tex_demo/kernel.bin b/driver/tests/tex_demo/kernel.bin index 84e59dd6..d740e548 100755 Binary files a/driver/tests/tex_demo/kernel.bin and b/driver/tests/tex_demo/kernel.bin differ diff --git a/driver/tests/tex_demo/kernel.c b/driver/tests/tex_demo/kernel.c index 89f7cafb..87a2159f 100644 --- a/driver/tests/tex_demo/kernel.c +++ b/driver/tests/tex_demo/kernel.c @@ -27,7 +27,7 @@ void kernel_body(int task_id, void* arg) { for (uint32_t x = 0; x < _arg->tile_width; ++x) { int32_t u = (int32_t)(fu * (1<<20)); int32_t v = (int32_t)(fv * (1<<20)); - dst_ptr[x] = vx_tex(0, u, v, 0); + dst_ptr[x] = vx_tex(0, u, v, 0x0); fu += _arg->deltaX; } dst_ptr += _arg->karg.dst_pitch; diff --git a/driver/tests/tex_demo/kernel.dump b/driver/tests/tex_demo/kernel.dump index e5da1dbf..f1179c11 100644 --- a/driver/tests/tex_demo/kernel.dump +++ b/driver/tests/tex_demo/kernel.dump @@ -36,23 +36,23 @@ Disassembly of section .text: 80000060: 04912223 sw s1,68(sp) 80000064: 01442783 lw a5,20(s0) # 7ffff014 <__stack_size+0x7fffec14> 80000068: fd079073 csrw 0xfd0,a5 -8000006c: fd105073 csrwi 0xfd1,0 +8000006c: fd405073 csrwi 0xfd4,0 80000070: 00442503 lw a0,4(s0) 80000074: 01f00493 li s1,31 80000078: 00151513 slli a0,a0,0x1 8000007c: fff50513 addi a0,a0,-1 80000080: 400000ef jal ra,80000480 <__clzsi2> 80000084: 40a48533 sub a0,s1,a0 -80000088: fd251073 csrw 0xfd2,a0 +80000088: fd551073 csrw 0xfd5,a0 8000008c: 00842503 lw a0,8(s0) 80000090: 00151513 slli a0,a0,0x1 80000094: fff50513 addi a0,a0,-1 80000098: 3e8000ef jal ra,80000480 <__clzsi2> 8000009c: 40a484b3 sub s1,s1,a0 -800000a0: fd349073 csrw 0xfd3,s1 -800000a4: fd405073 csrwi 0xfd4,0 -800000a8: fd505073 csrwi 0xfd5,0 -800000ac: fd605073 csrwi 0xfd6,0 +800000a0: fd649073 csrw 0xfd6,s1 +800000a4: fd105073 csrwi 0xfd1,0 +800000a8: fd205073 csrwi 0xfd2,0 +800000ac: fd305073 csrwi 0xfd3,0 800000b0: 01442503 lw a0,20(s0) 800000b4: 01842583 lw a1,24(s0) 800000b8: 01c42603 lw a2,28(s0) diff --git a/driver/tests/tex_demo/kernel.elf b/driver/tests/tex_demo/kernel.elf index ee946379..ea2de947 100755 Binary files a/driver/tests/tex_demo/kernel.elf and b/driver/tests/tex_demo/kernel.elf differ diff --git a/hw/rtl/VX_csr_data.v b/hw/rtl/VX_csr_data.v index 83ec0608..dc9dc7e8 100644 --- a/hw/rtl/VX_csr_data.v +++ b/hw/rtl/VX_csr_data.v @@ -204,7 +204,7 @@ module VX_csr_data #( `CSR_MIMPID : read_data_r = `IMPLEMENTATION_ID; default: begin - assert (~read_enable || read_addr >= `CSR_TEX_BEGIN(0) && read_addr < `CSR_TEX_BEGIN(`CSR_TEX_STATES)) + assert (~read_enable || (read_addr >= `CSR_TEX_BEGIN(0) && read_addr < `CSR_TEX_BEGIN(`CSR_TEX_STATES))) else $error("%t: invalid CSR read address: %0h", $time, read_addr); end endcase diff --git a/hw/rtl/VX_decode.v b/hw/rtl/VX_decode.v index d5eff8e1..c4d699b7 100644 --- a/hw/rtl/VX_decode.v +++ b/hw/rtl/VX_decode.v @@ -24,7 +24,7 @@ module VX_decode #( reg [`MOD_BITS-1:0] op_mod; reg [31:0] imm; reg use_rd, use_rs1, use_rs2, use_rs3, use_PC, use_imm; - reg rd_fp, rs1_fp, rs2_fp; + reg rd_fp, rs1_fp, rs2_fp, rs3_fp; reg is_join, is_wstall; wire [31:0] instr = ifetch_rsp_if.instr; @@ -59,6 +59,7 @@ module VX_decode #( rd_fp = 0; rs1_fp = 0; rs2_fp = 0; + rs3_fp = 1; is_join = 0; is_wstall = 0; @@ -367,6 +368,7 @@ module VX_decode #( use_rs1 = 1; use_rs2 = 1; use_rs3 = 1; + rs3_fp = 0; end `endif default:; @@ -395,7 +397,7 @@ module VX_decode #( assign decode_if.rd = {rd_fp, rd}; assign decode_if.rs1 = {rs1_fp, rs1_qual}; assign decode_if.rs2 = {rs2_fp, rs2}; - assign decode_if.rs3 = {1'b1, rs3}; + assign decode_if.rs3 = {rs3_fp, rs3}; `else `UNUSED_VAR (rd_fp) `UNUSED_VAR (rs1_fp) diff --git a/hw/rtl/tex_unit/VX_tex_addr.v b/hw/rtl/tex_unit/VX_tex_addr.v index 96540749..98ac31c5 100644 --- a/hw/rtl/tex_unit/VX_tex_addr.v +++ b/hw/rtl/tex_unit/VX_tex_addr.v @@ -50,8 +50,8 @@ module VX_tex_addr #( `UNUSED_PARAM (CORE_ID) - wire [1:0][`NUM_THREADS-1:0][`FIXED_FRAC-1:0] u; - wire [1:0][`NUM_THREADS-1:0][`FIXED_FRAC-1:0] v; + wire [`NUM_THREADS-1:0][1:0][`FIXED_FRAC-1:0] clamped_u; + wire [`NUM_THREADS-1:0][1:0][`FIXED_FRAC-1:0] clamped_v; wire [`TEX_STRIDE_BITS-1:0] log_stride; // stride @@ -70,9 +70,10 @@ module VX_tex_addr #( wire [31:0] fu[1:0]; wire [31:0] fv[1:0]; - assign fu[0] = coord_u[i] - (filter ? (`FIXED_HALF >> log_widths[i]) : 0); - assign fv[0] = coord_v[i] - (filter ? (`FIXED_HALF >> log_heights[i]) : 0); + assign fu[0] = coord_u[i] - (filter ? (`FIXED_HALF >> log_widths[i]) : 0); assign fu[1] = coord_u[i] + (filter ? (`FIXED_HALF >> log_widths[i]) : 0); + + assign fv[0] = coord_v[i] - (filter ? (`FIXED_HALF >> log_heights[i]) : 0); assign fv[1] = coord_v[i] + (filter ? (`FIXED_HALF >> log_heights[i]) : 0); VX_tex_wrap #( @@ -80,15 +81,7 @@ module VX_tex_addr #( ) tex_wrap_u0 ( .wrap_i (wrap_u), .coord_i (fu[0]), - .coord_o (u[0][i]) - ); - - VX_tex_wrap #( - .CORE_ID (CORE_ID) - ) tex_wrap_v0 ( - .wrap_i (wrap_v), - .coord_i (fv[0]), - .coord_o (v[0][i]) + .coord_o (clamped_u[i][0]) ); VX_tex_wrap #( @@ -96,7 +89,15 @@ module VX_tex_addr #( ) tex_wrap_u1 ( .wrap_i (wrap_u), .coord_i (fu[1]), - .coord_o (u[1][i]) + .coord_o (clamped_u[i][1]) + ); + + VX_tex_wrap #( + .CORE_ID (CORE_ID) + ) tex_wrap_v0 ( + .wrap_i (wrap_v), + .coord_i (fv[0]), + .coord_o (clamped_v[i][0]) ); VX_tex_wrap #( @@ -104,7 +105,7 @@ module VX_tex_addr #( ) tex_wrap_v1 ( .wrap_i (wrap_v), .coord_i (fv[1]), - .coord_o (v[1][i]) + .coord_o (clamped_v[i][1]) ); end @@ -117,10 +118,11 @@ module VX_tex_addr #( wire [`FIXED_FRAC-1:0] x [1:0]; wire [`FIXED_FRAC-1:0] y [1:0]; - assign x[0] = u[0][i] >> ((`FIXED_FRAC) - log_widths[i]); - assign x[1] = u[1][i] >> ((`FIXED_FRAC) - log_widths[i]); - assign y[0] = v[0][i] >> ((`FIXED_FRAC) - log_heights[i]); - assign y[1] = v[1][i] >> ((`FIXED_FRAC) - log_heights[i]); + assign x[0] = clamped_u[i][0] >> ((`FIXED_FRAC) - log_widths[i]); + assign x[1] = clamped_u[i][1] >> ((`FIXED_FRAC) - log_widths[i]); + + assign y[0] = clamped_v[i][0] >> ((`FIXED_FRAC) - log_heights[i]); + assign y[1] = clamped_v[i][1] >> ((`FIXED_FRAC) - log_heights[i]); assign addr[i][0] = base_addr + 32'(mip_offsets[i]) + (32'(x[0]) + (32'(y[0]) << log_widths[i])) << log_stride; assign addr[i][1] = base_addr + 32'(mip_offsets[i]) + (32'(x[1]) + (32'(y[0]) << log_widths[i])) << log_stride; @@ -128,6 +130,12 @@ module VX_tex_addr #( assign addr[i][3] = base_addr + 32'(mip_offsets[i]) + (32'(x[1]) + (32'(y[1]) << log_widths[i])) << log_stride; end + wire [`NUM_THREADS-1:0][`FIXED_FRAC-1:0] u0, v0; + for (genvar i = 0; i < `NUM_THREADS; ++i) begin + assign u0[i] = clamped_u[i][0]; + assign v0[i] = clamped_v[i][0]; + end + wire stall_out = mem_req_valid && ~mem_req_ready; VX_pipe_register #( @@ -137,7 +145,7 @@ module VX_tex_addr #( .clk (clk), .reset (reset), .enable (~stall_out), - .data_in ({valid_in, req_wid, req_tmask, req_PC, filter, log_stride, addr, u[0], v[0], req_info}), + .data_in ({valid_in, req_wid, req_tmask, req_PC, filter, log_stride, addr, u0, v0, req_info}), .data_out ({mem_req_valid, mem_req_wid, mem_req_tmask, mem_req_PC, mem_req_filter, mem_req_stride, mem_req_addr, mem_req_u, mem_req_v, mem_req_info}) ); diff --git a/hw/rtl/tex_unit/VX_tex_unit.v b/hw/rtl/tex_unit/VX_tex_unit.v index e15c100a..69b3b35b 100644 --- a/hw/rtl/tex_unit/VX_tex_unit.v +++ b/hw/rtl/tex_unit/VX_tex_unit.v @@ -24,9 +24,9 @@ module VX_tex_unit #( `UNUSED_PARAM (CORE_ID) `UNUSED_VAR (reset) - reg [`TEX_MIPOFF_BITS-1:0] tex_mipoff [(1 << `TEX_MIP_BITS)-1:0]; - reg [`TEX_WIDTH_BITS-1:0] tex_width [(1 << `TEX_MIP_BITS)-1:0]; - reg [`TEX_HEIGHT_BITS-1:0] tex_height [(1 << `TEX_MIP_BITS)-1:0]; + reg [`TEX_MIPOFF_BITS-1:0] tex_mipoff [`NUM_TEX_UNITS-1:0][(1 << `TEX_LOD_BITS)-1:0]; + reg [`TEX_WIDTH_BITS-1:0] tex_width [`NUM_TEX_UNITS-1:0][(1 << `TEX_LOD_BITS)-1:0]; + reg [`TEX_HEIGHT_BITS-1:0] tex_height [`NUM_TEX_UNITS-1:0][(1 << `TEX_LOD_BITS)-1:0]; reg [`TEX_ADDR_BITS-1:0] tex_baddr [`NUM_TEX_UNITS-1:0]; reg [`TEX_FORMAT_BITS-1:0] tex_format [`NUM_TEX_UNITS-1:0]; @@ -36,50 +36,29 @@ module VX_tex_unit #( // CSRs programming - for (genvar i = 0; i < `NUM_TEX_UNITS; ++i) begin - always @(posedge clk) begin - if (reset) begin - tex_baddr[i] <= 0; - tex_format[i] <= 0; - tex_wrap_u[i] <= 0; - tex_wrap_v[i] <= 0; - tex_filter[i] <= 0; - end begin - if (tex_csr_if.write_enable) begin - case (tex_csr_if.write_addr) - `CSR_TEX_ADDR(i) : tex_baddr[i] <= tex_csr_if.write_data[`TEX_ADDR_BITS-1:0]; - `CSR_TEX_FORMAT(i) : tex_format[i] <= tex_csr_if.write_data[`TEX_FORMAT_BITS-1:0]; - `CSR_TEX_WRAP(i) : begin - tex_wrap_u[i] <= tex_csr_if.write_data[0 +: `TEX_WRAP_BITS]; - tex_wrap_v[i] <= tex_csr_if.write_data[`TEX_WRAP_BITS +: `TEX_WRAP_BITS]; - end - `CSR_TEX_FILTER(i) : tex_filter[i] <= tex_csr_if.write_data[`TEX_FILTER_BITS-1:0]; - `CSR_TEX_MIPOFF(i), - `CSR_TEX_WIDTH(i), - `CSR_TEX_HEIGHT(i):; - default: - assert(tex_csr_if.write_addr >= `CSR_TEX_BEGIN(0) - && tex_csr_if.write_addr < `CSR_TEX_BEGIN(`CSR_TEX_STATES)); - endcase - end + for (genvar i = 0; i < `NUM_TEX_UNITS; ++i) begin + wire [`TEX_LOD_BITS-1:0] mip_level = tex_csr_if.write_data[28 +: `TEX_LOD_BITS]; + always @(posedge clk) begin + if (tex_csr_if.write_enable) begin + case (tex_csr_if.write_addr) + `CSR_TEX_ADDR(i) : tex_baddr[i] <= tex_csr_if.write_data[`TEX_ADDR_BITS-1:0]; + `CSR_TEX_FORMAT(i) : tex_format[i] <= tex_csr_if.write_data[`TEX_FORMAT_BITS-1:0]; + `CSR_TEX_WRAP(i) : begin + tex_wrap_u[i] <= tex_csr_if.write_data[0 +: `TEX_WRAP_BITS]; + tex_wrap_v[i] <= tex_csr_if.write_data[`TEX_WRAP_BITS +: `TEX_WRAP_BITS]; + end + `CSR_TEX_FILTER(i) : tex_filter[i] <= tex_csr_if.write_data[`TEX_FILTER_BITS-1:0]; + `CSR_TEX_MIPOFF(i) : tex_mipoff[i][mip_level] <= tex_csr_if.write_data[`TEX_MIPOFF_BITS-1:0]; + `CSR_TEX_WIDTH(i) : tex_width[i][mip_level] <= tex_csr_if.write_data[`TEX_WIDTH_BITS-1:0]; + `CSR_TEX_HEIGHT(i) : tex_height[i][mip_level] <= tex_csr_if.write_data[`TEX_HEIGHT_BITS-1:0]; + default: + assert(tex_csr_if.write_addr >= `CSR_TEX_BEGIN(0) + && tex_csr_if.write_addr < `CSR_TEX_BEGIN(`CSR_TEX_STATES)); + endcase end end end - for (genvar i = 0; i < `NUM_TEX_UNITS; ++i) begin - wire [`TEX_MIP_BITS-1:0] mip_waddr = tex_csr_if.write_data[24 +: `TEX_MIP_BITS]; - always @(posedge clk) begin - if (tex_csr_if.write_enable && tex_csr_if.write_addr == `CSR_TEX_MIPOFF(i)) - tex_mipoff[mip_waddr] <= tex_csr_if.write_data[`TEX_MIPOFF_BITS-1:0]; - - if (tex_csr_if.write_enable && tex_csr_if.write_addr == `CSR_TEX_WIDTH(i)) - tex_width[mip_waddr] <= tex_csr_if.write_data[`TEX_WIDTH_BITS-1:0]; - - if (tex_csr_if.write_enable && tex_csr_if.write_addr == `CSR_TEX_HEIGHT(i)) - tex_height[mip_waddr] <= tex_csr_if.write_data[`TEX_HEIGHT_BITS-1:0]; - end - end - // mipmap attributes wire [`NUM_THREADS-1:0][`TEX_MIPOFF_BITS-1:0] tex_mipoffs; @@ -87,10 +66,11 @@ module VX_tex_unit #( wire [`NUM_THREADS-1:0][`TEX_HEIGHT_BITS-1:0] tex_heights; for (genvar i = 0; i < `NUM_THREADS; ++i) begin - wire [`TEX_MIP_BITS-1:0] mip_raddr = {tex_req_if.unit[`NTEX_BITS-1:0], tex_req_if.lod[i][`TEX_LOD_BITS-1:0]}; - assign tex_mipoffs[i] = tex_mipoff[mip_raddr]; - assign tex_widths[i] = tex_width[mip_raddr]; - assign tex_heights[i] = tex_height[mip_raddr]; + wire [`NTEX_BITS-1:0] unit = tex_req_if.unit[`NTEX_BITS-1:0]; + wire [`TEX_LOD_BITS-1:0] mip_level = tex_req_if.lod[i][20+:`TEX_LOD_BITS]; + assign tex_mipoffs[i] = tex_mipoff[unit][mip_level]; + assign tex_widths[i] = tex_width[unit][mip_level]; + assign tex_heights[i] = tex_height[unit][mip_level]; end // address generation @@ -237,13 +217,14 @@ module VX_tex_unit #( if (tex_csr_if.write_enable && (tex_csr_if.write_addr >= `CSR_TEX_BEGIN(i) && tex_csr_if.write_addr < `CSR_TEX_BEGIN(i+1))) begin - $display("%t: core%0d-tex_csr: csr_tex%d_addr, csr_data=%0h", $time, CORE_ID, i, tex_baddr[i]); - $display("%t: core%0d-tex_csr: csr_tex%d_width, csr_data=%0h", $time, CORE_ID, i, tex_width[i]); - $display("%t: core%0d-tex_csr: csr_tex%d_height, csr_data=%0h", $time, CORE_ID, i, tex_height[i]); - $display("%t: core%0d-tex_csr: csr_tex%d_format, csr_data=%0h", $time, CORE_ID, i, tex_format[i]); - $display("%t: core%0d-tex_csr: csr_tex%d_wrap_u, csr_data=%0h", $time, CORE_ID, i, tex_wrap_u[i]); - $display("%t: core%0d-tex_csr: csr_tex%d_wrap_v, csr_data=%0h", $time, CORE_ID, i, tex_wrap_v[i]); - $display("%t: core%0d-tex_csr: csr_tex%d_filter, csr_data=%0h", $time, CORE_ID, i, tex_filter[i]); + $display("%t: core%0d-tex_unit: tex%0d_addr=%0h", $time, CORE_ID, i, tex_baddr[i]); + $display("%t: core%0d-tex_unit: tex%0d_format=%0h", $time, CORE_ID, i, tex_format[i]); + $display("%t: core%0d-tex_unit: tex%0d_wrap_u=%0h", $time, CORE_ID, i, tex_wrap_u[i]); + $display("%t: core%0d-tex_unit: tex%0d_wrap_v=%0h", $time, CORE_ID, i, tex_wrap_v[i]); + $display("%t: core%0d-tex_unit: tex%0d_filter=%0h", $time, CORE_ID, i, tex_filter[i]); + $display("%t: core%0d-tex_unit: tex%0d_mipoff[0]=%0h", $time, CORE_ID, i, tex_mipoff[i][0]); + $display("%t: core%0d-tex_unit: tex%0d_width[0]=%0h", $time, CORE_ID, i, tex_width[i][0]); + $display("%t: core%0d-tex_unit: tex%0d_height[0]=%0h", $time, CORE_ID, i, tex_height[i][0]); end end end diff --git a/runtime/include/vx_intrinsics.h b/runtime/include/vx_intrinsics.h index 3e582678..f983765f 100644 --- a/runtime/include/vx_intrinsics.h +++ b/runtime/include/vx_intrinsics.h @@ -54,8 +54,11 @@ extern "C" { // Texture load #define vx_tex(unit, u, v, l) ({ \ - register unsigned __r; \ - __asm__ __volatile__ (".insn r4 0x6b, 5, " __ASM_STR(unit) ", %0, %1, %2, %3" : "=r"(__r) : "r"(u), "r"(v), "r"(l)); \ + unsigned __r; \ + unsigned __u = u; \ + unsigned __v = v; \ + unsigned __l = l; \ + __asm__ __volatile__ (".insn r4 0x6b, 5, " __ASM_STR(unit) ", %0, %1, %2, %3" : "=r"(__r) : "r"(__u), "r"(__v), "r"(__l)); \ __r; \ })