diff --git a/ci/regression.sh b/ci/regression.sh index 73ce8e94..8269e875 100755 --- a/ci/regression.sh +++ b/ci/regression.sh @@ -54,6 +54,9 @@ CONFIGS=-DEXT_F_DISABLE make -C hw/simulate # disable shared memory CONFIGS=-DSM_ENABLE=0 make -C hw/simulate +# disabling tex extension +CONFIGS=-DEXT_TEX_DISABLE make -C hw/simulate + # using Default FPU core FPU_CORE=FPU_DEFAULT ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=dogfood diff --git a/driver/opae/vlsim/Makefile b/driver/opae/vlsim/Makefile index fe3ca8bb..9dc04a86 100644 --- a/driver/opae/vlsim/Makefile +++ b/driver/opae/vlsim/Makefile @@ -16,6 +16,7 @@ DBG_PRINT_FLAGS += -DDBG_PRINT_MEM DBG_PRINT_FLAGS += -DDBG_PRINT_OPAE DBG_PRINT_FLAGS += -DDBG_PRINT_AVS DBG_PRINT_FLAGS += -DDBG_PRINT_SCOPE +DBG_PRINT_FLAGS += -DDBG_PRINT_TEX DBG_FLAGS += $(DBG_PRINT_FLAGS) DBG_FLAGS += -DDBG_CACHE_REQ_INFO @@ -37,7 +38,8 @@ SRCS = fpga.cpp opae_sim.cpp SRCS += $(DPI_DIR)/util_dpi.cpp $(DPI_DIR)/float_dpi.cpp FPU_INCLUDE = -I$(RTL_DIR)/fp_cores -I$(DPI_DIR) -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src -I$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl -I$(RTL_DIR)/fp_cores/fpnew/src -RTL_INCLUDE = -I$(RTL_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/cache $(FPU_INCLUDE) +TEX_INCLUDE = -I$(RTL_DIR)/tex_unit +RTL_INCLUDE = -I$(RTL_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/cache $(FPU_INCLUDE) $(TEX_INCLUDE) RTL_INCLUDE += -I$(RTL_DIR)/afu -I$(RTL_DIR)/afu/ccip VL_FLAGS += -O2 --language 1800-2009 --assert -Wall -Wpedantic diff --git a/driver/rtlsim/Makefile b/driver/rtlsim/Makefile index 65eb1ac0..73081953 100644 --- a/driver/rtlsim/Makefile +++ b/driver/rtlsim/Makefile @@ -16,6 +16,7 @@ DBG_PRINT_FLAGS += -DDBG_PRINT_MEM DBG_PRINT_FLAGS += -DDBG_PRINT_OPAE DBG_PRINT_FLAGS += -DDBG_PRINT_AVS DBG_PRINT_FLAGS += -DDBG_PRINT_SCOPE +DBG_PRINT_FLAGS += -DDBG_PRINT_TEX DBG_FLAGS += $(DBG_PRINT_FLAGS) DBG_FLAGS += -DDBG_CACHE_REQ_INFO @@ 
-37,7 +38,8 @@ SRCS = vortex.cpp ../common/vx_utils.cpp ../../hw/simulate/simulator.cpp SRCS += $(DPI_DIR)/util_dpi.cpp $(DPI_DIR)/float_dpi.cpp FPU_INCLUDE = -I$(RTL_DIR)/fp_cores -I$(DPI_DIR) -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src -I$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl -I$(RTL_DIR)/fp_cores/fpnew/src -RTL_INCLUDE = -I$(RTL_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/cache $(FPU_INCLUDE) +TEX_INCLUDE = -I$(RTL_DIR)/tex_unit +RTL_INCLUDE = -I$(RTL_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/cache $(FPU_INCLUDE) $(TEX_INCLUDE) VL_FLAGS += -O2 --language 1800-2009 --assert -Wall -Wpedantic VL_FLAGS += -Wno-DECLFILENAME -Wno-REDEFMACRO diff --git a/hw/rtl/VX_alu_unit.v b/hw/rtl/VX_alu_unit.v index 1b920283..bb8068b8 100644 --- a/hw/rtl/VX_alu_unit.v +++ b/hw/rtl/VX_alu_unit.v @@ -154,7 +154,7 @@ module VX_alu_unit #( assign mul_ready_out = ~stall_out; - assign result_valid = mul_valid_out | (alu_req_if.valid && ~is_mul_op); + assign result_valid = mul_valid_out || (alu_req_if.valid && ~is_mul_op); assign result_wid = mul_valid_out ? mul_wid : alu_req_if.wid; assign result_tmask = mul_valid_out ? mul_tmask : alu_req_if.tmask; assign result_PC = mul_valid_out ? 
mul_PC : alu_req_if.PC; @@ -165,7 +165,7 @@ module VX_alu_unit #( `else - assign stall_in = 0; + assign stall_in = stall_out; assign result_valid = alu_req_if.valid; assign result_wid = alu_req_if.wid; diff --git a/hw/rtl/VX_cache_arb.v b/hw/rtl/VX_cache_arb.v new file mode 100644 index 00000000..85800edf --- /dev/null +++ b/hw/rtl/VX_cache_arb.v @@ -0,0 +1,159 @@ +`include "VX_define.vh" + +module VX_cache_arb #( + parameter NUM_REQS = 1, + parameter LANES = 1, + parameter DATA_SIZE = 1, + parameter TAG_IN_WIDTH = 1, + parameter TAG_SEL_IDX = 0, + parameter BUFFERED_REQ = 0, + parameter BUFFERED_RSP = 0, + parameter TYPE = "R", + + localparam ADDR_WIDTH = (32-`CLOG2(DATA_SIZE)), + localparam DATA_WIDTH = (8 * DATA_SIZE), + localparam LOG_NUM_REQS = `CLOG2(NUM_REQS), + localparam TAG_OUT_WIDTH = TAG_IN_WIDTH + LOG_NUM_REQS +) ( + input wire clk, + input wire reset, + + // input requests + input wire [NUM_REQS-1:0][LANES-1:0] req_valid_in, + input wire [NUM_REQS-1:0][LANES-1:0] req_rw_in, + input wire [NUM_REQS-1:0][LANES-1:0][DATA_SIZE-1:0] req_byteen_in, + input wire [NUM_REQS-1:0][LANES-1:0][ADDR_WIDTH-1:0] req_addr_in, + input wire [NUM_REQS-1:0][LANES-1:0][DATA_WIDTH-1:0] req_data_in, + input wire [NUM_REQS-1:0][LANES-1:0][TAG_IN_WIDTH-1:0] req_tag_in, + output wire [NUM_REQS-1:0][LANES-1:0] req_ready_in, + + // output request + output wire [LANES-1:0] req_valid_out, + output wire [LANES-1:0] req_rw_out, + output wire [LANES-1:0][DATA_SIZE-1:0] req_byteen_out, + output wire [LANES-1:0][ADDR_WIDTH-1:0] req_addr_out, + output wire [LANES-1:0][DATA_WIDTH-1:0] req_data_out, + output wire [LANES-1:0][TAG_OUT_WIDTH-1:0] req_tag_out, + input wire [LANES-1:0] req_ready_out, + + // input response + input wire rsp_valid_in, + input wire [LANES-1:0] rsp_tmask_in, + input wire [LANES-1:0][DATA_WIDTH-1:0] rsp_data_in, + input wire [TAG_OUT_WIDTH-1:0] rsp_tag_in, + output wire rsp_ready_in, + + // output responses + output wire [NUM_REQS-1:0] rsp_valid_out, + output wire 
[NUM_REQS-1:0][LANES-1:0] rsp_tmask_out, + output wire [NUM_REQS-1:0][LANES-1:0][DATA_WIDTH-1:0] rsp_data_out, + output wire [NUM_REQS-1:0][TAG_IN_WIDTH-1:0] rsp_tag_out, + input wire [NUM_REQS-1:0] rsp_ready_out +); + localparam REQ_DATAW = TAG_OUT_WIDTH + ADDR_WIDTH + 1 + DATA_SIZE + DATA_WIDTH; + localparam RSP_DATAW = LANES * (1 + DATA_WIDTH) + TAG_IN_WIDTH; + + if (NUM_REQS > 1) begin + + wire [NUM_REQS-1:0][LANES-1:0][REQ_DATAW-1:0] req_data_in_merged; + wire [LANES-1:0][REQ_DATAW-1:0] req_data_out_merged; + + for (genvar i = 0; i < NUM_REQS; i++) begin + for (genvar j = 0; j < LANES; ++j) begin + wire [TAG_OUT_WIDTH-1:0] req_tag_in_w; + + VX_bits_insert #( + .N (TAG_IN_WIDTH), + .S (LOG_NUM_REQS), + .POS (TAG_SEL_IDX) + ) bits_insert ( + .data_in (req_tag_in[i][j]), + .sel_in (LOG_NUM_REQS'(i)), + .data_out (req_tag_in_w) + ); + + assign req_data_in_merged[i][j] = {req_tag_in_w, req_addr_in[i][j], req_rw_in[i][j], req_byteen_in[i][j], req_data_in[i][j]}; + end + end + + VX_stream_arbiter #( + .NUM_REQS (NUM_REQS), + .LANES (LANES), + .DATAW (REQ_DATAW), + .BUFFERED (BUFFERED_REQ), + .TYPE (TYPE) + ) req_arb ( + .clk (clk), + .reset (reset), + .valid_in (req_valid_in), + .data_in (req_data_in_merged), + .ready_in (req_ready_in), + .valid_out (req_valid_out), + .data_out (req_data_out_merged), + .ready_out (req_ready_out) + ); + + for (genvar i = 0; i < LANES; ++i) begin + assign {req_tag_out[i], req_addr_out[i], req_rw_out[i], req_byteen_out[i], req_data_out[i]} = req_data_out_merged[i]; + end + + /////////////////////////////////////////////////////////////////////// + + wire [NUM_REQS-1:0][RSP_DATAW-1:0] rsp_data_out_merged; + + wire [LOG_NUM_REQS-1:0] rsp_sel = rsp_tag_in[TAG_SEL_IDX +: LOG_NUM_REQS]; + + wire [TAG_IN_WIDTH-1:0] rsp_tag_in_w; + + VX_bits_remove #( + .N (TAG_OUT_WIDTH), + .S (LOG_NUM_REQS), + .POS (TAG_SEL_IDX) + ) bits_remove ( + .data_in (rsp_tag_in), + .data_out (rsp_tag_in_w) + ); + + VX_stream_demux #( + .NUM_REQS (NUM_REQS), + .LANES 
(1), + .DATAW (RSP_DATAW), + .BUFFERED (BUFFERED_RSP) + ) rsp_demux ( + .clk (clk), + .reset (reset), + .sel_in (rsp_sel), + .valid_in (rsp_valid_in), + .data_in ({rsp_tmask_in, rsp_tag_in_w, rsp_data_in}), + .ready_in (rsp_ready_in), + .valid_out (rsp_valid_out), + .data_out (rsp_data_out_merged), + .ready_out (rsp_ready_out) + ); + + for (genvar i = 0; i < NUM_REQS; i++) begin + assign {rsp_tmask_out[i], rsp_tag_out[i], rsp_data_out[i]} = rsp_data_out_merged[i]; + end + + end else begin + + `UNUSED_VAR (clk) + `UNUSED_VAR (reset) + + assign req_valid_out = req_valid_in; + assign req_tag_out = req_tag_in; + assign req_addr_out = req_addr_in; + assign req_rw_out = req_rw_in; + assign req_byteen_out = req_byteen_in; + assign req_data_out = req_data_in; + assign req_ready_in = req_ready_out; + + assign rsp_valid_out = rsp_valid_in; + assign rsp_tmask_out = rsp_tmask_in; + assign rsp_tag_out = rsp_tag_in; + assign rsp_data_out = rsp_data_in; + assign rsp_ready_in = rsp_ready_out; + + end + +endmodule \ No newline at end of file diff --git a/hw/rtl/VX_commit.v b/hw/rtl/VX_commit.v index bdebc5ed..b3ed5a6a 100644 --- a/hw/rtl/VX_commit.v +++ b/hw/rtl/VX_commit.v @@ -73,13 +73,14 @@ module VX_commit #( .ld_commit_if (ld_commit_if), .csr_commit_if (csr_commit_if), .fpu_commit_if (fpu_commit_if), + .gpu_commit_if (gpu_commit_if), .writeback_if (writeback_if) ); - // store and gpu commits don't writeback + // store doesn't writeback assign st_commit_if.ready = 1'b1; - assign gpu_commit_if.ready = 1'b1; + // assign gpu_commit_if.ready = 1'b1; `ifdef DBG_PRINT_PIPELINE always @(posedge clk) begin diff --git a/hw/rtl/VX_config.vh b/hw/rtl/VX_config.vh index ea0ee9dc..4ca8203b 100644 --- a/hw/rtl/VX_config.vh +++ b/hw/rtl/VX_config.vh @@ -77,6 +77,10 @@ `define EXT_F_ENABLE `endif +`ifndef EXT_TEX_DISABLE +`define EXT_TEX_ENABLE +`endif + // Device identification `define VENDOR_ID 0 `define ARCHITECTURE_ID 0 @@ -229,6 +233,21 @@ `define CSR_NW 12'hFC1 `define CSR_NC 12'hFC2 
+////////// Texture Units ////////////////////////////////////////////////////// + +`define NUM_TEX_UNITS 2 + +`define CSR_TEX_STATES 7 +`define CSR_TEX_BEGIN(x) (12'hFD0 + (x) * `CSR_TEX_STATES) + +`define CSR_TEX_ADDR(x) (`CSR_TEX_BEGIN(x) + 12'h00) +`define CSR_TEX_FORMAT(x) (`CSR_TEX_BEGIN(x) + 12'h01) +`define CSR_TEX_WRAP(x) (`CSR_TEX_BEGIN(x) + 12'h02) +`define CSR_TEX_FILTER(x) (`CSR_TEX_BEGIN(x) + 12'h03) +`define CSR_TEX_MIPOFF(x) (`CSR_TEX_BEGIN(x) + 12'h04) +`define CSR_TEX_WIDTH(x) (`CSR_TEX_BEGIN(x) + 12'h05) +`define CSR_TEX_HEIGHT(x) (`CSR_TEX_BEGIN(x) + 12'h06) + // Pipeline Queues //////////////////////////////////////////////////////////// // Size of LSU Request Queue diff --git a/hw/rtl/VX_csr_data.v b/hw/rtl/VX_csr_data.v index 54bd1caf..4507816e 100644 --- a/hw/rtl/VX_csr_data.v +++ b/hw/rtl/VX_csr_data.v @@ -12,7 +12,11 @@ module VX_csr_data #( `endif VX_cmt_to_csr_if cmt_to_csr_if, - VX_fpu_to_csr_if fpu_to_csr_if, + VX_fpu_to_csr_if fpu_to_csr_if, + +`ifdef EXT_TEX_ENABLE + VX_tex_csr_if tex_csr_if, +`endif input wire read_enable, input wire[`CSR_ADDR_BITS-1:0] read_addr, @@ -22,7 +26,7 @@ module VX_csr_data #( input wire write_enable, input wire[`CSR_ADDR_BITS-1:0] write_addr, input wire[`NW_BITS-1:0] write_wid, - input wire[`CSR_WIDTH-1:0] write_data, + input wire[31:0] write_data, input wire busy ); @@ -57,26 +61,33 @@ module VX_csr_data #( `CSR_FRM: fcsr[write_wid][`FRM_BITS+`FFG_BITS-1:`FFG_BITS] <= write_data[`FRM_BITS-1:0]; `CSR_FCSR: fcsr[write_wid] <= write_data[`FFG_BITS+`FRM_BITS-1:0]; - `CSR_SATP: csr_satp <= write_data; - - `CSR_MSTATUS: csr_mstatus <= write_data; - `CSR_MEDELEG: csr_medeleg <= write_data; - `CSR_MIDELEG: csr_mideleg <= write_data; - `CSR_MIE: csr_mie <= write_data; - `CSR_MTVEC: csr_mtvec <= write_data; - - `CSR_MEPC: csr_mepc <= write_data; - - `CSR_PMPCFG0: csr_pmpcfg[0] <= write_data; - `CSR_PMPADDR0: csr_pmpaddr[0] <= write_data; + `CSR_SATP: csr_satp <= write_data[`CSR_WIDTH-1:0]; + `CSR_MSTATUS: 
csr_mstatus <= write_data[`CSR_WIDTH-1:0]; + `CSR_MEDELEG: csr_medeleg <= write_data[`CSR_WIDTH-1:0]; + `CSR_MIDELEG: csr_mideleg <= write_data[`CSR_WIDTH-1:0]; + `CSR_MIE: csr_mie <= write_data[`CSR_WIDTH-1:0]; + `CSR_MTVEC: csr_mtvec <= write_data[`CSR_WIDTH-1:0]; + `CSR_MEPC: csr_mepc <= write_data[`CSR_WIDTH-1:0]; + `CSR_PMPCFG0: csr_pmpcfg[0] <= write_data[`CSR_WIDTH-1:0]; + `CSR_PMPADDR0: csr_pmpaddr[0] <= write_data[`CSR_WIDTH-1:0]; default: begin - assert(~write_enable) else $error("%t: invalid CSR write address: %0h", $time, write_addr); + assert (write_addr >= `CSR_TEX_BEGIN(0) && write_addr < `CSR_TEX_BEGIN(`NUM_TEX_UNITS)) + else $error("%t: invalid CSR write address: %0h", $time, write_addr); end endcase end end + `UNUSED_VAR (write_data) + + // TEX CSRs +`ifdef EXT_TEX_ENABLE + assign tex_csr_if.write_enable = write_enable; + assign tex_csr_if.write_addr = write_addr; + assign tex_csr_if.write_data = write_data; +`endif + always @(posedge clk) begin if (reset) begin csr_cycle <= 0; @@ -201,7 +212,8 @@ module VX_csr_data #( default: begin if (!((read_addr >= `CSR_MPM_BASE && read_addr < (`CSR_MPM_BASE + 32)) - | (read_addr >= `CSR_MPM_BASE_H && read_addr < (`CSR_MPM_BASE_H + 32)))) begin + || (read_addr >= `CSR_MPM_BASE_H && read_addr < (`CSR_MPM_BASE_H + 32)) + || (read_addr >= `CSR_TEX_BEGIN(0) && read_addr < `CSR_TEX_BEGIN(`NUM_TEX_UNITS)))) begin read_addr_valid_r = 0; end end diff --git a/hw/rtl/VX_csr_unit.v b/hw/rtl/VX_csr_unit.v index 6bf5277e..581c99e7 100644 --- a/hw/rtl/VX_csr_unit.v +++ b/hw/rtl/VX_csr_unit.v @@ -12,7 +12,11 @@ module VX_csr_unit #( `endif VX_cmt_to_csr_if cmt_to_csr_if, - VX_fpu_to_csr_if fpu_to_csr_if, + VX_fpu_to_csr_if fpu_to_csr_if, + +`ifdef EXT_TEX_ENABLE + VX_tex_csr_if tex_csr_if, +`endif VX_csr_req_if csr_req_if, VX_commit_if csr_commit_if, @@ -42,6 +46,9 @@ module VX_csr_unit #( `endif .cmt_to_csr_if (cmt_to_csr_if), .fpu_to_csr_if (fpu_to_csr_if), + `ifdef EXT_TEX_ENABLE + .tex_csr_if (tex_csr_if), + `endif 
.read_enable (csr_req_if.valid), .read_addr (csr_req_if.addr), .read_wid (csr_req_if.wid), @@ -49,7 +56,7 @@ module VX_csr_unit #( .write_enable (write_enable), .write_addr (csr_addr_s1), .write_wid (csr_commit_if.wid), - .write_data (csr_updated_data_s1[`CSR_WIDTH-1:0]), + .write_data (csr_updated_data_s1), .busy (busy) ); diff --git a/hw/rtl/VX_decode.v b/hw/rtl/VX_decode.v index 54257679..0304830a 100644 --- a/hw/rtl/VX_decode.v +++ b/hw/rtl/VX_decode.v @@ -42,6 +42,7 @@ module VX_decode #( wire [31:0] instr = ifetch_rsp_if.data; wire [6:0] opcode = instr[6:0]; + wire [1:0] func2 = instr[26:25]; wire [2:0] func3 = instr[14:12]; wire [6:0] func7 = instr[31:25]; wire [11:0] u_12 = instr[31:20]; @@ -372,6 +373,16 @@ module VX_decode #( `USED_IREG (rs1); `USED_IREG (rs2); end + `ifdef EXT_TEX_ENABLE + 3'h5: begin + op_type = `OP_BITS'(`GPU_TEX); + op_mod = `MOD_BITS'(func2); + use_rd = 1; + `USED_IREG (rs1); + `USED_IREG (rs2); + `USED_IREG (rs3); + end + `endif default:; endcase end @@ -379,6 +390,8 @@ module VX_decode #( endcase end + `UNUSED_VAR (func2) + // disable write to integer register r0 wire wb = use_rd && (| rd_r); diff --git a/hw/rtl/VX_define.vh b/hw/rtl/VX_define.vh index fed5bfe8..95e57cd4 100644 --- a/hw/rtl/VX_define.vh +++ b/hw/rtl/VX_define.vh @@ -16,6 +16,8 @@ `define REQS_BITS `LOG2UP(NUM_REQS) +`define NTEX_BITS `LOG2UP(`NUM_TEX_UNITS) + `ifdef EXT_F_ENABLE `define NUM_REGS 64 `else @@ -54,6 +56,8 @@ `define INST_GPU 7'b1101011 +`define INST_TEX 7'b0101011 + /////////////////////////////////////////////////////////////////////////////// `define FRM_RNE 3'b000 // round to nearest even @@ -185,6 +189,7 @@ `define GPU_SPLIT 3'h2 `define GPU_JOIN 3'h3 `define GPU_BAR 3'h4 +`define GPU_TEX 3'h5 `define GPU_OTHER 3'h7 `define GPU_BITS 3 `define GPU_OP(x) x[`GPU_BITS-1:0] @@ -293,11 +298,18 @@ // Core request address bits `define DCORE_ADDR_WIDTH (32-`CLOG2(`DWORD_SIZE)) -// TAG sharing enable -`define LSUQ_ADDR_BITS `LOG2UP(`LSUQ_SIZE) +// Core 
request tag bits +`define LSUQ_ADDR_BITS `LOG2UP(`LSUQ_SIZE) +`ifdef EXT_TEX_ENABLE +`define LSU_TAG_ID_BITS (`LSUQ_ADDR_BITS + `NC_ADDR_BITS + `SM_ENABLE) +`define TEX_TAG_ID_BITS (2) +`define DCORE_TAG_ID_BITS (`MAX(`LSU_TAG_ID_BITS, `TEX_TAG_ID_BITS) + 1) +`define LSU_DCACHE_TAG_BITS (`DBG_CACHE_REQ_MDATAW + `LSU_TAG_ID_BITS) +`define TEX_DCACHE_TAG_BITS (`DBG_CACHE_REQ_MDATAW + `TEX_TAG_ID_BITS) +`define LSU_TEX_DCACHE_TAG_BITS `MAX(`LSU_DCACHE_TAG_BITS, `TEX_DCACHE_TAG_BITS) +`else `define DCORE_TAG_ID_BITS (`LSUQ_ADDR_BITS + `NC_ADDR_BITS + `SM_ENABLE) - -// Input request tag bits +`endif `define DCORE_TAG_WIDTH (`DBG_CACHE_REQ_MDATAW + `DCORE_TAG_ID_BITS) // Memory request data bits @@ -409,6 +421,8 @@ // Merged D-cache/I-cache memory tag `define XMEM_TAG_WIDTH (`DMEM_TAG_WIDTH + `CLOG2(2)) +//////////////////////////////////////////////////////////////////////////////////////// + `include "VX_types.vh" `endif diff --git a/hw/rtl/VX_execute.v b/hw/rtl/VX_execute.v index dbddc1fd..5e576a31 100644 --- a/hw/rtl/VX_execute.v +++ b/hw/rtl/VX_execute.v @@ -39,9 +39,98 @@ module VX_execute #( input wire busy ); - VX_fpu_to_csr_if fpu_to_csr_if(); - wire[`NUM_WARPS-1:0] csr_pending; - wire[`NUM_WARPS-1:0] fpu_pending; + VX_fpu_to_csr_if fpu_to_csr_if(); + +`ifdef EXT_TEX_ENABLE + + VX_dcache_req_if #( + .NUM_REQS (`NUM_THREADS), + .WORD_SIZE (4), + .TAG_WIDTH (`LSU_DCACHE_TAG_BITS) + ) lsu_dcache_req_if(); + + VX_dcache_rsp_if #( + .NUM_REQS (`NUM_THREADS), + .WORD_SIZE (4), + .TAG_WIDTH (`LSU_DCACHE_TAG_BITS) + ) lsu_dcache_rsp_if(); + + VX_dcache_req_if #( + .NUM_REQS (`NUM_THREADS), + .WORD_SIZE (4), + .TAG_WIDTH (`TEX_DCACHE_TAG_BITS) + ) tex_dcache_req_if(); + + VX_dcache_rsp_if #( + .NUM_REQS (`NUM_THREADS), + .WORD_SIZE (4), + .TAG_WIDTH (`TEX_DCACHE_TAG_BITS) + ) tex_dcache_rsp_if(); + + VX_tex_csr_if tex_csr_if(); + + wire [`NUM_THREADS-1:0][`LSU_TEX_DCACHE_TAG_BITS-1:0] tex_tag_in; + wire [`LSU_TEX_DCACHE_TAG_BITS-1:0] tex_tag_out; + + for (genvar i = 0; i 
< `NUM_THREADS; ++i) begin + assign tex_tag_in[i][`LSUQ_ADDR_BITS-1:0] = `LSUQ_ADDR_BITS'(tex_dcache_req_if.tag[i][1:0]); + `ifdef DBG_CACHE_REQ_INFO + assign tex_tag_in[i][`LSUQ_ADDR_BITS+:`DBG_CACHE_REQ_MDATAW] = tex_dcache_req_if.tag[i][2+:`DBG_CACHE_REQ_MDATAW]; + `endif + end + assign tex_dcache_rsp_if.tag[1:0] = tex_tag_out[1:0]; +`ifdef DBG_CACHE_REQ_INFO + assign tex_dcache_rsp_if.tag[2+:`DBG_CACHE_REQ_MDATAW] = tex_tag_out[`LSUQ_ADDR_BITS+:`DBG_CACHE_REQ_MDATAW]; +`endif + `UNUSED_VAR (tex_tag_out) + + VX_cache_arb #( + .NUM_REQS (2), + .LANES (`NUM_THREADS), + .DATA_SIZE (4), + .TAG_IN_WIDTH (`LSU_TEX_DCACHE_TAG_BITS), + .TAG_SEL_IDX (2) + ) tex_lsu_arb ( + .clk (clk), + .reset (reset), + + // Tex/LSU request + .req_valid_in ({tex_dcache_req_if.valid, lsu_dcache_req_if.valid}), + .req_rw_in ({tex_dcache_req_if.rw, lsu_dcache_req_if.rw}), + .req_byteen_in ({tex_dcache_req_if.byteen, lsu_dcache_req_if.byteen}), + .req_addr_in ({tex_dcache_req_if.addr, lsu_dcache_req_if.addr}), + .req_data_in ({tex_dcache_req_if.data, lsu_dcache_req_if.data}), + .req_tag_in ({tex_tag_in, lsu_dcache_req_if.tag}), + .req_ready_in ({tex_dcache_req_if.ready, lsu_dcache_req_if.ready}), + + // Dcache request + .req_valid_out (dcache_req_if.valid), + .req_rw_out (dcache_req_if.rw), + .req_byteen_out (dcache_req_if.byteen), + .req_addr_out (dcache_req_if.addr), + .req_data_out (dcache_req_if.data), + .req_tag_out (dcache_req_if.tag), + .req_ready_out (dcache_req_if.ready), + + // Dcache response + .rsp_valid_in (dcache_rsp_if.valid), + .rsp_tmask_in (dcache_rsp_if.tmask), + .rsp_tag_in (dcache_rsp_if.tag), + .rsp_data_in (dcache_rsp_if.data), + .rsp_ready_in (dcache_rsp_if.ready), + + // Tex/LSU response + .rsp_valid_out ({tex_dcache_rsp_if.valid, lsu_dcache_rsp_if.valid}), + .rsp_tmask_out ({tex_dcache_rsp_if.tmask, lsu_dcache_rsp_if.tmask}), + .rsp_data_out ({tex_dcache_rsp_if.data, lsu_dcache_rsp_if.data}), + .rsp_tag_out ({tex_tag_out, lsu_dcache_rsp_if.tag}), + .rsp_ready_out 
({tex_dcache_rsp_if.ready, lsu_dcache_rsp_if.ready}) + ); + +`endif + + wire [`NUM_WARPS-1:0] csr_pending; + wire [`NUM_WARPS-1:0] fpu_pending; `RESET_RELAY (alu_reset); `RESET_RELAY (lsu_reset); @@ -49,7 +138,7 @@ module VX_execute #( `RESET_RELAY (gpu_reset); VX_alu_unit #( - .CORE_ID (CORE_ID) + .CORE_ID(CORE_ID) ) alu_unit ( .clk (clk), .reset (alu_reset), @@ -59,29 +148,37 @@ module VX_execute #( ); VX_lsu_unit #( - .CORE_ID (CORE_ID) + .CORE_ID(CORE_ID) ) lsu_unit ( `SCOPE_BIND_VX_execute_lsu_unit .clk (clk), .reset (lsu_reset), + `ifdef EXT_TEX_ENABLE + .dcache_req_if (lsu_dcache_req_if), + .dcache_rsp_if (lsu_dcache_rsp_if), + `else .dcache_req_if (dcache_req_if), .dcache_rsp_if (dcache_rsp_if), + `endif .lsu_req_if (lsu_req_if), .ld_commit_if (ld_commit_if), .st_commit_if (st_commit_if) ); VX_csr_unit #( - .CORE_ID (CORE_ID) + .CORE_ID(CORE_ID) ) csr_unit ( .clk (clk), .reset (csr_reset), `ifdef PERF_ENABLE - .perf_memsys_if (perf_memsys_if), + .perf_memsys_if (perf_memsys_if), .perf_pipeline_if (perf_pipeline_if), `endif .cmt_to_csr_if (cmt_to_csr_if), .fpu_to_csr_if (fpu_to_csr_if), + `ifdef EXT_TEX_ENABLE + .tex_csr_if (tex_csr_if), + `endif .csr_req_if (csr_req_if), .csr_commit_if (csr_commit_if), .fpu_pending (fpu_pending), @@ -93,7 +190,7 @@ module VX_execute #( `RESET_RELAY (fpu_reset); VX_fpu_unit #( - .CORE_ID (CORE_ID) + .CORE_ID(CORE_ID) ) fpu_unit ( .clk (clk), .reset (fpu_reset), @@ -122,12 +219,17 @@ module VX_execute #( `endif VX_gpu_unit #( - .CORE_ID (CORE_ID) + .CORE_ID(CORE_ID) ) gpu_unit ( `SCOPE_BIND_VX_execute_gpu_unit .clk (clk), .reset (gpu_reset), .gpu_req_if (gpu_req_if), + `ifdef EXT_TEX_ENABLE + .tex_csr_if (tex_csr_if), + .dcache_req_if (tex_dcache_req_if), + .dcache_rsp_if (tex_dcache_rsp_if), + `endif .warp_ctl_if (warp_ctl_if), .gpu_commit_if (gpu_commit_if) ); @@ -139,4 +241,4 @@ module VX_execute #( && (`BR_OP(alu_req_if.op_type) == `BR_EBREAK || `BR_OP(alu_req_if.op_type) == `BR_ECALL); -endmodule +endmodule \ No newline 
at end of file diff --git a/hw/rtl/VX_gpu_unit.v b/hw/rtl/VX_gpu_unit.v index 9d0615bb..ee7a04d2 100644 --- a/hw/rtl/VX_gpu_unit.v +++ b/hw/rtl/VX_gpu_unit.v @@ -11,25 +11,43 @@ module VX_gpu_unit #( // Inputs VX_gpu_req_if gpu_req_if, +`ifdef EXT_TEX_ENABLE + VX_tex_csr_if tex_csr_if, + + VX_dcache_req_if dcache_req_if, + VX_dcache_rsp_if dcache_rsp_if, +`endif + // Outputs VX_warp_ctl_if warp_ctl_if, VX_commit_if gpu_commit_if ); `UNUSED_PARAM (CORE_ID) - `UNUSED_VAR (clk) - `UNUSED_VAR (reset) + + wire rsp_valid; + wire [`NW_BITS-1:0] rsp_wid; + wire [`NUM_THREADS-1:0] rsp_tmask; + wire [31:0] rsp_PC; + wire [`NR_BITS-1:0] rsp_rd; + wire rsp_wb; + wire [`NUM_THREADS-1:0][31:0] rsp_data; gpu_tmc_t tmc; gpu_wspawn_t wspawn; gpu_barrier_t barrier; gpu_split_t split; + wire [(`NUM_THREADS * 32)-1:0] warp_ctl_data; + wire is_warp_ctl; + + wire stall_in, stall_out; + wire is_wspawn = (gpu_req_if.op_type == `GPU_WSPAWN); wire is_tmc = (gpu_req_if.op_type == `GPU_TMC); wire is_split = (gpu_req_if.op_type == `GPU_SPLIT); wire is_bar = (gpu_req_if.op_type == `GPU_BAR); - + // tmc wire [`NUM_THREADS-1:0] tmc_new_mask; @@ -41,7 +59,7 @@ module VX_gpu_unit #( // wspawn - wire [31:0] wspawn_pc = gpu_req_if.rs2_data; + wire [31:0] wspawn_pc = gpu_req_if.rs2_data[0]; wire [`NUM_WARPS-1:0] wspawn_wmask; for (genvar i = 0; i < `NUM_WARPS; i++) begin assign wspawn_wmask[i] = (i < gpu_req_if.rs1_data[0]); @@ -71,37 +89,113 @@ module VX_gpu_unit #( assign barrier.valid = is_bar; assign barrier.id = gpu_req_if.rs1_data[0][`NB_BITS-1:0]; - assign barrier.size_m1 = (`NW_BITS)'(gpu_req_if.rs2_data - 1); + assign barrier.size_m1 = (`NW_BITS)'(gpu_req_if.rs2_data[0] - 1); + + // pack warp ctl result + `IGNORE_WARNINGS_BEGIN + assign warp_ctl_data = {tmc, wspawn, barrier, split}; + `IGNORE_WARNINGS_END + + // texture + +`ifdef EXT_TEX_ENABLE + + `UNUSED_VAR (gpu_req_if.op_mod) + + VX_tex_req_if tex_req_if(); + VX_tex_rsp_if tex_rsp_if(); + + wire is_tex = (gpu_req_if.op_type == `GPU_TEX); 
+ + assign tex_req_if.valid = gpu_req_if.valid && is_tex; + assign tex_req_if.wid = gpu_req_if.wid; + assign tex_req_if.tmask = gpu_req_if.tmask; + assign tex_req_if.PC = gpu_req_if.PC; + assign tex_req_if.rd = gpu_req_if.rd; + assign tex_req_if.wb = gpu_req_if.wb; + + assign tex_req_if.unit = gpu_req_if.op_mod[`NTEX_BITS-1:0]; + assign tex_req_if.coords[0] = gpu_req_if.rs1_data; + assign tex_req_if.coords[1] = gpu_req_if.rs2_data; + assign tex_req_if.lod = gpu_req_if.rs3_data; + + VX_tex_unit #( + .CORE_ID(CORE_ID) + ) tex_unit ( + .clk (clk), + .reset (reset), + .tex_req_if (tex_req_if), + .tex_csr_if (tex_csr_if), + .tex_rsp_if (tex_rsp_if), + .dcache_req_if (dcache_req_if), + .dcache_rsp_if (dcache_rsp_if) + ); + + assign tex_rsp_if.ready = !stall_out; + + assign stall_in = (is_tex && ~tex_req_if.ready) + || (~is_tex && (tex_rsp_if.valid || stall_out)); + + assign is_warp_ctl = !(is_tex || tex_rsp_if.valid); + + assign rsp_valid = tex_rsp_if.valid || (gpu_req_if.valid && ~is_tex); + assign rsp_wid = tex_rsp_if.valid ? tex_rsp_if.wid : gpu_req_if.wid; + assign rsp_tmask = tex_rsp_if.valid ? tex_rsp_if.tmask : gpu_req_if.tmask; + assign rsp_PC = tex_rsp_if.valid ? tex_rsp_if.PC : gpu_req_if.PC; + assign rsp_rd = tex_rsp_if.rd; + assign rsp_wb = tex_rsp_if.valid && tex_rsp_if.wb; + assign rsp_data = tex_rsp_if.valid ? 
tex_rsp_if.data : warp_ctl_data; + +`else + + `UNUSED_VAR (gpu_req_if.op_mod) + `UNUSED_VAR (gpu_req_if.rs2_data) + `UNUSED_VAR (gpu_req_if.rs3_data) + `UNUSED_VAR (gpu_req_if.wb) + `UNUSED_VAR (gpu_req_if.rd) + + assign stall_in = stall_out; + assign is_warp_ctl = 1; + + assign rsp_valid = gpu_req_if.valid; + assign rsp_wid = gpu_req_if.wid; + assign rsp_tmask = gpu_req_if.tmask; + assign rsp_PC = gpu_req_if.PC; + assign rsp_rd = 0; + assign rsp_wb = 0; + assign rsp_data = warp_ctl_data; + +`endif + + wire is_warp_ctl_r; // output - - wire stall = ~gpu_commit_if.ready && gpu_commit_if.valid; + assign stall_out = ~gpu_commit_if.ready && gpu_commit_if.valid; VX_pipe_register #( - .DATAW (1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + `GPU_TMC_SIZE + `GPU_WSPAWN_SIZE + `GPU_SPLIT_SIZE + `GPU_BARRIER_SIZE), + .DATAW (1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32) + 1), .RESETW (1) ) pipe_reg ( .clk (clk), .reset (reset), - .enable (!stall), - .data_in ({gpu_req_if.valid, gpu_req_if.wid, gpu_req_if.tmask, gpu_req_if.PC, gpu_req_if.rd, gpu_req_if.wb, tmc, wspawn, split, barrier}), - .data_out ({gpu_commit_if.valid, gpu_commit_if.wid, gpu_commit_if.tmask, gpu_commit_if.PC, gpu_commit_if.rd, gpu_commit_if.wb, warp_ctl_if.tmc, warp_ctl_if.wspawn, warp_ctl_if.split, warp_ctl_if.barrier}) + .enable (!stall_out), + .data_in ({rsp_valid, rsp_wid, rsp_tmask, rsp_PC, rsp_rd, rsp_wb, rsp_data, is_warp_ctl}), + .data_out ({gpu_commit_if.valid, gpu_commit_if.wid, gpu_commit_if.tmask, gpu_commit_if.PC, gpu_commit_if.rd, gpu_commit_if.wb, gpu_commit_if.data, is_warp_ctl_r}) ); assign gpu_commit_if.eop = 1'b1; - assign warp_ctl_if.valid = gpu_commit_if.valid && gpu_commit_if.ready; - assign warp_ctl_if.wid = gpu_commit_if.wid; + // warp control response + + `IGNORE_WARNINGS_BEGIN + assign {warp_ctl_if.tmc, warp_ctl_if.wspawn, warp_ctl_if.barrier, warp_ctl_if.split} = gpu_commit_if.data; + `IGNORE_WARNINGS_END + assign warp_ctl_if.valid = gpu_commit_if.valid && 
gpu_commit_if.ready && is_warp_ctl_r; + assign warp_ctl_if.wid = gpu_commit_if.wid; // can accept new request? - assign gpu_req_if.ready = ~stall; + assign gpu_req_if.ready = ~stall_in; - `SCOPE_ASSIGN (gpu_req_fire, gpu_req_if.valid && gpu_req_if.ready); - `SCOPE_ASSIGN (gpu_req_wid, gpu_req_if.wid); - `SCOPE_ASSIGN (gpu_req_tmask, gpu_req_if.tmask); - `SCOPE_ASSIGN (gpu_req_op_type, gpu_req_if.op_type); - `SCOPE_ASSIGN (gpu_req_rs1, gpu_req_if.rs1_data[0]); - `SCOPE_ASSIGN (gpu_req_rs2, gpu_req_if.rs2_data); `SCOPE_ASSIGN (gpu_rsp_valid, warp_ctl_if.valid); `SCOPE_ASSIGN (gpu_rsp_wid, warp_ctl_if.wid); `SCOPE_ASSIGN (gpu_rsp_tmc, warp_ctl_if.tmc); diff --git a/hw/rtl/VX_instr_demux.v b/hw/rtl/VX_instr_demux.v index 42d4fc5e..c7d83018 100644 --- a/hw/rtl/VX_instr_demux.v +++ b/hw/rtl/VX_instr_demux.v @@ -112,14 +112,14 @@ module VX_instr_demux ( wire gpu_req_valid = ibuffer_if.valid && (ibuffer_if.ex_type == `EX_GPU); VX_skid_buffer #( - .DATAW (`NW_BITS + `NUM_THREADS + 32 + 32 + `GPU_BITS + `NR_BITS + 1 + (`NUM_THREADS * 32 + 32)) + .DATAW (`NW_BITS + `NUM_THREADS + 32 + 32 + `GPU_BITS + `MOD_BITS + `NR_BITS + 1 + (3 * `NUM_THREADS * 32)) //update number of bits ) gpu_buffer ( .clk (clk), .reset (reset), .valid_in (gpu_req_valid), .ready_in (gpu_req_ready), - .data_in ({ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, next_PC, `GPU_OP(ibuffer_if.op_type), ibuffer_if.rd, ibuffer_if.wb, gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data[0]}), - .data_out ({gpu_req_if.wid, gpu_req_if.tmask, gpu_req_if.PC, gpu_req_if.next_PC, gpu_req_if.op_type, gpu_req_if.rd, gpu_req_if.wb, gpu_req_if.rs1_data, gpu_req_if.rs2_data}), + .data_in ({ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, next_PC, `GPU_OP(ibuffer_if.op_type), ibuffer_if.op_mod, ibuffer_if.rd, ibuffer_if.wb, gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data, gpr_rsp_if.rs3_data}), + .data_out ({gpu_req_if.wid, gpu_req_if.tmask, gpu_req_if.PC, gpu_req_if.next_PC, gpu_req_if.op_type, gpu_req_if.op_mod, gpu_req_if.rd, 
gpu_req_if.wb, gpu_req_if.rs1_data, gpu_req_if.rs2_data, gpu_req_if.rs3_data}), .valid_out (gpu_req_if.valid), .ready_out (gpu_req_if.ready) ); diff --git a/hw/rtl/VX_issue.v b/hw/rtl/VX_issue.v index 9f2495f1..1edd07f6 100644 --- a/hw/rtl/VX_issue.v +++ b/hw/rtl/VX_issue.v @@ -219,6 +219,8 @@ module VX_issue #( `PRINT_ARRAY1D(gpu_req_if.rs1_data, `NUM_THREADS); $write(", rs2_data="); `PRINT_ARRAY1D(gpu_req_if.rs2_data, `NUM_THREADS); + $write(", rs3_data="); + `PRINT_ARRAY1D(gpu_req_if.rs3_data, `NUM_THREADS); $write("\n"); end end diff --git a/hw/rtl/VX_mem_unit.v b/hw/rtl/VX_mem_unit.v index 3c26f6c7..f269e84b 100644 --- a/hw/rtl/VX_mem_unit.v +++ b/hw/rtl/VX_mem_unit.v @@ -197,7 +197,7 @@ module VX_mem_unit # ( .TAG_WIDTH (`DCORE_TAG_WIDTH-`SM_ENABLE) ) smem_rsp_if(); - VX_smem_arb #( + VX_smem_arb #( .NUM_REQS (2), .LANES (`NUM_THREADS), .DATA_SIZE (4), diff --git a/hw/rtl/VX_pipeline.v b/hw/rtl/VX_pipeline.v index 85b3d6dd..b53a1e52 100644 --- a/hw/rtl/VX_pipeline.v +++ b/hw/rtl/VX_pipeline.v @@ -108,7 +108,10 @@ module VX_pipeline #( /////////////////////////////////////////////////////////////////////////// - VX_cmt_to_csr_if cmt_to_csr_if(); + VX_cmt_to_csr_if #( + .SIZE ($clog2(3*`NUM_THREADS+1)) + ) cmt_to_csr_if(); + VX_decode_if decode_if(); VX_branch_ctl_if branch_ctl_if(); VX_warp_ctl_if warp_ctl_if(); diff --git a/hw/rtl/VX_platform.vh b/hw/rtl/VX_platform.vh index bdb894a3..9c1ae729 100644 --- a/hw/rtl/VX_platform.vh +++ b/hw/rtl/VX_platform.vh @@ -85,11 +85,32 @@ `define UP(x) (((x) > 0) ? 
x : 1) -`define SAFE_RNG(h,l) `MAX(h,l) : l +`define SAFE_RNG(h, l) `MAX(h,l) : l -`define RTRIM(x,s) x[$bits(x)-1:($bits(x)-s)] +`define RTRIM(x, s) x[$bits(x)-1:($bits(x)-s)] -`define LTRIM(x,s) x[s-1:0] +`define LTRIM(x, s) x[s-1:0] + +`define PRINT_ARRAY1D(a, m) \ + $write("{"); \ + for (integer i = (m-1); i >= 0; --i) begin \ + if (i != (m-1)) $write(", "); \ + $write("0x%0h", a[i]); \ + end \ + $write("}"); \ + +`define PRINT_ARRAY2D(a, m, n) \ + $write("{"); \ + for (integer i = n-1; i >= 0; --i) begin \ + if (i != (n-1)) $write(", "); \ + $write("{"); \ + for (integer j = (m-1); j >= 0; --j) begin \ + if (j != (m-1)) $write(", "); \ + $write("0x%0h", a[i][j]); \ + end \ + $write("}"); \ + end \ + $write("}") `define PRINT_ARRAY1D(a, m) \ $write("{"); \ diff --git a/hw/rtl/VX_print_instr.vh b/hw/rtl/VX_print_instr.vh index 4559c199..88d47707 100644 --- a/hw/rtl/VX_print_instr.vh +++ b/hw/rtl/VX_print_instr.vh @@ -128,6 +128,7 @@ task print_ex_op ( `GPU_SPLIT: $write("SPLIT"); `GPU_JOIN: $write("JOIN"); `GPU_BAR: $write("BAR"); + `GPU_TEX: $write("TEX"); default: $write("?"); endcase end diff --git a/hw/rtl/VX_writeback.v b/hw/rtl/VX_writeback.v index 3020a619..424dc147 100644 --- a/hw/rtl/VX_writeback.v +++ b/hw/rtl/VX_writeback.v @@ -11,6 +11,7 @@ module VX_writeback #( VX_commit_if ld_commit_if, VX_commit_if csr_commit_if, VX_commit_if fpu_commit_if, + VX_commit_if gpu_commit_if, // outputs VX_writeback_if writeback_if @@ -28,28 +29,30 @@ module VX_writeback #( wire [`NUM_THREADS-1:0][31:0] wb_data; wire wb_eop; - wire [3:0][DATAW-1:0] rsp_data; - wire [3:0] rsp_ready; + wire [4:0][DATAW-1:0] rsp_data; + wire [4:0] rsp_ready; wire stall; wire ld_valid = ld_commit_if.valid && ld_commit_if.wb; wire fpu_valid = fpu_commit_if.valid && fpu_commit_if.wb; wire csr_valid = csr_commit_if.valid && csr_commit_if.wb; wire alu_valid = alu_commit_if.valid && alu_commit_if.wb; + wire gpu_valid = gpu_commit_if.valid && gpu_commit_if.wb; assign rsp_data[0] = { 
ld_commit_if.wid, ld_commit_if.PC, ld_commit_if.tmask, ld_commit_if.rd, ld_commit_if.data, ld_commit_if.eop}; assign rsp_data[1] = {fpu_commit_if.wid, fpu_commit_if.PC, fpu_commit_if.tmask, fpu_commit_if.rd, fpu_commit_if.data, fpu_commit_if.eop}; assign rsp_data[2] = {csr_commit_if.wid, csr_commit_if.PC, csr_commit_if.tmask, csr_commit_if.rd, csr_commit_if.data, csr_commit_if.eop}; assign rsp_data[3] = {alu_commit_if.wid, alu_commit_if.PC, alu_commit_if.tmask, alu_commit_if.rd, alu_commit_if.data, alu_commit_if.eop}; + assign rsp_data[4] = {gpu_commit_if.wid, gpu_commit_if.PC, gpu_commit_if.tmask, gpu_commit_if.rd, gpu_commit_if.data, gpu_commit_if.eop}; VX_stream_arbiter #( - .NUM_REQS (4), + .NUM_REQS (5), .DATAW (DATAW), .TYPE ("X") ) rsp_arb ( .clk (clk), .reset (reset), - .valid_in ({alu_valid, csr_valid, fpu_valid, ld_valid}), + .valid_in ({gpu_valid, alu_valid, csr_valid, fpu_valid, ld_valid}), .data_in (rsp_data), .ready_in (rsp_ready), .valid_out (wb_valid), @@ -61,6 +64,7 @@ module VX_writeback #( assign fpu_commit_if.ready = rsp_ready[1] || ~fpu_commit_if.wb; assign csr_commit_if.ready = rsp_ready[2] || ~csr_commit_if.wb; assign alu_commit_if.ready = rsp_ready[3] || ~alu_commit_if.wb; + assign gpu_commit_if.ready = rsp_ready[4] || ~gpu_commit_if.wb; assign stall = ~writeback_if.ready && writeback_if.valid; diff --git a/hw/rtl/afu/vortex_afu.sv b/hw/rtl/afu/vortex_afu.sv index 75c55518..1a084490 100644 --- a/hw/rtl/afu/vortex_afu.sv +++ b/hw/rtl/afu/vortex_afu.sv @@ -45,8 +45,10 @@ localparam CCI_DATA_WIDTH = $bits(t_ccip_clData); localparam CCI_DATA_SIZE = CCI_DATA_WIDTH / 8; localparam CCI_ADDR_WIDTH = 32 - $clog2(CCI_DATA_SIZE); + localparam AVS_RD_QUEUE_SIZE = 4; -localparam AVS_REQ_TAGW_VX = `MAX(`VX_MEM_TAG_WIDTH, `VX_MEM_TAG_WIDTH + $clog2(LMEM_DATA_WIDTH) - $clog2(`VX_MEM_DATA_WIDTH)); +localparam _AVS_REQ_TAGW_VX = `VX_MEM_TAG_WIDTH + $clog2(LMEM_DATA_WIDTH) - $clog2(`VX_MEM_DATA_WIDTH); +localparam AVS_REQ_TAGW_VX = `MAX(`VX_MEM_TAG_WIDTH, 
_AVS_REQ_TAGW_VX); localparam AVS_REQ_TAGW_CCI = `MAX(CCI_ADDR_WIDTH, CCI_ADDR_WIDTH + $clog2(LMEM_DATA_WIDTH) - $clog2(CCI_DATA_WIDTH)); localparam AVS_REQ_TAGW = `MAX(AVS_REQ_TAGW_VX, AVS_REQ_TAGW_CCI); diff --git a/hw/rtl/cache/VX_bank.v b/hw/rtl/cache/VX_bank.v index 6e8a89fb..4fc2f0a1 100644 --- a/hw/rtl/cache/VX_bank.v +++ b/hw/rtl/cache/VX_bank.v @@ -253,7 +253,7 @@ module VX_bank #( `ifdef DBG_CACHE_REQ_INFO if (CORE_TAG_WIDTH != CORE_TAG_ID_BITS && CORE_TAG_ID_BITS != 0) begin - assign {debug_pc_st0, debug_wid_st0} = tag_st0[CORE_TAG_WIDTH-1:CORE_TAG_ID_BITS]; + assign {debug_pc_st0, debug_wid_st0} = tag_st0[`CACHE_REQ_INFO_RNG]; end else begin assign {debug_pc_st0, debug_wid_st0} = 0; end @@ -322,7 +322,7 @@ module VX_bank #( `ifdef DBG_CACHE_REQ_INFO if (CORE_TAG_WIDTH != CORE_TAG_ID_BITS && CORE_TAG_ID_BITS != 0) begin - assign {debug_pc_st1, debug_wid_st1} = tag_st1[CORE_TAG_WIDTH-1:CORE_TAG_ID_BITS]; + assign {debug_pc_st1, debug_wid_st1} = tag_st1[`CACHE_REQ_INFO_RNG]; end else begin assign {debug_pc_st1, debug_wid_st1} = 0; end diff --git a/hw/rtl/cache/VX_cache_define.vh b/hw/rtl/cache/VX_cache_define.vh index 17b37a12..5299f387 100644 --- a/hw/rtl/cache/VX_cache_define.vh +++ b/hw/rtl/cache/VX_cache_define.vh @@ -1,5 +1,5 @@ -`ifndef VX_CACHE_CONFIG -`define VX_CACHE_CONFIG +`ifndef VX_CACHE_DEFINE +`define VX_CACHE_DEFINE `include "VX_platform.vh" @@ -51,17 +51,19 @@ `define LINE_TAG_ADDR(x) x[`LINE_ADDR_WIDTH-1 : `LINE_SELECT_BITS] +`define CACHE_REQ_INFO_RNG CORE_TAG_WIDTH-1:(CORE_TAG_WIDTH-`NW_BITS-32) + /////////////////////////////////////////////////////////////////////////////// `define CORE_RSP_TAGS ((CORE_TAG_ID_BITS != 0) ? 1 : NUM_REQS) `define BANK_READY_COUNT ((SHARED_BANK_READY != 0) ? 
1 : NUM_BANKS) -`define MEM_ADDR_BANK(x) x[`BANK_SELECT_BITS+BANK_ADDR_OFFSET-1 : BANK_ADDR_OFFSET] +`define MEM_ADDR_BANK(x) x[`BANK_SELECT_BITS+BANK_ADDR_OFFSET-1 : BANK_ADDR_OFFSET] -`define MEM_TO_LINE_ADDR(x) x[`MEM_ADDR_WIDTH-1 : `BANK_SELECT_BITS] +`define MEM_TO_LINE_ADDR(x) x[`MEM_ADDR_WIDTH-1 : `BANK_SELECT_BITS] -`define LINE_TO_MEM_ADDR(x, i) {x, `BANK_SELECT_BITS'(i)} +`define LINE_TO_MEM_ADDR(x, i) {x, `BANK_SELECT_BITS'(i)} `define LINE_TO_BYTE_ADDR(x, i) {x, (32-$bits(x))'(i << (32-$bits(x)-`BANK_SELECT_BITS))} diff --git a/hw/rtl/cache/VX_shared_mem.v b/hw/rtl/cache/VX_shared_mem.v index 9156e98c..e5c9c9f9 100644 --- a/hw/rtl/cache/VX_shared_mem.v +++ b/hw/rtl/cache/VX_shared_mem.v @@ -328,7 +328,7 @@ module VX_shared_mem #( end else begin $display("%t: cache%0d:%0d core-rd-rsp: addr=%0h, tag=%0h, byteen=%b, data=%0h, wid=%0d, PC=%0h", $time, CACHE_ID, i, per_bank_core_req_addr[i], per_bank_core_req_tag[i], per_bank_core_req_byteen[i], per_bank_core_rsp_data[i], - debug_wid_st1[i], debug_pc_st1[i]); + debug_wid_st1[i], debug_pc_st1[i]); end end end diff --git a/hw/rtl/fp_cores/VX_fpu_fpnew.v b/hw/rtl/fp_cores/VX_fpu_fpnew.v index 3a8a8106..450a8594 100644 --- a/hw/rtl/fp_cores/VX_fpu_fpnew.v +++ b/hw/rtl/fp_cores/VX_fpu_fpnew.v @@ -3,8 +3,7 @@ `include "defs_div_sqrt_mvp.sv" `TRACING_OFF -module VX_fpu_fpnew -#( +module VX_fpu_fpnew #( parameter TAGW = 1, parameter FMULADD = 1, parameter FDIVSQRT = 1, diff --git a/hw/rtl/interfaces/VX_cmt_to_csr_if.v b/hw/rtl/interfaces/VX_cmt_to_csr_if.v index d805f5b4..d8d11e6e 100644 --- a/hw/rtl/interfaces/VX_cmt_to_csr_if.v +++ b/hw/rtl/interfaces/VX_cmt_to_csr_if.v @@ -3,10 +3,12 @@ `include "VX_define.vh" -interface VX_cmt_to_csr_if (); +interface VX_cmt_to_csr_if #( + parameter SIZE +)(); - wire valid; - wire [$clog2(3*`NUM_THREADS+1)-1:0] commit_size; + wire valid; + wire [SIZE-1:0] commit_size; endinterface diff --git a/hw/rtl/interfaces/VX_gpu_req_if.v b/hw/rtl/interfaces/VX_gpu_req_if.v index 
5f024ae9..499358d2 100644 --- a/hw/rtl/interfaces/VX_gpu_req_if.v +++ b/hw/rtl/interfaces/VX_gpu_req_if.v @@ -12,8 +12,10 @@ interface VX_gpu_req_if(); wire [31:0] PC; wire [31:0] next_PC; wire [`GPU_BITS-1:0] op_type; + wire [`MOD_BITS-1:0] op_mod; wire [`NUM_THREADS-1:0][31:0] rs1_data; - wire [31:0] rs2_data; + wire [`NUM_THREADS-1:0][31:0] rs2_data; + wire [`NUM_THREADS-1:0][31:0] rs3_data; wire [`NR_BITS-1:0] rd; wire wb; diff --git a/hw/rtl/interfaces/VX_tex_csr_if.v b/hw/rtl/interfaces/VX_tex_csr_if.v new file mode 100644 index 00000000..9315a59d --- /dev/null +++ b/hw/rtl/interfaces/VX_tex_csr_if.v @@ -0,0 +1,14 @@ +`ifndef VX_TEX_CSR_IF +`define VX_TEX_CSR_IF + +`include "VX_define.vh" + +interface VX_tex_csr_if (); + + wire write_enable; + wire [`CSR_ADDR_BITS-1:0] write_addr; + wire [31:0] write_data; + +endinterface + +`endif \ No newline at end of file diff --git a/hw/rtl/interfaces/VX_tex_req_if.v b/hw/rtl/interfaces/VX_tex_req_if.v new file mode 100644 index 00000000..e00a2e0e --- /dev/null +++ b/hw/rtl/interfaces/VX_tex_req_if.v @@ -0,0 +1,25 @@ +`ifndef VX_TEX_REQ_IF +`define VX_TEX_REQ_IF + +`include "VX_define.vh" + +interface VX_tex_req_if (); + + wire valid; + wire [`NW_BITS-1:0] wid; + wire [`NUM_THREADS-1:0] tmask; + wire [31:0] PC; + wire [`NR_BITS-1:0] rd; + wire wb; + + wire [`NTEX_BITS-1:0] unit; + wire [1:0][`NUM_THREADS-1:0][31:0] coords; + wire [`NUM_THREADS-1:0][31:0] lod; + + wire ready; + +endinterface +`endif + + + \ No newline at end of file diff --git a/hw/rtl/interfaces/VX_tex_rsp_if.v b/hw/rtl/interfaces/VX_tex_rsp_if.v new file mode 100644 index 00000000..e0e3cbea --- /dev/null +++ b/hw/rtl/interfaces/VX_tex_rsp_if.v @@ -0,0 +1,21 @@ +`ifndef VX_TEX_RSP_IF +`define VX_TEX_RSP_IF + +`include "VX_define.vh" + +interface VX_tex_rsp_if (); + + wire valid; + wire [`NW_BITS-1:0] wid; + wire [`NUM_THREADS-1:0] tmask; + wire [31:0] PC; + wire [`NR_BITS-1:0] rd; + wire wb; + wire [`NUM_THREADS-1:0][31:0] data; + wire ready; + 
+endinterface + +`endif + + diff --git a/hw/rtl/tex_unit/VX_tex_addr.v b/hw/rtl/tex_unit/VX_tex_addr.v new file mode 100644 index 00000000..149af193 --- /dev/null +++ b/hw/rtl/tex_unit/VX_tex_addr.v @@ -0,0 +1,151 @@ +`include "VX_tex_define.vh" + +module VX_tex_addr #( + parameter CORE_ID = 0, + parameter REQ_INFO_WIDTH = 1, + parameter NUM_REQS = 1 +) ( + input wire clk, + input wire reset, + + // inputs + + input wire req_valid, + input wire [NUM_REQS-1:0] req_tmask, + input wire [1:0][NUM_REQS-1:0][31:0] req_coords, + input wire [`TEX_FORMAT_BITS-1:0] req_format, + input wire [`TEX_FILTER_BITS-1:0] req_filter, + input wire [1:0][`TEX_WRAP_BITS-1:0] req_wraps, + input wire [`TEX_ADDR_BITS-1:0] req_baseaddr, + input wire [NUM_REQS-1:0][`TEX_MIPOFF_BITS-1:0] req_mipoffset, + input wire [1:0][NUM_REQS-1:0][`TEX_DIM_BITS-1:0] req_logdims, + input wire [REQ_INFO_WIDTH-1:0] req_info, + output wire req_ready, + + // outputs + + output wire rsp_valid, + output wire [NUM_REQS-1:0] rsp_tmask, + output wire [`TEX_FILTER_BITS-1:0] rsp_filter, + output wire [`TEX_STRIDE_BITS-1:0] rsp_stride, + output wire [NUM_REQS-1:0][3:0][31:0] rsp_addr, + output wire [1:0][NUM_REQS-1:0][`BLEND_FRAC-1:0] rsp_blends, + output wire [REQ_INFO_WIDTH-1:0] rsp_info, + input wire rsp_ready +); + + `UNUSED_PARAM (CORE_ID) + + wire valid_s0; + wire [NUM_REQS-1:0] tmask_s0; + wire [`TEX_FILTER_BITS-1:0] filter_s0; + wire [REQ_INFO_WIDTH-1:0] req_info_s0; + + wire [1:0][NUM_REQS-1:0][`FIXED_FRAC-1:0] clamped_lo, clamped_lo_s0; + wire [1:0][NUM_REQS-1:0][`FIXED_FRAC-1:0] clamped_hi, clamped_hi_s0; + wire [`TEX_STRIDE_BITS-1:0] log_stride, log_stride_s0; + wire [NUM_REQS-1:0][31:0] mip_addr, mip_addr_s0; + wire [1:0][NUM_REQS-1:0][`TEX_DIM_BITS-1:0] log_dims_s0; + + wire stall_out; + + // stride + + VX_tex_stride #( + .CORE_ID (CORE_ID) + ) tex_stride ( + .format (req_format), + .log_stride (log_stride) + ); + + // addressing mode + + for (genvar i = 0; i < NUM_REQS; ++i) begin + for (genvar j = 0; j 
< 2; ++j) begin + wire [31:0] coord_lo, coord_hi; + + assign coord_lo = req_coords[j][i] - (req_filter ? (`FIXED_HALF >> req_logdims[j][i]) : 0); + assign coord_hi = req_coords[j][i] + (req_filter ? (`FIXED_HALF >> req_logdims[j][i]) : 0); + + VX_tex_wrap #( + .CORE_ID (CORE_ID) + ) tex_wrap_lo ( + .wrap_i (req_wraps[j]), + .coord_i (coord_lo), + .coord_o (clamped_lo[j][i]) + ); + + VX_tex_wrap #( + .CORE_ID (CORE_ID) + ) tex_wrap_hi ( + .wrap_i (req_wraps[j]), + .coord_i (coord_hi), + .coord_o (clamped_hi[j][i]) + ); + end + assign mip_addr[i] = req_baseaddr + 32'(req_mipoffset[i]); + end + + VX_pipe_register #( + .DATAW (1 + NUM_REQS + `TEX_FILTER_BITS + `TEX_STRIDE_BITS + REQ_INFO_WIDTH + NUM_REQS * (2 * `TEX_DIM_BITS + 32 + 2 * 2 * `FIXED_FRAC)), + .RESETW (1) + ) pipe_reg0 ( + .clk (clk), + .reset (reset), + .enable (~stall_out), + .data_in ({req_valid, req_tmask, req_filter, log_stride, req_info, req_logdims, mip_addr, clamped_lo, clamped_hi}), + .data_out ({valid_s0, tmask_s0, filter_s0, log_stride_s0, req_info_s0, log_dims_s0, mip_addr_s0, clamped_lo_s0, clamped_hi_s0}) + ); + + // addresses generation + + wire [1:0][NUM_REQS-1:0][`FIXED_INT-1:0] scaled_lo, scaled_hi; + wire [1:0][NUM_REQS-1:0][`BLEND_FRAC-1:0] blends; + wire [NUM_REQS-1:0][3:0][31:0] addr; + + for (genvar i = 0; i < NUM_REQS; ++i) begin + for (genvar j = 0; j < 2; ++j) begin + assign scaled_lo[j][i] = `FIXED_INT'(clamped_lo_s0[j][i] >> ((`FIXED_FRAC) - log_dims_s0[j][i])); + assign scaled_hi[j][i] = `FIXED_INT'(clamped_hi_s0[j][i] >> ((`FIXED_FRAC) - log_dims_s0[j][i])); + assign blends[j][i] = filter_s0 ? 
clamped_lo_s0[j][i][`BLEND_FRAC-1:0] : `BLEND_FRAC'(0); + end + end + + for (genvar i = 0; i < NUM_REQS; ++i) begin // byte address = mip base + (texel index << log_stride); parenthesized because binary + binds tighter than << + assign addr[i][0] = mip_addr_s0[i] + ((32'(scaled_lo[0][i]) + (32'(scaled_lo[1][i]) << log_dims_s0[0][i])) << log_stride_s0); + assign addr[i][1] = mip_addr_s0[i] + ((32'(scaled_hi[0][i]) + (32'(scaled_lo[1][i]) << log_dims_s0[0][i])) << log_stride_s0); + assign addr[i][2] = mip_addr_s0[i] + ((32'(scaled_lo[0][i]) + (32'(scaled_hi[1][i]) << log_dims_s0[0][i])) << log_stride_s0); + assign addr[i][3] = mip_addr_s0[i] + ((32'(scaled_hi[0][i]) + (32'(scaled_hi[1][i]) << log_dims_s0[0][i])) << log_stride_s0); + end + + assign stall_out = rsp_valid && ~rsp_ready; + + VX_pipe_register #( + .DATAW (1 + NUM_REQS + `TEX_FILTER_BITS + `TEX_STRIDE_BITS + (NUM_REQS * 4 * 32) + (2 * NUM_REQS * `BLEND_FRAC) + REQ_INFO_WIDTH), + .RESETW (1) + ) pipe_reg1 ( + .clk (clk), + .reset (reset), + .enable (~stall_out), + .data_in ({valid_s0, tmask_s0, filter_s0, log_stride_s0, addr, blends, req_info_s0}), + .data_out ({rsp_valid, rsp_tmask, rsp_filter, rsp_stride, rsp_addr, rsp_blends, rsp_info}) + ); + + assign req_ready = ~stall_out; + + `ifdef DBG_PRINT_TEX + wire [`NW_BITS-1:0] rsp_wid; + wire [31:0] rsp_PC; + assign {rsp_wid, rsp_PC} = rsp_info[`NW_BITS+32-1:0]; + + always @(posedge clk) begin + if (rsp_valid && rsp_ready) begin + $write("%t: core%0d-tex-addr: wid=%0d, PC=%0h, tmask=%b, req_filter=%0d, stride=%0d, addr=", + $time, CORE_ID, rsp_wid, rsp_PC, rsp_tmask, rsp_filter, rsp_stride); + `PRINT_ARRAY2D(rsp_addr, 4, NUM_REQS); + $write("\n"); + end + end +`endif + +endmodule \ No newline at end of file diff --git a/hw/rtl/tex_unit/VX_tex_define.vh b/hw/rtl/tex_unit/VX_tex_define.vh new file mode 100644 index 00000000..2b87d4da --- /dev/null +++ b/hw/rtl/tex_unit/VX_tex_define.vh @@ -0,0 +1,42 @@ +`ifndef VX_TEX_DEFINE +`define VX_TEX_DEFINE + +`include "VX_define.vh" + +`define FIXED_FRAC 20 +`define FIXED_INT (32 - `FIXED_FRAC) +`define FIXED_ONE (2 ** `FIXED_FRAC)
+`define FIXED_HALF (`FIXED_ONE >> 1) +`define FIXED_MASK (`FIXED_ONE - 1) + +`define CLAMP(x,lo,hi) (($signed(x) < $signed(lo)) ? lo : ((x > hi) ? hi : x)) + +`define TEX_ADDR_BITS 32 +`define TEX_FORMAT_BITS 3 +`define TEX_WRAP_BITS 2 +`define TEX_DIM_BITS 4 +`define TEX_FILTER_BITS 1 + +`define TEX_MIPOFF_BITS (2*12+1) +`define TEX_STRIDE_BITS 2 + +`define TEX_LOD_BITS 4 +`define TEX_MIP_BITS (`NTEX_BITS + `TEX_LOD_BITS) + +`define TEX_WRAP_CLAMP 0 +`define TEX_WRAP_REPEAT 1 +`define TEX_WRAP_MIRROR 2 + +`define TEX_COLOR_BITS 8 + +`define BLEND_FRAC 8 +`define BLEND_ONE (2 ** `BLEND_FRAC) + +`define TEX_FORMAT_R8G8B8A8 `TEX_FORMAT_BITS'(0) +`define TEX_FORMAT_R5G6B5 `TEX_FORMAT_BITS'(1) +`define TEX_FORMAT_R4G4B4A4 `TEX_FORMAT_BITS'(2) +`define TEX_FORMAT_L8A8 `TEX_FORMAT_BITS'(3) +`define TEX_FORMAT_L8 `TEX_FORMAT_BITS'(4) +`define TEX_FORMAT_A8 `TEX_FORMAT_BITS'(5) + +`endif \ No newline at end of file diff --git a/hw/rtl/tex_unit/VX_tex_format.v b/hw/rtl/tex_unit/VX_tex_format.v new file mode 100644 index 00000000..e3e7351d --- /dev/null +++ b/hw/rtl/tex_unit/VX_tex_format.v @@ -0,0 +1,58 @@ +`include "VX_tex_define.vh" + +module VX_tex_format #( + parameter CORE_ID = 0 +) ( + input wire [`TEX_FORMAT_BITS-1:0] format, + input wire [31:0] texel_in, + output wire [31:0] texel_out +); + `UNUSED_PARAM (CORE_ID) + + reg [31:0] texel_out_r; + + always @(*) begin + case (format) + `TEX_FORMAT_R5G6B5: begin + texel_out_r[07:00] = `TEX_COLOR_BITS'({texel_in[15:11],texel_in[15:13]}); + texel_out_r[15:08] = `TEX_COLOR_BITS'({texel_in[10:5],texel_in[10:9]}); + texel_out_r[23:16] = `TEX_COLOR_BITS'({texel_in[4:0],texel_in[4:2]}); + texel_out_r[31:24] = {`TEX_COLOR_BITS{1'b1}}; + end + `TEX_FORMAT_R4G4B4A4: begin // each 4-bit channel is widened to 8 bits by repeating its own nibble + texel_out_r[07:00] = `TEX_COLOR_BITS'({2{texel_in[11:8]}}); + texel_out_r[15:08] = `TEX_COLOR_BITS'({2{texel_in[7:4]}}); + texel_out_r[23:16] = `TEX_COLOR_BITS'({2{texel_in[3:0]}}); + texel_out_r[31:24] = `TEX_COLOR_BITS'({2{texel_in[15:12]}}); +
end + `TEX_FORMAT_L8A8: begin + texel_out_r[07:00] = `TEX_COLOR_BITS'(texel_in[7:0]); + texel_out_r[15:08] = `TEX_COLOR_BITS'(texel_in[7:0]); + texel_out_r[23:16] = `TEX_COLOR_BITS'(texel_in[7:0]); + texel_out_r[31:24] = `TEX_COLOR_BITS'(texel_in[15:8]); + end + `TEX_FORMAT_A8: begin + texel_out_r[07:00] = `TEX_COLOR_BITS'(0); + texel_out_r[15:08] = `TEX_COLOR_BITS'(0); + texel_out_r[23:16] = `TEX_COLOR_BITS'(0); + texel_out_r[31:24] = `TEX_COLOR_BITS'(texel_in[7:0]); + end + `TEX_FORMAT_L8: begin + texel_out_r[07:00] = `TEX_COLOR_BITS'(texel_in[7:0]); + texel_out_r[15:08] = `TEX_COLOR_BITS'(texel_in[7:0]); + texel_out_r[23:16] = `TEX_COLOR_BITS'(texel_in[7:0]); + texel_out_r[31:24] = {`TEX_COLOR_BITS{1'b1}}; + end + // `TEX_FORMAT_R8G8B8A8 + default: begin + texel_out_r[07:00] = `TEX_COLOR_BITS'(texel_in[7:0]); + texel_out_r[15:08] = `TEX_COLOR_BITS'(texel_in[15:8]); + texel_out_r[23:16] = `TEX_COLOR_BITS'(texel_in[23:16]); + texel_out_r[31:24] = `TEX_COLOR_BITS'(texel_in[31:24]); + end + endcase + end + + assign texel_out = texel_out_r; + +endmodule diff --git a/hw/rtl/tex_unit/VX_tex_lerp.v b/hw/rtl/tex_unit/VX_tex_lerp.v new file mode 100644 index 00000000..5495688e --- /dev/null +++ b/hw/rtl/tex_unit/VX_tex_lerp.v @@ -0,0 +1,17 @@ +`include "VX_tex_define.vh" + +module VX_tex_lerp #( +) ( + input wire [`BLEND_FRAC-1:0] blend, + input wire [31:0] in1, + input wire [31:0] in2, + output wire [31:0] out +); + for (genvar i = 0; i < 4; ++i) begin + wire [8:0] blend_m1 = `BLEND_ONE - blend; + wire [16:0] sum = in1[i*8+:8] * blend_m1 + in2[i*8+:8] * blend; + `UNUSED_VAR (sum) + assign out[i*8+:8] = sum[15:8]; + end + +endmodule \ No newline at end of file diff --git a/hw/rtl/tex_unit/VX_tex_lsu_arb.v b/hw/rtl/tex_unit/VX_tex_lsu_arb.v new file mode 100644 index 00000000..04e742e8 --- /dev/null +++ b/hw/rtl/tex_unit/VX_tex_lsu_arb.v @@ -0,0 +1,128 @@ +`include "../cache/VX_cache_define.vh" + +module VX_tex_lsu_arb #( + parameter NUM_REQS = 1, + parameter LANES = 1, + 
parameter WORD_SIZE = 1, + parameter TAG_IN_WIDTH = 1, + parameter TAG_OUT_WIDTH = 1, + parameter LOG_NUM_REQS = `CLOG2(NUM_REQS) +) ( + input wire clk, + input wire reset, + + // input requests + input wire [NUM_REQS-1:0][LANES-1:0] req_valid_in, + input wire [NUM_REQS-1:0][LANES-1:0] req_rw_in, + input wire [NUM_REQS-1:0][LANES-1:0][WORD_SIZE-1:0] req_byteen_in, + input wire [NUM_REQS-1:0][LANES-1:0][`WORD_ADDR_WIDTH-1:0] req_addr_in, + input wire [NUM_REQS-1:0][LANES-1:0][`WORD_WIDTH-1:0] req_data_in, + input wire [NUM_REQS-1:0][LANES-1:0][TAG_IN_WIDTH-1:0] req_tag_in, + output wire [NUM_REQS-1:0][LANES-1:0] req_ready_in, + + // output request + output wire [LANES-1:0] req_valid_out, + output wire [LANES-1:0] req_rw_out, + output wire [LANES-1:0][WORD_SIZE-1:0] req_byteen_out, + output wire [LANES-1:0][`WORD_ADDR_WIDTH-1:0] req_addr_out, + output wire [LANES-1:0][`WORD_WIDTH-1:0] req_data_out, + output wire [LANES-1:0][TAG_OUT_WIDTH-1:0] req_tag_out, + input wire [LANES-1:0] req_ready_out, + + // input response + input wire [LANES-1:0] rsp_valid_in, + input wire [LANES-1:0][`WORD_WIDTH-1:0] rsp_data_in, + input wire [TAG_OUT_WIDTH-1:0] rsp_tag_in, + output wire rsp_ready_in, + + // output responses + output wire [NUM_REQS-1:0][LANES-1:0] rsp_valid_out, + output wire [NUM_REQS-1:0][LANES-1:0][`WORD_WIDTH-1:0] rsp_data_out, + output wire [NUM_REQS-1:0][TAG_IN_WIDTH-1:0] rsp_tag_out, + input wire [NUM_REQS-1:0] rsp_ready_out +); + localparam REQ_DATAW = LANES * (1 + TAG_IN_WIDTH + `WORD_ADDR_WIDTH + 1 + WORD_SIZE + `WORD_WIDTH); + + if (NUM_REQS > 1) begin + + wire [NUM_REQS-1:0][REQ_DATAW-1:0] req_merged_data_in; + wire [NUM_REQS-1:0] req_valid_in_any; + + for (genvar i = 0; i < NUM_REQS; i++) begin + assign req_merged_data_in[i] = {req_valid_in[i], req_tag_in[i], req_addr_in[i], req_rw_in[i], req_byteen_in[i], req_data_in[i]}; + assign req_valid_in_any[i] = (| req_valid_in[i]); + end + + wire sel_valid; + wire [LOG_NUM_REQS-1:0] sel_idx; + wire [NUM_REQS-1:0] 
sel_1hot; + + wire sel_enable = (| req_ready_out); + + VX_rr_arbiter #( + .NUM_REQS(NUM_REQS), + .LOCK_ENABLE(1) + ) sel_arb ( + .clk (clk), + .reset (reset), + .requests (req_valid_in_any), + .enable (sel_enable), + .grant_valid (sel_valid), + .grant_index (sel_idx), + .grant_onehot (sel_1hot) + ); + + wire [LANES-1:0] req_valid_out_unqual; + wire [LANES-1:0][TAG_IN_WIDTH-1:0] req_tag_out_unqual; + + assign {req_valid_out_unqual, req_tag_out_unqual, req_addr_out, req_rw_out, req_byteen_out, req_data_out} = req_merged_data_in[sel_idx]; + + assign req_valid_out = req_valid_out_unqual & {LANES{sel_valid}}; + + for (genvar i = 0; i < LANES; i++) begin + assign req_tag_out[i] = {req_tag_out_unqual[i], sel_idx}; + end + + for (genvar i = 0; i < NUM_REQS; i++) begin + assign req_ready_in[i] = req_ready_out & {LANES{sel_1hot[i]}}; + end + + /////////////////////////////////////////////////////////////////////// + + wire [LOG_NUM_REQS-1:0] rsp_sel = rsp_tag_in[LOG_NUM_REQS-1:0]; + + reg [NUM_REQS-1:0][LANES-1:0] rsp_valid_out_unqual; + always @(*) begin + rsp_valid_out_unqual = '0; + rsp_valid_out_unqual[rsp_sel] = rsp_valid_in; + end + assign rsp_valid_out = rsp_valid_out_unqual; + + for (genvar i = 0; i < NUM_REQS; i++) begin + assign rsp_data_out[i] = rsp_data_in; + assign rsp_tag_out[i] = rsp_tag_in[LOG_NUM_REQS +: TAG_IN_WIDTH]; + end + + assign rsp_ready_in = rsp_ready_out[rsp_sel]; + + end else begin + + `UNUSED_VAR (clk) + `UNUSED_VAR (reset) + + assign req_valid_out = req_valid_in; + assign req_tag_out = req_tag_in; + assign req_addr_out = req_addr_in; + assign req_rw_out = req_rw_in; + assign req_byteen_out = req_byteen_in; + assign req_data_out = req_data_in; + assign req_ready_in = req_ready_out; + + assign rsp_valid_out = rsp_valid_in; + assign rsp_tag_out = rsp_tag_in; + assign rsp_data_out = rsp_data_in; + assign rsp_ready_in = rsp_ready_out; + + end + +endmodule \ No newline at end of file diff --git a/hw/rtl/tex_unit/VX_tex_memory.v 
b/hw/rtl/tex_unit/VX_tex_memory.v new file mode 100644 index 00000000..8c307e03 --- /dev/null +++ b/hw/rtl/tex_unit/VX_tex_memory.v @@ -0,0 +1,288 @@ +`include "VX_tex_define.vh" +module VX_tex_memory #( + parameter CORE_ID = 0, + parameter REQ_INFO_WIDTH = 1, + parameter NUM_REQS = 1 +) ( + input wire clk, + input wire reset, + + // memory interface + VX_dcache_req_if dcache_req_if, + VX_dcache_rsp_if dcache_rsp_if, + + // inputs + input wire req_valid, + input wire [NUM_REQS-1:0] req_tmask, + input wire [`TEX_FILTER_BITS-1:0] req_filter, + input wire [`TEX_STRIDE_BITS-1:0] req_stride, + input wire [NUM_REQS-1:0][3:0][31:0] req_addr, + input wire [REQ_INFO_WIDTH-1:0] req_info, + output wire req_ready, + + // outputs + output wire rsp_valid, + output wire [NUM_REQS-1:0] rsp_tmask, + output wire [NUM_REQS-1:0][3:0][31:0] rsp_data, + output wire [REQ_INFO_WIDTH-1:0] rsp_info, + input wire rsp_ready +); + + `UNUSED_PARAM (CORE_ID) + + localparam RSP_CTR_W = $clog2(NUM_REQS * 4 + 1); + + wire [3:0] dup_reqs; + wire [3:0][NUM_REQS-1:0][29:0] req_addr_w; + wire [3:0][NUM_REQS-1:0][1:0] align_offs; + + // reorder address into quads + + for (genvar i = 0; i < NUM_REQS; ++i) begin + for (genvar j = 0; j < 4; ++j) begin + assign req_addr_w[j][i] = req_addr[i][j][31:2]; + assign align_offs[j][i] = req_addr[i][j][1:0]; + end + end + + // find duplicate addresses + + for (genvar i = 0; i < 4; ++i) begin + wire [NUM_REQS-1:0] addr_matches; + for (genvar j = 0; j < NUM_REQS; j++) begin + assign addr_matches[j] = (req_addr_w[i][0] == req_addr_w[i][j]) || ~req_tmask[j]; + end + assign dup_reqs[i] = req_tmask[0] && (& addr_matches); + end + + // save request addresses into fifo + + wire reqq_push, reqq_pop, reqq_empty, reqq_full; + + wire [3:0][NUM_REQS-1:0][29:0] q_req_addr; + wire [NUM_REQS-1:0] q_req_tmask; + wire [`TEX_FILTER_BITS-1:0] q_req_filter; + wire [REQ_INFO_WIDTH-1:0] q_req_info; + wire [`TEX_STRIDE_BITS-1:0] q_req_stride; + wire [3:0][NUM_REQS-1:0][1:0] q_align_offs; + 
wire [3:0] q_dup_reqs; + + assign reqq_push = req_valid && req_ready; + + VX_fifo_queue #( + .DATAW ((NUM_REQS * 4 * 30) + NUM_REQS + REQ_INFO_WIDTH + `TEX_FILTER_BITS + `TEX_STRIDE_BITS + (4 * NUM_REQS * 2) + 4), + .SIZE (`LSUQ_SIZE), + .OUTPUT_REG (1) + ) req_queue ( + .clk (clk), + .reset (reset), + .push (reqq_push), + .pop (reqq_pop), + .data_in ({req_addr_w, req_tmask, req_info, req_filter, req_stride, align_offs, dup_reqs}), + .data_out ({q_req_addr, q_req_tmask, q_req_info, q_req_filter, q_req_stride, q_align_offs, q_dup_reqs}), + .empty (reqq_empty), + .full (reqq_full), + `UNUSED_PIN (alm_full), + `UNUSED_PIN (alm_empty), + `UNUSED_PIN (size) + ); + + // can take more requests? + assign req_ready = ~reqq_full; + + /////////////////////////////////////////////////////////////////////////// + + wire req_texel_valid; + wire sent_all_ready, last_texel_sent; + wire req_texel_dup; + wire [NUM_REQS-1:0][29:0] req_texel_addr; + reg [1:0] req_texel_idx; + reg req_texels_done; + + always @(posedge clk) begin + if (reset || last_texel_sent) begin + req_texel_idx <= 0; + end else if (req_texel_valid && sent_all_ready) begin + req_texel_idx <= req_texel_idx + 1; + end + end + + always @(posedge clk) begin + if (reset || reqq_pop) begin + req_texels_done <= 0; + end else if (last_texel_sent) begin + req_texels_done <= 1; + end + end + + assign req_texel_valid = ~reqq_empty && ~req_texels_done; + assign req_texel_addr = q_req_addr[req_texel_idx]; + assign req_texel_dup = q_dup_reqs[req_texel_idx]; + + wire is_last_texel = (req_texel_idx == (q_req_filter ? 
3 : 0)); + assign last_texel_sent = req_texel_valid && sent_all_ready && is_last_texel; + + // DCache Request + + reg [NUM_REQS-1:0] texel_sent_mask; + wire [NUM_REQS-1:0] dcache_req_fire; + wire [NUM_REQS-1:0] req_dup_mask; + + assign dcache_req_fire = dcache_req_if.valid & dcache_req_if.ready; + + assign sent_all_ready = (&(dcache_req_if.ready | texel_sent_mask | ~q_req_tmask)) + || (req_texel_dup & dcache_req_if.ready[0]); + + always @(posedge clk) begin + if (reset || sent_all_ready) begin + texel_sent_mask <= 0; + end else begin + texel_sent_mask <= texel_sent_mask | dcache_req_fire; + end + end + + assign req_dup_mask = {{(NUM_REQS-1){~req_texel_dup}}, 1'b1}; + + assign dcache_req_if.valid = {NUM_REQS{req_texel_valid}} & q_req_tmask & req_dup_mask & ~texel_sent_mask; + assign dcache_req_if.rw = {NUM_REQS{1'b0}}; + assign dcache_req_if.addr = req_texel_addr; + assign dcache_req_if.byteen = {NUM_REQS{4'b1111}}; + assign dcache_req_if.data = 'x; + +`ifdef DBG_CACHE_REQ_INFO + wire [`NW_BITS-1:0] q_req_wid; + wire [31:0] q_req_PC; + assign {q_req_wid, q_req_PC} = q_req_info[`NW_BITS+32-1:0]; + assign dcache_req_if.tag = {NUM_REQS{q_req_PC, q_req_wid, req_texel_idx}}; +`else + assign dcache_req_if.tag = {NUM_REQS{req_texel_idx}}; +`endif + + // Dcache Response + + reg [3:0][NUM_REQS-1:0][31:0] rsp_texels, rsp_texels_n; + wire [NUM_REQS-1:0][3:0][31:0] rsp_texels_qual; + reg [NUM_REQS-1:0][31:0] rsp_data_qual; + reg [RSP_CTR_W-1:0] rsp_rem_ctr; + wire [NUM_REQS-1:0] rsp_cur_tmask; + wire [$clog2(NUM_REQS + 1)-1:0] rsp_cur_cnt; + wire dcache_rsp_fire; + wire [1:0] rsp_texel_idx; + wire rsp_texel_dup; + + assign rsp_texel_idx = dcache_rsp_if.tag[1:0]; + + assign rsp_texel_dup = q_dup_reqs[rsp_texel_idx]; + + assign dcache_rsp_fire = dcache_rsp_if.valid && dcache_rsp_if.ready; + + assign rsp_cur_tmask = rsp_texel_dup ? 
q_req_tmask : dcache_rsp_if.tmask; + + assign rsp_cur_cnt = $countones(rsp_cur_tmask); + + for (genvar i = 0; i < NUM_REQS; i++) begin + wire [31:0] src_mask = {32{dcache_rsp_if.tmask[i]}}; + wire [31:0] src_data = ((i == 0 || rsp_texel_dup) ? dcache_rsp_if.data[0] : (dcache_rsp_if.data[i]) & src_mask); + + reg [31:0] rsp_data_shifted; + always @(*) begin + rsp_data_shifted[31:16] = src_data[31:16]; + rsp_data_shifted[15:0] = q_align_offs[rsp_texel_idx][i][1] ? src_data[31:16] : src_data[15:0]; + rsp_data_shifted[7:0] = q_align_offs[rsp_texel_idx][i][0] ? rsp_data_shifted[15:8] : rsp_data_shifted[7:0]; + end + + always @(*) begin + case (q_req_stride) + 0: rsp_data_qual[i] = 32'(rsp_data_shifted[7:0]); + 1: rsp_data_qual[i] = 32'(rsp_data_shifted[15:0]); + default: rsp_data_qual[i] = rsp_data_shifted; + endcase + end + end + + always @(*) begin + rsp_texels_n = rsp_texels; + rsp_texels_n[rsp_texel_idx] |= rsp_data_qual; + end + + always @(posedge clk) begin + if (reset || reqq_pop) begin + rsp_texels <= '0; + end else if (dcache_rsp_fire) begin + rsp_texels <= rsp_texels_n; + end + end + + always @(posedge clk) begin + if (reset) begin + rsp_rem_ctr <= 0; + end else begin + if ((| dcache_req_fire) && 0 == rsp_rem_ctr) begin + rsp_rem_ctr <= q_req_filter ? 
{$countones(q_req_tmask), 2'b0} : {2'b0, $countones(q_req_tmask)}; + end else if (dcache_rsp_fire) begin + rsp_rem_ctr <= rsp_rem_ctr - RSP_CTR_W'(rsp_cur_cnt); + end + end + end + + for (genvar i = 0; i < NUM_REQS; ++i) begin + for (genvar j = 0; j < 4; ++j) begin + assign rsp_texels_qual[i][j] = rsp_texels_n[j][i]; + end + end + + wire stall_out = rsp_valid && ~rsp_ready; + + wire rsp_texels_done = dcache_rsp_fire && (rsp_rem_ctr == RSP_CTR_W'(rsp_cur_cnt)); + + assign reqq_pop = rsp_texels_done && ~stall_out; + + VX_pipe_register #( + .DATAW (1 + NUM_REQS + REQ_INFO_WIDTH + (4 * NUM_REQS * 32)), + .RESETW (1) + ) rsp_pipe_reg ( + .clk (clk), + .reset (reset), + .enable (~stall_out), + .data_in ({rsp_texels_done, q_req_tmask, q_req_info, rsp_texels_qual}), + .data_out ({rsp_valid, rsp_tmask, rsp_info, rsp_data}) + ); + + // Can accept new cache response? + assign dcache_rsp_if.ready = ~stall_out || (rsp_rem_ctr != RSP_CTR_W'(rsp_cur_cnt)); + +`ifdef DBG_PRINT_TEX + wire [`NW_BITS-1:0] req_wid, rsp_wid; + wire [31:0] req_PC, rsp_PC; + assign {req_wid, req_PC} = req_info[`NW_BITS+32-1:0]; + assign {rsp_wid, rsp_PC} = rsp_info[`NW_BITS+32-1:0]; + + always @(posedge clk) begin + if ((| dcache_req_fire)) begin + $write("%t: core%0d-tex-cache-req: wid=%0d, PC=%0h, tmask=%b, tag=%0h, addr=", + $time, CORE_ID, q_req_wid, q_req_PC, dcache_req_fire, dcache_req_if.tag); + `PRINT_ARRAY1D(req_texel_addr, NUM_REQS); + $write(", is_dup=%b\n", req_texel_dup); + end + if (dcache_rsp_fire) begin // trace the response lane mask; valid is always 1 inside this branch + $write("%t: core%0d-tex-cache-rsp: wid=%0d, PC=%0h, tmask=%b, tag=%0h, data=", + $time, CORE_ID, q_req_wid, q_req_PC, dcache_rsp_if.tmask, dcache_rsp_if.tag); + `PRINT_ARRAY1D(rsp_data_qual, NUM_REQS); + $write("\n"); + end + if (req_valid && req_ready) begin + $write("%t: core%0d-tex-mem-req: wid=%0d, PC=%0h, tmask=%b, filter=%0d, stride=%0d, addr=", + $time, CORE_ID, req_wid, req_PC, req_tmask, req_filter, req_stride); + `PRINT_ARRAY2D(req_addr, 4, NUM_REQS); + $write("\n"); + end + if
(rsp_valid && rsp_ready) begin + $write("%t: core%0d-tex-mem-rsp: wid=%0d, PC=%0h, tmask=%b, data=", + $time, CORE_ID, rsp_wid, rsp_PC, rsp_tmask); + `PRINT_ARRAY2D(rsp_data, 4, NUM_REQS); + $write("\n"); + end + end +`endif + +endmodule diff --git a/hw/rtl/tex_unit/VX_tex_sampler.v b/hw/rtl/tex_unit/VX_tex_sampler.v new file mode 100644 index 00000000..75d0a5bb --- /dev/null +++ b/hw/rtl/tex_unit/VX_tex_sampler.v @@ -0,0 +1,136 @@ +`include "VX_tex_define.vh" + +module VX_tex_sampler #( + parameter CORE_ID = 0, + parameter REQ_INFO_WIDTH = 1, + parameter NUM_REQS = 1 +) ( + input wire clk, + input wire reset, + + // inputs (tmask carries one bit per request lane, matching the pipe register DATAW) + input wire req_valid, + input wire [NUM_REQS-1:0] req_tmask, + input wire [`TEX_FORMAT_BITS-1:0] req_format, + input wire [1:0][NUM_REQS-1:0][`BLEND_FRAC-1:0] req_blends, + input wire [NUM_REQS-1:0][3:0][31:0] req_data, + input wire [REQ_INFO_WIDTH-1:0] req_info, + output wire req_ready, + + // outputs + output wire rsp_valid, + output wire [NUM_REQS-1:0] rsp_tmask, + output wire [NUM_REQS-1:0][31:0] rsp_data, + output wire [REQ_INFO_WIDTH-1:0] rsp_info, + input wire rsp_ready +); + + `UNUSED_PARAM (CORE_ID) + + wire valid_s0; + wire [NUM_REQS-1:0] tmask_s0; + wire [REQ_INFO_WIDTH-1:0] req_info_s0; + wire [NUM_REQS-1:0][31:0] texel_ul, texel_uh; + wire [NUM_REQS-1:0][31:0] texel_ul_s0, texel_uh_s0; + wire [NUM_REQS-1:0][`BLEND_FRAC-1:0] blend_v_s0; + wire [NUM_REQS-1:0][31:0] texel_v; + + wire stall_out; + + for (genvar i = 0; i < NUM_REQS; i++) begin + + wire [3:0][31:0] fmt_texels; + + for (genvar j = 0; j < 4; j++) begin + VX_tex_format #( + .CORE_ID (CORE_ID) + ) tex_format ( + .format (req_format), + .texel_in (req_data[i][j]), + .texel_out (fmt_texels[j]) + ); + end + + VX_tex_lerp #( + ) tex_lerp_ul ( + .blend (req_blends[0][i]), + .in1 (fmt_texels[0]), + .in2 (fmt_texels[1]), + .out (texel_ul[i]) + ); + + VX_tex_lerp #( + ) tex_lerp_uh ( + .blend (req_blends[0][i]), + .in1 (fmt_texels[2]), + .in2 (fmt_texels[3]), + .out
(texel_uh[i]) + ); + end + + VX_pipe_register #( + .DATAW (1 + NUM_REQS + REQ_INFO_WIDTH + (NUM_REQS * `BLEND_FRAC) + (2 * NUM_REQS * 32)), + .RESETW (1) + ) pipe_reg0 ( + .clk (clk), + .reset (reset), + .enable (~stall_out), + .data_in ({req_valid, req_tmask, req_info, req_blends[1], texel_ul, texel_uh}), + .data_out ({valid_s0, tmask_s0, req_info_s0, blend_v_s0, texel_ul_s0, texel_uh_s0}) + ); + + for (genvar i = 0; i < NUM_REQS; i++) begin + VX_tex_lerp #( + ) tex_lerp_v ( + .blend (blend_v_s0[i]), + .in1 (texel_ul_s0[i]), + .in2 (texel_uh_s0[i]), + .out (texel_v[i]) + ); + end + + assign stall_out = rsp_valid && ~rsp_ready; + + VX_pipe_register #( + .DATAW (1 + NUM_REQS + REQ_INFO_WIDTH + (NUM_REQS * 32)), + .RESETW (1) + ) pipe_reg1 ( + .clk (clk), + .reset (reset), + .enable (~stall_out), + .data_in ({valid_s0, tmask_s0, req_info_s0, texel_v}), + .data_out ({rsp_valid, rsp_tmask, rsp_info, rsp_data}) + ); + + // can accept new request? + assign req_ready = ~stall_out; + +`ifdef DBG_PRINT_TEX + + wire [`NW_BITS-1:0] req_wid, rsp_wid; + wire [31:0] req_PC, rsp_PC; + + assign {req_wid, req_PC} = req_info[`NW_BITS+32-1:0]; + assign {rsp_wid, rsp_PC} = rsp_info[`NW_BITS+32-1:0]; + + always @(posedge clk) begin + if (req_valid && req_ready) begin + $write("%t: core%0d-tex-sampler-req: wid=%0d, PC=%0h, tmask=%b, format=%0d, data=", + $time, CORE_ID, req_wid, req_PC, req_tmask, req_format); + `PRINT_ARRAY2D(req_data, 4, NUM_REQS); + $write(", u0="); + `PRINT_ARRAY1D(req_blends[0], NUM_REQS); + $write(", v0="); + `PRINT_ARRAY1D(req_blends[1], NUM_REQS); + $write("\n"); + end + if (rsp_valid && rsp_ready) begin + $write("%t: core%0d-tex-sampler-rsp: wid=%0d, PC=%0h, tmask=%b, data=", + $time, CORE_ID, rsp_wid, rsp_PC, rsp_tmask); + `PRINT_ARRAY1D(rsp_data, NUM_REQS); + $write("\n"); + end + end +`endif + +endmodule \ No newline at end of file diff --git a/hw/rtl/tex_unit/VX_tex_stride.v b/hw/rtl/tex_unit/VX_tex_stride.v new file mode 100644 index 00000000..50393fe9 --- 
/dev/null +++ b/hw/rtl/tex_unit/VX_tex_stride.v @@ -0,0 +1,27 @@ +`include "VX_tex_define.vh" + +module VX_tex_stride #( + parameter CORE_ID = 0 +) ( + input wire [`TEX_FORMAT_BITS-1:0] format, + output wire [`TEX_STRIDE_BITS-1:0] log_stride +); + `UNUSED_PARAM (CORE_ID) + + reg [`TEX_STRIDE_BITS-1:0] log_stride_r; + + always @(*) begin + case (format) + `TEX_FORMAT_A8: log_stride_r = 0; + `TEX_FORMAT_L8: log_stride_r = 0; + `TEX_FORMAT_L8A8: log_stride_r = 1; + `TEX_FORMAT_R5G6B5: log_stride_r = 1; + `TEX_FORMAT_R4G4B4A4: log_stride_r = 1; + //`TEX_FORMAT_R8G8B8A8 + default: log_stride_r = 2; + endcase + end + + assign log_stride = log_stride_r; + +endmodule diff --git a/hw/rtl/tex_unit/VX_tex_unit.v b/hw/rtl/tex_unit/VX_tex_unit.v new file mode 100644 index 00000000..bb36b335 --- /dev/null +++ b/hw/rtl/tex_unit/VX_tex_unit.v @@ -0,0 +1,226 @@ +`include "VX_tex_define.vh" + +module VX_tex_unit #( + parameter CORE_ID = 0 +) ( + input wire clk, + input wire reset, + + // Texture unit <-> Memory Unit + VX_dcache_req_if dcache_req_if, + VX_dcache_rsp_if dcache_rsp_if, + + // Inputs + VX_tex_req_if tex_req_if, + VX_tex_csr_if tex_csr_if, + + // Outputs + VX_tex_rsp_if tex_rsp_if +); + + localparam REQ_INFO_WIDTH_S = `NR_BITS + 1 + `NW_BITS + 32; + localparam REQ_INFO_WIDTH_A = `TEX_FORMAT_BITS + REQ_INFO_WIDTH_S; + localparam REQ_INFO_WIDTH_M = (2 * `NUM_THREADS * `BLEND_FRAC) + REQ_INFO_WIDTH_A; + + reg [`TEX_MIPOFF_BITS-1:0] tex_mipoff [`NUM_TEX_UNITS-1:0][(1 << `TEX_LOD_BITS)-1:0]; + reg [`TEX_DIM_BITS-1:0] tex_dims [1:0][`NUM_TEX_UNITS-1:0][(1 << `TEX_LOD_BITS)-1:0]; + + reg [`TEX_ADDR_BITS-1:0] tex_baddr [`NUM_TEX_UNITS-1:0]; + reg [`TEX_FORMAT_BITS-1:0] tex_format [`NUM_TEX_UNITS-1:0]; + reg [`TEX_WRAP_BITS-1:0] tex_wraps [1:0][`NUM_TEX_UNITS-1:0]; + reg [`TEX_FILTER_BITS-1:0] tex_filter [`NUM_TEX_UNITS-1:0]; + + // CSRs programming + + for (genvar i = 0; i < `NUM_TEX_UNITS; ++i) begin + wire [`TEX_LOD_BITS-1:0] mip_level = tex_csr_if.write_data[28 +: 
`TEX_LOD_BITS]; + always @(posedge clk) begin + if (tex_csr_if.write_enable) begin + case (tex_csr_if.write_addr) + `CSR_TEX_ADDR(i) : begin + tex_baddr[i] <= tex_csr_if.write_data[`TEX_ADDR_BITS-1:0]; + end + `CSR_TEX_FORMAT(i) : begin + tex_format[i] <= tex_csr_if.write_data[`TEX_FORMAT_BITS-1:0]; + end + `CSR_TEX_WRAP(i) : begin + tex_wraps[0][i] <= tex_csr_if.write_data[0 +: `TEX_WRAP_BITS]; + tex_wraps[1][i] <= tex_csr_if.write_data[`TEX_WRAP_BITS +: `TEX_WRAP_BITS]; + end + `CSR_TEX_FILTER(i) : begin + tex_filter[i] <= tex_csr_if.write_data[`TEX_FILTER_BITS-1:0]; + end + `CSR_TEX_MIPOFF(i) : begin + tex_mipoff[i][mip_level] <= tex_csr_if.write_data[`TEX_MIPOFF_BITS-1:0]; + end + `CSR_TEX_WIDTH(i) : begin + tex_dims[0][i][mip_level] <= tex_csr_if.write_data[`TEX_DIM_BITS-1:0]; + end + `CSR_TEX_HEIGHT(i) : begin + tex_dims[1][i][mip_level] <= tex_csr_if.write_data[`TEX_DIM_BITS-1:0]; + end + endcase + end + end + end + + // mipmap attributes + + wire [`NUM_THREADS-1:0][`TEX_MIPOFF_BITS-1:0] sel_mipoff; + wire [1:0][`NUM_THREADS-1:0][`TEX_DIM_BITS-1:0] sel_dims; + + for (genvar i = 0; i < `NUM_THREADS; ++i) begin + wire [`NTEX_BITS-1:0] unit = tex_req_if.unit[`NTEX_BITS-1:0]; + wire [`TEX_LOD_BITS-1:0] mip_level = tex_req_if.lod[i][20+:`TEX_LOD_BITS]; + assign sel_mipoff[i] = tex_mipoff[unit][mip_level]; + assign sel_dims[0][i] = tex_dims[0][unit][mip_level]; + assign sel_dims[1][i] = tex_dims[1][unit][mip_level]; + end + + // address generation + + wire mem_req_valid; + wire [`NUM_THREADS-1:0] mem_req_tmask; + wire [`TEX_FILTER_BITS-1:0] mem_req_filter; + wire [`TEX_STRIDE_BITS-1:0] mem_req_stride; + wire [1:0][`NUM_THREADS-1:0][`BLEND_FRAC-1:0] mem_req_blends; + wire [`NUM_THREADS-1:0][3:0][31:0] mem_req_addr; + wire [REQ_INFO_WIDTH_A-1:0] mem_req_info; + wire mem_req_ready; + + VX_tex_addr #( + .CORE_ID (CORE_ID), + .REQ_INFO_WIDTH (REQ_INFO_WIDTH_A), + .NUM_REQS (`NUM_THREADS) + ) tex_addr ( + .clk (clk), + .reset (reset), + + .req_valid (tex_req_if.valid), 
+ .req_tmask (tex_req_if.tmask), + .req_coords (tex_req_if.coords), + .req_format (tex_format[tex_req_if.unit]), + .req_filter (tex_filter[tex_req_if.unit]), + .req_wraps ({tex_wraps[1][tex_req_if.unit], tex_wraps[0][tex_req_if.unit]}), + .req_baseaddr(tex_baddr[tex_req_if.unit]), + .req_mipoffset(sel_mipoff), + .req_logdims(sel_dims), + .req_info ({tex_format[tex_req_if.unit], tex_req_if.rd, tex_req_if.wb, tex_req_if.wid, tex_req_if.PC}), + .req_ready (tex_req_if.ready), + + .rsp_valid (mem_req_valid), + .rsp_tmask (mem_req_tmask), + .rsp_filter (mem_req_filter), + .rsp_stride (mem_req_stride), + .rsp_addr (mem_req_addr), + .rsp_blends (mem_req_blends), + .rsp_info (mem_req_info), + .rsp_ready (mem_req_ready) + ); + + // retrieve texel values from memory; the blend fractions ride along inside req_info so they come back aligned with the texels + + wire mem_rsp_valid; + wire [`NUM_THREADS-1:0] mem_rsp_tmask; + wire [`NUM_THREADS-1:0][3:0][31:0] mem_rsp_data; + wire [REQ_INFO_WIDTH_M-1:0] mem_rsp_info; + wire mem_rsp_ready; + + VX_tex_memory #( + .CORE_ID (CORE_ID), + .REQ_INFO_WIDTH (REQ_INFO_WIDTH_M), + .NUM_REQS (`NUM_THREADS) + ) tex_memory ( + .clk (clk), + .reset (reset), + + // memory interface + .dcache_req_if (dcache_req_if), + .dcache_rsp_if (dcache_rsp_if), + + // inputs + .req_valid (mem_req_valid), + .req_tmask (mem_req_tmask), + .req_filter(mem_req_filter), + .req_stride(mem_req_stride), + .req_addr (mem_req_addr), + .req_info ({mem_req_blends, mem_req_info}), + .req_ready (mem_req_ready), + + // outputs + .rsp_valid (mem_rsp_valid), + .rsp_tmask (mem_rsp_tmask), + .rsp_data (mem_rsp_data), + .rsp_info (mem_rsp_info), + .rsp_ready (mem_rsp_ready) + ); + + // apply sampler: unpack the blends and format carried in mem_rsp_info; rsp_blends keeps the [u/v][lane][frac] shape of mem_req_blends and of the sampler port + + wire [1:0][`NUM_THREADS-1:0][`BLEND_FRAC-1:0] rsp_blends; + wire [`TEX_FORMAT_BITS-1:0] rsp_format; + wire [REQ_INFO_WIDTH_S-1:0] rsp_info; + + assign {rsp_blends, rsp_format, rsp_info} = mem_rsp_info; + + VX_tex_sampler #( + .CORE_ID (CORE_ID), + .REQ_INFO_WIDTH (REQ_INFO_WIDTH_S), + .NUM_REQS (`NUM_THREADS) + ) tex_sampler ( + .clk (clk), + .reset (reset), + + // 
inputs + .req_valid (mem_rsp_valid), + .req_tmask (mem_rsp_tmask), + .req_data (mem_rsp_data), + .req_format (rsp_format), + .req_blends (rsp_blends), + .req_info (rsp_info), + .req_ready (mem_rsp_ready), + + // outputs: the sampler response drives the texture response interface directly + .rsp_valid (tex_rsp_if.valid), + .rsp_tmask (tex_rsp_if.tmask), + .rsp_data (tex_rsp_if.data), + .rsp_info ({tex_rsp_if.rd, tex_rsp_if.wb, tex_rsp_if.wid, tex_rsp_if.PC}), + .rsp_ready (tex_rsp_if.ready) + ); + +`ifdef DBG_PRINT_TEX // simulation-only tracing of CSR writes, accepted requests and returned responses + for (genvar i = 0; i < `NUM_TEX_UNITS; ++i) begin + always @(posedge clk) begin + if (tex_csr_if.write_enable + && (tex_csr_if.write_addr >= `CSR_TEX_BEGIN(i) + && tex_csr_if.write_addr < `CSR_TEX_BEGIN(i+1))) begin + $display("%t: core%0d-tex-csr: tex%0d_addr=%0h", $time, CORE_ID, i, tex_baddr[i]); + $display("%t: core%0d-tex-csr: tex%0d_format=%0h", $time, CORE_ID, i, tex_format[i]); + $display("%t: core%0d-tex-csr: tex%0d_wrap_u=%0h", $time, CORE_ID, i, tex_wraps[0][i]); + $display("%t: core%0d-tex-csr: tex%0d_wrap_v=%0h", $time, CORE_ID, i, tex_wraps[1][i]); + $display("%t: core%0d-tex-csr: tex%0d_filter=%0h", $time, CORE_ID, i, tex_filter[i]); + $display("%t: core%0d-tex-csr: tex%0d_mipoff[0]=%0h", $time, CORE_ID, i, tex_mipoff[i][0]); + $display("%t: core%0d-tex-csr: tex%0d_width[0]=%0h", $time, CORE_ID, i, tex_dims[0][i][0]); + $display("%t: core%0d-tex-csr: tex%0d_height[0]=%0h", $time, CORE_ID, i, tex_dims[1][i][0]); + end + end + end + always @(posedge clk) begin + if (tex_req_if.valid && tex_req_if.ready) begin // one record per accepted request; $write (not $display) keeps the u/v arrays on the same line + $write("%t: core%0d-tex-req: wid=%0d, PC=%0h, tmask=%b, unit=%0d, lod=%0h, u=", + $time, CORE_ID, tex_req_if.wid, tex_req_if.PC, tex_req_if.tmask, tex_req_if.unit, tex_req_if.lod); + `PRINT_ARRAY1D(tex_req_if.coords[0], `NUM_THREADS); + $write(", v="); + `PRINT_ARRAY1D(tex_req_if.coords[1], `NUM_THREADS); + $write("\n"); + end + if (tex_rsp_if.valid && tex_rsp_if.ready) begin + $write("%t: core%0d-tex-rsp: wid=%0d, PC=%0h, tmask=%b, data=", + $time, CORE_ID, tex_rsp_if.wid, tex_rsp_if.PC, 
tex_rsp_if.tmask); + `PRINT_ARRAY1D(tex_rsp_if.data, `NUM_THREADS); + $write("\n"); + end + end +`endif + +endmodule \ No newline at end of file diff --git a/hw/rtl/tex_unit/VX_tex_wrap.v b/hw/rtl/tex_unit/VX_tex_wrap.v new file mode 100644 index 00000000..75857f8f --- /dev/null +++ b/hw/rtl/tex_unit/VX_tex_wrap.v @@ -0,0 +1,32 @@ +`include "VX_tex_define.vh" + +module VX_tex_wrap #( + parameter CORE_ID = 0 +) ( + input wire [`TEX_WRAP_BITS-1:0] wrap_i, + input wire [31:0] coord_i, + output wire [`FIXED_FRAC-1:0] coord_o +); + + `UNUSED_PARAM (CORE_ID) + + reg [`FIXED_FRAC-1:0] coord_r; + + wire [31:0] clamp = `CLAMP(coord_i, 0, `FIXED_MASK); + + `UNUSED_VAR (clamp) + + always @(*) begin + case (wrap_i) + `TEX_WRAP_CLAMP: + coord_r = clamp[`FIXED_FRAC-1:0]; + `TEX_WRAP_MIRROR: + coord_r = coord_i[`FIXED_FRAC-1:0] ^ {`FIXED_FRAC{coord_i[`FIXED_FRAC]}}; + default: //`TEX_WRAP_REPEAT + coord_r = coord_i[`FIXED_FRAC-1:0]; + endcase + end + + assign coord_o = coord_r; + +endmodule \ No newline at end of file diff --git a/hw/simulate/Makefile b/hw/simulate/Makefile index 69a78505..d1c55fd3 100644 --- a/hw/simulate/Makefile +++ b/hw/simulate/Makefile @@ -17,6 +17,7 @@ DBG_PRINT_FLAGS += -DDBG_PRINT_MEM DBG_PRINT_FLAGS += -DDBG_PRINT_OPAE DBG_PRINT_FLAGS += -DDBG_PRINT_AVS DBG_PRINT_FLAGS += -DDBG_PRINT_SCOPE +DBG_PRINT_FLAGS += -DDBG_PRINT_TEX DBG_FLAGS += $(DBG_PRINT_FLAGS) DBG_FLAGS += -DDBG_CACHE_REQ_INFO @@ -36,7 +37,8 @@ RTL_DIR=../rtl DPI_DIR=../dpi FPU_INCLUDE = -I$(RTL_DIR)/fp_cores -I$(DPI_DIR) -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src -I$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl -I$(RTL_DIR)/fp_cores/fpnew/src -RTL_INCLUDE = -I$(RTL_DIR)/ -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/cache -I$(RTL_DIR)/simulate $(FPU_INCLUDE) +TEX_INCLUDE = -I$(RTL_DIR)/tex_unit +RTL_INCLUDE = -I$(RTL_DIR)/ -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/cache -I$(RTL_DIR)/simulate $(FPU_INCLUDE) 
$(TEX_INCLUDE) SRCS = simulator.cpp main.cpp SRCS += $(DPI_DIR)/util_dpi.cpp $(DPI_DIR)/float_dpi.cpp @@ -96,4 +98,4 @@ build-mt: gen-mt make -j -C obj_dir -f VVortex.mk clean: - rm -rf obj_dir + rm -rf obj_dir \ No newline at end of file diff --git a/hw/syn/opae/Makefile b/hw/syn/opae/Makefile index 2164849c..f83e7044 100644 --- a/hw/syn/opae/Makefile +++ b/hw/syn/opae/Makefile @@ -33,7 +33,8 @@ CONFIG32 := -DNUM_CLUSTERS=8 -DNUM_CORES=4 -DL2_ENABLE=0 -DL3_ENABLE=0 $(CONFIGS CONFIG64 := -DNUM_CLUSTERS=8 -DNUM_CORES=8 -DL2_ENABLE=0 -DL3_ENABLE=0 $(CONFIGS) FPU_INCLUDE = -I$(RTL_DIR)/fp_cores -I$(RTL_DIR)/fp_cores/altera/$(DEVICE_FAMILY) -RTL_INCLUDE = -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/cache $(FPU_INCLUDE) -I$(RTL_DIR) -I$(RTL_DIR)/afu +TEX_INCLUDE = -I$(RTL_DIR)/tex_unit +RTL_INCLUDE = -I$(RTL_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/cache -I$(RTL_DIR)/afu $(FPU_INCLUDE) $(TEX_INCLUDE) CFLAGS += $(RTL_INCLUDE) diff --git a/hw/syn/quartus/core/Makefile b/hw/syn/quartus/core/Makefile index 37e42df0..b976110c 100644 --- a/hw/syn/quartus/core/Makefile +++ b/hw/syn/quartus/core/Makefile @@ -12,7 +12,8 @@ FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/arria10 #FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10 FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src -RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(FPU_INCLUDE) +TEX_INCLUDE = $(RTL_DIR)/tex_unit +RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(FPU_INCLUDE);$(TEX_INCLUDE) PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf diff --git a/hw/syn/quartus/pipeline/Makefile b/hw/syn/quartus/pipeline/Makefile index bd259697..6de824b8 100644 --- a/hw/syn/quartus/pipeline/Makefile +++ b/hw/syn/quartus/pipeline/Makefile @@ -12,12 +12,12 @@ 
FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/arria10 #FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10 FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src -RTL_INCLUDE = $(FPU_INCLUDE);$(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces +TEX_INCLUDE = $(RTL_DIR)/tex_unit +RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(FPU_INCLUDE);$(TEX_INCLUDE) + PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf -# Part, Family -FAMILY = "Arria 10" -DEVICE = 10AX115N3F40E2SG +PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf # Executable Configuration SYN_ARGS = --parallel --read_settings_files=on diff --git a/hw/syn/quartus/top1/Makefile b/hw/syn/quartus/top1/Makefile index 465cb192..374f84e1 100644 --- a/hw/syn/quartus/top1/Makefile +++ b/hw/syn/quartus/top1/Makefile @@ -12,7 +12,8 @@ FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/arria10 #FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10 FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src -RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE) +TEX_INCLUDE = $(RTL_DIR)/tex_unit +RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE);$(TEX_INCLUDE) PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf diff --git a/hw/syn/quartus/top16/Makefile b/hw/syn/quartus/top16/Makefile index bd31b20a..055740b7 100644 --- a/hw/syn/quartus/top16/Makefile +++ b/hw/syn/quartus/top16/Makefile @@ -12,7 +12,8 @@ FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/arria10 #FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10 FPU_INCLUDE = 
$(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src -RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE) +TEX_INCLUDE = $(RTL_DIR)/tex_unit +RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE);$(TEX_INCLUDE) PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf diff --git a/hw/syn/quartus/top2/Makefile b/hw/syn/quartus/top2/Makefile index 31234939..f8801373 100644 --- a/hw/syn/quartus/top2/Makefile +++ b/hw/syn/quartus/top2/Makefile @@ -12,7 +12,8 @@ FPU_CORE_PATH=$(RTL_DIR)/fp_cores/altera/arria10 #FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10 FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src -RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE) +TEX_INCLUDE = $(RTL_DIR)/tex_unit +RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE);$(TEX_INCLUDE) PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf diff --git a/hw/syn/quartus/top32/Makefile b/hw/syn/quartus/top32/Makefile index 558359e5..a565fca2 100644 --- a/hw/syn/quartus/top32/Makefile +++ b/hw/syn/quartus/top32/Makefile @@ -12,7 +12,8 @@ DEVICE = 1SX280HN2F43E2VG FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10 FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src -RTL_INCLUDE = 
$(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE) +TEX_INCLUDE = $(RTL_DIR)/tex_unit +RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE);$(TEX_INCLUDE) PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf diff --git a/hw/syn/quartus/top4/Makefile b/hw/syn/quartus/top4/Makefile index 2bdd4f1d..93699407 100644 --- a/hw/syn/quartus/top4/Makefile +++ b/hw/syn/quartus/top4/Makefile @@ -12,7 +12,8 @@ FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/arria10 #FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10 FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src -RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE) +TEX_INCLUDE = $(RTL_DIR)/tex_unit +RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE);$(TEX_INCLUDE) PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf diff --git a/hw/syn/quartus/top64/Makefile b/hw/syn/quartus/top64/Makefile index 89e23aa4..17d7a2aa 100644 --- a/hw/syn/quartus/top64/Makefile +++ b/hw/syn/quartus/top64/Makefile @@ -12,7 +12,8 @@ DEVICE = 1SX280HN2F43E2VG FPU_CORE_PATH=$(RTL_DIR)/fp_cores/altera/stratix10 FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src -RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE) +TEX_INCLUDE = $(RTL_DIR)/tex_unit +RTL_INCLUDE = 
$(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE);$(TEX_INCLUDE) PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf diff --git a/hw/syn/quartus/top8/Makefile b/hw/syn/quartus/top8/Makefile index 07605567..d3287844 100644 --- a/hw/syn/quartus/top8/Makefile +++ b/hw/syn/quartus/top8/Makefile @@ -12,7 +12,8 @@ FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/arria10 #FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10 FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src -RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE) +TEX_INCLUDE = $(RTL_DIR)/tex_unit +RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE);$(TEX_INCLUDE) PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf diff --git a/hw/syn/quartus/unittest/Makefile b/hw/syn/quartus/unittest/Makefile index 81219d6f..3b1bc6da 100644 --- a/hw/syn/quartus/unittest/Makefile +++ b/hw/syn/quartus/unittest/Makefile @@ -12,7 +12,9 @@ FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/arria10 #FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10 FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src -RTL_INCLUDE = $(FPU_INCLUDE);$(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache +TEX_INCLUDE = $(RTL_DIR)/tex_unit +RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(FPU_INCLUDE);$(TEX_INCLUDE) + PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf # Executable Configuration diff --git a/hw/syn/quartus/vortex/Makefile 
b/hw/syn/quartus/vortex/Makefile index 48e40608..6874cce3 100644 --- a/hw/syn/quartus/vortex/Makefile +++ b/hw/syn/quartus/vortex/Makefile @@ -12,7 +12,8 @@ FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/arria10 #FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10 FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src -RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(FPU_INCLUDE) +TEX_INCLUDE = $(RTL_DIR)/tex_unit +RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(FPU_INCLUDE);$(TEX_INCLUDE) PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf diff --git a/hw/unit_tests/cache/Makefile b/hw/unit_tests/cache/Makefile index b6552316..d430badc 100644 --- a/hw/unit_tests/cache/Makefile +++ b/hw/unit_tests/cache/Makefile @@ -1,16 +1,16 @@ -PARAM += -DCACHE_SIZE=4096 -DWORD_SIZE=4 -DCACHE_LINE_SIZE=16 -DNUM_BANKS=4 -DCREQ_SIZE=4 -DMRVQ_SIZE=16 -DDFPQ_SIZE=16 -DSNRQ_SIZE=16 -DCWBQ_SIZE=4 -DDWBQ_SIZE=4 -DFQQ_SIZE=4 - +TOP = VX_cache +PARAMS += -DCACHE_SIZE=4096 -DWORD_SIZE=4 -DCACHE_LINE_SIZE=16 -DNUM_BANKS=4 -DCREQ_SIZE=4 -DMRVQ_SIZE=16 -DDFPQ_SIZE=16 -DSNRQ_SIZE=16 -DCWBQ_SIZE=4 -DDWBQ_SIZE=4 -DFQQ_SIZE=4 # control RTL debug print states DBG_PRINT_FLAGS = -DDBG_PRINT_CORE_ICACHE \ -DDBG_PRINT_CORE_DCACHE \ -DDBG_PRINT_CACHE_BANK \ -DDBG_PRINT_CACHE_SNP \ - -DDBG_PRINT_CACHE_MSHR \ + -DDBG_PRINT_CACHE_MSHR \ -DDBG_PRINT_CACHE_TAG \ -DDBG_PRINT_CACHE_DATA \ - -DDBG_PRINT_MEM \ + -DDBG_PRINT_MEM \ -DDBG_PRINT_OPAE \ -DDBG_PRINT_AVS @@ -18,29 +18,27 @@ DBG_PRINT_FLAGS = -DDBG_PRINT_CORE_ICACHE \ INCLUDE = -I../../rtl/ -I../../rtl/cache -I../../rtl/libs - SRCS = cachesim.cpp testbench.cpp all: build CF += -std=c++11 -fms-extensions -I../.. 
+CF += $(PARAMS) VF += --language 1800-2009 --assert -Wall --trace #-Wpedantic VF += -Wno-DECLFILENAME VF += --x-initial unique VF += -exe $(SRCS) $(INCLUDE) - -DBG += -DVCD_OUTPUT $(DBG_PRINT) - +VF += $(PARAMS) gen: - verilator $(VF) -DNDEBUG -cc VX_cache.v $(PARAM) -CFLAGS '$(CF) -DNDEBUG $(PARAM)' --exe $(SRCS) + verilator $(VF) -cc $(TOP).v -CFLAGS '$(CF)' --exe $(SRCS) build: gen - (cd obj_dir && make -j -f VVX_cache.mk) + (cd obj_dir && make -j -f V$(TOP).mk) run: build - (cd obj_dir && ./VVX_cache) + (cd obj_dir && ./V$(TOP)) clean: rm -rf obj_dir diff --git a/hw/unit_tests/cache/cachesim.cpp b/hw/unit_tests/cache/cachesim.cpp index 951740d8..736b5cb2 100644 --- a/hw/unit_tests/cache/cachesim.cpp +++ b/hw/unit_tests/cache/cachesim.cpp @@ -173,10 +173,10 @@ void CacheSim::stall_mem(){ } void CacheSim::send_snoop_req(){ - cache_->snp_req_valid = 1; + /*cache_->snp_req_valid = 1; cache_->snp_req_addr = 0x12222222; cache_->snp_req_invalidate = 1; - cache_->snp_req_tag = 0xff; + cache_->snp_req_tag = 0xff; */ } void CacheSim::eval_mem_bus() { @@ -274,9 +274,9 @@ bool CacheSim::assert_equal(unsigned int* data, unsigned int tag){ //DEBUG void CacheSim::display_miss(){ - int i = (unsigned int)cache_->miss_vec; - std::bitset<8> x(i); - if (i) std::cout << "Miss Vec " << x << std::endl; + //int i = (unsigned int)cache_->miss_vec; + //std::bitset<8> x(i); + //if (i) std::cout << "Miss Vec " << x << std::endl; //std::cout << "Miss Vec 0" << cache_->miss_vec[0] << std::endl; } diff --git a/hw/unit_tests/generic_queue/Makefile b/hw/unit_tests/generic_queue/Makefile index 76d53af2..f13d14a1 100644 --- a/hw/unit_tests/generic_queue/Makefile +++ b/hw/unit_tests/generic_queue/Makefile @@ -1,11 +1,30 @@ -all: testbench.iv +TOP = VX_fifo_queue -testbench.iv: testbench.v - iverilog testbench.v -o testbench.iv -I ../../rtl/ +PARAMS ?= -run: testbench.iv - ! 
vvp testbench.iv | grep 'ERROR' || false +INCLUDE = -I../../rtl/ -I../../rtl/libs + +SRCS = main.cpp + +all: build + +CF += -std=c++11 -fms-extensions -I../.. +VF += $(PARAMS) + +VF += --language 1800-2009 --assert -Wall --trace +VF += -Wno-DECLFILENAME +VF += --x-initial unique +VF += -exe $(SRCS) $(INCLUDE) +VF += $(PARAMS) + +gen: + verilator $(VF) -cc $(TOP).v -CFLAGS '$(CF)' --exe $(SRCS) + +build: gen + (cd obj_dir && make -j -f V$(TOP).mk) + +run: build + (cd obj_dir && ./V$(TOP)) clean: - rm testbench.iv - + rm -rf obj_dir diff --git a/hw/unit_tests/generic_queue/main.cpp b/hw/unit_tests/generic_queue/main.cpp new file mode 100644 index 00000000..c753a7c8 --- /dev/null +++ b/hw/unit_tests/generic_queue/main.cpp @@ -0,0 +1,93 @@ +#include "vl_simulator.h" +#include "VVX_fifo_queue.h" +#include + +#define MAX_TICKS 20 + +#define CHECK(x) \ + do { \ + if (x) \ + break; \ + std::cout << "FAILED: " << #x << std::endl; \ + std::abort(); \ + } while (false) + +uint64_t ticks = 0; + +double sc_time_stamp() { + return ticks; +} + +using Device = VVX_fifo_queue; + +int main(int argc, char **argv) { + // Initialize Verilators variables + Verilated::commandArgs(argc, argv); + + vl_simulator sim; + + // run test + ticks = sim.reset(0); + while (ticks < MAX_TICKS) { + switch (ticks) { + case 0: + // initial values + sim->pop = 0; + sim->push = 0; + ticks = sim.step(ticks, 2); + break; + case 2: + // Verify outputs + CHECK(sim->full == 0x0); + CHECK(sim->empty == 0x1); + // push 0xa + sim->pop = 0; + sim->push = 1; + sim->data_in = 0xa; + break; + case 4: + // verify outputs + CHECK(sim->data_out == 0xa); + CHECK(sim->full == 0x0); + CHECK(sim->empty == 0x0); + // push 0xb + sim->pop = 0; + sim->push = 1; + sim->data_in = 0xb; + break; + case 6: + // verify outputs + CHECK(sim->data_out == 0xa); + CHECK(sim->full == 0x1); + CHECK(sim->empty == 0x0); + // pop + sim->pop = 1; + sim->push = 0; + break; + case 8: + // verify outputs + CHECK(sim->data_out == 0xb); + 
CHECK(sim->full == 0x0); + CHECK(sim->empty == 0x0); + // pop + sim->pop = 1; + sim->push = 0; + break; + case 10: + // verify outputs + CHECK(sim->full == 0x0); + CHECK(sim->empty == 0x1); + sim->pop = 0; + sim->push = 0; + break; + } + + // advance clock + ticks = sim.step(ticks, 2); + } + + std::cout << "PASSED!" << std::endl; + std::cout << "Simulation time: " << std::dec << ticks/2 << " cycles" << std::endl; + + return 0; +} \ No newline at end of file diff --git a/hw/unit_tests/generic_queue/vl_simulator.h b/hw/unit_tests/generic_queue/vl_simulator.h new file mode 100644 index 00000000..16486adf --- /dev/null +++ b/hw/unit_tests/generic_queue/vl_simulator.h @@ -0,0 +1,81 @@ +#pragma once + +#include <array> // NOTE(review): the two system header names were lost in extraction; <array> and <cstdint> cover what this file uses, confirm against the original +#include <cstdint> +#include "verilated.h" + +#ifdef VM_TRACE +#include <verilated_vcd_c.h> // Trace file format header +#endif + +template <typename T> +class vl_simulator { // owns the Verilated model T and drives its clk and reset pins; dumps a VCD trace when VM_TRACE is defined +private: + + T top_; +#ifdef VM_TRACE + VerilatedVcdC tfp_; +#endif + +public: + + vl_simulator() { + top_.clk = 0; + top_.reset = 0; + #ifdef VM_TRACE + Verilated::traceEverOn(true); + top_.trace(&tfp_, 99); + tfp_.open("trace.vcd"); + #endif + } + + ~vl_simulator() { + #ifdef VM_TRACE + tfp_.close(); + #endif + top_.final(); + } + + uint64_t reset(uint64_t ticks) { // asserts reset across two step() counts, releases it, and returns the advanced tick count + top_.reset = 1; + ticks = this->step(ticks, 2); + top_.reset = 0; + return ticks; + } + + uint64_t step(uint64_t ticks, uint32_t count = 1) { // each count is one clock toggle: eval, optional VCD dump, flip clk, bump ticks + while (count--) { + top_.eval(); + #ifdef VM_TRACE + tfp_.dump(ticks); + #endif + top_.clk = !top_.clk; + ++ticks; + } + return ticks; + } + + T* operator->() { // direct access to the model's ports + return &top_; + } +}; + +template <typename... Args> +void vl_setw(uint32_t* sig, Args&&... args) { // writes the arguments, in order, to sig[0], sig[1], ... + std::array<uint32_t, sizeof... (Args)> arr{static_cast<uint32_t>(std::forward<Args>(args))...}; + for (size_t i = 0; i < sizeof... (Args); ++i) { + sig[i] = arr[i]; + } +} + +template <typename... Args> +int vl_cmpw(const uint32_t* sig, Args&&... args) { // compares sig[i] with the i-th argument in order; returns -1 or 1 at the first difference, 0 when all words match + std::array<uint32_t, sizeof... (Args)> arr{static_cast<uint32_t>(std::forward<Args>(args))...}; + for (size_t i = 0; i < sizeof... 
(Args); ++i) { + if (sig[i] < arr[i]) + return -1; + if (sig[i] > arr[i]) + return 1; + } + return 0; +} \ No newline at end of file diff --git a/hw/unit_tests/tex_unit/tex_sampler/Makefile b/hw/unit_tests/tex_unit/tex_sampler/Makefile new file mode 100644 index 00000000..c6de8aa1 --- /dev/null +++ b/hw/unit_tests/tex_unit/tex_sampler/Makefile @@ -0,0 +1,30 @@ +TOP = VX_tex_sampler + +PARAMS ?= + +INCLUDE = -I../../../rtl/ -I../../../rtl/libs -I../../../rtl/tex_unit + +SRCS = main.cpp + +all: build + +CF += -std=c++11 -fms-extensions -I../.. +VF += $(PARAMS) + +VF += --language 1800-2009 --assert -Wall --trace +VF += -Wno-DECLFILENAME +VF += --x-initial unique +VF += -exe $(SRCS) $(INCLUDE) +VF += $(PARAMS) + +gen: + verilator $(VF) -cc $(TOP).v -CFLAGS '$(CF)' --exe $(SRCS) + +build: gen + (cd obj_dir && make -j -f V$(TOP).mk) + +run: build + (cd obj_dir && ./V$(TOP)) + +clean: + rm -rf obj_dir diff --git a/hw/unit_tests/tex_unit/tex_sampler/main.cpp b/hw/unit_tests/tex_unit/tex_sampler/main.cpp new file mode 100644 index 00000000..a67b38cb --- /dev/null +++ b/hw/unit_tests/tex_unit/tex_sampler/main.cpp @@ -0,0 +1,215 @@ +#include "vl_simulator.h" +#include "VVX_tex_sampler.h" +#include +#include + +#define MAX_TICKS 20 +#define MAX_UNIT_CYCLES 5 +#define NUM_THREADS + +#define CHECK(x) \ + do { \ + if (x) \ + break; \ + std::cout << "FAILED: " << #x << std::endl; \ + std::abort(); \ + } while (false) + +uint64_t ticks = 0; + +// using Device = VVX_tex_sampler; + +template +class testbench +{ +private: + vl_simulator sim; + std::map input_map; + std::map output_map; + +public: + + struct UnitTest { + bool use_reset; + unsigned int num_cycles; + bool use_cmodel; + struct Output outputs[MAX_UNIT_CYCLES]; + struct Input inputs[MAX_UNIT_CYCLES]; + unsigned int num_output_check; + unsigned int check_output_cycle[MAX_UNIT_CYCLES]; + } + + struct Input { + bool req_valid; + unsigned int req_wid; + unsigned int req_tmask; + unsigned int req_PC; + unsigned int req_rd; + 
unsigned int req_wb; + unsigned int req_filter; + unsigned int req_format; + unsigned int req_u[NUM_THREADS]; + unsigned int req_v[NUM_THREADS]; + unsigned int req_texels[NUM_THREADS][4]; + bool rsp_ready; + } + + struct Output { + int output_cycle; + // outputs + bool req_ready; + bool rsp_valid; + unsigned int rsp_wid; + unsigned int rsp_tmask; + unsigned int rsp_PC; + unsigned int rsp_rd; + bool rsp_wb; + unsigned int rsp_data[NUM_THREADS]; + } + + testbench(/* args */){ + + } + + ~testbench(){ + } + + void unittest_Cmodel(struct UnitTest * test){ + int cycles = test->num_cycles; + int num_outputs = test->num_output_check; + + // struct Input* inputs = new (struct Input)[cycles]; + struct Output* outputs = new (struct Output)[num_outputs]; + + // implement c model and assign outputs to struct + + if (test->inputs[0]->req_filter == 0){ + for (int i = 0; i < NUM_THREADS; i++) + outputs[0]->rsp_data[0] = test->inputs->req_texels[i][0]; + } else { + // for (int i = 0; i < NUM_THREADS; i++){ + // uint32_t low[4], high[4]; + // for (int j = 0; j < 4; j++){ + // low[j] = test->inputs->req_texels[i][j] & 0x00ff00ff; + // high[j] = (test->inputs->req_texels[i][j] >> 8) & 0x00ff00ff; + // } + + // } + } + outputs[0]->output_cycle = 1; + test->num_cycles = 1; + test->outputs = &outputs; + + } + + void generate_test_vectors(struct UnitTest * tests, int num_tests, bool is_pipe){ + // for all unit tests create output test vectors (w w/o c-model) + int prev_test_cycle = 0; + + for (int i = 0; i < num_tests; i++) + { + int op_counter = 0; + int ip_counter = 0; + + int test_cycle = 0; + int last_ip_cycle = 0; + + struct UnitTest curr_test = tests[i]; + + if (curr_test->use_cmodel){ + unittest_Cmodel(&curr_test); + } + + for (int j = 0; j < curr_test->num_cycles; j++) + { + if (curr_test->inputs[ip_counter]->input_cycle == test_cycle){ + input_map.insert(std::make_pair(prev_test_cycle + test_cycle, curr_test->inputs[j])); + last_ip_cycle = prev_test_cycle + test_cycle; + 
ip_counter++; + } + + if (curr_test->outputs[op_counter]->output_cycle == test_cycle){ + output_map.insert(std::make_pair(prev_test_cycle + test_cycle, curr_test->outputs[op_counter])); + op_counter++; + } + + test_cycle++; + } + + if(!is_pipe){ + prev_test_cycle += (test_cycle - 1); + } + else{ + prev_test_cycle = last_ip_cycle + 1; + } + + } + + } + + /* Resets the DUT, then for every simulated cycle: applies the stimulus registered for that cycle (if any), checks the expected outputs registered for that cycle (if any), and toggles the clock twice. Stops once ticks reaches MAX_TICKS and reports success; any failed CHECK aborts first. */ void run(){ + + ticks = sim.reset(0); + int cycle = 0; + + while (ticks < MAX_TICKS) { + + auto input = input_map.find(cycle); /* map iterators: the payload struct is reached through ->second */ + auto output = output_map.find(cycle); + + if (input != input_map.end()){ + sim->req_valid = input->second.req_valid; + sim->req_wid = input->second.req_wid; + sim->req_tmask = input->second.req_tmask; + sim->req_PC = input->second.req_PC; + sim->req_rd = input->second.req_rd; + sim->req_wb = input->second.req_wb; + sim->req_filter = input->second.req_filter; + sim->req_format = input->second.req_format; + // sim->req_u = input->req_u[NUM_THREADS]; + // sim->req_v = input->req_v[NUM_THREADS]; + vl_setw(sim->req_texels, input->second.req_texels); /* NOTE(review): vl_setw/vl_cmpw appear to treat every extra argument as one word, so passing a whole array here probably needs per-word arguments - confirm */ + // sim->req_texels = input->req_texels[NUM_THREADS][4]; + sim->rsp_ready = input->second.rsp_ready; + } else{ + std::cout << "Warning! No Input on Cycle " << cycle << std::endl; + } + + if(output != output_map.end()){ + CHECK(sim->req_ready == output->second.req_ready); + CHECK(sim->rsp_valid == output->second.rsp_valid); + CHECK(sim->rsp_wid == output->second.rsp_wid); + CHECK(sim->rsp_tmask == output->second.rsp_tmask); + CHECK(sim->rsp_PC == output->second.rsp_PC); + CHECK(sim->rsp_rd == output->second.rsp_rd); + CHECK(sim->rsp_wb == output->second.rsp_wb); + CHECK(vl_cmpw(sim->rsp_data, output->second.rsp_data) == 0); /* vl_cmpw returns 0 when all words match */ + } + + cycle++; + ticks = sim.step(ticks,2); + } + + std::cout << "PASSED!"
<< std::endl; + std::cout << "Simulation time: " << std::dec << ticks/2 << " cycles" << std::endl; + } + +}; + + +double sc_time_stamp() { + return ticks; +} + +int main(int argc, char **argv) { + // Initialize Verilators variables + Verilated::commandArgs(argc, argv); + + testbench sampler_testbench; + + sampler_testbench.generate_test_vectors(tests, 1, 0); /* NOTE(review): 'tests' is not declared anywhere in this file - confirm where the UnitTest array is meant to come from */ + sampler_testbench.run(); + + + return 0; +} \ No newline at end of file diff --git a/hw/unit_tests/tex_unit/tex_sampler/vl_simulator.h b/hw/unit_tests/tex_unit/tex_sampler/vl_simulator.h new file mode 100644 index 00000000..16486adf --- /dev/null +++ b/hw/unit_tests/tex_unit/tex_sampler/vl_simulator.h @@ -0,0 +1,81 @@ +#pragma once + +#include +#include +#include "verilated.h" + +#ifdef VM_TRACE +#include // Trace file format header +#endif + +template +class vl_simulator { +private: + + T top_; +#ifdef VM_TRACE + VerilatedVcdC tfp_; +#endif + +public: + + vl_simulator() { + top_.clk = 0; + top_.reset = 0; + #ifdef VM_TRACE + Verilated::traceEverOn(true); + top_.trace(&tfp_, 99); + tfp_.open("trace.vcd"); + #endif + } + + ~vl_simulator() { + #ifdef VM_TRACE + tfp_.close(); + #endif + top_.final(); + } + + uint64_t reset(uint64_t ticks) { + top_.reset = 1; + ticks = this->step(ticks, 2); + top_.reset = 0; + return ticks; + } + + uint64_t step(uint64_t ticks, uint32_t count = 1) { + while (count--) { + top_.eval(); + #ifdef VM_TRACE + tfp_.dump(ticks); + #endif + top_.clk = !top_.clk; + ++ticks; + } + return ticks; + } + + T* operator->() { + return &top_; + } +}; + +template +void vl_setw(uint32_t* sig, Args&&... args) { + std::array arr{static_cast(std::forward(args))...}; + for (size_t i = 0; i < sizeof... (Args); ++i) { + sig[i] = arr[i]; + } +} + +template +int vl_cmpw(const uint32_t* sig, Args&&... args) { + std::array arr{static_cast(std::forward(args))...}; + for (size_t i = 0; i < sizeof... 
(Args); ++i) { + if (sig[i] < arr[i]) + return -1; + if (sig[i] > arr[i]) + return 1; + } + return 0; +} \ No newline at end of file diff --git a/runtime/include/vx_intrinsics.h b/runtime/include/vx_intrinsics.h index 41e1b5a5..e624455d 100644 --- a/runtime/include/vx_intrinsics.h +++ b/runtime/include/vx_intrinsics.h @@ -5,7 +5,62 @@ #ifdef __cplusplus extern "C" { + #endif +#ifdef __ASSEMBLY__ +#define __ASM_STR(x) x +#else +#define __ASM_STR(x) #x +#endif + +#define vx_csr_swap(csr, val) ({ \ + unsigned __v = (unsigned )(val); \ + __asm__ __volatile__ ("csrrw %0, " __ASM_STR(csr) ", %1" : "=r" (__v) : "rK" (__v) : "memory"); \ + __v; \ +}) + +#define vx_csr_read(csr) ({ \ + register unsigned __v; \ + __asm__ __volatile__ ("csrr %0, " __ASM_STR(csr) : "=r" (__v) :: "memory"); \ + __v; \ +}) + +#define vx_csr_write(csr, val) ({ \ + unsigned __v = (unsigned )(val); \ + __asm__ __volatile__ ("csrw " __ASM_STR(csr) ", %0" :: "rK" (__v) : "memory"); \ +}) + +#define vx_csr_read_set(csr, val) ({ \ + unsigned __v = (unsigned )(val); \ + __asm__ __volatile__ ("csrrs %0, " __ASM_STR(csr) ", %1" : "=r" (__v) : "rK" (__v) : "memory"); \ + __v; \ +}) + +#define vx_csr_set(csr, val) ({ \ + unsigned __v = (unsigned )(val); \ + __asm__ __volatile__ ("csrs " __ASM_STR(csr) ", %0" :: "rK" (__v) : "memory"); \ +}) + +#define vx_csr_read_clear(csr, val) ({ \ + unsigned __v = (unsigned )(val); \ + __asm__ __volatile__ ("csrrc %0, " __ASM_STR(csr) ", %1" : "=r" (__v) : "rK" (__v) : "memory"); \ + __v; \ +}) + +#define vx_csr_clear(csr, val) ({ \ + unsigned __v = (unsigned )(val); \ + __asm__ __volatile__ ("csrc " __ASM_STR(csr) ", %0" :: "rK" (__v) : "memory"); \ +}) + +// Texture load +#define vx_tex(unit, u, v, l) ({ \ + unsigned __r; \ + unsigned __u = u; \ + unsigned __v = v; \ + unsigned __l = l; \ + __asm__ __volatile__ (".insn r4 0x6b, 5, " __ASM_STR(unit) ", %0, %1, %2, %3" : "=r"(__r) : "r"(__u), "r"(__v), "r"(__l)); \ + __r; \ +}) #ifdef __ASSEMBLY__ #define __ASM_STR(x) x 
@@ -52,6 +107,16 @@ extern "C" { __asm__ __volatile__ ("csrc " __ASM_STR(csr) ", %0" :: "rK" (__v) : "memory"); \ }) +// Texture load +#define vx_tex(unit, u, v, l) ({ \ + unsigned __r; \ + unsigned __u = u; \ + unsigned __v = v; \ + unsigned __l = l; \ + __asm__ __volatile__ (".insn r4 0x6b, 5, " __ASM_STR(unit) ", %0, %1, %2, %3" : "=r"(__r) : "r"(__u), "r"(__v), "r"(__l)); \ + __r; \ +}) + // Set thread mask inline void vx_tmc(unsigned num_threads) { asm volatile (".insn s 0x6b, 0, x0, 0(%0)" :: "r"(num_threads)); @@ -76,7 +141,7 @@ inline void vx_join() { // Warp Barrier inline void vx_barrier(unsigned barried_id, unsigned num_warps) { - asm volatile (".insn s 0x6b, 4, %1, 0cd (%0)" :: "r"(barried_id), "r"(num_warps)); + asm volatile (".insn s 0x6b, 4, %1, 0(%0)" :: "r"(barried_id), "r"(num_warps)); } // Return active warp's thread id diff --git a/simX/execute.cpp b/simX/execute.cpp index 84dd1ac8..3feae285 100644 --- a/simX/execute.cpp +++ b/simX/execute.cpp @@ -555,20 +555,18 @@ void Warp::execute(const Instr &instr, Pipeline *pipeline) { // FSGNJ.S, FSGNJN.S, FSGNJX.S case 0x10: { - bool fsign1 = rsdata[0] & 0x80000000; + bool fsign1 = (rsdata[0] >> 31); uint32_t fdata1 = rsdata[0] & 0x7FFFFFFF; - bool fsign2 = rsdata[1] & 0x80000000; + bool fsign2 = (rsdata[1] >> 31); switch (func3) { case 0: // FSGNJ.S rddata = (fsign2 << 31) | fdata1; break; case 1: // FSGNJN.S - fsign2 = !fsign2; - rddata = (fsign2 << 31) | fdata1; + rddata = (!fsign2 << 31) | fdata1; break; case 2: { // FSGNJX.S - bool sign = fsign1 ^ fsign2; - rddata = (sign << 31) | fdata1; + rddata = ((fsign1 ^ fsign2) << 31) | fdata1; } break; } } break; diff --git a/tests/regression/Makefile b/tests/regression/Makefile index 714e3f27..8fd38f2b 100644 --- a/tests/regression/Makefile +++ b/tests/regression/Makefile @@ -7,6 +7,7 @@ all: $(MAKE) -C printf $(MAKE) -C diverge $(MAKE) -C fence + $(MAKE) -C tex run-simx: $(MAKE) -C basic run-simx @@ -17,6 +18,7 @@ run-simx: $(MAKE) -C printf run-simx 
$(MAKE) -C diverge run-simx $(MAKE) -C fence run-simx + $(MAKE) -C tex run-simx run-vlsim: $(MAKE) -C basic run-vlsim @@ -27,6 +29,7 @@ run-vlsim: $(MAKE) -C printf run-vlsim $(MAKE) -C diverge run-vlsim $(MAKE) -C fence run-vlsim + $(MAKE) -C tex run-vlsim clean: $(MAKE) -C basic clean @@ -37,6 +40,7 @@ clean: $(MAKE) -C printf clean $(MAKE) -C diverge clean $(MAKE) -C fence clean + $(MAKE) -C tex clean clean-all: $(MAKE) -C basic clean-all @@ -47,4 +51,5 @@ clean-all: $(MAKE) -C printf clean-all $(MAKE) -C diverge clean-all $(MAKE) -C fence clean-all + $(MAKE) -C tex clean-all diff --git a/tests/regression/tex/Makefile b/tests/regression/tex/Makefile new file mode 100644 index 00000000..25cd1f61 --- /dev/null +++ b/tests/regression/tex/Makefile @@ -0,0 +1,70 @@ +RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain +VORTEX_DRV_PATH ?= $(realpath ../../../driver) +VORTEX_RT_PATH ?= $(wildcard ../../../runtime) + +OPTS ?= -f1 + +VX_CC = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gcc +VX_CXX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-g++ +VX_DP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objdump +VX_CP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objcopy + +VX_CFLAGS += -march=rv32imf -mabi=ilp32f -O3 -Wstack-usage=1024 -ffreestanding -nostartfiles -fdata-sections -ffunction-sections +VX_CFLAGS += -I$(VORTEX_RT_PATH)/include -I$(VORTEX_RT_PATH)/../hw + +VX_LDFLAGS += -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/linker/vx_link.ld -Wl,--gc-sections $(VORTEX_RT_PATH)/libvortexrt.a + +VX_SRCS = kernel.c + +#CXXFLAGS += -std=c++11 -O2 -Wall -Wextra -Wfatal-errors +CXXFLAGS += -std=c++11 -O0 -g -Wall -Wextra -Wfatal-errors + +CXXFLAGS += -I$(VORTEX_DRV_PATH)/include + +LDFLAGS += -L$(VORTEX_DRV_PATH)/stub -lvortex + +PROJECT = tex + +SRCS = main.cpp utils.cpp + +all: $(PROJECT) kernel.bin kernel.dump + +kernel.dump: kernel.elf + $(VX_DP) -D kernel.elf > kernel.dump + +kernel.bin: kernel.elf + $(VX_CP) -O binary kernel.elf kernel.bin + +kernel.elf: $(VX_SRCS) + 
$(VX_CC) $(VX_CFLAGS) $(VX_SRCS) $(VX_LDFLAGS) -o kernel.elf + +$(PROJECT): $(SRCS) + $(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@ + +run-simx: $(PROJECT) kernel.bin + LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS) + +run-fpga: $(PROJECT) kernel.bin + LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/opae:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS) + +run-asesim: $(PROJECT) kernel.bin + LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/opae/ase:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS) + +run-vlsim: $(PROJECT) kernel.bin + LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/opae/vlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS) + +run-rtlsim: $(PROJECT) kernel.bin + LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS) + +.depend: $(SRCS) + $(CXX) $(CXXFLAGS) -MM $^ > .depend; + +clean: + rm -rf $(PROJECT) *.o .depend + +clean-all: clean + rm -rf *.elf *.bin *.dump + +ifneq ($(MAKECMDGOALS),clean) + -include .depend +endif \ No newline at end of file diff --git a/tests/regression/tex/blitter.h b/tests/regression/tex/blitter.h new file mode 100644 index 00000000..1e50dfd6 --- /dev/null +++ b/tests/regression/tex/blitter.h @@ -0,0 +1,260 @@ +#include "format.h" + +struct SurfaceDesc { + ePixelFormat Format; + uint8_t *pBits; + uint32_t Width; + uint32_t Height; + uint32_t Pitch; +}; + +class BlitTable { +public: + typedef int (*PfnCopy)(const SurfaceDesc &dstDesc, + uint32_t dstOffsetX, + uint32_t dstOffsetY, + uint32_t copyWidth, + uint32_t copyHeight, + const SurfaceDesc &srcDesc, + uint32_t srcOffsetX, + uint32_t srcOffsetY); + + BlitTable() { + for (uint32_t s = 0; s < FORMAT_COLOR_SIZE_; ++s) { + for (uint32_t d = 0; d < FORMAT_COLOR_SIZE_; ++d) { + copyFuncs_[s][d] = CopyInvalid; + } + } + + for (uint32_t s = 0; s < FORMAT_COLOR_SIZE_; ++s) { + switch (s) { + case FORMAT_A8: + case FORMAT_L8: + copyFuncs_[s][s] = CopyFast; + break; + + case FORMAT_A8L8: + 
copyFuncs_[FORMAT_A8L8][FORMAT_A8] = Copy; + copyFuncs_[FORMAT_A8L8][FORMAT_A8L8] = CopyFast; + break; + + case FORMAT_R5G6B5: + copyFuncs_[FORMAT_R5G6B5][FORMAT_L8] = Copy; + copyFuncs_[FORMAT_R5G6B5][FORMAT_R5G6B5] = CopyFast; + copyFuncs_[FORMAT_R5G6B5][FORMAT_R8G8B8] = + Copy; + copyFuncs_[FORMAT_R5G6B5][FORMAT_B8G8R8] = + Copy; + copyFuncs_[FORMAT_R5G6B5][FORMAT_A8B8G8R8] = + Copy; + copyFuncs_[FORMAT_R5G6B5][FORMAT_A8R8G8B8] = + Copy; + break; + + case FORMAT_A1R5G5B5: + copyFuncs_[FORMAT_A1R5G5B5][FORMAT_A8] = + Copy; + copyFuncs_[FORMAT_A1R5G5B5][FORMAT_L8] = + Copy; + copyFuncs_[FORMAT_A1R5G5B5][FORMAT_A8L8] = + Copy; + copyFuncs_[FORMAT_A1R5G5B5][FORMAT_R8G8B8] = + Copy; + copyFuncs_[FORMAT_A1R5G5B5][FORMAT_A8R8G8B8] = + Copy; + copyFuncs_[FORMAT_A1R5G5B5][FORMAT_R5G5B5A1] = + Copy; + copyFuncs_[FORMAT_A1R5G5B5][FORMAT_R4G4B4A4] = + Copy; + copyFuncs_[FORMAT_A1R5G5B5][FORMAT_B8G8R8] = + Copy; + copyFuncs_[FORMAT_A1R5G5B5][FORMAT_A8B8G8R8] = + Copy; + break; + + case FORMAT_A4R4G4B4: + copyFuncs_[FORMAT_A4R4G4B4][FORMAT_A8] = + Copy; + copyFuncs_[FORMAT_A4R4G4B4][FORMAT_L8] = + Copy; + copyFuncs_[FORMAT_A4R4G4B4][FORMAT_A8L8] = + Copy; + copyFuncs_[FORMAT_A4R4G4B4][FORMAT_R8G8B8] = + Copy; + copyFuncs_[FORMAT_A4R4G4B4][FORMAT_A8R8G8B8] = + Copy; + copyFuncs_[FORMAT_A4R4G4B4][FORMAT_R5G5B5A1] = + Copy; + copyFuncs_[FORMAT_A4R4G4B4][FORMAT_R4G4B4A4] = + Copy; + copyFuncs_[FORMAT_A4R4G4B4][FORMAT_B8G8R8] = + Copy; + copyFuncs_[FORMAT_A4R4G4B4][FORMAT_A8B8G8R8] = + Copy; + break; + + case FORMAT_R8G8B8: + copyFuncs_[FORMAT_R8G8B8][FORMAT_L8] = Copy; + copyFuncs_[FORMAT_R8G8B8][FORMAT_R5G6B5] = + Copy; + copyFuncs_[FORMAT_R8G8B8][FORMAT_R8G8B8] = CopyFast; + copyFuncs_[FORMAT_R8G8B8][FORMAT_B8G8R8] = + Copy; + copyFuncs_[FORMAT_R8G8B8][FORMAT_A8B8G8R8] = + Copy; + copyFuncs_[FORMAT_R8G8B8][FORMAT_A8R8G8B8] = + Copy; + break; + + case FORMAT_A8R8G8B8: + copyFuncs_[FORMAT_A8R8G8B8][FORMAT_A8] = + Copy; + copyFuncs_[FORMAT_A8R8G8B8][FORMAT_L8] = + Copy; + 
copyFuncs_[FORMAT_A8R8G8B8][FORMAT_A8L8] = + Copy; + copyFuncs_[FORMAT_A8R8G8B8][FORMAT_R5G6B5] = + Copy; + copyFuncs_[FORMAT_A8R8G8B8][FORMAT_R8G8B8] = + Copy; + copyFuncs_[FORMAT_A8R8G8B8][FORMAT_A8R8G8B8] = CopyFast; + copyFuncs_[FORMAT_A8R8G8B8][FORMAT_R5G5B5A1] = + Copy; + copyFuncs_[FORMAT_A8R8G8B8][FORMAT_R4G4B4A4] = + Copy; + copyFuncs_[FORMAT_A8R8G8B8][FORMAT_B8G8R8] = + Copy; + copyFuncs_[FORMAT_A8R8G8B8][FORMAT_A8B8G8R8] = + Copy; + break; + + case FORMAT_R5G5B5A1: + copyFuncs_[FORMAT_R5G5B5A1][FORMAT_A8] = + Copy; + copyFuncs_[FORMAT_R5G5B5A1][FORMAT_L8] = + Copy; + copyFuncs_[FORMAT_R5G5B5A1][FORMAT_A8L8] = + Copy; + copyFuncs_[FORMAT_R5G5B5A1][FORMAT_RGB] = + Copy; + copyFuncs_[FORMAT_R5G5B5A1][FORMAT_ARGB] = + Copy; + break; + + case FORMAT_R4G4B4A4: + copyFuncs_[FORMAT_R4G4B4A4][FORMAT_A8] = + Copy; + copyFuncs_[FORMAT_R4G4B4A4][FORMAT_L8] = + Copy; + copyFuncs_[FORMAT_R4G4B4A4][FORMAT_A8L8] = + Copy; + copyFuncs_[FORMAT_R4G4B4A4][FORMAT_RGB] = + Copy; + copyFuncs_[FORMAT_R4G4B4A4][FORMAT_ARGB] = + Copy; + break; + + case FORMAT_B8G8R8: + copyFuncs_[FORMAT_B8G8R8][FORMAT_L8] = Copy; + copyFuncs_[FORMAT_B8G8R8][FORMAT_RGB] = Copy; + break; + + case FORMAT_A8B8G8R8: + copyFuncs_[FORMAT_A8B8G8R8][FORMAT_A8] = + Copy; + copyFuncs_[FORMAT_A8B8G8R8][FORMAT_L8] = + Copy; + copyFuncs_[FORMAT_A8B8G8R8][FORMAT_A8L8] = + Copy; + copyFuncs_[FORMAT_A8B8G8R8][FORMAT_RGB] = + Copy; + copyFuncs_[FORMAT_A8B8G8R8][FORMAT_ARGB] = + Copy; + break; + } + } + } + + PfnCopy get(uint32_t srcFormat, uint32_t dstFormat) const { + assert(srcFormat < FORMAT_COLOR_SIZE_); + assert(dstFormat < FORMAT_COLOR_SIZE_); + return copyFuncs_[srcFormat][dstFormat]; + } + +private: + template + static int Copy(const SurfaceDesc &dstDesc, + uint32_t dstOffsetX, + uint32_t dstOffsetY, + uint32_t copyWidth, + uint32_t copyHeight, + const SurfaceDesc &srcDesc, + uint32_t srcOffsetX, + uint32_t srcOffsetY) { + auto srcBPP = TFormatInfo::CBSIZE; + auto dstBPP = TFormatInfo::CBSIZE; + auto 
srcNextLine = srcDesc.Pitch; + auto dstNextLine = dstDesc.Pitch; + + auto pbSrc = srcDesc.pBits + srcOffsetX * srcBPP + srcOffsetY * srcDesc.Pitch; + auto pbDst = dstDesc.pBits + dstOffsetX * dstBPP + dstOffsetY * dstDesc.Pitch; + + while (copyHeight--) { + auto pSrc = reinterpret_cast::TYPE *>(pbSrc); + for (auto *pDst = reinterpret_cast::TYPE *>( + pbDst), + *const pEnd = pDst + copyWidth; + pDst != pEnd; ++pDst, ++pSrc) { + auto tmp = Format::ConvertFrom(pSrc); + Format::ConvertTo(pDst, tmp); + } + + pbSrc += srcNextLine; + pbDst += dstNextLine; + } + return 0; + } + + template + static int CopyFast(const SurfaceDesc &dstDesc, + uint32_t dstOffsetX, + uint32_t dstOffsetY, + uint32_t copyWidth, + uint32_t copyHeight, + const SurfaceDesc &srcDesc, + uint32_t srcOffsetX, + uint32_t srcOffsetY) { + auto nBPP = sizeof(Type); + auto srcNextLine = srcDesc.Pitch; + auto dstNextLine = dstDesc.Pitch; + + auto pbSrc = srcDesc.pBits + srcOffsetX * nBPP + srcOffsetY * srcDesc.Pitch; + auto pbDst = dstDesc.pBits + dstOffsetX * nBPP + dstOffsetY * dstDesc.Pitch; + + while (copyHeight--) { + auto pSrc = reinterpret_cast(pbSrc); + for (auto *pDst = reinterpret_cast(pbDst), *const pEnd = pDst + copyWidth; + pDst != pEnd; ++pDst, ++pSrc) { + *pDst = *pSrc; + } + pbSrc += srcNextLine; + pbDst += dstNextLine; + } + return 0; + } + + static int CopyInvalid(const SurfaceDesc & /*dstDesc*/, + uint32_t /*dstOffsetX*/, + uint32_t /*dstOffsetY*/, + uint32_t /*copyWidth*/, + uint32_t /*copyHeight*/, + const SurfaceDesc & /*srcDesc*/, + uint32_t /*srcOffsetX*/, + uint32_t /*srcOffsetY*/) + { + std::cout << "Error: invalid format" << std::endl; + return -1; + } + + PfnCopy copyFuncs_[FORMAT_COLOR_SIZE_][FORMAT_COLOR_SIZE_]; +}; \ No newline at end of file diff --git a/tests/regression/tex/color.h b/tests/regression/tex/color.h new file mode 100644 index 00000000..708565a3 --- /dev/null +++ b/tests/regression/tex/color.h @@ -0,0 +1,68 @@ +// +// Copyright (c) Blaise Tine. All rights reserved. 
+// +// +// Use of this sample source code is subject to the terms of the Microsoft +// license agreement under which you licensed this sample source code. If +// you did not accept the terms of the license agreement, you are not +// authorized to use this sample source code. For the terms of the license, +// please see the license agreement between you and Microsoft or, if applicable, +// see the LICENSE.RTF on your install media or the root of your tools +// installation. +// THE SAMPLE SOURCE CODE IS PROVIDED "AS IS", WITH NO WARRANTIES OR +// INDEMNITIES. +// +#pragma once + +#include +#include + +/* 32-bit color whose storage can be accessed as one packed 'value', as the named channels b, g, r, a, or as the byte array m[4]. The default constructor leaves the storage uninitialized. */ struct ColorARGB { + union { + struct { + uint32_t value; + }; + struct { + uint8_t b, g, r, a; + }; + struct { + uint8_t m[4]; + }; + }; + + ColorARGB() {} + + ColorARGB(int a, int r, int g, int b) { + assert((a >= 0) && (a <= 0xff)); + assert((r >= 0) && (r <= 0xff)); + assert((g >= 0) && (g <= 0xff)); + assert((b >= 0) && (b <= 0xff)); + + this->b = static_cast(b); + this->g = static_cast(g); + this->r = static_cast(r); + this->a = static_cast(a); + } + + ColorARGB(int r, int g, int b) { + assert((r >= 0) && (r <= 0xff)); + assert((g >= 0) && (g <= 0xff)); + assert((b >= 0) && (b <= 0xff)); + + this->b = static_cast(b); + this->g = static_cast(g); + this->r = static_cast(r); this->a = 0xff; /* RGB-only colors are treated as fully opaque; previously 'a' was never written, so 'value' and operator uint32_t() read an indeterminate byte */ + } + + ColorARGB(int value) { + this->value = value; + } + + void operator=(const ColorARGB &rhs) { + this->value = rhs.value; + } + + operator uint32_t() const { + return this->value; + } +}; \ No newline at end of file diff --git a/tests/regression/tex/common.h b/tests/regression/tex/common.h new file mode 100644 index 00000000..d23c55fb --- /dev/null +++ b/tests/regression/tex/common.h @@ -0,0 +1,25 @@ +#ifndef _COMMON_H_ +#define _COMMON_H_ + +#define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000 + +struct kernel_arg_t { + uint32_t num_tasks; + uint8_t format; + uint8_t filter; + uint8_t wrap; + uint8_t use_sw; + uint32_t lod; + uint8_t src_logWidth; + uint8_t src_logHeight; + uint8_t src_stride; +
uint8_t src_pitch; + uint32_t src_ptr; + uint32_t dst_width; + uint32_t dst_height; + uint8_t dst_stride; + uint32_t dst_pitch; + uint32_t dst_ptr; +}; + +#endif \ No newline at end of file diff --git a/tests/regression/tex/earth.tga b/tests/regression/tex/earth.tga new file mode 100644 index 00000000..3c6782e1 Binary files /dev/null and b/tests/regression/tex/earth.tga differ diff --git a/tests/regression/tex/flower.tga b/tests/regression/tex/flower.tga new file mode 100644 index 00000000..94c9c57d Binary files /dev/null and b/tests/regression/tex/flower.tga differ diff --git a/tests/regression/tex/football.tga b/tests/regression/tex/football.tga new file mode 100644 index 00000000..a5544ec7 Binary files /dev/null and b/tests/regression/tex/football.tga differ diff --git a/tests/regression/tex/format.h b/tests/regression/tex/format.h new file mode 100644 index 00000000..4ee8268e --- /dev/null +++ b/tests/regression/tex/format.h @@ -0,0 +1,1022 @@ +// +// Copyright (c) Blaise Tine. All rights reserved. +// +// +// Use of this sample source code is subject to the terms of the Microsoft +// license agreement under which you licensed this sample source code. If +// you did not accept the terms of the license agreement, you are not +// authorized to use this sample source code. For the terms of the license, +// please see the license agreement between you and Microsoft or, if applicable, +// see the LICENSE.RTF on your install media or the root of your tools +// installation. +// THE SAMPLE SOURCE CODE IS PROVIDED "AS IS", WITH NO WARRANTIES OR +// INDEMNITIES. 
+// +#pragma once + +#include "int24.h" +#include "color.h" +#include + +enum ePixelFormat { + FORMAT_UNKNOWN, + FORMAT_A8, + FORMAT_L8, + FORMAT_A8L8, + FORMAT_R5G6B5, + FORMAT_A8R8G8B8, + FORMAT_A1R5G5B5, + FORMAT_R8G8B8, + FORMAT_A4R4G4B4, + FORMAT_A8B8G8R8, + FORMAT_R5G5B5A1, + FORMAT_B8G8R8, + FORMAT_R4G4B4A4, + FORMAT_COLOR_SIZE_, + FORMAT_D16 = FORMAT_COLOR_SIZE_, + FORMAT_X8S8D16, + FORMAT_PAL4_B8G8R8, + FORMAT_PAL4_A8B8G8R8, + FORMAT_PAL4_R5G6B5, + FORMAT_PAL4_R4G4B4A4, + FORMAT_PAL4_R5G5B5A1, + FORMAT_PAL8_B8G8R8, + FORMAT_PAL8_A8B8G8R8, + FORMAT_PAL8_R5G6B5, + FORMAT_PAL8_R4G4B4A4, + FORMAT_PAL8_R5G5B5A1, + FORMAT_SIZE_, +}; + +#define FORMAT_A FORMAT_A8 +#define FORMAT_RGB FORMAT_R5G6B5 +#define FORMAT_RGB_ FORMAT_R8G8B8 +#define FORMAT_ARGB FORMAT_A8R8G8B8 +#define FORMAT_ARGB_ FORMAT_A4R4G4B4 + +template +struct TFormatInfo {}; + +template <> +struct TFormatInfo { + typedef uint8_t TYPE; + + enum { + CBSIZE = 0, + }; +}; + +template <> +struct TFormatInfo { + typedef uint16_t TYPE; + + enum { + CBSIZE = 2, + ALPHA = 4, + RED = 4, + GREEN = 4, + BLUE = 4, + LERP = 4, + }; +}; + +template <> +struct TFormatInfo { + typedef uint16_t TYPE; + + enum { + CBSIZE = 2, + ALPHA = 4, + RED = 4, + GREEN = 4, + BLUE = 4, + LERP = 4, + }; +}; + +template <> +struct TFormatInfo { + typedef uint16_t TYPE; + + enum { + CBSIZE = 2, + ALPHA = 1, + RED = 5, + GREEN = 5, + BLUE = 5, + LERP = 5, + }; +}; + +template <> +struct TFormatInfo { + typedef uint16_t TYPE; + + enum { + CBSIZE = 2, + ALPHA = 1, + RED = 5, + GREEN = 5, + BLUE = 5, + LERP = 5, + }; +}; + +template <> +struct TFormatInfo { + typedef uint16_t TYPE; + + enum { + CBSIZE = 2, + RED = 5, + GREEN = 6, + BLUE = 5, + LERP = 5, + }; +}; + +template <> +struct TFormatInfo { + typedef uint24_t TYPE; + + enum { + CBSIZE = 3, + RED = 8, + GREEN = 8, + BLUE = 8, + LERP = 8, + }; +}; + +template <> +struct TFormatInfo { + typedef uint24_t TYPE; + + enum { + CBSIZE = 3, + RED = 8, + GREEN = 8, + BLUE = 8, + LERP = 8, 
+ }; +}; + +template <> +struct TFormatInfo { + typedef uint32_t TYPE; + + enum { + CBSIZE = 4, + ALPHA = 8, + RED = 8, + GREEN = 8, + BLUE = 8, + LERP = 8, + }; +}; + +template <> +struct TFormatInfo { + typedef uint32_t TYPE; + + enum { + CBSIZE = 4, + ALPHA = 8, + RED = 8, + GREEN = 8, + BLUE = 8, + LERP = 8, + }; +}; + +template <> +struct TFormatInfo { + typedef uint8_t TYPE; + + enum { + CBSIZE = 1, + ALPHA = 8, + LERP = 8, + }; +}; + +template <> +struct TFormatInfo { + typedef uint8_t TYPE; + + enum { + CBSIZE = 1, + LUMINANCE = 8, + LERP = 8, + }; +}; + +template <> +struct TFormatInfo { + typedef uint16_t TYPE; + + enum { + CBSIZE = 2, + ALPHA = 8, + LUMINANCE = 8, + LERP = 8, + }; +}; + +template <> +struct TFormatInfo { + typedef uint16_t TYPE; + + enum { + CBSIZE = 2, + DEPTH = 16, + }; +}; + +template <> +struct TFormatInfo { + typedef uint16_t TYPE; + + enum { + CBSIZE = 4, + DEPTH = 16, + STENCIL = 8, + }; +}; + +template <> +struct TFormatInfo { + typedef uint16_t TYPE; + + enum { + CBSIZE = 3, + RED = 8, + GREEN = 8, + BLUE = 8, + PALETTE = 4, + LERP = 8, + }; +}; + +template <> +struct TFormatInfo { + typedef uint16_t TYPE; + + enum { + CBSIZE = 4, + ALPHA = 8, + RED = 8, + GREEN = 8, + BLUE = 8, + PALETTE = 4, + LERP = 8, + }; +}; + +template <> +struct TFormatInfo { + typedef uint16_t TYPE; + + enum { + CBSIZE = 2, + RED = 5, + GREEN = 6, + BLUE = 5, + PALETTE = 4, + LERP = 5, + }; +}; + +template <> +struct TFormatInfo { + typedef uint16_t TYPE; + + enum { + CBSIZE = 2, + ALPHA = 4, + RED = 4, + GREEN = 4, + BLUE = 4, + PALETTE = 4, + LERP = 4, + }; +}; + +template <> +struct TFormatInfo { + typedef uint16_t TYPE; + + enum { + CBSIZE = 2, + ALPHA = 1, + RED = 5, + GREEN = 5, + BLUE = 5, + PALETTE = 4, + LERP = 5, + }; +}; + +template <> +struct TFormatInfo { + typedef uint16_t TYPE; + + enum { + CBSIZE = 3, + RED = 8, + GREEN = 8, + BLUE = 8, + PALETTE = 8, + LERP = 8, + }; +}; + +template <> +struct TFormatInfo { + typedef uint16_t TYPE; + + 
enum { + CBSIZE = 4, + ALPHA = 8, + RED = 8, + GREEN = 8, + BLUE = 8, + PALETTE = 8, + LERP = 8, + }; +}; + +template <> +struct TFormatInfo { + typedef uint16_t TYPE; + + enum { + CBSIZE = 2, + RED = 5, + GREEN = 6, + BLUE = 5, + PALETTE = 8, + LERP = 5, + }; +}; + +template <> +struct TFormatInfo { + typedef uint16_t TYPE; + + enum { + CBSIZE = 2, + ALPHA = 4, + RED = 4, + GREEN = 4, + BLUE = 4, + PALETTE = 8, + LERP = 4, + }; +}; + +template <> +struct TFormatInfo { + typedef uint16_t TYPE; + + enum { + CBSIZE = 2, + ALPHA = 1, + RED = 5, + GREEN = 5, + BLUE = 5, + PALETTE = 8, + LERP = 5, + }; +}; + +/////////////////////////////////////////////////////////////////////////////// + +#define DEF_GET_ENUM_VALUE(Name, Default) \ + template \ + struct enum_get_##Name { \ + static constexpr int value = Default; \ + }; \ + template \ + struct enum_get_##Name::type> { \ + static constexpr int value = T::Name; \ + } + +#define __formatInfo(format) \ + { \ + TFormatInfo::CBSIZE, FormatSize>::RED, \ + FormatSize>::GREEN, \ + FormatSize>::BLUE, \ + FormatSize>::ALPHA, \ + FormatSize>::LUMINANCE, \ + FormatSize>::DEPTH, \ + FormatSize>::STENCIL, \ + FormatSize>::PALETTE, \ + FormatSize>::LERP \ + } + +/////////////////////////////////////////////////////////////////////////////// + +struct FormatInfo { + uint8_t BytePerPixel; + uint8_t Red; + uint8_t Green; + uint8_t Blue; + uint8_t Alpha; + uint8_t Luminance; + uint8_t Depth; + uint8_t Stencil; + uint8_t PaletteBits; + uint8_t LerpBits; +}; + +template +class FormatSize { +protected: + DEF_GET_ENUM_VALUE(RED, 0); + DEF_GET_ENUM_VALUE(GREEN, 0); + DEF_GET_ENUM_VALUE(BLUE, 0); + DEF_GET_ENUM_VALUE(ALPHA, 0); + DEF_GET_ENUM_VALUE(LUMINANCE, 0); + DEF_GET_ENUM_VALUE(DEPTH, 0); + DEF_GET_ENUM_VALUE(STENCIL, 0); + DEF_GET_ENUM_VALUE(PALETTE, 0); + DEF_GET_ENUM_VALUE(LERP, 0); + +public: + enum { + RED = enum_get_RED::value, + GREEN = enum_get_GREEN::value, + BLUE = enum_get_BLUE::value, + ALPHA = enum_get_ALPHA::value, + 
LUMINANCE = enum_get_LUMINANCE::value, + DEPTH = enum_get_DEPTH::value, + STENCIL = enum_get_STENCIL::value, + PALETTE = enum_get_PALETTE::value, + LERP = enum_get_LERP::value, + + RGB = RED + GREEN + BLUE + LUMINANCE, + RGBA = RGB + ALPHA + }; +}; + +namespace Format { + +inline static const FormatInfo &GetInfo(ePixelFormat pixelFormat) { + static const FormatInfo sc_formatInfos[FORMAT_SIZE_] = { + __formatInfo(FORMAT_UNKNOWN), + __formatInfo(FORMAT_A8), + __formatInfo(FORMAT_L8), + __formatInfo(FORMAT_A8L8), + __formatInfo(FORMAT_RGB), + __formatInfo(FORMAT_ARGB), + __formatInfo(FORMAT_A1R5G5B5), + __formatInfo(FORMAT_RGB_), + __formatInfo(FORMAT_ARGB_), + __formatInfo(FORMAT_R4G4B4A4), + __formatInfo(FORMAT_R5G5B5A1), + __formatInfo(FORMAT_B8G8R8), + __formatInfo(FORMAT_A8B8G8R8), + __formatInfo(FORMAT_D16), + __formatInfo(FORMAT_X8S8D16), + __formatInfo(FORMAT_PAL4_B8G8R8), + __formatInfo(FORMAT_PAL4_A8B8G8R8), + __formatInfo(FORMAT_PAL4_R5G6B5), + __formatInfo(FORMAT_PAL4_R4G4B4A4), + __formatInfo(FORMAT_PAL4_R5G5B5A1), + __formatInfo(FORMAT_PAL8_B8G8R8), + __formatInfo(FORMAT_PAL8_A8B8G8R8), + __formatInfo(FORMAT_PAL8_R5G6B5), + __formatInfo(FORMAT_PAL8_R4G4B4A4), + __formatInfo(FORMAT_PAL8_R5G5B5A1), + }; + assert(pixelFormat < FORMAT_SIZE_); + return sc_formatInfos[pixelFormat]; +} + +#undef __formatInfo +#undef DEF_GET_ENUM_VALUE + +typedef ColorARGB (*pfn_convert_from)(const void *pIn); + +typedef void (*pfn_convert_to)(void *pOut, const ColorARGB &in); + +template +static uint32_t ConvertTo(const ColorARGB &color); + +template +static void ConvertTo(void *pOut, const ColorARGB &in) { + *reinterpret_cast::TYPE *>(pOut) = + static_cast::TYPE>( + ConvertTo(in)); +} + +template +static ColorARGB ConvertFrom(uint32_t in); + +template +static ColorARGB ConvertFrom(const void *pIn) { + return ConvertFrom( + *reinterpret_cast::TYPE *>(pIn)); +} + +inline static pfn_convert_to GetConvertTo(ePixelFormat pixelFormat) { + switch (pixelFormat) { + case FORMAT_A8: + 
return &ConvertTo; + case FORMAT_L8: + return &ConvertTo; + case FORMAT_A8L8: + return &ConvertTo; + case FORMAT_R5G6B5: + return &ConvertTo; + case FORMAT_A1R5G5B5: + return &ConvertTo; + case FORMAT_A4R4G4B4: + return &ConvertTo; + case FORMAT_R8G8B8: + return &ConvertTo; + case FORMAT_A8R8G8B8: + return &ConvertTo; + case FORMAT_R5G5B5A1: + return &ConvertTo; + case FORMAT_R4G4B4A4: + return &ConvertTo; + case FORMAT_B8G8R8: + return &ConvertTo; + case FORMAT_A8B8G8R8: + return &ConvertTo; + case FORMAT_D16: + return &ConvertTo; + case FORMAT_X8S8D16: + return &ConvertTo; + default: + return &ConvertTo; + } + return nullptr; +} + +inline static pfn_convert_from GetConvertFrom(ePixelFormat pixelFormat, + bool bForceAlpha) { + if (bForceAlpha) { + switch (pixelFormat) { + case FORMAT_A8: + return &ConvertFrom; + case FORMAT_L8: + return &ConvertFrom; + case FORMAT_A8L8: + return &ConvertFrom; + case FORMAT_R5G6B5: + return &ConvertFrom; + case FORMAT_A1R5G5B5: + return &ConvertFrom; + case FORMAT_A4R4G4B4: + return &ConvertFrom; + case FORMAT_R8G8B8: + return &ConvertFrom; + case FORMAT_A8R8G8B8: + return &ConvertFrom; + case FORMAT_R5G5B5A1: + return &ConvertFrom; + case FORMAT_R4G4B4A4: + return &ConvertFrom; + case FORMAT_B8G8R8: + return &ConvertFrom; + case FORMAT_A8B8G8R8: + return &ConvertFrom; + case FORMAT_D16: + return &ConvertFrom; + case FORMAT_X8S8D16: + return &ConvertFrom; + default: + return &ConvertFrom; + } + } else { + switch (pixelFormat) { + case FORMAT_A8: + return &ConvertFrom; + case FORMAT_L8: + return &ConvertFrom; + case FORMAT_A8L8: + return &ConvertFrom; + case FORMAT_R5G6B5: + return &ConvertFrom; + case FORMAT_A1R5G5B5: + return &ConvertFrom; + case FORMAT_A4R4G4B4: + return &ConvertFrom; + case FORMAT_R8G8B8: + return &ConvertFrom; + case FORMAT_A8R8G8B8: + return &ConvertFrom; + case FORMAT_R5G5B5A1: + return &ConvertFrom; + case FORMAT_R4G4B4A4: + return &ConvertFrom; + case FORMAT_B8G8R8: + return &ConvertFrom; + case 
FORMAT_A8B8G8R8: + return &ConvertFrom; + case FORMAT_D16: + return &ConvertFrom; + case FORMAT_X8S8D16: + return &ConvertFrom; + default: + return &ConvertFrom; + } + } + + return nullptr; +} + +inline static uint32_t GetNativeFormat(ePixelFormat pixelFormat) { + switch (pixelFormat) { + case FORMAT_PAL4_B8G8R8: + case FORMAT_PAL8_B8G8R8: + return FORMAT_B8G8R8; + + case FORMAT_PAL4_A8B8G8R8: + case FORMAT_PAL8_A8B8G8R8: + return FORMAT_A8B8G8R8; + + case FORMAT_PAL4_R5G6B5: + case FORMAT_PAL8_R5G6B5: + return FORMAT_R5G6B5; + + case FORMAT_PAL4_R4G4B4A4: + case FORMAT_PAL8_R4G4B4A4: + return FORMAT_R4G4B4A4; + + case FORMAT_PAL4_R5G5B5A1: + case FORMAT_PAL8_R5G5B5A1: + return FORMAT_R5G5B5A1; + + default: + return pixelFormat; + } +} + +/////////////////////////////////////////////////////////////////////////////// + +template <> +inline uint32_t ConvertTo(const ColorARGB &/*in*/) { + return 0; +} + +template <> +inline ColorARGB ConvertFrom(uint32_t /*in*/) { + return 0; +} + +template <> +inline ColorARGB ConvertFrom(uint32_t /*in*/) { + return 0; +} + +////////////////////////////////////////////////////////////////////////////// + +template <> +inline uint32_t ConvertTo(const ColorARGB &in) { + return ((in.r & 0xf8) << 8) | ((in.g & 0xfc) << 3) | (in.b >> 3); +} + +template <> +inline ColorARGB ConvertFrom(uint32_t in) { + ColorARGB ret; + ret.r = ((in >> 11) << 3) | (in >> 13); + ret.g = ((in >> 3) & 0xfc) | ((in >> 9) & 0x3); + ret.b = ((in & 0x1f) << 3) | ((in & 0x1c) >> 2); + return ret; +} + +template <> +inline ColorARGB ConvertFrom(uint32_t in) { + ColorARGB ret; + ret.a = 0xff; + ret.r = ((in >> 11) << 3) | (in >> 13); + ret.g = ((in >> 3) & 0xfc) | ((in >> 9) & 0x3); + ret.b = ((in & 0x1f) << 3) | ((in & 0x1c) >> 2); + return ret; +} + +////////////////////////////////////////////////////////////////////////////// + +template <> +inline uint32_t ConvertTo(const ColorARGB &in) { + return (in.a ? 
0x8000 : 0) | ((in.r & 0xf8) << 7) | ((in.g & 0xf8) << 2) | + (in.b >> 3); +} + +template <> +inline ColorARGB ConvertFrom(uint32_t in) { + ColorARGB ret; + ret.a = 0xff * (in >> 15); + ret.r = ((in >> 7) & 0xf8) | ((in << 1) >> 13); + ret.g = ((in >> 2) & 0xf8) | ((in >> 7) & 7); + ret.b = ((in & 0x1f) << 3) | ((in & 0x1c) >> 2); + return ret; +} + +template <> +inline ColorARGB ConvertFrom(uint32_t in) { + ColorARGB ret; + ret.a = 0xff * (in >> 15); + ret.r = ((in >> 7) & 0xf8) | ((in << 1) >> 13); + ret.g = ((in >> 2) & 0xf8) | ((in >> 7) & 7); + ret.b = ((in & 0x1f) << 3) | ((in & 0x1c) >> 2); + return ret; +} + +////////////////////////////////////////////////////////////////////////////// + +template <> +inline uint32_t ConvertTo(const ColorARGB &in) { + return ((in.r & 0xf8) << 8) | ((in.g & 0xf8) << 3) | ((in.b & 0xf8) >> 2) | + (in.a ? 0x1 : 0); +} + +template <> +inline ColorARGB ConvertFrom(uint32_t in) { + ColorARGB ret; + ret.a = 0xff * (in & 0x1); + ret.r = ((in >> 8) & 0xf8) | (in >> 13); + ret.g = ((in >> 3) & 0xf8) | ((in >> 8) & 7); + ret.b = ((in & 0x3e) << 2) | ((in & 0x3e) >> 3); + return ret; +} + +template <> +inline ColorARGB ConvertFrom(uint32_t in) { + ColorARGB ret; + ret.a = 0xff * (in & 0x1); + ret.r = ((in >> 8) & 0xf8) | (in >> 13); + ret.g = ((in >> 3) & 0xf8) | ((in >> 8) & 7); + ret.b = ((in & 0x3e) << 2) | ((in & 0x3e) >> 3); + return ret; +} + +////////////////////////////////////////////////////////////////////////////// + +template <> +inline uint32_t ConvertTo(const ColorARGB &in) { + return ((in.a & 0xf0) << 8) | ((in.r & 0xf0) << 4) | ((in.g & 0xf0) << 0) | + (in.b >> 4); +} + +template <> +inline ColorARGB ConvertFrom(uint32_t in) { + ColorARGB ret; + ret.a = ((in >> 8) & 0xf0) | (in >> 12); + ret.r = ((in >> 4) & 0xf0) | ((in >> 8) & 0x0f); + ret.g = ((in & 0xf0) >> 0) | ((in & 0xf0) >> 4); + ret.b = ((in & 0x0f) << 4) | ((in & 0x0f) >> 0); + return ret; +} + +template <> +inline ColorARGB ConvertFrom(uint32_t in) { + 
ColorARGB ret; + ret.a = ((in >> 8) & 0xf0) | (in >> 12); + ret.r = ((in >> 4) & 0xf0) | ((in >> 8) & 0x0f); + ret.g = ((in & 0xf0) >> 0) | ((in & 0xf0) >> 4); + ret.b = ((in & 0x0f) << 4) | ((in & 0x0f) >> 0); + return ret; +} + +////////////////////////////////////////////////////////////////////////////// + +template <> +inline uint32_t ConvertTo(const ColorARGB &in) { + return ((in.r & 0xf0) << 8) | ((in.g & 0xf0) << 4) | ((in.b & 0xf0) << 0) | + (in.a >> 4); +} + +template <> +inline ColorARGB ConvertFrom(uint32_t in) { + ColorARGB ret; + ret.a = ((in & 0x0f) << 4) | ((in & 0x0f) >> 0); + ret.r = ((in >> 8) & 0xf0) | (in >> 12); + ret.g = ((in >> 4) & 0xf0) | ((in >> 8) & 0x0f); + ret.b = ((in & 0xf0) >> 0) | ((in & 0xf0) >> 4); + return ret; +} + +template <> +inline ColorARGB ConvertFrom(uint32_t in) { + ColorARGB ret; + ret.a = ((in & 0x0f) << 4) | ((in & 0x0f) >> 0); + ret.r = ((in >> 8) & 0xf0) | (in >> 12); + ret.g = ((in >> 4) & 0xf0) | ((in >> 8) & 0x0f); + ret.b = ((in & 0xf0) >> 0) | ((in & 0xf0) >> 4); + return ret; +} + +////////////////////////////////////////////////////////////////////////////// + +template <> +inline uint32_t ConvertTo(const ColorARGB &in) { + return (in.r << 16) | (in.g << 8) | in.b; +} + +template <> +inline ColorARGB ConvertFrom(uint32_t in) { + ColorARGB ret; + ret.r = in >> 16; + ret.g = (in >> 8) & 0xff; + ret.b = in & 0xff; + return ret; +} + +template <> +inline ColorARGB ConvertFrom(uint32_t in) { + ColorARGB ret; + ret.a = 0xff; + ret.r = in >> 16; + ret.g = (in >> 8) & 0xff; + ret.b = in & 0xff; + return ret; +} + +////////////////////////////////////////////////////////////////////////////// + +template <> +inline uint32_t ConvertTo(const ColorARGB &in) { + return (in.b << 16) | (in.g << 8) | in.r; +} + +template <> +inline ColorARGB ConvertFrom(uint32_t in) { + ColorARGB ret; + ret.r = in & 0xff; + ret.g = (in >> 8) & 0xff; + ret.b = in >> 16; + return ret; +} + +template <> +inline ColorARGB ConvertFrom(uint32_t 
in) { + ColorARGB ret; + ret.a = 0xff; + ret.r = in & 0xff; + ret.g = (in >> 8) & 0xff; + ret.b = in >> 16; + return ret; +} + +////////////////////////////////////////////////////////////////////////////// + +template <> +inline uint32_t ConvertTo(const ColorARGB &in) { + return (in.a << 24) | (in.r << 16) | (in.g << 8) | in.b; +} + +template <> +inline ColorARGB ConvertFrom(uint32_t in) { + ColorARGB ret; + ret.a = in >> 24; + ret.r = (in >> 16) & 0xff; + ret.g = (in >> 8) & 0xff; + ret.b = in & 0xff; + return ret; +} + +template <> +inline ColorARGB ConvertFrom(uint32_t in) { + ColorARGB ret; + ret.a = in >> 24; + ret.r = (in >> 16) & 0xff; + ret.g = (in >> 8) & 0xff; + ret.b = in & 0xff; + return ret; +} + +////////////////////////////////////////////////////////////////////////////// + +template <> +inline uint32_t ConvertTo(const ColorARGB &in) { + return (in.a << 24) | (in.b << 16) | (in.g << 8) | in.r; +} + +template <> +inline ColorARGB ConvertFrom(uint32_t in) { + ColorARGB ret; + ret.a = in >> 24; + ret.r = in & 0xff; + ret.g = (in >> 8) & 0xff; + ret.b = (in >> 16) & 0xff; + return ret; +} + +template <> +inline ColorARGB ConvertFrom(uint32_t in) { + ColorARGB ret; + ret.a = in >> 24; + ret.r = in & 0xff; + ret.g = (in >> 8) & 0xff; + ret.b = (in >> 16) & 0xff; + return ret; +} + +////////////////////////////////////////////////////////////////////////////// + +template <> +inline uint32_t ConvertTo(const ColorARGB &in) { + return in.a; +} + +template <> +inline ColorARGB ConvertFrom(uint32_t in) { + ColorARGB ret; + ret.a = in; + return ret; +} + +template <> +inline ColorARGB ConvertFrom(uint32_t in) { + ColorARGB ret; + ret.a = in; + return ret; +} + +////////////////////////////////////////////////////////////////////////////// + +template <> +inline uint32_t ConvertTo(const ColorARGB &in) { + return in.r; +} + +template <> +inline ColorARGB ConvertFrom(uint32_t in) { + ColorARGB ret; + ret.r = in; + ret.g = in; + ret.b = in; + return ret; +} + 
+template <> +inline ColorARGB ConvertFrom(uint32_t in) { + ColorARGB ret; + ret.a = 0xff; + ret.r = in; + ret.g = in; + ret.b = in; + return ret; +} + +////////////////////////////////////////////////////////////////////////////// + +template <> +inline uint32_t ConvertTo(const ColorARGB &in) { + return (in.a << 8) | in.r; +} + +template <> +inline ColorARGB ConvertFrom(uint32_t in) { + ColorARGB ret; + ret.a = in >> 8; + ret.r = in & 0xff; + ret.g = in & 0xff; + ret.b = in & 0xff; + return ret; +} + +template <> +inline ColorARGB ConvertFrom(uint32_t in) { + ColorARGB ret; + ret.a = in >> 8; + ret.r = in & 0xff; + ret.g = in & 0xff; + ret.b = in & 0xff; + return ret; +} + +////////////////////////////////////////////////////////////////////////////// + +template <> +inline uint32_t ConvertTo(const ColorARGB &in) { + return in.value; +} + +template <> +inline ColorARGB ConvertFrom(uint32_t in) { + ColorARGB ret; + ret.value = in; + return ret; +} + +////////////////////////////////////////////////////////////////////////////// + +template <> +inline uint32_t ConvertTo(const ColorARGB &in) { + return in.b; +} + +template <> +inline ColorARGB ConvertFrom(uint32_t in) { + ColorARGB ret; + ret.value = in; + return ret; +} + +} // namespace Format \ No newline at end of file diff --git a/tests/regression/tex/int24.h b/tests/regression/tex/int24.h new file mode 100644 index 00000000..b08537a7 --- /dev/null +++ b/tests/regression/tex/int24.h @@ -0,0 +1,37 @@ +// +// Copyright (c) Blaise Tine. All rights reserved. +// +// +// Use of this sample source code is subject to the terms of the Microsoft +// license agreement under which you licensed this sample source code. If +// you did not accept the terms of the license agreement, you are not +// authorized to use this sample source code. 
For the terms of the license, +// please see the license agreement between you and Microsoft or, if applicable, +// see the LICENSE.RTF on your install media or the root of your tools +// installation. +// THE SAMPLE SOURCE CODE IS PROVIDED "AS IS", WITH NO WARRANTIES OR +// INDEMNITIES. +// +#pragma once + +#include + +struct uint24_t { + uint8_t m[3]; + + explicit uint24_t(uint32_t value) { + m[0] = (value >> 0) & 0xff; + m[1] = (value >> 8) & 0xff; + m[2] = (value >> 16) & 0xff; + } + + explicit uint24_t(uint8_t x, uint8_t y, uint8_t z) { + m[0] = x; + m[1] = y; + m[2] = z; + } + + operator uint32_t() const { + return (m[2] << 16) | (m[1] << 8) | m[0]; + } +}; diff --git a/tests/regression/tex/kernel.bin b/tests/regression/tex/kernel.bin new file mode 100755 index 00000000..cb5d62ec Binary files /dev/null and b/tests/regression/tex/kernel.bin differ diff --git a/tests/regression/tex/kernel.c b/tests/regression/tex/kernel.c new file mode 100644 index 00000000..6a04b801 --- /dev/null +++ b/tests/regression/tex/kernel.c @@ -0,0 +1,67 @@ +#include +#include +#include "common.h" +#include "texsw.h" + +#define ENABLE_SW + +struct tile_arg_t { + struct kernel_arg_t* state; + uint32_t tile_width; + uint32_t tile_height; + float deltaX; + float deltaY; +}; + +void kernel_body(int task_id, void* arg) { + struct tile_arg_t* _arg = (struct tile_arg_t*)(arg); + struct kernel_arg_t* state = _arg->state; + + uint32_t xoffset = 0; + uint32_t yoffset = task_id * _arg->tile_height; + uint8_t* dst_ptr = (uint8_t*)(state->dst_ptr + xoffset * state->dst_stride + yoffset * state->dst_pitch); + + float fv = yoffset * _arg->deltaY; + for (uint32_t y = 0; y < _arg->tile_height; ++y) { + uint32_t* dst_row = (uint32_t*)dst_ptr; + float fu = xoffset * _arg->deltaX; + for (uint32_t x = 0; x < _arg->tile_width; ++x) { + int32_t u = (int32_t)(fu * (1<<20)); + int32_t v = (int32_t)(fv * (1<<20)); + #ifdef ENABLE_SW + if (state->use_sw) { + dst_row[x] = (state->filter == 2) ? 
tex3_sw(state, 0, u, v, state->lod) : tex_sw(state, 0, u, v, state->lod); + } else { + #endif + dst_row[x] = (state->filter == 2) ? vx_tex3(0, u, v, state->lod) : vx_tex(0, u, v, state->lod); + #ifdef ENABLE_SW + } + #endif + fu += _arg->deltaX; + } + dst_ptr += state->dst_pitch; + fv += _arg->deltaY; + } +} + +int main() { + struct kernel_arg_t* arg = (struct kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR; + + // configure texture unit + vx_csr_write(CSR_TEX_ADDR(0), arg->src_ptr); + vx_csr_write(CSR_TEX_MIPOFF(0), 0); + vx_csr_write(CSR_TEX_WIDTH(0), arg->src_logWidth); + vx_csr_write(CSR_TEX_HEIGHT(0), arg->src_logHeight); + vx_csr_write(CSR_TEX_FORMAT(0), arg->format); + vx_csr_write(CSR_TEX_WRAP(0), (arg->wrap << 2) | arg->wrap); + vx_csr_write(CSR_TEX_FILTER(0), (arg->filter ? 1 : 0)); + + struct tile_arg_t targ; + targ.state = arg; + targ.tile_width = arg->dst_width; + targ.tile_height = (arg->dst_height + arg->num_tasks - 1) / arg->num_tasks; + targ.deltaX = 1.0f / arg->dst_width; + targ.deltaY = 1.0f / arg->dst_height; + + vx_spawn_tasks(arg->num_tasks, kernel_body, &targ); +} \ No newline at end of file diff --git a/tests/regression/tex/kernel.dump b/tests/regression/tex/kernel.dump new file mode 100644 index 00000000..ca725c4d --- /dev/null +++ b/tests/regression/tex/kernel.dump @@ -0,0 +1,1514 @@ + +kernel.elf: file format elf32-littleriscv + + +Disassembly of section .init: + +80000000 <_start>: +80000000: 00001597 auipc a1,0x1 +80000004: d8458593 addi a1,a1,-636 # 80000d84 +80000008: fc102573 csrr a0,0xfc1 +8000000c: 00b5106b 0xb5106b +80000010: 575000ef jal ra,80000d84 +80000014: 00100513 li a0,1 +80000018: 0005006b 0x5006b +8000001c: 00003517 auipc a0,0x3 +80000020: 9d850513 addi a0,a0,-1576 # 800029f4 +80000024: 00003617 auipc a2,0x3 +80000028: a5060613 addi a2,a2,-1456 # 80002a74 <__BSS_END__> +8000002c: 40a60633 sub a2,a2,a0 +80000030: 00000593 li a1,0 +80000034: 2e8010ef jal ra,8000131c +80000038: 00001517 auipc a0,0x1 +8000003c: e2450513 addi a0,a0,-476 
# 80000e5c <__libc_fini_array> +80000040: 294010ef jal ra,800012d4 +80000044: 57d000ef jal ra,80000dc0 <__libc_init_array> +80000048: 008000ef jal ra,80000050
+8000004c: 29c0106f j 800012e8 + +Disassembly of section .text: + +80000050
: +80000050: fd010113 addi sp,sp,-48 +80000054: 7ffff7b7 lui a5,0x7ffff +80000058: 02112623 sw ra,44(sp) +8000005c: 0107a703 lw a4,16(a5) # 7ffff010 <__stack_size+0x7fffec10> +80000060: fd071073 csrw 0xfd0,a4 +80000064: fd405073 csrwi 0xfd4,0 +80000068: 00c7c703 lbu a4,12(a5) +8000006c: fd571073 csrw 0xfd5,a4 +80000070: 00d7c703 lbu a4,13(a5) +80000074: fd671073 csrw 0xfd6,a4 +80000078: 0047c703 lbu a4,4(a5) +8000007c: fd171073 csrw 0xfd1,a4 +80000080: 0067c683 lbu a3,6(a5) +80000084: 00269713 slli a4,a3,0x2 +80000088: 00d76733 or a4,a4,a3 +8000008c: fd271073 csrw 0xfd2,a4 +80000090: 0057c703 lbu a4,5(a5) +80000094: 00e03733 snez a4,a4 +80000098: fd371073 csrw 0xfd3,a4 +8000009c: 0147a703 lw a4,20(a5) +800000a0: 800036b7 lui a3,0x80003 +800000a4: 9ec6a787 flw fa5,-1556(a3) # 800029ec <__stack_top+0x810029ec> +800000a8: 00e12823 sw a4,16(sp) +800000ac: 0007a503 lw a0,0(a5) +800000b0: 0187a683 lw a3,24(a5) +800000b4: d0177753 fcvt.s.wu fa4,a4 +800000b8: 800005b7 lui a1,0x80000 +800000bc: 00a68733 add a4,a3,a0 +800000c0: fff70713 addi a4,a4,-1 +800000c4: 02a75733 divu a4,a4,a0 +800000c8: d016f6d3 fcvt.s.wu fa3,a3 +800000cc: 18e7f753 fdiv.s fa4,fa5,fa4 +800000d0: 00c10613 addi a2,sp,12 +800000d4: 6c458593 addi a1,a1,1732 # 800006c4 <__stack_top+0x810006c4> +800000d8: 00f12623 sw a5,12(sp) +800000dc: 18d7f7d3 fdiv.s fa5,fa5,fa3 +800000e0: 00e12c27 fsw fa4,24(sp) +800000e4: 00e12a23 sw a4,20(sp) +800000e8: 00f12e27 fsw fa5,28(sp) +800000ec: 68d000ef jal ra,80000f78 +800000f0: 02c12083 lw ra,44(sp) +800000f4: 03010113 addi sp,sp,48 +800000f8: 00008067 ret + +800000fc : +800000fc: 00000793 li a5,0 +80000100: 00078863 beqz a5,80000110 +80000104: 80001537 lui a0,0x80001 +80000108: e5c50513 addi a0,a0,-420 # 80000e5c <__stack_top+0x81000e5c> +8000010c: 1c80106f j 800012d4 +80000110: 00008067 ret + +80000114 : +80000114: 00554783 lbu a5,5(a0) +80000118: 00c54303 lbu t1,12(a0) +8000011c: 00d54803 lbu a6,13(a0) +80000120: 00454683 lbu a3,4(a0) +80000124: 00654e83 lbu t4,6(a0) 
+80000128: 01052883 lw a7,16(a0) +8000012c: 2e078063 beqz a5,8000040c +80000130: 00080e37 lui t3,0x80 +80000134: 406e5733 sra a4,t3,t1 +80000138: 00100f13 li t5,1 +8000013c: 410e5e33 sra t3,t3,a6 +80000140: 40e587b3 sub a5,a1,a4 +80000144: 41c60533 sub a0,a2,t3 +80000148: 00b70733 add a4,a4,a1 +8000014c: 01c60633 add a2,a2,t3 +80000150: 3bee8e63 beq t4,t5,8000050c +80000154: fff7c593 not a1,a5 +80000158: 41f5d593 srai a1,a1,0x1f +8000015c: 00b7f7b3 and a5,a5,a1 +80000160: 001005b7 lui a1,0x100 +80000164: 42b7d463 bge a5,a1,8000058c +80000168: fff54593 not a1,a0 +8000016c: 41f5d593 srai a1,a1,0x1f +80000170: 00b57533 and a0,a0,a1 +80000174: 001005b7 lui a1,0x100 +80000178: 3cb55a63 bge a0,a1,8000054c +8000017c: fff74593 not a1,a4 +80000180: 41f5d593 srai a1,a1,0x1f +80000184: 00b77733 and a4,a4,a1 +80000188: 001005b7 lui a1,0x100 +8000018c: 3ab75263 bge a4,a1,80000530 +80000190: fff64593 not a1,a2 +80000194: 41f5d593 srai a1,a1,0x1f +80000198: 00b67633 and a2,a2,a1 +8000019c: 001005b7 lui a1,0x100 +800001a0: 38b65463 bge a2,a1,80000528 +800001a4: 01400593 li a1,20 +800001a8: 41058833 sub a6,a1,a6 +800001ac: 41055533 sra a0,a0,a6 +800001b0: 406585b3 sub a1,a1,t1 +800001b4: 41065633 sra a2,a2,a6 +800001b8: 00661633 sll a2,a2,t1 +800001bc: 40b7d833 sra a6,a5,a1 +800001c0: 40b75733 sra a4,a4,a1 +800001c4: 006515b3 sll a1,a0,t1 +800001c8: 00b80333 add t1,a6,a1 +800001cc: 00c807b3 add a5,a6,a2 +800001d0: 00b705b3 add a1,a4,a1 +800001d4: 00c70733 add a4,a4,a2 +800001d8: 00259593 slli a1,a1,0x2 +800001dc: 00231613 slli a2,t1,0x2 +800001e0: 00279793 slli a5,a5,0x2 +800001e4: 00271713 slli a4,a4,0x2 +800001e8: 00c88633 add a2,a7,a2 +800001ec: 00b885b3 add a1,a7,a1 +800001f0: 00f887b3 add a5,a7,a5 +800001f4: 00e88733 add a4,a7,a4 +800001f8: 00400893 li a7,4 +800001fc: 00062f03 lw t5,0(a2) +80000200: 0007ae83 lw t4,0(a5) +80000204: 0005a603 lw a2,0(a1) # 100000 <__stack_size+0xffc00> +80000208: 00072783 lw a5,0(a4) +8000020c: 0ff87813 andi a6,a6,255 +80000210: 0ff57593 andi 
a1,a0,255 +80000214: 3f168c63 beq a3,a7,8000060c +80000218: 14d8ee63 bltu a7,a3,80000374 +8000021c: 00300713 li a4,3 +80000220: 0ae69063 bne a3,a4,800002c0 +80000224: 008f1713 slli a4,t5,0x8 +80000228: 008e9513 slli a0,t4,0x8 +8000022c: 00861313 slli t1,a2,0x8 +80000230: 00879893 slli a7,a5,0x8 +80000234: 00ff0e37 lui t3,0xff0 +80000238: 0ffe0e13 addi t3,t3,255 # ff00ff <__stack_size+0xfefcff> +8000023c: 01e76f33 or t5,a4,t5 +80000240: 01d56533 or a0,a0,t4 +80000244: 00f8e7b3 or a5,a7,a5 +80000248: 00c36633 or a2,t1,a2 +8000024c: 01c57eb3 and t4,a0,t3 +80000250: 01cf7f33 and t5,t5,t3 +80000254: 01c7f533 and a0,a5,t3 +80000258: 01c67633 and a2,a2,t3 +8000025c: 41d50533 sub a0,a0,t4 +80000260: 41e607b3 sub a5,a2,t5 +80000264: 030787b3 mul a5,a5,a6 +80000268: 00000713 li a4,0 +8000026c: 03050833 mul a6,a0,a6 +80000270: 4087d793 srai a5,a5,0x8 +80000274: 01e787b3 add a5,a5,t5 +80000278: 01c7f7b3 and a5,a5,t3 +8000027c: 40885513 srai a0,a6,0x8 +80000280: 01d50533 add a0,a0,t4 +80000284: 01c57533 and a0,a0,t3 +80000288: 40f50533 sub a0,a0,a5 +8000028c: 02b50533 mul a0,a0,a1 +80000290: 40855513 srai a0,a0,0x8 +80000294: 00f50533 add a0,a0,a5 +80000298: 01c57533 and a0,a0,t3 +8000029c: 00400793 li a5,4 +800002a0: 24d7e663 bltu a5,a3,800004ec +800002a4: 00300793 li a5,3 +800002a8: 22f69463 bne a3,a5,800004d0 +800002ac: 40855793 srai a5,a0,0x8 +800002b0: 00a7e533 or a0,a5,a0 +800002b4: 01051513 slli a0,a0,0x10 +800002b8: 01055513 srli a0,a0,0x10 +800002bc: 00008067 ret +800002c0: 30069263 bnez a3,800005c4 +800002c4: 00ff0537 lui a0,0xff0 +800002c8: 0ff50313 addi t1,a0,255 # ff00ff <__stack_size+0xfefcff> +800002cc: 006f72b3 and t0,t5,t1 +800002d0: 00667e33 and t3,a2,t1 +800002d4: 4087d893 srai a7,a5,0x8 +800002d8: 405e0e33 sub t3,t3,t0 +800002dc: 006effb3 and t6,t4,t1 +800002e0: 0067f733 and a4,a5,t1 +800002e4: 408f5f13 srai t5,t5,0x8 +800002e8: 0068f7b3 and a5,a7,t1 +800002ec: 40865613 srai a2,a2,0x8 +800002f0: 030e08b3 mul a7,t3,a6 +800002f4: 408ede93 srai t4,t4,0x8 
+800002f8: 006f7f33 and t5,t5,t1 +800002fc: 006efeb3 and t4,t4,t1 +80000300: 00667633 and a2,a2,t1 +80000304: 41f70733 sub a4,a4,t6 +80000308: 41d787b3 sub a5,a5,t4 +8000030c: 41e60633 sub a2,a2,t5 +80000310: 03070733 mul a4,a4,a6 +80000314: 4088d893 srai a7,a7,0x8 +80000318: 00588533 add a0,a7,t0 +8000031c: 006578b3 and a7,a0,t1 +80000320: 03060633 mul a2,a2,a6 +80000324: 40875513 srai a0,a4,0x8 +80000328: 01f50533 add a0,a0,t6 +8000032c: 00657533 and a0,a0,t1 +80000330: 41150533 sub a0,a0,a7 +80000334: 03078833 mul a6,a5,a6 +80000338: 40865793 srai a5,a2,0x8 +8000033c: 01e787b3 add a5,a5,t5 +80000340: 0067f7b3 and a5,a5,t1 +80000344: 40885713 srai a4,a6,0x8 +80000348: 01d70733 add a4,a4,t4 +8000034c: 00677733 and a4,a4,t1 +80000350: 02b50533 mul a0,a0,a1 +80000354: 40f70733 sub a4,a4,a5 +80000358: 02b70733 mul a4,a4,a1 +8000035c: 40855513 srai a0,a0,0x8 +80000360: 01150533 add a0,a0,a7 +80000364: 00657533 and a0,a0,t1 +80000368: 40875713 srai a4,a4,0x8 +8000036c: 00f70733 add a4,a4,a5 +80000370: f2dff06f j 8000029c +80000374: 00500713 li a4,5 +80000378: f4e696e3 bne a3,a4,800002c4 +8000037c: 00cf1693 slli a3,t5,0xc +80000380: 00c61313 slli t1,a2,0xc +80000384: 00ce9513 slli a0,t4,0xc +80000388: 00c79893 slli a7,a5,0xc +8000038c: 0f0f1737 lui a4,0xf0f1 +80000390: f0f70713 addi a4,a4,-241 # f0f0f0f <__stack_size+0xf0f0b0f> +80000394: 00f8e7b3 or a5,a7,a5 +80000398: 01e6ef33 or t5,a3,t5 +8000039c: 00c36633 or a2,t1,a2 +800003a0: 01d56eb3 or t4,a0,t4 +800003a4: 00ef7f33 and t5,t5,a4 +800003a8: 00eefeb3 and t4,t4,a4 +800003ac: 00e67633 and a2,a2,a4 +800003b0: 00e7f733 and a4,a5,a4 +800003b4: 41d70733 sub a4,a4,t4 +800003b8: 41e607b3 sub a5,a2,t5 +800003bc: 030787b3 mul a5,a5,a6 +800003c0: 00ff0537 lui a0,0xff0 +800003c4: 0ff50513 addi a0,a0,255 # ff00ff <__stack_size+0xfefcff> +800003c8: 03070833 mul a6,a4,a6 +800003cc: 4087d793 srai a5,a5,0x8 +800003d0: 01e787b3 add a5,a5,t5 +800003d4: 00a7f7b3 and a5,a5,a0 +800003d8: 40885713 srai a4,a6,0x8 +800003dc: 01d70733 add 
a4,a4,t4 +800003e0: 00a77733 and a4,a4,a0 +800003e4: 40f70733 sub a4,a4,a5 +800003e8: 02b70733 mul a4,a4,a1 +800003ec: 40875713 srai a4,a4,0x8 +800003f0: 00f70733 add a4,a4,a5 +800003f4: 00a77533 and a0,a4,a0 +800003f8: 40c55793 srai a5,a0,0xc +800003fc: 00a7e533 or a0,a5,a0 +80000400: 01051513 slli a0,a0,0x10 +80000404: 01055513 srli a0,a0,0x10 +80000408: 00008067 ret +8000040c: 00100793 li a5,1 +80000410: 18fe8c63 beq t4,a5,800005a8 +80000414: fff5c713 not a4,a1 +80000418: 41f75713 srai a4,a4,0x1f +8000041c: 00e5f733 and a4,a1,a4 +80000420: 001007b7 lui a5,0x100 +80000424: 14f75663 bge a4,a5,80000570 +80000428: fff64793 not a5,a2 +8000042c: 41f7d793 srai a5,a5,0x1f +80000430: 00f67633 and a2,a2,a5 +80000434: 001007b7 lui a5,0x100 +80000438: 12f65863 bge a2,a5,80000568 +8000043c: 01400593 li a1,20 +80000440: 41058833 sub a6,a1,a6 +80000444: 41065633 sra a2,a2,a6 +80000448: 406585b3 sub a1,a1,t1 +8000044c: 40b75733 sra a4,a4,a1 +80000450: 00661633 sll a2,a2,t1 +80000454: 00e60633 add a2,a2,a4 +80000458: 00261613 slli a2,a2,0x2 +8000045c: 00c88733 add a4,a7,a2 +80000460: 00400793 li a5,4 +80000464: 00072503 lw a0,0(a4) +80000468: 22f68a63 beq a3,a5,8000069c +8000046c: 02d7ea63 bltu a5,a3,800004a0 +80000470: 00300793 li a5,3 +80000474: 08f69a63 bne a3,a5,80000508 +80000478: 00851793 slli a5,a0,0x8 +8000047c: 00a7e533 or a0,a5,a0 +80000480: 00ff07b7 lui a5,0xff0 +80000484: 0ff78793 addi a5,a5,255 # ff00ff <__stack_size+0xfefcff> +80000488: 00f577b3 and a5,a0,a5 +8000048c: 4087d513 srai a0,a5,0x8 +80000490: 00f56533 or a0,a0,a5 +80000494: 01051513 slli a0,a0,0x10 +80000498: 01055513 srli a0,a0,0x10 +8000049c: 00008067 ret +800004a0: 00500793 li a5,5 +800004a4: 10f69c63 bne a3,a5,800005bc +800004a8: 00c51793 slli a5,a0,0xc +800004ac: 00a7e533 or a0,a5,a0 +800004b0: 0f0f17b7 lui a5,0xf0f1 +800004b4: f0f78793 addi a5,a5,-241 # f0f0f0f <__stack_size+0xf0f0b0f> +800004b8: 00f577b3 and a5,a0,a5 +800004bc: 40c7d513 srai a0,a5,0xc +800004c0: 00f56533 or a0,a0,a5 +800004c4: 
01051513 slli a0,a0,0x10 +800004c8: 01055513 srli a0,a0,0x10 +800004cc: 00008067 ret +800004d0: 0e069863 bnez a3,800005c0 +800004d4: ff0107b7 lui a5,0xff010 +800004d8: f0078793 addi a5,a5,-256 # ff00ff00 <__stack_top+0xff00> +800004dc: 00871713 slli a4,a4,0x8 +800004e0: 00f77733 and a4,a4,a5 +800004e4: 00e56533 or a0,a0,a4 +800004e8: 00008067 ret +800004ec: 00500793 li a5,5 +800004f0: fef692e3 bne a3,a5,800004d4 +800004f4: 40c55793 srai a5,a0,0xc +800004f8: 00a7e533 or a0,a5,a0 +800004fc: 01051513 slli a0,a0,0x10 +80000500: 01055513 srli a0,a0,0x10 +80000504: 00008067 ret +80000508: 00008067 ret +8000050c: 001005b7 lui a1,0x100 +80000510: fff58593 addi a1,a1,-1 # fffff <__stack_size+0xffbff> +80000514: 00b7f7b3 and a5,a5,a1 +80000518: 00b57533 and a0,a0,a1 +8000051c: 00b77733 and a4,a4,a1 +80000520: 00b67633 and a2,a2,a1 +80000524: c81ff06f j 800001a4 +80000528: fff58613 addi a2,a1,-1 +8000052c: c79ff06f j 800001a4 +80000530: fff58713 addi a4,a1,-1 +80000534: fff64593 not a1,a2 +80000538: 41f5d593 srai a1,a1,0x1f +8000053c: 00b67633 and a2,a2,a1 +80000540: 001005b7 lui a1,0x100 +80000544: c6b640e3 blt a2,a1,800001a4 +80000548: fe1ff06f j 80000528 +8000054c: fff58513 addi a0,a1,-1 # fffff <__stack_size+0xffbff> +80000550: fff74593 not a1,a4 +80000554: 41f5d593 srai a1,a1,0x1f +80000558: 00b77733 and a4,a4,a1 +8000055c: 001005b7 lui a1,0x100 +80000560: c2b748e3 blt a4,a1,80000190 +80000564: fcdff06f j 80000530 +80000568: fff78613 addi a2,a5,-1 +8000056c: ed1ff06f j 8000043c +80000570: fff78713 addi a4,a5,-1 +80000574: fff64793 not a5,a2 +80000578: 41f7d793 srai a5,a5,0x1f +8000057c: 00f67633 and a2,a2,a5 +80000580: 001007b7 lui a5,0x100 +80000584: eaf64ce3 blt a2,a5,8000043c +80000588: fe1ff06f j 80000568 +8000058c: fff58793 addi a5,a1,-1 # fffff <__stack_size+0xffbff> +80000590: fff54593 not a1,a0 +80000594: 41f5d593 srai a1,a1,0x1f +80000598: 00b57533 and a0,a0,a1 +8000059c: 001005b7 lui a1,0x100 +800005a0: bcb54ee3 blt a0,a1,8000017c +800005a4: fa9ff06f j 8000054c 
+800005a8: 001007b7 lui a5,0x100 +800005ac: fff78793 addi a5,a5,-1 # fffff <__stack_size+0xffbff> +800005b0: 00f5f733 and a4,a1,a5 +800005b4: 00f67633 and a2,a2,a5 +800005b8: e85ff06f j 8000043c +800005bc: 00008067 ret +800005c0: 00008067 ret +800005c4: 41e606b3 sub a3,a2,t5 +800005c8: 41d787b3 sub a5,a5,t4 +800005cc: 030686b3 mul a3,a3,a6 +800005d0: 00ff0537 lui a0,0xff0 +800005d4: 0ff50713 addi a4,a0,255 # ff00ff <__stack_size+0xfefcff> +800005d8: 030787b3 mul a5,a5,a6 +800005dc: 4086d693 srai a3,a3,0x8 +800005e0: 01e686b3 add a3,a3,t5 +800005e4: 00e6f6b3 and a3,a3,a4 +800005e8: 4087d513 srai a0,a5,0x8 +800005ec: 01d50533 add a0,a0,t4 +800005f0: 00e57533 and a0,a0,a4 +800005f4: 40d50533 sub a0,a0,a3 +800005f8: 02b50533 mul a0,a0,a1 +800005fc: 40855513 srai a0,a0,0x8 +80000600: 00d50533 add a0,a0,a3 +80000604: 00e57533 and a0,a0,a4 +80000608: 00008067 ret +8000060c: 010f1713 slli a4,t5,0x10 +80000610: 01061893 slli a7,a2,0x10 +80000614: 010e9513 slli a0,t4,0x10 +80000618: 01079693 slli a3,a5,0x10 +8000061c: 07e10337 lui t1,0x7e10 +80000620: 81f30313 addi t1,t1,-2017 # 7e0f81f <__stack_size+0x7e0f41f> +80000624: 00f6e7b3 or a5,a3,a5 +80000628: 01e76f33 or t5,a4,t5 +8000062c: 00c8e633 or a2,a7,a2 +80000630: 01d56533 or a0,a0,t4 +80000634: 00657eb3 and t4,a0,t1 +80000638: 006f7f33 and t5,t5,t1 +8000063c: 0067f533 and a0,a5,t1 +80000640: 00667633 and a2,a2,t1 +80000644: 41d50533 sub a0,a0,t4 +80000648: 41e607b3 sub a5,a2,t5 +8000064c: 030787b3 mul a5,a5,a6 +80000650: 00ff0737 lui a4,0xff0 +80000654: 0ff70713 addi a4,a4,255 # ff00ff <__stack_size+0xfefcff> +80000658: 03050833 mul a6,a0,a6 +8000065c: 4087d793 srai a5,a5,0x8 +80000660: 01e787b3 add a5,a5,t5 +80000664: 00e7f7b3 and a5,a5,a4 +80000668: 40885513 srai a0,a6,0x8 +8000066c: 01d50533 add a0,a0,t4 +80000670: 00e57533 and a0,a0,a4 +80000674: 40f50533 sub a0,a0,a5 +80000678: 02b50533 mul a0,a0,a1 +8000067c: 40855513 srai a0,a0,0x8 +80000680: 00f50533 add a0,a0,a5 +80000684: 00e577b3 and a5,a0,a4 +80000688: 
4107d513 srai a0,a5,0x10 +8000068c: 00f56533 or a0,a0,a5 +80000690: 01051513 slli a0,a0,0x10 +80000694: 01055513 srli a0,a0,0x10 +80000698: 00008067 ret +8000069c: 01051793 slli a5,a0,0x10 +800006a0: 00a7e533 or a0,a5,a0 +800006a4: 07e107b7 lui a5,0x7e10 +800006a8: 81f78793 addi a5,a5,-2017 # 7e0f81f <__stack_size+0x7e0f41f> +800006ac: 00f577b3 and a5,a0,a5 +800006b0: 4107d513 srai a0,a5,0x10 +800006b4: 00f56533 or a0,a0,a5 +800006b8: 01051513 slli a0,a0,0x10 +800006bc: 01055513 srli a0,a0,0x10 +800006c0: 00008067 ret + +800006c4 : +800006c4: 0085a703 lw a4,8(a1) # 100008 <__stack_size+0xffc08> +800006c8: f7010113 addi sp,sp,-144 +800006cc: 08812423 sw s0,136(sp) +800006d0: 02e507b3 mul a5,a0,a4 +800006d4: 0005a403 lw s0,0(a1) +800006d8: 05312027 fsw fs3,64(sp) +800006dc: 03512c27 fsw fs5,56(sp) +800006e0: 02042e03 lw t3,32(s0) +800006e4: 02442683 lw a3,36(s0) +800006e8: 0105aa87 flw fs5,16(a1) +800006ec: 08112623 sw ra,140(sp) +800006f0: 08912223 sw s1,132(sp) +800006f4: 09212023 sw s2,128(sp) +800006f8: 03c78e33 mul t3,a5,t3 +800006fc: d017f9d3 fcvt.s.wu fs3,a5 +80000700: 07312e23 sw s3,124(sp) +80000704: 07412c23 sw s4,120(sp) +80000708: 07512a23 sw s5,116(sp) +8000070c: 07612823 sw s6,112(sp) +80000710: 07712623 sw s7,108(sp) +80000714: 07812423 sw s8,104(sp) +80000718: 07912223 sw s9,100(sp) +8000071c: 07a12023 sw s10,96(sp) +80000720: 00de07b3 add a5,t3,a3 +80000724: 05b12e23 sw s11,92(sp) +80000728: 04812627 fsw fs0,76(sp) +8000072c: 04912427 fsw fs1,72(sp) +80000730: 05212227 fsw fs2,68(sp) +80000734: 03412e27 fsw fs4,60(sp) +80000738: 00f12623 sw a5,12(sp) +8000073c: 1159f9d3 fmul.s fs3,fs3,fs5 +80000740: 28070e63 beqz a4,800009dc +80000744: 80003737 lui a4,0x80003 +80000748: 9e872487 flw fs1,-1560(a4) # 800029e8 <__stack_top+0x810029e8> +8000074c: fff00737 lui a4,0xfff00 +80000750: 00b70713 addi a4,a4,11 # fff0000b <__stack_top+0xf0000b> +80000754: 00c5a907 flw fs2,12(a1) +80000758: f0000a53 fmv.w.x fs4,zero +8000075c: 00e12c23 sw a4,24(sp) +80000760: 
07e10737 lui a4,0x7e10 +80000764: 81f70713 addi a4,a4,-2017 # 7e0f81f <__stack_size+0x7e0f41f> +80000768: 02e12223 sw a4,36(sp) +8000076c: 11497a53 fmul.s fs4,fs2,fs4 +80000770: 00010737 lui a4,0x10 +80000774: fff70713 addi a4,a4,-1 # ffff <__stack_size+0xfbff> +80000778: 00e12a23 sw a4,20(sp) +8000077c: 0045a783 lw a5,4(a1) +80000780: 0f0f1737 lui a4,0xf0f1 +80000784: f0f70713 addi a4,a4,-241 # f0f0f0f <__stack_size+0xf0f0b0f> +80000788: 00ff0bb7 lui s7,0xff0 +8000078c: 00058a13 mv s4,a1 +80000790: 00100ab7 lui s5,0x100 +80000794: 00012823 sw zero,16(sp) +80000798: 00200c13 li s8,2 +8000079c: 02e12423 sw a4,40(sp) +800007a0: 0ffb8b93 addi s7,s7,255 # ff00ff <__stack_size+0xfefcff> +800007a4: 22078c63 beqz a5,800009dc +800007a8: 1099f7d3 fmul.s fa5,fs3,fs1 +800007ac: 001007b7 lui a5,0x100 +800007b0: fff78793 addi a5,a5,-1 # fffff <__stack_size+0xffbff> +800007b4: 00744b03 lbu s6,7(s0) +800007b8: c00799d3 fcvt.w.s s3,fa5,rtz +800007bc: fff9cd13 not s10,s3 +800007c0: 41fd5d13 srai s10,s10,0x1f +800007c4: 00f9f7b3 and a5,s3,a5 +800007c8: 01a9fd33 and s10,s3,s10 +800007cc: 00f12e23 sw a5,28(sp) +800007d0: 015d4463 blt s10,s5,800007d8 +800007d4: fffa8d13 addi s10,s5,-1 # fffff <__stack_size+0xffbff> +800007d8: ff0107b7 lui a5,0xff010 +800007dc: 214a0453 fmv.s fs0,fs4 +800007e0: 00c12903 lw s2,12(sp) +800007e4: f0078793 addi a5,a5,-256 # ff00ff00 <__stack_top+0xff00> +800007e8: 00000493 li s1,0 +800007ec: 00100313 li t1,1 +800007f0: 01400d93 li s11,20 +800007f4: 00400c93 li s9,4 +800007f8: 00500893 li a7,5 +800007fc: 00300813 li a6,3 +80000800: 02f12023 sw a5,32(sp) +80000804: 109477d3 fmul.s fa5,fs0,fs1 +80000808: c00795d3 fcvt.w.s a1,fa5,rtz +8000080c: 220b0263 beqz s6,80000a30 +80000810: 00544683 lbu a3,5(s0) +80000814: 25868a63 beq a3,s8,80000a68 +80000818: 00c44603 lbu a2,12(s0) +8000081c: 00d44783 lbu a5,13(s0) +80000820: 00444703 lbu a4,4(s0) +80000824: 00644f83 lbu t6,6(s0) +80000828: 01042503 lw a0,16(s0) +8000082c: 24068c63 beqz a3,80000a84 +80000830: 000806b7 
lui a3,0x80 +80000834: 40c6d3b3 sra t2,a3,a2 +80000838: 40f6d6b3 sra a3,a3,a5 +8000083c: 407582b3 sub t0,a1,t2 +80000840: 007585b3 add a1,a1,t2 +80000844: 40d983b3 sub t2,s3,a3 +80000848: 00d986b3 add a3,s3,a3 +8000084c: 406f8463 beq t6,t1,80000c54 +80000850: fff2cf93 not t6,t0 +80000854: 41ffdf93 srai t6,t6,0x1f +80000858: 01f2f2b3 and t0,t0,t6 +8000085c: 0152c463 blt t0,s5,80000864 +80000860: fffa8293 addi t0,s5,-1 +80000864: fff3cf93 not t6,t2 +80000868: 41ffdf93 srai t6,t6,0x1f +8000086c: 01f3f3b3 and t2,t2,t6 +80000870: 0153c463 blt t2,s5,80000878 +80000874: fffa8393 addi t2,s5,-1 +80000878: fff5cf93 not t6,a1 +8000087c: 41ffdf93 srai t6,t6,0x1f +80000880: 01f5f5b3 and a1,a1,t6 +80000884: 0155c463 blt a1,s5,8000088c +80000888: fffa8593 addi a1,s5,-1 +8000088c: fff6cf93 not t6,a3 +80000890: 41ffdf93 srai t6,t6,0x1f +80000894: 01f6f6b3 and a3,a3,t6 +80000898: 0156c463 blt a3,s5,800008a0 +8000089c: fffa8693 addi a3,s5,-1 +800008a0: 40fd87b3 sub a5,s11,a5 +800008a4: 40cd8fb3 sub t6,s11,a2 +800008a8: 40f6d6b3 sra a3,a3,a5 +800008ac: 40f3d3b3 sra t2,t2,a5 +800008b0: 00c397b3 sll a5,t2,a2 +800008b4: 41f2d2b3 sra t0,t0,t6 +800008b8: 41f5d5b3 sra a1,a1,t6 +800008bc: 00c69633 sll a2,a3,a2 +800008c0: 00f28fb3 add t6,t0,a5 +800008c4: 00c286b3 add a3,t0,a2 +800008c8: 00f587b3 add a5,a1,a5 +800008cc: 00c585b3 add a1,a1,a2 +800008d0: 002f9f93 slli t6,t6,0x2 +800008d4: 00279793 slli a5,a5,0x2 +800008d8: 00269693 slli a3,a3,0x2 +800008dc: 00259593 slli a1,a1,0x2 +800008e0: 01f50fb3 add t6,a0,t6 +800008e4: 00f507b3 add a5,a0,a5 +800008e8: 00d506b3 add a3,a0,a3 +800008ec: 00b505b3 add a1,a0,a1 +800008f0: 000faf83 lw t6,0(t6) +800008f4: 0007a783 lw a5,0(a5) +800008f8: 0006a683 lw a3,0(a3) # 80000 <__stack_size+0x7fc00> +800008fc: 0005a603 lw a2,0(a1) +80000900: 0ff2f293 andi t0,t0,255 +80000904: 0ff3f393 andi t2,t2,255 +80000908: 3f970063 beq a4,s9,80000ce8 +8000090c: 28ece863 bltu s9,a4,80000b9c +80000910: 1d071863 bne a4,a6,80000ae0 +80000914: 008f9593 slli a1,t6,0x8 +80000918: 
00879513 slli a0,a5,0x8 +8000091c: 00869e93 slli t4,a3,0x8 +80000920: 00861e13 slli t3,a2,0x8 +80000924: 01f5e5b3 or a1,a1,t6 +80000928: 00f567b3 or a5,a0,a5 +8000092c: 00dee6b3 or a3,t4,a3 +80000930: 00ce6633 or a2,t3,a2 +80000934: 0175f5b3 and a1,a1,s7 +80000938: 0176f6b3 and a3,a3,s7 +8000093c: 0177f7b3 and a5,a5,s7 +80000940: 01767633 and a2,a2,s7 +80000944: 40b787b3 sub a5,a5,a1 +80000948: 40d60633 sub a2,a2,a3 +8000094c: 025787b3 mul a5,a5,t0 +80000950: 00000f93 li t6,0 +80000954: 025602b3 mul t0,a2,t0 +80000958: 4087d793 srai a5,a5,0x8 +8000095c: 00b785b3 add a1,a5,a1 +80000960: 0175f5b3 and a1,a1,s7 +80000964: 4082d293 srai t0,t0,0x8 +80000968: 00d287b3 add a5,t0,a3 +8000096c: 0177f7b3 and a5,a5,s7 +80000970: 40b787b3 sub a5,a5,a1 +80000974: 027783b3 mul t2,a5,t2 +80000978: 4083d393 srai t2,t2,0x8 +8000097c: 00b385b3 add a1,t2,a1 +80000980: 0175f533 and a0,a1,s7 +80000984: 20ece863 bltu s9,a4,80000b94 +80000988: 2b071a63 bne a4,a6,80000c3c +8000098c: 40855793 srai a5,a0,0x8 +80000990: 00a7e533 or a0,a5,a0 +80000994: 01412783 lw a5,20(sp) +80000998: 00f57533 and a0,a0,a5 +8000099c: 00a92023 sw a0,0(s2) +800009a0: 004a2783 lw a5,4(s4) +800009a4: 00148493 addi s1,s1,1 +800009a8: 01247453 fadd.s fs0,fs0,fs2 +800009ac: 00490913 addi s2,s2,4 +800009b0: e4f4eae3 bltu s1,a5,80000804 +800009b4: 01012603 lw a2,16(sp) +800009b8: 00c12583 lw a1,12(sp) +800009bc: 02042683 lw a3,32(s0) +800009c0: 008a2703 lw a4,8(s4) +800009c4: 00160613 addi a2,a2,1 +800009c8: 00d586b3 add a3,a1,a3 +800009cc: 00c12823 sw a2,16(sp) +800009d0: 00d12623 sw a3,12(sp) +800009d4: 0159f9d3 fadd.s fs3,fs3,fs5 +800009d8: dce666e3 bltu a2,a4,800007a4 +800009dc: 08c12083 lw ra,140(sp) +800009e0: 08812403 lw s0,136(sp) +800009e4: 08412483 lw s1,132(sp) +800009e8: 08012903 lw s2,128(sp) +800009ec: 07c12983 lw s3,124(sp) +800009f0: 07812a03 lw s4,120(sp) +800009f4: 07412a83 lw s5,116(sp) +800009f8: 07012b03 lw s6,112(sp) +800009fc: 06c12b83 lw s7,108(sp) +80000a00: 06812c03 lw s8,104(sp) +80000a04: 
06412c83 lw s9,100(sp) +80000a08: 06012d03 lw s10,96(sp) +80000a0c: 05c12d83 lw s11,92(sp) +80000a10: 04c12407 flw fs0,76(sp) +80000a14: 04812487 flw fs1,72(sp) +80000a18: 04412907 flw fs2,68(sp) +80000a1c: 04012987 flw fs3,64(sp) +80000a20: 03c12a07 flw fs4,60(sp) +80000a24: 03812a87 flw fs5,56(sp) +80000a28: 09010113 addi sp,sp,144 +80000a2c: 00008067 ret +80000a30: 00544703 lbu a4,5(s0) +80000a34: 00842783 lw a5,8(s0) +80000a38: 01870863 beq a4,s8,80000a48 +80000a3c: 7935d76b 0x7935d76b +80000a40: 00e92023 sw a4,0(s2) +80000a44: f5dff06f j 800009a0 +80000a48: 7935d76b 0x7935d76b +80000a4c: 01812683 lw a3,24(sp) +80000a50: 00f6d463 bge a3,a5,80000a58 +80000a54: 00068793 mv a5,a3 +80000a58: 015787b3 add a5,a5,s5 +80000a5c: 7935d7eb 0x7935d7eb +80000a60: 00e92023 sw a4,0(s2) +80000a64: f3dff06f j 800009a0 +80000a68: 00098613 mv a2,s3 +80000a6c: 00040513 mv a0,s0 +80000a70: ea4ff0ef jal ra,80000114 +80000a74: 00100313 li t1,1 +80000a78: 00500893 li a7,5 +80000a7c: 00300813 li a6,3 +80000a80: f1dff06f j 8000099c +80000a84: 1e6f8663 beq t6,t1,80000c70 +80000a88: fff5c693 not a3,a1 +80000a8c: 41f6d693 srai a3,a3,0x1f +80000a90: 00d5f5b3 and a1,a1,a3 +80000a94: 0155c463 blt a1,s5,80000a9c +80000a98: fffa8593 addi a1,s5,-1 +80000a9c: 000d0693 mv a3,s10 +80000aa0: 40fd87b3 sub a5,s11,a5 +80000aa4: 40f6d7b3 sra a5,a3,a5 +80000aa8: 40cd86b3 sub a3,s11,a2 +80000aac: 00c797b3 sll a5,a5,a2 +80000ab0: 40d5d5b3 sra a1,a1,a3 +80000ab4: 00b787b3 add a5,a5,a1 +80000ab8: 00279793 slli a5,a5,0x2 +80000abc: 00f507b3 add a5,a0,a5 +80000ac0: 0007a503 lw a0,0(a5) +80000ac4: 21970063 beq a4,s9,80000cc4 +80000ac8: 14ecee63 bltu s9,a4,80000c24 +80000acc: ed0718e3 bne a4,a6,8000099c +80000ad0: 00851793 slli a5,a0,0x8 +80000ad4: 00a7e533 or a0,a5,a0 +80000ad8: 01757533 and a0,a0,s7 +80000adc: eb1ff06f j 8000098c +80000ae0: 1a071263 bnez a4,80000c84 +80000ae4: 017ff5b3 and a1,t6,s7 +80000ae8: 0177fe33 and t3,a5,s7 +80000aec: 40be0e33 sub t3,t3,a1 +80000af0: 025e0e33 mul t3,t3,t0 +80000af4: 
408fdf93 srai t6,t6,0x8 +80000af8: 4087d793 srai a5,a5,0x8 +80000afc: 0176f533 and a0,a3,s7 +80000b00: 40865f13 srai t5,a2,0x8 +80000b04: 017fffb3 and t6,t6,s7 +80000b08: 0177f7b3 and a5,a5,s7 +80000b0c: 01767633 and a2,a2,s7 +80000b10: 40a60633 sub a2,a2,a0 +80000b14: 41f78eb3 sub t4,a5,t6 +80000b18: 03c12623 sw t3,44(sp) +80000b1c: 02c12783 lw a5,44(sp) +80000b20: 02560e33 mul t3,a2,t0 +80000b24: 4086d693 srai a3,a3,0x8 +80000b28: 0176f6b3 and a3,a3,s7 +80000b2c: 017f7f33 and t5,t5,s7 +80000b30: 40df0f33 sub t5,t5,a3 +80000b34: 4087d793 srai a5,a5,0x8 +80000b38: 00b787b3 add a5,a5,a1 +80000b3c: 0177f633 and a2,a5,s7 +80000b40: 025e85b3 mul a1,t4,t0 +80000b44: 408e5793 srai a5,t3,0x8 +80000b48: 00a787b3 add a5,a5,a0 +80000b4c: 0177f7b3 and a5,a5,s7 +80000b50: 40c78533 sub a0,a5,a2 +80000b54: 025f02b3 mul t0,t5,t0 +80000b58: 4085d793 srai a5,a1,0x8 +80000b5c: 01f78fb3 add t6,a5,t6 +80000b60: 017fffb3 and t6,t6,s7 +80000b64: 4082d293 srai t0,t0,0x8 +80000b68: 00d286b3 add a3,t0,a3 +80000b6c: 0176f6b3 and a3,a3,s7 +80000b70: 02750533 mul a0,a0,t2 +80000b74: 41f687b3 sub a5,a3,t6 +80000b78: 027783b3 mul t2,a5,t2 +80000b7c: 40855513 srai a0,a0,0x8 +80000b80: 00c50533 add a0,a0,a2 +80000b84: 01757533 and a0,a0,s7 +80000b88: 4083d393 srai t2,t2,0x8 +80000b8c: 01f38fb3 add t6,t2,t6 +80000b90: deecfce3 bgeu s9,a4,80000988 +80000b94: 07170e63 beq a4,a7,80000c10 +80000b98: 0a80006f j 80000c40 +80000b9c: f51714e3 bne a4,a7,80000ae4 +80000ba0: 00c79593 slli a1,a5,0xc +80000ba4: 00f5e7b3 or a5,a1,a5 +80000ba8: 02812583 lw a1,40(sp) +80000bac: 00c69513 slli a0,a3,0xc +80000bb0: 00cf9713 slli a4,t6,0xc +80000bb4: 00c61e13 slli t3,a2,0xc +80000bb8: 00d566b3 or a3,a0,a3 +80000bbc: 01f76733 or a4,a4,t6 +80000bc0: 00ce6633 or a2,t3,a2 +80000bc4: 00b77733 and a4,a4,a1 +80000bc8: 00b6f6b3 and a3,a3,a1 +80000bcc: 00b7f7b3 and a5,a5,a1 +80000bd0: 00b67633 and a2,a2,a1 +80000bd4: 40e787b3 sub a5,a5,a4 +80000bd8: 40d60633 sub a2,a2,a3 +80000bdc: 025787b3 mul a5,a5,t0 +80000be0: 025602b3 
mul t0,a2,t0 +80000be4: 4087d793 srai a5,a5,0x8 +80000be8: 00e787b3 add a5,a5,a4 +80000bec: 0177f7b3 and a5,a5,s7 +80000bf0: 4082d293 srai t0,t0,0x8 +80000bf4: 00d28733 add a4,t0,a3 +80000bf8: 01777733 and a4,a4,s7 +80000bfc: 40f70733 sub a4,a4,a5 +80000c00: 027703b3 mul t2,a4,t2 +80000c04: 4083d393 srai t2,t2,0x8 +80000c08: 00f387b3 add a5,t2,a5 +80000c0c: 0177f533 and a0,a5,s7 +80000c10: 40c55793 srai a5,a0,0xc +80000c14: 00a7e533 or a0,a5,a0 +80000c18: 01412783 lw a5,20(sp) +80000c1c: 00f57533 and a0,a0,a5 +80000c20: d7dff06f j 8000099c +80000c24: d7171ce3 bne a4,a7,8000099c +80000c28: 00c51793 slli a5,a0,0xc +80000c2c: 00a7e533 or a0,a5,a0 +80000c30: 02812783 lw a5,40(sp) +80000c34: 00f57533 and a0,a0,a5 +80000c38: fd9ff06f j 80000c10 +80000c3c: d60710e3 bnez a4,8000099c +80000c40: 02012783 lw a5,32(sp) +80000c44: 008f9f93 slli t6,t6,0x8 +80000c48: 00ffffb3 and t6,t6,a5 +80000c4c: 01f56533 or a0,a0,t6 +80000c50: d4dff06f j 8000099c +80000c54: 00100e37 lui t3,0x100 +80000c58: fffe0e93 addi t4,t3,-1 # fffff <__stack_size+0xffbff> +80000c5c: 01d2f2b3 and t0,t0,t4 +80000c60: 01d3f3b3 and t2,t2,t4 +80000c64: 01d5f5b3 and a1,a1,t4 +80000c68: 01d6f6b3 and a3,a3,t4 +80000c6c: c35ff06f j 800008a0 +80000c70: 001006b7 lui a3,0x100 +80000c74: fff68693 addi a3,a3,-1 # fffff <__stack_size+0xffbff> +80000c78: 00d5f5b3 and a1,a1,a3 +80000c7c: 01c12683 lw a3,28(sp) +80000c80: e21ff06f j 80000aa0 +80000c84: 41f787b3 sub a5,a5,t6 +80000c88: 40d60633 sub a2,a2,a3 +80000c8c: 025787b3 mul a5,a5,t0 +80000c90: 025602b3 mul t0,a2,t0 +80000c94: 4087d793 srai a5,a5,0x8 +80000c98: 01f787b3 add a5,a5,t6 +80000c9c: 0177f7b3 and a5,a5,s7 +80000ca0: 4082d293 srai t0,t0,0x8 +80000ca4: 00d28733 add a4,t0,a3 +80000ca8: 01777733 and a4,a4,s7 +80000cac: 40f70733 sub a4,a4,a5 +80000cb0: 027703b3 mul t2,a4,t2 +80000cb4: 4083d393 srai t2,t2,0x8 +80000cb8: 00f387b3 add a5,t2,a5 +80000cbc: 0177f533 and a0,a5,s7 +80000cc0: cddff06f j 8000099c +80000cc4: 01051793 slli a5,a0,0x10 +80000cc8: 00a7e533 or 
a0,a5,a0 +80000ccc: 02412783 lw a5,36(sp) +80000cd0: 00f57533 and a0,a0,a5 +80000cd4: 41055793 srai a5,a0,0x10 +80000cd8: 00a7e533 or a0,a5,a0 +80000cdc: 01412783 lw a5,20(sp) +80000ce0: 00f57533 and a0,a0,a5 +80000ce4: cb9ff06f j 8000099c +80000ce8: 01079593 slli a1,a5,0x10 +80000cec: 00f5e7b3 or a5,a1,a5 +80000cf0: 02412583 lw a1,36(sp) +80000cf4: 01069513 slli a0,a3,0x10 +80000cf8: 010f9713 slli a4,t6,0x10 +80000cfc: 01061e13 slli t3,a2,0x10 +80000d00: 00d566b3 or a3,a0,a3 +80000d04: 01f76733 or a4,a4,t6 +80000d08: 00ce6633 or a2,t3,a2 +80000d0c: 00b77733 and a4,a4,a1 +80000d10: 00b6f6b3 and a3,a3,a1 +80000d14: 00b7f7b3 and a5,a5,a1 +80000d18: 00b67633 and a2,a2,a1 +80000d1c: 40e787b3 sub a5,a5,a4 +80000d20: 40d60633 sub a2,a2,a3 +80000d24: 025787b3 mul a5,a5,t0 +80000d28: 025602b3 mul t0,a2,t0 +80000d2c: 4087d793 srai a5,a5,0x8 +80000d30: 00e787b3 add a5,a5,a4 +80000d34: 0177f7b3 and a5,a5,s7 +80000d38: 4082d293 srai t0,t0,0x8 +80000d3c: 00d28733 add a4,t0,a3 +80000d40: 01777733 and a4,a4,s7 +80000d44: 40f70733 sub a4,a4,a5 +80000d48: 027703b3 mul t2,a4,t2 +80000d4c: 4083d393 srai t2,t2,0x8 +80000d50: 00f387b3 add a5,t2,a5 +80000d54: 0177f7b3 and a5,a5,s7 +80000d58: 4107d513 srai a0,a5,0x10 +80000d5c: 00f56533 or a0,a0,a5 +80000d60: 01412783 lw a5,20(sp) +80000d64: 00f57533 and a0,a0,a5 +80000d68: c35ff06f j 8000099c + +80000d6c <_exit>: +80000d6c: 00050663 beqz a0,80000d78 +80000d70: 00050193 mv gp,a0 +80000d74: 00000073 ecall + +80000d78 : +80000d78: 348000ef jal ra,800010c0 +80000d7c: 00000513 li a0,0 +80000d80: 0005006b 0x5006b + +80000d84 : +80000d84: fc002573 csrr a0,0xfc0 +80000d88: 0005006b 0x5006b +80000d8c: 00002197 auipc gp,0x2 +80000d90: 03418193 addi gp,gp,52 # 80002dc0 <__global_pointer> +80000d94: 7efff117 auipc sp,0x7efff +80000d98: 26c10113 addi sp,sp,620 # ff000000 <__stack_top> +80000d9c: 40000593 li a1,1024 +80000da0: cc102673 csrr a2,0xcc1 +80000da4: 02c585b3 mul a1,a1,a2 +80000da8: 40b10133 sub sp,sp,a1 +80000dac: cc3026f3 csrr a3,0xcc3 
+80000db0: 00068663 beqz a3,80000dbc +80000db4: 00000513 li a0,0 +80000db8: 0005006b 0x5006b + +80000dbc : +80000dbc: 00008067 ret + +80000dc0 <__libc_init_array>: +80000dc0: ff010113 addi sp,sp,-16 +80000dc4: 00812423 sw s0,8(sp) +80000dc8: 01212023 sw s2,0(sp) +80000dcc: 80002437 lui s0,0x80002 +80000dd0: 80002937 lui s2,0x80002 +80000dd4: 5b840793 addi a5,s0,1464 # 800025b8 <__stack_top+0x810025b8> +80000dd8: 5b890913 addi s2,s2,1464 # 800025b8 <__stack_top+0x810025b8> +80000ddc: 40f90933 sub s2,s2,a5 +80000de0: 00112623 sw ra,12(sp) +80000de4: 00912223 sw s1,4(sp) +80000de8: 40295913 srai s2,s2,0x2 +80000dec: 02090063 beqz s2,80000e0c <__libc_init_array+0x4c> +80000df0: 5b840413 addi s0,s0,1464 +80000df4: 00000493 li s1,0 +80000df8: 00042783 lw a5,0(s0) +80000dfc: 00148493 addi s1,s1,1 +80000e00: 00440413 addi s0,s0,4 +80000e04: 000780e7 jalr a5 +80000e08: fe9918e3 bne s2,s1,80000df8 <__libc_init_array+0x38> +80000e0c: 80002437 lui s0,0x80002 +80000e10: 80002937 lui s2,0x80002 +80000e14: 5b840793 addi a5,s0,1464 # 800025b8 <__stack_top+0x810025b8> +80000e18: 5bc90913 addi s2,s2,1468 # 800025bc <__stack_top+0x810025bc> +80000e1c: 40f90933 sub s2,s2,a5 +80000e20: 40295913 srai s2,s2,0x2 +80000e24: 02090063 beqz s2,80000e44 <__libc_init_array+0x84> +80000e28: 5b840413 addi s0,s0,1464 +80000e2c: 00000493 li s1,0 +80000e30: 00042783 lw a5,0(s0) +80000e34: 00148493 addi s1,s1,1 +80000e38: 00440413 addi s0,s0,4 +80000e3c: 000780e7 jalr a5 +80000e40: fe9918e3 bne s2,s1,80000e30 <__libc_init_array+0x70> +80000e44: 00c12083 lw ra,12(sp) +80000e48: 00812403 lw s0,8(sp) +80000e4c: 00412483 lw s1,4(sp) +80000e50: 00012903 lw s2,0(sp) +80000e54: 01010113 addi sp,sp,16 +80000e58: 00008067 ret + +80000e5c <__libc_fini_array>: +80000e5c: ff010113 addi sp,sp,-16 +80000e60: 00812423 sw s0,8(sp) +80000e64: 800027b7 lui a5,0x80002 +80000e68: 80002437 lui s0,0x80002 +80000e6c: 5bc40413 addi s0,s0,1468 # 800025bc <__stack_top+0x810025bc> +80000e70: 5bc78793 addi a5,a5,1468 # 800025bc 
<__stack_top+0x810025bc> +80000e74: 408787b3 sub a5,a5,s0 +80000e78: 00912223 sw s1,4(sp) +80000e7c: 00112623 sw ra,12(sp) +80000e80: 4027d493 srai s1,a5,0x2 +80000e84: 02048063 beqz s1,80000ea4 <__libc_fini_array+0x48> +80000e88: ffc78793 addi a5,a5,-4 +80000e8c: 00878433 add s0,a5,s0 +80000e90: 00042783 lw a5,0(s0) +80000e94: fff48493 addi s1,s1,-1 +80000e98: ffc40413 addi s0,s0,-4 +80000e9c: 000780e7 jalr a5 +80000ea0: fe0498e3 bnez s1,80000e90 <__libc_fini_array+0x34> +80000ea4: 00c12083 lw ra,12(sp) +80000ea8: 00812403 lw s0,8(sp) +80000eac: 00412483 lw s1,4(sp) +80000eb0: 01010113 addi sp,sp,16 +80000eb4: 00008067 ret + +80000eb8 : +80000eb8: fe010113 addi sp,sp,-32 +80000ebc: 00112e23 sw ra,28(sp) +80000ec0: 00812c23 sw s0,24(sp) +80000ec4: 00912a23 sw s1,20(sp) +80000ec8: 01212823 sw s2,16(sp) +80000ecc: 01312623 sw s3,12(sp) +80000ed0: fc0027f3 csrr a5,0xfc0 +80000ed4: 0007806b 0x7806b +80000ed8: cc5027f3 csrr a5,0xcc5 +80000edc: cc3029f3 csrr s3,0xcc3 +80000ee0: cc002773 csrr a4,0xcc0 +80000ee4: fc002673 csrr a2,0xfc0 +80000ee8: 00279693 slli a3,a5,0x2 +80000eec: 800037b7 lui a5,0x80003 +80000ef0: 9f478793 addi a5,a5,-1548 # 800029f4 <__stack_top+0x810029f4> +80000ef4: 00d787b3 add a5,a5,a3 +80000ef8: 0007a483 lw s1,0(a5) +80000efc: 0104a403 lw s0,16(s1) +80000f00: 00c4a683 lw a3,12(s1) +80000f04: 0089a933 slt s2,s3,s0 +80000f08: 00040793 mv a5,s0 +80000f0c: 00d90933 add s2,s2,a3 +80000f10: 03368433 mul s0,a3,s3 +80000f14: 00f9d463 bge s3,a5,80000f1c +80000f18: 00098793 mv a5,s3 +80000f1c: 00f40433 add s0,s0,a5 +80000f20: 0084a683 lw a3,8(s1) +80000f24: 02c40433 mul s0,s0,a2 +80000f28: 02e907b3 mul a5,s2,a4 +80000f2c: 00d40433 add s0,s0,a3 +80000f30: 00f40433 add s0,s0,a5 +80000f34: 00890933 add s2,s2,s0 +80000f38: 01245e63 bge s0,s2,80000f54 +80000f3c: 0004a783 lw a5,0(s1) +80000f40: 0044a583 lw a1,4(s1) +80000f44: 00040513 mv a0,s0 +80000f48: 00140413 addi s0,s0,1 +80000f4c: 000780e7 jalr a5 +80000f50: fe8916e3 bne s2,s0,80000f3c +80000f54: 0019b993 
seqz s3,s3 +80000f58: 0009806b 0x9806b +80000f5c: 01c12083 lw ra,28(sp) +80000f60: 01812403 lw s0,24(sp) +80000f64: 01412483 lw s1,20(sp) +80000f68: 01012903 lw s2,16(sp) +80000f6c: 00c12983 lw s3,12(sp) +80000f70: 02010113 addi sp,sp,32 +80000f74: 00008067 ret + +80000f78 : +80000f78: fc010113 addi sp,sp,-64 +80000f7c: 02112e23 sw ra,60(sp) +80000f80: 02812c23 sw s0,56(sp) +80000f84: 02912a23 sw s1,52(sp) +80000f88: 03212823 sw s2,48(sp) +80000f8c: 03312623 sw s3,44(sp) +80000f90: fc2026f3 csrr a3,0xfc2 +80000f94: fc102873 csrr a6,0xfc1 +80000f98: fc002473 csrr s0,0xfc0 +80000f9c: cc5027f3 csrr a5,0xcc5 +80000fa0: 01f00713 li a4,31 +80000fa4: 0cf74463 blt a4,a5,8000106c +80000fa8: 030408b3 mul a7,s0,a6 +80000fac: 00100713 li a4,1 +80000fb0: 00a8d463 bge a7,a0,80000fb8 +80000fb4: 03154733 div a4,a0,a7 +80000fb8: 0ce6c863 blt a3,a4,80001088 +80000fbc: 0ae7d863 bge a5,a4,8000106c +80000fc0: fff68693 addi a3,a3,-1 +80000fc4: 02e54333 div t1,a0,a4 +80000fc8: 00030893 mv a7,t1 +80000fcc: 00f69663 bne a3,a5,80000fd8 +80000fd0: 02e56533 rem a0,a0,a4 +80000fd4: 006508b3 add a7,a0,t1 +80000fd8: 0288c4b3 div s1,a7,s0 +80000fdc: 0288e933 rem s2,a7,s0 +80000fe0: 0b04ca63 blt s1,a6,80001094 +80000fe4: 00100693 li a3,1 +80000fe8: 0304c733 div a4,s1,a6 +80000fec: 00070663 beqz a4,80000ff8 +80000ff0: 00070693 mv a3,a4 +80000ff4: 0304e733 rem a4,s1,a6 +80000ff8: 800039b7 lui s3,0x80003 +80000ffc: 9f498993 addi s3,s3,-1548 # 800029f4 <__stack_top+0x810029f4> +80001000: 00e12e23 sw a4,28(sp) +80001004: 00c10713 addi a4,sp,12 +80001008: 00b12623 sw a1,12(sp) +8000100c: 00c12823 sw a2,16(sp) +80001010: 00d12c23 sw a3,24(sp) +80001014: 02f30333 mul t1,t1,a5 +80001018: 00279793 slli a5,a5,0x2 +8000101c: 00f987b3 add a5,s3,a5 +80001020: 00e7a023 sw a4,0(a5) +80001024: 00612a23 sw t1,20(sp) +80001028: 06904c63 bgtz s1,800010a0 +8000102c: 04090063 beqz s2,8000106c +80001030: 02848433 mul s0,s1,s0 +80001034: 00812a23 sw s0,20(sp) +80001038: 0009006b 0x9006b +8000103c: cc5027f3 csrr a5,0xcc5 
+80001040: cc202573 csrr a0,0xcc2 +80001044: 00279793 slli a5,a5,0x2 +80001048: 00f989b3 add s3,s3,a5 +8000104c: 0009a783 lw a5,0(s3) +80001050: 0087a683 lw a3,8(a5) +80001054: 0007a703 lw a4,0(a5) +80001058: 0047a583 lw a1,4(a5) +8000105c: 00d50533 add a0,a0,a3 +80001060: 000700e7 jalr a4 +80001064: 00100793 li a5,1 +80001068: 0007806b 0x7806b +8000106c: 03c12083 lw ra,60(sp) +80001070: 03812403 lw s0,56(sp) +80001074: 03412483 lw s1,52(sp) +80001078: 03012903 lw s2,48(sp) +8000107c: 02c12983 lw s3,44(sp) +80001080: 04010113 addi sp,sp,64 +80001084: 00008067 ret +80001088: 00068713 mv a4,a3 +8000108c: f2e7cae3 blt a5,a4,80000fc0 +80001090: fddff06f j 8000106c +80001094: 00000713 li a4,0 +80001098: 00100693 li a3,1 +8000109c: f5dff06f j 80000ff8 +800010a0: 00048713 mv a4,s1 +800010a4: 00985463 bge a6,s1,800010ac +800010a8: 00080713 mv a4,a6 +800010ac: 800017b7 lui a5,0x80001 +800010b0: eb878793 addi a5,a5,-328 # 80000eb8 <__stack_top+0x81000eb8> +800010b4: 00f7106b 0xf7106b +800010b8: e01ff0ef jal ra,80000eb8 +800010bc: f71ff06f j 8000102c + +800010c0 : +800010c0: cc5027f3 csrr a5,0xcc5 +800010c4: 00ff0737 lui a4,0xff0 +800010c8: 00e787b3 add a5,a5,a4 +800010cc: 00879793 slli a5,a5,0x8 +800010d0: b0002773 csrr a4,mcycle +800010d4: 00e7a023 sw a4,0(a5) +800010d8: b0102773 csrr a4,0xb01 +800010dc: 00e7a223 sw a4,4(a5) +800010e0: b0202773 csrr a4,minstret +800010e4: 00e7a423 sw a4,8(a5) +800010e8: b0302773 csrr a4,mhpmcounter3 +800010ec: 00e7a623 sw a4,12(a5) +800010f0: b0402773 csrr a4,mhpmcounter4 +800010f4: 00e7a823 sw a4,16(a5) +800010f8: b0502773 csrr a4,mhpmcounter5 +800010fc: 00e7aa23 sw a4,20(a5) +80001100: b0602773 csrr a4,mhpmcounter6 +80001104: 00e7ac23 sw a4,24(a5) +80001108: b0702773 csrr a4,mhpmcounter7 +8000110c: 00e7ae23 sw a4,28(a5) +80001110: b0802773 csrr a4,mhpmcounter8 +80001114: 02e7a023 sw a4,32(a5) +80001118: b0902773 csrr a4,mhpmcounter9 +8000111c: 02e7a223 sw a4,36(a5) +80001120: b0a02773 csrr a4,mhpmcounter10 +80001124: 02e7a423 sw a4,40(a5) 
+80001128: b0b02773 csrr a4,mhpmcounter11 +8000112c: 02e7a623 sw a4,44(a5) +80001130: b0c02773 csrr a4,mhpmcounter12 +80001134: 02e7a823 sw a4,48(a5) +80001138: b0d02773 csrr a4,mhpmcounter13 +8000113c: 02e7aa23 sw a4,52(a5) +80001140: b0e02773 csrr a4,mhpmcounter14 +80001144: 02e7ac23 sw a4,56(a5) +80001148: b0f02773 csrr a4,mhpmcounter15 +8000114c: 02e7ae23 sw a4,60(a5) +80001150: b1002773 csrr a4,mhpmcounter16 +80001154: 04e7a023 sw a4,64(a5) +80001158: b1102773 csrr a4,mhpmcounter17 +8000115c: 04e7a223 sw a4,68(a5) +80001160: b1202773 csrr a4,mhpmcounter18 +80001164: 04e7a423 sw a4,72(a5) +80001168: b1302773 csrr a4,mhpmcounter19 +8000116c: 04e7a623 sw a4,76(a5) +80001170: b1402773 csrr a4,mhpmcounter20 +80001174: 04e7a823 sw a4,80(a5) +80001178: b1502773 csrr a4,mhpmcounter21 +8000117c: 04e7aa23 sw a4,84(a5) +80001180: b1602773 csrr a4,mhpmcounter22 +80001184: 04e7ac23 sw a4,88(a5) +80001188: b1702773 csrr a4,mhpmcounter23 +8000118c: 04e7ae23 sw a4,92(a5) +80001190: b1802773 csrr a4,mhpmcounter24 +80001194: 06e7a023 sw a4,96(a5) +80001198: b1902773 csrr a4,mhpmcounter25 +8000119c: 06e7a223 sw a4,100(a5) +800011a0: b1a02773 csrr a4,mhpmcounter26 +800011a4: 06e7a423 sw a4,104(a5) +800011a8: b1b02773 csrr a4,mhpmcounter27 +800011ac: 06e7a623 sw a4,108(a5) +800011b0: b1c02773 csrr a4,mhpmcounter28 +800011b4: 06e7a823 sw a4,112(a5) +800011b8: b1d02773 csrr a4,mhpmcounter29 +800011bc: 06e7aa23 sw a4,116(a5) +800011c0: b1e02773 csrr a4,mhpmcounter30 +800011c4: 06e7ac23 sw a4,120(a5) +800011c8: b1f02773 csrr a4,mhpmcounter31 +800011cc: 06e7ae23 sw a4,124(a5) +800011d0: b8002773 csrr a4,mcycleh +800011d4: 08e7a023 sw a4,128(a5) +800011d8: b8102773 csrr a4,0xb81 +800011dc: 08e7a223 sw a4,132(a5) +800011e0: b8202773 csrr a4,minstreth +800011e4: 08e7a423 sw a4,136(a5) +800011e8: b8302773 csrr a4,mhpmcounter3h +800011ec: 08e7a623 sw a4,140(a5) +800011f0: b8402773 csrr a4,mhpmcounter4h +800011f4: 08e7a823 sw a4,144(a5) +800011f8: b8502773 csrr a4,mhpmcounter5h +800011fc: 
08e7aa23 sw a4,148(a5) +80001200: b8602773 csrr a4,mhpmcounter6h +80001204: 08e7ac23 sw a4,152(a5) +80001208: b8702773 csrr a4,mhpmcounter7h +8000120c: 08e7ae23 sw a4,156(a5) +80001210: b8802773 csrr a4,mhpmcounter8h +80001214: 0ae7a023 sw a4,160(a5) +80001218: b8902773 csrr a4,mhpmcounter9h +8000121c: 0ae7a223 sw a4,164(a5) +80001220: b8a02773 csrr a4,mhpmcounter10h +80001224: 0ae7a423 sw a4,168(a5) +80001228: b8b02773 csrr a4,mhpmcounter11h +8000122c: 0ae7a623 sw a4,172(a5) +80001230: b8c02773 csrr a4,mhpmcounter12h +80001234: 0ae7a823 sw a4,176(a5) +80001238: b8d02773 csrr a4,mhpmcounter13h +8000123c: 0ae7aa23 sw a4,180(a5) +80001240: b8e02773 csrr a4,mhpmcounter14h +80001244: 0ae7ac23 sw a4,184(a5) +80001248: b8f02773 csrr a4,mhpmcounter15h +8000124c: 0ae7ae23 sw a4,188(a5) +80001250: b9002773 csrr a4,mhpmcounter16h +80001254: 0ce7a023 sw a4,192(a5) +80001258: b9102773 csrr a4,mhpmcounter17h +8000125c: 0ce7a223 sw a4,196(a5) +80001260: b9202773 csrr a4,mhpmcounter18h +80001264: 0ce7a423 sw a4,200(a5) +80001268: b9302773 csrr a4,mhpmcounter19h +8000126c: 0ce7a623 sw a4,204(a5) +80001270: b9402773 csrr a4,mhpmcounter20h +80001274: 0ce7a823 sw a4,208(a5) +80001278: b9502773 csrr a4,mhpmcounter21h +8000127c: 0ce7aa23 sw a4,212(a5) +80001280: b9602773 csrr a4,mhpmcounter22h +80001284: 0ce7ac23 sw a4,216(a5) +80001288: b9702773 csrr a4,mhpmcounter23h +8000128c: 0ce7ae23 sw a4,220(a5) +80001290: b9802773 csrr a4,mhpmcounter24h +80001294: 0ee7a023 sw a4,224(a5) +80001298: b9902773 csrr a4,mhpmcounter25h +8000129c: 0ee7a223 sw a4,228(a5) +800012a0: b9a02773 csrr a4,mhpmcounter26h +800012a4: 0ee7a423 sw a4,232(a5) +800012a8: b9b02773 csrr a4,mhpmcounter27h +800012ac: 0ee7a623 sw a4,236(a5) +800012b0: b9c02773 csrr a4,mhpmcounter28h +800012b4: 0ee7a823 sw a4,240(a5) +800012b8: b9d02773 csrr a4,mhpmcounter29h +800012bc: 0ee7aa23 sw a4,244(a5) +800012c0: b9e02773 csrr a4,mhpmcounter30h +800012c4: 0ee7ac23 sw a4,248(a5) +800012c8: b9f02773 csrr a4,mhpmcounter31h +800012cc: 
0ee7ae23 sw a4,252(a5) +800012d0: 00008067 ret + +800012d4 : +800012d4: 00050593 mv a1,a0 +800012d8: 00000693 li a3,0 +800012dc: 00000613 li a2,0 +800012e0: 00000513 li a0,0 +800012e4: 1140006f j 800013f8 <__register_exitproc> + +800012e8 : +800012e8: ff010113 addi sp,sp,-16 +800012ec: 00000593 li a1,0 +800012f0: 00812423 sw s0,8(sp) +800012f4: 00112623 sw ra,12(sp) +800012f8: 00050413 mv s0,a0 +800012fc: 198000ef jal ra,80001494 <__call_exitprocs> +80001300: 800037b7 lui a5,0x80003 +80001304: 9f07a503 lw a0,-1552(a5) # 800029f0 <__stack_top+0x810029f0> +80001308: 03c52783 lw a5,60(a0) +8000130c: 00078463 beqz a5,80001314 +80001310: 000780e7 jalr a5 +80001314: 00040513 mv a0,s0 +80001318: a55ff0ef jal ra,80000d6c <_exit> + +8000131c : +8000131c: 00f00313 li t1,15 +80001320: 00050713 mv a4,a0 +80001324: 02c37e63 bgeu t1,a2,80001360 +80001328: 00f77793 andi a5,a4,15 +8000132c: 0a079063 bnez a5,800013cc +80001330: 08059263 bnez a1,800013b4 +80001334: ff067693 andi a3,a2,-16 +80001338: 00f67613 andi a2,a2,15 +8000133c: 00e686b3 add a3,a3,a4 +80001340: 00b72023 sw a1,0(a4) # ff0000 <__stack_size+0xfefc00> +80001344: 00b72223 sw a1,4(a4) +80001348: 00b72423 sw a1,8(a4) +8000134c: 00b72623 sw a1,12(a4) +80001350: 01070713 addi a4,a4,16 +80001354: fed766e3 bltu a4,a3,80001340 +80001358: 00061463 bnez a2,80001360 +8000135c: 00008067 ret +80001360: 40c306b3 sub a3,t1,a2 +80001364: 00269693 slli a3,a3,0x2 +80001368: 00000297 auipc t0,0x0 +8000136c: 005686b3 add a3,a3,t0 +80001370: 00c68067 jr 12(a3) +80001374: 00b70723 sb a1,14(a4) +80001378: 00b706a3 sb a1,13(a4) +8000137c: 00b70623 sb a1,12(a4) +80001380: 00b705a3 sb a1,11(a4) +80001384: 00b70523 sb a1,10(a4) +80001388: 00b704a3 sb a1,9(a4) +8000138c: 00b70423 sb a1,8(a4) +80001390: 00b703a3 sb a1,7(a4) +80001394: 00b70323 sb a1,6(a4) +80001398: 00b702a3 sb a1,5(a4) +8000139c: 00b70223 sb a1,4(a4) +800013a0: 00b701a3 sb a1,3(a4) +800013a4: 00b70123 sb a1,2(a4) +800013a8: 00b700a3 sb a1,1(a4) +800013ac: 00b70023 sb a1,0(a4) 
+800013b0: 00008067 ret +800013b4: 0ff5f593 andi a1,a1,255 +800013b8: 00859693 slli a3,a1,0x8 +800013bc: 00d5e5b3 or a1,a1,a3 +800013c0: 01059693 slli a3,a1,0x10 +800013c4: 00d5e5b3 or a1,a1,a3 +800013c8: f6dff06f j 80001334 +800013cc: 00279693 slli a3,a5,0x2 +800013d0: 00000297 auipc t0,0x0 +800013d4: 005686b3 add a3,a3,t0 +800013d8: 00008293 mv t0,ra +800013dc: fa0680e7 jalr -96(a3) +800013e0: 00028093 mv ra,t0 +800013e4: ff078793 addi a5,a5,-16 +800013e8: 40f70733 sub a4,a4,a5 +800013ec: 00f60633 add a2,a2,a5 +800013f0: f6c378e3 bgeu t1,a2,80001360 +800013f4: f3dff06f j 80001330 + +800013f8 <__register_exitproc>: +800013f8: 800037b7 lui a5,0x80003 +800013fc: 9f07a703 lw a4,-1552(a5) # 800029f0 <__stack_top+0x810029f0> +80001400: 14872783 lw a5,328(a4) +80001404: 04078c63 beqz a5,8000145c <__register_exitproc+0x64> +80001408: 0047a703 lw a4,4(a5) +8000140c: 01f00813 li a6,31 +80001410: 06e84e63 blt a6,a4,8000148c <__register_exitproc+0x94> +80001414: 00271813 slli a6,a4,0x2 +80001418: 02050663 beqz a0,80001444 <__register_exitproc+0x4c> +8000141c: 01078333 add t1,a5,a6 +80001420: 08c32423 sw a2,136(t1) +80001424: 1887a883 lw a7,392(a5) +80001428: 00100613 li a2,1 +8000142c: 00e61633 sll a2,a2,a4 +80001430: 00c8e8b3 or a7,a7,a2 +80001434: 1917a423 sw a7,392(a5) +80001438: 10d32423 sw a3,264(t1) +8000143c: 00200693 li a3,2 +80001440: 02d50463 beq a0,a3,80001468 <__register_exitproc+0x70> +80001444: 00170713 addi a4,a4,1 +80001448: 00e7a223 sw a4,4(a5) +8000144c: 010787b3 add a5,a5,a6 +80001450: 00b7a423 sw a1,8(a5) +80001454: 00000513 li a0,0 +80001458: 00008067 ret +8000145c: 14c70793 addi a5,a4,332 +80001460: 14f72423 sw a5,328(a4) +80001464: fa5ff06f j 80001408 <__register_exitproc+0x10> +80001468: 18c7a683 lw a3,396(a5) +8000146c: 00170713 addi a4,a4,1 +80001470: 00e7a223 sw a4,4(a5) +80001474: 00c6e633 or a2,a3,a2 +80001478: 18c7a623 sw a2,396(a5) +8000147c: 010787b3 add a5,a5,a6 +80001480: 00b7a423 sw a1,8(a5) +80001484: 00000513 li a0,0 +80001488: 00008067 
ret +8000148c: fff00513 li a0,-1 +80001490: 00008067 ret + +80001494 <__call_exitprocs>: +80001494: fd010113 addi sp,sp,-48 +80001498: 800037b7 lui a5,0x80003 +8000149c: 01412c23 sw s4,24(sp) +800014a0: 9f07aa03 lw s4,-1552(a5) # 800029f0 <__stack_top+0x810029f0> +800014a4: 03212023 sw s2,32(sp) +800014a8: 02112623 sw ra,44(sp) +800014ac: 148a2903 lw s2,328(s4) +800014b0: 02812423 sw s0,40(sp) +800014b4: 02912223 sw s1,36(sp) +800014b8: 01312e23 sw s3,28(sp) +800014bc: 01512a23 sw s5,20(sp) +800014c0: 01612823 sw s6,16(sp) +800014c4: 01712623 sw s7,12(sp) +800014c8: 01812423 sw s8,8(sp) +800014cc: 04090063 beqz s2,8000150c <__call_exitprocs+0x78> +800014d0: 00050b13 mv s6,a0 +800014d4: 00058b93 mv s7,a1 +800014d8: 00100a93 li s5,1 +800014dc: fff00993 li s3,-1 +800014e0: 00492483 lw s1,4(s2) +800014e4: fff48413 addi s0,s1,-1 +800014e8: 02044263 bltz s0,8000150c <__call_exitprocs+0x78> +800014ec: 00249493 slli s1,s1,0x2 +800014f0: 009904b3 add s1,s2,s1 +800014f4: 040b8463 beqz s7,8000153c <__call_exitprocs+0xa8> +800014f8: 1044a783 lw a5,260(s1) +800014fc: 05778063 beq a5,s7,8000153c <__call_exitprocs+0xa8> +80001500: fff40413 addi s0,s0,-1 +80001504: ffc48493 addi s1,s1,-4 +80001508: ff3416e3 bne s0,s3,800014f4 <__call_exitprocs+0x60> +8000150c: 02c12083 lw ra,44(sp) +80001510: 02812403 lw s0,40(sp) +80001514: 02412483 lw s1,36(sp) +80001518: 02012903 lw s2,32(sp) +8000151c: 01c12983 lw s3,28(sp) +80001520: 01812a03 lw s4,24(sp) +80001524: 01412a83 lw s5,20(sp) +80001528: 01012b03 lw s6,16(sp) +8000152c: 00c12b83 lw s7,12(sp) +80001530: 00812c03 lw s8,8(sp) +80001534: 03010113 addi sp,sp,48 +80001538: 00008067 ret +8000153c: 00492783 lw a5,4(s2) +80001540: 0044a683 lw a3,4(s1) +80001544: fff78793 addi a5,a5,-1 +80001548: 04878e63 beq a5,s0,800015a4 <__call_exitprocs+0x110> +8000154c: 0004a223 sw zero,4(s1) +80001550: fa0688e3 beqz a3,80001500 <__call_exitprocs+0x6c> +80001554: 18892783 lw a5,392(s2) +80001558: 008a9733 sll a4,s5,s0 +8000155c: 00492c03 lw s8,4(s2) 
+80001560: 00f777b3 and a5,a4,a5 +80001564: 02079263 bnez a5,80001588 <__call_exitprocs+0xf4> +80001568: 000680e7 jalr a3 +8000156c: 00492703 lw a4,4(s2) +80001570: 148a2783 lw a5,328(s4) +80001574: 01871463 bne a4,s8,8000157c <__call_exitprocs+0xe8> +80001578: f92784e3 beq a5,s2,80001500 <__call_exitprocs+0x6c> +8000157c: f80788e3 beqz a5,8000150c <__call_exitprocs+0x78> +80001580: 00078913 mv s2,a5 +80001584: f5dff06f j 800014e0 <__call_exitprocs+0x4c> +80001588: 18c92783 lw a5,396(s2) +8000158c: 0844a583 lw a1,132(s1) +80001590: 00f77733 and a4,a4,a5 +80001594: 00071c63 bnez a4,800015ac <__call_exitprocs+0x118> +80001598: 000b0513 mv a0,s6 +8000159c: 000680e7 jalr a3 +800015a0: fcdff06f j 8000156c <__call_exitprocs+0xd8> +800015a4: 00892223 sw s0,4(s2) +800015a8: fa9ff06f j 80001550 <__call_exitprocs+0xbc> +800015ac: 00058513 mv a0,a1 +800015b0: 000680e7 jalr a3 +800015b4: fb9ff06f j 8000156c <__call_exitprocs+0xd8> + +Disassembly of section .init_array: + +800025b8 <__init_array_start>: +800025b8: 00fc addi a5,sp,76 +800025ba: 8000 0x8000 + +Disassembly of section .data: + +800025c0 : +800025c0: 0000 unimp +800025c2: 0000 unimp +800025c4: 28ac fld fa1,80(s1) +800025c6: 8000 0x8000 +800025c8: 2914 fld fa3,16(a0) +800025ca: 8000 0x8000 +800025cc: 297c fld fa5,208(a0) +800025ce: 8000 0x8000 + ... +80002668: 0001 nop +8000266a: 0000 unimp +8000266c: 0000 unimp +8000266e: 0000 unimp +80002670: 330e fld ft6,224(sp) +80002672: abcd j 80002c64 <__BSS_END__+0x1f0> +80002674: 1234 addi a3,sp,296 +80002676: e66d bnez a2,80002760 +80002678: deec sw a1,124(a3) +8000267a: 0005 c.nop 1 +8000267c: 0000000b 0xb + ... + +Disassembly of section .sdata: + +800029e8 <__SDATA_BEGIN__>: +800029e8: 0000 unimp +800029ea: 4980 lw s0,16(a1) +800029ec: 0000 unimp +800029ee: 3f80 fld fs0,56(a5) + +800029f0 <_global_impure_ptr>: +800029f0: 25c0 fld fs0,136(a1) +800029f2: 8000 0x8000 + +Disassembly of section .bss: + +800029f4 : + ... 
+ +Disassembly of section .comment: + +00000000 <.comment>: + 0: 3a434347 fmsub.d ft6,ft6,ft4,ft7,rmm + 4: 2820 fld fs0,80(s0) + 6: 29554e47 fmsub.s ft8,fa0,fs5,ft5,rmm + a: 3120 fld fs0,96(a0) + c: 2e30 fld fa2,88(a2) + e: 2e32 fld ft8,264(sp) + 10: 0030 addi a2,sp,8 + +Disassembly of section .riscv.attributes: + +00000000 <.riscv.attributes>: + 0: 2941 jal 490 <__stack_size+0x90> + 2: 0000 unimp + 4: 7200 flw fs0,32(a2) + 6: 7369 lui t1,0xffffa + 8: 01007663 bgeu zero,a6,14 <__stack_usage+0x14> + c: 001f 0000 1004 0x10040000001f + 12: 7205 lui tp,0xfffe1 + 14: 3376 fld ft6,376(sp) + 16: 6932 flw fs2,12(sp) + 18: 7032 flw ft0,44(sp) + 1a: 5f30 lw a2,120(a4) + 1c: 326d jal fffff9c6 <__stack_top+0xfff9c6> + 1e: 3070 fld fa2,224(s0) + 20: 665f 7032 0030 0x307032665f + 26: 0108 addi a0,sp,128 + 28: 0b0a slli s6,s6,0x2 diff --git a/tests/regression/tex/kernel.elf b/tests/regression/tex/kernel.elf new file mode 100755 index 00000000..15dff9f8 Binary files /dev/null and b/tests/regression/tex/kernel.elf differ diff --git a/tests/regression/tex/main.cpp b/tests/regression/tex/main.cpp new file mode 100644 index 00000000..b57f18e4 --- /dev/null +++ b/tests/regression/tex/main.cpp @@ -0,0 +1,260 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include "common.h" +#include "utils.h" + +#define RT_CHECK(_expr) \ + do { \ + int _ret = _expr; \ + if (0 == _ret) \ + break; \ + printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \ + cleanup(); \ + exit(-1); \ + } while (false) + +/////////////////////////////////////////////////////////////////////////////// + +const char* kernel_file = "kernel.bin"; +const char* input_file = "palette64.tga"; +const char* output_file = "output.tga"; +int wrap = 0; +int filter = 0; +float scale = 1.0f; +int format = 0; +bool use_sw = false; +ePixelFormat eformat = FORMAT_A8R8G8B8; + +vx_device_h device = nullptr; +vx_buffer_h buffer = nullptr; + +static void show_usage() { + std::cout << "Vortex Texture 
Test." << std::endl; + std::cout << "Usage: [-k: kernel] [-i image] [-o image] [-s scale] [-w wrap] [-f format] [-g filter] [-z no_hw] [-h: help]" << std::endl; +} + +static void parse_args(int argc, char **argv) { + int c; + while ((c = getopt(argc, argv, "zi:o:k:w:f:g:h?")) != -1) { + switch (c) { + case 'i': + input_file = optarg; + break; + case 'o': + output_file = optarg; + break; + case 's': + scale = std::stof(optarg, NULL); + break; + case 'w': + wrap = std::atoi(optarg); + break; + case 'z': + use_sw = true; + break; + case 'f': { + format = std::atoi(optarg); + switch (format) { + case 0: eformat = FORMAT_A8R8G8B8; break; + case 1: eformat = FORMAT_R5G6B5; break; + case 2: eformat = FORMAT_R4G4B4A4; break; + case 3: eformat = FORMAT_L8; break; + case 4: eformat = FORMAT_A8; break; + default: + std::cout << "Error: invalid format: " << format << std::endl; + exit(1); + } + } break; + case 'g': + filter = std::atoi(optarg); + break; + case 'k': + kernel_file = optarg; + break; + case 'h': + case '?': { + show_usage(); + exit(0); + } break; + default: + show_usage(); + exit(-1); + } + } +} + +void cleanup() { + if (buffer) { + vx_buf_release(buffer); + } + if (device) { + vx_dev_close(device); + } +} + +int run_test(const kernel_arg_t& kernel_arg, + uint32_t buf_size, + uint32_t width, + uint32_t height, + uint32_t bpp) { + auto time_start = std::chrono::high_resolution_clock::now(); + + // start device + std::cout << "start device" << std::endl; + RT_CHECK(vx_start(device)); + + // wait for completion + std::cout << "wait for completion" << std::endl; + RT_CHECK(vx_ready_wait(device, -1)); + + auto time_end = std::chrono::high_resolution_clock::now(); + double elapsed = std::chrono::duration_cast(time_end - time_start).count(); + printf("Elapsed time: %lg ms\n", elapsed); + + // download destination buffer + std::cout << "download destination buffer" << std::endl; + RT_CHECK(vx_copy_from_dev(buffer, kernel_arg.dst_ptr, buf_size, 0)); + + std::vector 
dst_pixels(buf_size); + auto buf_ptr = (uint8_t*)vx_host_ptr(buffer); + for (uint32_t i = 0; i < buf_size; ++i) { + dst_pixels[i] = buf_ptr[i]; + } + + // save output image + std::cout << "save output image" << std::endl; + //dump_image(dst_pixels, width, height, bpp); + RT_CHECK(SaveTGA(output_file, dst_pixels, width, height, bpp)); + + return 0; +} + +int main(int argc, char *argv[]) { + kernel_arg_t kernel_arg; + std::vector src_pixels; + uint32_t src_width; + uint32_t src_height; + uint32_t src_bpp; + + // parse command arguments + parse_args(argc, argv); + + std::vector tmp_pixels; + RT_CHECK(LoadTGA(input_file, tmp_pixels, &src_width, &src_height)); + + // check power of two support + if (!ISPOW2(src_width) || !ISPOW2(src_height)) { + std::cout << "Error: only power of two textures supported: width=" << src_width << ", heigth=" << src_height << std::endl; + return -1; + } + + RT_CHECK(ConvertImage(src_pixels, tmp_pixels, src_width, src_height, FORMAT_A8R8G8B8, eformat)); + src_bpp = Format::GetInfo(eformat).BytePerPixel; + + //dump_image(src_pixels, src_width, src_height, src_bpp); + + uint32_t src_bufsize = src_bpp * src_width * src_height; + + uint32_t dst_width = (uint32_t)(src_width * scale); + uint32_t dst_height = (uint32_t)(src_height * scale); + uint32_t dst_bpp = 4; + uint32_t dst_bufsize = dst_bpp * dst_width * dst_height; + + // open device connection + std::cout << "open device connection" << std::endl; + RT_CHECK(vx_dev_open(&device)); + + unsigned max_cores, max_warps, max_threads; + RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_CORES, &max_cores)); + RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_WARPS, &max_warps)); + RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_THREADS, &max_threads)); + + uint32_t num_tasks = max_cores * max_warps * max_threads; + + std::cout << "number of tasks: " << std::dec << num_tasks << std::endl; + std::cout << "source buffer: width=" << src_width << ", heigth=" << src_height << ", size=" << src_bufsize << " bytes" << std::endl; + 
std::cout << "destination buffer: width=" << dst_width << ", heigth=" << dst_height << ", size=" << dst_bufsize << " bytes" << std::endl; + + // upload program + std::cout << "upload program" << std::endl; + RT_CHECK(vx_upload_kernel_file(device, kernel_file)); + + // allocate device memory + std::cout << "allocate device memory" << std::endl; + size_t src_addr, dst_addr; + RT_CHECK(vx_alloc_dev_mem(device, src_bufsize, &src_addr)); + RT_CHECK(vx_alloc_dev_mem(device, dst_bufsize, &dst_addr)); + + std::cout << "src_addr=0x" << std::hex << src_addr << std::endl; + std::cout << "dst_addr=0x" << std::hex << dst_addr << std::endl; + + // allocate staging shared memory + std::cout << "allocate shared memory" << std::endl; + uint32_t alloc_size = std::max(sizeof(kernel_arg_t), std::max(src_bufsize, dst_bufsize)); + RT_CHECK(vx_alloc_shared_mem(device, alloc_size, &buffer)); + + // upload kernel argument + std::cout << "upload kernel argument" << std::endl; + { + kernel_arg.num_tasks = std::min(num_tasks, dst_height); + kernel_arg.format = format; + kernel_arg.filter = filter; + kernel_arg.wrap = wrap; + kernel_arg.use_sw = use_sw; + kernel_arg.lod = 0x0; + + kernel_arg.src_logWidth = (uint32_t)std::log2(src_width); + kernel_arg.src_logHeight = (uint32_t)std::log2(src_height); + kernel_arg.src_stride = src_bpp; + kernel_arg.src_pitch = src_bpp * src_width; + kernel_arg.src_ptr = src_addr; + + kernel_arg.dst_width = dst_width; + kernel_arg.dst_height = dst_height; + kernel_arg.dst_stride = dst_bpp; + kernel_arg.dst_pitch = dst_bpp * dst_width; + kernel_arg.dst_ptr = dst_addr; + + auto buf_ptr = (int*)vx_host_ptr(buffer); + memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t)); + RT_CHECK(vx_copy_to_dev(buffer, KERNEL_ARG_DEV_MEM_ADDR, sizeof(kernel_arg_t), 0)); + } + + // upload source buffer + std::cout << "upload source buffer" << std::endl; + { + auto buf_ptr = (int8_t*)vx_host_ptr(buffer); + for (uint32_t i = 0; i < src_bufsize; ++i) { + buf_ptr[i] = src_pixels[i]; + } + 
RT_CHECK(vx_copy_to_dev(buffer, kernel_arg.src_ptr, src_bufsize, 0)); + } + + // clear destination buffer + std::cout << "clear destination buffer" << std::endl; + { + auto buf_ptr = (int32_t*)vx_host_ptr(buffer); + for (uint32_t i = 0; i < (dst_bufsize/4); ++i) { + buf_ptr[i] = 0xdeadbeef; + } + RT_CHECK(vx_copy_to_dev(buffer, kernel_arg.dst_ptr, dst_bufsize, 0)); + } + + // run tests + std::cout << "run tests" << std::endl; + RT_CHECK(run_test(kernel_arg, dst_bufsize, dst_width, dst_height, dst_bpp)); + + // cleanup + std::cout << "cleanup" << std::endl; + cleanup(); + + std::cout << "PASSED!" << std::endl; + + return 0; +} \ No newline at end of file diff --git a/tests/regression/tex/output.tga b/tests/regression/tex/output.tga new file mode 100644 index 00000000..31dcb0d3 Binary files /dev/null and b/tests/regression/tex/output.tga differ diff --git a/tests/regression/tex/palette16.tga b/tests/regression/tex/palette16.tga new file mode 100644 index 00000000..2653f445 Binary files /dev/null and b/tests/regression/tex/palette16.tga differ diff --git a/tests/regression/tex/palette4.tga b/tests/regression/tex/palette4.tga new file mode 100644 index 00000000..c342c47f Binary files /dev/null and b/tests/regression/tex/palette4.tga differ diff --git a/tests/regression/tex/palette64.tga b/tests/regression/tex/palette64.tga new file mode 100644 index 00000000..ea9bcd97 Binary files /dev/null and b/tests/regression/tex/palette64.tga differ diff --git a/tests/regression/tex/texsw.h b/tests/regression/tex/texsw.h new file mode 100644 index 00000000..0b3b07e7 --- /dev/null +++ b/tests/regression/tex/texsw.h @@ -0,0 +1,167 @@ +#ifndef _TEXSW_H_ + +#include "common.h" + +#define TEX_LOD_MAX 11 + +#define MIN(x, y) ((x < y) ? (x) : (y)) + +#define MAX(x, y) ((x > y) ? 
(x) : (y)) + +inline int address(int wrap, int value) { + switch (wrap) { + case 1: return value & 0xfffff; + default: + case 0: return MIN(MAX(value, 0), 0xfffff); + } +} + +inline void unpack(int format, int value, int* l, int* h) { + switch (format) { + case 1: + case 2: + *l = value; + *h = 0; + break; + case 3: + *l = (value | (value << 8)) & 0x00ff00ff; + *h = 0; + break; + case 4: + *l = (value | (value << 16)) & 0x07e0f81f; + *h = 0; + break; + case 5: + *l = (value | (value << 12)) & 0x0f0f0f0f; + *h = 0; + break; + default: + case 0: + *l = value & 0x00ff00ff; + *h = (value >> 8) & 0x00ff00ff; + break; + } +} + +inline void lerp(int al, int ah, int bl, int bh, int frac, int* l, int* h) { + *l = (al + (((bl - al) * frac) >> 8)) & 0x00ff00ff; + *h = (ah + (((bh - ah) * frac) >> 8)) & 0x00ff00ff; +} + +inline int pack(int format, int l, int h) { + switch (format) { + case 1: + case 2: + return l; + case 3: + return (l | (l >> 8)) & 0xffff; + case 4: + return (l | (l >> 16)) & 0xffff; + case 5: + return (l | (l >> 12)) & 0xffff; + default: + case 0: + return (h << 8) | l; + } +} + +inline int tex_sw(struct kernel_arg_t* state, int stage, int u, int v, int lod) { + int base_addr = state->src_ptr; + int mip_offset = 0; + int log_width = state->src_logWidth; + int log_height = state->src_logHeight; + int format = state->format; + int wrap = state->wrap; + int filter = state->filter; + + int32_t* pBits = ((uint32_t*)base_addr) + mip_offset; + + if (filter) { + int u0 = address(wrap, u - (0x80000 >> log_width)); + int v0 = address(wrap, v - (0x80000 >> log_height)); + int u1 = address(wrap, u + (0x80000 >> log_width)); + int v1 = address(wrap, v + (0x80000 >> log_height)); + + int x0 = u0 >> (20 - log_width); + int y0 = v0 >> (20 - log_height); + int x1 = u1 >> (20 - log_width); + int y1 = v1 >> (20 - log_height); + + // memory lookup + + int c0 = pBits[x0 + (y0 << log_width)]; + int c1 = pBits[x1 + (y0 << log_width)]; + int c2 = pBits[x0 + (y1 << log_width)]; + 
int c3 = pBits[x1 + (y1 << log_width)]; + + // filtering + + int alpha = x0 & 0xff; + int beta = y0 & 0xff; + + int c0a, c0b; + int c1a, c1b; + int c01a, c01b; + + unpack(format, c0, &c0a, &c0b); + unpack(format, c1, &c1a, &c1b); + lerp(c0a, c0b, c1a, c1b, alpha, &c01a, &c01b); + + int c2a, c2b; + int c3a, c3b; + int c23a, c23b; + + unpack(format, c2, &c2a, &c2b); + unpack(format, c3, &c3a, &c3b); + lerp(c2a, c2b, c3a, c3b, alpha, &c23a, &c23b); + + int c4a, c4b; + lerp(c01a, c01b, c23a, c23b, beta, &c4a, &c4b); + return pack(format, c4a, c4b); + } else { + int u0 = address(wrap, u); + int v0 = address(wrap, v); + + int x0 = u0 >> (20 - log_width); + int y0 = v0 >> (20 - log_height); + + int c0 = pBits[x0 + (y0 <> 8) & 0x00ff00ff; + int bl = b & 0x00ff00ff; + int bh = (b >> 8) & 0x00ff00ff; + int frac = (lod >> 12) & 0xff; + int cl = (al + (((bl - al) * frac) >> 8)) & 0x00ff00ff; + int ch = (ah + (((bh - ah) * frac) >> 8)) & 0x00ff00ff; + int c = al | (ah << 8); + return c; +} + +inline int tex3_sw(struct kernel_arg_t* state, int stage, int u, int v, int lod) { + int lodn = MIN(lod + 0x10000, TEX_LOD_MAX); + int a = tex_sw(state, 0, u, v, lod); + int b = tex_sw(state, 0, u, v, lodn); + int al = a & 0x00ff00ff; + int ah = (a >> 8) & 0x00ff00ff; + + int bl = b & 0x00ff00ff; + int bh = (b >> 8) & 0x00ff00ff; + int frac = (lod >> 12) & 0xff; + int cl = (al + (((bl - al) * frac) >> 8)) & 0x00ff00ff; + int ch = (ah + (((bh - ah) * frac) >> 8)) & 0x00ff00ff; + int c = al | (ah << 8); + return c; +} + +#endif \ No newline at end of file diff --git a/tests/regression/tex/toad.tga b/tests/regression/tex/toad.tga new file mode 100644 index 00000000..1903c5c3 Binary files /dev/null and b/tests/regression/tex/toad.tga differ diff --git a/tests/regression/tex/utils.cpp b/tests/regression/tex/utils.cpp new file mode 100644 index 00000000..46f0862c --- /dev/null +++ b/tests/regression/tex/utils.cpp @@ -0,0 +1,217 @@ +#include "utils.h" +#include +#include +#include "format.h" + 
+struct __attribute__((__packed__)) tga_header_t { + int8_t idlength; + int8_t colormaptype; + int8_t imagetype; + int16_t colormaporigin; + int16_t colormaplength; + int8_t colormapdepth; + int16_t xoffset; + int16_t yoffset; + int16_t width; + int16_t height; + int8_t bitsperpixel; + int8_t imagedescriptor; +}; + +int LoadTGA(const char *filename, + std::vector &pixels, + uint32_t *width, + uint32_t *height) { + std::ifstream ifs(filename, std::ios::in | std::ios::binary); + if (!ifs.is_open()) { + std::cerr << "couldn't open file: " << filename << "!" << std::endl; + return -1; + } + + tga_header_t header; + ifs.read(reinterpret_cast(&header), sizeof(tga_header_t)); + if (ifs.fail()) { + std::cerr << "invalid TGA file header!" << std::endl; + return -1; + } + + if (header.imagetype != 2) { + std::cerr << "unsupported TGA encoding format!" << std::endl; + return -1; + } + + ifs.seekg(header.idlength, std::ios::cur); // skip string + if (ifs.fail()) { + std::cerr << "invalid TGA file!" << std::endl; + return -1; + } + + switch (header.bitsperpixel) { + case 16: + case 24: + case 32: { + auto stride = header.bitsperpixel / 8; + std::vector staging(stride * header.width * header.height); + + // Read pixels data + ifs.read((char*)staging.data(), staging.size()); + if (ifs.fail()) { + std::cerr << "invalid TGA file!" 
<< std::endl; + return -1; + } + + // format conversion to RGBA + pixels.resize(4 * header.width * header.height); + const uint8_t* src_bytes = staging.data(); + uint32_t* dst_bytes = (uint32_t*)pixels.data(); + for (const uint8_t* const src_end = src_bytes + staging.size(); + src_bytes != src_end; + src_bytes += stride) { + ColorARGB color; + switch (stride) { + case 2: + color = Format::ConvertFrom(src_bytes); + break; + case 3: + color = Format::ConvertFrom(src_bytes); + break; + case 4: + color = Format::ConvertFrom(src_bytes); + break; + default: + std::abort(); + } + *dst_bytes++ = color; + } + break; + } + default: + std::cerr << "unsupported TGA bitsperpixel!" << std::endl; + return -1; + } + + *width = header.width; + *height = header.height; + + return 0; +} + +int SaveTGA(const char *filename, + const std::vector &pixels, + uint32_t width, + uint32_t height, + uint32_t bpp) { + std::ofstream ofs(filename, std::ios::out | std::ios::binary); + if (!ofs.is_open()) { + std::cerr << "couldn't create file: " << filename << "!" << std::endl; + return -1; + } + + if (bpp < 2 || bpp > 4) { + std::cerr << "unsupported pixel stride: " << bpp << "!" 
<< std::endl; + return -1; + } + + tga_header_t header; + header.idlength = 0; + header.colormaptype = 0; // no palette + header.imagetype = 2; // color mapped data + header.colormaporigin = 0; + header.colormaplength = 0; + header.colormapdepth = 0; + header.xoffset = 0; + header.yoffset = 0; + header.width = width; + header.height = height; + header.bitsperpixel = bpp * 8; + header.imagedescriptor = 0; + + // write header + ofs.write(reinterpret_cast(&header), sizeof(tga_header_t)); + + // write pixel data + uint32_t pitch = bpp * width; + const uint8_t* pixel_bytes = pixels.data() + (height - 1) * pitch; + for (uint32_t y = 0; y < height; ++y) { + const uint8_t* pixel_row = pixel_bytes; + for (uint32_t x = 0; x < width; ++x) { + ofs.write((const char*)pixel_row, bpp); + pixel_row += bpp; + } + pixel_bytes -= pitch; + } + + return 0; +} + +void dump_image(const std::vector& pixels, uint32_t width, uint32_t height, uint32_t bpp) { + assert(width * height * bpp == pixels.size()); + const uint8_t* pixel_bytes = pixels.data(); + for (uint32_t y = 0; y < height; ++y) { + for (uint32_t x = 0; x < width; ++x) { + uint32_t pixel32 = 0; + for (uint32_t b = 0; b < bpp; ++b) { + uint32_t pixel8 = *pixel_bytes++; + pixel32 |= pixel8 << (b * 8); + } + if (x) std::cout << ", "; + std::cout << std::hex << pixel32; + } + std::cout << std::endl; + } +} + +int CopyBuffers(SurfaceDesc &dstDesc, + int32_t dstOffsetX, + int32_t dstOffsetY, + uint32_t copyWidth, + uint32_t copyHeight, + const SurfaceDesc &srcDesc, + int32_t srcOffsetX, + int32_t srcOffsetY) { + + static const BlitTable s_blitTable; + + if ((srcOffsetX >= (int32_t)srcDesc.Width) || (srcOffsetY >= (int32_t)srcDesc.Height) || + (dstOffsetX >= (int32_t)dstDesc.Width) || (dstOffsetY >= (int32_t)dstDesc.Height)) { + return -1; + } + + if (copyWidth > dstDesc.Width) { + copyWidth = dstDesc.Width; + } + + if (copyWidth > srcDesc.Width) { + copyWidth = srcDesc.Width; + } + + if (copyHeight > dstDesc.Height) { + copyHeight = 
dstDesc.Height; + } + + if (copyHeight > srcDesc.Height) { + copyHeight = srcDesc.Height; + } + + return s_blitTable.get(srcDesc.Format, dstDesc.Format)( + dstDesc, dstOffsetX, dstOffsetY, copyWidth, copyHeight, srcDesc, + srcOffsetX, srcOffsetY); +} + +int ConvertImage(std::vector& dst_pixels, + const std::vector& src_pixels, + uint32_t width, + uint32_t height, + ePixelFormat src_format, + ePixelFormat dst_format) { + + uint32_t src_pitch = Format::GetInfo(src_format).BytePerPixel * width; + uint32_t dst_pitch = Format::GetInfo(dst_format).BytePerPixel * width; + + dst_pixels.resize(dst_pitch * height); + + SurfaceDesc srcDesc{src_format, (uint8_t*)src_pixels.data(), width, height, src_pitch}; + SurfaceDesc dstDesc{dst_format, dst_pixels.data(), width, height, dst_pitch}; + + return CopyBuffers(dstDesc, 0, 0, width, height, srcDesc, 0, 0); +} \ No newline at end of file diff --git a/tests/regression/tex/utils.h b/tests/regression/tex/utils.h new file mode 100644 index 00000000..2e7f7d7c --- /dev/null +++ b/tests/regression/tex/utils.h @@ -0,0 +1,42 @@ +#include +#include +#include +#include "blitter.h" + +#define ISPOW2(x) (((x) != 0) && (0 == ((x) & ((x) - 1)))) + +inline uint32_t ilog2 (uint32_t value) { + return (uint32_t)(sizeof(uint32_t) * 8UL) - (uint32_t)__builtin_clzl((value << 1) - 1UL) - 1; +} + +int LoadTGA(const char *filename, + std::vector &pixels, + uint32_t *width, + uint32_t *height); + +int SaveTGA(const char *filename, + const std::vector &pixels, + uint32_t width, + uint32_t height, + uint32_t bpp); + +int CopyBuffers(SurfaceDesc &dstDesc, + int32_t dstOffsetX, + int32_t dstOffsetY, + uint32_t copyWidth, + uint32_t copyHeight, + const SurfaceDesc &srcDesc, + int32_t srcOffsetX, + int32_t srcOffsetY); + +int ConvertImage(std::vector& dst_pixels, + const std::vector& src_pixels, + uint32_t width, + uint32_t height, + ePixelFormat src_format, + ePixelFormat dst_format); + +void dump_image(const std::vector& pixels, + uint32_t width, + uint32_t 
height, + uint32_t bpp);