diff --git a/hw/VX_config.h b/hw/VX_config.h
new file mode 100644
index 00000000..c730b02c
--- /dev/null
+++ b/hw/VX_config.h
@@ -0,0 +1,432 @@
+// auto-generated by gen_config.py. DO NOT EDIT
+// Generated at 2021-03-20 18:29:13.211392
+
+#ifndef VX_USER_CONFIG
+#define VX_USER_CONFIG
+
+
+#endif
+// auto-generated by gen_config.py. DO NOT EDIT
+// Generated at 2021-03-20 18:29:13.214396
+
+// Translated from VX_config.vh:
+
+#ifndef VX_CONFIG
+#define VX_CONFIG
+
+
+
+#ifndef NUM_CLUSTERS
+#define NUM_CLUSTERS 1
+#endif
+
+#ifndef NUM_CORES
+#define NUM_CORES 1
+#endif
+
+#ifndef NUM_WARPS
+#define NUM_WARPS 4
+#endif
+
+#ifndef NUM_THREADS
+#define NUM_THREADS 4
+#endif
+
+#ifndef NUM_BARRIERS
+#define NUM_BARRIERS 4
+#endif
+
+#ifndef L2_ENABLE
+#define L2_ENABLE 0
+#endif
+
+#ifndef L3_ENABLE
+#define L3_ENABLE 0
+#endif
+
+#ifndef SM_ENABLE
+#define SM_ENABLE 1
+#endif
+
+#ifndef GLOBAL_BLOCK_SIZE
+#define GLOBAL_BLOCK_SIZE 64
+#endif
+
+#ifndef L1_BLOCK_SIZE
+#define L1_BLOCK_SIZE (NUM_THREADS * 4)
+#endif
+
+#ifndef STARTUP_ADDR
+#define STARTUP_ADDR 0x80000000
+#endif
+
+#ifndef IO_BUS_BASE_ADDR
+#define IO_BUS_BASE_ADDR 0xFF000000
+#endif
+
+#ifndef SHARED_MEM_BASE_ADDR
+#define SHARED_MEM_BASE_ADDR IO_BUS_BASE_ADDR
+#endif
+
+#ifndef SHARED_MEM_BASE_ADDR_ALIGN
+#define SHARED_MEM_BASE_ADDR_ALIGN 64
+#endif
+
+#ifndef IO_BUS_ADDR_COUT
+#define IO_BUS_ADDR_COUT 0xFFFFFFFC
+#endif
+
+#ifndef FRAME_BUFFER_BASE_ADDR
+#define FRAME_BUFFER_BASE_ADDR 0xFF000000
+#endif
+
+#ifndef FRAME_BUFFER_WIDTH
+#define FRAME_BUFFER_WIDTH 1920
+#endif
+
+#ifndef FRAME_BUFFER_HEIGHT
+#define FRAME_BUFFER_HEIGHT 1080
+#endif
+
+#define FRAME_BUFFER_SIZE (FRAME_BUFFER_WIDTH * FRAME_BUFFER_HEIGHT)
+
+#ifndef EXT_M_DISABLE
+#define EXT_M_ENABLE
+#endif
+
+#ifndef EXT_F_DISABLE
+#define EXT_F_ENABLE
+#endif
+
+#ifndef EXT_TEX_DISABLE
+#define EXT_TEX_ENABLE
+#endif
+
+// Device identification
+#define VENDOR_ID 0
+#define ARCHITECTURE_ID 0
+#define IMPLEMENTATION_ID 0
+
+///////////////////////////////////////////////////////////////////////////////
+
+#ifndef LATENCY_IMUL
+#define LATENCY_IMUL 3
+#endif
+
+#ifndef LATENCY_FNCP
+#define LATENCY_FNCP 2
+#endif
+
+#ifndef LATENCY_FMA
+#define LATENCY_FMA 4
+#endif
+
+#ifndef LATENCY_FDIV
+#ifdef ALTERA_S10
+#define LATENCY_FDIV 34
+#else
+#define LATENCY_FDIV 15
+#endif
+#endif
+
+#ifndef LATENCY_FSQRT
+#ifdef ALTERA_S10
+#define LATENCY_FSQRT 25
+#else
+#define LATENCY_FSQRT 10
+#endif
+#endif
+
+#ifndef LATENCY_FDIVSQRT
+#define LATENCY_FDIVSQRT 32
+#endif
+
+#ifndef LATENCY_FCVT
+#define LATENCY_FCVT 4
+#endif
+
+// CSR Addresses //////////////////////////////////////////////////////////////
+
+// User Floating-Point CSRs
+#define CSR_FFLAGS 0x001
+#define CSR_FRM 0x002
+#define CSR_FCSR 0x003
+
+#define CSR_SATP 0x180
+
+#define CSR_PMPCFG0 0x3A0
+#define CSR_PMPADDR0 0x3B0
+
+#define CSR_MSTATUS 0x300
+#define CSR_MISA 0x301
+#define CSR_MEDELEG 0x302
+#define CSR_MIDELEG 0x303
+#define CSR_MIE 0x304
+#define CSR_MTVEC 0x305
+
+#define CSR_MEPC 0x341
+
+// Machine Counter/Timers
+#define CSR_CYCLE 0xC00
+#define CSR_CYCLE_H 0xC80
+#define CSR_INSTRET 0xC02
+#define CSR_INSTRET_H 0xC82
+
+// Machine Performance-monitoring counters
+// PERF: pipeline
+#define CSR_MPM_IBUF_ST 0xB03
+#define CSR_MPM_IBUF_ST_H 0xB83
+#define CSR_MPM_SCRB_ST 0xB04
+#define CSR_MPM_SCRB_ST_H 0xB84
+#define CSR_MPM_ALU_ST 0xB05
+#define CSR_MPM_ALU_ST_H 0xB85
+#define CSR_MPM_LSU_ST 0xB06
+#define CSR_MPM_LSU_ST_H 0xB86
+#define CSR_MPM_CSR_ST 0xB07
+#define CSR_MPM_CSR_ST_H 0xB87
+#define CSR_MPM_FPU_ST 0xB08
+#define CSR_MPM_FPU_ST_H 0xB88
+#define CSR_MPM_GPU_ST 0xB09
+#define CSR_MPM_GPU_ST_H 0xB89
+// PERF: icache
+#define CSR_MPM_ICACHE_READS 0xB0A // total reads
+#define CSR_MPM_ICACHE_READS_H 0xB8A
+#define CSR_MPM_ICACHE_MISS_R 0xB0B // total misses
+#define CSR_MPM_ICACHE_MISS_R_H 0xB8B
+#define CSR_MPM_ICACHE_PIPE_ST 0xB0C // pipeline stalls
+#define CSR_MPM_ICACHE_PIPE_ST_H 0xB8C
+#define CSR_MPM_ICACHE_CRSP_ST 0xB0D // core response stalls
+#define CSR_MPM_ICACHE_CRSP_ST_H 0xB8D
+// PERF: dcache
+#define CSR_MPM_DCACHE_READS 0xB0E // total reads
+#define CSR_MPM_DCACHE_READS_H 0xB8E
+#define CSR_MPM_DCACHE_WRITES 0xB0F // total writes
+#define CSR_MPM_DCACHE_WRITES_H 0xB8F
+#define CSR_MPM_DCACHE_MISS_R 0xB10 // read misses
+#define CSR_MPM_DCACHE_MISS_R_H 0xB90
+#define CSR_MPM_DCACHE_MISS_W 0xB11 // write misses
+#define CSR_MPM_DCACHE_MISS_W_H 0xB91
+#define CSR_MPM_DCACHE_BANK_ST 0xB12 // bank conflicts stalls
+#define CSR_MPM_DCACHE_BANK_ST_H 0xB92
+#define CSR_MPM_DCACHE_MSHR_ST 0xB13 // MSHR stalls
+#define CSR_MPM_DCACHE_MSHR_ST_H 0xB93
+#define CSR_MPM_DCACHE_PIPE_ST 0xB14 // pipeline stalls
+#define CSR_MPM_DCACHE_PIPE_ST_H 0xB94
+#define CSR_MPM_DCACHE_CRSP_ST 0xB15 // core response stalls
+#define CSR_MPM_DCACHE_CRSP_ST_H 0xB95
+// PERF: smem
+#define CSR_MPM_SMEM_READS 0xB16 // total reads
+#define CSR_MPM_SMEM_READS_H 0xB96
+#define CSR_MPM_SMEM_WRITES 0xB17 // total writes
+#define CSR_MPM_SMEM_WRITES_H 0xB97
+#define CSR_MPM_SMEM_BANK_ST 0xB18 // bank conflicts stalls
+#define CSR_MPM_SMEM_BANK_ST_H 0xB98
+// PERF: memory
+#define CSR_MPM_DRAM_READS 0xB19 // dram reads
+#define CSR_MPM_DRAM_READS_H 0xB99
+#define CSR_MPM_DRAM_WRITES 0xB1A // dram writes
+#define CSR_MPM_DRAM_WRITES_H 0xB9A
+#define CSR_MPM_DRAM_ST 0xB1B // dram request stalls
+#define CSR_MPM_DRAM_ST_H 0xB9B
+#define CSR_MPM_DRAM_LAT 0xB1C // dram latency (total)
+#define CSR_MPM_DRAM_LAT_H 0xB9C
+
+// Machine Information Registers
+#define CSR_MVENDORID 0xF11
+#define CSR_MARCHID 0xF12
+#define CSR_MIMPID 0xF13
+#define CSR_MHARTID 0xF14
+
+// User SIMT CSRs
+#define CSR_WTID 0xCC0
+#define CSR_LTID 0xCC1
+#define CSR_GTID 0xCC2
+#define CSR_LWID 0xCC3
+#define CSR_GWID CSR_MHARTID
+#define CSR_GCID 0xCC5
+
+// Machine SIMT CSRs
+#define CSR_NT 0xFC0
+#define CSR_NW 0xFC1
+#define CSR_NC 0xFC2
+
+////////// Texture Units //////////////////////////////////////////////////////
+
+#define NUM_TEX_UNITS 2
+
+#define CSR_TEX_STATES 8
+#define CSR_TEX_BEGIN(x) (0xFD0 + (x) * CSR_TEX_STATES)
+
+#define CSR_TEX_ADDR(x) (CSR_TEX_BEGIN(x) + 0x00)
+#define CSR_TEX_FORMAT(x) (CSR_TEX_BEGIN(x) + 0x01)
+#define CSR_TEX_WIDTH(x) (CSR_TEX_BEGIN(x) + 0x02)
+#define CSR_TEX_HEIGHT(x) (CSR_TEX_BEGIN(x) + 0x03)
+#define CSR_TEX_STRIDE(x) (CSR_TEX_BEGIN(x) + 0x04)
+#define CSR_TEX_WRAP_U(x) (CSR_TEX_BEGIN(x) + 0x05)
+#define CSR_TEX_WRAP_V(x) (CSR_TEX_BEGIN(x) + 0x06)
+#define CSR_TEX_FILTER(x) (CSR_TEX_BEGIN(x) + 0x07)
+
+// Pipeline Queues ////////////////////////////////////////////////////////////
+
+// Size of LSU Request Queue
+#ifndef LSUQ_SIZE
+#define LSUQ_SIZE 8
+#endif
+
+// Size of FPU Request Queue
+#ifndef FPUQ_SIZE
+#define FPUQ_SIZE 8
+#endif
+
+// Icache Configurable Knobs //////////////////////////////////////////////////
+
+// Size of cache in bytes
+#ifndef ICACHE_SIZE
+#define ICACHE_SIZE 16384
+#endif
+
+// Core Request Queue Size
+#ifndef ICREQ_SIZE
+#define ICREQ_SIZE 4
+#endif
+
+// Miss Handling Register Size
+#ifndef IMSHR_SIZE
+#define IMSHR_SIZE NUM_WARPS
+#endif
+
+// DRAM Request Queue Size
+#ifndef IDREQ_SIZE
+#define IDREQ_SIZE 4
+#endif
+
+// DRAM Response Queue Size
+#ifndef IDRSQ_SIZE
+#define IDRSQ_SIZE 4
+#endif
+
+// Dcache Configurable Knobs //////////////////////////////////////////////////
+
+// Size of cache in bytes
+#ifndef DCACHE_SIZE
+#define DCACHE_SIZE 16384
+#endif
+
+// Number of banks
+#ifndef DNUM_BANKS
+#define DNUM_BANKS NUM_THREADS
+#endif
+
+// Number of bank ports
+#ifndef DNUM_PORTS
+#define DNUM_PORTS 1
+#endif
+
+// Core Request Queue Size
+#ifndef DCREQ_SIZE
+#define DCREQ_SIZE 4
+#endif
+
+// Miss Handling Register Size
+#ifndef DMSHR_SIZE
+#define DMSHR_SIZE LSUQ_SIZE
+#endif
+
+// DRAM Request Queue Size
+#ifndef DDREQ_SIZE
+#define DDREQ_SIZE 4
+#endif
+
+// DRAM Response Queue Size
+#ifndef DDRSQ_SIZE
+#define DDRSQ_SIZE MAX(4, (DNUM_BANKS * 2))
+#endif
+
+// SM Configurable Knobs //////////////////////////////////////////////////////
+
+// per thread stack size
+#ifndef STACK_SIZE
+#define STACK_SIZE 1024
+#endif
+
+// Size of cache in bytes
+#ifndef SMEM_SIZE
+#define SMEM_SIZE (STACK_SIZE * NUM_WARPS * NUM_THREADS)
+#endif
+
+// Number of banks
+#ifndef SNUM_BANKS
+#define SNUM_BANKS NUM_THREADS
+#endif
+
+// Core Request Queue Size
+#ifndef SCREQ_SIZE
+#define SCREQ_SIZE 4
+#endif
+
+// L2cache Configurable Knobs /////////////////////////////////////////////////
+
+// Size of cache in bytes
+#ifndef L2CACHE_SIZE
+#define L2CACHE_SIZE 65536
+#endif
+
+// Number of banks
+#ifndef L2NUM_BANKS
+#define L2NUM_BANKS MIN(NUM_CORES, 4)
+#endif
+
+// Core Request Queue Size
+#ifndef L2CREQ_SIZE
+#define L2CREQ_SIZE 4
+#endif
+
+// Miss Handling Register Size
+#ifndef L2MSHR_SIZE
+#define L2MSHR_SIZE 16
+#endif
+
+// DRAM Request Queue Size
+#ifndef L2DREQ_SIZE
+#define L2DREQ_SIZE 4
+#endif
+
+// DRAM Response Queue Size
+#ifndef L2DRSQ_SIZE
+#define L2DRSQ_SIZE MAX(4, (L2NUM_BANKS * 2))
+#endif
+
+// L3cache Configurable Knobs /////////////////////////////////////////////////
+
+// Size of cache in bytes
+#ifndef L3CACHE_SIZE
+#define L3CACHE_SIZE 131072
+#endif
+
+// Number of banks
+#ifndef L3NUM_BANKS
+#define L3NUM_BANKS MIN(NUM_CLUSTERS, 4)
+#endif
+
+// Core Request Queue Size
+#ifndef L3CREQ_SIZE
+#define L3CREQ_SIZE 4
+#endif
+
+// Miss Handling Register Size
+#ifndef L3MSHR_SIZE
+#define L3MSHR_SIZE 16
+#endif
+
+// DRAM Request Queue Size
+#ifndef L3DREQ_SIZE
+#define L3DREQ_SIZE 4
+#endif
+
+// DRAM Response Queue Size
+#ifndef L3DRSQ_SIZE
+#define L3DRSQ_SIZE MAX(4, (L3NUM_BANKS * 2))
+#endif
+
+#endif
+
diff --git a/hw/rtl/tex_unit/VX_tex_define.vh b/hw/rtl/tex_unit/VX_tex_define.vh
index 0b72b821..647aa1ca 100644
--- a/hw/rtl/tex_unit/VX_tex_define.vh
+++ b/hw/rtl/tex_unit/VX_tex_define.vh
@@ -23,4 +23,16 @@
 `define TEX_WRAP_CLAMP 1
 `define TEX_WRAP_MIRROR 2
 
+`define MAX_COLOR_WIDTH 8
+`define NUM_COLOR_CHANNEL 4
+
+`define R5G6B5 `TEX_FORMAT_BITS'h1
+`define R8G8B8 `TEX_FORMAT_BITS'h2
+`define R8G8B8A8 `TEX_FORMAT_BITS'h3
+
+`define RBEGIN 24
+`define GBEGIN 16
+`define BBEGIN 8
+`define ABEGIN 0
+
 `endif
\ No newline at end of file
diff --git a/hw/rtl/tex_unit/VX_tex_format.v b/hw/rtl/tex_unit/VX_tex_format.v
index 724664ec..f5c36e9f 100644
--- a/hw/rtl/tex_unit/VX_tex_format.v
+++ b/hw/rtl/tex_unit/VX_tex_format.v
@@ -3,10 +3,62 @@ module VX_tex_format #(
     parameter CORE_ID = 0
 ) (
-    // TODO
+    input wire [31:0]                    texel_data,
+    input wire [`TEX_FORMAT_BITS-1:0]    format,
+
+    output wire [`NUM_COLOR_CHANNEL-1:0] color_enable,
+    output wire [`MAX_COLOR_WIDTH-1:0]   R,
+    output wire [`MAX_COLOR_WIDTH-1:0]   G,
+    output wire [`MAX_COLOR_WIDTH-1:0]   B,
+    output wire [`MAX_COLOR_WIDTH-1:0]   A
 );
 
     `UNUSED_PARAM (CORE_ID)
-
-    // TODO
+
+    // Unpacks one raw texel into R/G/B/A channels of `MAX_COLOR_WIDTH bits each.
+    // Formats without alpha drive A to zero and clear bit 0 of color_enable
+    // (presumably the mask is ordered {R, G, B, A} - confirm with the consumer).
+
+    reg [`NUM_COLOR_CHANNEL-1:0] color_enable_r;
+    reg [`MAX_COLOR_WIDTH-1:0]   R_r;
+    reg [`MAX_COLOR_WIDTH-1:0]   G_r;
+    reg [`MAX_COLOR_WIDTH-1:0]   B_r;
+    reg [`MAX_COLOR_WIDTH-1:0]   A_r;
+
+    always @(*) begin
+        case (format)
+            `R5G6B5: begin
+                // Expand 5/6-bit channels by replicating their MSBs into the low bits,
+                // so that a full-scale source value maps to a full-scale output value.
+                R_r = `MAX_COLOR_WIDTH'({texel_data[15:11], texel_data[15:13]});
+                G_r = `MAX_COLOR_WIDTH'({texel_data[10:5], texel_data[10:9]});
+                B_r = `MAX_COLOR_WIDTH'({texel_data[4:0], texel_data[4:2]});
+                A_r = {`MAX_COLOR_WIDTH{1'b0}};
+                color_enable_r = 4'b1110;
+            end
+
+            `R8G8B8A8: begin
+                R_r = `MAX_COLOR_WIDTH'(texel_data[31:24]);
+                G_r = `MAX_COLOR_WIDTH'(texel_data[23:16]);
+                B_r = `MAX_COLOR_WIDTH'(texel_data[15:8]);
+                A_r = `MAX_COLOR_WIDTH'(texel_data[7:0]);
+                color_enable_r = 4'b1111;
+            end
+
+            // `R8G8B8, and the fallback for any unrecognized format
+            default: begin
+                R_r = `MAX_COLOR_WIDTH'(texel_data[23:16]);
+                G_r = `MAX_COLOR_WIDTH'(texel_data[15:8]);
+                B_r = `MAX_COLOR_WIDTH'(texel_data[7:0]);
+                A_r = {`MAX_COLOR_WIDTH{1'b0}};
+                color_enable_r = 4'b1110;
+            end
+        endcase
+    end
+
+    assign color_enable = color_enable_r;
+    assign R = R_r;
+    assign G = G_r;
+    assign B = B_r;
+    assign A = A_r;
 
 endmodule
\ No newline at end of file
diff --git a/hw/rtl/tex_unit/VX_tex_sampler.v b/hw/rtl/tex_unit/VX_tex_sampler.v
index 1271c063..b1b31186 100644
--- a/hw/rtl/tex_unit/VX_tex_sampler.v
+++ b/hw/rtl/tex_unit/VX_tex_sampler.v
@@ -15,6 +15,8 @@ module VX_tex_sampler #(
     input wire req_wb,
     input wire [`TEX_FILTER_BITS-1:0] req_filter,
     input wire [`TEX_FORMAT_BITS-1:0] req_format,
+    input wire [3:0][`FIXED_FRAC-1:0] req_ufrac,
+    input wire [3:0][`FIXED_FRAC-1:0] req_vfrac,
     input wire [`NUM_THREADS-1:0][3:0][31:0] req_texels,
     output wire req_ready,
 
@@ -30,41 +32,52 @@ module VX_tex_sampler #(
 );
 
     `UNUSED_PARAM (CORE_ID)
 
-    /*
-    assign tex_req_if.ready = (& pt_addr_ready);
-
-    assign lsu_req_if.valid = (& pt_addr_valid);
-
-    assign lsu_req_if.wid = tex_req_if.wid;
-    assign lsu_req_if.tmask = tex_req_if.tmask;
-    assign lsu_req_if.PC = tex_req_if.PC;
-    assign lsu_req_if.rd = tex_req_if.rd;
-    assign lsu_req_if.wb = tex_req_if.wb;
-    assign lsu_req_if.offset = 32'h0000;
-    assign lsu_req_if.op_type = `OP_BITS'({1'b0, 3'b000}); //func3 for word load??
-    assign lsu_req_if.store_data = {`NUM_THREADS{32'h0000}};
-
-    // wait buffer for fragments / replace with cache/state fragment fifo for bilerp
-    // no filtering for point sampling -> directly from dcache to output response
-
-    VX_pipe_register #(
-        .DATAW (1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32)),
-        .RESETW (1)
-    ) pipe_reg (
-        .clk (clk),
-        .reset (reset),
-        .enable (~stall_out),
-        .data_in ({rsp_valid, rsp_wid, rsp_tmask, rsp_PC, rsp_rd, rsp_wb, rsp_data}),
-        .data_out ({tex_rsp_if.valid, tex_rsp_if.wid, tex_rsp_if.tmask, tex_rsp_if.PC, tex_rsp_if.rd, tex_rsp_if.wb, tex_rsp_if.data})
-    );
-
-    // output
-    assign stall_out = ~tex_rsp_if.ready && tex_rsp_if.valid;
-
-    // can accept new request?
-    assign stall_in = stall_out;
-    assign ld_commit_if.ready = ~stall_in;*/
+    // req_filter is a runtime input, so it cannot steer a generate-if (the
+    // condition must be an elaboration-time constant). Only point sampling is
+    // implemented; when bilinear sampling is added, select between the two
+    // results with a mux on req_filter.
+    // NOTE(review): assumes `UNUSED_VAR is available alongside `UNUSED_PARAM - confirm.
+    `UNUSED_VAR (req_filter)
+    `UNUSED_VAR (req_ufrac)
+    `UNUSED_VAR (req_vfrac)
+    `UNUSED_VAR (req_texels)
+
+    wire stall_out;
+
+    // One 32-bit color word per thread; packed so that it can be concatenated
+    // into the pipeline register input below.
+    wire [`NUM_THREADS-1:0][31:0] req_data;
+
+    for (genvar i = 0; i < `NUM_THREADS; i++) begin
+        VX_tex_format #(
+            .CORE_ID (CORE_ID)
+        ) tex_format_point (
+            // NOTE(review): assumes texel 0 of each quad is the point sample - confirm.
+            .texel_data   (req_texels[i][0]),
+            .format       (req_format),
+            .color_enable (),
+            .R            (req_data[i][`RBEGIN +: `MAX_COLOR_WIDTH]),
+            .G            (req_data[i][`GBEGIN +: `MAX_COLOR_WIDTH]),
+            .B            (req_data[i][`BBEGIN +: `MAX_COLOR_WIDTH]),
+            .A            (req_data[i][`ABEGIN +: `MAX_COLOR_WIDTH])
+        );
+    end
+
+    VX_pipe_register #(
+        .DATAW  (1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32)),
+        .RESETW (1)
+    ) pipe_reg (
+        .clk      (clk),
+        .reset    (reset),
+        .enable   (~stall_out),
+        .data_in  ({req_valid, req_wid, req_tmask, req_PC, req_rd, req_wb, req_data}),
+        .data_out ({rsp_valid, rsp_wid, rsp_tmask, rsp_PC, rsp_rd, rsp_wb, rsp_data})
+    );
+
+    // Hold the register only while a valid response waits on the consumer.
+    assign stall_out = rsp_valid && ~rsp_ready;
+    assign req_ready = ~stall_out;
 
 
 endmodule
\ No newline at end of file