diff --git a/.travis.yml b/.travis.yml index 9fbe9bac..d652b27f 100644 --- a/.travis.yml +++ b/.travis.yml @@ -30,25 +30,28 @@ jobs: include: - stage: test name: coverage - script: cp -r $PWD ../build1 && cd ../build1 && ./ci/travis_run.py ./ci/regression.sh -coverage + script: cp -r $PWD ../build_coverage && cd ../build_coverage && ./ci/travis_run.py ./ci/regression.sh -coverage + - stage: test + name: tex + script: cp -r $PWD ../build_tex && cd ../build_tex && ./ci/travis_run.py ./ci/regression.sh -tex - stage: test name: cluster - script: cp -r $PWD ../build2 && cd ../build2 && ./ci/travis_run.py ./ci/regression.sh -cluster + script: cp -r $PWD ../build_cluster && cd ../build_cluster && ./ci/travis_run.py ./ci/regression.sh -cluster - stage: test name: debug - script: cp -r $PWD ../build3 && cd ../build3 && ./ci/travis_run.py ./ci/regression.sh -debug + script: cp -r $PWD ../build_debug && cd ../build_debug && ./ci/travis_run.py ./ci/regression.sh -debug - stage: test name: config - script: cp -r $PWD ../build4 && cd ../build4 && ./ci/travis_run.py ./ci/regression.sh -config + script: cp -r $PWD ../build_config && cd ../build_config && ./ci/travis_run.py ./ci/regression.sh -config - stage: test name: stress0 - script: cp -r $PWD ../build5 && cd ../build5 && ./ci/travis_run.py ./ci/regression.sh -stress0 + script: cp -r $PWD ../build_stress0 && cd ../build_stress0 && ./ci/travis_run.py ./ci/regression.sh -stress0 - stage: test name: stress1 - script: cp -r $PWD ../build6 && cd ../build6 && ./ci/travis_run.py ./ci/regression.sh -stress1 + script: cp -r $PWD ../build_stress1 && cd ../build_stress1 && ./ci/travis_run.py ./ci/regression.sh -stress1 - stage: test name: compiler - script: cp -r $PWD ../build7 && cd ../build7 && ./ci/travis_run.py ./ci/test_compiler.sh + script: cp -r $PWD ../build_compiler && cd ../build_compiler && ./ci/travis_run.py ./ci/test_compiler.sh after_success: # Gather code coverage diff --git a/README.md b/README.md index 76bbd380..d3abc7a3 
100644 --- a/README.md +++ b/README.md @@ -21,59 +21,32 @@ Vortex is a full-system RISCV-based GPGPU processor. ## Directory structure - `doc`: [Documentation](doc/Vortex.md). - - `hw`: Hardware sources. - - `driver`: Host drivers repository. - - `runtime`: Kernel Runtime software. - - `sim`: Simulators repository. - - `tests`: Tests repository. - - `ci`: Continuous integration scripts. - - `miscs`: Miscellaneous resources. -## Basic Installation - +## Build Instructions +### Supported OS Platforms +- Ubuntu 18.04 +- Centos 7 +### Toolchain Dependencies +- [POCL](http://portablecl.org/) +- [LLVM](https://llvm.org/) +- [RISCV-GNU-TOOLCHAIN](https://github.com/riscv-collab/riscv-gnu-toolchain) +- [Verilator](https://www.veripool.org/verilator) ### Install development tools - $ sudo apt-get install build-essential $ sudo apt-get install git - -### Install gnu-riscv-tools - - $ export RISCV_TOOLCHAIN_PATH=/opt/riscv-gnu-toolchain - - $ sudo apt-get -y install \ - binutils build-essential libtool texinfo \ - gzip zip unzip patchutils curl git \ - make cmake ninja-build automake bison flex gperf \ - grep sed gawk python bc \ - zlib1g-dev libexpat1-dev libmpc-dev \ - libglib2.0-dev libfdt-dev libpixman-1-dev - $ git clone https://github.com/riscv/riscv-gnu-toolchain - $ cd riscv-gnu-toolchain - $ git submodule update --init --recursive - $ mkdir build - $ cd build - $ ../configure --prefix=$RISCV_TOOLCHAIN_PATH --with-arch=rv32im --with-abi=ilp32 - $ make -j`nproc` - $ make -j`nproc` build-qemu - -### Install Verilator - - You need into build the latest version using the instructions on their website - $ https://www.veripool.org/projects/verilator/wiki/Installing - -### Install Vortex - +### Install Vortex codebase $ git clone --recursive https://github.com/vortexgpgpu/vortex.git $ cd Vortex - $ make - -### Quick Test running OpenCL vecadd sample on 2 cores - - $ ./ci/blackbox.sh --cores=2 --app=vecadd +### Install prebuilt toolchain + $ ./ci/toolchain_install.sh -all 
+### Build Vortex sources + $ make -s +### Quick demo running vecadd OpenCL kernel on 2 cores + $ ./ci/blackbox.sh --driver=rtlsim --cores=2 --app=vecadd diff --git a/ci/blackbox.sh b/ci/blackbox.sh index f14acec2..5ba7a29a 100755 --- a/ci/blackbox.sh +++ b/ci/blackbox.sh @@ -12,7 +12,7 @@ VORTEX_HOME=$SCRIPT_DIR/.. DRIVER=vlsim APP=sgemm CLUSTERS=1 -CORES=2 +CORES=1 WARPS=4 THREADS=4 L2=0 @@ -132,9 +132,9 @@ if [ $DEBUG -eq 1 ] then if [ $SCOPE -eq 1 ] then - DEBUG=$DEBUG_LEVEL SCOPE=1 CONFIGS="$CONFIGS" make -s -C $DRIVER_PATH + DEBUG=$DEBUG_LEVEL SCOPE=1 CONFIGS="$CONFIGS" make -C $DRIVER_PATH else - DEBUG=$DEBUG_LEVEL CONFIGS="$CONFIGS" make -s -C $DRIVER_PATH + DEBUG=$DEBUG_LEVEL CONFIGS="$CONFIGS" make -C $DRIVER_PATH fi if [ $HAS_ARGS -eq 1 ] @@ -153,9 +153,9 @@ then else if [ $SCOPE -eq 1 ] then - SCOPE=1 CONFIGS="$CONFIGS" make -s -C $DRIVER_PATH + SCOPE=1 CONFIGS="$CONFIGS" make -C $DRIVER_PATH else - CONFIGS="$CONFIGS" make -s -C $DRIVER_PATH + CONFIGS="$CONFIGS" make -C $DRIVER_PATH fi if [ $HAS_ARGS -eq 1 ] diff --git a/ci/regression.sh b/ci/regression.sh index 419c2900..073c0ed1 100755 --- a/ci/regression.sh +++ b/ci/regression.sh @@ -22,6 +22,17 @@ make -C tests/opencl run-simx echo "coverage tests done!" } +tex() +{ +echo "begin texture tests..." + +CONFIGS="-DEXT_TEX_ENABLE=1" ./ci/blackbox.sh --driver=vlsim --app=tex --args="-isoccer.png -osoccer_result.png -g0" +CONFIGS="-DEXT_TEX_ENABLE=1" ./ci/blackbox.sh --driver=rtlsim --app=tex --args="-itoad.png -otoad_result.png -g1" +CONFIGS="-DEXT_TEX_ENABLE=1" ./ci/blackbox.sh --driver=rtlsim --app=tex --args="-irainbow.png -orainbow_result.png -g1" + +echo "texture tests done!" +} + cluster() { echo "begin clustering tests..." @@ -134,13 +145,15 @@ echo "stress1 tests done!"
usage() { - echo "usage: regression [-coverage] [-cluster] [-debug] [-config] [-stress[#n]] [-all] [-h|--help]" + echo "usage: regression [-coverage] [-tex] [-cluster] [-debug] [-config] [-stress[#n]] [-all] [-h|--help]" } while [ "$1" != "" ]; do case $1 in -coverage ) coverage ;; + -tex ) tex + ;; -cluster ) cluster ;; -debug ) debug @@ -155,6 +168,7 @@ while [ "$1" != "" ]; do stress1 ;; -all ) coverage + tex cluster debug config diff --git a/doc/images/cache_hierarchy.png b/docs/assets/img/cache_hierarchy.png similarity index 100% rename from doc/images/cache_hierarchy.png rename to docs/assets/img/cache_hierarchy.png diff --git a/doc/images/vortex_bank.png b/docs/assets/img/vortex_bank.png similarity index 100% rename from doc/images/vortex_bank.png rename to docs/assets/img/vortex_bank.png diff --git a/doc/images/vortex_cache_top_module.png b/docs/assets/img/vortex_cache_top_module.png similarity index 100% rename from doc/images/vortex_cache_top_module.png rename to docs/assets/img/vortex_cache_top_module.png diff --git a/doc/images/vortex_microarchitecture_v2.png b/docs/assets/img/vortex_microarchitecture_v2.png similarity index 100% rename from doc/images/vortex_microarchitecture_v2.png rename to docs/assets/img/vortex_microarchitecture_v2.png diff --git a/doc/cache_subsystem.md b/docs/cache_subsystem.md similarity index 91% rename from doc/cache_subsystem.md rename to docs/cache_subsystem.md index 41d2fad6..1c03a326 100644 --- a/doc/cache_subsystem.md +++ b/docs/cache_subsystem.md @@ -8,7 +8,7 @@ The Vortex Cache Sub-system has the following main properties: ### Cache Hierarchy -![Image of Cache Hierarchy](./images/cache_hierarchy.png) +![Image of Cache Hierarchy](./assets/img/cache_hierarchy.png) - Cache can be configured to be any level in the hierarchy - Caches communicate via snooping @@ -18,7 +18,7 @@ The Vortex Cache Sub-system has the following main properties: VX.cache.v is the top module of the cache verilog code located in the `/hw/rtl/cache` 
directory. -![Image of Vortex Cache](./images/vortex_cache_top_module.png) +![Image of Vortex Cache](./assets/img/vortex_cache_top_module.png) - Configurable (Cache size, number of banks, bank line size, etc.) - I/O signals @@ -44,7 +44,7 @@ VX.cache.v is the top module of the cache verilog code located in the `/hw/rtl/c VX_bank.v is the verilog code that handles cache bank functionality and is located in the `/hw/rtl/cache` directory. -![Image of Vortex Cache Bank](./images/vortex_bank.png) +![Image of Vortex Cache Bank](./assets/img/vortex_bank.png) - Allows for high throughput​ - Each bank contains queues to hold requests to the cache​ diff --git a/doc/codebase.md b/docs/codebase.md similarity index 100% rename from doc/codebase.md rename to docs/codebase.md diff --git a/doc/debugging.md b/docs/debugging.md similarity index 100% rename from doc/debugging.md rename to docs/debugging.md diff --git a/docs/execute_opencl_on_vortex.md b/docs/execute_opencl_on_vortex.md new file mode 100644 index 00000000..96f9e4b4 --- /dev/null +++ b/docs/execute_opencl_on_vortex.md @@ -0,0 +1,128 @@ +# Execute OpenCL on Vortex backend + +## Requirements +- [Vortex](https://github.com/vortexgpgpu/vortex) +- [POCL for Vortex](https://github.com/vortexgpgpu/pocl) +- [riscv-toolchain](https://github.com/riscv-collab/riscv-gnu-toolchain) +- [llvm-riscv](https://github.com/llvm-mirror/llvm) + +For installation, please see [Build Instructions](../README.md) for more details. 
+ +**For Ubuntu 18.04 users, you can directly download prebuilt toolchains with the [toolchain_install.sh](https://github.com/vortexgpgpu/vortex/blob/master/ci/toolchain_install.sh) script.** +```bash +# please modify the DESTDIR variable in the script before execution +bash toolchain_install.sh -all +``` +Assuming all dependencies are installed under the `/opt` path, we get the following environment: +```bash +tree -L 2 /opt +''' +/opt/ +├── llvm-riscv +│ ├── bin +│ ├── include +│ ├── lib +│ ├── libexec +│ └── share +├── pocl +│ ├── compiler +│ └── runtime +├── riscv-gnu-toolchain +│ ├── bin +│ ├── drops +│ ├── include +│ ├── lib +│ ├── libexec +│ ├── riscv32-unknown-elf +│ ├── share +│ └── var +└── verilator + ├── bin + ├── examples + ├── include + ├── verilator-config.cmake + └── verilator-config-version.cmake +''' +``` +## Execute OpenCL on Vortex +In this tutorial, we show an example of executing a vecadd program on the SIMX backend. +To execute an OpenCL program on Vortex, we follow these steps: +- Compile the [OpenCL kernels](https://github.com/vortexgpgpu/vortex/blob/master/tests/opencl/vecadd/kernel.cl) into a RISC-V binary with the POCL compiler. +- Compile the [OpenCL host](https://github.com/vortexgpgpu/vortex/blob/master/tests/opencl/vecadd/main.cc) and link it with the Vortex driver (```-lvortex```). +- Execute the compiled host program on a backend.
+ +Thus, we can write a Makefile as following: +```Makefile +LLVM_PREFIX ?= /opt/llvm-riscv +RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain +SYSROOT ?= $(RISCV_TOOLCHAIN_PATH)/riscv32-unknown-elf +POCL_CC_PATH ?= /opt/pocl/compiler +POCL_RT_PATH ?= /opt/pocl/runtime + +OPTS ?= -n64 + +# please edit these two variable to your environment +VORTEX_DRV_PATH ?= $(realpath ../../../driver) +VORTEX_RT_PATH ?= $(realpath ../../../runtime) + +K_LLCFLAGS += "-O3 -march=riscv32 -target-abi=ilp32f -mcpu=generic-rv32 -mattr=+m,+f -mattr=+vortex -float-abi=hard -code-model=small" +K_CFLAGS += "-v -O3 --sysroot=$(SYSROOT) --gcc-toolchain=$(RISCV_TOOLCHAIN_PATH) -march=rv32imf -mabi=ilp32f -Xclang -target-feature -Xclang +vortex -I$(VORTEX_RT_PATH)/include -fno-rtti -fno-exceptions -ffreestanding -nostartfiles -fdata-sections -ffunction-sections" +K_LDFLAGS += "-Wl,-Bstatic,-T$(VORTEX_RT_PATH)/linker/vx_link.ld -Wl,--gc-sections $(VORTEX_RT_PATH)/libvortexrt.a -lm" + +CXXFLAGS += -std=c++11 -O2 -Wall -Wextra -Wfatal-errors + +CXXFLAGS += -Wno-deprecated-declarations -Wno-unused-parameter + +CXXFLAGS += -I$(POCL_RT_PATH)/include + +LDFLAGS += -L$(POCL_RT_PATH)/lib -L$(VORTEX_DRV_PATH)/stub -lOpenCL -lvortex + +PROJECT = vecadd + +SRCS = main.cc + +all: $(PROJECT) kernel.pocl + +kernel.pocl: kernel.cl + LLVM_PREFIX=$(LLVM_PREFIX) POCL_DEBUG=all LD_LIBRARY_PATH=$(LLVM_PREFIX)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -LLCFLAGS $(K_LLCFLAGS) -CFLAGS $(K_CFLAGS) -LDFLAGS $(K_LDFLAGS) -o kernel.pocl kernel.cl + +$(PROJECT): $(SRCS) + $(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@ + +run-fpga: $(PROJECT) kernel.pocl + LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/fpga:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS) + +run-asesim: $(PROJECT) kernel.pocl + LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/asesim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS) + +run-vlsim: $(PROJECT) kernel.pocl + LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/vlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) 
$(OPTS) + +run-simx: $(PROJECT) kernel.pocl + LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS) + +run-rtlsim: $(PROJECT) kernel.pocl + LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS) + +.depend: $(SRCS) + $(CXX) $(CXXFLAGS) -MM $^ > .depend; + +clean: + rm -rf $(PROJECT) *.o .depend + +clean-all: clean + rm -rf *.pocl *.dump + +ifneq ($(MAKECMDGOALS),clean) + -include .depend +endif +``` + +First, build the host program. +```bash +make all +``` +If we want to execute on SIMX, we can execute the command below. +```bash +make run-simx +``` diff --git a/doc/fpga_setup.md b/docs/fpga_setup.md similarity index 68% rename from doc/fpga_setup.md rename to docs/fpga_setup.md index cbf06187..741ac03a 100644 --- a/doc/fpga_setup.md +++ b/docs/fpga_setup.md @@ -13,17 +13,6 @@ OPAE Environment Setup $ export PATH=:/opt/verilator/bin:$PATH $ export VERILATOR_ROOT=/opt/verilator -OPAE Build Configuration ------------------------- - -Within the `/hw/syn/opae` directory, there are source text files for each core-option for the fpga build (the 32 and 64 core options are not currently implemented) which have the following parameters that can be configured: -- NUM_CORES: the number of cores per cluster -- NUM_CLUSTERS: the number of clusters alotted to the processor -- L3_ENABLE: enable the use of the L3 cache -- PERF_ENABLE: enable the use of all profile counters - -To enable L3 cache and profile counters for a build, simply uncomment the definition within the respective source file. 
- OPAE Build ------------------ @@ -33,41 +22,58 @@ The FPGA has to following configuration options: - 4 cores fpga (fpga-4c) - 8 cores fpga (fpga-8c) - 16 cores fpga (fpga-16c) +- 32 cores fpga (fpga-32c) +- 64 cores fpga (fpga-64c) + +Command line: $ cd hw/syn/opae - $ make fpga- *# of cores* c + $ make fpga-<num_cores>c Example: `make fpga-4c` -A new folder (ex: `build_fpga_4c`) will be created and the build will start and take ~30-45 min to complete. +A new folder (ex: `build_fpga_4c`) will be created and the build will start and take ~30-480 min to complete. + + +OPAE Build Configuration +------------------------ + +The hardware configuration file `/hw/rtl/VX_config.vh` defines all the hardware parameters that can be modified when building the processor. For example, the following parameters can be configured: +- `NUM_WARPS`: Number of warps per core +- `NUM_THREADS`: Number of threads per warp +- `PERF_ENABLE`: enable the use of all profile counters + +You can configure the synthesis build from the command line: + + $ CONFIGS="-DPERF_ENABLE -DNUM_THREADS=8" make fpga-4c OPAE Build Progress ------------------- You could check the last 10 lines in the build log for possible errors until build completion. - $ tail -n 10 ./build_fpga_4c/build.log + $ tail -n 10 ./build_fpga_<num_cores>c/build.log Check if the build is still running by looking for quartus_sh, quartus_syn, or quartus_fit programs.
- $ ps -u *username* + $ ps -u <username> If the build fails and you need to restart it, clean up the build folder using the following command: - $ make clean-fpga- *# of cores* c + $ make clean-fpga-<num_cores>c Example: `make clean-fpga-4c` The file `vortex_afu.gbs` should exist when the build is done: - $ ls -lsa ./build_fpga_ *# of cores* c/vortex_afu.gbs + $ ls -lsa ./build_fpga_<num_cores>c/vortex_afu.gbs Signing the bitstream and Programming the FPGA ---------------------------------------------- - $ cd ./build_fpga_`# of cores`c/ + $ cd ./build_fpga_<num_cores>c $ PACSign PR -t UPDATE -H openssl_manager -i vortex_afu.gbs -o vortex_afu_unsigned_ssl.gbs $ fpgasupdate vortex_afu_unsigned_ssl.gbs diff --git a/doc/Vortex.md b/docs/index.md similarity index 67% rename from doc/Vortex.md rename to docs/index.md index 2ff5511c..f7544b03 100644 --- a/doc/Vortex.md +++ b/docs/index.md @@ -14,17 +14,17 @@ ## Installation -- Refer to the install instructions in [README](../README.md). +- Refer to the build instructions in [README](../README.md).
## Quick Start Scenarios Running Vortex simulators with different configurations: - Run basic driver test with rtlsim driver and Vortex config of 2 clusters, 2 cores, 2 warps, 4 threads - $ ./ci/blackbox.sh --clusters=2 --cores=2 --warps=2 --threads=4 --driver=rtlsim --app=basic + $ ./ci/blackbox.sh --driver=rtlsim --clusters=2 --cores=2 --warps=2 --threads=4 --app=basic - Run demo driver test with vlsim driver and Vortex config of 1 clusters, 4 cores, 4 warps, 2 threads - $ ./ci/blackbox.sh --clusters=1 --cores=4 --warps=4 --threads=2 --driver=vlsim --app=demo + $ ./ci/blackbox.sh --driver=vlsim --clusters=1 --cores=4 --warps=4 --threads=2 --app=demo - Run dogfood driver test with simx driver and Vortex config of 4 cluster, 4 cores, 8 warps, 6 threads - $ ./ci/blackbox.sh --clusters=4 --cores=4 --warps=8 --threads=6 --driver=simx --app=dogfood \ No newline at end of file + $ ./ci/blackbox.sh --driver=simx --clusters=4 --cores=4 --warps=8 --threads=6 --app=dogfood \ No newline at end of file diff --git a/doc/microarchitecture.md b/docs/microarchitecture.md similarity index 97% rename from doc/microarchitecture.md rename to docs/microarchitecture.md index 7e7a6a59..972da7b0 100644 --- a/doc/microarchitecture.md +++ b/docs/microarchitecture.md @@ -32,7 +32,7 @@ Vortex uses the SIMT (Single Instruction, Multiple Threads) execution model with ### Vortex Pipeline/Datapath -![Image of Vortex Microarchitecture](./images/vortex_microarchitecture_v2.png) +![Image of Vortex Microarchitecture](./assets/img/vortex_microarchitecture_v2.png) Vortex has a 5-stage pipeline: FI | ID | Issue | EX | WB. 
diff --git a/doc/references.md b/docs/references.md similarity index 100% rename from doc/references.md rename to docs/references.md diff --git a/doc/simulation.md b/docs/simulation.md similarity index 100% rename from doc/simulation.md rename to docs/simulation.md diff --git a/doc/software.md b/docs/software.md similarity index 100% rename from doc/software.md rename to docs/software.md diff --git a/driver/asesim/Makefile b/driver/asesim/Makefile index c3077e06..4452a194 100644 --- a/driver/asesim/Makefile +++ b/driver/asesim/Makefile @@ -63,12 +63,5 @@ scope: scope-defs.h $(PROJECT): $(SRCS) $(OPAE_SYN_DIR)/vortex_afu.h $(SCOPE_H) $(CXX) $(CXXFLAGS) -DUSE_ASE $(SRCS) $(LDFLAGS) -o $(PROJECT) -.depend: $(SRCS) - $(CXX) $(CXXFLAGS) -MM $(SRCS) > .depend; - clean: - rm -rf $(PROJECT) *.o .depend scope-defs.h - -ifneq ($(MAKECMDGOALS),clean) - -include .depend -endif + rm -rf $(PROJECT) *.o scope-defs.h \ No newline at end of file diff --git a/driver/fpga/Makefile b/driver/fpga/Makefile index a2771ecb..bdc12d60 100644 --- a/driver/fpga/Makefile +++ b/driver/fpga/Makefile @@ -65,12 +65,5 @@ scope: scope-defs.h $(PROJECT): $(SRCS) $(OPAE_SYN_DIR)/vortex_afu.h $(SCOPE_H) $(CXX) $(CXXFLAGS) -DUSE_FPGA $^ $(LDFLAGS) -o $(PROJECT) -.depend: $(SRCS) - $(CXX) $(CXXFLAGS) -MM $(SRCS) > .depend; - clean: - rm -rf $(PROJECT) *.o .depend scope-defs.h - -ifneq ($(MAKECMDGOALS),clean) - -include .depend -endif + rm -rf $(PROJECT) *.o scope-defs.h \ No newline at end of file diff --git a/driver/rtlsim/Makefile b/driver/rtlsim/Makefile index 87bd7980..cf0a184d 100644 --- a/driver/rtlsim/Makefile +++ b/driver/rtlsim/Makefile @@ -35,4 +35,4 @@ $(PROJECT): $(SRCS) clean: $(MAKE) -C $(RTLSIM_DIR) clean-static - rm -rf $(PROJECT) *.o .depend \ No newline at end of file + rm -rf $(PROJECT) *.o \ No newline at end of file diff --git a/driver/simx/Makefile b/driver/simx/Makefile index ba6f0284..82bf6e32 100644 --- a/driver/simx/Makefile +++ b/driver/simx/Makefile @@ -21,9 +21,6 @@ 
$(PROJECT): $(SRCS) $(MAKE) -C $(SIMX_DIR) static $(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@ -.depend: $(SRCS) - $(CXX) $(CXXFLAGS) -MM $^ > .depend; - clean: $(MAKE) -C $(SIMX_DIR) clean-static - rm -rf $(PROJECT) *.o .depend \ No newline at end of file + rm -rf $(PROJECT) *.o \ No newline at end of file diff --git a/driver/vlsim/Makefile b/driver/vlsim/Makefile index 58e8d566..5608ad11 100644 --- a/driver/vlsim/Makefile +++ b/driver/vlsim/Makefile @@ -50,13 +50,6 @@ $(PROJECT): $(SRCS) $(SCOPE_H) $(SCOPE_ENABLE) $(PERF_ENABLE) $(MAKE) -C $(VLSIM_DIR) static $(CXX) $(CXXFLAGS) -DUSE_VLSIM $(SRCS) $(LDFLAGS) -o $(PROJECT) -.depend: $(SRCS) - $(CXX) $(CXXFLAGS) -MM $(SRCS) > .depend; - clean: $(MAKE) -C $(VLSIM_DIR) clean-static - rm -rf $(PROJECT) *.o .depend scope-defs.h - -ifneq ($(MAKECMDGOALS),clean) - -include .depend -endif + rm -rf $(PROJECT) *.o scope-defs.h \ No newline at end of file diff --git a/hw/rtl/VX_alu_unit.sv b/hw/rtl/VX_alu_unit.sv index 129b1202..8840f044 100644 --- a/hw/rtl/VX_alu_unit.sv +++ b/hw/rtl/VX_alu_unit.sv @@ -217,7 +217,7 @@ module VX_alu_unit #( // can accept new request? 
assign alu_req_if.ready = ready_in; -`ifdef DBG_PRINT_PIPELINE +`ifdef DBG_TRACE_PIPELINE always @(posedge clk) begin if (branch_ctl_if.valid) begin dpi_trace("%d: core%0d-branch: wid=%0d, PC=%0h, taken=%b, dest=%0h\n", diff --git a/hw/rtl/VX_cache_arb.sv b/hw/rtl/VX_cache_arb.sv new file mode 100644 index 00000000..85800edf --- /dev/null +++ b/hw/rtl/VX_cache_arb.sv @@ -0,0 +1,159 @@ +`include "VX_define.vh" + +module VX_cache_arb #( + parameter NUM_REQS = 1, + parameter LANES = 1, + parameter DATA_SIZE = 1, + parameter TAG_IN_WIDTH = 1, + parameter TAG_SEL_IDX = 0, + parameter BUFFERED_REQ = 0, + parameter BUFFERED_RSP = 0, + parameter TYPE = "R", + + localparam ADDR_WIDTH = (32-`CLOG2(DATA_SIZE)), + localparam DATA_WIDTH = (8 * DATA_SIZE), + localparam LOG_NUM_REQS = `CLOG2(NUM_REQS), + localparam TAG_OUT_WIDTH = TAG_IN_WIDTH + LOG_NUM_REQS +) ( + input wire clk, + input wire reset, + + // input requests + input wire [NUM_REQS-1:0][LANES-1:0] req_valid_in, + input wire [NUM_REQS-1:0][LANES-1:0] req_rw_in, + input wire [NUM_REQS-1:0][LANES-1:0][DATA_SIZE-1:0] req_byteen_in, + input wire [NUM_REQS-1:0][LANES-1:0][ADDR_WIDTH-1:0] req_addr_in, + input wire [NUM_REQS-1:0][LANES-1:0][DATA_WIDTH-1:0] req_data_in, + input wire [NUM_REQS-1:0][LANES-1:0][TAG_IN_WIDTH-1:0] req_tag_in, + output wire [NUM_REQS-1:0][LANES-1:0] req_ready_in, + + // output request + output wire [LANES-1:0] req_valid_out, + output wire [LANES-1:0] req_rw_out, + output wire [LANES-1:0][DATA_SIZE-1:0] req_byteen_out, + output wire [LANES-1:0][ADDR_WIDTH-1:0] req_addr_out, + output wire [LANES-1:0][DATA_WIDTH-1:0] req_data_out, + output wire [LANES-1:0][TAG_OUT_WIDTH-1:0] req_tag_out, + input wire [LANES-1:0] req_ready_out, + + // input response + input wire rsp_valid_in, + input wire [LANES-1:0] rsp_tmask_in, + input wire [LANES-1:0][DATA_WIDTH-1:0] rsp_data_in, + input wire [TAG_OUT_WIDTH-1:0] rsp_tag_in, + output wire rsp_ready_in, + + // output responses + output wire [NUM_REQS-1:0] 
rsp_valid_out, + output wire [NUM_REQS-1:0][LANES-1:0] rsp_tmask_out, + output wire [NUM_REQS-1:0][LANES-1:0][DATA_WIDTH-1:0] rsp_data_out, + output wire [NUM_REQS-1:0][TAG_IN_WIDTH-1:0] rsp_tag_out, + input wire [NUM_REQS-1:0] rsp_ready_out +); + localparam REQ_DATAW = TAG_OUT_WIDTH + ADDR_WIDTH + 1 + DATA_SIZE + DATA_WIDTH; + localparam RSP_DATAW = LANES * (1 + DATA_WIDTH) + TAG_IN_WIDTH; + + if (NUM_REQS > 1) begin + + wire [NUM_REQS-1:0][LANES-1:0][REQ_DATAW-1:0] req_data_in_merged; + wire [LANES-1:0][REQ_DATAW-1:0] req_data_out_merged; + + for (genvar i = 0; i < NUM_REQS; i++) begin + for (genvar j = 0; j < LANES; ++j) begin + wire [TAG_OUT_WIDTH-1:0] req_tag_in_w; + + VX_bits_insert #( + .N (TAG_IN_WIDTH), + .S (LOG_NUM_REQS), + .POS (TAG_SEL_IDX) + ) bits_insert ( + .data_in (req_tag_in[i][j]), + .sel_in (LOG_NUM_REQS'(i)), + .data_out (req_tag_in_w) + ); + + assign req_data_in_merged[i][j] = {req_tag_in_w, req_addr_in[i][j], req_rw_in[i][j], req_byteen_in[i][j], req_data_in[i][j]}; + end + end + + VX_stream_arbiter #( + .NUM_REQS (NUM_REQS), + .LANES (LANES), + .DATAW (REQ_DATAW), + .BUFFERED (BUFFERED_REQ), + .TYPE (TYPE) + ) req_arb ( + .clk (clk), + .reset (reset), + .valid_in (req_valid_in), + .data_in (req_data_in_merged), + .ready_in (req_ready_in), + .valid_out (req_valid_out), + .data_out (req_data_out_merged), + .ready_out (req_ready_out) + ); + + for (genvar i = 0; i < LANES; ++i) begin + assign {req_tag_out[i], req_addr_out[i], req_rw_out[i], req_byteen_out[i], req_data_out[i]} = req_data_out_merged[i]; + end + + /////////////////////////////////////////////////////////////////////// + + wire [NUM_REQS-1:0][RSP_DATAW-1:0] rsp_data_out_merged; + + wire [LOG_NUM_REQS-1:0] rsp_sel = rsp_tag_in[TAG_SEL_IDX +: LOG_NUM_REQS]; + + wire [TAG_IN_WIDTH-1:0] rsp_tag_in_w; + + VX_bits_remove #( + .N (TAG_OUT_WIDTH), + .S (LOG_NUM_REQS), + .POS (TAG_SEL_IDX) + ) bits_remove ( + .data_in (rsp_tag_in), + .data_out (rsp_tag_in_w) + ); + + VX_stream_demux #( + 
.NUM_REQS (NUM_REQS), + .LANES (1), + .DATAW (RSP_DATAW), + .BUFFERED (BUFFERED_RSP) + ) rsp_demux ( + .clk (clk), + .reset (reset), + .sel_in (rsp_sel), + .valid_in (rsp_valid_in), + .data_in ({rsp_tmask_in, rsp_tag_in_w, rsp_data_in}), + .ready_in (rsp_ready_in), + .valid_out (rsp_valid_out), + .data_out (rsp_data_out_merged), + .ready_out (rsp_ready_out) + ); + + for (genvar i = 0; i < NUM_REQS; i++) begin + assign {rsp_tmask_out[i], rsp_tag_out[i], rsp_data_out[i]} = rsp_data_out_merged[i]; + end + + end else begin + + `UNUSED_VAR (clk) + `UNUSED_VAR (reset) + + assign req_valid_out = req_valid_in; + assign req_tag_out = req_tag_in; + assign req_addr_out = req_addr_in; + assign req_rw_out = req_rw_in; + assign req_byteen_out = req_byteen_in; + assign req_data_out = req_data_in; + assign req_ready_in = req_ready_out; + + assign rsp_valid_out = rsp_valid_in; + assign rsp_tmask_out = rsp_tmask_in; + assign rsp_tag_out = rsp_tag_in; + assign rsp_data_out = rsp_data_in; + assign rsp_ready_in = rsp_ready_out; + + end + +endmodule \ No newline at end of file diff --git a/hw/rtl/VX_commit.sv b/hw/rtl/VX_commit.sv index a8e1764b..07b83df0 100644 --- a/hw/rtl/VX_commit.sv +++ b/hw/rtl/VX_commit.sv @@ -78,14 +78,14 @@ module VX_commit #( `ifdef EXT_F_ENABLE .fpu_commit_if (fpu_commit_if), `endif + .gpu_commit_if (gpu_commit_if), .writeback_if (writeback_if) ); // store and gpu commits don't writeback assign st_commit_if.ready = 1'b1; - assign gpu_commit_if.ready = 1'b1; -`ifdef DBG_PRINT_PIPELINE +`ifdef DBG_TRACE_PIPELINE always @(posedge clk) begin if (alu_commit_if.valid && alu_commit_if.ready) begin dpi_trace("%d: core%0d-commit: wid=%0d, PC=%0h, ex=ALU, tmask=%b, wb=%0d, rd=%0d, data=", $time, CORE_ID, alu_commit_if.wid, alu_commit_if.PC, alu_commit_if.tmask, alu_commit_if.wb, alu_commit_if.rd); diff --git a/hw/rtl/VX_config.vh b/hw/rtl/VX_config.vh index 0b9dbf42..b52a1ab2 100644 --- a/hw/rtl/VX_config.vh +++ b/hw/rtl/VX_config.vh @@ -230,6 +230,21 @@ `define CSR_NW 
12'hFC1 `define CSR_NC 12'hFC2 +////////// Texture Units ////////////////////////////////////////////////////// + +`define NUM_TEX_UNITS 2 + +`define CSR_TEX_STATES 7 +`define CSR_TEX_BEGIN(x) (12'hFD0 + (x) * `CSR_TEX_STATES) + +`define CSR_TEX_ADDR(x) (`CSR_TEX_BEGIN(x) + 12'h00) +`define CSR_TEX_FORMAT(x) (`CSR_TEX_BEGIN(x) + 12'h01) +`define CSR_TEX_WRAP(x) (`CSR_TEX_BEGIN(x) + 12'h02) +`define CSR_TEX_FILTER(x) (`CSR_TEX_BEGIN(x) + 12'h03) +`define CSR_TEX_MIPOFF(x) (`CSR_TEX_BEGIN(x) + 12'h04) +`define CSR_TEX_WIDTH(x) (`CSR_TEX_BEGIN(x) + 12'h05) +`define CSR_TEX_HEIGHT(x) (`CSR_TEX_BEGIN(x) + 12'h06) + // Pipeline Queues //////////////////////////////////////////////////////////// // Size of Instruction Buffer diff --git a/hw/rtl/VX_csr_data.sv b/hw/rtl/VX_csr_data.sv index 3baf73f7..b071a347 100644 --- a/hw/rtl/VX_csr_data.sv +++ b/hw/rtl/VX_csr_data.sv @@ -17,6 +17,9 @@ module VX_csr_data #( `ifdef EXT_F_ENABLE VX_fpu_to_csr_if.slave fpu_to_csr_if, `endif +`ifdef EXT_TEX_ENABLE + VX_tex_csr_if.master tex_csr_if, +`endif input wire read_enable, input wire[`CSR_ADDR_BITS-1:0] read_addr, @@ -26,7 +29,7 @@ module VX_csr_data #( input wire write_enable, input wire[`CSR_ADDR_BITS-1:0] write_addr, input wire[`NW_BITS-1:0] write_wid, - input wire[`CSR_WIDTH-1:0] write_data, + input wire[31:0] write_data, input wire busy ); @@ -46,13 +49,13 @@ module VX_csr_data #( reg [`NUM_WARPS-1:0][`INST_FRM_BITS+`FFLAGS_BITS-1:0] fcsr; - always @(posedge clk) begin + always @(posedge clk) begin `ifdef EXT_F_ENABLE if (reset) begin fcsr <= '0; - end + end if (fpu_to_csr_if.write_enable) begin - fcsr[fpu_to_csr_if.write_wid][`FFLAGS_BITS-1:0] <= fcsr[fpu_to_csr_if.write_wid][`FFLAGS_BITS-1:0] + fcsr[fpu_to_csr_if.write_wid][`FFLAGS_BITS-1:0] <= fcsr[fpu_to_csr_if.write_wid][`FFLAGS_BITS-1:0] | fpu_to_csr_if.write_fflags; end `endif @@ -61,27 +64,33 @@ module VX_csr_data #( `CSR_FFLAGS: fcsr[write_wid][`FFLAGS_BITS-1:0] <= write_data[`FFLAGS_BITS-1:0]; `CSR_FRM: 
fcsr[write_wid][`INST_FRM_BITS+`FFLAGS_BITS-1:`FFLAGS_BITS] <= write_data[`INST_FRM_BITS-1:0]; `CSR_FCSR: fcsr[write_wid] <= write_data[`FFLAGS_BITS+`INST_FRM_BITS-1:0]; - - `CSR_SATP: csr_satp <= write_data; - - `CSR_MSTATUS: csr_mstatus <= write_data; - `CSR_MEDELEG: csr_medeleg <= write_data; - `CSR_MIDELEG: csr_mideleg <= write_data; - `CSR_MIE: csr_mie <= write_data; - `CSR_MTVEC: csr_mtvec <= write_data; - - `CSR_MEPC: csr_mepc <= write_data; - - `CSR_PMPCFG0: csr_pmpcfg[0] <= write_data; - `CSR_PMPADDR0: csr_pmpaddr[0] <= write_data; - - default: begin - `ASSERT(~write_enable, ("%t: invalid CSR write address: %0h", $time, write_addr)); + `CSR_SATP: csr_satp <= write_data[`CSR_WIDTH-1:0]; + `CSR_MSTATUS: csr_mstatus <= write_data[`CSR_WIDTH-1:0]; + `CSR_MEDELEG: csr_medeleg <= write_data[`CSR_WIDTH-1:0]; + `CSR_MIDELEG: csr_mideleg <= write_data[`CSR_WIDTH-1:0]; + `CSR_MIE: csr_mie <= write_data[`CSR_WIDTH-1:0]; + `CSR_MTVEC: csr_mtvec <= write_data[`CSR_WIDTH-1:0]; + `CSR_MEPC: csr_mepc <= write_data[`CSR_WIDTH-1:0]; + `CSR_PMPCFG0: csr_pmpcfg[0] <= write_data[`CSR_WIDTH-1:0]; + `CSR_PMPADDR0: csr_pmpaddr[0] <= write_data[`CSR_WIDTH-1:0]; + default: begin + `ASSERT(write_addr >= `CSR_TEX_BEGIN(0) + && write_addr < `CSR_TEX_BEGIN(`CSR_TEX_STATES), + ("%t: invalid CSR write address: %0h", $time, write_addr)); end - endcase + endcase end end + `UNUSED_VAR (write_data) + + // TEX CSRs +`ifdef EXT_TEX_ENABLE + assign tex_csr_if.write_enable = write_enable; + assign tex_csr_if.write_addr = write_addr; + assign tex_csr_if.write_data = write_data; +`endif + always @(posedge clk) begin if (reset) begin csr_cycle <= 0; @@ -209,7 +218,8 @@ module VX_csr_data #( default: begin if (!((read_addr >= `CSR_MPM_BASE && read_addr < (`CSR_MPM_BASE + 32)) - | (read_addr >= `CSR_MPM_BASE_H && read_addr < (`CSR_MPM_BASE_H + 32)))) begin + || (read_addr >= `CSR_MPM_BASE_H && read_addr < (`CSR_MPM_BASE_H + 32) + || (read_addr >= `CSR_TEX_BEGIN(0) && read_addr < 
`CSR_TEX_BEGIN(`CSR_TEX_STATES))))) begin read_addr_valid_r = 0; end end diff --git a/hw/rtl/VX_csr_unit.sv b/hw/rtl/VX_csr_unit.sv index 1628253c..0b05ca9c 100644 --- a/hw/rtl/VX_csr_unit.sv +++ b/hw/rtl/VX_csr_unit.sv @@ -20,6 +20,9 @@ module VX_csr_unit #( VX_fpu_to_csr_if.slave fpu_to_csr_if, input wire[`NUM_WARPS-1:0] fpu_pending, `endif +`ifdef EXT_TEX_ENABLE + VX_tex_csr_if.master tex_csr_if, +`endif output wire[`NUM_WARPS-1:0] pending, input wire busy @@ -46,6 +49,9 @@ module VX_csr_unit #( .fetch_to_csr_if(fetch_to_csr_if), `ifdef EXT_F_ENABLE .fpu_to_csr_if (fpu_to_csr_if), + `endif + `ifdef EXT_TEX_ENABLE + .tex_csr_if (tex_csr_if), `endif .read_enable (csr_req_if.valid), .read_addr (csr_req_if.addr), @@ -54,7 +60,7 @@ module VX_csr_unit #( .write_enable (write_enable), .write_addr (csr_addr_s1), .write_wid (csr_commit_if.wid), - .write_data (csr_updated_data_s1[`CSR_WIDTH-1:0]), + .write_data (csr_updated_data_s1), .busy (busy) ); diff --git a/hw/rtl/VX_decode.sv b/hw/rtl/VX_decode.sv index 4f7b3164..89d70d7a 100644 --- a/hw/rtl/VX_decode.sv +++ b/hw/rtl/VX_decode.sv @@ -1,6 +1,6 @@ `include "VX_define.vh" -`ifdef DBG_PRINT_PIPELINE -`include "VX_print_instr.vh" +`ifdef DBG_TRACE_PIPELINE +`include "VX_trace_instr.vh" `endif `ifdef EXT_F_ENABLE @@ -42,6 +42,7 @@ module VX_decode #( wire [31:0] instr = ifetch_rsp_if.data; wire [6:0] opcode = instr[6:0]; + wire [1:0] func2 = instr[26:25]; wire [2:0] func3 = instr[14:12]; wire [6:0] func7 = instr[31:25]; wire [11:0] u_12 = instr[31:20]; @@ -193,7 +194,6 @@ module VX_decode #( end `INST_F: begin ex_type = `EX_LSU; - op_type = `INST_OP_BITS'(func3[0]); op_mod = `INST_MOD_BITS'(1); end `INST_SYS : begin @@ -375,11 +375,21 @@ module VX_decode #( `USED_IREG (rs1); `USED_IREG (rs2); end + `ifdef EXT_TEX_ENABLE 3'h5: begin + op_type = `INST_OP_BITS'(`INST_GPU_TEX); + op_mod = `INST_MOD_BITS'(func2); + use_rd = 1; + `USED_IREG (rd); + `USED_IREG (rs1); + `USED_IREG (rs2); + `USED_IREG (rs3); + end + `endif + 3'h6: 
begin ex_type = `EX_LSU; - op_type = `INST_OP_BITS'(`INST_GPU_PRED); - imm = {{20{u_12[11]}}, u_12}; - use_rd = 0; + op_type = `INST_OP_BITS'(`INST_LSU_LW); + op_mod = `INST_MOD_BITS'(2); `USED_IREG (rs1); end default:; @@ -389,6 +399,8 @@ module VX_decode #( endcase end + `UNUSED_VAR (func2) + // disable write to integer register r0 wire wb = use_rd && (| rd_r); @@ -421,13 +433,13 @@ module VX_decode #( assign ifetch_rsp_if.ready = decode_if.ready; -`ifdef DBG_PRINT_PIPELINE +`ifdef DBG_TRACE_PIPELINE always @(posedge clk) begin if (decode_if.valid && decode_if.ready) begin dpi_trace("%d: core%0d-decode: wid=%0d, PC=%0h, ex=", $time, CORE_ID, decode_if.wid, decode_if.PC); - print_ex_type(decode_if.ex_type); + trace_ex_type(decode_if.ex_type); dpi_trace(", op="); - print_ex_op(decode_if.ex_type, decode_if.op_type, decode_if.op_mod); + trace_ex_op(decode_if.ex_type, decode_if.op_type, decode_if.op_mod); dpi_trace(", mod=%0d, tmask=%b, wb=%b, rd=%0d, rs1=%0d, rs2=%0d, rs3=%0d, imm=%0h, use_pc=%b, use_imm=%b\n", decode_if.op_mod, decode_if.tmask, decode_if.wb, decode_if.rd, decode_if.rs1, decode_if.rs2, decode_if.rs3, decode_if.imm, decode_if.use_PC, decode_if.use_imm); end end diff --git a/hw/rtl/VX_define.vh b/hw/rtl/VX_define.vh index cedb7594..c3706000 100644 --- a/hw/rtl/VX_define.vh +++ b/hw/rtl/VX_define.vh @@ -18,6 +18,8 @@ `define NRI_BITS `LOG2UP(`NUM_IREGS) +`define NTEX_BITS `LOG2UP(`NUM_TEX_UNITS) + `ifdef EXT_F_ENABLE `define NUM_REGS (2 * `NUM_IREGS) `else @@ -66,6 +68,8 @@ `define INST_GPU 7'b1101011 +`define INST_TEX 7'b0101011 + /////////////////////////////////////////////////////////////////////////////// `define INST_FRM_RNE 3'b000 // round to nearest even @@ -150,8 +154,8 @@ `define INST_LSU_BITS 4 `define INST_LSU_FMT(x) x[2:0] `define INST_LSU_WSIZE(x) x[1:0] -`define INST_LSU_IS_FENCE(x) x[0] -`define INST_LSU_IS_PREF(x) (x==3'b111) +`define INST_LSU_IS_FENCE(x) (3'h1 == x) +`define INST_LSU_IS_PREFETCH(x) (3'h2 == x) `define INST_FENCE_BITS 1 
`define INST_FENCE_D 1'h0 @@ -187,6 +191,7 @@ `define INST_GPU_JOIN 3'h3 `define INST_GPU_BAR 3'h4 `define INST_GPU_PRED 3'h5 +`define INST_GPU_TEX 3'h6 `define INST_GPU_BITS 3 /////////////////////////////////////////////////////////////////////////////// @@ -238,8 +243,11 @@ `define DBG_CACHE_REQ_MDATAW 0 `endif -// non-cacheable address bit -`define NC_FLAG_BITS 1 +// non-cacheable tag bits +`define NC_TAG_BIT 1 + +// texture tag bits +`define TEX_TAG_BIT 1 ////////////////////////// Icache Configurable Knobs ////////////////////////// @@ -278,12 +286,20 @@ // Block size in bytes `define DCACHE_LINE_SIZE `L1_BLOCK_SIZE -// TAG sharing enable -`define LSUQ_ADDR_BITS `LOG2UP(`LSUQ_SIZE) -`define DCACHE_CORE_TAG_ID_BITS (`LSUQ_ADDR_BITS + `NC_FLAG_BITS + `SM_ENABLE) - -// Input request tag bits -`define DCACHE_CORE_TAG_WIDTH (`DBG_CACHE_REQ_MDATAW + `DCACHE_CORE_TAG_ID_BITS) +// Core request tag bits +`define LSUQ_ADDR_BITS `LOG2UP(`LSUQ_SIZE) +`ifdef EXT_TEX_ENABLE +`define LSU_TAG_ID_BITS (`LSUQ_ADDR_BITS + `NC_TAG_BIT + `SM_ENABLE) +`define TEX_TAG_ID_BITS (2) +`define LSU_TEX_TAG_ID_BITS `MAX(`LSU_TAG_ID_BITS, `TEX_TAG_ID_BITS) +`define DCACHE_CORE_TAG_ID_BITS (`LSU_TEX_TAG_ID_BITS + `TEX_TAG_BIT) +`define LSU_DCACHE_TAG_BITS (`DBG_CACHE_REQ_MDATAW + `LSU_TAG_ID_BITS) +`define TEX_DCACHE_TAG_BITS (`DBG_CACHE_REQ_MDATAW + `TEX_TAG_ID_BITS) +`define LSU_TEX_DCACHE_TAG_BITS (`DBG_CACHE_REQ_MDATAW + `LSU_TEX_TAG_ID_BITS) +`else +`define DCACHE_CORE_TAG_ID_BITS (`LSUQ_ADDR_BITS + `NC_TAG_BIT + `SM_ENABLE) +`endif +`define DCACHE_CORE_TAG_WIDTH (`DBG_CACHE_REQ_MDATAW + `DCACHE_CORE_TAG_ID_BITS) // Memory request data bits `define DCACHE_MEM_DATA_WIDTH (`DCACHE_LINE_SIZE * 8) @@ -300,7 +316,7 @@ // Memory request tag bits `define _DMEM_ADDR_RATIO_W $clog2(`DCACHE_LINE_SIZE / `DCACHE_WORD_SIZE) `define _DNC_MEM_TAG_WIDTH ($clog2(`DCACHE_NUM_REQS) + `_DMEM_ADDR_RATIO_W + `DCACHE_CORE_TAG_WIDTH) -`define DCACHE_MEM_TAG_WIDTH `MAX((`CLOG2(`DCACHE_NUM_BANKS) + 
`CLOG2(`DCACHE_MSHR_SIZE) + `NC_FLAG_BITS), `_DNC_MEM_TAG_WIDTH) +`define DCACHE_MEM_TAG_WIDTH `MAX((`CLOG2(`DCACHE_NUM_BANKS) + `CLOG2(`DCACHE_MSHR_SIZE) + `NC_TAG_BIT), `_DNC_MEM_TAG_WIDTH) // Merged D-cache/I-cache memory tag `define L1_MEM_TAG_WIDTH (`MAX(`ICACHE_MEM_TAG_WIDTH, `DCACHE_MEM_TAG_WIDTH) + `CLOG2(2)) @@ -348,7 +364,7 @@ // Memory request tag bits `define _L2_MEM_ADDR_RATIO_W $clog2(`L2_CACHE_LINE_SIZE / `L2_WORD_SIZE) `define _L2_NC_MEM_TAG_WIDTH ($clog2(`L2_NUM_REQS) + `_L2_MEM_ADDR_RATIO_W + `L1_MEM_TAG_WIDTH) -`define _L2_MEM_TAG_WIDTH `MAX((`CLOG2(`L2_NUM_BANKS) + `CLOG2(`L2_MSHR_SIZE) + `NC_FLAG_BITS), `_L2_NC_MEM_TAG_WIDTH) +`define _L2_MEM_TAG_WIDTH `MAX((`CLOG2(`L2_NUM_BANKS) + `CLOG2(`L2_MSHR_SIZE) + `NC_TAG_BIT), `_L2_NC_MEM_TAG_WIDTH) `define L2_MEM_TAG_WIDTH ((`L2_ENABLE) ? `_L2_MEM_TAG_WIDTH : (`L1_MEM_TAG_WIDTH + `CLOG2(`L2_NUM_REQS))) ////////////////////////// L3cache Configurable Knobs ///////////////////////// @@ -380,7 +396,7 @@ // Memory request tag bits `define _L3_MEM_ADDR_RATIO_W $clog2(`L3_CACHE_LINE_SIZE / `L3_WORD_SIZE) `define _L3_NC_MEM_TAG_WIDTH ($clog2(`L3_NUM_REQS) + `_L3_MEM_ADDR_RATIO_W + `L2_MEM_TAG_WIDTH) -`define _L3_MEM_TAG_WIDTH `MAX((`CLOG2(`L3_NUM_BANKS) + `CLOG2(`L3_MSHR_SIZE) + `NC_FLAG_BITS), `_L3_NC_MEM_TAG_WIDTH) +`define _L3_MEM_TAG_WIDTH `MAX((`CLOG2(`L3_NUM_BANKS) + `CLOG2(`L3_MSHR_SIZE) + `NC_TAG_BIT), `_L3_NC_MEM_TAG_WIDTH) `define L3_MEM_TAG_WIDTH ((`L3_ENABLE) ? 
`_L3_MEM_TAG_WIDTH : (`L2_MEM_TAG_WIDTH + `CLOG2(`L3_NUM_REQS))) /////////////////////////////////////////////////////////////////////////////// diff --git a/hw/rtl/VX_instr_demux.sv b/hw/rtl/VX_dispatch.sv similarity index 92% rename from hw/rtl/VX_instr_demux.sv rename to hw/rtl/VX_dispatch.sv index 3578e8d1..008a7c62 100644 --- a/hw/rtl/VX_instr_demux.sv +++ b/hw/rtl/VX_dispatch.sv @@ -1,6 +1,6 @@ `include "VX_define.vh" -module VX_instr_demux ( +module VX_dispatch ( input wire clk, input wire reset, @@ -60,7 +60,7 @@ module VX_instr_demux ( wire lsu_req_valid = ibuffer_if.valid && (ibuffer_if.ex_type == `EX_LSU); wire [`INST_LSU_BITS-1:0] lsu_op_type = `INST_LSU_BITS'(ibuffer_if.op_type); wire lsu_is_fence = `INST_LSU_IS_FENCE(ibuffer_if.op_mod); - wire lsu_is_prefetch = (~ibuffer_if.wb) && ~(ibuffer_if.op_type[`INST_OP_BITS-1]); + wire lsu_is_prefetch = `INST_LSU_IS_PREFETCH(ibuffer_if.op_mod); VX_skid_buffer #( .DATAW (`NW_BITS + `NUM_THREADS + 32 + `INST_LSU_BITS + 1 + 32 + `NR_BITS + 1 + (2 * `NUM_THREADS * 32) + 1), @@ -125,18 +125,17 @@ module VX_instr_demux ( wire gpu_req_valid = ibuffer_if.valid && (ibuffer_if.ex_type == `EX_GPU); wire [`INST_GPU_BITS-1:0] gpu_op_type = `INST_GPU_BITS'(ibuffer_if.op_type); - wire [31:0] gpu_rs2_data = gpr_rsp_if.rs2_data[tid]; VX_skid_buffer #( - .DATAW (`NW_BITS + `NUM_THREADS + 32 + 32 + `INST_GPU_BITS + `NR_BITS + 1 + + `NT_BITS + (`NUM_THREADS * 32 + 32)), + .DATAW (`NW_BITS + `NUM_THREADS + 32 + 32 + `INST_GPU_BITS + `INST_MOD_BITS + `NR_BITS + 1 + `NT_BITS + (3 * `NUM_THREADS * 32)), .OUT_REG (1) ) gpu_buffer ( .clk (clk), .reset (reset), .valid_in (gpu_req_valid), .ready_in (gpu_req_ready), - .data_in ({ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, next_PC, gpu_op_type, ibuffer_if.rd, ibuffer_if.wb, tid, gpr_rsp_if.rs1_data, gpu_rs2_data}), - .data_out ({gpu_req_if.wid, gpu_req_if.tmask, gpu_req_if.PC, gpu_req_if.next_PC, gpu_req_if.op_type, gpu_req_if.rd, gpu_req_if.wb, gpu_req_if.tid, gpu_req_if.rs1_data, 
gpu_req_if.rs2_data}), + .data_in ({ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, next_PC, gpu_op_type, ibuffer_if.op_mod, ibuffer_if.rd, ibuffer_if.wb, tid, gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data, gpr_rsp_if.rs3_data}), + .data_out ({gpu_req_if.wid, gpu_req_if.tmask, gpu_req_if.PC, gpu_req_if.next_PC, gpu_req_if.op_type, gpu_req_if.op_mod, gpu_req_if.rd, gpu_req_if.wb, gpu_req_if.tid, gpu_req_if.rs1_data, gpu_req_if.rs2_data, gpu_req_if.rs3_data}), .valid_out (gpu_req_if.valid), .ready_out (gpu_req_if.ready) ); diff --git a/hw/rtl/VX_execute.sv b/hw/rtl/VX_execute.sv index 9c09d826..f0cdd37e 100644 --- a/hw/rtl/VX_execute.sv +++ b/hw/rtl/VX_execute.sv @@ -45,12 +45,108 @@ module VX_execute #( VX_commit_if.master gpu_commit_if, input wire busy -); +); + +`ifdef EXT_TEX_ENABLE + + VX_dcache_req_if #( + .NUM_REQS (`NUM_THREADS), + .WORD_SIZE (4), + .TAG_WIDTH (`LSU_DCACHE_TAG_BITS) + ) lsu_dcache_req_if(); + + VX_dcache_rsp_if #( + .NUM_REQS (`NUM_THREADS), + .WORD_SIZE (4), + .TAG_WIDTH (`LSU_DCACHE_TAG_BITS) + ) lsu_dcache_rsp_if(); + + VX_dcache_req_if #( + .NUM_REQS (`NUM_THREADS), + .WORD_SIZE (4), + .TAG_WIDTH (`TEX_DCACHE_TAG_BITS) + ) tex_dcache_req_if(); + + VX_dcache_rsp_if #( + .NUM_REQS (`NUM_THREADS), + .WORD_SIZE (4), + .TAG_WIDTH (`TEX_DCACHE_TAG_BITS) + ) tex_dcache_rsp_if(); + + VX_tex_csr_if tex_csr_if(); + + wire [`NUM_THREADS-1:0][`LSU_TEX_DCACHE_TAG_BITS-1:0] tex_tag_in, lsu_tag_in; + wire [`LSU_TEX_DCACHE_TAG_BITS-1:0] tex_tag_out, lsu_tag_out; + + `UNUSED_VAR (tex_tag_out) + `UNUSED_VAR (lsu_tag_out) + + for (genvar i = 0; i < `NUM_THREADS; ++i) begin + assign tex_tag_in[i][`LSU_TEX_TAG_ID_BITS-1:0] = `LSU_TEX_TAG_ID_BITS'(tex_dcache_req_if.tag[i][`TEX_TAG_ID_BITS-1:0]); + assign lsu_tag_in[i][`LSU_TEX_TAG_ID_BITS-1:0] = `LSU_TEX_TAG_ID_BITS'(lsu_dcache_req_if.tag[i][`LSU_TAG_ID_BITS-1:0]); + `ifdef DBG_CACHE_REQ_INFO + assign tex_tag_in[i][`LSU_TEX_DCACHE_TAG_BITS-1:`LSU_TEX_TAG_ID_BITS] = 
tex_dcache_req_if.tag[i][`TEX_DCACHE_TAG_BITS-1:`TEX_TAG_ID_BITS]; + assign lsu_tag_in[i][`LSU_TEX_DCACHE_TAG_BITS-1:`LSU_TEX_TAG_ID_BITS] = lsu_dcache_req_if.tag[i][`LSU_DCACHE_TAG_BITS-1:`LSU_TAG_ID_BITS]; + `endif + end + + assign tex_dcache_rsp_if.tag[`TEX_TAG_ID_BITS-1:0] = tex_tag_out[`TEX_TAG_ID_BITS-1:0]; + assign lsu_dcache_rsp_if.tag[`LSU_TAG_ID_BITS-1:0] = lsu_tag_out[`LSU_TAG_ID_BITS-1:0]; +`ifdef DBG_CACHE_REQ_INFO + assign tex_dcache_rsp_if.tag[`TEX_DCACHE_TAG_BITS-1:`TEX_TAG_ID_BITS] = tex_tag_out[`LSU_TEX_DCACHE_TAG_BITS-1:`LSU_TEX_TAG_ID_BITS]; + assign lsu_dcache_rsp_if.tag[`LSU_DCACHE_TAG_BITS-1:`LSU_TAG_ID_BITS] = lsu_tag_out[`LSU_TEX_DCACHE_TAG_BITS-1:`LSU_TEX_TAG_ID_BITS]; +`endif + + VX_cache_arb #( + .NUM_REQS (2), + .LANES (`NUM_THREADS), + .DATA_SIZE (4), + .TAG_IN_WIDTH (`LSU_TEX_DCACHE_TAG_BITS), + .TAG_SEL_IDX (`NC_TAG_BIT + `SM_ENABLE) + ) tex_lsu_arb ( + .clk (clk), + .reset (reset), + + // Tex/LSU request + .req_valid_in ({tex_dcache_req_if.valid, lsu_dcache_req_if.valid}), + .req_rw_in ({tex_dcache_req_if.rw, lsu_dcache_req_if.rw}), + .req_byteen_in ({tex_dcache_req_if.byteen, lsu_dcache_req_if.byteen}), + .req_addr_in ({tex_dcache_req_if.addr, lsu_dcache_req_if.addr}), + .req_data_in ({tex_dcache_req_if.data, lsu_dcache_req_if.data}), + .req_tag_in ({tex_tag_in, lsu_tag_in}), + .req_ready_in ({tex_dcache_req_if.ready, lsu_dcache_req_if.ready}), + + // Dcache request + .req_valid_out (dcache_req_if.valid), + .req_rw_out (dcache_req_if.rw), + .req_byteen_out (dcache_req_if.byteen), + .req_addr_out (dcache_req_if.addr), + .req_data_out (dcache_req_if.data), + .req_tag_out (dcache_req_if.tag), + .req_ready_out (dcache_req_if.ready), + + // Dcache response + .rsp_valid_in (dcache_rsp_if.valid), + .rsp_tmask_in (dcache_rsp_if.tmask), + .rsp_tag_in (dcache_rsp_if.tag), + .rsp_data_in (dcache_rsp_if.data), + .rsp_ready_in (dcache_rsp_if.ready), + + // Tex/LSU response + .rsp_valid_out ({tex_dcache_rsp_if.valid, lsu_dcache_rsp_if.valid}), + 
.rsp_tmask_out ({tex_dcache_rsp_if.tmask, lsu_dcache_rsp_if.tmask}), + .rsp_data_out ({tex_dcache_rsp_if.data, lsu_dcache_rsp_if.data}), + .rsp_tag_out ({tex_tag_out, lsu_tag_out}), + .rsp_ready_out ({tex_dcache_rsp_if.ready, lsu_dcache_rsp_if.ready}) + ); + +`endif + `ifdef EXT_F_ENABLE - VX_fpu_to_csr_if fpu_to_csr_if(); - wire[`NUM_WARPS-1:0] fpu_pending; - wire[`NUM_WARPS-1:0] csr_pending; -`endif + wire [`NUM_WARPS-1:0] csr_pending; + wire [`NUM_WARPS-1:0] fpu_pending; + VX_fpu_to_csr_if fpu_to_csr_if(); +`endif `RESET_RELAY (alu_reset); `RESET_RELAY (lsu_reset); @@ -58,7 +154,7 @@ module VX_execute #( `RESET_RELAY (gpu_reset); VX_alu_unit #( - .CORE_ID (CORE_ID) + .CORE_ID(CORE_ID) ) alu_unit ( .clk (clk), .reset (alu_reset), @@ -68,20 +164,25 @@ module VX_execute #( ); VX_lsu_unit #( - .CORE_ID (CORE_ID) + .CORE_ID(CORE_ID) ) lsu_unit ( `SCOPE_BIND_VX_execute_lsu_unit .clk (clk), .reset (lsu_reset), + `ifdef EXT_TEX_ENABLE + .dcache_req_if (lsu_dcache_req_if), + .dcache_rsp_if (lsu_dcache_rsp_if), + `else .dcache_req_if (dcache_req_if), .dcache_rsp_if (dcache_rsp_if), + `endif .lsu_req_if (lsu_req_if), .ld_commit_if (ld_commit_if), .st_commit_if (st_commit_if) ); VX_csr_unit #( - .CORE_ID (CORE_ID) + .CORE_ID(CORE_ID) ) csr_unit ( .clk (clk), .reset (csr_reset), @@ -89,7 +190,7 @@ module VX_execute #( .perf_memsys_if (perf_memsys_if), .perf_pipeline_if(perf_pipeline_if), `endif - .cmt_to_csr_if (cmt_to_csr_if), + .cmt_to_csr_if (cmt_to_csr_if), .fetch_to_csr_if(fetch_to_csr_if), .csr_req_if (csr_req_if), .csr_commit_if (csr_commit_if), @@ -100,6 +201,9 @@ module VX_execute #( `else `UNUSED_PIN (pending), `endif + `ifdef EXT_TEX_ENABLE + .tex_csr_if (tex_csr_if), + `endif .busy (busy) ); @@ -107,7 +211,7 @@ module VX_execute #( `RESET_RELAY (fpu_reset); VX_fpu_unit #( - .CORE_ID (CORE_ID) + .CORE_ID(CORE_ID) ) fpu_unit ( .clk (clk), .reset (fpu_reset), @@ -120,12 +224,17 @@ module VX_execute #( `endif VX_gpu_unit #( - .CORE_ID (CORE_ID) + .CORE_ID(CORE_ID) ) 
gpu_unit ( `SCOPE_BIND_VX_execute_gpu_unit .clk (clk), .reset (gpu_reset), .gpu_req_if (gpu_req_if), + `ifdef EXT_TEX_ENABLE + .tex_csr_if (tex_csr_if), + .dcache_req_if (tex_dcache_req_if), + .dcache_rsp_if (tex_dcache_rsp_if), + `endif .warp_ctl_if (warp_ctl_if), .gpu_commit_if (gpu_commit_if) ); @@ -137,4 +246,4 @@ module VX_execute #( && (`INST_BR_BITS'(alu_req_if.op_type) == `INST_BR_EBREAK || `INST_BR_BITS'(alu_req_if.op_type) == `INST_BR_ECALL); -endmodule +endmodule \ No newline at end of file diff --git a/hw/rtl/VX_gpu_unit.sv b/hw/rtl/VX_gpu_unit.sv index 66f76cda..06d5fbc7 100644 --- a/hw/rtl/VX_gpu_unit.sv +++ b/hw/rtl/VX_gpu_unit.sv @@ -11,6 +11,12 @@ module VX_gpu_unit #( // Inputs VX_gpu_req_if.slave gpu_req_if, +`ifdef EXT_TEX_ENABLE + VX_dcache_req_if.master dcache_req_if, + VX_dcache_rsp_if.slave dcache_rsp_if, + VX_tex_csr_if.slave tex_csr_if, +`endif + // Outputs VX_warp_ctl_if.master warp_ctl_if, VX_commit_if.master gpu_commit_if @@ -18,14 +24,29 @@ module VX_gpu_unit #( import gpu_types::*; `UNUSED_PARAM (CORE_ID) - `UNUSED_VAR (clk) - `UNUSED_VAR (reset) + + localparam WCTL_DATAW = `GPU_TMC_BITS + `GPU_WSPAWN_BITS + `GPU_SPLIT_BITS + `GPU_BARRIER_BITS; + localparam RSP_DATAW = `MAX(`NUM_THREADS * 32, WCTL_DATAW); + + wire rsp_valid; + wire [`NW_BITS-1:0] rsp_wid; + wire [`NUM_THREADS-1:0] rsp_tmask; + wire [31:0] rsp_PC; + wire [`NR_BITS-1:0] rsp_rd; + wire rsp_wb; + + wire [RSP_DATAW-1:0] rsp_data, rsp_data_r; gpu_tmc_t tmc; gpu_wspawn_t wspawn; gpu_barrier_t barrier; gpu_split_t split; + wire [WCTL_DATAW-1:0] warp_ctl_data; + wire is_warp_ctl; + + wire stall_in, stall_out; + wire is_wspawn = (gpu_req_if.op_type == `INST_GPU_WSPAWN); wire is_tmc = (gpu_req_if.op_type == `INST_GPU_TMC); wire is_split = (gpu_req_if.op_type == `INST_GPU_SPLIT); @@ -33,7 +54,8 @@ module VX_gpu_unit #( wire is_pred = (gpu_req_if.op_type == `INST_GPU_PRED); wire [31:0] rs1_data = gpu_req_if.rs1_data[gpu_req_if.tid]; - + wire [31:0] rs2_data = 
gpu_req_if.rs2_data[gpu_req_if.tid]; + wire [`NUM_THREADS-1:0] taken_tmask; wire [`NUM_THREADS-1:0] not_taken_tmask; @@ -52,7 +74,7 @@ module VX_gpu_unit #( // wspawn - wire [31:0] wspawn_pc = gpu_req_if.rs2_data; + wire [31:0] wspawn_pc = rs2_data; wire [`NUM_WARPS-1:0] wspawn_wmask; for (genvar i = 0; i < `NUM_WARPS; i++) begin assign wspawn_wmask[i] = (i < rs1_data); @@ -73,30 +95,109 @@ module VX_gpu_unit #( assign barrier.valid = is_bar; assign barrier.id = rs1_data[`NB_BITS-1:0]; - assign barrier.size_m1 = (`NW_BITS)'(gpu_req_if.rs2_data - 1); + assign barrier.size_m1 = (`NW_BITS)'(rs2_data - 1); + + // pack warp ctl result + assign warp_ctl_data = {tmc, wspawn, split, barrier}; + + // texture + +`ifdef EXT_TEX_ENABLE + + `UNUSED_VAR (gpu_req_if.op_mod) + + VX_tex_req_if tex_req_if(); + VX_tex_rsp_if tex_rsp_if(); + + wire is_tex = (gpu_req_if.op_type == `INST_GPU_TEX); + + assign tex_req_if.valid = gpu_req_if.valid && is_tex; + assign tex_req_if.wid = gpu_req_if.wid; + assign tex_req_if.tmask = gpu_req_if.tmask; + assign tex_req_if.PC = gpu_req_if.PC; + assign tex_req_if.rd = gpu_req_if.rd; + assign tex_req_if.wb = gpu_req_if.wb; + + assign tex_req_if.unit = gpu_req_if.op_mod[`NTEX_BITS-1:0]; + assign tex_req_if.coords[0] = gpu_req_if.rs1_data; + assign tex_req_if.coords[1] = gpu_req_if.rs2_data; + assign tex_req_if.lod = gpu_req_if.rs3_data; + + VX_tex_unit #( + .CORE_ID(CORE_ID) + ) tex_unit ( + .clk (clk), + .reset (reset), + .tex_req_if (tex_req_if), + .tex_csr_if (tex_csr_if), + .tex_rsp_if (tex_rsp_if), + .dcache_req_if (dcache_req_if), + .dcache_rsp_if (dcache_rsp_if) + ); + + assign tex_rsp_if.ready = !stall_out; + + assign stall_in = (is_tex && ~tex_req_if.ready) + || (~is_tex && (tex_rsp_if.valid || stall_out)); + + assign is_warp_ctl = !(is_tex || tex_rsp_if.valid); + + assign rsp_valid = tex_rsp_if.valid || (gpu_req_if.valid && ~is_tex); + assign rsp_wid = tex_rsp_if.valid ? tex_rsp_if.wid : gpu_req_if.wid; + assign rsp_tmask = tex_rsp_if.valid ? 
tex_rsp_if.tmask : gpu_req_if.tmask; + assign rsp_PC = tex_rsp_if.valid ? tex_rsp_if.PC : gpu_req_if.PC; + assign rsp_rd = tex_rsp_if.rd; + assign rsp_wb = tex_rsp_if.valid && tex_rsp_if.wb; + assign rsp_data = tex_rsp_if.valid ? RSP_DATAW'(tex_rsp_if.data) : RSP_DATAW'(warp_ctl_data); + +`else + + `UNUSED_VAR (gpu_req_if.op_mod) + `UNUSED_VAR (gpu_req_if.rs3_data) + `UNUSED_VAR (gpu_req_if.wb) + `UNUSED_VAR (gpu_req_if.rd) + + assign stall_in = stall_out; + assign is_warp_ctl = 1; + + assign rsp_valid = gpu_req_if.valid; + assign rsp_wid = gpu_req_if.wid; + assign rsp_tmask = gpu_req_if.tmask; + assign rsp_PC = gpu_req_if.PC; + assign rsp_rd = 0; + assign rsp_wb = 0; + assign rsp_data = RSP_DATAW'(warp_ctl_data); + +`endif + + wire is_warp_ctl_r; // output - - wire stall = ~gpu_commit_if.ready && gpu_commit_if.valid; + assign stall_out = ~gpu_commit_if.ready && gpu_commit_if.valid; VX_pipe_register #( - .DATAW (1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + `GPU_TMC_BITS + `GPU_WSPAWN_BITS + `GPU_SPLIT_BITS + `GPU_BARRIER_BITS), + .DATAW (1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + RSP_DATAW + 1), .RESETW (1) ) pipe_reg ( .clk (clk), .reset (reset), - .enable (!stall), - .data_in ({gpu_req_if.valid, gpu_req_if.wid, gpu_req_if.tmask, gpu_req_if.PC, gpu_req_if.rd, gpu_req_if.wb, tmc, wspawn, split, barrier}), - .data_out ({gpu_commit_if.valid, gpu_commit_if.wid, gpu_commit_if.tmask, gpu_commit_if.PC, gpu_commit_if.rd, gpu_commit_if.wb, warp_ctl_if.tmc, warp_ctl_if.wspawn, warp_ctl_if.split, warp_ctl_if.barrier}) - ); + .enable (!stall_out), + .data_in ({rsp_valid, rsp_wid, rsp_tmask, rsp_PC, rsp_rd, rsp_wb, rsp_data, is_warp_ctl}), + .data_out ({gpu_commit_if.valid, gpu_commit_if.wid, gpu_commit_if.tmask, gpu_commit_if.PC, gpu_commit_if.rd, gpu_commit_if.wb, rsp_data_r, is_warp_ctl_r}) + ); - assign gpu_commit_if.eop = 1'b1; + assign gpu_commit_if.data = rsp_data_r[(`NUM_THREADS * 32)-1:0]; + assign gpu_commit_if.eop = 1'b1; - assign warp_ctl_if.valid = 
gpu_commit_if.valid && gpu_commit_if.ready; - assign warp_ctl_if.wid = gpu_commit_if.wid; + // warp control reponse + + assign {warp_ctl_if.tmc, warp_ctl_if.wspawn, warp_ctl_if.split, warp_ctl_if.barrier} = rsp_data_r[WCTL_DATAW-1:0]; + + assign warp_ctl_if.valid = gpu_commit_if.valid && gpu_commit_if.ready && is_warp_ctl_r; + assign warp_ctl_if.wid = gpu_commit_if.wid; // can accept new request? - assign gpu_req_if.ready = ~stall; + assign gpu_req_if.ready = ~stall_in; `SCOPE_ASSIGN (gpu_rsp_valid, warp_ctl_if.valid); `SCOPE_ASSIGN (gpu_rsp_wid, warp_ctl_if.wid); diff --git a/hw/rtl/VX_icache_stage.sv b/hw/rtl/VX_icache_stage.sv index 96ab2531..cb33b82d 100644 --- a/hw/rtl/VX_icache_stage.sv +++ b/hw/rtl/VX_icache_stage.sv @@ -88,7 +88,7 @@ module VX_icache_stage #( `SCOPE_ASSIGN (icache_rsp_data, icache_rsp_if.data); `SCOPE_ASSIGN (icache_rsp_tag, rsp_tag); -`ifdef DBG_PRINT_CORE_ICACHE +`ifdef DBG_TRACE_CORE_ICACHE always @(posedge clk) begin if (icache_req_if.valid && icache_req_if.ready) begin dpi_trace("%d: I$%0d req: wid=%0d, PC=%0h\n", $time, CORE_ID, ifetch_req_if.wid, ifetch_req_if.PC); diff --git a/hw/rtl/VX_issue.sv b/hw/rtl/VX_issue.sv index c59ed39e..abbb5241 100644 --- a/hw/rtl/VX_issue.sv +++ b/hw/rtl/VX_issue.sv @@ -23,56 +23,60 @@ module VX_issue #( `endif VX_gpu_req_if.master gpu_req_if ); - VX_ibuffer_if ibuffer_if(); - VX_gpr_rsp_if gpr_rsp_if(); - - VX_gpr_req_if gpr_req_if(); - assign gpr_req_if.wid = ibuffer_if.wid; - assign gpr_req_if.rs1 = ibuffer_if.rs1; - assign gpr_req_if.rs2 = ibuffer_if.rs2; - assign gpr_req_if.rs3 = ibuffer_if.rs3; - + VX_ibuffer_if ibuffer_if(); + VX_gpr_req_if gpr_req_if(); + VX_gpr_rsp_if gpr_rsp_if(); VX_writeback_if sboard_wb_if(); - assign sboard_wb_if.valid = writeback_if.valid; - assign sboard_wb_if.wid = writeback_if.wid; - assign sboard_wb_if.PC = writeback_if.PC; - assign sboard_wb_if.rd = writeback_if.rd; - assign sboard_wb_if.eop = writeback_if.eop; - assign sboard_wb_if.ready = writeback_if.ready; - - 
VX_ibuffer_if sboard_ib_if(); - assign sboard_ib_if.valid = ibuffer_if.valid && idmux_ib_if.ready; - assign sboard_ib_if.wid = ibuffer_if.wid; - assign sboard_ib_if.PC = ibuffer_if.PC; - assign sboard_ib_if.wb = ibuffer_if.wb; - assign sboard_ib_if.rd = ibuffer_if.rd; - assign sboard_ib_if.rd_n = ibuffer_if.rd_n; - assign sboard_ib_if.rs1_n = ibuffer_if.rs1_n; - assign sboard_ib_if.rs2_n = ibuffer_if.rs2_n; - assign sboard_ib_if.rs3_n = ibuffer_if.rs3_n; - assign sboard_ib_if.wid_n = ibuffer_if.wid_n; + VX_ibuffer_if scoreboard_if(); + VX_ibuffer_if dispatch_if(); - VX_ibuffer_if idmux_ib_if(); - assign idmux_ib_if.valid = ibuffer_if.valid && sboard_ib_if.ready; - assign idmux_ib_if.wid = ibuffer_if.wid; - assign idmux_ib_if.tmask = ibuffer_if.tmask; - assign idmux_ib_if.PC = ibuffer_if.PC; - assign idmux_ib_if.ex_type = ibuffer_if.ex_type; - assign idmux_ib_if.op_type = ibuffer_if.op_type; - assign idmux_ib_if.op_mod = ibuffer_if.op_mod; - assign idmux_ib_if.wb = ibuffer_if.wb; - assign idmux_ib_if.rd = ibuffer_if.rd; - assign idmux_ib_if.rs1 = ibuffer_if.rs1; - assign idmux_ib_if.imm = ibuffer_if.imm; - assign idmux_ib_if.use_PC = ibuffer_if.use_PC; - assign idmux_ib_if.use_imm = ibuffer_if.use_imm; + // GPR request interface + assign gpr_req_if.wid = ibuffer_if.wid; + assign gpr_req_if.rs1 = ibuffer_if.rs1; + assign gpr_req_if.rs2 = ibuffer_if.rs2; + assign gpr_req_if.rs3 = ibuffer_if.rs3; + + // scoreboard writeback interface + assign sboard_wb_if.valid = writeback_if.valid; + assign sboard_wb_if.wid = writeback_if.wid; + assign sboard_wb_if.PC = writeback_if.PC; + assign sboard_wb_if.rd = writeback_if.rd; + assign sboard_wb_if.eop = writeback_if.eop; + + // scoreboard interface + assign scoreboard_if.valid = ibuffer_if.valid && dispatch_if.ready; + assign scoreboard_if.wid = ibuffer_if.wid; + assign scoreboard_if.PC = ibuffer_if.PC; + assign scoreboard_if.wb = ibuffer_if.wb; + assign scoreboard_if.rd = ibuffer_if.rd; + assign scoreboard_if.rd_n = 
ibuffer_if.rd_n; + assign scoreboard_if.rs1_n = ibuffer_if.rs1_n; + assign scoreboard_if.rs2_n = ibuffer_if.rs2_n; + assign scoreboard_if.rs3_n = ibuffer_if.rs3_n; + assign scoreboard_if.wid_n = ibuffer_if.wid_n; + + // dispatch interface + assign dispatch_if.valid = ibuffer_if.valid && scoreboard_if.ready; + assign dispatch_if.wid = ibuffer_if.wid; + assign dispatch_if.tmask = ibuffer_if.tmask; + assign dispatch_if.PC = ibuffer_if.PC; + assign dispatch_if.ex_type = ibuffer_if.ex_type; + assign dispatch_if.op_type = ibuffer_if.op_type; + assign dispatch_if.op_mod = ibuffer_if.op_mod; + assign dispatch_if.wb = ibuffer_if.wb; + assign dispatch_if.rd = ibuffer_if.rd; + assign dispatch_if.rs1 = ibuffer_if.rs1; + assign dispatch_if.imm = ibuffer_if.imm; + assign dispatch_if.use_PC = ibuffer_if.use_PC; + assign dispatch_if.use_imm = ibuffer_if.use_imm; // issue the instruction - assign ibuffer_if.ready = sboard_ib_if.ready && idmux_ib_if.ready; + assign ibuffer_if.ready = scoreboard_if.ready && dispatch_if.ready; `RESET_RELAY (ibuf_reset); + `RESET_RELAY (scoreboard_reset); `RESET_RELAY (gpr_reset); - `RESET_RELAY (demux_reset); + `RESET_RELAY (dispatch_reset); VX_ibuffer #( .CORE_ID(CORE_ID) @@ -87,9 +91,9 @@ module VX_issue #( .CORE_ID(CORE_ID) ) scoreboard ( .clk (clk), - .reset (reset), - .ibuffer_if (sboard_ib_if), - .writeback_if(sboard_wb_if) + .reset (scoreboard_reset), + .writeback_if(sboard_wb_if), + .ibuffer_if (scoreboard_if) ); VX_gpr_stage #( @@ -102,10 +106,10 @@ module VX_issue #( .gpr_rsp_if (gpr_rsp_if) ); - VX_instr_demux instr_demux ( + VX_dispatch dispatch ( .clk (clk), - .reset (demux_reset), - .ibuffer_if (idmux_ib_if), + .reset (dispatch_reset), + .ibuffer_if (dispatch_if), .gpr_rsp_if (gpr_rsp_if), .alu_req_if (alu_req_if), .lsu_req_if (lsu_req_if), @@ -131,11 +135,11 @@ module VX_issue #( `SCOPE_ASSIGN (issue_imm, ibuffer_if.imm); `SCOPE_ASSIGN (issue_use_pc, ibuffer_if.use_PC); `SCOPE_ASSIGN (issue_use_imm, ibuffer_if.use_imm); - `SCOPE_ASSIGN 
(scoreboard_delay, !sboard_wb_if.ready); - `SCOPE_ASSIGN (execute_delay, !idmux_ib_if.ready); - `SCOPE_ASSIGN (gpr_rsp_a, gpr_rsp_if.rs1_data); - `SCOPE_ASSIGN (gpr_rsp_b, gpr_rsp_if.rs2_data); - `SCOPE_ASSIGN (gpr_rsp_c, gpr_rsp_if.rs3_data); + `SCOPE_ASSIGN (scoreboard_delay, !scoreboard_if.ready); + `SCOPE_ASSIGN (dispatch_delay, !dispatch_if.ready); + `SCOPE_ASSIGN (gpr_rs1, gpr_rsp_if.rs1_data); + `SCOPE_ASSIGN (gpr_rs2, gpr_rsp_if.rs2_data); + `SCOPE_ASSIGN (gpr_rs3, gpr_rsp_if.rs3_data); `SCOPE_ASSIGN (writeback_valid, writeback_if.valid); `SCOPE_ASSIGN (writeback_tmask, writeback_if.tmask); `SCOPE_ASSIGN (writeback_wid, writeback_if.wid); @@ -170,7 +174,7 @@ module VX_issue #( if (decode_if.valid & !decode_if.ready) begin perf_ibf_stalls <= perf_ibf_stalls + `PERF_CTR_BITS'd1; end - if (ibuffer_if.valid & !sboard_wb_if.ready) begin + if (scoreboard_if.valid & !scoreboard_if.ready) begin perf_scb_stalls <= perf_scb_stalls + `PERF_CTR_BITS'd1; end if (alu_req_if.valid & !alu_req_if.ready) begin @@ -204,7 +208,7 @@ module VX_issue #( `endif `endif -`ifdef DBG_PRINT_PIPELINE +`ifdef DBG_TRACE_PIPELINE always @(posedge clk) begin if (alu_req_if.valid && alu_req_if.ready) begin dpi_trace("%d: core%0d-issue: wid=%0d, PC=%0h, ex=ALU, tmask=%b, rd=%0d, rs1_data=", @@ -246,6 +250,8 @@ module VX_issue #( `TRACE_ARRAY1D(gpu_req_if.rs1_data, `NUM_THREADS); dpi_trace(", rs2_data="); `TRACE_ARRAY1D(gpu_req_if.rs2_data, `NUM_THREADS); + dpi_trace(", rs3_data="); + `TRACE_ARRAY1D(gpu_req_if.rs3_data, `NUM_THREADS); dpi_trace("\n"); end end diff --git a/hw/rtl/VX_lsu_unit.sv b/hw/rtl/VX_lsu_unit.sv index b636158b..8541f4c6 100644 --- a/hw/rtl/VX_lsu_unit.sv +++ b/hw/rtl/VX_lsu_unit.sv @@ -24,7 +24,7 @@ module VX_lsu_unit #( localparam REQ_ASHIFT = `CLOG2(`DCACHE_WORD_SIZE); - localparam ADDR_TYPEW = `NC_FLAG_BITS + `SM_ENABLE; + localparam ADDR_TYPEW = `NC_TAG_BIT + `SM_ENABLE; `STATIC_ASSERT(0 == (`IO_BASE_ADDR % MEM_ASHIFT), ("invalid parameter")) `STATIC_ASSERT(0 == 
(`SMEM_BASE_ADDR % MEM_ASHIFT), ("invalid parameter")) @@ -80,6 +80,8 @@ module VX_lsu_unit #( wire lsu_valid = lsu_req_if.valid && ~fence_wait; + wire lsu_wb = lsu_req_if.wb | lsu_req_if.is_prefetch; + VX_pipe_register #( .DATAW (1 + 1 + 1 + `NW_BITS + `NUM_THREADS + 32 + (`NUM_THREADS * 32) + (`NUM_THREADS * ADDR_TYPEW) + `INST_LSU_BITS + `NR_BITS + 1 + (`NUM_THREADS * 32)), .RESETW (1) @@ -87,8 +89,8 @@ module VX_lsu_unit #( .clk (clk), .reset (reset), .enable (!stall_in), - .data_in ({lsu_valid, lsu_is_dup, lsu_req_if.is_prefetch, lsu_req_if.wid, lsu_req_if.tmask, lsu_req_if.PC, full_addr, lsu_addr_type, lsu_req_if.op_type, lsu_req_if.rd, lsu_req_if.wb | lsu_req_if.is_prefetch, lsu_req_if.store_data}), - .data_out ({req_valid, req_is_dup, req_is_prefetch, req_wid, req_tmask, req_pc, req_addr, req_addr_type, req_type, req_rd, req_wb, req_data}) + .data_in ({lsu_valid, lsu_is_dup, lsu_req_if.is_prefetch, lsu_req_if.wid, lsu_req_if.tmask, lsu_req_if.PC, full_addr, lsu_addr_type, lsu_req_if.op_type, lsu_req_if.rd, lsu_wb, lsu_req_if.store_data}), + .data_out ({req_valid, req_is_dup, req_is_prefetch, req_wid, req_tmask, req_pc, req_addr, req_addr_type, req_type, req_rd, req_wb, req_data}) ); // Can accept new request? 
@@ -103,6 +105,7 @@ module VX_lsu_unit #( wire rsp_is_prefetch; `UNUSED_VAR (rsp_type) + `UNUSED_VAR (rsp_is_prefetch) reg [`LSUQ_SIZE-1:0][`NUM_THREADS-1:0] rsp_rem_mask; wire [`NUM_THREADS-1:0] rsp_rem_mask_n; @@ -132,7 +135,11 @@ module VX_lsu_unit #( wire mbuf_pop = dcache_rsp_fire && (0 == rsp_rem_mask_n); - assign mbuf_raddr = dcache_rsp_if.tag[ADDR_TYPEW +: `LSUQ_ADDR_BITS]; + assign mbuf_raddr = dcache_rsp_if.tag[ADDR_TYPEW +: `LSUQ_ADDR_BITS]; + `UNUSED_VAR (dcache_rsp_if.tag) + + // do not writeback from software prefetch + wire req_wb2 = req_wb && ~req_is_prefetch; VX_index_buffer #( .DATAW (`NW_BITS + 32 + `NUM_THREADS + `NR_BITS + 1 + `INST_LSU_BITS + (`NUM_THREADS * REQ_ASHIFT) + 1 + 1), @@ -143,8 +150,8 @@ module VX_lsu_unit #( .write_addr (mbuf_waddr), .acquire_slot (mbuf_push), .read_addr (mbuf_raddr), - .write_data ({req_wid, req_pc, req_tmask, req_rd, req_wb, req_type, req_offset, req_is_dup, req_is_prefetch}), - .read_data ({rsp_wid, rsp_pc, rsp_tmask, rsp_rd, rsp_wb, rsp_type, rsp_offset, rsp_is_dup, rsp_is_prefetch}), + .write_data ({req_wid, req_pc, req_tmask, req_rd, req_wb2, req_type, req_offset, req_is_dup, req_is_prefetch}), + .read_data ({rsp_wid, rsp_pc, rsp_tmask, rsp_rd, rsp_wb, rsp_type, rsp_offset, rsp_is_dup, rsp_is_prefetch}), .release_addr (mbuf_raddr), .release_slot (mbuf_pop), .full (mbuf_full), @@ -276,8 +283,6 @@ module VX_lsu_unit #( // send load commit - // ignore responce from software prefetch - wire rsp_valid = (rsp_is_prefetch)? 
0:(| dcache_rsp_if.valid); wire load_rsp_stall = ~ld_commit_if.ready && ld_commit_if.valid; VX_pipe_register #( @@ -287,12 +292,12 @@ module VX_lsu_unit #( .clk (clk), .reset (reset), .enable (!load_rsp_stall), - .data_in ({rsp_valid, rsp_wid, rsp_tmask_qual, rsp_pc, rsp_rd, rsp_wb, rsp_data, mbuf_pop}), + .data_in ({dcache_rsp_if.valid, rsp_wid, rsp_tmask_qual, rsp_pc, rsp_rd, rsp_wb, rsp_data, mbuf_pop}), .data_out ({ld_commit_if.valid, ld_commit_if.wid, ld_commit_if.tmask, ld_commit_if.PC, ld_commit_if.rd, ld_commit_if.wb, ld_commit_if.data, ld_commit_if.eop}) ); // Can accept new cache response? - assign dcache_rsp_if.ready = rsp_is_prefetch ? 1 : ~load_rsp_stall; + assign dcache_rsp_if.ready = ~load_rsp_stall; // scope registration `SCOPE_ASSIGN (dcache_req_fire, dcache_req_fire); @@ -333,7 +338,7 @@ module VX_lsu_unit #( end `endif -`ifdef DBG_PRINT_CORE_DCACHE +`ifdef DBG_TRACE_CORE_DCACHE wire dcache_req_fire_any = (| dcache_req_fire); always @(posedge clk) begin if (lsu_req_if.valid && fence_wait) begin @@ -349,7 +354,7 @@ module VX_lsu_unit #( `TRACE_ARRAY1D(dcache_req_if.data, `NUM_THREADS); dpi_trace("\n"); end else begin - dpi_trace("%d: D$%0d Rd Req: req_is_prefetch=%b, wid=%0d, PC=%0h, tmask=%b, addr=", $time, CORE_ID, req_is_prefetch, req_wid, req_pc, dcache_req_fire); + dpi_trace("%d: D$%0d Rd Req: prefetch=%b, wid=%0d, PC=%0h, tmask=%b, addr=", $time, CORE_ID, req_is_prefetch, req_wid, req_pc, dcache_req_fire); `TRACE_ARRAY1D(req_addr, `NUM_THREADS); dpi_trace(", tag=%0h, byteen=%0h, type=", req_tag, dcache_req_if.byteen); `TRACE_ARRAY1D(req_addr_type, `NUM_THREADS); @@ -357,7 +362,7 @@ module VX_lsu_unit #( end end if (dcache_rsp_fire) begin - dpi_trace("%d: D$%0d Rsp: rsp_is_prefetch=%b, wid=%0d, PC=%0h, tmask=%b, tag=%0h, rd=%0d, data=", + dpi_trace("%d: D$%0d Rsp: prefetch=%b, wid=%0d, PC=%0h, tmask=%b, tag=%0h, rd=%0d, data=", $time, CORE_ID, rsp_is_prefetch, rsp_wid, rsp_pc, dcache_rsp_if.tmask, mbuf_raddr, rsp_rd); 
`TRACE_ARRAY1D(dcache_rsp_if.data, `NUM_THREADS); dpi_trace(", is_dup=%b\n", rsp_is_dup); diff --git a/hw/rtl/VX_mem_unit.sv b/hw/rtl/VX_mem_unit.sv index 7c53ccc8..56de47ef 100644 --- a/hw/rtl/VX_mem_unit.sv +++ b/hw/rtl/VX_mem_unit.sv @@ -206,6 +206,7 @@ module VX_mem_unit # ( .LANES (`NUM_THREADS), .DATA_SIZE (4), .TAG_IN_WIDTH (`DCACHE_CORE_TAG_WIDTH), + .TAG_SEL_IDX (0), // SM flag .TYPE ("P"), .BUFFERED_REQ (2), .BUFFERED_RSP (1) diff --git a/hw/rtl/VX_platform.vh b/hw/rtl/VX_platform.vh index 202da95c..908428b7 100644 --- a/hw/rtl/VX_platform.vh +++ b/hw/rtl/VX_platform.vh @@ -119,9 +119,9 @@ `define UP(x) (((x) > 0) ? (x) : 1) -`define RTRIM(x,s) x[$bits(x)-1:($bits(x)-s)] +`define RTRIM(x, s) x[$bits(x)-1:($bits(x)-s)] -`define LTRIM(x,s) x[s-1:0] +`define LTRIM(x, s) x[s-1:0] `define TRACE_ARRAY1D(a, m) \ dpi_trace("{"); \ diff --git a/hw/rtl/VX_scoreboard.sv b/hw/rtl/VX_scoreboard.sv index 9503ecdf..6ba4e998 100644 --- a/hw/rtl/VX_scoreboard.sv +++ b/hw/rtl/VX_scoreboard.sv @@ -6,8 +6,8 @@ module VX_scoreboard #( input wire clk, input wire reset, - VX_ibuffer_if.scoreboard ibuffer_if, - VX_writeback_if.scoreboard writeback_if + VX_ibuffer_if.slave ibuffer_if, + VX_writeback_if.slave writeback_if ); reg [`NUM_WARPS-1:0][`NUM_REGS-1:0] inuse_regs, inuse_regs_n; @@ -53,11 +53,12 @@ module VX_scoreboard #( reg [31:0] deadlock_ctr; wire [31:0] deadlock_timeout = 10000 * (1 ** (`L2_ENABLE + `L3_ENABLE)); + always @(posedge clk) begin if (reset) begin deadlock_ctr <= 0; end else begin - `ifdef DBG_PRINT_PIPELINE + `ifdef DBG_TRACE_PIPELINE if (ibuffer_if.valid && ~ibuffer_if.ready) begin dpi_trace("%d: *** core%0d-stall: wid=%0d, PC=%0h, rd=%0d, wb=%0d, inuse=%b%b%b%b\n", $time, CORE_ID, ibuffer_if.wid, ibuffer_if.PC, ibuffer_if.rd, ibuffer_if.wb, diff --git a/hw/rtl/VX_print_instr.vh b/hw/rtl/VX_trace_instr.vh similarity index 97% rename from hw/rtl/VX_print_instr.vh rename to hw/rtl/VX_trace_instr.vh index 614a7d46..e228179e 100644 --- 
a/hw/rtl/VX_print_instr.vh +++ b/hw/rtl/VX_trace_instr.vh @@ -1,9 +1,9 @@ -`ifndef VX_PRINT_INSTR -`define VX_PRINT_INSTR +`ifndef VX_TRACE_INSTR +`define VX_TRACE_INSTR `include "VX_define.vh" -task print_ex_type ( +task trace_ex_type ( input [`EX_BITS-1:0] ex_type ); case (ex_type) @@ -16,7 +16,7 @@ task print_ex_type ( endcase endtask -task print_ex_op ( +task trace_ex_op ( input [`EX_BITS-1:0] ex_type, input [`INST_OP_BITS-1:0] op_type, input [`INST_MOD_BITS-1:0] op_mod @@ -137,6 +137,7 @@ task print_ex_op ( `INST_GPU_JOIN: dpi_trace("JOIN"); `INST_GPU_BAR: dpi_trace("BAR"); `INST_GPU_PRED: dpi_trace("PRED"); + `INST_GPU_TEX: dpi_trace("TEX"); default: dpi_trace("?"); endcase end diff --git a/hw/rtl/VX_warp_sched.sv b/hw/rtl/VX_warp_sched.sv index 9495c001..979a3536 100644 --- a/hw/rtl/VX_warp_sched.sv +++ b/hw/rtl/VX_warp_sched.sv @@ -71,8 +71,8 @@ module VX_warp_sched #( // activate first warp warp_pcs[0] <= `STARTUP_ADDR; - active_warps[0] <= '1; - thread_masks[0] <= '1; + active_warps[0] <= 1; + thread_masks[0] <= 1; end else begin if (warp_ctl_if.valid && warp_ctl_if.wspawn.valid) begin use_wspawn <= warp_ctl_if.wspawn.wmask & (~`NUM_WARPS'(1)); diff --git a/hw/rtl/VX_writeback.sv b/hw/rtl/VX_writeback.sv index fb2bfcb7..cdf7f988 100644 --- a/hw/rtl/VX_writeback.sv +++ b/hw/rtl/VX_writeback.sv @@ -12,7 +12,8 @@ module VX_writeback #( VX_commit_if.slave csr_commit_if, `ifdef EXT_F_ENABLE VX_commit_if.slave fpu_commit_if, -`endif +`endif + VX_commit_if.slave gpu_commit_if, // outputs VX_writeback_if.master writeback_if @@ -22,9 +23,17 @@ module VX_writeback #( localparam DATAW = `NW_BITS + 32 + `NUM_THREADS + `NR_BITS + (`NUM_THREADS * 32) + 1; `ifdef EXT_F_ENABLE +`ifdef EXT_TEX_ENABLE + localparam NUM_RSPS = 5; +`else + localparam NUM_RSPS = 4; +`endif +`else +`ifdef EXT_TEX_ENABLE localparam NUM_RSPS = 4; `else localparam NUM_RSPS = 3; +`endif `endif wire wb_valid; @@ -40,22 +49,27 @@ module VX_writeback #( wire [NUM_RSPS-1:0] rsp_ready; wire stall; - 
assign rsp_valid = { + assign rsp_valid = { + `ifdef EXT_TEX_ENABLE + gpu_commit_if.valid && gpu_commit_if.wb, + `endif csr_commit_if.valid && csr_commit_if.wb, - alu_commit_if.valid && alu_commit_if.wb, - + alu_commit_if.valid && alu_commit_if.wb, `ifdef EXT_F_ENABLE fpu_commit_if.valid && fpu_commit_if.wb, `endif ld_commit_if.valid && ld_commit_if.wb }; - assign rsp_data = { + assign rsp_data = { + `ifdef EXT_TEX_ENABLE + {gpu_commit_if.wid, gpu_commit_if.PC, gpu_commit_if.tmask, gpu_commit_if.rd, gpu_commit_if.data, gpu_commit_if.eop}, + `endif {csr_commit_if.wid, csr_commit_if.PC, csr_commit_if.tmask, csr_commit_if.rd, csr_commit_if.data, csr_commit_if.eop}, {alu_commit_if.wid, alu_commit_if.PC, alu_commit_if.tmask, alu_commit_if.rd, alu_commit_if.data, alu_commit_if.eop}, `ifdef EXT_F_ENABLE {fpu_commit_if.wid, fpu_commit_if.PC, fpu_commit_if.tmask, fpu_commit_if.rd, fpu_commit_if.data, fpu_commit_if.eop}, - `endif + `endif { ld_commit_if.wid, ld_commit_if.PC, ld_commit_if.tmask, ld_commit_if.rd, ld_commit_if.data, ld_commit_if.eop} }; @@ -82,8 +96,20 @@ module VX_writeback #( `else assign alu_commit_if.ready = rsp_ready[1] || ~alu_commit_if.wb; assign csr_commit_if.ready = rsp_ready[2] || ~csr_commit_if.wb; +`ifdef EXT_TEX_ENABLE + assign gpu_commit_if.ready = rsp_ready[3] || ~gpu_commit_if.wb; +`endif +`endif + +`ifdef EXT_TEX_ENABLE +`ifdef EXT_F_ENABLE + assign gpu_commit_if.ready = rsp_ready[4] || ~gpu_commit_if.wb; +`else + assign gpu_commit_if.ready = rsp_ready[3] || ~gpu_commit_if.wb; +`endif +`else + assign gpu_commit_if.ready = 1; `endif - assign stall = ~writeback_if.ready && writeback_if.valid; diff --git a/hw/rtl/Vortex.sv b/hw/rtl/Vortex.sv index 03469568..96b5602b 100644 --- a/hw/rtl/Vortex.sv +++ b/hw/rtl/Vortex.sv @@ -201,7 +201,7 @@ module Vortex ( `SCOPE_ASSIGN (mem_rsp_tag, mem_rsp_tag); `SCOPE_ASSIGN (busy, busy); -`ifdef DBG_PRINT_MEM +`ifdef DBG_TRACE_MEM always @(posedge clk) begin if (mem_req_valid && mem_req_ready) begin if 
(mem_req_rw) diff --git a/hw/rtl/afu/VX_avs_wrapper.sv b/hw/rtl/afu/VX_avs_wrapper.sv index d6aaf890..755cdf05 100644 --- a/hw/rtl/afu/VX_avs_wrapper.sv +++ b/hw/rtl/afu/VX_avs_wrapper.sv @@ -158,7 +158,7 @@ module VX_avs_wrapper #( .ready_out (mem_rsp_ready) ); -`ifdef DBG_PRINT_AVS +`ifdef DBG_TRACE_AVS always @(posedge clk) begin if (mem_req_valid && mem_req_ready) begin if (mem_req_rw) begin diff --git a/hw/rtl/afu/vortex_afu.sv b/hw/rtl/afu/vortex_afu.sv index 696725e2..22d1ec29 100644 --- a/hw/rtl/afu/vortex_afu.sv +++ b/hw/rtl/afu/vortex_afu.sv @@ -45,12 +45,14 @@ localparam CCI_DATA_WIDTH = $bits(t_ccip_clData); localparam CCI_DATA_SIZE = CCI_DATA_WIDTH / 8; localparam CCI_ADDR_WIDTH = 32 - $clog2(CCI_DATA_SIZE); + localparam AVS_RD_QUEUE_SIZE = 4; -localparam AVS_REQ_TAGW_VX_ = `VX_MEM_TAG_WIDTH + $clog2(LMEM_DATA_WIDTH) - $clog2(`VX_MEM_DATA_WIDTH); -localparam AVS_REQ_TAGW_VX = `MAX(`VX_MEM_TAG_WIDTH, AVS_REQ_TAGW_VX_); -localparam AVS_REQ_TAGW_CCI_ = CCI_ADDR_WIDTH + $clog2(LMEM_DATA_WIDTH) - $clog2(CCI_DATA_WIDTH); -localparam AVS_REQ_TAGW_CCI = `MAX(CCI_ADDR_WIDTH, AVS_REQ_TAGW_CCI_); -localparam AVS_REQ_TAGW = `MAX(AVS_REQ_TAGW_VX, AVS_REQ_TAGW_CCI); +localparam _VX_MEM_TAG_WIDTH = `VX_MEM_TAG_WIDTH; +localparam _AVS_REQ_TAGW_VX = _VX_MEM_TAG_WIDTH + $clog2(LMEM_DATA_WIDTH) - $clog2(`VX_MEM_DATA_WIDTH); +localparam _AVS_REQ_TAGW_VX2 = `MAX(_VX_MEM_TAG_WIDTH, _AVS_REQ_TAGW_VX); +localparam _AVS_REQ_TAGW_CCI = CCI_ADDR_WIDTH + $clog2(LMEM_DATA_WIDTH) - $clog2(CCI_DATA_WIDTH); +localparam _AVS_REQ_TAGW_CCI2 = `MAX(CCI_ADDR_WIDTH, _AVS_REQ_TAGW_CCI); +localparam AVS_REQ_TAGW = `MAX(_AVS_REQ_TAGW_VX2, _AVS_REQ_TAGW_CCI2); localparam CCI_RD_WINDOW_SIZE = 8; localparam CCI_RW_PENDING_SIZE= 256; @@ -185,36 +187,36 @@ always @(posedge clk) begin case (mmio_hdr.address) MMIO_IO_ADDR: begin cmd_io_addr <= t_ccip_clAddr'(cp2af_sRxPort.c0.data); - `ifdef DBG_PRINT_OPAE + `ifdef DBG_TRACE_OPAE dpi_trace("%d: MMIO_IO_ADDR: addr=%0h, data=0x%0h\n", $time, 
mmio_hdr.address, t_ccip_clAddr'(cp2af_sRxPort.c0.data)); `endif end MMIO_MEM_ADDR: begin cmd_mem_addr <= $bits(cmd_mem_addr)'(cp2af_sRxPort.c0.data); - `ifdef DBG_PRINT_OPAE + `ifdef DBG_TRACE_OPAE dpi_trace("%d: MMIO_MEM_ADDR: addr=%0h, data=0x%0h\n", $time, mmio_hdr.address, $bits(cmd_mem_addr)'(cp2af_sRxPort.c0.data)); `endif end MMIO_DATA_SIZE: begin cmd_data_size <= $bits(cmd_data_size)'(cp2af_sRxPort.c0.data); - `ifdef DBG_PRINT_OPAE + `ifdef DBG_TRACE_OPAE dpi_trace("%d: MMIO_DATA_SIZE: addr=%0h, data=%0d\n", $time, mmio_hdr.address, $bits(cmd_data_size)'(cp2af_sRxPort.c0.data)); `endif end MMIO_CMD_TYPE: begin - `ifdef DBG_PRINT_OPAE + `ifdef DBG_TRACE_OPAE dpi_trace("%d: MMIO_CMD_TYPE: addr=%0h, data=%0d\n", $time, mmio_hdr.address, $bits(cmd_type)'(cp2af_sRxPort.c0.data)); `endif end `ifdef SCOPE MMIO_SCOPE_WRITE: begin - `ifdef DBG_PRINT_OPAE + `ifdef DBG_TRACE_OPAE dpi_trace("%d: MMIO_SCOPE_WRITE: addr=%0h, data=%0h\n", $time, mmio_hdr.address, 64'(cp2af_sRxPort.c0.data)); `endif end `endif default: begin - `ifdef DBG_PRINT_OPAE + `ifdef DBG_TRACE_OPAE dpi_trace("%d: Unknown MMIO Wr: addr=%0h, data=%0h\n", $time, mmio_hdr.address, $bits(cmd_data_size)'(cp2af_sRxPort.c0.data)); `endif end @@ -241,7 +243,7 @@ always @(posedge clk) begin 16'h0008: mmio_tx.data <= 64'h0; // reserved MMIO_STATUS: begin mmio_tx.data <= 64'({cout_q_dout, !cout_q_empty, 8'(state)}); - `ifdef DBG_PRINT_OPAE + `ifdef DBG_TRACE_OPAE if (state != STATE_WIDTH'(mmio_tx.data)) begin dpi_trace("%d: MMIO_STATUS: addr=%0h, state=%0d\n", $time, mmio_hdr.address, state); end @@ -250,20 +252,20 @@ always @(posedge clk) begin `ifdef SCOPE MMIO_SCOPE_READ: begin mmio_tx.data <= cmd_scope_rdata; - `ifdef DBG_PRINT_OPAE + `ifdef DBG_TRACE_OPAE dpi_trace("%d: MMIO_SCOPE_READ: addr=%0h, data=%0h\n", $time, mmio_hdr.address, cmd_scope_rdata); `endif end `endif MMIO_DEV_CAPS: begin mmio_tx.data <= dev_caps; - `ifdef DBG_PRINT_OPAE + `ifdef DBG_TRACE_OPAE dpi_trace("%d: MMIO_DEV_CAPS: addr=%0h, 
data=%0h\n", $time, mmio_hdr.address, dev_caps); `endif end default: begin mmio_tx.data <= 64'h0; - `ifdef DBG_PRINT_OPAE + `ifdef DBG_TRACE_OPAE dpi_trace("%d: Unknown MMIO Rd: addr=%0h\n", $time, mmio_hdr.address); `endif end @@ -297,19 +299,19 @@ always @(posedge clk) begin STATE_IDLE: begin case (cmd_type) CMD_MEM_READ: begin - `ifdef DBG_PRINT_OPAE + `ifdef DBG_TRACE_OPAE dpi_trace("%d: STATE READ: ia=%0h addr=%0h size=%0d\n", $time, cmd_io_addr, cmd_mem_addr, cmd_data_size); `endif state <= STATE_READ; end CMD_MEM_WRITE: begin - `ifdef DBG_PRINT_OPAE + `ifdef DBG_TRACE_OPAE dpi_trace("%d: STATE WRITE: ia=%0h addr=%0h size=%0d\n", $time, cmd_io_addr, cmd_mem_addr, cmd_data_size); `endif state <= STATE_WRITE; end CMD_RUN: begin - `ifdef DBG_PRINT_OPAE + `ifdef DBG_TRACE_OPAE dpi_trace("%d: STATE START\n", $time); `endif vx_reset <= 1; @@ -324,7 +326,7 @@ always @(posedge clk) begin STATE_READ: begin if (cmd_read_done) begin state <= STATE_IDLE; - `ifdef DBG_PRINT_OPAE + `ifdef DBG_TRACE_OPAE dpi_trace("%d: STATE IDLE\n", $time); `endif end @@ -333,7 +335,7 @@ always @(posedge clk) begin STATE_WRITE: begin if (cmd_write_done) begin state <= STATE_IDLE; - `ifdef DBG_PRINT_OPAE + `ifdef DBG_TRACE_OPAE dpi_trace("%d: STATE IDLE\n", $time); `endif end @@ -345,7 +347,7 @@ always @(posedge clk) begin if (cmd_run_done) begin vx_started <= 0; state <= STATE_IDLE; - `ifdef DBG_PRINT_OPAE + `ifdef DBG_TRACE_OPAE dpi_trace("%d: STATE IDLE\n", $time); `endif end @@ -699,7 +701,7 @@ always @(posedge clk) begin if (cci_rd_req_fire) begin cci_rd_req_addr <= cci_rd_req_addr + 1; cci_rd_req_ctr <= cci_rd_req_ctr + 1; - `ifdef DBG_PRINT_OPAE + `ifdef DBG_TRACE_OPAE dpi_trace("%d: CCI Rd Req: addr=%0h, tag=%0h, rem=%0d, pending=%0d\n", $time, cci_rd_req_addr, cci_rd_req_tag, (cmd_data_size - cci_rd_req_ctr - 1), cci_pending_reads); `endif end @@ -709,13 +711,13 @@ always @(posedge clk) begin if (CCI_RD_QUEUE_TAGW'(cci_rd_rsp_ctr) == CCI_RD_QUEUE_TAGW'(CCI_RD_WINDOW_SIZE-1)) begin 
cci_mem_wr_req_addr_base <= cci_mem_wr_req_addr_base + CCI_ADDR_WIDTH'(CCI_RD_WINDOW_SIZE); end - `ifdef DBG_PRINT_OPAE + `ifdef DBG_TRACE_OPAE dpi_trace("%d: CCI Rd Rsp: idx=%0d, ctr=%0d, data=%0h\n", $time, cci_rd_rsp_tag, cci_rd_rsp_ctr, cp2af_sRxPort.c0.data); `endif end if (cci_rdq_pop) begin - `ifdef DBG_PRINT_OPAE + `ifdef DBG_TRACE_OPAE dpi_trace("%d: CCI Rd Queue Pop: pending=%0d\n", $time, cci_pending_reads); `endif end @@ -856,13 +858,13 @@ begin if (cci_wr_req_ctr == CCI_ADDR_WIDTH'(1)) begin cci_wr_req_done <= 1; end - `ifdef DBG_PRINT_OPAE + `ifdef DBG_TRACE_OPAE dpi_trace("%d: CCI Wr Req: addr=%0h, rem=%0d, pending=%0d, data=%0h\n", $time, cci_wr_req_addr, (cci_wr_req_ctr - 1), cci_pending_writes, af2cp_sTxPort.c1.data); `endif end if (cci_wr_rsp_fire) begin - `ifdef DBG_PRINT_OPAE + `ifdef DBG_TRACE_OPAE dpi_trace("%d: CCI Wr Rsp: pending=%0d\n", $time, cci_pending_writes); `endif end diff --git a/hw/rtl/cache/VX_bank.sv b/hw/rtl/cache/VX_bank.sv index 1f05ae04..14d50e29 100644 --- a/hw/rtl/cache/VX_bank.sv +++ b/hw/rtl/cache/VX_bank.sv @@ -509,7 +509,7 @@ module VX_bank #( assign perf_mshr_stalls = mshr_alm_full; `endif -`ifdef DBG_PRINT_CACHE_BANK +`ifdef DBG_TRACE_CACHE_BANK wire crsq_fire = crsq_valid && crsq_ready; wire pipeline_stall = (mshr_valid || mem_rsp_valid || creq_valid) && ~(mshr_fire || mem_rsp_fire || creq_fire); diff --git a/hw/rtl/cache/VX_cache_define.vh b/hw/rtl/cache/VX_cache_define.vh index fc1864e9..c0709cce 100644 --- a/hw/rtl/cache/VX_cache_define.vh +++ b/hw/rtl/cache/VX_cache_define.vh @@ -53,7 +53,7 @@ `define LINE_TAG_ADDR(x) x[`LINE_ADDR_WIDTH-1 : `LINE_SELECT_BITS] -`define CACHE_REQ_INFO_RNG CORE_TAG_WIDTH-1:(CORE_TAG_WIDTH-`NW_BITS-32) +`define CACHE_REQ_INFO_RNG CORE_TAG_WIDTH-1 : (CORE_TAG_WIDTH-`DBG_CACHE_REQ_MDATAW) /////////////////////////////////////////////////////////////////////////////// diff --git a/hw/rtl/cache/VX_data_access.sv b/hw/rtl/cache/VX_data_access.sv index 5b81140d..a1a5247b 100644 --- 
a/hw/rtl/cache/VX_data_access.sv +++ b/hw/rtl/cache/VX_data_access.sv @@ -119,7 +119,7 @@ module VX_data_access #( `UNUSED_VAR (stall) -`ifdef DBG_PRINT_CACHE_DATA +`ifdef DBG_TRACE_CACHE_DATA always @(posedge clk) begin if (fill && ~stall) begin dpi_trace("%d: cache%0d:%0d data-fill: addr=%0h, blk_addr=%0d, data=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr, BANK_ID), line_addr, fill_data); diff --git a/hw/rtl/cache/VX_miss_resrv.sv b/hw/rtl/cache/VX_miss_resrv.sv index 152a6702..bda63bb1 100644 --- a/hw/rtl/cache/VX_miss_resrv.sv +++ b/hw/rtl/cache/VX_miss_resrv.sv @@ -202,7 +202,7 @@ module VX_miss_resrv #( `UNUSED_VAR (lookup_valid) -`ifdef DBG_PRINT_CACHE_MSHR +`ifdef DBG_TRACE_CACHE_MSHR always @(posedge clk) begin if (allocate_fire || fill_valid || dequeue_fire || lookup_replay || lookup_valid || release_valid) begin if (allocate_fire) diff --git a/hw/rtl/cache/VX_shared_mem.sv b/hw/rtl/cache/VX_shared_mem.sv index 51c60a38..46ea0cfc 100644 --- a/hw/rtl/cache/VX_shared_mem.sv +++ b/hw/rtl/cache/VX_shared_mem.sv @@ -229,7 +229,7 @@ module VX_shared_mem #( core_rsp_data_in = 'x; bank_rsp_sel_n = bank_rsp_sel_r; for (integer i = 0; i < NUM_BANKS; i++) begin - if (per_bank_core_req_valid[i] + if (core_req_read_mask[i] && (core_rsp_tag_in[CORE_TAG_ID_BITS-1:0] == per_bank_core_req_tag[i][CORE_TAG_ID_BITS-1:0])) begin core_rsp_valids_in[per_bank_core_req_tid[i]] = 1; core_rsp_data_in[per_bank_core_req_tid[i]] = per_bank_core_rsp_data[i]; @@ -271,7 +271,7 @@ module VX_shared_mem #( end `endif -`ifdef DBG_PRINT_CACHE_BANK +`ifdef DBG_TRACE_CACHE_BANK reg is_multi_tag_req; `IGNORE_UNUSED_BEGIN diff --git a/hw/rtl/cache/VX_tag_access.sv b/hw/rtl/cache/VX_tag_access.sv index e3433528..55124a65 100644 --- a/hw/rtl/cache/VX_tag_access.sv +++ b/hw/rtl/cache/VX_tag_access.sv @@ -61,7 +61,7 @@ module VX_tag_access #( `UNUSED_VAR (stall) -`ifdef DBG_PRINT_CACHE_TAG +`ifdef DBG_TRACE_CACHE_TAG always @(posedge clk) begin if (fill && ~stall) begin dpi_trace("%d: 
cache%0d:%0d tag-fill: addr=%0h, blk_addr=%0d, tag_id=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr, BANK_ID), line_addr, line_tag); diff --git a/hw/rtl/fp_cores/VX_fpu_fpnew.sv b/hw/rtl/fp_cores/VX_fpu_fpnew.sv index 3711dc3b..deaf62de 100644 --- a/hw/rtl/fp_cores/VX_fpu_fpnew.sv +++ b/hw/rtl/fp_cores/VX_fpu_fpnew.sv @@ -3,8 +3,7 @@ `include "defs_div_sqrt_mvp.sv" `TRACING_OFF -module VX_fpu_fpnew -#( +module VX_fpu_fpnew #( parameter TAGW = 1, parameter FMULADD = 1, parameter FDIVSQRT = 1, diff --git a/hw/rtl/interfaces/VX_gpu_req_if.sv b/hw/rtl/interfaces/VX_gpu_req_if.sv index e3511043..50ac8c7c 100644 --- a/hw/rtl/interfaces/VX_gpu_req_if.sv +++ b/hw/rtl/interfaces/VX_gpu_req_if.sv @@ -12,9 +12,11 @@ interface VX_gpu_req_if(); wire [31:0] PC; wire [31:0] next_PC; wire [`INST_GPU_BITS-1:0] op_type; + wire [`INST_MOD_BITS-1:0] op_mod; wire [`NT_BITS-1:0] tid; wire [`NUM_THREADS-1:0][31:0] rs1_data; - wire [31:0] rs2_data; + wire [`NUM_THREADS-1:0][31:0] rs2_data; + wire [`NUM_THREADS-1:0][31:0] rs3_data; wire [`NR_BITS-1:0] rd; wire wb; @@ -27,9 +29,11 @@ interface VX_gpu_req_if(); output PC, output next_PC, output op_type, + output op_mod, output tid, output rs1_data, output rs2_data, + output rs3_data, output rd, output wb, input ready @@ -42,9 +46,11 @@ interface VX_gpu_req_if(); input PC, input next_PC, input op_type, + input op_mod, input tid, input rs1_data, input rs2_data, + input rs3_data, input rd, input wb, output ready diff --git a/hw/rtl/interfaces/VX_ibuffer_if.sv b/hw/rtl/interfaces/VX_ibuffer_if.sv index 45569371..bb791737 100644 --- a/hw/rtl/interfaces/VX_ibuffer_if.sv +++ b/hw/rtl/interfaces/VX_ibuffer_if.sv @@ -76,20 +76,6 @@ interface VX_ibuffer_if (); input wid_n, output ready ); - - modport scoreboard ( - input valid, - input wid, - input PC, - input wb, - input rd, - input rd_n, - input rs1_n, - input rs2_n, - input rs3_n, - input wid_n, - output ready - ); endinterface diff --git a/hw/rtl/interfaces/VX_tex_csr_if.sv 
b/hw/rtl/interfaces/VX_tex_csr_if.sv new file mode 100644 index 00000000..a83c9479 --- /dev/null +++ b/hw/rtl/interfaces/VX_tex_csr_if.sv @@ -0,0 +1,26 @@ +`ifndef VX_TEX_CSR_IF +`define VX_TEX_CSR_IF + +`include "VX_define.vh" + +interface VX_tex_csr_if (); + + wire write_enable; + wire [`CSR_ADDR_BITS-1:0] write_addr; + wire [31:0] write_data; + + modport master ( + output write_enable, + output write_addr, + output write_data + ); + + modport slave ( + input write_enable, + input write_addr, + input write_data + ); + +endinterface + +`endif \ No newline at end of file diff --git a/hw/rtl/interfaces/VX_tex_req_if.sv b/hw/rtl/interfaces/VX_tex_req_if.sv new file mode 100644 index 00000000..f1eaa1be --- /dev/null +++ b/hw/rtl/interfaces/VX_tex_req_if.sv @@ -0,0 +1,51 @@ +`ifndef VX_TEX_REQ_IF +`define VX_TEX_REQ_IF + +`include "VX_define.vh" + +interface VX_tex_req_if (); + + wire valid; + wire [`NW_BITS-1:0] wid; + wire [`NUM_THREADS-1:0] tmask; + wire [31:0] PC; + wire [`NR_BITS-1:0] rd; + wire wb; + + wire [`NTEX_BITS-1:0] unit; + wire [1:0][`NUM_THREADS-1:0][31:0] coords; + wire [`NUM_THREADS-1:0][31:0] lod; + + wire ready; + + modport master ( + output valid, + output wid, + output tmask, + output PC, + output rd, + output wb, + output unit, + output coords, + output lod, + input ready + ); + + modport slave ( + input valid, + input wid, + input tmask, + input PC, + input rd, + input wb, + input unit, + input coords, + input lod, + output ready + ); + +endinterface +`endif + + + \ No newline at end of file diff --git a/hw/rtl/interfaces/VX_tex_rsp_if.sv b/hw/rtl/interfaces/VX_tex_rsp_if.sv new file mode 100644 index 00000000..b3dbd65d --- /dev/null +++ b/hw/rtl/interfaces/VX_tex_rsp_if.sv @@ -0,0 +1,43 @@ +`ifndef VX_TEX_RSP_IF +`define VX_TEX_RSP_IF + +`include "VX_define.vh" + +interface VX_tex_rsp_if (); + + wire valid; + wire [`NW_BITS-1:0] wid; + wire [`NUM_THREADS-1:0] tmask; + wire [31:0] PC; + wire [`NR_BITS-1:0] rd; + wire wb; + wire 
[`NUM_THREADS-1:0][31:0] data; + wire ready; + + modport master ( + output valid, + output wid, + output tmask, + output PC, + output rd, + output wb, + output data, + input ready + ); + + modport slave ( + input valid, + input wid, + input tmask, + input PC, + input rd, + input wb, + input data, + output ready + ); + +endinterface + +`endif + + diff --git a/hw/rtl/interfaces/VX_writeback_if.sv b/hw/rtl/interfaces/VX_writeback_if.sv index b3e2060d..8f05fc7a 100644 --- a/hw/rtl/interfaces/VX_writeback_if.sv +++ b/hw/rtl/interfaces/VX_writeback_if.sv @@ -36,15 +36,6 @@ interface VX_writeback_if (); output ready ); - modport scoreboard ( - input valid, - input wid, - input PC, - input rd, - input eop, - output ready - ); - endinterface `endif diff --git a/hw/rtl/libs/VX_scope.sv b/hw/rtl/libs/VX_scope.sv index 051bdb58..2cd8798b 100644 --- a/hw/rtl/libs/VX_scope.sv +++ b/hw/rtl/libs/VX_scope.sv @@ -93,13 +93,13 @@ module VX_scope #( CMD_SET_START: begin delay_val <= $bits(delay_val)'(cmd_data); cmd_start <= 1; - `ifdef DBG_PRINT_SCOPE + `ifdef DBG_TRACE_SCOPE dpi_trace("%d: *** scope: CMD_SET_START: delay_val=%0d\n", $time, $bits(delay_val)'(cmd_data)); `endif end CMD_SET_STOP: begin waddr_end <= $bits(waddr)'(cmd_data); - `ifdef DBG_PRINT_SCOPE + `ifdef DBG_TRACE_SCOPE dpi_trace("%d: *** scope: CMD_SET_STOP: waddr_end=%0d\n", $time, $bits(waddr)'(cmd_data)); `endif end @@ -116,7 +116,7 @@ module VX_scope #( delta <= 0; delay_cntr <= 0; start_time <= timestamp; - `ifdef DBG_PRINT_SCOPE + `ifdef DBG_TRACE_SCOPE dpi_trace("%d: *** scope: recording start - start_time=%0d\n", $time, timestamp); `endif end else begin @@ -132,7 +132,7 @@ module VX_scope #( recording <= 1; delta <= 0; start_time <= timestamp; - `ifdef DBG_PRINT_SCOPE + `ifdef DBG_TRACE_SCOPE dpi_trace("%d: *** scope: recording start - start_time=%0d\n", $time, timestamp); `endif end @@ -161,7 +161,7 @@ module VX_scope #( if (stop || (waddr >= waddr_end)) begin - `ifdef DBG_PRINT_SCOPE + `ifdef 
DBG_TRACE_SCOPE dpi_trace("%d: *** scope: recording stop - waddr=(%0d, %0d)\n", $time, waddr, waddr_end); `endif waddr <= waddr; // keep last address @@ -229,7 +229,7 @@ module VX_scope #( assign bus_out = bus_out_r; -`ifdef DBG_PRINT_SCOPE +`ifdef DBG_TRACE_SCOPE always @(posedge clk) begin if (bus_read) begin dpi_trace("%d: scope-read: cmd=%0d, addr=%0d, value=%0h\n", $time, get_cmd, raddr, bus_out); diff --git a/hw/rtl/tex_unit/VX_tex_addr.sv b/hw/rtl/tex_unit/VX_tex_addr.sv new file mode 100644 index 00000000..26a20566 --- /dev/null +++ b/hw/rtl/tex_unit/VX_tex_addr.sv @@ -0,0 +1,178 @@ +`include "VX_tex_define.vh" + +module VX_tex_addr #( + parameter CORE_ID = 0, + parameter REQ_INFOW = 1, + parameter NUM_REQS = 1 +) ( + input wire clk, + input wire reset, + + // inputs + + input wire req_valid, + input wire [NUM_REQS-1:0] req_tmask, + input wire [1:0][NUM_REQS-1:0][31:0] req_coords, + input wire [`TEX_FORMAT_BITS-1:0] req_format, + input wire [`TEX_FILTER_BITS-1:0] req_filter, + input wire [1:0][`TEX_WRAP_BITS-1:0] req_wraps, + input wire [`TEX_ADDR_BITS-1:0] req_baseaddr, + input wire [NUM_REQS-1:0][`TEX_MIPOFF_BITS-1:0] req_mipoff, + input wire [NUM_REQS-1:0][1:0][`TEX_DIM_BITS-1:0] req_logdims, + input wire [REQ_INFOW-1:0] req_info, + output wire req_ready, + + // outputs + + output wire rsp_valid, + output wire [NUM_REQS-1:0] rsp_tmask, + output wire [`TEX_FILTER_BITS-1:0] rsp_filter, + output wire [`TEX_STRIDE_BITS-1:0] rsp_stride, + output wire [NUM_REQS-1:0][3:0][31:0] rsp_addr, + output wire [NUM_REQS-1:0][1:0][`BLEND_FRAC-1:0] rsp_blends, + output wire [REQ_INFOW-1:0] rsp_info, + input wire rsp_ready +); + + `UNUSED_PARAM (CORE_ID) + + localparam PITCH_BITS = `MAX(`TEX_DIM_BITS, `TEX_STRIDE_BITS) + 1; + localparam SCALED_U_W = `FIXED_INT + `TEX_STRIDE_BITS; + localparam SCALED_X_W = (2 * `FIXED_INT); + localparam SCALED_V_W = SCALED_X_W + `TEX_STRIDE_BITS; + + wire valid_s0; + wire [NUM_REQS-1:0] tmask_s0; + wire [`TEX_FILTER_BITS-1:0] filter_s0; + 
wire [REQ_INFOW-1:0] req_info_s0; + wire [NUM_REQS-1:0][1:0][`FIXED_FRAC-1:0] clamped_lo, clamped_lo_s0; + wire [NUM_REQS-1:0][1:0][`FIXED_FRAC-1:0] clamped_hi, clamped_hi_s0; + wire [`TEX_STRIDE_BITS-1:0] log_stride, log_stride_s0; + wire [NUM_REQS-1:0][31:0] mip_addr, mip_addr_s0; + wire [NUM_REQS-1:0][1:0][`TEX_DIM_BITS-1:0] log_dims_s0; + wire [NUM_REQS-1:0][PITCH_BITS-1:0] log_pitch, log_pitch_s0; + + wire stall_out; + + // stride + + VX_tex_stride #( + .CORE_ID (CORE_ID) + ) tex_stride ( + .format (req_format), + .log_stride (log_stride) + ); + + // addressing mode + + for (genvar i = 0; i < NUM_REQS; ++i) begin + for (genvar j = 0; j < 2; ++j) begin + wire [`FIXED_FRAC-1:0] delta = (`FIXED_HALF >> req_logdims[i][j]); + wire [31:0] coord_lo = req_filter ? (req_coords[j][i] - 32'(delta)) : req_coords[j][i]; + wire [31:0] coord_hi = req_filter ? (req_coords[j][i] + 32'(delta)) : req_coords[j][i]; + + VX_tex_wrap #( + .CORE_ID (CORE_ID) + ) tex_wrap_lo ( + .wrap_i (req_wraps[j]), + .coord_i (coord_lo), + .coord_o (clamped_lo[i][j]) + ); + + VX_tex_wrap #( + .CORE_ID (CORE_ID) + ) tex_wrap_hi ( + .wrap_i (req_wraps[j]), + .coord_i (coord_hi), + .coord_o (clamped_hi[i][j]) + ); + end + assign log_pitch[i] = PITCH_BITS'(req_logdims[i][0]) + PITCH_BITS'(log_stride); + assign mip_addr[i] = req_baseaddr + 32'(req_mipoff[i]); + end + + VX_pipe_register #( + .DATAW (1 + NUM_REQS + `TEX_FILTER_BITS + `TEX_STRIDE_BITS + REQ_INFOW + NUM_REQS * (PITCH_BITS + 2 * `TEX_DIM_BITS + 32 + 2 * 2 * `FIXED_FRAC)), + .RESETW (1) + ) pipe_reg0 ( + .clk (clk), + .reset (reset), + .enable (~stall_out), + .data_in ({req_valid, req_tmask, req_filter, log_stride, req_info, log_pitch, req_logdims, mip_addr, clamped_lo, clamped_hi}), + .data_out ({valid_s0, tmask_s0, filter_s0, log_stride_s0, req_info_s0, log_pitch_s0, log_dims_s0, mip_addr_s0, clamped_lo_s0, clamped_hi_s0}) + ); + + // addresses generation + + wire [NUM_REQS-1:0][1:0][`FIXED_INT-1:0] scaled_lo; + wire 
[NUM_REQS-1:0][1:0][`FIXED_INT-1:0] scaled_hi; + wire [NUM_REQS-1:0][1:0][`BLEND_FRAC-1:0] blends; + wire [NUM_REQS-1:0][3:0][31:0] addr; + + for (genvar i = 0; i < NUM_REQS; ++i) begin + for (genvar j = 0; j < 2; ++j) begin + assign scaled_lo[i][j] = scale_to_dim(clamped_lo_s0[i][j], log_dims_s0[i][j]); + assign scaled_hi[i][j] = scale_to_dim(clamped_hi_s0[i][j], log_dims_s0[i][j]); + assign blends[i][j] = filter_s0 ? clamped_lo_s0[i][j][`BLEND_FRAC-1:0] : `BLEND_FRAC'(0); + end + end + + `UNUSED_VAR (log_pitch_s0) + + for (genvar i = 0; i < NUM_REQS; ++i) begin + wire [SCALED_U_W-1:0] offset_u_lo = SCALED_U_W'(scaled_lo[i][0]) << log_stride_s0; + wire [SCALED_U_W-1:0] offset_u_hi = SCALED_U_W'(scaled_hi[i][0]) << log_stride_s0; + + wire [SCALED_V_W-1:0] offset_v_lo = SCALED_V_W'(scaled_lo[i][1]) << log_pitch_s0[i]; + wire [SCALED_V_W-1:0] offset_v_hi = SCALED_V_W'(scaled_hi[i][1]) << log_pitch_s0[i]; + + wire [31:0] base_addr_lo = mip_addr_s0[i] + 32'(offset_v_lo); + wire [31:0] base_addr_hi = mip_addr_s0[i] + 32'(offset_v_hi); + + assign addr[i][0] = base_addr_lo + 32'(offset_u_lo); + assign addr[i][1] = base_addr_lo + 32'(offset_u_hi); + assign addr[i][2] = base_addr_hi + 32'(offset_u_lo); + assign addr[i][3] = base_addr_hi + 32'(offset_u_hi); + end + + assign stall_out = rsp_valid && ~rsp_ready; + + VX_pipe_register #( + .DATAW (1 + NUM_REQS + `TEX_FILTER_BITS + `TEX_STRIDE_BITS + (NUM_REQS * 4 * 32) + (2 * NUM_REQS * `BLEND_FRAC) + REQ_INFOW), + .RESETW (1) + ) pipe_reg1 ( + .clk (clk), + .reset (reset), + .enable (~stall_out), + .data_in ({valid_s0, tmask_s0, filter_s0, log_stride_s0, addr, blends, req_info_s0}), + .data_out ({rsp_valid, rsp_tmask, rsp_filter, rsp_stride, rsp_addr, rsp_blends, rsp_info}) + ); + + assign req_ready = ~stall_out; + +`ifdef DBG_TRACE_TEX + wire [`NW_BITS-1:0] rsp_wid; + wire [31:0] rsp_PC; + + assign {rsp_wid, rsp_PC} = rsp_info[`NW_BITS+32-1:0]; + + always @(posedge clk) begin + if (rsp_valid && rsp_ready) begin + 
dpi_trace("%d: core%0d-tex-addr: wid=%0d, PC=%0h, tmask=%b, req_filter=%0d, stride=%0d, addr=", + $time, CORE_ID, rsp_wid, rsp_PC, rsp_tmask, rsp_filter, rsp_stride); + `TRACE_ARRAY2D(rsp_addr, 4, NUM_REQS); + dpi_trace("\n"); + end + end +`endif + +function logic [`FIXED_INT-1:0] scale_to_dim (input logic [`FIXED_FRAC-1:0] src, + input logic [`TEX_DIM_BITS-1:0] dim); +`IGNORE_WARNINGS_BEGIN + logic [`FIXED_BITS-1:0] out; +`IGNORE_WARNINGS_END + out = `FIXED_BITS'(src) << dim; // widen the fraction to FIXED_BITS, then multiply by 2^dim + return out[`FIXED_FRAC +: `FIXED_INT]; // keep only the FIXED_INT bits above the fractional field +endfunction + +endmodule \ No newline at end of file diff --git a/hw/rtl/tex_unit/VX_tex_define.vh b/hw/rtl/tex_unit/VX_tex_define.vh new file mode 100644 index 00000000..16272fc9 --- /dev/null +++ b/hw/rtl/tex_unit/VX_tex_define.vh @@ -0,0 +1,39 @@ +`ifndef VX_TEX_DEFINE +`define VX_TEX_DEFINE + +`include "VX_define.vh" + +`define FIXED_BITS 32 +`define FIXED_FRAC 20 +`define FIXED_INT (`FIXED_BITS - `FIXED_FRAC) +`define FIXED_ONE (2 ** `FIXED_FRAC) +`define FIXED_HALF (`FIXED_ONE >> 1) +`define FIXED_MASK (`FIXED_ONE - 1) + +`define TEX_ADDR_BITS 32 +`define TEX_FORMAT_BITS 3 +`define TEX_WRAP_BITS 2 +`define TEX_DIM_BITS 4 +`define TEX_FILTER_BITS 1 + +`define TEX_MIPOFF_BITS (2*12+1) +`define TEX_STRIDE_BITS 2 + +`define TEX_LOD_BITS 4 +`define TEX_MIP_BITS (`NTEX_BITS + `TEX_LOD_BITS) + +`define TEX_WRAP_CLAMP 0 +`define TEX_WRAP_REPEAT 1 +`define TEX_WRAP_MIRROR 2 + +`define BLEND_FRAC 8 +`define BLEND_ONE (2 ** `BLEND_FRAC) + +`define TEX_FORMAT_R8G8B8A8 `TEX_FORMAT_BITS'(0) +`define TEX_FORMAT_R5G6B5 `TEX_FORMAT_BITS'(1) +`define TEX_FORMAT_R4G4B4A4 `TEX_FORMAT_BITS'(2) +`define TEX_FORMAT_L8A8 `TEX_FORMAT_BITS'(3) +`define TEX_FORMAT_L8 `TEX_FORMAT_BITS'(4) +`define TEX_FORMAT_A8 `TEX_FORMAT_BITS'(5) + +`endif \ No newline at end of file diff --git a/hw/rtl/tex_unit/VX_tex_format.sv b/hw/rtl/tex_unit/VX_tex_format.sv new file mode 100644 index 00000000..91e0e6f8 --- /dev/null +++ b/hw/rtl/tex_unit/VX_tex_format.sv @@ -0,0 +1,58 @@ +`include
"VX_tex_define.vh" + +module VX_tex_format #( + parameter CORE_ID = 0 +) ( + input wire [`TEX_FORMAT_BITS-1:0] format, + input wire [31:0] texel_in, + output wire [31:0] texel_out +); + `UNUSED_PARAM (CORE_ID) + + reg [31:0] texel_out_r; + + always @(*) begin + case (format) + `TEX_FORMAT_R8G8B8A8: begin + texel_out_r[07:00] = texel_in[7:0]; + texel_out_r[15:08] = texel_in[15:8]; + texel_out_r[23:16] = texel_in[23:16]; + texel_out_r[31:24] = texel_in[31:24]; + end + `TEX_FORMAT_R5G6B5: begin + texel_out_r[07:00] = {texel_in[15:11], texel_in[15:13]}; // 5 -> 8 bits: append the channel's top 3 bits + texel_out_r[15:08] = {texel_in[10:5], texel_in[10:9]}; // 6 -> 8 bits: append the channel's top 2 bits + texel_out_r[23:16] = {texel_in[4:0], texel_in[4:2]}; + texel_out_r[31:24] = 8'hff; // format carries no alpha: force opaque + end + `TEX_FORMAT_R4G4B4A4: begin + texel_out_r[07:00] = {2{texel_in[11:8]}}; // 4 -> 8 bits by replicating the nibble (was mixing in the alpha nibble [15:12]) + texel_out_r[15:08] = {2{texel_in[7:4]}}; + texel_out_r[23:16] = {2{texel_in[3:0]}}; + texel_out_r[31:24] = {2{texel_in[15:12]}}; + end + `TEX_FORMAT_L8A8: begin + texel_out_r[07:00] = texel_in[7:0]; + texel_out_r[15:08] = texel_in[7:0]; + texel_out_r[23:16] = texel_in[7:0]; + texel_out_r[31:24] = texel_in[15:8]; + end + `TEX_FORMAT_L8: begin + texel_out_r[07:00] = texel_in[7:0]; + texel_out_r[15:08] = texel_in[7:0]; + texel_out_r[23:16] = texel_in[7:0]; + texel_out_r[31:24] = 8'hff; + end + //`TEX_FORMAT_A8 + default: begin + texel_out_r[07:00] = 0; + texel_out_r[15:08] = 0; + texel_out_r[23:16] = 0; + texel_out_r[31:24] = texel_in[7:0]; + end + endcase + end + + assign texel_out = texel_out_r; + +endmodule diff --git a/hw/rtl/tex_unit/VX_tex_lerp.sv b/hw/rtl/tex_unit/VX_tex_lerp.sv new file mode 100644 index 00000000..6dce57e3 --- /dev/null +++ b/hw/rtl/tex_unit/VX_tex_lerp.sv @@ -0,0 +1,16 @@ +`include "VX_tex_define.vh" + +module VX_tex_lerp ( + input wire [3:0][7:0] in1, + input wire [3:0][7:0] in2, + input wire [8:0] alpha, + input wire [7:0] beta, + output wire [3:0][7:0] out +); + for (genvar i = 0; i < 4; ++i) begin + wire [16:0] sum = in1[i] * alpha + in2[i] * beta; + 
`UNUSED_VAR (sum) + assign out[i] = sum[15:8]; + end + +endmodule \ No newline at end of file diff --git a/hw/rtl/tex_unit/VX_tex_mem.sv b/hw/rtl/tex_unit/VX_tex_mem.sv new file mode 100644 index 00000000..91aa0438 --- /dev/null +++ b/hw/rtl/tex_unit/VX_tex_mem.sv @@ -0,0 +1,295 @@ +`include "VX_tex_define.vh" +module VX_tex_mem #( + parameter CORE_ID = 0, + parameter REQ_INFOW = 1, + parameter NUM_REQS = 1 +) ( + input wire clk, + input wire reset, + + // memory interface + VX_dcache_req_if.master dcache_req_if, + VX_dcache_rsp_if.slave dcache_rsp_if, + + // inputs + input wire req_valid, + input wire [NUM_REQS-1:0] req_tmask, + input wire [`TEX_FILTER_BITS-1:0] req_filter, + input wire [`TEX_STRIDE_BITS-1:0] req_stride, + input wire [NUM_REQS-1:0][3:0][31:0] req_addr, + input wire [REQ_INFOW-1:0] req_info, + output wire req_ready, + + // outputs + output wire rsp_valid, + output wire [NUM_REQS-1:0] rsp_tmask, + output wire [NUM_REQS-1:0][3:0][31:0] rsp_data, + output wire [REQ_INFOW-1:0] rsp_info, + input wire rsp_ready +); + + `UNUSED_PARAM (CORE_ID) + + localparam RSP_CTR_W = $clog2(NUM_REQS * 4 + 1); + + wire [3:0] dup_reqs; + wire [3:0][NUM_REQS-1:0][29:0] req_addr_w; + wire [3:0][NUM_REQS-1:0][1:0] align_offs; + + // reorder address into quads + + for (genvar i = 0; i < NUM_REQS; ++i) begin + for (genvar j = 0; j < 4; ++j) begin + assign req_addr_w[j][i] = req_addr[i][j][31:2]; + assign align_offs[j][i] = req_addr[i][j][1:0]; + end + end + + // find duplicate addresses + + for (genvar i = 0; i < 4; ++i) begin + wire [NUM_REQS-1:0] addr_matches; + for (genvar j = 0; j < NUM_REQS; j++) begin + assign addr_matches[j] = (req_addr_w[i][0] == req_addr_w[i][j]) || ~req_tmask[j]; + end + assign dup_reqs[i] = req_tmask[0] && (& addr_matches); + end + + // save request addresses into fifo + + wire reqq_push, reqq_pop, reqq_empty, reqq_full; + + wire [3:0][NUM_REQS-1:0][29:0] q_req_addr; + wire [NUM_REQS-1:0] q_req_tmask; + wire [`TEX_FILTER_BITS-1:0] q_req_filter; + 
wire [REQ_INFOW-1:0] q_req_info; + wire [`TEX_STRIDE_BITS-1:0] q_req_stride; + wire [3:0][NUM_REQS-1:0][1:0] q_align_offs; + wire [3:0] q_dup_reqs; + + assign reqq_push = req_valid && req_ready; + + VX_fifo_queue #( + .DATAW ((NUM_REQS * 4 * 30) + NUM_REQS + REQ_INFOW + `TEX_FILTER_BITS + `TEX_STRIDE_BITS + (4 * NUM_REQS * 2) + 4), + .SIZE (`LSUQ_SIZE), + .OUT_REG (1) + ) req_queue ( + .clk (clk), + .reset (reset), + .push (reqq_push), + .pop (reqq_pop), + .data_in ({req_addr_w, req_tmask, req_info, req_filter, req_stride, align_offs, dup_reqs}), + .data_out ({q_req_addr, q_req_tmask, q_req_info, q_req_filter, q_req_stride, q_align_offs, q_dup_reqs}), + .empty (reqq_empty), + .full (reqq_full), + `UNUSED_PIN (alm_full), + `UNUSED_PIN (alm_empty), + `UNUSED_PIN (size) + ); + + // can take more requests? + assign req_ready = ~reqq_full; + + /////////////////////////////////////////////////////////////////////////// + + wire req_texel_valid; + wire sent_all_ready, last_texel_sent; + wire req_texel_dup; + wire [NUM_REQS-1:0][29:0] req_texel_addr; + reg [1:0] req_texel_idx; + reg req_texels_done; + + always @(posedge clk) begin + if (reset || last_texel_sent) begin + req_texel_idx <= 0; + end else if (req_texel_valid && sent_all_ready) begin + req_texel_idx <= req_texel_idx + 1; + end + end + + always @(posedge clk) begin + if (reset || reqq_pop) begin + req_texels_done <= 0; + end else if (last_texel_sent) begin + req_texels_done <= 1; + end + end + + assign req_texel_valid = ~reqq_empty && ~req_texels_done; + assign req_texel_addr = q_req_addr[req_texel_idx]; + assign req_texel_dup = q_dup_reqs[req_texel_idx]; + + wire is_last_texel = (req_texel_idx == (q_req_filter ? 
3 : 0)); + assign last_texel_sent = req_texel_valid && sent_all_ready && is_last_texel; + + // DCache Request + + reg [NUM_REQS-1:0] texel_sent_mask; + + wire [NUM_REQS-1:0] dcache_req_fire = dcache_req_if.valid & dcache_req_if.ready; + + wire dcache_req_fire_any = (| dcache_req_fire); + + assign sent_all_ready = (&(dcache_req_if.ready | texel_sent_mask | ~q_req_tmask)) + || (req_texel_dup & dcache_req_if.ready[0]); + + always @(posedge clk) begin + if (reset || sent_all_ready) begin + texel_sent_mask <= 0; + end else begin + texel_sent_mask <= texel_sent_mask | dcache_req_fire; + end + end + + wire [NUM_REQS-1:0] req_dup_mask = {{(NUM_REQS-1){~req_texel_dup}}, 1'b1}; + + assign dcache_req_if.valid = {NUM_REQS{req_texel_valid}} & q_req_tmask & req_dup_mask & ~texel_sent_mask; + assign dcache_req_if.rw = {NUM_REQS{1'b0}}; + assign dcache_req_if.addr = req_texel_addr; + assign dcache_req_if.byteen = {NUM_REQS{4'b1111}}; + assign dcache_req_if.data = 'x; + +`ifdef DBG_CACHE_REQ_INFO + assign dcache_req_if.tag = {NUM_REQS{q_req_info[`DBG_CACHE_REQ_MDATAW-1:0], req_texel_idx}}; +`else + assign dcache_req_if.tag = {NUM_REQS{req_texel_idx}}; +`endif + + // Dcache Response + + reg [3:0][NUM_REQS-1:0][31:0] rsp_texels, rsp_texels_n; + wire [NUM_REQS-1:0][3:0][31:0] rsp_texels_qual; + reg [NUM_REQS-1:0][31:0] rsp_data_qual; + reg [RSP_CTR_W-1:0] rsp_rem_ctr, rsp_rem_ctr_init; + wire [RSP_CTR_W-1:0] rsp_rem_ctr_n; + wire dcache_rsp_fire; + wire [1:0] rsp_texel_idx; + wire rsp_texel_dup; + + assign rsp_texel_idx = dcache_rsp_if.tag[1:0]; + `UNUSED_VAR (dcache_rsp_if.tag) + + assign rsp_texel_dup = q_dup_reqs[rsp_texel_idx]; + + assign dcache_rsp_fire = dcache_rsp_if.valid && dcache_rsp_if.ready; + + for (genvar i = 0; i < NUM_REQS; i++) begin + wire [31:0] src_mask = {32{dcache_rsp_if.tmask[i]}}; + wire [31:0] src_data = ((i == 0 || rsp_texel_dup) ? 
dcache_rsp_if.data[0] : dcache_rsp_if.data[i]) & src_mask; + + reg [31:0] rsp_data_shifted; + always @(*) begin + rsp_data_shifted[31:16] = src_data[31:16]; + rsp_data_shifted[15:0] = q_align_offs[rsp_texel_idx][i][1] ? src_data[31:16] : src_data[15:0]; + rsp_data_shifted[7:0] = q_align_offs[rsp_texel_idx][i][0] ? rsp_data_shifted[15:8] : rsp_data_shifted[7:0]; + end + + always @(*) begin + case (q_req_stride) + 0: rsp_data_qual[i] = 32'(rsp_data_shifted[7:0]); + 1: rsp_data_qual[i] = 32'(rsp_data_shifted[15:0]); + default: rsp_data_qual[i] = rsp_data_shifted; + endcase + end + end + + always @(*) begin + rsp_texels_n = rsp_texels; + rsp_texels_n[rsp_texel_idx] |= rsp_data_qual; + end + + always @(posedge clk) begin + if (reset || reqq_pop) begin + rsp_texels <= '0; + end else if (dcache_rsp_fire) begin + rsp_texels <= rsp_texels_n; + end + end + + always @(*) begin + rsp_rem_ctr_init = RSP_CTR_W'($countones(q_dup_reqs[0] ? NUM_REQS'(1) : q_req_tmask)); + if (q_req_filter) begin + for (integer i = 1; i < 4; ++i) begin + rsp_rem_ctr_init += RSP_CTR_W'($countones(q_dup_reqs[i] ? 
NUM_REQS'(1) : q_req_tmask)); + end + end + end + + assign rsp_rem_ctr_n = rsp_rem_ctr - RSP_CTR_W'($countones(dcache_rsp_if.tmask)); + + always @(posedge clk) begin + if (reset) begin + rsp_rem_ctr <= 0; + end else begin + if (dcache_req_fire_any && 0 == rsp_rem_ctr) begin + rsp_rem_ctr <= rsp_rem_ctr_init; + end else if (dcache_rsp_fire) begin + rsp_rem_ctr <= rsp_rem_ctr_n; + end + end + end + + for (genvar i = 0; i < NUM_REQS; ++i) begin + for (genvar j = 0; j < 4; ++j) begin + assign rsp_texels_qual[i][j] = rsp_texels_n[j][i]; + end + end + + wire stall_out = rsp_valid && ~rsp_ready; + + wire is_last_rsp = (0 == rsp_rem_ctr_n); + + wire rsp_texels_done = dcache_rsp_fire && is_last_rsp; + + assign reqq_pop = rsp_texels_done && ~stall_out; + + VX_pipe_register #( + .DATAW (1 + NUM_REQS + REQ_INFOW + (4 * NUM_REQS * 32)), + .RESETW (1) + ) rsp_pipe_reg ( + .clk (clk), + .reset (reset), + .enable (~stall_out), + .data_in ({rsp_texels_done, q_req_tmask, q_req_info, rsp_texels_qual}), + .data_out ({rsp_valid, rsp_tmask, rsp_info, rsp_data}) + ); + + // Can accept new cache response? 
+ assign dcache_rsp_if.ready = ~(is_last_rsp && stall_out); + +`ifdef DBG_TRACE_TEX + wire [`NW_BITS-1:0] q_req_wid, req_wid, rsp_wid; + wire [31:0] q_req_PC, req_PC, rsp_PC; + assign {q_req_wid, q_req_PC} = q_req_info[`NW_BITS+32-1:0]; + assign {req_wid, req_PC} = req_info[`NW_BITS+32-1:0]; + assign {rsp_wid, rsp_PC} = rsp_info[`NW_BITS+32-1:0]; + + always @(posedge clk) begin + if (dcache_req_fire_any) begin + dpi_trace("%d: core%0d-tex-cache-req: wid=%0d, PC=%0h, tmask=%b, tag=%0h, addr=", + $time, CORE_ID, q_req_wid, q_req_PC, dcache_req_fire, req_texel_idx); + `TRACE_ARRAY1D(req_texel_addr, NUM_REQS); + dpi_trace(", is_dup=%b\n", req_texel_dup); + end + if (dcache_rsp_fire) begin + dpi_trace("%d: core%0d-tex-cache-rsp: wid=%0d, PC=%0h, tmask=%b, tag=%0h, data=", + $time, CORE_ID, q_req_wid, q_req_PC, dcache_rsp_if.tmask, rsp_texel_idx); + `TRACE_ARRAY1D(dcache_rsp_if.data, NUM_REQS); + dpi_trace("\n"); + end + if (req_valid && req_ready) begin + dpi_trace("%d: core%0d-tex-mem-req: wid=%0d, PC=%0h, tmask=%b, filter=%0d, stride=%0d, addr=", + $time, CORE_ID, req_wid, req_PC, req_tmask, req_filter, req_stride); + `TRACE_ARRAY2D(req_addr, 4, NUM_REQS); + dpi_trace("\n"); + end + if (rsp_valid && rsp_ready) begin + dpi_trace("%d: core%0d-tex-mem-rsp: wid=%0d, PC=%0h, tmask=%b, data=", + $time, CORE_ID, rsp_wid, rsp_PC, rsp_tmask); + `TRACE_ARRAY2D(rsp_data, 4, NUM_REQS); + dpi_trace("\n"); + end + end +`endif + +endmodule diff --git a/hw/rtl/tex_unit/VX_tex_sampler.sv b/hw/rtl/tex_unit/VX_tex_sampler.sv new file mode 100644 index 00000000..ac0f1496 --- /dev/null +++ b/hw/rtl/tex_unit/VX_tex_sampler.sv @@ -0,0 +1,146 @@ +`include "VX_tex_define.vh" + +module VX_tex_sampler #( + parameter CORE_ID = 0, + parameter REQ_INFOW = 1, + parameter NUM_REQS = 1 +) ( + input wire clk, + input wire reset, + + // inputs + input wire req_valid, + input wire [NUM_REQS-1:0] req_tmask, + input wire [`TEX_FORMAT_BITS-1:0] req_format, + input wire [NUM_REQS-1:0][1:0][`BLEND_FRAC-1:0] 
req_blends, + input wire [NUM_REQS-1:0][3:0][31:0] req_data, + input wire [REQ_INFOW-1:0] req_info, + output wire req_ready, + + // outputs + output wire rsp_valid, + output wire [NUM_REQS-1:0] rsp_tmask, + output wire [NUM_REQS-1:0][31:0] rsp_data, + output wire [REQ_INFOW-1:0] rsp_info, + input wire rsp_ready +); + + `UNUSED_PARAM (CORE_ID) + + wire valid_s0; + wire [NUM_REQS-1:0] tmask_s0; + wire [REQ_INFOW-1:0] req_info_s0; + wire [NUM_REQS-1:0][31:0] texel_ul, texel_uh; + wire [NUM_REQS-1:0][31:0] texel_ul_s0, texel_uh_s0; + wire [NUM_REQS-1:0][`BLEND_FRAC-1:0] blend_v, blend_v_s0; + wire [NUM_REQS-1:0][31:0] texel_v; + + wire stall_out; + + for (genvar i = 0; i < NUM_REQS; ++i) begin + + wire [3:0][31:0] fmt_texels; + + for (genvar j = 0; j < 4; ++j) begin + VX_tex_format #( + .CORE_ID (CORE_ID) + ) tex_format ( + .format (req_format), + .texel_in (req_data[i][j]), + .texel_out (fmt_texels[j]) + ); + end + + wire [7:0] beta = req_blends[i][0]; // weight applied to the second texel of each pair (blend index 0) + wire [8:0] alpha = `BLEND_ONE - beta; // complementary weight, so alpha + beta == BLEND_ONE + + VX_tex_lerp #( + ) tex_lerp_ul ( + .in1 (fmt_texels[0]), + .in2 (fmt_texels[1]), + .alpha (alpha), + .beta (beta), + .out (texel_ul[i]) + ); + + VX_tex_lerp #( + ) tex_lerp_uh ( + .in1 (fmt_texels[2]), + .in2 (fmt_texels[3]), + .alpha (alpha), + .beta (beta), + .out (texel_uh[i]) + ); + + assign blend_v[i] = req_blends[i][1]; // second blend weight is carried to the next stage + end + + VX_pipe_register #( + .DATAW (1 + NUM_REQS + REQ_INFOW + (NUM_REQS * `BLEND_FRAC) + (2 * NUM_REQS * 32)), + .RESETW (1) + ) pipe_reg0 ( + .clk (clk), + .reset (reset), + .enable (~stall_out), + .data_in ({req_valid, req_tmask, req_info, blend_v, texel_ul, texel_uh}), + .data_out ({valid_s0, tmask_s0, req_info_s0, blend_v_s0, texel_ul_s0, texel_uh_s0}) + ); + + for (genvar i = 0; i < NUM_REQS; i++) begin + wire [7:0] beta = blend_v_s0[i]; + wire [8:0] alpha = `BLEND_ONE - beta; + + VX_tex_lerp #( + ) tex_lerp_v ( + .in1 (texel_ul_s0[i]), + .in2 (texel_uh_s0[i]), + .alpha (alpha), + .beta (beta), + .out (texel_v[i]) + ); + end + + assign stall_out 
= rsp_valid && ~rsp_ready; + + VX_pipe_register #( + .DATAW (1 + NUM_REQS + REQ_INFOW + (NUM_REQS * 32)), + .RESETW (1) + ) pipe_reg1 ( + .clk (clk), + .reset (reset), + .enable (~stall_out), + .data_in ({valid_s0, tmask_s0, req_info_s0, texel_v}), + .data_out ({rsp_valid, rsp_tmask, rsp_info, rsp_data}) + ); + + // can accept new request? + assign req_ready = ~stall_out; + +`ifdef DBG_TRACE_TEX + wire [`NW_BITS-1:0] req_wid, rsp_wid; + wire [31:0] req_PC, rsp_PC; + + assign {req_wid, req_PC} = req_info[`NW_BITS+32-1:0]; + assign {rsp_wid, rsp_PC} = rsp_info[`NW_BITS+32-1:0]; + + always @(posedge clk) begin + if (req_valid && req_ready) begin + dpi_trace("%d: core%0d-tex-sampler-req: wid=%0d, PC=%0h, tmask=%b, format=%0d, data=", + $time, CORE_ID, req_wid, req_PC, req_tmask, req_format); + `TRACE_ARRAY2D(req_data, 4, NUM_REQS); + dpi_trace(", u0="); + `TRACE_ARRAY1D(req_blends[0], NUM_REQS); + dpi_trace(", v0="); + `TRACE_ARRAY1D(req_blends[1], NUM_REQS); + dpi_trace("\n"); + end + if (rsp_valid && rsp_ready) begin + dpi_trace("%d: core%0d-tex-sampler-rsp: wid=%0d, PC=%0h, tmask=%b, data=", + $time, CORE_ID, rsp_wid, rsp_PC, rsp_tmask); + `TRACE_ARRAY1D(rsp_data, NUM_REQS); + dpi_trace("\n"); + end + end +`endif + +endmodule \ No newline at end of file diff --git a/hw/rtl/tex_unit/VX_tex_sat.sv b/hw/rtl/tex_unit/VX_tex_sat.sv new file mode 100644 index 00000000..f8e20d08 --- /dev/null +++ b/hw/rtl/tex_unit/VX_tex_sat.sv @@ -0,0 +1,21 @@ +`include "VX_platform.vh" + +module VX_tex_sat #( + parameter IN_W = 1, + parameter OUT_W = 1, + parameter MODEL = 1 +) ( + input wire [IN_W-1:0] data_in, + output wire [OUT_W-1:0] data_out +); + `STATIC_ASSERT(((OUT_W+1) < IN_W), ("invalid parameter")) + + if (MODEL == 1) begin + wire [OUT_W-1:0] underflow_mask = {OUT_W{~data_in[IN_W-1]}}; + wire [OUT_W-1:0] overflow_mask = {OUT_W{(| data_in[IN_W-2:OUT_W])}}; + assign data_out = (data_in[OUT_W-1:0] | overflow_mask) & underflow_mask; + end else begin + assign data_out = 
data_in[IN_W-1] ? OUT_W'(0) : ((data_in > {OUT_W{1'b1}}) ? {OUT_W{1'b1}} : OUT_W'(data_in)); + end + +endmodule \ No newline at end of file diff --git a/hw/rtl/tex_unit/VX_tex_stride.sv b/hw/rtl/tex_unit/VX_tex_stride.sv new file mode 100644 index 00000000..50393fe9 --- /dev/null +++ b/hw/rtl/tex_unit/VX_tex_stride.sv @@ -0,0 +1,27 @@ +`include "VX_tex_define.vh" + +module VX_tex_stride #( + parameter CORE_ID = 0 +) ( + input wire [`TEX_FORMAT_BITS-1:0] format, + output wire [`TEX_STRIDE_BITS-1:0] log_stride +); + `UNUSED_PARAM (CORE_ID) + + reg [`TEX_STRIDE_BITS-1:0] log_stride_r; + + always @(*) begin + case (format) + `TEX_FORMAT_A8: log_stride_r = 0; + `TEX_FORMAT_L8: log_stride_r = 0; + `TEX_FORMAT_L8A8: log_stride_r = 1; + `TEX_FORMAT_R5G6B5: log_stride_r = 1; + `TEX_FORMAT_R4G4B4A4: log_stride_r = 1; + //`TEX_FORMAT_R8G8B8A8 + default: log_stride_r = 2; + endcase + end + + assign log_stride = log_stride_r; + +endmodule diff --git a/hw/rtl/tex_unit/VX_tex_unit.sv b/hw/rtl/tex_unit/VX_tex_unit.sv new file mode 100644 index 00000000..6be6aa43 --- /dev/null +++ b/hw/rtl/tex_unit/VX_tex_unit.sv @@ -0,0 +1,234 @@ +`include "VX_tex_define.vh" + +module VX_tex_unit #( + parameter CORE_ID = 0 +) ( + input wire clk, + input wire reset, + + // Texture unit <-> Memory Unit + VX_dcache_req_if.master dcache_req_if, + VX_dcache_rsp_if.slave dcache_rsp_if, + + // Inputs + VX_tex_req_if.slave tex_req_if, + VX_tex_csr_if.slave tex_csr_if, + + // Outputs + VX_tex_rsp_if.master tex_rsp_if +); + + localparam REQ_INFOW_S = `NR_BITS + 1 + `NW_BITS + 32; + localparam REQ_INFOW_A = `TEX_FORMAT_BITS + REQ_INFOW_S; + localparam REQ_INFOW_M = (2 * `NUM_THREADS * `BLEND_FRAC) + REQ_INFOW_A; + + reg [`TEX_MIPOFF_BITS-1:0] tex_mipoff [`NUM_TEX_UNITS-1:0][(1 << `TEX_LOD_BITS)-1:0]; + reg [1:0][`TEX_DIM_BITS-1:0] tex_dims [`NUM_TEX_UNITS-1:0][(1 << `TEX_LOD_BITS)-1:0]; + reg [`TEX_ADDR_BITS-1:0] tex_baddr [`NUM_TEX_UNITS-1:0]; + reg [`TEX_FORMAT_BITS-1:0] tex_format [`NUM_TEX_UNITS-1:0]; + 
reg [1:0][`TEX_WRAP_BITS-1:0] tex_wraps [`NUM_TEX_UNITS-1:0]; + reg [`TEX_FILTER_BITS-1:0] tex_filter [`NUM_TEX_UNITS-1:0]; + + // CSRs programming + + reg [`NUM_TEX_UNITS-1:0] csrs_dirty; + `UNUSED_VAR (csrs_dirty) + + for (genvar i = 0; i < `NUM_TEX_UNITS; ++i) begin + wire [`TEX_LOD_BITS-1:0] mip_level = tex_csr_if.write_data[28 +: `TEX_LOD_BITS]; + always @(posedge clk) begin + if (tex_csr_if.write_enable) begin + case (tex_csr_if.write_addr) + `CSR_TEX_ADDR(i) : begin + tex_baddr[i] <= tex_csr_if.write_data[`TEX_ADDR_BITS-1:0]; + csrs_dirty[i] <= 1; + end + `CSR_TEX_FORMAT(i) : begin + tex_format[i] <= tex_csr_if.write_data[`TEX_FORMAT_BITS-1:0]; + csrs_dirty[i] <= 1; + end + `CSR_TEX_WRAP(i) : begin + tex_wraps[i][0] <= tex_csr_if.write_data[0 +: `TEX_WRAP_BITS]; + tex_wraps[i][1] <= tex_csr_if.write_data[`TEX_WRAP_BITS +: `TEX_WRAP_BITS]; + csrs_dirty[i] <= 1; + end + `CSR_TEX_FILTER(i) : begin + tex_filter[i] <= tex_csr_if.write_data[`TEX_FILTER_BITS-1:0]; + csrs_dirty[i] <= 1; + end + `CSR_TEX_MIPOFF(i) : begin + tex_mipoff[i][mip_level] <= tex_csr_if.write_data[`TEX_MIPOFF_BITS-1:0]; + csrs_dirty[i] <= 1; + end + `CSR_TEX_WIDTH(i) : begin + tex_dims[i][mip_level][0] <= tex_csr_if.write_data[`TEX_DIM_BITS-1:0]; + csrs_dirty[i] <= 1; + end + `CSR_TEX_HEIGHT(i) : begin + tex_dims[i][mip_level][1] <= tex_csr_if.write_data[`TEX_DIM_BITS-1:0]; + csrs_dirty[i] <= 1; + end + endcase + end + if (reset || (tex_req_if.valid && tex_req_if.ready)) begin + csrs_dirty[i] <= '0; + end + end + end + + // mipmap attributes + + wire [`NUM_THREADS-1:0][`TEX_MIPOFF_BITS-1:0] sel_mipoff; + wire [`NUM_THREADS-1:0][1:0][`TEX_DIM_BITS-1:0] sel_dims; + + for (genvar i = 0; i < `NUM_THREADS; ++i) begin + wire [`NTEX_BITS-1:0] unit = tex_req_if.unit[`NTEX_BITS-1:0]; + wire [`TEX_LOD_BITS-1:0] mip_level = tex_req_if.lod[i][20+:`TEX_LOD_BITS]; + assign sel_mipoff[i] = tex_mipoff[unit][mip_level]; + assign sel_dims[i] = tex_dims[unit][mip_level]; + end + + // address generation + + 
wire mem_req_valid; + wire [`NUM_THREADS-1:0] mem_req_tmask; + wire [`TEX_FILTER_BITS-1:0] mem_req_filter; + wire [`TEX_STRIDE_BITS-1:0] mem_req_stride; + wire [`NUM_THREADS-1:0][1:0][`BLEND_FRAC-1:0] mem_req_blends; + wire [`NUM_THREADS-1:0][3:0][31:0] mem_req_addr; + wire [REQ_INFOW_A-1:0] mem_req_info; + wire mem_req_ready; + + VX_tex_addr #( + .CORE_ID (CORE_ID), + .REQ_INFOW (REQ_INFOW_A), + .NUM_REQS (`NUM_THREADS) + ) tex_addr ( + .clk (clk), + .reset (reset), + + .req_valid (tex_req_if.valid), + .req_tmask (tex_req_if.tmask), + .req_coords (tex_req_if.coords), + .req_format (tex_format[tex_req_if.unit]), + .req_filter (tex_filter[tex_req_if.unit]), + .req_wraps (tex_wraps[tex_req_if.unit]), + .req_baseaddr (tex_baddr[tex_req_if.unit]), + .req_mipoff (sel_mipoff), + .req_logdims (sel_dims), + .req_info ({tex_format[tex_req_if.unit], tex_req_if.rd, tex_req_if.wb, tex_req_if.wid, tex_req_if.PC}), + .req_ready (tex_req_if.ready), + + .rsp_valid (mem_req_valid), + .rsp_tmask (mem_req_tmask), + .rsp_filter (mem_req_filter), + .rsp_stride (mem_req_stride), + .rsp_addr (mem_req_addr), + .rsp_blends (mem_req_blends), + .rsp_info (mem_req_info), + .rsp_ready (mem_req_ready) + ); + + // retrieve texel values from memory + + wire mem_rsp_valid; + wire [`NUM_THREADS-1:0] mem_rsp_tmask; + wire [`NUM_THREADS-1:0][3:0][31:0] mem_rsp_data; + wire [REQ_INFOW_M-1:0] mem_rsp_info; + wire mem_rsp_ready; + + VX_tex_mem #( + .CORE_ID (CORE_ID), + .REQ_INFOW (REQ_INFOW_M), + .NUM_REQS (`NUM_THREADS) + ) tex_mem ( + .clk (clk), + .reset (reset), + + // memory interface + .dcache_req_if (dcache_req_if), + .dcache_rsp_if (dcache_rsp_if), + + // inputs + .req_valid (mem_req_valid), + .req_tmask (mem_req_tmask), + .req_filter(mem_req_filter), + .req_stride(mem_req_stride), + .req_addr (mem_req_addr), + .req_info ({mem_req_blends, mem_req_info}), + .req_ready (mem_req_ready), + + // outputs + .rsp_valid (mem_rsp_valid), + .rsp_tmask (mem_rsp_tmask), + .rsp_data (mem_rsp_data), + 
.rsp_info (mem_rsp_info), + .rsp_ready (mem_rsp_ready) + ); + + // apply sampler + + wire [`NUM_THREADS-1:0][1:0][`BLEND_FRAC-1:0] rsp_blends; + wire [`TEX_FORMAT_BITS-1:0] rsp_format; + wire [REQ_INFOW_S-1:0] rsp_info; + + assign {rsp_blends, rsp_format, rsp_info} = mem_rsp_info; + + VX_tex_sampler #( + .CORE_ID (CORE_ID), + .REQ_INFOW (REQ_INFOW_S), + .NUM_REQS (`NUM_THREADS) + ) tex_sampler ( + .clk (clk), + .reset (reset), + + // inputs + .req_valid (mem_rsp_valid), + .req_tmask (mem_rsp_tmask), + .req_data (mem_rsp_data), + .req_format (rsp_format), + .req_blends (rsp_blends), + .req_info (rsp_info), + .req_ready (mem_rsp_ready), + + // outputs + .rsp_valid (tex_rsp_if.valid), + .rsp_tmask (tex_rsp_if.tmask), + .rsp_data (tex_rsp_if.data), + .rsp_info ({tex_rsp_if.rd, tex_rsp_if.wb, tex_rsp_if.wid, tex_rsp_if.PC}), + .rsp_ready (tex_rsp_if.ready) + ); + +`ifdef DBG_TRACE_TEX + always @(posedge clk) begin + if (tex_req_if.valid && tex_req_if.ready) begin + for (integer i = 0; i < `NUM_TEX_UNITS; ++i) begin + if (csrs_dirty[i]) begin + dpi_trace("%d: core%0d-tex-csr: tex%0d_addr=%0h\n", $time, CORE_ID, i, tex_baddr[i]); + dpi_trace("%d: core%0d-tex-csr: tex%0d_format=%0h\n", $time, CORE_ID, i, tex_format[i]); + dpi_trace("%d: core%0d-tex-csr: tex%0d_wrap_u=%0h\n", $time, CORE_ID, i, tex_wraps[i][0]); + dpi_trace("%d: core%0d-tex-csr: tex%0d_wrap_v=%0h\n", $time, CORE_ID, i, tex_wraps[i][1]); + dpi_trace("%d: core%0d-tex-csr: tex%0d_filter=%0h\n", $time, CORE_ID, i, tex_filter[i]); + dpi_trace("%d: core%0d-tex-csr: tex%0d_mipoff[0]=%0h\n", $time, CORE_ID, i, tex_mipoff[i][0]); + dpi_trace("%d: core%0d-tex-csr: tex%0d_width[0]=%0h\n", $time, CORE_ID, i, tex_dims[i][0][0]); + dpi_trace("%d: core%0d-tex-csr: tex%0d_height[0]=%0h\n", $time, CORE_ID, i, tex_dims[i][0][1]); + end + end + + dpi_trace("%d: core%0d-tex-req: wid=%0d, PC=%0h, tmask=%b, unit=%0d, lod=%0h, u=", + $time, CORE_ID, tex_req_if.wid, tex_req_if.PC, tex_req_if.tmask, tex_req_if.unit, 
tex_req_if.lod); + `TRACE_ARRAY1D(tex_req_if.coords[0], `NUM_THREADS); + dpi_trace(", v="); + `TRACE_ARRAY1D(tex_req_if.coords[1], `NUM_THREADS); + dpi_trace("\n"); + end + if (tex_rsp_if.valid && tex_rsp_if.ready) begin + dpi_trace("%d: core%0d-tex-rsp: wid=%0d, PC=%0h, tmask=%b, data=", + $time, CORE_ID, tex_rsp_if.wid, tex_rsp_if.PC, tex_rsp_if.tmask); + `TRACE_ARRAY1D(tex_rsp_if.data, `NUM_THREADS); + dpi_trace("\n"); + end + end +`endif + +endmodule \ No newline at end of file diff --git a/hw/rtl/tex_unit/VX_tex_wrap.sv b/hw/rtl/tex_unit/VX_tex_wrap.sv new file mode 100644 index 00000000..8cc7b2f5 --- /dev/null +++ b/hw/rtl/tex_unit/VX_tex_wrap.sv @@ -0,0 +1,38 @@ +`include "VX_tex_define.vh" + +module VX_tex_wrap #( + parameter CORE_ID = 0 +) ( + input wire [`TEX_WRAP_BITS-1:0] wrap_i, + input wire [31:0] coord_i, + output wire [`FIXED_FRAC-1:0] coord_o +); + + `UNUSED_PARAM (CORE_ID) + + reg [`FIXED_FRAC-1:0] coord_r; + + wire [`FIXED_FRAC-1:0] clamp; + + VX_tex_sat #( + .IN_W (32), + .OUT_W (`FIXED_FRAC) + ) sat_fx ( + .data_in (coord_i), + .data_out (clamp) + ); + + always @(*) begin + case (wrap_i) + `TEX_WRAP_CLAMP: + coord_r = clamp; + `TEX_WRAP_MIRROR: + coord_r = coord_i[`FIXED_FRAC-1:0] ^ {`FIXED_FRAC{coord_i[`FIXED_FRAC]}}; + default: //`TEX_WRAP_REPEAT + coord_r = coord_i[`FIXED_FRAC-1:0]; + endcase + end + + assign coord_o = coord_r; + +endmodule \ No newline at end of file diff --git a/hw/scripts/scope.json b/hw/scripts/scope.json index 50348b7f..2c9f8355 100644 --- a/hw/scripts/scope.json +++ b/hw/scripts/scope.json @@ -194,9 +194,9 @@ "issue_imm": 32, "issue_use_pc": 1, "issue_use_imm": 1, - "gpr_rsp_a":"`NUM_THREADS * 32", - "gpr_rsp_b":"`NUM_THREADS * 32", - "gpr_rsp_c":"`NUM_THREADS * 32", + "gpr_rs1":"`NUM_THREADS * 32", + "gpr_rs2":"`NUM_THREADS * 32", + "gpr_rs3":"`NUM_THREADS * 32", "?writeback_valid": 1, "writeback_wid":"`NW_BITS", "writeback_pc": 32, @@ -205,7 +205,7 @@ "writeback_data":"`NUM_THREADS * 32", "writeback_eop": 1, 
"!scoreboard_delay": 1, - "!execute_delay": 1 + "!dispatch_delay": 1 }, "afu/vortex/l3cache/bank, afu/vortex/cluster/l2cache/bank, afu/vortex/cluster/core/mem_unit/dcache/bank, afu/vortex/cluster/core/mem_unit/icache/bank": { "?valid_st0": 1, diff --git a/hw/scripts/scope.py b/hw/scripts/scope.py index 9dda8ffe..679209a9 100755 --- a/hw/scripts/scope.py +++ b/hw/scripts/scope.py @@ -262,7 +262,7 @@ def expand_text(text, params): has_func = do_repl.has_func if not (params_updated or do_repl.expanded): break - text = new_text + text = new_text changed = True if not has_func: break diff --git a/hw/syn/opae/Makefile b/hw/syn/opae/Makefile index 9a078089..010baea3 100644 --- a/hw/syn/opae/Makefile +++ b/hw/syn/opae/Makefile @@ -8,20 +8,21 @@ else RUN_SYNTH=qsub-synth endif -# control RTL debug print states -DBG_PRINT_FLAGS += -DDBG_PRINT_PIPELINE -DBG_PRINT_FLAGS += -DDBG_PRINT_CORE_ICACHE -DBG_PRINT_FLAGS += -DDBG_PRINT_CORE_DCACHE -DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_BANK -DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_MSHR -DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_TAG -DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_DATA -DBG_PRINT_FLAGS += -DDBG_PRINT_MEM -DBG_PRINT_FLAGS += -DDBG_PRINT_OPAE -DBG_PRINT_FLAGS += -DDBG_PRINT_AVS -DBG_PRINT_FLAGS += -DDBG_PRINT_SCOPE +# control RTL debug tracing states +DBG_TRACE_FLAGS += -DDBG_TRACE_PIPELINE +DBG_TRACE_FLAGS += -DDBG_TRACE_CORE_ICACHE +DBG_TRACE_FLAGS += -DDBG_TRACE_CORE_DCACHE +DBG_TRACE_FLAGS += -DDBG_TRACE_CACHE_BANK +DBG_TRACE_FLAGS += -DDBG_TRACE_CACHE_MSHR +DBG_TRACE_FLAGS += -DDBG_TRACE_CACHE_TAG +DBG_TRACE_FLAGS += -DDBG_TRACE_CACHE_DATA +DBG_TRACE_FLAGS += -DDBG_TRACE_MEM +DBG_TRACE_FLAGS += -DDBG_TRACE_OPAE +DBG_TRACE_FLAGS += -DDBG_TRACE_AVS +DBG_TRACE_FLAGS += -DDBG_TRACE_SCOPE +DBG_TRACE_FLAGS += -DDBG_TRACE_TEX -DBG_FLAGS += $(DBG_PRINT_FLAGS) +DBG_FLAGS += $(DBG_TRACE_FLAGS) DBG_FLAGS += -DDBG_CACHE_REQ_INFO CONFIG1 := -DNUM_CLUSTERS=1 -DNUM_CORES=1 -DL2_ENABLE=0 -DL3_ENABLE=0 $(CONFIGS) @@ -33,7 +34,8 @@ CONFIG32 := 
-DNUM_CLUSTERS=4 -DNUM_CORES=8 -DL2_ENABLE=0 -DL3_ENABLE=1 -DICACHE_ CONFIG64 := -DNUM_CLUSTERS=8 -DNUM_CORES=8 -DL2_ENABLE=0 -DL3_ENABLE=1 -DICACHE_SIZE=8192 -DDCACHE_SIZE=8192 -DL3_CACHE_SIZE=524288 $(CONFIGS) FPU_INCLUDE = -I$(RTL_DIR)/fp_cores -I$(RTL_DIR)/fp_cores/altera/$(DEVICE_FAMILY) -RTL_INCLUDE = -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/cache $(FPU_INCLUDE) -I$(RTL_DIR) -I$(RTL_DIR)/afu +TEX_INCLUDE = -I$(RTL_DIR)/tex_unit +RTL_INCLUDE = -I$(RTL_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/cache -I$(RTL_DIR)/afu $(FPU_INCLUDE) $(TEX_INCLUDE) CFLAGS += $(RTL_INCLUDE) diff --git a/hw/syn/quartus/core/Makefile b/hw/syn/quartus/core/Makefile index 37e42df0..b976110c 100644 --- a/hw/syn/quartus/core/Makefile +++ b/hw/syn/quartus/core/Makefile @@ -12,7 +12,8 @@ FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/arria10 #FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10 FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src -RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(FPU_INCLUDE) +TEX_INCLUDE = $(RTL_DIR)/tex_unit +RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(FPU_INCLUDE);$(TEX_INCLUDE) PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf diff --git a/hw/syn/quartus/pipeline/Makefile b/hw/syn/quartus/pipeline/Makefile index 7f7228a8..e4cad107 100644 --- a/hw/syn/quartus/pipeline/Makefile +++ b/hw/syn/quartus/pipeline/Makefile @@ -12,12 +12,12 @@ FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/arria10 #FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10 FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src -RTL_INCLUDE = 
$(FPU_INCLUDE);$(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces +TEX_INCLUDE = $(RTL_DIR)/tex_unit +RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(FPU_INCLUDE);$(TEX_INCLUDE) + PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf -# Part, Family -FAMILY = "Arria 10" -DEVICE = 10AX115N3F40E2SG +PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf # Executable Configuration SYN_ARGS = --parallel --read_settings_files=on diff --git a/hw/syn/quartus/top1/Makefile b/hw/syn/quartus/top1/Makefile index 465cb192..374f84e1 100644 --- a/hw/syn/quartus/top1/Makefile +++ b/hw/syn/quartus/top1/Makefile @@ -12,7 +12,8 @@ FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/arria10 #FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10 FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src -RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE) +TEX_INCLUDE = $(RTL_DIR)/tex_unit +RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE);$(TEX_INCLUDE) PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf diff --git a/hw/syn/quartus/top16/Makefile b/hw/syn/quartus/top16/Makefile index a15a3582..78f4df68 100644 --- a/hw/syn/quartus/top16/Makefile +++ b/hw/syn/quartus/top16/Makefile @@ -12,7 +12,8 @@ FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/arria10 #FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10 FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src -RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE) +TEX_INCLUDE = 
$(RTL_DIR)/tex_unit +RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE);$(TEX_INCLUDE) PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf diff --git a/hw/syn/quartus/top2/Makefile b/hw/syn/quartus/top2/Makefile index 31234939..f8801373 100644 --- a/hw/syn/quartus/top2/Makefile +++ b/hw/syn/quartus/top2/Makefile @@ -12,7 +12,8 @@ FPU_CORE_PATH=$(RTL_DIR)/fp_cores/altera/arria10 #FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10 FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src -RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE) +TEX_INCLUDE = $(RTL_DIR)/tex_unit +RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE);$(TEX_INCLUDE) PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf diff --git a/hw/syn/quartus/top32/Makefile b/hw/syn/quartus/top32/Makefile index 89ce5340..cea702f5 100644 --- a/hw/syn/quartus/top32/Makefile +++ b/hw/syn/quartus/top32/Makefile @@ -12,7 +12,8 @@ FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/arria10 #FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10 FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src -RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE) +TEX_INCLUDE = $(RTL_DIR)/tex_unit +RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE);$(TEX_INCLUDE) PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf diff --git 
a/hw/syn/quartus/top4/Makefile b/hw/syn/quartus/top4/Makefile index 3b71cdd2..bfe734a7 100644 --- a/hw/syn/quartus/top4/Makefile +++ b/hw/syn/quartus/top4/Makefile @@ -12,7 +12,8 @@ FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/arria10 #FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10 FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src -RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE) +TEX_INCLUDE = $(RTL_DIR)/tex_unit +RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE);$(TEX_INCLUDE) PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf diff --git a/hw/syn/quartus/top64/Makefile b/hw/syn/quartus/top64/Makefile index 95ebb30d..604f794f 100644 --- a/hw/syn/quartus/top64/Makefile +++ b/hw/syn/quartus/top64/Makefile @@ -12,7 +12,8 @@ DEVICE = 1SX280HN2F43E2VG FPU_CORE_PATH=$(RTL_DIR)/fp_cores/altera/stratix10 FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src -RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE) +TEX_INCLUDE = $(RTL_DIR)/tex_unit +RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE);$(TEX_INCLUDE) PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf diff --git a/hw/syn/quartus/top8/Makefile b/hw/syn/quartus/top8/Makefile index 07b0a46e..0614e0d5 100644 --- a/hw/syn/quartus/top8/Makefile +++ b/hw/syn/quartus/top8/Makefile @@ -12,7 +12,8 @@ FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/arria10 #FPU_CORE_PATH = 
$(RTL_DIR)/fp_cores/altera/stratix10 FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src -RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE) +TEX_INCLUDE = $(RTL_DIR)/tex_unit +RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE);$(TEX_INCLUDE) PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf diff --git a/hw/syn/quartus/unittest/Makefile b/hw/syn/quartus/unittest/Makefile index 81219d6f..3b1bc6da 100644 --- a/hw/syn/quartus/unittest/Makefile +++ b/hw/syn/quartus/unittest/Makefile @@ -12,7 +12,9 @@ FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/arria10 #FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10 FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src -RTL_INCLUDE = $(FPU_INCLUDE);$(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache +TEX_INCLUDE = $(RTL_DIR)/tex_unit +RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(FPU_INCLUDE);$(TEX_INCLUDE) + PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf # Executable Configuration diff --git a/hw/syn/quartus/vortex/Makefile b/hw/syn/quartus/vortex/Makefile index 48e40608..6874cce3 100644 --- a/hw/syn/quartus/vortex/Makefile +++ b/hw/syn/quartus/vortex/Makefile @@ -12,7 +12,8 @@ FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/arria10 #FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10 FPU_INCLUDE = 
$(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src -RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(FPU_INCLUDE) +TEX_INCLUDE = $(RTL_DIR)/tex_unit +RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(FPU_INCLUDE);$(TEX_INCLUDE) PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf diff --git a/hw/unit_tests/cache/Makefile b/hw/unit_tests/cache/Makefile index de775d4f..96c2e9f1 100644 --- a/hw/unit_tests/cache/Makefile +++ b/hw/unit_tests/cache/Makefile @@ -1,46 +1,42 @@ -PARAM += -DCACHE_SIZE=4096 -DCACHE_WORD_SIZE=4 -DCACHE_LINE_SIZE=16 -DCACHE_NUM_BANKS=4 -DCACHE_CREQ_SIZE=4 -DMRVQ_SIZE=16 -DDFPQ_SIZE=16 -DSNRQ_SIZE=16 -DCWBQ_SIZE=4 -DDWBQ_SIZE=4 -DFQQ_SIZE=4 - +PARAMS += -DCACHE_SIZE=4096 -DCACHE_WORD_SIZE=4 -DCACHE_LINE_SIZE=16 -DCACHE_NUM_BANKS=4 -DCACHE_CREQ_SIZE=4 -DMRVQ_SIZE=16 -DDFPQ_SIZE=16 -DSNRQ_SIZE=16 -DCWBQ_SIZE=4 -DDWBQ_SIZE=4 -DFQQ_SIZE=4 +# control RTL debug tracing states +DBG_TRACE_FLAGS = -DDBG_TRACE_CORE_ICACHE \ + -DDBG_TRACE_CORE_DCACHE \ + -DDBG_TRACE_CACHE_BANK \ + -DDBG_TRACE_CACHE_SNP \ + -DDBG_TRACE_CACHE_MSHR \ + -DDBG_TRACE_CACHE_TAG \ + -DDBG_TRACE_CACHE_DATA \ + -DDBG_TRACE_MEM \ + -DDBG_TRACE_OPAE \ + -DDBG_TRACE_AVS -# control RTL debug print states -DBG_PRINT_FLAGS = -DDBG_PRINT_CORE_ICACHE \ - -DDBG_PRINT_CORE_DCACHE \ - -DDBG_PRINT_CACHE_BANK \ - -DDBG_PRINT_CACHE_SNP \ - -DDBG_PRINT_CACHE_MSHR \ - -DDBG_PRINT_CACHE_TAG \ - -DDBG_PRINT_CACHE_DATA \ - -DDBG_PRINT_MEM \ - -DDBG_PRINT_OPAE \ - -DDBG_PRINT_AVS - -#DBG_PRINT=$(DBG_PRINT_FLAGS) +#DBG_PRINT=$(DBG_TRACE_FLAGS) INCLUDE = -I../../rtl/ -I../../rtl/cache -I../../rtl/libs - SRCS = cachesim.cpp testbench.cpp all: build CF += -std=c++11 -fms-extensions -I../.. 
+CF += $(PARAMS) VF += --language 1800-2009 --assert -Wall --trace #-Wpedantic VF += -Wno-DECLFILENAME VF += --x-initial unique VF += -exe $(SRCS) $(INCLUDE) - -DBG += -DVCD_OUTPUT $(DBG_PRINT) - +VF += $(PARAMS) gen: - verilator $(VF) -DNDEBUG -cc VX_cache.v $(PARAM) -CFLAGS '$(CF) -DNDEBUG $(PARAM)' --exe $(SRCS) + verilator $(VF) -cc $(TOP).v -CFLAGS '$(CF)' --exe $(SRCS) build: gen - (cd obj_dir && make -j -f VVX_cache.mk) + (cd obj_dir && make -j -f V$(TOP).mk) run: build - (cd obj_dir && ./VVX_cache) + (cd obj_dir && ./V$(TOP)) clean: rm -rf obj_dir diff --git a/hw/unit_tests/cache/cachesim.cpp b/hw/unit_tests/cache/cachesim.cpp index 951740d8..736b5cb2 100644 --- a/hw/unit_tests/cache/cachesim.cpp +++ b/hw/unit_tests/cache/cachesim.cpp @@ -173,10 +173,10 @@ void CacheSim::stall_mem(){ } void CacheSim::send_snoop_req(){ - cache_->snp_req_valid = 1; + /*cache_->snp_req_valid = 1; cache_->snp_req_addr = 0x12222222; cache_->snp_req_invalidate = 1; - cache_->snp_req_tag = 0xff; + cache_->snp_req_tag = 0xff; */ } void CacheSim::eval_mem_bus() { @@ -274,9 +274,9 @@ bool CacheSim::assert_equal(unsigned int* data, unsigned int tag){ //DEBUG void CacheSim::display_miss(){ - int i = (unsigned int)cache_->miss_vec; - std::bitset<8> x(i); - if (i) std::cout << "Miss Vec " << x << std::endl; + //int i = (unsigned int)cache_->miss_vec; + //std::bitset<8> x(i); + //if (i) std::cout << "Miss Vec " << x << std::endl; //std::cout << "Miss Vec 0" << cache_->miss_vec[0] << std::endl; } diff --git a/hw/unit_tests/generic_queue/Makefile b/hw/unit_tests/generic_queue/Makefile index 76d53af2..f13d14a1 100644 --- a/hw/unit_tests/generic_queue/Makefile +++ b/hw/unit_tests/generic_queue/Makefile @@ -1,11 +1,30 @@ -all: testbench.iv +TOP = VX_fifo_queue -testbench.iv: testbench.v - iverilog testbench.v -o testbench.iv -I ../../rtl/ +PARAMS ?= -run: testbench.iv - ! 
vvp testbench.iv | grep 'ERROR' || false +INCLUDE = -I../../rtl/ -I../../rtl/libs + +SRCS = main.cpp + +all: build + +CF += -std=c++11 -fms-extensions -I../.. +VF += $(PARAMS) + +VF += --language 1800-2009 --assert -Wall --trace +VF += -Wno-DECLFILENAME +VF += --x-initial unique +VF += -exe $(SRCS) $(INCLUDE) +VF += $(PARAMS) + +gen: + verilator $(VF) -cc $(TOP).v -CFLAGS '$(CF)' --exe $(SRCS) + +build: gen + (cd obj_dir && make -j -f V$(TOP).mk) + +run: build + (cd obj_dir && ./V$(TOP)) clean: - rm testbench.iv - + rm -rf obj_dir diff --git a/hw/unit_tests/generic_queue/main.cpp b/hw/unit_tests/generic_queue/main.cpp new file mode 100644 index 00000000..c753a7c8 --- /dev/null +++ b/hw/unit_tests/generic_queue/main.cpp @@ -0,0 +1,93 @@ +#include "vl_simulator.h" +#include "VVX_fifo_queue.h" +#include + +#define MAX_TICKS 20 + +#define CHECK(x) \ + do { \ + if (x) \ + break; \ + std::cout << "FAILED: " << #x << std::endl; \ + std::abort(); \ + } while (false) + +uint64_t ticks = 0; + +double sc_time_stamp() { + return ticks; +} + +using Device = VVX_fifo_queue; + +int main(int argc, char **argv) { + // Initialize Verilators variables + Verilated::commandArgs(argc, argv); + + vl_simulator sim; + + // run test + ticks = sim.reset(0); + while (ticks < MAX_TICKS) { + switch (ticks) { + case 0: + // initial values + sim->pop = 0; + sim->push = 0; + ticks = sim.step(ticks, 2); + break; + case 2: + // Verify outputs + CHECK(sim->full == 0x0); + CHECK(sim->empty == 0x1); + // push 0xa + sim->pop = 0; + sim->push = 1; + sim->data_in = 0xa; + break; + case 4: + // verify outputs + CHECK(sim->data_out == 0xa); + CHECK(sim->full == 0x0); + CHECK(sim->empty == 0x0); + // push 0xb + sim->pop = 0; + sim->push = 1; + sim->data_in = 0xb; + break; + case 6: + // verify outputs + CHECK(sim->data_out == 0xa); + CHECK(sim->full == 0x1); + CHECK(sim->empty == 0x0); + // pop + sim->pop = 1; + sim->push = 0; + break; + case 8: + // verify outputs + CHECK(sim->data_out == 0xb); + 
CHECK(sim->full == 0x0); + CHECK(sim->empty == 0x0); + // pop + sim->pop = 1; + sim->push = 0; + break; + case 10: + // verify outputs + CHECK(sim->full == 0x0); + CHECK(sim->empty == 0x1); + sim->pop = 0; + sim->push = 0; + break; + } + + // advance clock + ticks = sim.step(ticks, 2); + } + + std::cout << "PASSED!" << std::endl; + std::cout << "Simulation time: " << std::dec << ticks/2 << " cycles" << std::endl; + + return 0; +} \ No newline at end of file diff --git a/hw/unit_tests/generic_queue/vl_simulator.h b/hw/unit_tests/generic_queue/vl_simulator.h new file mode 100644 index 00000000..16486adf --- /dev/null +++ b/hw/unit_tests/generic_queue/vl_simulator.h @@ -0,0 +1,81 @@ +#pragma once + +#include +#include +#include "verilated.h" + +#ifdef VM_TRACE +#include // Trace file format header +#endif + +template +class vl_simulator { +private: + + T top_; +#ifdef VM_TRACE + VerilatedVcdC tfp_; +#endif + +public: + + vl_simulator() { + top_.clk = 0; + top_.reset = 0; + #ifdef VM_TRACE + Verilated::traceEverOn(true); + top_.trace(&tfp_, 99); + tfp_.open("trace.vcd"); + #endif + } + + ~vl_simulator() { + #ifdef VM_TRACE + tfp_.close(); + #endif + top_.final(); + } + + uint64_t reset(uint64_t ticks) { + top_.reset = 1; + ticks = this->step(ticks, 2); + top_.reset = 0; + return ticks; + } + + uint64_t step(uint64_t ticks, uint32_t count = 1) { + while (count--) { + top_.eval(); + #ifdef VM_TRACE + tfp_.dump(ticks); + #endif + top_.clk = !top_.clk; + ++ticks; + } + return ticks; + } + + T* operator->() { + return &top_; + } +}; + +template +void vl_setw(uint32_t* sig, Args&&... args) { + std::array arr{static_cast(std::forward(args))...}; + for (size_t i = 0; i < sizeof... (Args); ++i) { + sig[i] = arr[i]; + } +} + +template +int vl_cmpw(const uint32_t* sig, Args&&... args) { + std::array arr{static_cast(std::forward(args))...}; + for (size_t i = 0; i < sizeof... 
(Args); ++i) { + if (sig[i] < arr[i]) + return -1; + if (sig[i] > arr[i]) + return 1; + } + return 0; +} \ No newline at end of file diff --git a/hw/unit_tests/tex_unit/tex_sampler/Makefile b/hw/unit_tests/tex_unit/tex_sampler/Makefile new file mode 100644 index 00000000..c6de8aa1 --- /dev/null +++ b/hw/unit_tests/tex_unit/tex_sampler/Makefile @@ -0,0 +1,30 @@ +TOP = VX_tex_sampler + +PARAMS ?= + +INCLUDE = -I../../../rtl/ -I../../../rtl/libs -I../../../rtl/tex_unit + +SRCS = main.cpp + +all: build + +CF += -std=c++11 -fms-extensions -I../.. +VF += $(PARAMS) + +VF += --language 1800-2009 --assert -Wall --trace +VF += -Wno-DECLFILENAME +VF += --x-initial unique +VF += -exe $(SRCS) $(INCLUDE) +VF += $(PARAMS) + +gen: + verilator $(VF) -cc $(TOP).v -CFLAGS '$(CF)' --exe $(SRCS) + +build: gen + (cd obj_dir && make -j -f V$(TOP).mk) + +run: build + (cd obj_dir && ./V$(TOP)) + +clean: + rm -rf obj_dir diff --git a/hw/unit_tests/tex_unit/tex_sampler/main.cpp b/hw/unit_tests/tex_unit/tex_sampler/main.cpp new file mode 100644 index 00000000..a67b38cb --- /dev/null +++ b/hw/unit_tests/tex_unit/tex_sampler/main.cpp @@ -0,0 +1,215 @@ +#include "vl_simulator.h" +#include "VVX_tex_sampler.h" +#include +#include + +#define MAX_TICKS 20 +#define MAX_UNIT_CYCLES 5 +#define NUM_THREADS + +#define CHECK(x) \ + do { \ + if (x) \ + break; \ + std::cout << "FAILED: " << #x << std::endl; \ + std::abort(); \ + } while (false) + +uint64_t ticks = 0; + +// using Device = VVX_tex_sampler; + +template +class testbench +{ +private: + vl_simulator sim; + std::map input_map; + std::map output_map; + +public: + + struct UnitTest { + bool use_reset; + unsigned int num_cycles; + bool use_cmodel; + struct Output outputs[MAX_UNIT_CYCLES]; + struct Input inputs[MAX_UNIT_CYCLES]; + unsigned int num_output_check; + unsigned int check_output_cycle[MAX_UNIT_CYCLES]; + } + + struct Input { + bool req_valid; + unsigned int req_wid; + unsigned int req_tmask; + unsigned int req_PC; + unsigned int req_rd; + 
unsigned int req_wb; + unsigned int req_filter; + unsigned int req_format; + unsigned int req_u[NUM_THREADS]; + unsigned int req_v[NUM_THREADS]; + unsigned int req_texels[NUM_THREADS][4]; + bool rsp_ready; + } + + struct Output { + int output_cycle; + // outputs + bool req_ready; + bool rsp_valid; + unsigned int rsp_wid; + unsigned int rsp_tmask; + unsigned int rsp_PC; + unsigned int rsp_rd; + bool rsp_wb; + unsigned int rsp_data[NUM_THREADS]; + } + + testbench(/* args */){ + + } + + ~testbench(){ + } + + void unittest_Cmodel(struct UnitTest * test){ + int cycles = test->num_cycles; + int num_outputs = test->num_output_check; + + // struct Input* inputs = new (struct Input)[cycles]; + struct Output* outputs = new (struct Output)[num_outputs]; + + // implement c model and assign outputs to struct + + if (test->inputs[0]->req_filter == 0){ + for (int i = 0; i < NUM_THREADS; i++) + outputs[0]->rsp_data[0] = test->inputs->req_texels[i][0]; + } else { + // for (int i = 0; i < NUM_THREADS; i++){ + // uint32_t low[4], high[4]; + // for (int j = 0; j < 4; j++){ + // low[j] = test->inputs->req_texels[i][j] & 0x00ff00ff; + // high[j] = (test->inputs->req_texels[i][j] >> 8) & 0x00ff00ff; + // } + + // } + } + outputs[0]->output_cycle = 1; + test->num_cycles = 1; + test->outputs = &outputs; + + } + + void generate_test_vectors(struct UnitTest * tests, int num_tests, bool is_pipe){ + // for all unit tests create output test vectors (w w/o c-model) + int prev_test_cycle = 0; + + for (int i = 0; i < num_tests; i++) + { + int op_counter = 0; + int ip_counter = 0; + + int test_cycle = 0; + int last_ip_cycle = 0; + + struct UnitTest curr_test = tests[i]; + + if (curr_test->use_cmodel){ + unittest_Cmodel(&curr_test); + } + + for (int j = 0; j < curr_test->num_cycles; j++) + { + if (curr_test->inputs[ip_counter]->input_cycle == test_cycle){ + input_map.insert(std::make_pair(prev_test_cycle + test_cycle, curr_test->inputs[j])); + last_ip_cycle = prev_test_cycle + test_cycle; + 
ip_counter++; + } + + if (curr_test->outputs[op_counter]->output_cycle == test_cycle){ + output_map.insert(std::make_pair(prev_test_cycle + test_cycle, curr_test->outputs[op_counter])); + op_counter++; + } + + test_cycle++; + } + + if(!is_pipe){ + prev_test_cycle += (test_cycle - 1); + } + else{ + prev_test_cycle = last_ip_cycle + 1; + } + + } + + } + + void run(){ + + ticks = sim.reset(0); + int cycle = 0; + + while (ticks < MAX_TICKS) { + + auto input = input_map.find(cycle); + auto output = output_map.find(cycle); + + if (input != input_map.end()){ + sim->req_valid = input->req_valid; + sim->req_wid = input->req_wid; + sim->req_tmask = input->req_tmask; + sim->req_PC = input->req_PC; + sim->req_rd = input->req_rd; + sim->req_wb = input->req_wb; + sim->req_filter = input->req_filter; + sim->req_format = input->req_format; + // sim->req_u = input->req_u[NUM_THREADS]; + // sim->req_v = input->req_v[NUM_THREADS]; + vl_setw(sim->req_texels, input->req_texels) + // sim->req_texels = input->req_texels[NUM_THREADS][4]; + sim->rsp_ready = input->rsp_ready; + } else{ + std::cout << "Warning! No Input on Cycle " << cycle << std::endl; + } + + if(output != output_map.end()){ + CHECK(sim->req_ready == output->req_ready); + CHECK(sim->rsp_valid == output->rsp_valid); + CHECK(sim->rsp_wid == output->rsp_wid); + CHECK(sim->rsp_tmask == output->rsp_tmask); + CHECK(sim->rsp_PC == output->rsp_PC); + CHECK(sim->rsp_rd == output->rsp_rd); + CHECK(sim->rsp_wb == output->rsp_wb); + CHECK(vl_cmpw(sim->rsp_data, output->rsp_data)); + } + + cycle++; + ticks = sim.step(ticks,2); + } + } + + std::cout << "PASSED!" 
<< std::endl; + std::cout << "Simulation time: " << std::dec << ticks/2 << " cycles" << std::endl; + +}; + + +double sc_time_stamp() { + return ticks; +} + +int main(int argc, char **argv) { + // Initialize Verilators variables + Verilated::commandArgs(argc, argv); + + testbench sampler_testbench; + + sampler_testbench.generate_test_vectors(tests, 1, 0); + sampler_test_bench.run(); + + + return 0; +} \ No newline at end of file diff --git a/hw/unit_tests/tex_unit/tex_sampler/vl_simulator.h b/hw/unit_tests/tex_unit/tex_sampler/vl_simulator.h new file mode 100644 index 00000000..16486adf --- /dev/null +++ b/hw/unit_tests/tex_unit/tex_sampler/vl_simulator.h @@ -0,0 +1,81 @@ +#pragma once + +#include +#include +#include "verilated.h" + +#ifdef VM_TRACE +#include // Trace file format header +#endif + +template +class vl_simulator { +private: + + T top_; +#ifdef VM_TRACE + VerilatedVcdC tfp_; +#endif + +public: + + vl_simulator() { + top_.clk = 0; + top_.reset = 0; + #ifdef VM_TRACE + Verilated::traceEverOn(true); + top_.trace(&tfp_, 99); + tfp_.open("trace.vcd"); + #endif + } + + ~vl_simulator() { + #ifdef VM_TRACE + tfp_.close(); + #endif + top_.final(); + } + + uint64_t reset(uint64_t ticks) { + top_.reset = 1; + ticks = this->step(ticks, 2); + top_.reset = 0; + return ticks; + } + + uint64_t step(uint64_t ticks, uint32_t count = 1) { + while (count--) { + top_.eval(); + #ifdef VM_TRACE + tfp_.dump(ticks); + #endif + top_.clk = !top_.clk; + ++ticks; + } + return ticks; + } + + T* operator->() { + return &top_; + } +}; + +template +void vl_setw(uint32_t* sig, Args&&... args) { + std::array arr{static_cast(std::forward(args))...}; + for (size_t i = 0; i < sizeof... (Args); ++i) { + sig[i] = arr[i]; + } +} + +template +int vl_cmpw(const uint32_t* sig, Args&&... args) { + std::array arr{static_cast(std::forward(args))...}; + for (size_t i = 0; i < sizeof... 
(Args); ++i) { + if (sig[i] < arr[i]) + return -1; + if (sig[i] > arr[i]) + return 1; + } + return 0; +} \ No newline at end of file diff --git a/runtime/include/vx_intrinsics.h b/runtime/include/vx_intrinsics.h index 5c10b6c9..9c3149d7 100644 --- a/runtime/include/vx_intrinsics.h +++ b/runtime/include/vx_intrinsics.h @@ -5,7 +5,62 @@ #ifdef __cplusplus extern "C" { + #endif +#ifdef __ASSEMBLY__ +#define __ASM_STR(x) x +#else +#define __ASM_STR(x) #x +#endif + +#define vx_csr_swap(csr, val) ({ \ + unsigned __v = (unsigned )(val); \ + __asm__ __volatile__ ("csrrw %0, " __ASM_STR(csr) ", %1" : "=r" (__v) : "rK" (__v) : "memory"); \ + __v; \ +}) + +#define vx_csr_read(csr) ({ \ + register unsigned __v; \ + __asm__ __volatile__ ("csrr %0, " __ASM_STR(csr) : "=r" (__v) :: "memory"); \ + __v; \ +}) + +#define vx_csr_write(csr, val) ({ \ + unsigned __v = (unsigned )(val); \ + __asm__ __volatile__ ("csrw " __ASM_STR(csr) ", %0" :: "rK" (__v) : "memory"); \ +}) + +#define vx_csr_read_set(csr, val) ({ \ + unsigned __v = (unsigned )(val); \ + __asm__ __volatile__ ("csrrs %0, " __ASM_STR(csr) ", %1" : "=r" (__v) : "rK" (__v) : "memory"); \ + __v; \ +}) + +#define vx_csr_set(csr, val) ({ \ + unsigned __v = (unsigned )(val); \ + __asm__ __volatile__ ("csrs " __ASM_STR(csr) ", %0" :: "rK" (__v) : "memory"); \ +}) + +#define vx_csr_read_clear(csr, val) ({ \ + unsigned __v = (unsigned )(val); \ + __asm__ __volatile__ ("csrrc %0, " __ASM_STR(csr) ", %1" : "=r" (__v) : "rK" (__v) : "memory"); \ + __v; \ +}) + +#define vx_csr_clear(csr, val) ({ \ + unsigned __v = (unsigned )(val); \ + __asm__ __volatile__ ("csrc " __ASM_STR(csr) ", %0" :: "rK" (__v) : "memory"); \ +}) + +// Texture load +#define vx_tex(unit, u, v, l) ({ \ + unsigned __r; \ + unsigned __u = u; \ + unsigned __v = v; \ + unsigned __l = l; \ + __asm__ __volatile__ (".insn r4 0x6b, 5, " __ASM_STR(unit) ", %0, %1, %2, %3" : "=r"(__r) : "r"(__u), "r"(__v), "r"(__l)); \ + __r; \ +}) #ifdef __ASSEMBLY__ #define __ASM_STR(x) x 
@@ -52,6 +107,16 @@ extern "C" { __asm__ __volatile__ ("csrc " __ASM_STR(csr) ", %0" :: "rK" (__v) : "memory"); \ }) +// Texture load +#define vx_tex(unit, u, v, l) ({ \ + unsigned __r; \ + unsigned __u = u; \ + unsigned __v = v; \ + unsigned __l = l; \ + __asm__ __volatile__ (".insn r4 0x6b, 5, " __ASM_STR(unit) ", %0, %1, %2, %3" : "=r"(__r) : "r"(__u), "r"(__v), "r"(__l)); \ + __r; \ +}) + // Set thread mask inline void vx_tmc(unsigned thread_mask) { asm volatile (".insn s 0x6b, 0, x0, 0(%0)" :: "r"(thread_mask)); @@ -86,7 +151,7 @@ inline void vx_barrier(unsigned barried_id, unsigned num_warps) { // Prefetch inline void vx_prefetch(unsigned addr) { - asm volatile (".insn s 0x6b, 5, x0, 0(%0)" :: "r"(addr) ); + asm volatile (".insn s 0x6b, 6, x0, 0(%0)" :: "r"(addr) ); } // Return active warp's thread id @@ -170,6 +235,8 @@ inline void vx_fence() { #define __endif vx_join(); +#define __DIVERGENT__ __attribute__((annotate("divergent"))) + #ifdef __cplusplus } #endif diff --git a/runtime/src/vx_print.c b/runtime/src/vx_print.c index b43cdd4a..86458644 100644 --- a/runtime/src/vx_print.c +++ b/runtime/src/vx_print.c @@ -34,7 +34,7 @@ int vx_vprintf(const char* format, va_list va) { printf_arg_t arg; arg.format = format; arg.va = &va; - vx_serial(__printf_cb, &arg); + vx_serial((vx_serial_cb)__printf_cb, &arg); return arg.ret; } @@ -63,7 +63,7 @@ void vx_putint(int value, int base) { putint_arg_t arg; arg.value = value; arg.base = base; - vx_serial(__putint_cb, &arg); + vx_serial((vx_serial_cb)__putint_cb, &arg); } static void __putfloat_cb(const putfloat_arg_t* arg) { @@ -83,7 +83,7 @@ void vx_putfloat(float value, int precision) { putfloat_arg_t arg; arg.value = value; arg.precision = precision; - vx_serial(__putfloat_cb, &arg); + vx_serial((vx_serial_cb)__putfloat_cb, &arg); } #ifdef __cplusplus diff --git a/sim/rtlsim/Makefile b/sim/rtlsim/Makefile index 6059e711..a0c8d339 100644 --- a/sim/rtlsim/Makefile +++ b/sim/rtlsim/Makefile @@ -1,32 +1,34 @@ 
RTL_DIR=../../hw/rtl DPI_DIR=../../hw/dpi -CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors +CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors -Wno-array-bounds CXXFLAGS += -fPIC -Wno-maybe-uninitialized CXXFLAGS += -I../../../hw -I../../common CXXFLAGS += -I../../common/softfloat/source/include LDFLAGS += ../../common/softfloat/build/Linux-x86_64-GCC/softfloat.a -# control RTL debug print states -DBG_PRINT_FLAGS += -DDBG_PRINT_PIPELINE -DBG_PRINT_FLAGS += -DDBG_PRINT_CORE_ICACHE -DBG_PRINT_FLAGS += -DDBG_PRINT_CORE_DCACHE -DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_BANK -DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_MSHR -DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_TAG -DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_DATA -DBG_PRINT_FLAGS += -DDBG_PRINT_MEM -DBG_PRINT_FLAGS += -DDBG_PRINT_OPAE -DBG_PRINT_FLAGS += -DDBG_PRINT_AVS -DBG_PRINT_FLAGS += -DDBG_PRINT_SCOPE +# control RTL debug tracing states +DBG_TRACE_FLAGS += -DDBG_TRACE_PIPELINE +DBG_TRACE_FLAGS += -DDBG_TRACE_CORE_ICACHE +DBG_TRACE_FLAGS += -DDBG_TRACE_CORE_DCACHE +DBG_TRACE_FLAGS += -DDBG_TRACE_CACHE_BANK +DBG_TRACE_FLAGS += -DDBG_TRACE_CACHE_MSHR +DBG_TRACE_FLAGS += -DDBG_TRACE_CACHE_TAG +DBG_TRACE_FLAGS += -DDBG_TRACE_CACHE_DATA +DBG_TRACE_FLAGS += -DDBG_TRACE_MEM +DBG_TRACE_FLAGS += -DDBG_TRACE_OPAE +DBG_TRACE_FLAGS += -DDBG_TRACE_AVS +DBG_TRACE_FLAGS += -DDBG_TRACE_SCOPE +DBG_TRACE_FLAGS += -DDBG_TRACE_TEX -DBG_FLAGS += $(DBG_PRINT_FLAGS) +DBG_FLAGS += $(DBG_TRACE_FLAGS) DBG_FLAGS += -DDBG_CACHE_REQ_INFO DBG_FLAGS += -DVCD_OUTPUT FPU_INCLUDE = -I$(RTL_DIR)/fp_cores -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src -I$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl -I$(RTL_DIR)/fp_cores/fpnew/src -RTL_INCLUDE = -I$(RTL_DIR) -I$(DPI_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/cache -I$(RTL_DIR)/simulate $(FPU_INCLUDE) +TEX_INCLUDE = -I$(RTL_DIR)/tex_unit +RTL_INCLUDE = -I$(RTL_DIR) -I$(DPI_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/cache 
-I$(RTL_DIR)/simulate $(FPU_INCLUDE) $(TEX_INCLUDE) SRCS = ../common/util.cpp ../common/mem.cpp ../common/rvfloats.cpp SRCS += $(DPI_DIR)/util_dpi.cpp $(DPI_DIR)/float_dpi.cpp diff --git a/sim/simX/decode.cpp b/sim/simX/decode.cpp index c23427bb..dbc7115a 100644 --- a/sim/simX/decode.cpp +++ b/sim/simX/decode.cpp @@ -182,7 +182,7 @@ static const char* op_string(const Instr &instr) { case 2: return "SPLIT"; case 3: return "JOIN"; case 4: return "BAR"; - case 5: return "PREFETCH"; + case 6: return "PREFETCH"; default: std::abort(); } diff --git a/sim/simX/execute.cpp b/sim/simX/execute.cpp index c3f9ba0f..47bf4e04 100644 --- a/sim/simX/execute.cpp +++ b/sim/simX/execute.cpp @@ -712,7 +712,7 @@ void Warp::execute(const Instr &instr, Pipeline *pipeline) { pipeline->stall_warp = true; runOnce = true; } break; - case 5: { + case 6: { // PREFETCH int addr = rsdata[0]; printf("*** PREFETCHED %d ***\n", addr); diff --git a/sim/vlsim/Makefile b/sim/vlsim/Makefile index 7de01df9..ce01395d 100644 --- a/sim/vlsim/Makefile +++ b/sim/vlsim/Makefile @@ -2,27 +2,28 @@ RTL_DIR = ../../hw/rtl DPI_DIR = ../../hw/dpi SCRIPT_DIR=../../hw/scripts -CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors +CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors -Wno-array-bounds CXXFLAGS += -fPIC -Wno-maybe-uninitialized CXXFLAGS += -I.. 
-I../../../hw -I../../common CXXFLAGS += -I../../common/softfloat/source/include LDFLAGS += -shared ../../common/softfloat/build/Linux-x86_64-GCC/softfloat.a -# control RTL debug print states -DBG_PRINT_FLAGS += -DDBG_PRINT_PIPELINE -DBG_PRINT_FLAGS += -DDBG_PRINT_CORE_ICACHE -DBG_PRINT_FLAGS += -DDBG_PRINT_CORE_DCACHE -DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_BANK -DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_MSHR -DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_TAG -DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_DATA -DBG_PRINT_FLAGS += -DDBG_PRINT_MEM -DBG_PRINT_FLAGS += -DDBG_PRINT_OPAE -DBG_PRINT_FLAGS += -DDBG_PRINT_AVS -DBG_PRINT_FLAGS += -DDBG_PRINT_SCOPE +# control RTL debug tracing states +DBG_TRACE_FLAGS += -DDBG_TRACE_PIPELINE +DBG_TRACE_FLAGS += -DDBG_TRACE_CORE_ICACHE +DBG_TRACE_FLAGS += -DDBG_TRACE_CORE_DCACHE +DBG_TRACE_FLAGS += -DDBG_TRACE_CACHE_BANK +DBG_TRACE_FLAGS += -DDBG_TRACE_CACHE_MSHR +DBG_TRACE_FLAGS += -DDBG_TRACE_CACHE_TAG +DBG_TRACE_FLAGS += -DDBG_TRACE_CACHE_DATA +DBG_TRACE_FLAGS += -DDBG_TRACE_MEM +DBG_TRACE_FLAGS += -DDBG_TRACE_OPAE +DBG_TRACE_FLAGS += -DDBG_TRACE_AVS +DBG_TRACE_FLAGS += -DDBG_TRACE_SCOPE +DBG_TRACE_FLAGS += -DDBG_TRACE_TEX -DBG_FLAGS += $(DBG_PRINT_FLAGS) +DBG_FLAGS += $(DBG_TRACE_FLAGS) DBG_FLAGS += -DDBG_CACHE_REQ_INFO SRCS = ../common/util.cpp ../common/mem.cpp ../common/rvfloats.cpp @@ -30,7 +31,8 @@ SRCS += $(DPI_DIR)/util_dpi.cpp $(DPI_DIR)/float_dpi.cpp SRCS += fpga.cpp opae_sim.cpp FPU_INCLUDE = -I$(RTL_DIR)/fp_cores -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src -I$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl -I$(RTL_DIR)/fp_cores/fpnew/src -RTL_INCLUDE = -I$(RTL_DIR) -I$(DPI_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/cache $(FPU_INCLUDE) +TEX_INCLUDE = -I$(RTL_DIR)/tex_unit +RTL_INCLUDE = -I$(RTL_DIR) -I$(DPI_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/cache $(FPU_INCLUDE) $(TEX_INCLUDE) RTL_INCLUDE += -I$(RTL_DIR)/afu -I$(RTL_DIR)/afu/ccip TOP = 
vortex_afu_shim @@ -84,12 +86,12 @@ VL_FLAGS += -D$(FPU_CORE) PROJECT = libopae-c-vlsim -all: shared +all: $(PROJECT).so vortex_afu.h : $(RTL_DIR)/afu/vortex_afu.vh $(SCRIPT_DIR)/gen_config.py -i $(RTL_DIR)/afu/vortex_afu.vh -o vortex_afu.h -shared: $(SRCS) vortex_afu.h +$(PROJECT).so: $(SRCS) vortex_afu.h verilator --build $(VL_FLAGS) $(SRCS) -CFLAGS '$(CXXFLAGS)' -LDFLAGS '$(LDFLAGS)' -o ../$(PROJECT).so static: $(SRCS) vortex_afu.h diff --git a/tests/opencl/guassian/clutils.cpp b/tests/opencl/guassian/clutils.cpp index c977477a..e10fcba2 100755 --- a/tests/opencl/guassian/clutils.cpp +++ b/tests/opencl/guassian/clutils.cpp @@ -421,7 +421,7 @@ cl_context cl_init_context(int platform, int dev,int quiet) { #else commandQueue = clCreateCommandQueue(context, - devices[device_touse], NULL, &status); + devices[device_touse], 0, &status); #endif // PROFILING @@ -451,8 +451,8 @@ void cl_cleanup() printf("clReleaseContext()\n"); } - for (int p = 0; p < numPlatforms; ++p) { - for (int d = 0; d < numDevices[p]; ++d) { + for (cl_uint p = 0; p < numPlatforms; ++p) { + for (cl_uint d = 0; d < numDevices[p]; ++d) { status = clReleaseDevice(devices[d]); cl_errChk(status, "Oops!", true); printf("clReleaseDevice()\n"); diff --git a/tests/opencl/nearn/clutils.cpp b/tests/opencl/nearn/clutils.cpp index c977477a..e10fcba2 100755 --- a/tests/opencl/nearn/clutils.cpp +++ b/tests/opencl/nearn/clutils.cpp @@ -421,7 +421,7 @@ cl_context cl_init_context(int platform, int dev,int quiet) { #else commandQueue = clCreateCommandQueue(context, - devices[device_touse], NULL, &status); + devices[device_touse], 0, &status); #endif // PROFILING @@ -451,8 +451,8 @@ void cl_cleanup() printf("clReleaseContext()\n"); } - for (int p = 0; p < numPlatforms; ++p) { - for (int d = 0; d < numDevices[p]; ++d) { + for (cl_uint p = 0; p < numPlatforms; ++p) { + for (cl_uint d = 0; d < numDevices[p]; ++d) { status = clReleaseDevice(devices[d]); cl_errChk(status, "Oops!", true); printf("clReleaseDevice()\n"); diff 
--git a/tests/opencl/oclprintf/Makefile b/tests/opencl/oclprintf/Makefile index 92df9612..2c2fffa5 100644 --- a/tests/opencl/oclprintf/Makefile +++ b/tests/opencl/oclprintf/Makefile @@ -16,13 +16,13 @@ K_LDFLAGS += "-Wl,-Bstatic,-T$(VORTEX_RT_PATH)/linker/vx_link.ld -Wl,--gc-secti CXXFLAGS += -std=c++11 -O2 -Wall -Wextra -Wfatal-errors #CXXFLAGS += -std=c++11 -O0 -g -Wall -Wextra -Wfatal-errors -CXXFLAGS += -Wno-deprecated-declarations -Wno-unused-parameter +CXXFLAGS += -Wno-deprecated-declarations -Wno-unused-parameter -Wno-narrowing CXXFLAGS += -I$(POCL_RT_PATH)/include LDFLAGS += -L$(POCL_RT_PATH)/lib -L$(VORTEX_DRV_PATH)/stub -lOpenCL -lvortex -PROJECT = printf +PROJECT = oclprintf SRCS = main.cc diff --git a/tests/opencl/oclprintf/kernel.pocl b/tests/opencl/oclprintf/kernel.pocl index a687db75..e615c954 100644 Binary files a/tests/opencl/oclprintf/kernel.pocl and b/tests/opencl/oclprintf/kernel.pocl differ diff --git a/tests/opencl/psort/Makefile b/tests/opencl/psort/Makefile index e7795db0..747e185b 100644 --- a/tests/opencl/psort/Makefile +++ b/tests/opencl/psort/Makefile @@ -16,7 +16,7 @@ K_LDFLAGS += "-Wl,-Bstatic,-T$(VORTEX_RT_PATH)/linker/vx_link.ld -Wl,--gc-secti CXXFLAGS += -std=c++11 -O2 -Wall -Wextra -Wfatal-errors #CXXFLAGS += -std=c++11 -O0 -g -Wall -Wextra -Wfatal-errors -CXXFLAGS += -Wno-deprecated-declarations -Wno-unused-parameter +CXXFLAGS += -Wno-deprecated-declarations -Wno-unused-parameter -Wno-strict-aliasing -Wno-narrowing CXXFLAGS += -I$(POCL_RT_PATH)/include diff --git a/tests/opencl/results.txt b/tests/opencl/results.txt deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/opencl/saxpy/Makefile b/tests/opencl/saxpy/Makefile index a4a2db87..6dc44a19 100644 --- a/tests/opencl/saxpy/Makefile +++ b/tests/opencl/saxpy/Makefile @@ -16,7 +16,7 @@ K_LDFLAGS += "-Wl,-Bstatic,-T$(VORTEX_RT_PATH)/linker/vx_link.ld -Wl,--gc-secti CXXFLAGS += -std=c++11 -O2 -Wall -Wextra -Wfatal-errors #CXXFLAGS += -std=c++11 -O0 -g -Wall -Wextra 
-Wfatal-errors -CXXFLAGS += -Wno-deprecated-declarations -Wno-unused-parameter +CXXFLAGS += -Wno-deprecated-declarations -Wno-unused-parameter -Wno-narrowing CXXFLAGS += -I$(POCL_RT_PATH)/include diff --git a/tests/opencl/saxpy/main.cc b/tests/opencl/saxpy/main.cc index cf090486..afe297ea 100644 --- a/tests/opencl/saxpy/main.cc +++ b/tests/opencl/saxpy/main.cc @@ -157,7 +157,7 @@ int main(int argc, char **argv) { context = CL_CHECK_ERR(clCreateContext(NULL, 1, &device_id, &pfn_notify, NULL, &_err)); cl_command_queue queue; - queue = CL_CHECK_ERR(clCreateCommandQueue(context, device_id, NULL, &_err)); + queue = CL_CHECK_ERR(clCreateCommandQueue(context, device_id, 0, &_err)); cl_kernel kernel = 0; cl_mem memObjects[2] = {0, 0}; diff --git a/tests/opencl/sfilter/Makefile b/tests/opencl/sfilter/Makefile index bf9849bb..423a8976 100644 --- a/tests/opencl/sfilter/Makefile +++ b/tests/opencl/sfilter/Makefile @@ -16,7 +16,7 @@ K_LDFLAGS += "-Wl,-Bstatic,-T$(VORTEX_RT_PATH)/linker/vx_link.ld -Wl,--gc-secti CXXFLAGS += -std=c++11 -O2 -Wall -Wextra -Wfatal-errors #CXXFLAGS += -std=c++11 -O0 -g -Wall -Wextra -Wfatal-errors -CXXFLAGS += -Wno-deprecated-declarations -Wno-unused-parameter +CXXFLAGS += -Wno-deprecated-declarations -Wno-unused-parameter -Wno-narrowing CXXFLAGS += -I$(POCL_RT_PATH)/include diff --git a/tests/opencl/sgemm/Makefile b/tests/opencl/sgemm/Makefile index adb0b79e..64c3b818 100644 --- a/tests/opencl/sgemm/Makefile +++ b/tests/opencl/sgemm/Makefile @@ -16,7 +16,7 @@ K_LDFLAGS += "-Wl,-Bstatic,-T$(VORTEX_RT_PATH)/linker/vx_link.ld -Wl,--gc-secti CXXFLAGS += -std=c++11 -O2 -Wall -Wextra -Wfatal-errors #CXXFLAGS += -std=c++11 -O0 -g -Wall -Wextra -Wfatal-errors -CXXFLAGS += -Wno-deprecated-declarations -Wno-unused-parameter +CXXFLAGS += -Wno-deprecated-declarations -Wno-unused-parameter -Wno-narrowing CXXFLAGS += -I$(POCL_RT_PATH)/include diff --git a/tests/opencl/vecadd/Makefile b/tests/opencl/vecadd/Makefile index b08a0be7..76db3f46 100644 --- 
a/tests/opencl/vecadd/Makefile +++ b/tests/opencl/vecadd/Makefile @@ -16,7 +16,7 @@ K_LDFLAGS += "-Wl,-Bstatic,-T$(VORTEX_RT_PATH)/linker/vx_link.ld -Wl,--gc-secti CXXFLAGS += -std=c++11 -O2 -Wall -Wextra -Wfatal-errors #CXXFLAGS += -std=c++11 -O0 -g -Wall -Wextra -Wfatal-errors -CXXFLAGS += -Wno-deprecated-declarations -Wno-unused-parameter +CXXFLAGS += -Wno-deprecated-declarations -Wno-unused-parameter -Wno-narrowing CXXFLAGS += -I$(POCL_RT_PATH)/include diff --git a/tests/regression/Makefile b/tests/regression/Makefile index 58b246c8..157d74a6 100644 --- a/tests/regression/Makefile +++ b/tests/regression/Makefile @@ -20,7 +20,7 @@ run-simx: $(MAKE) -C io_addr run-simx $(MAKE) -C printf run-simx $(MAKE) -C diverge run-simx - #$(MAKE) -C sort run-simx + $(MAKE) -C sort run-simx $(MAKE) -C fence run-simx $(MAKE) -C no_mf_ext run-simx $(MAKE) -C no_smem run-simx @@ -34,7 +34,7 @@ run-rtlsim: $(MAKE) -C io_addr run-rtlsim $(MAKE) -C printf run-rtlsim $(MAKE) -C diverge run-rtlsim - #$(MAKE) -C sort run-rtlsim + $(MAKE) -C sort run-rtlsim $(MAKE) -C fence run-rtlsim $(MAKE) -C no_mf_ext run-rtlsim $(MAKE) -C no_smem run-rtlsim @@ -48,7 +48,7 @@ run-vlsim: $(MAKE) -C io_addr run-vlsim $(MAKE) -C printf run-vlsim $(MAKE) -C diverge run-vlsim - #$(MAKE) -C sort run-vlsim + $(MAKE) -C sort run-vlsim $(MAKE) -C fence run-vlsim $(MAKE) -C no_mf_ext run-vlsim $(MAKE) -C no_smem run-vlsim @@ -81,4 +81,3 @@ clean-all: $(MAKE) -C no_mf_ext clean-all $(MAKE) -C no_smem clean-all $(MAKE) -C prefetch clean-all - diff --git a/tests/regression/basic/kernel.c b/tests/regression/basic/kernel.c index 5279d156..bc5ec076 100644 --- a/tests/regression/basic/kernel.c +++ b/tests/regression/basic/kernel.c @@ -3,7 +3,7 @@ #include "common.h" void main() { - const kernel_arg_t* arg = (const kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR; + kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR; uint32_t count = arg->count; int32_t* src_ptr = (int32_t*)arg->src_ptr; int32_t* dst_ptr = 
(int32_t*)arg->dst_ptr; diff --git a/tests/regression/demo/kernel.c b/tests/regression/demo/kernel.c index 7e2b5dcd..40fe4273 100644 --- a/tests/regression/demo/kernel.c +++ b/tests/regression/demo/kernel.c @@ -3,7 +3,7 @@ #include #include "common.h" -void kernel_body(int task_id, const kernel_arg_t* arg) { +void kernel_body(int task_id, kernel_arg_t* arg) { uint32_t count = arg->task_size; int32_t* src0_ptr = (int32_t*)arg->src0_ptr; int32_t* src1_ptr = (int32_t*)arg->src1_ptr; @@ -17,6 +17,6 @@ void kernel_body(int task_id, const kernel_arg_t* arg) { } void main() { - const kernel_arg_t* arg = (const kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR; - vx_spawn_tasks(arg->num_tasks, kernel_body, arg); + kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR; + vx_spawn_tasks(arg->num_tasks, (vx_spawn_tasks_cb)kernel_body, arg); } \ No newline at end of file diff --git a/tests/regression/diverge/kernel.c b/tests/regression/diverge/kernel.c index 5d0745a1..98fd2b0e 100644 --- a/tests/regression/diverge/kernel.c +++ b/tests/regression/diverge/kernel.c @@ -5,7 +5,7 @@ // Parallel Selection sort -void kernel_body(int task_id, const kernel_arg_t* arg) { +void kernel_body(int task_id, kernel_arg_t* arg) { int32_t* src_ptr = (int32_t*)arg->src_ptr; int32_t* dst_ptr = (int32_t*)arg->dst_ptr; @@ -44,6 +44,6 @@ void kernel_body(int task_id, const kernel_arg_t* arg) { } void main() { - const kernel_arg_t* arg = (const kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR; - vx_spawn_tasks(arg->num_points, kernel_body, arg); + kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR; + vx_spawn_tasks(arg->num_points, (vx_spawn_tasks_cb)kernel_body, arg); } \ No newline at end of file diff --git a/tests/regression/dogfood/kernel.c b/tests/regression/dogfood/kernel.c index f61e6a4e..ce65f0d8 100644 --- a/tests/regression/dogfood/kernel.c +++ b/tests/regression/dogfood/kernel.c @@ -4,14 +4,14 @@ #include #include "common.h" -typedef void (*PFN_Kernel)(int task_id, const kernel_arg_t* arg); +typedef 
void (*PFN_Kernel)(int task_id, kernel_arg_t* arg); inline float __ieee754_sqrtf (float x) { asm ("fsqrt.s %0, %1" : "=f" (x) : "f" (x)); return x; } -void kernel_iadd(int task_id, const kernel_arg_t* arg) { +void kernel_iadd(int task_id, kernel_arg_t* arg) { uint32_t count = arg->task_size; int32_t* src0_ptr = (int32_t*)arg->src0_ptr; int32_t* src1_ptr = (int32_t*)arg->src1_ptr; @@ -26,7 +26,7 @@ void kernel_iadd(int task_id, const kernel_arg_t* arg) { } } -void kernel_imul(int task_id, const kernel_arg_t* arg) { +void kernel_imul(int task_id, kernel_arg_t* arg) { uint32_t count = arg->task_size; int32_t* src0_ptr = (int32_t*)arg->src0_ptr; int32_t* src1_ptr = (int32_t*)arg->src1_ptr; @@ -41,7 +41,7 @@ void kernel_imul(int task_id, const kernel_arg_t* arg) { } } -void kernel_idiv(int task_id, const kernel_arg_t* arg) { +void kernel_idiv(int task_id, kernel_arg_t* arg) { uint32_t count = arg->task_size; int32_t* src0_ptr = (int32_t*)arg->src0_ptr; int32_t* src1_ptr = (int32_t*)arg->src1_ptr; @@ -56,7 +56,7 @@ void kernel_idiv(int task_id, const kernel_arg_t* arg) { } } -void kernel_idiv_mul(int task_id, const kernel_arg_t* arg) { +void kernel_idiv_mul(int task_id, kernel_arg_t* arg) { uint32_t count = arg->task_size; int32_t* src0_ptr = (int32_t*)arg->src0_ptr; int32_t* src1_ptr = (int32_t*)arg->src1_ptr; @@ -73,7 +73,7 @@ void kernel_idiv_mul(int task_id, const kernel_arg_t* arg) { } } -void kernel_fadd(int task_id, const kernel_arg_t* arg) { +void kernel_fadd(int task_id, kernel_arg_t* arg) { uint32_t count = arg->task_size; float* src0_ptr = (float*)arg->src0_ptr; float* src1_ptr = (float*)arg->src1_ptr; @@ -88,7 +88,7 @@ void kernel_fadd(int task_id, const kernel_arg_t* arg) { } } -void kernel_fsub(int task_id, const kernel_arg_t* arg) { +void kernel_fsub(int task_id, kernel_arg_t* arg) { uint32_t count = arg->task_size; float* src0_ptr = (float*)arg->src0_ptr; float* src1_ptr = (float*)arg->src1_ptr; @@ -103,7 +103,7 @@ void kernel_fsub(int task_id, const 
kernel_arg_t* arg) { } } -void kernel_fmul(int task_id, const kernel_arg_t* arg) { +void kernel_fmul(int task_id, kernel_arg_t* arg) { uint32_t count = arg->task_size; float* src0_ptr = (float*)arg->src0_ptr; float* src1_ptr = (float*)arg->src1_ptr; @@ -118,7 +118,7 @@ void kernel_fmul(int task_id, const kernel_arg_t* arg) { } } -void kernel_fmadd(int task_id, const kernel_arg_t* arg) { +void kernel_fmadd(int task_id, kernel_arg_t* arg) { uint32_t count = arg->task_size; float* src0_ptr = (float*)arg->src0_ptr; float* src1_ptr = (float*)arg->src1_ptr; @@ -133,7 +133,7 @@ void kernel_fmadd(int task_id, const kernel_arg_t* arg) { } } -void kernel_fmsub(int task_id, const kernel_arg_t* arg) { +void kernel_fmsub(int task_id, kernel_arg_t* arg) { uint32_t count = arg->task_size; float* src0_ptr = (float*)arg->src0_ptr; float* src1_ptr = (float*)arg->src1_ptr; @@ -148,7 +148,7 @@ void kernel_fmsub(int task_id, const kernel_arg_t* arg) { } } -void kernel_fnmadd(int task_id, const kernel_arg_t* arg) { +void kernel_fnmadd(int task_id, kernel_arg_t* arg) { uint32_t count = arg->task_size; float* src0_ptr = (float*)arg->src0_ptr; float* src1_ptr = (float*)arg->src1_ptr; @@ -163,7 +163,7 @@ void kernel_fnmadd(int task_id, const kernel_arg_t* arg) { } } -void kernel_fnmsub(int task_id, const kernel_arg_t* arg) { +void kernel_fnmsub(int task_id, kernel_arg_t* arg) { uint32_t count = arg->task_size; float* src0_ptr = (float*)arg->src0_ptr; float* src1_ptr = (float*)arg->src1_ptr; @@ -178,7 +178,7 @@ void kernel_fnmsub(int task_id, const kernel_arg_t* arg) { } } -void kernel_fnmadd_madd(int task_id, const kernel_arg_t* arg) { +void kernel_fnmadd_madd(int task_id, kernel_arg_t* arg) { uint32_t count = arg->task_size; float* src0_ptr = (float*)arg->src0_ptr; float* src1_ptr = (float*)arg->src1_ptr; @@ -195,7 +195,7 @@ void kernel_fnmadd_madd(int task_id, const kernel_arg_t* arg) { } } -void kernel_fdiv(int task_id, const kernel_arg_t* arg) { +void kernel_fdiv(int task_id, 
kernel_arg_t* arg) { uint32_t count = arg->task_size; float* src0_ptr = (float*)arg->src0_ptr; float* src1_ptr = (float*)arg->src1_ptr; @@ -210,7 +210,7 @@ void kernel_fdiv(int task_id, const kernel_arg_t* arg) { } } -void kernel_fdiv2(int task_id, const kernel_arg_t* arg) { +void kernel_fdiv2(int task_id, kernel_arg_t* arg) { uint32_t count = arg->task_size; float* src0_ptr = (float*)arg->src0_ptr; float* src1_ptr = (float*)arg->src1_ptr; @@ -227,7 +227,7 @@ void kernel_fdiv2(int task_id, const kernel_arg_t* arg) { } } -void kernel_fsqrt(int task_id, const kernel_arg_t* arg) { +void kernel_fsqrt(int task_id, kernel_arg_t* arg) { uint32_t count = arg->task_size; float* src0_ptr = (float*)arg->src0_ptr; float* src1_ptr = (float*)arg->src1_ptr; @@ -242,7 +242,7 @@ void kernel_fsqrt(int task_id, const kernel_arg_t* arg) { } } -void kernel_ftoi(int task_id, const kernel_arg_t* arg) { +void kernel_ftoi(int task_id, kernel_arg_t* arg) { uint32_t count = arg->task_size; float* src0_ptr = (float*)arg->src0_ptr; float* src1_ptr = (float*)arg->src1_ptr; @@ -258,7 +258,7 @@ void kernel_ftoi(int task_id, const kernel_arg_t* arg) { } } -void kernel_ftou(int task_id, const kernel_arg_t* arg) { +void kernel_ftou(int task_id, kernel_arg_t* arg) { uint32_t count = arg->task_size; float* src0_ptr = (float*)arg->src0_ptr; float* src1_ptr = (float*)arg->src1_ptr; @@ -274,7 +274,7 @@ void kernel_ftou(int task_id, const kernel_arg_t* arg) { } } -void kernel_itof(int task_id, const kernel_arg_t* arg) { +void kernel_itof(int task_id, kernel_arg_t* arg) { uint32_t count = arg->task_size; int32_t* src0_ptr = (int32_t*)arg->src0_ptr; int32_t* src1_ptr = (int32_t*)arg->src1_ptr; @@ -290,7 +290,7 @@ void kernel_itof(int task_id, const kernel_arg_t* arg) { } } -void kernel_utof(int task_id, const kernel_arg_t* arg) { +void kernel_utof(int task_id, kernel_arg_t* arg) { uint32_t count = arg->task_size; int32_t* src0_ptr = (int32_t*)arg->src0_ptr; int32_t* src1_ptr = (int32_t*)arg->src1_ptr; @@ 
-329,6 +329,6 @@ static const PFN_Kernel sc_tests[] = { }; void main() { - const kernel_arg_t* arg = (const kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR; - vx_spawn_tasks(arg->num_tasks, sc_tests[arg->testid], arg); + kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR; + vx_spawn_tasks(arg->num_tasks, (vx_spawn_tasks_cb)sc_tests[arg->testid], arg); } \ No newline at end of file diff --git a/tests/regression/fence/kernel.c b/tests/regression/fence/kernel.c index bc39537f..d3c1c431 100644 --- a/tests/regression/fence/kernel.c +++ b/tests/regression/fence/kernel.c @@ -3,7 +3,7 @@ #include #include "common.h" -void kernel_body(int task_id, const kernel_arg_t* arg) { +void kernel_body(int task_id, kernel_arg_t* arg) { uint32_t count = arg->task_size; int32_t* src0_ptr = (int32_t*)arg->src0_ptr; int32_t* src1_ptr = (int32_t*)arg->src1_ptr; @@ -19,6 +19,6 @@ void kernel_body(int task_id, const kernel_arg_t* arg) { } void main() { - const kernel_arg_t* arg = (const kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR; - vx_spawn_tasks(arg->num_tasks, kernel_body, arg); + kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR; + vx_spawn_tasks(arg->num_tasks, (vx_spawn_tasks_cb)kernel_body, arg); } \ No newline at end of file diff --git a/tests/regression/io_addr/kernel.c b/tests/regression/io_addr/kernel.c index 39d4c5c6..2f8483d8 100644 --- a/tests/regression/io_addr/kernel.c +++ b/tests/regression/io_addr/kernel.c @@ -3,7 +3,7 @@ #include #include "common.h" -void kernel_body(int task_id, const kernel_arg_t* arg) { +void kernel_body(int task_id, kernel_arg_t* arg) { uint32_t* src_ptr = (uint32_t*)arg->src_ptr; uint32_t* dst_ptr = (uint32_t*)arg->dst_ptr; @@ -13,6 +13,6 @@ void kernel_body(int task_id, const kernel_arg_t* arg) { } void main() { - const kernel_arg_t* arg = (const kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR; - vx_spawn_tasks(arg->num_points, kernel_body, arg); + kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR; + vx_spawn_tasks(arg->num_points, 
(vx_spawn_tasks_cb)kernel_body, arg); } \ No newline at end of file diff --git a/tests/regression/mstress/kernel.c b/tests/regression/mstress/kernel.c index 2d2a86b9..91d9a455 100644 --- a/tests/regression/mstress/kernel.c +++ b/tests/regression/mstress/kernel.c @@ -3,7 +3,7 @@ #include #include "common.h" -void kernel_body(int task_id, const kernel_arg_t* arg) { +void kernel_body(int task_id, kernel_arg_t* arg) { uint32_t stride = arg->stride; uint32_t* addr_ptr = (uint32_t*)arg->addr_ptr; float* src_ptr = (float*)arg->src_ptr; @@ -23,6 +23,6 @@ void kernel_body(int task_id, const kernel_arg_t* arg) { } void main() { - const kernel_arg_t* arg = (const kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR; - vx_spawn_tasks(arg->num_tasks, kernel_body, arg); + kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR; + vx_spawn_tasks(arg->num_tasks, (vx_spawn_tasks_cb)kernel_body, arg); } \ No newline at end of file diff --git a/tests/regression/no_mf_ext/kernel.c b/tests/regression/no_mf_ext/kernel.c index c15ad5fc..a0c19e53 100644 --- a/tests/regression/no_mf_ext/kernel.c +++ b/tests/regression/no_mf_ext/kernel.c @@ -4,7 +4,7 @@ #include "common.h" void main() { - const kernel_arg_t* arg = (const kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR; + kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR; uint32_t size = arg->size; int32_t* src_ptr = (int32_t*)arg->src_ptr; diff --git a/tests/regression/no_smem/kernel.c b/tests/regression/no_smem/kernel.c index c15ad5fc..a0c19e53 100644 --- a/tests/regression/no_smem/kernel.c +++ b/tests/regression/no_smem/kernel.c @@ -4,7 +4,7 @@ #include "common.h" void main() { - const kernel_arg_t* arg = (const kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR; + kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR; uint32_t size = arg->size; int32_t* src_ptr = (int32_t*)arg->src_ptr; diff --git a/tests/regression/printf/kernel.c b/tests/regression/printf/kernel.c index 2e3b6566..340b4d97 100644 --- a/tests/regression/printf/kernel.c +++ 
b/tests/regression/printf/kernel.c @@ -4,12 +4,12 @@ #include #include "common.h" -void kernel_body(int task_id, const kernel_arg_t* arg) { +void kernel_body(int task_id, kernel_arg_t* arg) { int* src_ptr = (int*)arg->src_ptr; vx_printf("task=%d, value=%d\n", task_id, src_ptr[task_id]); } void main() { - const kernel_arg_t* arg = (const kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR; - vx_spawn_tasks(arg->num_points, kernel_body, arg); + kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR; + vx_spawn_tasks(arg->num_points, (vx_spawn_tasks_cb)kernel_body, arg); } \ No newline at end of file diff --git a/tests/regression/sort/Makefile b/tests/regression/sort/Makefile index ef4d86f0..dfb7db47 100644 --- a/tests/regression/sort/Makefile +++ b/tests/regression/sort/Makefile @@ -1,18 +1,26 @@ + RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain VORTEX_DRV_PATH ?= $(realpath ../../../driver) VORTEX_RT_PATH ?= $(realpath ../../../runtime) +VORTEX_HW_PATH ?= $(realpath ../../../hw) +LLVM_PREFIX ?= /opt/llvm-riscv +SYSROOT=${RISCV_TOOLCHAIN_PATH}/riscv32-unknown-elf OPTS ?= -n16 -VX_CC = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gcc -VX_CXX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-g++ -VX_DP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objdump -VX_CP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objcopy +VX_CC = ${LLVM_PREFIX}/bin/clang +VX_CXX = ${LLVM_PREFIX}/bin/clang++ +VX_DP = ${LLVM_PREFIX}/bin/llvm-objdump +VX_CP = ${LLVM_PREFIX}/bin/llvm-objcopy -VX_CFLAGS += -march=rv32imf -mabi=ilp32f -Os -Wstack-usage=1024 -ffreestanding -nostartfiles -fdata-sections -ffunction-sections -VX_CFLAGS += -I$(VORTEX_RT_PATH)/include -I$(VORTEX_RT_PATH)/../hw +VX_CFLAGS += -O3 -march=rv32imf -mabi=ilp32f -fno-rtti -fno-exceptions -ffreestanding -nostartfiles -fdata-sections -ffunction-sections +VX_CFLAGS += -Xclang -target-feature -Xclang +vortex +VX_CFLAGS += --sysroot=${SYSROOT} --gcc-toolchain=${RISCV_TOOLCHAIN_PATH} +VX_CFLAGS += -I${VORTEX_HW_PATH} 
-I${VORTEX_RT_PATH}/include -VX_LDFLAGS += -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/linker/vx_link.ld -Wl,--gc-sections $(VORTEX_RT_PATH)/libvortexrt.a +VX_LDFLAGS += -Wl,-Bstatic,-T${VORTEX_RT_PATH}/linker/vx_link.ld,--gc-sections ${VORTEX_RT_PATH}/libvortexrt.a + +VX_DPFLAGS = -arch=riscv32 -mcpu=generic-rv32 -mattr=+m,+f -mattr=+vortex VX_SRCS = kernel.c @@ -30,7 +38,7 @@ SRCS = main.cpp all: $(PROJECT) kernel.bin kernel.dump kernel.dump: kernel.elf - $(VX_DP) -D kernel.elf > kernel.dump + $(VX_DP) $(VX_DPFLAGS) -D kernel.elf > kernel.dump kernel.bin: kernel.elf $(VX_CP) -O binary kernel.elf kernel.bin diff --git a/tests/regression/sort/kernel.c b/tests/regression/sort/kernel.c index d89a9cb7..5798aafa 100644 --- a/tests/regression/sort/kernel.c +++ b/tests/regression/sort/kernel.c @@ -1,26 +1,9 @@ #include #include #include -#include #include "common.h" -// Parallel Selection sort - -int __attribute__((noinline)) __smaller(int index, int tid, int32_t cur_value, int32_t ref_value) { - int ret = 0; - __if (cur_value < ref_value) { - ret = 1; - } __else { - __if (cur_value == ref_value) { - __if (index < tid) { - ret = 1; - } __endif - } __endif - } __endif - return ret; -} - -void kernel_body(int task_id, const kernel_arg_t* arg) { +void kernel_body(int __DIVERGENT__ task_id, kernel_arg_t* arg) { uint32_t num_points = arg->num_points; int32_t* src_ptr = (int32_t*)arg->src_ptr; int32_t* dst_ptr = (int32_t*)arg->dst_ptr; @@ -30,13 +13,12 @@ void kernel_body(int task_id, const kernel_arg_t* arg) { uint32_t pos = 0; for (uint32_t i = 0; i < num_points; ++i) { int32_t cur_value = src_ptr[i]; - pos += __smaller(i, task_id, cur_value, ref_value); + pos += (cur_value < ref_value) || ((cur_value == ref_value) && (i < task_id)); } dst_ptr[pos] = ref_value; - vx_printf("taskid=%d, pos=%d, value=%d\n", task_id, pos, ref_value); } void main() { - const kernel_arg_t* arg = (const kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR; - vx_spawn_tasks(arg->num_points, kernel_body, arg); + 
kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR; + vx_spawn_tasks(arg->num_points, (vx_spawn_tasks_cb)kernel_body, arg); } \ No newline at end of file diff --git a/tests/regression/tex/Makefile b/tests/regression/tex/Makefile new file mode 100644 index 00000000..8b313d25 --- /dev/null +++ b/tests/regression/tex/Makefile @@ -0,0 +1,72 @@ +RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain +VORTEX_DRV_PATH ?= $(realpath ../../../driver) +VORTEX_RT_PATH ?= $(wildcard ../../../runtime) + +OPTS ?= -g1 + +VX_CC = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gcc +VX_CXX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-g++ +VX_DP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objdump +VX_CP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objcopy + +VX_CFLAGS += -march=rv32imf -mabi=ilp32f -O3 -Wstack-usage=1024 -ffreestanding -nostartfiles -fdata-sections -ffunction-sections +VX_CFLAGS += -I$(VORTEX_RT_PATH)/include -I$(VORTEX_RT_PATH)/../hw + +VX_LDFLAGS += -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/linker/vx_link.ld -Wl,--gc-sections $(VORTEX_RT_PATH)/libvortexrt.a + +VX_SRCS = kernel.c + +#CXXFLAGS += -std=c++11 -O2 -Wall -Wextra -Wfatal-errors +CXXFLAGS += -std=c++11 -O0 -g -Wall -Wextra -Wfatal-errors + +CXXFLAGS += -DLUPNG_USE_ZLIB + +CXXFLAGS += -I$(VORTEX_DRV_PATH)/include + +LDFLAGS += -L$(VORTEX_DRV_PATH)/stub -lvortex -lz + +PROJECT = tex + +SRCS = main.cpp utils.cpp tga.cpp lupng.c + +all: $(PROJECT) kernel.bin kernel.dump + +kernel.dump: kernel.elf + $(VX_DP) -D kernel.elf > kernel.dump + +kernel.bin: kernel.elf + $(VX_CP) -O binary kernel.elf kernel.bin + +kernel.elf: $(VX_SRCS) + $(VX_CC) $(VX_CFLAGS) $(VX_SRCS) $(VX_LDFLAGS) -o kernel.elf + +$(PROJECT): $(SRCS) + $(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@ + +run-simx: $(PROJECT) kernel.bin + LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS) + +run-fpga: $(PROJECT) kernel.bin + 
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/fpga:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS) + +run-asesim: $(PROJECT) kernel.bin + LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/asesim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS) + +run-vlsim: $(PROJECT) kernel.bin + LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/vlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS) + +run-rtlsim: $(PROJECT) kernel.bin + LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS) + +.depend: $(SRCS) + $(CXX) $(CXXFLAGS) -MM $^ > .depend; + +clean: + rm -rf $(PROJECT) *.o .depend + +clean-all: clean + rm -rf *.elf *.bin *.dump + +ifneq ($(MAKECMDGOALS),clean) + -include .depend +endif \ No newline at end of file diff --git a/tests/regression/tex/blitter.h b/tests/regression/tex/blitter.h new file mode 100644 index 00000000..e05f64b8 --- /dev/null +++ b/tests/regression/tex/blitter.h @@ -0,0 +1,268 @@ +// +// Copyright (c) Blaise Tine. All rights reserved. +// +// +// Use of this sample source code is subject to the terms of the Microsoft +// license agreement under which you licensed this sample source code. If +// you did not accept the terms of the license agreement, you are not +// authorized to use this sample source code. For the terms of the license, +// please see the license agreement between you and Microsoft or, if applicable, +// see the LICENSE.RTF on your install media or the root of your tools +// installation. +// THE SAMPLE SOURCE CODE IS PROVIDED "AS IS", WITH NO WARRANTIES OR +// INDEMNITIES. 
+// +#pragma once + +#include "surfacedesc.h" + +class BlitTable { +public: + typedef int (*PfnCopy)(const SurfaceDesc &dstDesc, + uint32_t dstOffsetX, + uint32_t dstOffsetY, + uint32_t copyWidth, + uint32_t copyHeight, + const SurfaceDesc &srcDesc, + uint32_t srcOffsetX, + uint32_t srcOffsetY); + + BlitTable() { + for (uint32_t s = 0; s < FORMAT_COLOR_SIZE_; ++s) { + for (uint32_t d = 0; d < FORMAT_COLOR_SIZE_; ++d) { + copyFuncs_[s][d] = CopyInvalid; + } + } + + for (uint32_t s = 0; s < FORMAT_COLOR_SIZE_; ++s) { + switch (s) { + case FORMAT_A8: + case FORMAT_L8: + copyFuncs_[s][s] = CopyFast; + break; + + case FORMAT_A8L8: + copyFuncs_[FORMAT_A8L8][FORMAT_A8] = Copy; + copyFuncs_[FORMAT_A8L8][FORMAT_A8L8] = CopyFast; + break; + + case FORMAT_R5G6B5: + copyFuncs_[FORMAT_R5G6B5][FORMAT_L8] = Copy; + copyFuncs_[FORMAT_R5G6B5][FORMAT_R5G6B5] = CopyFast; + copyFuncs_[FORMAT_R5G6B5][FORMAT_R8G8B8] = + Copy; + copyFuncs_[FORMAT_R5G6B5][FORMAT_B8G8R8] = + Copy; + copyFuncs_[FORMAT_R5G6B5][FORMAT_A8B8G8R8] = + Copy; + copyFuncs_[FORMAT_R5G6B5][FORMAT_A8R8G8B8] = + Copy; + break; + + case FORMAT_A1R5G5B5: + copyFuncs_[FORMAT_A1R5G5B5][FORMAT_A8] = + Copy; + copyFuncs_[FORMAT_A1R5G5B5][FORMAT_L8] = + Copy; + copyFuncs_[FORMAT_A1R5G5B5][FORMAT_A8L8] = + Copy; + copyFuncs_[FORMAT_A1R5G5B5][FORMAT_R8G8B8] = + Copy; + copyFuncs_[FORMAT_A1R5G5B5][FORMAT_A8R8G8B8] = + Copy; + copyFuncs_[FORMAT_A1R5G5B5][FORMAT_R5G5B5A1] = + Copy; + copyFuncs_[FORMAT_A1R5G5B5][FORMAT_R4G4B4A4] = + Copy; + copyFuncs_[FORMAT_A1R5G5B5][FORMAT_B8G8R8] = + Copy; + copyFuncs_[FORMAT_A1R5G5B5][FORMAT_A8B8G8R8] = + Copy; + break; + + case FORMAT_A4R4G4B4: + copyFuncs_[FORMAT_A4R4G4B4][FORMAT_A8] = + Copy; + copyFuncs_[FORMAT_A4R4G4B4][FORMAT_L8] = + Copy; + copyFuncs_[FORMAT_A4R4G4B4][FORMAT_A8L8] = + Copy; + copyFuncs_[FORMAT_A4R4G4B4][FORMAT_R8G8B8] = + Copy; + copyFuncs_[FORMAT_A4R4G4B4][FORMAT_A8R8G8B8] = + Copy; + copyFuncs_[FORMAT_A4R4G4B4][FORMAT_R5G5B5A1] = + Copy; + 
copyFuncs_[FORMAT_A4R4G4B4][FORMAT_R4G4B4A4] = + Copy; + copyFuncs_[FORMAT_A4R4G4B4][FORMAT_B8G8R8] = + Copy; + copyFuncs_[FORMAT_A4R4G4B4][FORMAT_A8B8G8R8] = + Copy; + break; + + case FORMAT_R8G8B8: + copyFuncs_[FORMAT_R8G8B8][FORMAT_L8] = Copy; + copyFuncs_[FORMAT_R8G8B8][FORMAT_R5G6B5] = + Copy; + copyFuncs_[FORMAT_R8G8B8][FORMAT_R8G8B8] = CopyFast; + copyFuncs_[FORMAT_R8G8B8][FORMAT_B8G8R8] = + Copy; + copyFuncs_[FORMAT_R8G8B8][FORMAT_A8B8G8R8] = + Copy; + copyFuncs_[FORMAT_R8G8B8][FORMAT_A8R8G8B8] = + Copy; + break; + + case FORMAT_A8R8G8B8: + copyFuncs_[FORMAT_A8R8G8B8][FORMAT_A8] = + Copy; + copyFuncs_[FORMAT_A8R8G8B8][FORMAT_L8] = + Copy; + copyFuncs_[FORMAT_A8R8G8B8][FORMAT_A8L8] = + Copy; + copyFuncs_[FORMAT_A8R8G8B8][FORMAT_R5G6B5] = + Copy; + copyFuncs_[FORMAT_A8R8G8B8][FORMAT_R8G8B8] = + Copy; + copyFuncs_[FORMAT_A8R8G8B8][FORMAT_A8R8G8B8] = CopyFast; + copyFuncs_[FORMAT_A8R8G8B8][FORMAT_R5G5B5A1] = + Copy; + copyFuncs_[FORMAT_A8R8G8B8][FORMAT_R4G4B4A4] = + Copy; + copyFuncs_[FORMAT_A8R8G8B8][FORMAT_B8G8R8] = + Copy; + copyFuncs_[FORMAT_A8R8G8B8][FORMAT_A8B8G8R8] = + Copy; + break; + + case FORMAT_R5G5B5A1: + copyFuncs_[FORMAT_R5G5B5A1][FORMAT_A8] = + Copy; + copyFuncs_[FORMAT_R5G5B5A1][FORMAT_L8] = + Copy; + copyFuncs_[FORMAT_R5G5B5A1][FORMAT_A8L8] = + Copy; + copyFuncs_[FORMAT_R5G5B5A1][FORMAT_RGB] = + Copy; + copyFuncs_[FORMAT_R5G5B5A1][FORMAT_ARGB] = + Copy; + break; + + case FORMAT_R4G4B4A4: + copyFuncs_[FORMAT_R4G4B4A4][FORMAT_A8] = + Copy; + copyFuncs_[FORMAT_R4G4B4A4][FORMAT_L8] = + Copy; + copyFuncs_[FORMAT_R4G4B4A4][FORMAT_A8L8] = + Copy; + copyFuncs_[FORMAT_R4G4B4A4][FORMAT_RGB] = + Copy; + copyFuncs_[FORMAT_R4G4B4A4][FORMAT_ARGB] = + Copy; + break; + + case FORMAT_B8G8R8: + copyFuncs_[FORMAT_B8G8R8][FORMAT_L8] = Copy; + copyFuncs_[FORMAT_B8G8R8][FORMAT_RGB] = Copy; + break; + + case FORMAT_A8B8G8R8: + copyFuncs_[FORMAT_A8B8G8R8][FORMAT_A8] = + Copy; + copyFuncs_[FORMAT_A8B8G8R8][FORMAT_L8] = + Copy; + 
copyFuncs_[FORMAT_A8B8G8R8][FORMAT_A8L8] = + Copy; + copyFuncs_[FORMAT_A8B8G8R8][FORMAT_RGB] = + Copy; + copyFuncs_[FORMAT_A8B8G8R8][FORMAT_ARGB] = + Copy; + break; + } + } + } + + PfnCopy get(uint32_t srcFormat, uint32_t dstFormat) const { + assert(srcFormat < FORMAT_COLOR_SIZE_); + assert(dstFormat < FORMAT_COLOR_SIZE_); + return copyFuncs_[srcFormat][dstFormat]; + } + +private: + template + static int Copy(const SurfaceDesc &dstDesc, + uint32_t dstOffsetX, + uint32_t dstOffsetY, + uint32_t copyWidth, + uint32_t copyHeight, + const SurfaceDesc &srcDesc, + uint32_t srcOffsetX, + uint32_t srcOffsetY) { + auto srcBPP = TFormatInfo::CBSIZE; + auto dstBPP = TFormatInfo::CBSIZE; + auto srcNextLine = srcDesc.Pitch; + auto dstNextLine = dstDesc.Pitch; + + auto pbSrc = srcDesc.pBits + srcOffsetX * srcBPP + srcOffsetY * srcDesc.Pitch; + auto pbDst = dstDesc.pBits + dstOffsetX * dstBPP + dstOffsetY * dstDesc.Pitch; + + while (copyHeight--) { + auto pSrc = reinterpret_cast::TYPE *>(pbSrc); + for (auto *pDst = reinterpret_cast::TYPE *>( + pbDst), + *const pEnd = pDst + copyWidth; + pDst != pEnd; ++pDst, ++pSrc) { + auto tmp = Format::ConvertFrom(pSrc); + Format::ConvertTo(pDst, tmp); + } + + pbSrc += srcNextLine; + pbDst += dstNextLine; + } + return 0; + } + + template + static int CopyFast(const SurfaceDesc &dstDesc, + uint32_t dstOffsetX, + uint32_t dstOffsetY, + uint32_t copyWidth, + uint32_t copyHeight, + const SurfaceDesc &srcDesc, + uint32_t srcOffsetX, + uint32_t srcOffsetY) { + auto nBPP = sizeof(Type); + auto srcNextLine = srcDesc.Pitch; + auto dstNextLine = dstDesc.Pitch; + + auto pbSrc = srcDesc.pBits + srcOffsetX * nBPP + srcOffsetY * srcDesc.Pitch; + auto pbDst = dstDesc.pBits + dstOffsetX * nBPP + dstOffsetY * dstDesc.Pitch; + + while (copyHeight--) { + auto pSrc = reinterpret_cast(pbSrc); + for (auto *pDst = reinterpret_cast(pbDst), *const pEnd = pDst + copyWidth; + pDst != pEnd; ++pDst, ++pSrc) { + *pDst = *pSrc; + } + pbSrc += srcNextLine; + pbDst += 
dstNextLine; + } + return 0; + } + + static int CopyInvalid(const SurfaceDesc & /*dstDesc*/, + uint32_t /*dstOffsetX*/, + uint32_t /*dstOffsetY*/, + uint32_t /*copyWidth*/, + uint32_t /*copyHeight*/, + const SurfaceDesc & /*srcDesc*/, + uint32_t /*srcOffsetX*/, + uint32_t /*srcOffsetY*/) + { + std::cout << "Error: invalid format" << std::endl; + return -1; + } + + PfnCopy copyFuncs_[FORMAT_COLOR_SIZE_][FORMAT_COLOR_SIZE_]; +}; \ No newline at end of file diff --git a/tests/regression/tex/color.h b/tests/regression/tex/color.h new file mode 100644 index 00000000..708565a3 --- /dev/null +++ b/tests/regression/tex/color.h @@ -0,0 +1,68 @@ +// +// Copyright (c) Blaise Tine. All rights reserved. +// +// +// Use of this sample source code is subject to the terms of the Microsoft +// license agreement under which you licensed this sample source code. If +// you did not accept the terms of the license agreement, you are not +// authorized to use this sample source code. For the terms of the license, +// please see the license agreement between you and Microsoft or, if applicable, +// see the LICENSE.RTF on your install media or the root of your tools +// installation. +// THE SAMPLE SOURCE CODE IS PROVIDED "AS IS", WITH NO WARRANTIES OR +// INDEMNITIES. 
+// +#pragma once + +#include +#include + +struct ColorARGB { + union { + struct { + uint32_t value; + }; + struct { + uint8_t b, g, r, a; + }; + struct { + uint8_t m[4]; + }; + }; + + ColorARGB() {} + + ColorARGB(int a, int r, int g, int b) { + assert((a >= 0) && (a <= 0xff)); + assert((r >= 0) && (r <= 0xff)); + assert((g >= 0) && (g <= 0xff)); + assert((b >= 0) && (b <= 0xff)); + + this->b = static_cast(b); + this->g = static_cast(g); + this->r = static_cast(r); + this->a = static_cast(a); + } + + ColorARGB(int r, int g, int b) { + assert((r >= 0) && (r <= 0xff)); + assert((g >= 0) && (g <= 0xff)); + assert((b >= 0) && (b <= 0xff)); + + this->b = static_cast(b); + this->g = static_cast(g); + this->r = static_cast(r); + } + + ColorARGB(int value) { + this->value = value; + } + + void operator=(const ColorARGB &rhs) { + this->value = rhs.value; + } + + operator uint32_t() const { + return this->value; + } +}; \ No newline at end of file diff --git a/tests/regression/tex/common.h b/tests/regression/tex/common.h new file mode 100644 index 00000000..2abb7234 --- /dev/null +++ b/tests/regression/tex/common.h @@ -0,0 +1,25 @@ +#ifndef _COMMON_H_ +#define _COMMON_H_ + +#define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000 + +typedef struct { + uint32_t num_tasks; + uint8_t format; + uint8_t filter; + uint8_t wrap; + uint8_t use_sw; + uint32_t lod; + uint8_t src_logWidth; + uint8_t src_logHeight; + uint8_t src_stride; + uint8_t src_pitch; + uint32_t src_ptr; + uint32_t dst_width; + uint32_t dst_height; + uint8_t dst_stride; + uint32_t dst_pitch; + uint32_t dst_ptr; +} kernel_arg_t; + +#endif \ No newline at end of file diff --git a/tests/regression/tex/earth.png b/tests/regression/tex/earth.png new file mode 100644 index 00000000..d329a0aa Binary files /dev/null and b/tests/regression/tex/earth.png differ diff --git a/tests/regression/tex/flower.png b/tests/regression/tex/flower.png new file mode 100644 index 00000000..29468a86 Binary files /dev/null and 
b/tests/regression/tex/flower.png differ diff --git a/tests/regression/tex/format.h b/tests/regression/tex/format.h new file mode 100644 index 00000000..4ee8268e --- /dev/null +++ b/tests/regression/tex/format.h @@ -0,0 +1,1022 @@ +// +// Copyright (c) Blaise Tine. All rights reserved. +// +// +// Use of this sample source code is subject to the terms of the Microsoft +// license agreement under which you licensed this sample source code. If +// you did not accept the terms of the license agreement, you are not +// authorized to use this sample source code. For the terms of the license, +// please see the license agreement between you and Microsoft or, if applicable, +// see the LICENSE.RTF on your install media or the root of your tools +// installation. +// THE SAMPLE SOURCE CODE IS PROVIDED "AS IS", WITH NO WARRANTIES OR +// INDEMNITIES. +// +#pragma once + +#include "int24.h" +#include "color.h" +#include + +enum ePixelFormat { + FORMAT_UNKNOWN, + FORMAT_A8, + FORMAT_L8, + FORMAT_A8L8, + FORMAT_R5G6B5, + FORMAT_A8R8G8B8, + FORMAT_A1R5G5B5, + FORMAT_R8G8B8, + FORMAT_A4R4G4B4, + FORMAT_A8B8G8R8, + FORMAT_R5G5B5A1, + FORMAT_B8G8R8, + FORMAT_R4G4B4A4, + FORMAT_COLOR_SIZE_, + FORMAT_D16 = FORMAT_COLOR_SIZE_, + FORMAT_X8S8D16, + FORMAT_PAL4_B8G8R8, + FORMAT_PAL4_A8B8G8R8, + FORMAT_PAL4_R5G6B5, + FORMAT_PAL4_R4G4B4A4, + FORMAT_PAL4_R5G5B5A1, + FORMAT_PAL8_B8G8R8, + FORMAT_PAL8_A8B8G8R8, + FORMAT_PAL8_R5G6B5, + FORMAT_PAL8_R4G4B4A4, + FORMAT_PAL8_R5G5B5A1, + FORMAT_SIZE_, +}; + +#define FORMAT_A FORMAT_A8 +#define FORMAT_RGB FORMAT_R5G6B5 +#define FORMAT_RGB_ FORMAT_R8G8B8 +#define FORMAT_ARGB FORMAT_A8R8G8B8 +#define FORMAT_ARGB_ FORMAT_A4R4G4B4 + +template +struct TFormatInfo {}; + +template <> +struct TFormatInfo { + typedef uint8_t TYPE; + + enum { + CBSIZE = 0, + }; +}; + +template <> +struct TFormatInfo { + typedef uint16_t TYPE; + + enum { + CBSIZE = 2, + ALPHA = 4, + RED = 4, + GREEN = 4, + BLUE = 4, + LERP = 4, + }; +}; + +template <> +struct TFormatInfo { + 
typedef uint16_t TYPE; + + enum { + CBSIZE = 2, + ALPHA = 4, + RED = 4, + GREEN = 4, + BLUE = 4, + LERP = 4, + }; +}; + +template <> +struct TFormatInfo { + typedef uint16_t TYPE; + + enum { + CBSIZE = 2, + ALPHA = 1, + RED = 5, + GREEN = 5, + BLUE = 5, + LERP = 5, + }; +}; + +template <> +struct TFormatInfo { + typedef uint16_t TYPE; + + enum { + CBSIZE = 2, + ALPHA = 1, + RED = 5, + GREEN = 5, + BLUE = 5, + LERP = 5, + }; +}; + +template <> +struct TFormatInfo { + typedef uint16_t TYPE; + + enum { + CBSIZE = 2, + RED = 5, + GREEN = 6, + BLUE = 5, + LERP = 5, + }; +}; + +template <> +struct TFormatInfo { + typedef uint24_t TYPE; + + enum { + CBSIZE = 3, + RED = 8, + GREEN = 8, + BLUE = 8, + LERP = 8, + }; +}; + +template <> +struct TFormatInfo { + typedef uint24_t TYPE; + + enum { + CBSIZE = 3, + RED = 8, + GREEN = 8, + BLUE = 8, + LERP = 8, + }; +}; + +template <> +struct TFormatInfo { + typedef uint32_t TYPE; + + enum { + CBSIZE = 4, + ALPHA = 8, + RED = 8, + GREEN = 8, + BLUE = 8, + LERP = 8, + }; +}; + +template <> +struct TFormatInfo { + typedef uint32_t TYPE; + + enum { + CBSIZE = 4, + ALPHA = 8, + RED = 8, + GREEN = 8, + BLUE = 8, + LERP = 8, + }; +}; + +template <> +struct TFormatInfo { + typedef uint8_t TYPE; + + enum { + CBSIZE = 1, + ALPHA = 8, + LERP = 8, + }; +}; + +template <> +struct TFormatInfo { + typedef uint8_t TYPE; + + enum { + CBSIZE = 1, + LUMINANCE = 8, + LERP = 8, + }; +}; + +template <> +struct TFormatInfo { + typedef uint16_t TYPE; + + enum { + CBSIZE = 2, + ALPHA = 8, + LUMINANCE = 8, + LERP = 8, + }; +}; + +template <> +struct TFormatInfo { + typedef uint16_t TYPE; + + enum { + CBSIZE = 2, + DEPTH = 16, + }; +}; + +template <> +struct TFormatInfo { + typedef uint16_t TYPE; + + enum { + CBSIZE = 4, + DEPTH = 16, + STENCIL = 8, + }; +}; + +template <> +struct TFormatInfo { + typedef uint16_t TYPE; + + enum { + CBSIZE = 3, + RED = 8, + GREEN = 8, + BLUE = 8, + PALETTE = 4, + LERP = 8, + }; +}; + +template <> +struct TFormatInfo { + 
typedef uint16_t TYPE; + + enum { + CBSIZE = 4, + ALPHA = 8, + RED = 8, + GREEN = 8, + BLUE = 8, + PALETTE = 4, + LERP = 8, + }; +}; + +template <> +struct TFormatInfo { + typedef uint16_t TYPE; + + enum { + CBSIZE = 2, + RED = 5, + GREEN = 6, + BLUE = 5, + PALETTE = 4, + LERP = 5, + }; +}; + +template <> +struct TFormatInfo { + typedef uint16_t TYPE; + + enum { + CBSIZE = 2, + ALPHA = 4, + RED = 4, + GREEN = 4, + BLUE = 4, + PALETTE = 4, + LERP = 4, + }; +}; + +template <> +struct TFormatInfo { + typedef uint16_t TYPE; + + enum { + CBSIZE = 2, + ALPHA = 1, + RED = 5, + GREEN = 5, + BLUE = 5, + PALETTE = 4, + LERP = 5, + }; +}; + +template <> +struct TFormatInfo { + typedef uint16_t TYPE; + + enum { + CBSIZE = 3, + RED = 8, + GREEN = 8, + BLUE = 8, + PALETTE = 8, + LERP = 8, + }; +}; + +template <> +struct TFormatInfo { + typedef uint16_t TYPE; + + enum { + CBSIZE = 4, + ALPHA = 8, + RED = 8, + GREEN = 8, + BLUE = 8, + PALETTE = 8, + LERP = 8, + }; +}; + +template <> +struct TFormatInfo { + typedef uint16_t TYPE; + + enum { + CBSIZE = 2, + RED = 5, + GREEN = 6, + BLUE = 5, + PALETTE = 8, + LERP = 5, + }; +}; + +template <> +struct TFormatInfo { + typedef uint16_t TYPE; + + enum { + CBSIZE = 2, + ALPHA = 4, + RED = 4, + GREEN = 4, + BLUE = 4, + PALETTE = 8, + LERP = 4, + }; +}; + +template <> +struct TFormatInfo { + typedef uint16_t TYPE; + + enum { + CBSIZE = 2, + ALPHA = 1, + RED = 5, + GREEN = 5, + BLUE = 5, + PALETTE = 8, + LERP = 5, + }; +}; + +/////////////////////////////////////////////////////////////////////////////// + +#define DEF_GET_ENUM_VALUE(Name, Default) \ + template \ + struct enum_get_##Name { \ + static constexpr int value = Default; \ + }; \ + template \ + struct enum_get_##Name::type> { \ + static constexpr int value = T::Name; \ + } + +#define __formatInfo(format) \ + { \ + TFormatInfo::CBSIZE, FormatSize>::RED, \ + FormatSize>::GREEN, \ + FormatSize>::BLUE, \ + FormatSize>::ALPHA, \ + FormatSize>::LUMINANCE, \ + FormatSize>::DEPTH, \ + 
FormatSize>::STENCIL, \ + FormatSize>::PALETTE, \ + FormatSize>::LERP \ + } + +/////////////////////////////////////////////////////////////////////////////// + +struct FormatInfo { + uint8_t BytePerPixel; + uint8_t Red; + uint8_t Green; + uint8_t Blue; + uint8_t Alpha; + uint8_t Luminance; + uint8_t Depth; + uint8_t Stencil; + uint8_t PaletteBits; + uint8_t LerpBits; +}; + +template +class FormatSize { +protected: + DEF_GET_ENUM_VALUE(RED, 0); + DEF_GET_ENUM_VALUE(GREEN, 0); + DEF_GET_ENUM_VALUE(BLUE, 0); + DEF_GET_ENUM_VALUE(ALPHA, 0); + DEF_GET_ENUM_VALUE(LUMINANCE, 0); + DEF_GET_ENUM_VALUE(DEPTH, 0); + DEF_GET_ENUM_VALUE(STENCIL, 0); + DEF_GET_ENUM_VALUE(PALETTE, 0); + DEF_GET_ENUM_VALUE(LERP, 0); + +public: + enum { + RED = enum_get_RED::value, + GREEN = enum_get_GREEN::value, + BLUE = enum_get_BLUE::value, + ALPHA = enum_get_ALPHA::value, + LUMINANCE = enum_get_LUMINANCE::value, + DEPTH = enum_get_DEPTH::value, + STENCIL = enum_get_STENCIL::value, + PALETTE = enum_get_PALETTE::value, + LERP = enum_get_LERP::value, + + RGB = RED + GREEN + BLUE + LUMINANCE, + RGBA = RGB + ALPHA + }; +}; + +namespace Format { + +inline static const FormatInfo &GetInfo(ePixelFormat pixelFormat) { + static const FormatInfo sc_formatInfos[FORMAT_SIZE_] = { + __formatInfo(FORMAT_UNKNOWN), + __formatInfo(FORMAT_A8), + __formatInfo(FORMAT_L8), + __formatInfo(FORMAT_A8L8), + __formatInfo(FORMAT_RGB), + __formatInfo(FORMAT_ARGB), + __formatInfo(FORMAT_A1R5G5B5), + __formatInfo(FORMAT_RGB_), + __formatInfo(FORMAT_ARGB_), + __formatInfo(FORMAT_R4G4B4A4), + __formatInfo(FORMAT_R5G5B5A1), + __formatInfo(FORMAT_B8G8R8), + __formatInfo(FORMAT_A8B8G8R8), + __formatInfo(FORMAT_D16), + __formatInfo(FORMAT_X8S8D16), + __formatInfo(FORMAT_PAL4_B8G8R8), + __formatInfo(FORMAT_PAL4_A8B8G8R8), + __formatInfo(FORMAT_PAL4_R5G6B5), + __formatInfo(FORMAT_PAL4_R4G4B4A4), + __formatInfo(FORMAT_PAL4_R5G5B5A1), + __formatInfo(FORMAT_PAL8_B8G8R8), + __formatInfo(FORMAT_PAL8_A8B8G8R8), + 
__formatInfo(FORMAT_PAL8_R5G6B5), + __formatInfo(FORMAT_PAL8_R4G4B4A4), + __formatInfo(FORMAT_PAL8_R5G5B5A1), + }; + assert(pixelFormat < FORMAT_SIZE_); + return sc_formatInfos[pixelFormat]; +} + +#undef __formatInfo +#undef DEF_GET_ENUM_VALUE + +typedef ColorARGB (*pfn_convert_from)(const void *pIn); + +typedef void (*pfn_convert_to)(void *pOut, const ColorARGB &in); + +template +static uint32_t ConvertTo(const ColorARGB &color); + +template +static void ConvertTo(void *pOut, const ColorARGB &in) { + *reinterpret_cast::TYPE *>(pOut) = + static_cast::TYPE>( + ConvertTo(in)); +} + +template +static ColorARGB ConvertFrom(uint32_t in); + +template +static ColorARGB ConvertFrom(const void *pIn) { + return ConvertFrom( + *reinterpret_cast::TYPE *>(pIn)); +} + +inline static pfn_convert_to GetConvertTo(ePixelFormat pixelFormat) { + switch (pixelFormat) { + case FORMAT_A8: + return &ConvertTo; + case FORMAT_L8: + return &ConvertTo; + case FORMAT_A8L8: + return &ConvertTo; + case FORMAT_R5G6B5: + return &ConvertTo; + case FORMAT_A1R5G5B5: + return &ConvertTo; + case FORMAT_A4R4G4B4: + return &ConvertTo; + case FORMAT_R8G8B8: + return &ConvertTo; + case FORMAT_A8R8G8B8: + return &ConvertTo; + case FORMAT_R5G5B5A1: + return &ConvertTo; + case FORMAT_R4G4B4A4: + return &ConvertTo; + case FORMAT_B8G8R8: + return &ConvertTo; + case FORMAT_A8B8G8R8: + return &ConvertTo; + case FORMAT_D16: + return &ConvertTo; + case FORMAT_X8S8D16: + return &ConvertTo; + default: + return &ConvertTo; + } + return nullptr; +} + +inline static pfn_convert_from GetConvertFrom(ePixelFormat pixelFormat, + bool bForceAlpha) { + if (bForceAlpha) { + switch (pixelFormat) { + case FORMAT_A8: + return &ConvertFrom; + case FORMAT_L8: + return &ConvertFrom; + case FORMAT_A8L8: + return &ConvertFrom; + case FORMAT_R5G6B5: + return &ConvertFrom; + case FORMAT_A1R5G5B5: + return &ConvertFrom; + case FORMAT_A4R4G4B4: + return &ConvertFrom; + case FORMAT_R8G8B8: + return &ConvertFrom; + case FORMAT_A8R8G8B8: + 
return &ConvertFrom; + case FORMAT_R5G5B5A1: + return &ConvertFrom; + case FORMAT_R4G4B4A4: + return &ConvertFrom; + case FORMAT_B8G8R8: + return &ConvertFrom; + case FORMAT_A8B8G8R8: + return &ConvertFrom; + case FORMAT_D16: + return &ConvertFrom; + case FORMAT_X8S8D16: + return &ConvertFrom; + default: + return &ConvertFrom; + } + } else { + switch (pixelFormat) { + case FORMAT_A8: + return &ConvertFrom; + case FORMAT_L8: + return &ConvertFrom; + case FORMAT_A8L8: + return &ConvertFrom; + case FORMAT_R5G6B5: + return &ConvertFrom; + case FORMAT_A1R5G5B5: + return &ConvertFrom; + case FORMAT_A4R4G4B4: + return &ConvertFrom; + case FORMAT_R8G8B8: + return &ConvertFrom; + case FORMAT_A8R8G8B8: + return &ConvertFrom; + case FORMAT_R5G5B5A1: + return &ConvertFrom; + case FORMAT_R4G4B4A4: + return &ConvertFrom; + case FORMAT_B8G8R8: + return &ConvertFrom; + case FORMAT_A8B8G8R8: + return &ConvertFrom; + case FORMAT_D16: + return &ConvertFrom; + case FORMAT_X8S8D16: + return &ConvertFrom; + default: + return &ConvertFrom; + } + } + + return nullptr; +} + +inline static uint32_t GetNativeFormat(ePixelFormat pixelFormat) { + switch (pixelFormat) { + case FORMAT_PAL4_B8G8R8: + case FORMAT_PAL8_B8G8R8: + return FORMAT_B8G8R8; + + case FORMAT_PAL4_A8B8G8R8: + case FORMAT_PAL8_A8B8G8R8: + return FORMAT_A8B8G8R8; + + case FORMAT_PAL4_R5G6B5: + case FORMAT_PAL8_R5G6B5: + return FORMAT_R5G6B5; + + case FORMAT_PAL4_R4G4B4A4: + case FORMAT_PAL8_R4G4B4A4: + return FORMAT_R4G4B4A4; + + case FORMAT_PAL4_R5G5B5A1: + case FORMAT_PAL8_R5G5B5A1: + return FORMAT_R5G5B5A1; + + default: + return pixelFormat; + } +} + +/////////////////////////////////////////////////////////////////////////////// + +template <> +inline uint32_t ConvertTo(const ColorARGB &/*in*/) { + return 0; +} + +template <> +inline ColorARGB ConvertFrom(uint32_t /*in*/) { + return 0; +} + +template <> +inline ColorARGB ConvertFrom(uint32_t /*in*/) { + return 0; +} + 
+////////////////////////////////////////////////////////////////////////////// + +template <> +inline uint32_t ConvertTo(const ColorARGB &in) { + return ((in.r & 0xf8) << 8) | ((in.g & 0xfc) << 3) | (in.b >> 3); +} + +template <> +inline ColorARGB ConvertFrom(uint32_t in) { + ColorARGB ret; + ret.r = ((in >> 11) << 3) | (in >> 13); + ret.g = ((in >> 3) & 0xfc) | ((in >> 9) & 0x3); + ret.b = ((in & 0x1f) << 3) | ((in & 0x1c) >> 2); + return ret; +} + +template <> +inline ColorARGB ConvertFrom(uint32_t in) { + ColorARGB ret; + ret.a = 0xff; + ret.r = ((in >> 11) << 3) | (in >> 13); + ret.g = ((in >> 3) & 0xfc) | ((in >> 9) & 0x3); + ret.b = ((in & 0x1f) << 3) | ((in & 0x1c) >> 2); + return ret; +} + +////////////////////////////////////////////////////////////////////////////// + +template <> +inline uint32_t ConvertTo(const ColorARGB &in) { + return (in.a ? 0x8000 : 0) | ((in.r & 0xf8) << 7) | ((in.g & 0xf8) << 2) | + (in.b >> 3); +} + +template <> +inline ColorARGB ConvertFrom(uint32_t in) { + ColorARGB ret; + ret.a = 0xff * (in >> 15); + ret.r = ((in >> 7) & 0xf8) | ((in << 1) >> 13); + ret.g = ((in >> 2) & 0xf8) | ((in >> 7) & 7); + ret.b = ((in & 0x1f) << 3) | ((in & 0x1c) >> 2); + return ret; +} + +template <> +inline ColorARGB ConvertFrom(uint32_t in) { + ColorARGB ret; + ret.a = 0xff * (in >> 15); + ret.r = ((in >> 7) & 0xf8) | ((in << 1) >> 13); + ret.g = ((in >> 2) & 0xf8) | ((in >> 7) & 7); + ret.b = ((in & 0x1f) << 3) | ((in & 0x1c) >> 2); + return ret; +} + +////////////////////////////////////////////////////////////////////////////// + +template <> +inline uint32_t ConvertTo(const ColorARGB &in) { + return ((in.r & 0xf8) << 8) | ((in.g & 0xf8) << 3) | ((in.b & 0xf8) >> 2) | + (in.a ? 
0x1 : 0); +} + +template <> +inline ColorARGB ConvertFrom(uint32_t in) { + ColorARGB ret; + ret.a = 0xff * (in & 0x1); + ret.r = ((in >> 8) & 0xf8) | (in >> 13); + ret.g = ((in >> 3) & 0xf8) | ((in >> 8) & 7); + ret.b = ((in & 0x3e) << 2) | ((in & 0x3e) >> 3); + return ret; +} + +template <> +inline ColorARGB ConvertFrom(uint32_t in) { + ColorARGB ret; + ret.a = 0xff * (in & 0x1); + ret.r = ((in >> 8) & 0xf8) | (in >> 13); + ret.g = ((in >> 3) & 0xf8) | ((in >> 8) & 7); + ret.b = ((in & 0x3e) << 2) | ((in & 0x3e) >> 3); + return ret; +} + +////////////////////////////////////////////////////////////////////////////// + +template <> +inline uint32_t ConvertTo(const ColorARGB &in) { + return ((in.a & 0xf0) << 8) | ((in.r & 0xf0) << 4) | ((in.g & 0xf0) << 0) | + (in.b >> 4); +} + +template <> +inline ColorARGB ConvertFrom(uint32_t in) { + ColorARGB ret; + ret.a = ((in >> 8) & 0xf0) | (in >> 12); + ret.r = ((in >> 4) & 0xf0) | ((in >> 8) & 0x0f); + ret.g = ((in & 0xf0) >> 0) | ((in & 0xf0) >> 4); + ret.b = ((in & 0x0f) << 4) | ((in & 0x0f) >> 0); + return ret; +} + +template <> +inline ColorARGB ConvertFrom(uint32_t in) { + ColorARGB ret; + ret.a = ((in >> 8) & 0xf0) | (in >> 12); + ret.r = ((in >> 4) & 0xf0) | ((in >> 8) & 0x0f); + ret.g = ((in & 0xf0) >> 0) | ((in & 0xf0) >> 4); + ret.b = ((in & 0x0f) << 4) | ((in & 0x0f) >> 0); + return ret; +} + +////////////////////////////////////////////////////////////////////////////// + +template <> +inline uint32_t ConvertTo(const ColorARGB &in) { + return ((in.r & 0xf0) << 8) | ((in.g & 0xf0) << 4) | ((in.b & 0xf0) << 0) | + (in.a >> 4); +} + +template <> +inline ColorARGB ConvertFrom(uint32_t in) { + ColorARGB ret; + ret.a = ((in & 0x0f) << 4) | ((in & 0x0f) >> 0); + ret.r = ((in >> 8) & 0xf0) | (in >> 12); + ret.g = ((in >> 4) & 0xf0) | ((in >> 8) & 0x0f); + ret.b = ((in & 0xf0) >> 0) | ((in & 0xf0) >> 4); + return ret; +} + +template <> +inline ColorARGB ConvertFrom(uint32_t in) { + ColorARGB ret; + ret.a = ((in & 0x0f) 
<< 4) | ((in & 0x0f) >> 0); + ret.r = ((in >> 8) & 0xf0) | (in >> 12); + ret.g = ((in >> 4) & 0xf0) | ((in >> 8) & 0x0f); + ret.b = ((in & 0xf0) >> 0) | ((in & 0xf0) >> 4); + return ret; +} + +////////////////////////////////////////////////////////////////////////////// + +template <> +inline uint32_t ConvertTo(const ColorARGB &in) { + return (in.r << 16) | (in.g << 8) | in.b; +} + +template <> +inline ColorARGB ConvertFrom(uint32_t in) { + ColorARGB ret; + ret.r = in >> 16; + ret.g = (in >> 8) & 0xff; + ret.b = in & 0xff; + return ret; +} + +template <> +inline ColorARGB ConvertFrom(uint32_t in) { + ColorARGB ret; + ret.a = 0xff; + ret.r = in >> 16; + ret.g = (in >> 8) & 0xff; + ret.b = in & 0xff; + return ret; +} + +////////////////////////////////////////////////////////////////////////////// + +template <> +inline uint32_t ConvertTo(const ColorARGB &in) { + return (in.b << 16) | (in.g << 8) | in.r; +} + +template <> +inline ColorARGB ConvertFrom(uint32_t in) { + ColorARGB ret; + ret.r = in & 0xff; + ret.g = (in >> 8) & 0xff; + ret.b = in >> 16; + return ret; +} + +template <> +inline ColorARGB ConvertFrom(uint32_t in) { + ColorARGB ret; + ret.a = 0xff; + ret.r = in & 0xff; + ret.g = (in >> 8) & 0xff; + ret.b = in >> 16; + return ret; +} + +////////////////////////////////////////////////////////////////////////////// + +template <> +inline uint32_t ConvertTo(const ColorARGB &in) { + return (in.a << 24) | (in.r << 16) | (in.g << 8) | in.b; +} + +template <> +inline ColorARGB ConvertFrom(uint32_t in) { + ColorARGB ret; + ret.a = in >> 24; + ret.r = (in >> 16) & 0xff; + ret.g = (in >> 8) & 0xff; + ret.b = in & 0xff; + return ret; +} + +template <> +inline ColorARGB ConvertFrom(uint32_t in) { + ColorARGB ret; + ret.a = in >> 24; + ret.r = (in >> 16) & 0xff; + ret.g = (in >> 8) & 0xff; + ret.b = in & 0xff; + return ret; +} + +////////////////////////////////////////////////////////////////////////////// + +template <> +inline uint32_t ConvertTo(const ColorARGB &in) 
{ + return (in.a << 24) | (in.b << 16) | (in.g << 8) | in.r; +} + +template <> +inline ColorARGB ConvertFrom(uint32_t in) { + ColorARGB ret; + ret.a = in >> 24; + ret.r = in & 0xff; + ret.g = (in >> 8) & 0xff; + ret.b = (in >> 16) & 0xff; + return ret; +} + +template <> +inline ColorARGB ConvertFrom(uint32_t in) { + ColorARGB ret; + ret.a = in >> 24; + ret.r = in & 0xff; + ret.g = (in >> 8) & 0xff; + ret.b = (in >> 16) & 0xff; + return ret; +} + +////////////////////////////////////////////////////////////////////////////// + +template <> +inline uint32_t ConvertTo(const ColorARGB &in) { + return in.a; +} + +template <> +inline ColorARGB ConvertFrom(uint32_t in) { + ColorARGB ret; + ret.a = in; + return ret; +} + +template <> +inline ColorARGB ConvertFrom(uint32_t in) { + ColorARGB ret; + ret.a = in; + return ret; +} + +////////////////////////////////////////////////////////////////////////////// + +template <> +inline uint32_t ConvertTo(const ColorARGB &in) { + return in.r; +} + +template <> +inline ColorARGB ConvertFrom(uint32_t in) { + ColorARGB ret; + ret.r = in; + ret.g = in; + ret.b = in; + return ret; +} + +template <> +inline ColorARGB ConvertFrom(uint32_t in) { + ColorARGB ret; + ret.a = 0xff; + ret.r = in; + ret.g = in; + ret.b = in; + return ret; +} + +////////////////////////////////////////////////////////////////////////////// + +template <> +inline uint32_t ConvertTo(const ColorARGB &in) { + return (in.a << 8) | in.r; +} + +template <> +inline ColorARGB ConvertFrom(uint32_t in) { + ColorARGB ret; + ret.a = in >> 8; + ret.r = in & 0xff; + ret.g = in & 0xff; + ret.b = in & 0xff; + return ret; +} + +template <> +inline ColorARGB ConvertFrom(uint32_t in) { + ColorARGB ret; + ret.a = in >> 8; + ret.r = in & 0xff; + ret.g = in & 0xff; + ret.b = in & 0xff; + return ret; +} + +////////////////////////////////////////////////////////////////////////////// + +template <> +inline uint32_t ConvertTo(const ColorARGB &in) { + return in.value; +} + +template <> 
+inline ColorARGB ConvertFrom(uint32_t in) { + ColorARGB ret; + ret.value = in; + return ret; +} + +////////////////////////////////////////////////////////////////////////////// + +template <> +inline uint32_t ConvertTo(const ColorARGB &in) { + return in.b; +} + +template <> +inline ColorARGB ConvertFrom(uint32_t in) { + ColorARGB ret; + ret.value = in; + return ret; +} + +} // namespace Format \ No newline at end of file diff --git a/tests/regression/tex/int24.h b/tests/regression/tex/int24.h new file mode 100644 index 00000000..b08537a7 --- /dev/null +++ b/tests/regression/tex/int24.h @@ -0,0 +1,37 @@ +// +// Copyright (c) Blaise Tine. All rights reserved. +// +// +// Use of this sample source code is subject to the terms of the Microsoft +// license agreement under which you licensed this sample source code. If +// you did not accept the terms of the license agreement, you are not +// authorized to use this sample source code. For the terms of the license, +// please see the license agreement between you and Microsoft or, if applicable, +// see the LICENSE.RTF on your install media or the root of your tools +// installation. +// THE SAMPLE SOURCE CODE IS PROVIDED "AS IS", WITH NO WARRANTIES OR +// INDEMNITIES. 
+// +#pragma once + +#include + +struct uint24_t { + uint8_t m[3]; + + explicit uint24_t(uint32_t value) { + m[0] = (value >> 0) & 0xff; + m[1] = (value >> 8) & 0xff; + m[2] = (value >> 16) & 0xff; + } + + explicit uint24_t(uint8_t x, uint8_t y, uint8_t z) { + m[0] = x; + m[1] = y; + m[2] = z; + } + + operator uint32_t() const { + return (m[2] << 16) | (m[1] << 8) | m[0]; + } +}; diff --git a/tests/regression/tex/kernel.c b/tests/regression/tex/kernel.c new file mode 100644 index 00000000..bd0cebb4 --- /dev/null +++ b/tests/regression/tex/kernel.c @@ -0,0 +1,67 @@ +#include +#include +#include +#include "common.h" +#include "texsw.h" + +#define ENABLE_SW + +typedef struct { + kernel_arg_t* state; + uint32_t tile_width; + uint32_t tile_height; + float deltaX; + float deltaY; +} tile_arg_t; + +void kernel_body(int task_id, tile_arg_t* arg) { + kernel_arg_t* state = arg->state; + + uint32_t xoffset = 0; + uint32_t yoffset = task_id * arg->tile_height; + uint8_t* dst_ptr = (uint8_t*)(state->dst_ptr + xoffset * state->dst_stride + yoffset * state->dst_pitch); + + float fv = yoffset * arg->deltaY; + for (uint32_t y = 0; y < arg->tile_height; ++y) { + uint32_t* dst_row = (uint32_t*)dst_ptr; + float fu = xoffset * arg->deltaX; + for (uint32_t x = 0; x < arg->tile_width; ++x) { + int32_t u = (int32_t)(fu * (1<<20)); + int32_t v = (int32_t)(fv * (1<<20)); + #ifdef ENABLE_SW + if (state->use_sw) { + dst_row[x] = (state->filter == 2) ? tex3_sw(state, 0, u, v, state->lod) : tex_sw(state, 0, u, v, state->lod); + } else { + #endif + dst_row[x] = (state->filter == 2) ? 
vx_tex3(0, u, v, state->lod) : vx_tex(0, u, v, state->lod); + #ifdef ENABLE_SW + } + #endif + fu += arg->deltaX; + } + dst_ptr += state->dst_pitch; + fv += arg->deltaY; + } +} + +int main() { + kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR; + + // configure texture unit + vx_csr_write(CSR_TEX_ADDR(0), arg->src_ptr); + vx_csr_write(CSR_TEX_MIPOFF(0), 0); + vx_csr_write(CSR_TEX_WIDTH(0), arg->src_logWidth); + vx_csr_write(CSR_TEX_HEIGHT(0), arg->src_logHeight); + vx_csr_write(CSR_TEX_FORMAT(0), arg->format); + vx_csr_write(CSR_TEX_WRAP(0), (arg->wrap << 2) | arg->wrap); + vx_csr_write(CSR_TEX_FILTER(0), (arg->filter ? 1 : 0)); + + tile_arg_t targ; + targ.state = arg; + targ.tile_width = arg->dst_width; + targ.tile_height = (arg->dst_height + arg->num_tasks - 1) / arg->num_tasks; + targ.deltaX = 1.0f / arg->dst_width; + targ.deltaY = 1.0f / arg->dst_height; + + vx_spawn_tasks(arg->num_tasks, (vx_spawn_tasks_cb)kernel_body, &targ); +} \ No newline at end of file diff --git a/tests/regression/tex/lupng.c b/tests/regression/tex/lupng.c new file mode 100644 index 00000000..f612fbc9 --- /dev/null +++ b/tests/regression/tex/lupng.c @@ -0,0 +1,1313 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2014 Jan Solanti + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
/* Bit flags recording which chunk types have been seen. */
#define PNG_NONE 0
#define PNG_IHDR 0x01
#define PNG_PLTE 0x02
#define PNG_IDAT 0x04
#define PNG_IEND 0x08

/* Colour type codes. */
#define PNG_GRAYSCALE 0
#define PNG_TRUECOLOR 2
/* 24bpp RGB palette */
#define PNG_PALETTED 3
#define PNG_GRAYSCALE_ALPHA 4
#define PNG_TRUECOLOR_ALPHA 6

/* Scanline filter type codes. */
#define PNG_FILTER_NONE 0
#define PNG_FILTER_SUB 1
#define PNG_FILTER_UP 2
#define PNG_FILTER_AVERAGE 3
#define PNG_FILTER_PAETH 4

#define PNG_SIG_SIZE 8

/* Internal status codes. The negative value is parenthesised so that it
   stays a single operand wherever the macro is expanded. */
#define PNG_DONE 1
#define PNG_OK 0
#define PNG_ERROR (-1)

#define BUF_SIZE 8192

/* Larger of two values. Both arguments and the whole expression are
   parenthesised so that operands containing lower-precedence operators
   (e.g. MAX(a & b, c)) are compared as written. The selected argument is
   still evaluated twice, so do not pass expressions with side effects. */
#define MAX(x, y) (((x) > (y)) ? (x) : (y))

#if defined(_MSC_VER)
#define LU_INLINE __inline /* MS-specific inline */
#else
#define LU_INLINE inline /* rest of the world... */
#endif

/* Largest size_t value with the top bit clear; anything above it is treated
   as a wrapped-around (negative) offset by the filter helpers. */
#define SIZE_T_MAX_POSITIVE ( ((size_t)-1) >> 1 )
/* Lookup table holding the CRC of every possible byte value, generated with
   the reflected polynomial 0xEDB88320. */
static const uint32_t crcTable[] =
{
    0x0, 0x77073096, 0xEE0E612C, 0x990951BA, 0x76DC419, 0x706AF48F,
    0xE963A535, 0x9E6495A3, 0xEDB8832, 0x79DCB8A4, 0xE0D5E91E, 0x97D2D988,
    0x9B64C2B, 0x7EB17CBD, 0xE7B82D07, 0x90BF1D91, 0x1DB71064, 0x6AB020F2,
    0xF3B97148, 0x84BE41DE, 0x1ADAD47D, 0x6DDDE4EB, 0xF4D4B551, 0x83D385C7,
    0x136C9856, 0x646BA8C0, 0xFD62F97A, 0x8A65C9EC, 0x14015C4F, 0x63066CD9,
    0xFA0F3D63, 0x8D080DF5, 0x3B6E20C8, 0x4C69105E, 0xD56041E4, 0xA2677172,
    0x3C03E4D1, 0x4B04D447, 0xD20D85FD, 0xA50AB56B, 0x35B5A8FA, 0x42B2986C,
    0xDBBBC9D6, 0xACBCF940, 0x32D86CE3, 0x45DF5C75, 0xDCD60DCF, 0xABD13D59,
    0x26D930AC, 0x51DE003A, 0xC8D75180, 0xBFD06116, 0x21B4F4B5, 0x56B3C423,
    0xCFBA9599, 0xB8BDA50F, 0x2802B89E, 0x5F058808, 0xC60CD9B2, 0xB10BE924,
    0x2F6F7C87, 0x58684C11, 0xC1611DAB, 0xB6662D3D, 0x76DC4190, 0x1DB7106,
    0x98D220BC, 0xEFD5102A, 0x71B18589, 0x6B6B51F, 0x9FBFE4A5, 0xE8B8D433,
    0x7807C9A2, 0xF00F934, 0x9609A88E, 0xE10E9818, 0x7F6A0DBB, 0x86D3D2D,
    0x91646C97, 0xE6635C01, 0x6B6B51F4, 0x1C6C6162, 0x856530D8, 0xF262004E,
    0x6C0695ED, 0x1B01A57B, 0x8208F4C1, 0xF50FC457, 0x65B0D9C6, 0x12B7E950,
    0x8BBEB8EA, 0xFCB9887C, 0x62DD1DDF, 0x15DA2D49, 0x8CD37CF3, 0xFBD44C65,
    0x4DB26158, 0x3AB551CE, 0xA3BC0074, 0xD4BB30E2, 0x4ADFA541, 0x3DD895D7,
    0xA4D1C46D, 0xD3D6F4FB, 0x4369E96A, 0x346ED9FC, 0xAD678846, 0xDA60B8D0,
    0x44042D73, 0x33031DE5, 0xAA0A4C5F, 0xDD0D7CC9, 0x5005713C, 0x270241AA,
    0xBE0B1010, 0xC90C2086, 0x5768B525, 0x206F85B3, 0xB966D409, 0xCE61E49F,
    0x5EDEF90E, 0x29D9C998, 0xB0D09822, 0xC7D7A8B4, 0x59B33D17, 0x2EB40D81,
    0xB7BD5C3B, 0xC0BA6CAD, 0xEDB88320, 0x9ABFB3B6, 0x3B6E20C, 0x74B1D29A,
    0xEAD54739, 0x9DD277AF, 0x4DB2615, 0x73DC1683, 0xE3630B12, 0x94643B84,
    0xD6D6A3E, 0x7A6A5AA8, 0xE40ECF0B, 0x9309FF9D, 0xA00AE27, 0x7D079EB1,
    0xF00F9344, 0x8708A3D2, 0x1E01F268, 0x6906C2FE, 0xF762575D, 0x806567CB,
    0x196C3671, 0x6E6B06E7, 0xFED41B76, 0x89D32BE0, 0x10DA7A5A, 0x67DD4ACC,
    0xF9B9DF6F, 0x8EBEEFF9, 0x17B7BE43, 0x60B08ED5, 0xD6D6A3E8, 0xA1D1937E,
    0x38D8C2C4, 0x4FDFF252, 0xD1BB67F1, 0xA6BC5767, 0x3FB506DD, 0x48B2364B,
    0xD80D2BDA, 0xAF0A1B4C, 0x36034AF6, 0x41047A60, 0xDF60EFC3, 0xA867DF55,
    0x316E8EEF, 0x4669BE79, 0xCB61B38C, 0xBC66831A, 0x256FD2A0, 0x5268E236,
    0xCC0C7795, 0xBB0B4703, 0x220216B9, 0x5505262F, 0xC5BA3BBE, 0xB2BD0B28,
    0x2BB45A92, 0x5CB36A04, 0xC2D7FFA7, 0xB5D0CF31, 0x2CD99E8B, 0x5BDEAE1D,
    0x9B64C2B0, 0xEC63F226, 0x756AA39C, 0x26D930A, 0x9C0906A9, 0xEB0E363F,
    0x72076785, 0x5005713, 0x95BF4A82, 0xE2B87A14, 0x7BB12BAE, 0xCB61B38,
    0x92D28E9B, 0xE5D5BE0D, 0x7CDCEFB7, 0xBDBDF21, 0x86D3D2D4, 0xF1D4E242,
    0x68DDB3F8, 0x1FDA836E, 0x81BE16CD, 0xF6B9265B, 0x6FB077E1, 0x18B74777,
    0x88085AE6, 0xFF0F6A70, 0x66063BCA, 0x11010B5C, 0x8F659EFF, 0xF862AE69,
    0x616BFFD3, 0x166CCF45, 0xA00AE278, 0xD70DD2EE, 0x4E048354, 0x3903B3C2,
    0xA7672661, 0xD06016F7, 0x4969474D, 0x3E6E77DB, 0xAED16A4A, 0xD9D65ADC,
    0x40DF0B66, 0x37D83BF0, 0xA9BCAE53, 0xDEBB9EC5, 0x47B2CF7F, 0x30B5FFE9,
    0xBDBDF21C, 0xCABAC28A, 0x53B39330, 0x24B4A3A6, 0xBAD03605, 0xCDD70693,
    0x54DE5729, 0x23D967BF, 0xB3667A2E, 0xC4614AB8, 0x5D681B02, 0x2A6F2B94,
    0xB40BBE37, 0xC30C8EA1, 0x5A05DF1B, 0x2D02EF8D
};

/* Folds len bytes starting at buf into the running CRC value crc and returns
   the new running value. Callers start from all ones and invert the final
   result themselves; with len == 0 the input value is returned untouched. */
static uint32_t updateCrc(uint32_t crc, unsigned char *buf,
                          size_t len)
{
    uint32_t running = crc;
    const unsigned char *cursor = buf;
    size_t remaining = len;

    while (remaining-- > 0)
    {
        /* The low byte of (running XOR next input byte) selects the entry. */
        const uint32_t slot = (running ^ *cursor++) & 0xFF;
        running = crcTable[slot] ^ (running >> 8);
    }

    return running;
}
*/ +static uint32_t crc(unsigned char *buf, size_t len) +{ + return updateCrc(0xFFFFFFFFL, buf, len) ^ 0xFFFFFFFFL; +} + + + +/******************************************************** + * Helper structs + ********************************************************/ + +typedef struct +{ + uint32_t length; + uint8_t *type; + uint8_t *data; + uint32_t crc; +} PngChunk; + +typedef struct { + const LuUserContext *userCtx; + int8_t chunksFound; + + /* IHDR info */ + int32_t width; + int32_t height; + uint8_t depth; + uint8_t colorType; + uint8_t channels; + uint8_t compression; + uint8_t filter; + uint8_t interlace; + + /* PLTE info */ + uint32_t paletteItems; + uint8_t *palette; + + /* fields used for (de)compression & (de-)filtering */ + z_stream stream; + size_t scanlineBytes; + int32_t currentCol; + int32_t currentRow; + uint32_t currentElem; + size_t currentByte; + int bytesPerPixel; + uint8_t *currentScanline; + uint8_t *previousScanline; + uint8_t currentFilter; + uint8_t interlacePass; + size_t compressedBytes; + + /* used for constructing 16 bit deep pixels */ + int tmpCount; + uint8_t tmpBytes[2]; + + /* the output image */ + LuImage *img; + const LuImage *cimg; /* constant pointer version */ +} PngInfoStruct; + +/* helper macro to output warning via user context of the info struct */ +#define LUPNG_WARN_UC(uc,...) do { if ((uc)->warnProc) { (uc)->warnProc((uc)->warnProcUserPtr, __VA_ARGS__); }} while(0) +#define LUPNG_WARN(info,...) 
/* Compares the first count bytes of a and b.
   Returns 1 when they all match (including when count is 0), 0 otherwise. */
static int bytesEqual(const uint8_t *a, const uint8_t *b, size_t count)
{
    size_t remaining = count;

    while (remaining > 0)
    {
        /* Stop at the first mismatching pair. */
        if (*a != *b)
            return 0;
        ++a;
        ++b;
        --remaining;
    }

    return 1;
}
+{ + FILE *outStream = (FILE*)userPtr; + va_list args; + + va_start(args, fmt); + vfprintf(outStream, fmt, args); + va_end(args); + fputc('\n', outStream); +} + +static size_t internalFread(void *ptr, size_t size, size_t count, void *userPtr) +{ + return fread(ptr, size, count, (FILE *)userPtr); +} + +static size_t internalFwrite(const void *ptr, size_t size, size_t count, void *userPtr) +{ + return fwrite(ptr, size, count, (FILE *)userPtr); +} + +/******************************************************** + * Png filter functions + ********************************************************/ +static LU_INLINE int absi(int val) +{ + return val > 0 ? val : -val; +} + +static LU_INLINE uint8_t raw(PngInfoStruct *info, size_t col) +{ + if (col > SIZE_T_MAX_POSITIVE) + return 0; + return info->currentScanline[col]; +} + +static LU_INLINE uint8_t prior(PngInfoStruct *info, size_t col) +{ + if (info->currentRow <= startingRow[info->interlacePass] || col > SIZE_T_MAX_POSITIVE) + return 0; + return info->previousScanline[col]; +} + + +static LU_INLINE uint8_t paethPredictor(uint8_t a, uint8_t b, uint8_t c) +{ + unsigned int A = a, B = b, C = c; + int p = (int)A + (int)B - (int)C; + int pa = absi(p - (int)A); + int pb = absi(p - (int)B); + int pc = absi(p - (int)C); + + if (pa <= pb && pa <= pc) + return a; + if (pb <= pc) + return b; + return c; +} + +static LU_INLINE uint8_t deSub(PngInfoStruct *info, uint8_t filtered) +{ + return filtered + raw(info, info->currentByte-info->bytesPerPixel); +} + +static LU_INLINE uint8_t deUp(PngInfoStruct *info, uint8_t filtered) +{ + return filtered + prior(info, info->currentByte); +} + +static LU_INLINE uint8_t deAverage(PngInfoStruct *info, uint8_t filtered) +{ + uint16_t avg = (uint16_t)(raw(info, info->currentByte-info->bytesPerPixel) + + prior(info, info->currentByte)); + avg >>= 1; + return filtered + avg; +} + +static LU_INLINE uint8_t dePaeth(PngInfoStruct *info, uint8_t filtered) +{ + return filtered + paethPredictor( + raw(info, 
info->currentByte-info->bytesPerPixel),
+            prior(info, info->currentByte),
+            prior(info, info->currentByte-info->bytesPerPixel));
+}
+
+static LU_INLINE uint8_t none(PngInfoStruct *info)
+{
+    return raw(info, info->currentByte);
+}
+
+static LU_INLINE uint8_t sub(PngInfoStruct *info)
+{
+    return raw(info, info->currentByte) - raw(info, info->currentByte-info->bytesPerPixel);
+}
+
+static LU_INLINE uint8_t up(PngInfoStruct *info)
+{
+    return raw(info, info->currentByte) - prior(info, info->currentByte);
+}
+
+static LU_INLINE uint8_t average(PngInfoStruct *info)
+{
+    uint16_t avg = (uint16_t)(raw(info, info->currentByte-info->bytesPerPixel)
+                            + prior(info, info->currentByte));
+    avg >>= 1;
+    return raw(info, info->currentByte) - avg;
+}
+
+static LU_INLINE uint8_t paeth(PngInfoStruct *info)
+{
+    return raw(info, info->currentByte) - paethPredictor(
+            raw(info, info->currentByte-info->bytesPerPixel),
+            prior(info, info->currentByte),
+            prior(info, info->currentByte-info->bytesPerPixel));
+}
+
+
+
+/********************************************************
+ * Actual implementation
+ ********************************************************/
+
+/*
+ * Parses the IHDR chunk: validates the header fields, initialises the
+ * inflate stream and allocates the output image plus the two scanline
+ * buffers used for de-filtering.
+ *
+ * Must be the first chunk handled (info->chunksFound still 0).
+ *
+ * returns: PNG_OK on success, PNG_ERROR on a malformed header or when an
+ *          allocation / inflateInit fails.
+ */
+static LU_INLINE int parseIhdr(PngInfoStruct *info, PngChunk *chunk)
+{
+    if (info->chunksFound)
+    {
+        LUPNG_WARN(info,"PNG: malformed PNG file!");
+        return PNG_ERROR;
+    }
+
+    /* All header fields are read from the first 13 bytes of chunk->data;
+     * a shorter chunk would make those reads run past the chunk buffer. */
+    if (chunk->length < 13)
+    {
+        LUPNG_WARN(info,"PNG: malformed PNG file!");
+        return PNG_ERROR;
+    }
+
+    info->chunksFound |= PNG_IHDR;
+    info->width = swap32(*(uint32_t *)chunk->data);
+    info->height = swap32(*((uint32_t *)chunk->data + 1));
+    info->depth = *(chunk->data + 8);
+    info->colorType = *(chunk->data + 9);
+    info->compression = *(chunk->data + 10);
+    info->filter = *(chunk->data + 11);
+    info->interlace = *(chunk->data + 12);
+
+    switch (info->colorType)
+    {
+        case PNG_GRAYSCALE:
+            info->channels = 1;
+            break;
+        case PNG_TRUECOLOR:
+            info->channels = 3;
+            break;
+        case PNG_PALETTED:
+            info->channels = 3;
+            break;
+        case PNG_GRAYSCALE_ALPHA:
+            info->channels = 2;
+            break;
+        case PNG_TRUECOLOR_ALPHA:
+            info->channels = 4;
+            break;
+        default:
+            LUPNG_WARN(info,"PNG: illegal color type: %u",
+                       (unsigned int)info->colorType);
+            return PNG_ERROR;
+            break;
+    }
+
+    if (info->width <= 0 || info->height <= 0)
+    {
+        LUPNG_WARN(info, "PNG: illegal dimensions");
+        return PNG_ERROR;
+    }
+
+    /* Only depths 1, 2, 4, 8 and 16 are accepted.  Anything else is rejected
+     * here, including 0, which parseIdat would otherwise use as a divisor
+     * (8/info->depth). */
+    if ((info->depth != 1 && info->depth != 2 && info->depth != 4 &&
+         info->depth != 8 && info->depth != 16) ||
+        (info->colorType != PNG_GRAYSCALE && info->colorType != PNG_PALETTED &&
+         info->depth < 8) ||
+        (info->colorType == PNG_PALETTED && info->depth == 16))
+    {
+        LUPNG_WARN(info, "PNG: illegal bit depth for color type");
+        return PNG_ERROR;
+    }
+
+    if (info->compression)
+    {
+        LUPNG_WARN(info,"PNG: unknown compression method: %u",
+                   (unsigned int)info->compression);
+        return PNG_ERROR;
+    }
+
+    if (info->filter)
+    {
+        LUPNG_WARN(info,"PNG: unknown filter scheme: %u",
+                   (unsigned int)info->filter);
+        return PNG_ERROR;
+    }
+
+    memset(&(info->stream), 0, sizeof(info->stream));
+    if(inflateInit(&(info->stream)) != Z_OK)
+    {
+        LUPNG_WARN(info, "PNG: inflateInit failed!");
+        return PNG_ERROR;
+    }
+    info->img = luImageCreate(info->width, info->height,
+                              info->channels, info->depth < 16 ? 8 : 16, NULL, info->userCtx);
+    info->cimg = info->img;
+    /* Round the bit count UP to whole bytes: a row whose width is not a
+     * multiple of 8/depth pixels still occupies a final, partially used byte,
+     * and parseIdat stores that byte in currentScanline.  The product is
+     * computed in size_t to avoid overflowing the 32 bit width. */
+    info->scanlineBytes = MAX((((size_t)info->width * info->channels * info->depth) + 7) >> 3, 1);
+    info->currentScanline = (uint8_t *)info->userCtx->allocProc(info->scanlineBytes, info->userCtx->allocProcUserPtr);
+    info->previousScanline = (uint8_t *)info->userCtx->allocProc(info->scanlineBytes, info->userCtx->allocProcUserPtr);
+    info->currentCol = -1;
+    info->interlacePass = info->interlace ?
1 : 0;
+    info->bytesPerPixel = MAX((info->channels * info->depth) >> 3, 1);
+    if (!info->img || !info->currentScanline || !info->previousScanline)
+    {
+        LUPNG_WARN(info, "PNG: memory allocation failed!");
+        return PNG_ERROR;
+    }
+
+    return PNG_OK;
+}
+
+static LU_INLINE int parsePlte(PngInfoStruct *info, PngChunk *chunk)
+{
+    if (info->chunksFound & PNG_PLTE)
+    {
+        LUPNG_WARN(info, "PNG: too many palette chunks in file!");
+        return PNG_ERROR;
+    }
+    info->chunksFound |= PNG_PLTE;
+
+    if (info->chunksFound & PNG_IDAT || !(info->chunksFound & PNG_IHDR))
+    {
+        LUPNG_WARN(info, "PNG: malformed PNG file!");
+        return PNG_ERROR;
+    }
+
+    if (info->colorType == PNG_GRAYSCALE || info->colorType == PNG_GRAYSCALE_ALPHA)
+    {
+        LUPNG_WARN(info, "PNG: palettes are not allowed in grayscale images!");
+        return PNG_ERROR;
+    }
+
+    if (chunk->length % 3 != 0)
+    {
+        LUPNG_WARN(info, "PNG: invalid palette size!");
+        return PNG_ERROR;
+    }
+
+    info->paletteItems = chunk->length/3;
+    info->palette = (uint8_t *)info->userCtx->allocProc(chunk->length,info->userCtx->allocProcUserPtr);
+    if (!info->palette)
+    {
+        LUPNG_WARN(info, "PNG: memory allocation failed!");
+        return PNG_ERROR;
+    }
+    memcpy(info->palette, chunk->data, chunk->length);
+
+    return PNG_OK;
+}
+
+static LU_INLINE void stretchBits(uint8_t inByte, uint8_t outBytes[8], int depth)
+{
+    int i;
+    switch (depth) {
+        case 1:
+            for (i = 0; i < 8; ++i)
+                outBytes[i] = (inByte >> (7-i)) & 0x01;
+            break;
+
+        case 2:
+            outBytes[0] = (inByte >> 6) & 0x03;
+            outBytes[1] = (inByte >> 4) & 0x03;
+            outBytes[2] = (inByte >> 2) & 0x03;
+            outBytes[3] = inByte & 0x03;
+            break;
+
+        case 4:
+            outBytes[0] = (inByte >> 4) & 0x0F;
+            outBytes[1] = inByte & 0x0F;
+            break;
+
+        default:
+            break;
+    }
+}
+
+/*
+ * Stores one decoded sample (or one palette index) at the current pixel
+ * position of the output image and advances the position.  For 16 bit
+ * images two consecutive calls are combined into one sample.
+ *
+ * When the end of a scanline is reached the scanline buffers are swapped,
+ * the row advances by the increment of the current interlace pass and, for
+ * interlaced images, the next non-empty pass is selected.
+ *
+ * returns: 1 if at end of scanline, 0 otherwise
+ */
+static LU_INLINE int insertByte(PngInfoStruct *info, uint8_t byte)
+{
+    int advance = 0;
+    const uint8_t scale[] = {0x00, 0xFF, 0x55, 0x00, 0x11, 0x00, 0x00, 0x00};
+    /* index of the last entry of the startingRow/startingCol tables */
+    const int lastPass = 7;
+
+    /* for paletted images currentElem will always be 0 */
+    size_t idx = info->currentRow * info->width * info->channels
+                 + info->currentCol * info->channels
+                 + info->currentElem;
+
+    if (info->colorType != PNG_PALETTED)
+    {
+        if (info->depth == 8)
+            info->cimg->data[idx] = byte;
+
+        else if (info->depth < 8)
+            info->cimg->data[idx] = byte * scale[info->depth];
+
+        else /* depth == 16 */
+        {
+            info->tmpBytes[info->tmpCount] = byte;
+            if (info->tmpCount) /* just inserted 2nd byte */
+            {
+                uint16_t val = *(uint16_t *)info->tmpBytes;
+                val = swap16(val);
+                info->tmpCount = 0;
+
+                ((uint16_t *)(info->cimg->data))[idx] = val;
+            }
+            else
+            {
+                ++info->tmpCount;
+                return 0;
+            }
+        }
+
+        ++info->currentElem;
+        if (info->currentElem >= info->channels)
+        {
+            advance = 1;
+            info->currentElem = 0;
+        }
+    }
+    else
+    {
+        /* The spec limits palette size to 256 entries */
+        if (byte < info->paletteItems)
+        {
+            info->cimg->data[idx ] = info->palette[3*byte ];
+            info->cimg->data[idx+1] = info->palette[3*byte+1];
+            info->cimg->data[idx+2] = info->palette[3*byte+2];
+        }
+        else
+        {
+            LUPNG_WARN(info,"PNG: invalid palette index encountered!");
+        }
+        advance = 1;
+    }
+
+    if (advance)
+    {
+        /* advance to next pixel */
+        info->currentCol += colIncrement[info->interlacePass];
+
+        if (info->currentCol >= info->width)
+        {
+            uint8_t *tmp = info->currentScanline;
+            info->currentScanline = info->previousScanline;
+            info->previousScanline = tmp;
+
+            info->currentCol = -1;
+            info->currentByte = 0;
+
+            info->currentRow += rowIncrement[info->interlacePass];
+            if (info->currentRow >= info->height && info->interlace)
+            {
+                /* Skip passes that contain no pixels for this image size,
+                 * but never step past the last table entry. */
+                ++info->interlacePass;
+                while (info->interlacePass <= lastPass &&
+                       (startingCol[info->interlacePass] >= info->width ||
+                        startingRow[info->interlacePass] >= info->height))
+                    ++info->interlacePass;
+
+                if (info->interlacePass <= lastPass)
+                    info->currentRow = startingRow[info->interlacePass];
+                else
+                    /* Final pass finished: keep the pass index inside the
+                     * tables and leave currentRow >= height, which is what
+                     * makes parseIdat stop consuming data. */
+                    info->interlacePass = lastPass;
+            }
+            return 1;
+        }
+    }
+
+    return 0;
+}
+
+static LU_INLINE int parseIdat(PngInfoStruct *info, PngChunk *chunk)
+{
+    unsigned char filtered[BUF_SIZE];
+    int status = Z_OK;
+
+    if (!(info->chunksFound &
PNG_IHDR)) + { + LUPNG_WARN(info,"PNG: malformed PNG file!"); + return PNG_ERROR; + } + + if (info->colorType == PNG_PALETTED && !(info->chunksFound & PNG_PLTE)) + { + LUPNG_WARN(info,"PNG: palette required but missing!"); + return PNG_ERROR; + } + + info->chunksFound |= PNG_IDAT; + info->stream.next_in = (unsigned char *)chunk->data; + info->stream.avail_in = chunk->length; + do + { + size_t decompressed; + size_t i; + + info->stream.next_out = filtered; + info->stream.avail_out = BUF_SIZE; + status = inflate(&(info->stream), Z_NO_FLUSH); + decompressed = BUF_SIZE - info->stream.avail_out; + + if (status != Z_OK && + status != Z_STREAM_END && + status != Z_BUF_ERROR && + status != Z_NEED_DICT) + { + LUPNG_WARN(info, "PNG: inflate error!"); + return PNG_ERROR; + } + + for (i = 0; + i < decompressed && info->currentCol < info->width && info->currentRow < info->height; + ++i) + { + if (info->currentCol < 0) + { + info->currentCol = startingCol[info->interlacePass]; + info->currentFilter = filtered[i]; + } + else + { + uint8_t rawByte = 0; + uint8_t fullBytes[8] = {0}; + switch (info->currentFilter) + { + case PNG_FILTER_NONE: + rawByte = filtered[i]; + break; + case PNG_FILTER_SUB: + rawByte = deSub(info, filtered[i]); + break; + case PNG_FILTER_UP: + rawByte = deUp(info, filtered[i]); + break; + case PNG_FILTER_AVERAGE: + rawByte = deAverage(info, filtered[i]); + break; + case PNG_FILTER_PAETH: + rawByte = dePaeth(info, filtered[i]); + break; + default: + break; + } + + info->currentScanline[info->currentByte] = rawByte; + ++info->currentByte; + + if (info->depth < 8) + { + int j; + stretchBits(rawByte, fullBytes, info->depth); + for (j = 0; j < 8/info->depth; ++j) + if(insertByte(info, fullBytes[j])) + break; + } + else + insertByte(info, rawByte); + } + } + } while ((info->stream.avail_in > 0 || info->stream.avail_out == 0) + && info->currentCol < info->width && info->currentRow < info->height); + + return PNG_OK; +} + +static LU_INLINE PngChunk 
*readChunk(PngInfoStruct *info) +{ + PngChunk *chunk = (PngChunk *)info->userCtx->allocProc(sizeof(PngChunk),info->userCtx->allocProcUserPtr); + size_t read = 0; + if (!chunk) + { + LUPNG_WARN(info,"PNG: memory allocation failed!"); + return NULL; + } + + info->userCtx->readProc((void *)&chunk->length, 4, 1, info->userCtx->readProcUserPtr); + chunk->length = swap32(chunk->length); + if (chunk->length+4 < chunk->length) + { + LUPNG_WARN(info, "PNG: chunk claims to be absurdly large"); + info->userCtx->freeProc(chunk, info->userCtx->freeProcUserPtr); + return NULL; + } + + // Store chunk type and contents in the same buffer for convenience + chunk->type = (uint8_t *)info->userCtx->allocProc(chunk->length + 4, info->userCtx->allocProcUserPtr); + if (!chunk->type) + { + LUPNG_WARN(info,"PNG: memory allocation failed!"); + info->userCtx->freeProc(chunk, info->userCtx->freeProcUserPtr); + return NULL; + } + chunk->data = chunk->type + 4; + info->userCtx->readProc((void *)chunk->type, 1, chunk->length + 4, info->userCtx->readProcUserPtr); + read = info->userCtx->readProc((void *)&chunk->crc, 4, 1, info->userCtx->readProcUserPtr); + chunk->crc = swap32(chunk->crc); + + for (int i = 0; i < 4; ++i) + { + char byte = chunk->type[i]; + if ((byte < 'a' || byte > 'z') && (byte < 'A' || byte > 'Z')) + { + LUPNG_WARN(info, "PNG: invalid chunk name, possibly unprintable"); + releaseChunk(chunk, info->userCtx); + return NULL; + } + } + if (read != 1) + { + LUPNG_WARN(info, "PNG: read error"); + releaseChunk(chunk, info->userCtx); + return NULL; + } + + if (crc(chunk->type, chunk->length+4) != chunk->crc) + { + LUPNG_WARN(info, "PNG: CRC mismatch in \'%.4s\' chunk", (char *)chunk->type); + releaseChunk(chunk, info->userCtx); + return NULL; + } + + return chunk; +} + +static LU_INLINE int handleChunk(PngInfoStruct *info, PngChunk *chunk) +{ + /* critical chunk */ + if (!(chunk->type[0] & 0x20)) + { + if (bytesEqual(chunk->type, (const uint8_t *)"IHDR", 4)) + return parseIhdr(info, 
chunk);
+        if (bytesEqual(chunk->type, (const uint8_t *)"PLTE", 4))
+            return parsePlte(info, chunk);
+        if (bytesEqual(chunk->type, (const uint8_t *)"IDAT", 4))
+            return parseIdat(info, chunk);
+        if (bytesEqual(chunk->type, (const uint8_t *)"IEND", 4))
+        {
+            info->chunksFound |= PNG_IEND;
+            if (!(info->chunksFound & PNG_IDAT))
+            {
+                LUPNG_WARN(info, "PNG: no IDAT chunk found");
+                return PNG_ERROR;
+            }
+            return PNG_DONE;
+        }
+    }
+    /* ignore ancillary chunks for now */
+
+    return PNG_OK;
+}
+
+/*
+ * Decodes a PNG image using the callbacks of the given user context.
+ *
+ * Unless userCtx->skipSig is set, the PNG signature is read and verified
+ * first.  Chunks are then read and handled until IEND is seen or an error
+ * occurs.
+ *
+ * returns: the decoded image when handleChunk reported PNG_DONE, otherwise
+ *          NULL (any partially built image is released).
+ */
+LuImage *luPngReadUC(const LuUserContext *userCtx)
+{
+    uint8_t signature[PNG_SIG_SIZE];
+    int status = PNG_ERROR;
+
+    PngInfoStruct info;
+    memset(&info, 0, sizeof(PngInfoStruct));
+    info.userCtx = userCtx;
+
+    if (userCtx->skipSig)
+    {
+        /* The caller has already consumed the signature, so there is
+         * nothing to verify.  Without this branch status stayed PNG_ERROR
+         * and every skipSig decode was rejected as "invalid header". */
+        status = PNG_OK;
+    }
+    else
+    {
+        /* A short read leaves part of signature[] unset, so only compare
+         * when all bytes arrived. */
+        size_t got = info.userCtx->readProc((void *)signature, 1, PNG_SIG_SIZE, info.userCtx->readProcUserPtr);
+        if (got == PNG_SIG_SIZE && bytesEqual(signature, PNG_SIG, PNG_SIG_SIZE))
+            status = PNG_OK;
+    }
+
+    if (status == PNG_OK)
+    {
+        PngChunk *chunk;
+        while ((chunk = readChunk(&info)))
+        {
+            status = handleChunk(&info, chunk);
+            releaseChunk(chunk, info.userCtx);
+
+            if (status != PNG_OK)
+                break;
+        }
+    }
+    else
+        LUPNG_WARN(&info, "PNG: invalid header");
+
+    userCtx->freeProc(info.currentScanline, userCtx->freeProcUserPtr);
+    userCtx->freeProc(info.previousScanline, userCtx->freeProcUserPtr);
+    userCtx->freeProc(info.palette, userCtx->freeProcUserPtr);
+    inflateEnd(&info.stream);
+
+    if (status == PNG_DONE)
+        return info.img;
+    else
+        if (info.img)
+            luImageRelease(info.img, info.userCtx);
+
+    return NULL;
+}
+
+LuImage *luPngRead(PngReadProc readProc, void *userPtr, int skipSig)
+{
+    LuUserContext userCtx;
+
+    luUserContextInitDefault(&userCtx);
+    userCtx.readProc = readProc;
+    userCtx.readProcUserPtr = userPtr;
+    userCtx.skipSig = skipSig;
+    return luPngReadUC(&userCtx);
+}
+
+LuImage *luPngReadFile(const char *filename, LuUserContext *userCtx)
+{
+    LuUserContext tmp_userCtx;
+    if (userCtx == NULL) {
+        luUserContextInitDefault(&tmp_userCtx);
+        userCtx =
&tmp_userCtx; + } + + LuImage *img; + FILE *f = fopen(filename,"rb"); + + if (f) { + userCtx->readProc = internalFread; + userCtx->readProcUserPtr = f; + img = luPngReadUC(userCtx); + fclose(f); + } else { + LUPNG_WARN_UC(userCtx, "PNG: failed to open '%s'", filename); + img = NULL; + } + + return img; +} + +static LU_INLINE int writeIhdr(PngInfoStruct *info) +{ + static uint8_t buf[17]; + static const uint8_t colorType[] = { + PNG_GRAYSCALE, + PNG_GRAYSCALE_ALPHA, + PNG_TRUECOLOR, + PNG_TRUECOLOR_ALPHA + }; + size_t written = 0; + PngChunk c; + + if (info->cimg->channels > 4) + { + LUPNG_WARN(info, "PNG: too many channels in image"); + return PNG_ERROR; + } + + c.length = swap32(13); + c.type = buf; /* 4 (type) + 4 + 4 + 5x1 */ + c.data = c.type + 4; + + memcpy((void *)c.type, (void *)"IHDR", 4); + *(uint32_t *)(c.data) = swap32((uint32_t)info->cimg->width); + *(uint32_t *)(c.data + 4) = swap32((uint32_t)info->cimg->height); + *(c.data + 8) = info->cimg->depth; + *(c.data + 9) = colorType[info->cimg->channels-1]; + *(c.data + 10) = 0; /* compression method */ + *(c.data + 11) = 0; /* filter method */ + *(c.data + 12) = 0; /* interlace method: none */ + + c.crc = swap32(crc(c.type, 17)); + + written += info->userCtx->writeProc((void *)&c.length, 4, 1, info->userCtx->writeProcUserPtr) * 4; + written += info->userCtx->writeProc((void *)c.type, 1, 4, info->userCtx->writeProcUserPtr); + written += info->userCtx->writeProc((void *)c.data, 1, 13, info->userCtx->writeProcUserPtr); + written += info->userCtx->writeProc((void *)&c.crc, 4, 1, info->userCtx->writeProcUserPtr) * 4; + + if (written != 25) + { + LUPNG_WARN(info, "PNG: write error"); + return PNG_ERROR; + } + + return PNG_OK; +} + +static LU_INLINE int writeIdat(PngInfoStruct *info, uint8_t *buf, size_t buflen) +{ + size_t written = 0; + PngChunk c; + + c.length = swap32((uint32_t)(buflen-4)); + c.crc = swap32(crc(buf, buflen)); + + written += info->userCtx->writeProc((void *)&c.length, 4, 1, 
info->userCtx->writeProcUserPtr) * 4; + written += info->userCtx->writeProc((void *)buf, 1, buflen, info->userCtx->writeProcUserPtr); + written += info->userCtx->writeProc((void *)&c.crc, 4, 1, info->userCtx->writeProcUserPtr) * 4; + + if (written != buflen+8) + { + LUPNG_WARN(info, "PNG: write error"); + return PNG_ERROR; + } + + return PNG_OK; +} + +static LU_INLINE void advanceBytep(PngInfoStruct *info, int is16bit) +{ + if (is16bit) + { + if (info->currentByte%2) + --info->currentByte; + else + info->currentByte+=3; + } + else + ++info->currentByte; +} + +static LU_INLINE size_t filterScanline(PngInfoStruct *info, + uint8_t(*f)(PngInfoStruct *info), + uint8_t filter, + uint8_t *filterCandidate, + int is16bit) +{ + size_t curSum = 0; + size_t fc; + + filterCandidate[0] = filter; + for (info->currentByte = is16bit ? 1 : 0, fc = 1; + info->currentByte < info->scanlineBytes; ++fc, advanceBytep(info, is16bit) ) + { + uint8_t val = f(info); + filterCandidate[fc] = val; + curSum += val; + } + + return curSum; +} + +/* + * Processes the input image and calls writeIdat for every BUF_SIZE compressed + * bytes. 
+ */
+/*
+ * returns: PNG_OK when every row was filtered, compressed and written;
+ *          PNG_ERROR when a buffer allocation, deflateInit, deflate or
+ *          writeIdat fails.  The deflate stream is left for the caller to
+ *          end with deflateEnd().
+ */
+static LU_INLINE int processPixels(PngInfoStruct *info)
+{
+    uint8_t idatBuf[BUF_SIZE+4] = {'I', 'D', 'A', 'T'};
+    uint8_t *compressed = idatBuf+4;
+    uint8_t *filterCandidate = (uint8_t *)info->userCtx->allocProc(info->scanlineBytes+1, info->userCtx->allocProcUserPtr);
+    uint8_t *bestCandidate = (uint8_t *)info->userCtx->allocProc(info->scanlineBytes+1, info->userCtx->allocProcUserPtr);
+    size_t minSum = (size_t)-1, curSum = 0;
+    int status = Z_OK;
+    int result = PNG_OK;
+    int is16bit = info->cimg->depth == 16;
+
+    memset(&(info->stream), 0, sizeof(info->stream));
+
+    /* Both buffers are written by filterScanline; continuing without them
+     * would dereference NULL. */
+    if (!filterCandidate || !bestCandidate)
+    {
+        LUPNG_WARN(info, "PNG: memory allocation failed!");
+        info->userCtx->freeProc(filterCandidate, info->userCtx->freeProcUserPtr);
+        info->userCtx->freeProc(bestCandidate, info->userCtx->freeProcUserPtr);
+        return PNG_ERROR;
+    }
+
+    if(deflateInit(&(info->stream), info->userCtx->compressionLevel) != Z_OK)
+    {
+        LUPNG_WARN(info, "PNG: deflateInit failed!");
+        info->userCtx->freeProc(filterCandidate, info->userCtx->freeProcUserPtr);
+        info->userCtx->freeProc(bestCandidate, info->userCtx->freeProcUserPtr);
+        return PNG_ERROR;
+    }
+
+    info->stream.avail_out = BUF_SIZE;
+    info->stream.next_out = compressed;
+
+    for (info->currentRow = 0; info->currentRow < info->cimg->height;
+         ++info->currentRow)
+    {
+        int flush = (info->currentRow < info->cimg->height-1) ?
+                    Z_NO_FLUSH : Z_FINISH;
+        minSum = (size_t)-1;
+
+        /*
+         * 1st time it doesn't matter, the filters never look at the previous
+         * scanline when processing row 0. And next time it'll be valid.
+         */
+        info->previousScanline = info->currentScanline;
+        info->currentScanline = info->cimg->data + (info->currentRow*info->scanlineBytes);
+
+        /*
+         * Try to choose the best filter for each scanline.
+         * Breaks in case of overflow, but hey it's just a heuristic.
+         */
+        for (info->currentFilter = PNG_FILTER_NONE; info->currentFilter <= PNG_FILTER_PAETH; ++info->currentFilter)
+        {
+
+            switch (info->currentFilter)
+            {
+                case PNG_FILTER_NONE:
+                    curSum = filterScanline(info, none, PNG_FILTER_NONE, filterCandidate, is16bit);
+                    break;
+
+                case PNG_FILTER_SUB:
+                    curSum = filterScanline(info, sub, PNG_FILTER_SUB, filterCandidate, is16bit);
+                    break;
+
+                case PNG_FILTER_UP:
+                    curSum = filterScanline(info, up, PNG_FILTER_UP, filterCandidate, is16bit);
+                    break;
+
+                case PNG_FILTER_AVERAGE:
+                    curSum = filterScanline(info, average, PNG_FILTER_AVERAGE, filterCandidate, is16bit);
+                    break;
+
+                case PNG_FILTER_PAETH:
+                    curSum = filterScanline(info, paeth, PNG_FILTER_PAETH, filterCandidate, is16bit);
+                    break;
+
+                default:
+                    break;
+            }
+
+            if (curSum < minSum || !info->currentFilter)
+            {
+                uint8_t *tmp = bestCandidate;
+                bestCandidate = filterCandidate;
+                filterCandidate = tmp;
+                minSum = curSum;
+            }
+        }
+
+        info->stream.avail_in = (unsigned int)info->scanlineBytes+1;
+        info->stream.next_in = bestCandidate;
+
+        /* compress bestCandidate */
+        do
+        {
+            status = deflate(&info->stream, flush);
+
+            /* A stream error never resolves itself; bail out instead of
+             * spinning in this loop. */
+            if (status == Z_STREAM_ERROR)
+            {
+                LUPNG_WARN(info, "PNG: deflate error!");
+                result = PNG_ERROR;
+                break;
+            }
+
+            if (info->stream.avail_out < BUF_SIZE)
+            {
+                /* writeIdat has already emitted the warning on failure */
+                if (writeIdat(info, idatBuf, BUF_SIZE-info->stream.avail_out+4) != PNG_OK)
+                {
+                    result = PNG_ERROR;
+                    break;
+                }
+                info->stream.next_out = compressed;
+                info->stream.avail_out = BUF_SIZE;
+            }
+        } while ((flush == Z_FINISH && status != Z_STREAM_END)
+                 || (flush == Z_NO_FLUSH && info->stream.avail_in));
+
+        if (result != PNG_OK)
+            break;
+    }
+
+    info->userCtx->freeProc(filterCandidate, info->userCtx->freeProcUserPtr);
+    info->userCtx->freeProc(bestCandidate, info->userCtx->freeProcUserPtr);
+
+    return result;
+}
+
+static LU_INLINE int writeIend(PngInfoStruct *info)
+{
+    PngChunk c = { 0, (uint8_t *)"IEND", 0, 0 };
+    size_t written = 0;
+    c.crc = swap32(crc(c.type, 4));
+
+    written += info->userCtx->writeProc((void *)&c.length, 4, 1, info->userCtx->writeProcUserPtr) * 4;
+    written += info->userCtx->writeProc((void *)c.type, 1, 4, info->userCtx->writeProcUserPtr);
+    written +=
info->userCtx->writeProc((void *)&c.crc, 4, 1, info->userCtx->writeProcUserPtr) * 4;
+
+    if (written != 12)
+    {
+        LUPNG_WARN(info, "PNG: write error");
+        return PNG_ERROR;
+    }
+
+    return PNG_OK;
+}
+
+/*
+ * Encodes img as PNG through the write procedure of the given user context:
+ * signature, IHDR, the compressed pixel data and IEND.
+ *
+ * returns: PNG_OK on success, PNG_ERROR on invalid arguments or when any
+ *          stage fails.
+ */
+int luPngWriteUC(const LuUserContext *userCtx, const LuImage *img)
+{
+    PngInfoStruct info;
+
+    /* Everything below dereferences these unconditionally. */
+    if (!userCtx || !img || !userCtx->writeProc)
+    {
+        if (userCtx)
+            LUPNG_WARN_UC(userCtx, "PNG: invalid arguments");
+        return PNG_ERROR;
+    }
+
+    memset(&info, 0, sizeof(PngInfoStruct));
+    info.userCtx = userCtx;
+    info.cimg = img;
+    info.bytesPerPixel = (info.cimg->channels * info.cimg->depth) >> 3;
+
+    if (info.userCtx->writeProc((void *)PNG_SIG, 1, PNG_SIG_SIZE, info.userCtx->writeProcUserPtr) != PNG_SIG_SIZE)
+    {
+        LUPNG_WARN(&info, "PNG: write error");
+        return PNG_ERROR;
+    }
+
+    if (writeIhdr(&info) != PNG_OK)
+        return PNG_ERROR;
+
+    info.scanlineBytes = (info.cimg->depth >> 3) * info.cimg->channels * info.cimg->width;
+    if (processPixels(&info) != PNG_OK)
+    {
+        deflateEnd(&(info.stream));
+        return PNG_ERROR;
+    }
+
+    deflateEnd(&(info.stream));
+    return writeIend(&info);
+}
+
+int luPngWrite(PngWriteProc writeProc, void *userPtr, const LuImage *img)
+{
+    LuUserContext userCtx;
+
+    luUserContextInitDefault(&userCtx);
+    userCtx.writeProc = writeProc;
+    userCtx.writeProcUserPtr = userPtr;
+    return luPngWriteUC(&userCtx, img);
+}
+
+/*
+ * Encodes img as PNG into the named file using a default user context.
+ *
+ * returns: PNG_OK when the file was opened, fully encoded and closed
+ *          without error; PNG_ERROR otherwise.
+ */
+int luPngWriteFile(const char *filename, const LuImage *img)
+{
+    LuUserContext userCtx;
+    FILE *f;
+    int status;
+
+    if (!img)
+    {
+        return PNG_ERROR;
+    }
+
+    f = fopen(filename,"wb");
+    luUserContextInitDefault(&userCtx);
+    if (!f)
+    {
+        LUPNG_WARN_UC(&userCtx, "PNG: failed to open '%s'", filename);
+        return PNG_ERROR;
+    }
+
+    userCtx.writeProc = internalFwrite;
+    userCtx.writeProcUserPtr = f;
+    /* Report the encoder's result instead of unconditionally claiming
+     * success. */
+    status = luPngWriteUC(&userCtx, img);
+
+    /* Buffered data is flushed by fclose, so a failure here means the file
+     * is incomplete. */
+    if (fclose(f) != 0)
+    {
+        LUPNG_WARN_UC(&userCtx, "PNG: write error");
+        status = PNG_ERROR;
+    }
+
+    return status;
+}
+
+void luImageRelease(LuImage *img, const LuUserContext *userCtx)
+{
+    LuUserContext ucDefault;
+
+    if (userCtx == NULL)
+    {
+        luUserContextInitDefault(&ucDefault);
+        userCtx = &ucDefault;
+    }
+
+    userCtx->freeProc(img->data, userCtx->freeProcUserPtr);
+    if (userCtx->overrideImage != img)
+        userCtx->freeProc(img,
userCtx->freeProcUserPtr); +} + +LuImage *luImageCreate(size_t width, size_t height, uint8_t channels, uint8_t depth, + uint8_t *buffer, const LuUserContext *userCtx) +{ + LuImage *img; + LuUserContext ucDefault; + + if (userCtx == NULL) { + luUserContextInitDefault(&ucDefault); + userCtx = &ucDefault; + } + + if (depth != 8 && depth != 16) + { + LUPNG_WARN_UC(userCtx,"Image: only bit depths 8 and 16 are supported!"); + return NULL; + } + if (width > 0x7FFFFFFF || height > 0x7FFFFFFF) { + LUPNG_WARN_UC(userCtx, "Image: only 32 bit signed image dimensions are supported!"); + return NULL; + } + + if (userCtx->overrideImage) + img = userCtx->overrideImage; + else + img = (LuImage *)userCtx->allocProc(sizeof(LuImage), userCtx->allocProcUserPtr); + if (!img) + return NULL; + + img->width = (int32_t)width; + img->height = (int32_t)height; + img->channels = channels; + img->depth = depth; + img->dataSize = (size_t)((depth >> 3) * width * height * channels); + if (buffer) + img->data = buffer; + else + img->data = (uint8_t *)userCtx->allocProc(img->dataSize, userCtx->allocProcUserPtr); + + if (img->data == NULL) + { + luImageRelease(img, userCtx); + return NULL; + } + + return img; +} + +uint8_t *luImageExtractBufAndRelease(LuImage *img, const LuUserContext *userCtx) +{ + uint8_t *data; + LuUserContext ucDefault; + + if (userCtx == NULL) { + luUserContextInitDefault(&ucDefault); + userCtx = &ucDefault; + } + + if (img) + { + data = img->data; + img->data = NULL; + luImageRelease(img, userCtx); + } + else + { + data = NULL; + } + + return data; +} + +void luUserContextInitDefault(LuUserContext *userCtx) +{ + userCtx->readProc=NULL; + userCtx->readProcUserPtr=NULL; + userCtx->skipSig = 0; + + userCtx->writeProc=NULL; + userCtx->writeProcUserPtr=NULL; + userCtx->compressionLevel=Z_DEFAULT_COMPRESSION; + + userCtx->allocProc=internalMalloc; + userCtx->allocProcUserPtr=NULL; + userCtx->freeProc=internalFree; + userCtx->freeProcUserPtr=NULL; + + userCtx->warnProc=internalPrintf; 
+ userCtx->warnProcUserPtr=(void*)stderr; + + userCtx->overrideImage=NULL; +} \ No newline at end of file diff --git a/tests/regression/tex/lupng.h b/tests/regression/tex/lupng.h new file mode 100644 index 00000000..5c3f8465 --- /dev/null +++ b/tests/regression/tex/lupng.h @@ -0,0 +1,186 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2014 Jan Solanti + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifdef __cplusplus +extern "C" { +#endif + +#pragma once + +#if defined(_MSC_VER) && (_MSC_VER < 1600) +typedef __int8 int8_t; +typedef __int16 int16_t; +typedef __int32 int32_t; +typedef unsigned __int8 uint8_t; +typedef unsigned __int16 uint16_t; +typedef unsigned __int32 uint32_t; +#else +#include +#include +#endif + +typedef struct { + int32_t width; + int32_t height; + uint8_t channels; + uint8_t depth; /* must be 8 or 16 */ + size_t dataSize; + uint8_t *data; +} LuImage; + +typedef size_t (*PngReadProc)(void *outPtr, size_t size, size_t count, void *userPtr); +typedef size_t (*PngWriteProc)(const void *inPtr, size_t size, size_t count, void *userPtr); +typedef void* (*PngAllocProc)(size_t size, void *userPtr); +typedef void (*PngFreeProc)(void *ptr, void *userPtr); +typedef void (*PngWarnProc)(void *userPtr, const char *fmt, ...); + +typedef struct { + /* loader */ + PngReadProc readProc; + void *readProcUserPtr; + int skipSig; + + /* writer */ + PngWriteProc writeProc; + void *writeProcUserPtr; + int compressionLevel; + + /* memory allocation */ + PngAllocProc allocProc; + void *allocProcUserPtr; + PngFreeProc freeProc; + void *freeProcUserPtr; + + /* warnings/error output */ + PngWarnProc warnProc; /* set to NULL to disable output altogether */ + void *warnProcUserPtr; + + /* special case: avoid allocating a LuImage when loading or creating + * an image, just use this one */ + LuImage *overrideImage; +} LuUserContext; + +/** + * Initializes a LuUserContext to use the defaul malloc implementation. + * + * @param userCtx the LuUserContext to initialize + */ +void luUserContextInitDefault(LuUserContext *userCtx); + +/** + * Creates a new Image object with the specified attributes. + * The data store of the Image is allocated but its contents are undefined. + * Only 8 and 16 bits deep images with 1-4 channels are supported. 
+ * + * @param buffer pointer to an existing buffer (which may already contain the + * image data), or NULL to internally allocate a new buffer + * @param userCtx the user context (with the memory allocator function + * pointers to use), or NULL to use the default allocator + * (malloc). + */ +LuImage *luImageCreate(size_t width, size_t height, uint8_t channels, uint8_t depth, + uint8_t *buffer, const LuUserContext *usrCtx); + +/** + * Releases the memory associated with the given Image object. + * + * @param userCtx the user context (with the memory deallocator function + * pointers to use), or NULL to use the default deallocator + * (free). The deallocator should match the ones used for + * allocation. + */ +void luImageRelease(LuImage *img, const LuUserContext *usrCtx); + +/** + * Extracts the raw image buffer form a LuImage and releases the + * then-orphaned LuImage object. This can be used if you want to use + * the image data in your own structures. + * + * @param userCtx the user context (with the memory deallocator function + * pointers to use), or NULL to use the default deallocator + * (free). The deallocator should match the ones used for + * allocation. + */ +uint8_t *luImageExtractBufAndRelease(LuImage *img, const LuUserContext *userCtx); + +/** + * Decodes a PNG image from a file + * + * @param filename the file name (optionally with full path) to read from. + * @param userCtx the user context (with the memory allocator function + * pointers to use), or NULL to use the default allocator + * (malloc). + */ +LuImage *luPngReadFile(const char *filename, LuUserContext *userCtx); + +/** + * Decodes a PNG image with the provided read function into a LuImage struct + * + * @param readProc a function pointer to a user-defined function to use for + * reading the PNG data. 
+ * @param userPtr an opaque pointer provided as an argument to readProc + * @param skipSig don't verify PNG signature - the bytes have already been + * removed from the input stream + */ +LuImage *luPngRead(PngReadProc readProc, void *userPtr, int skipSig); + +/** + * Decodes a PNG image with the provided user context into a LuImage struct + * + * @param userCtx the LuUserContext to use + */ +LuImage *luPngReadUC(const LuUserContext *userCtx); + +/** + * Encodes a LuImage struct to PNG and writes it out to a file. + * + * @param filename the file name (optionally with full path) to write to. + * Existing files will be overwritten! + * @param img the LuImage to encode + */ +int luPngWriteFile(const char *filename, const LuImage *img); + +/** + * Encodes a LuImage struct to PNG and writes it out using a user-defined write + * function. + * + * @param writeProc a function pointer to a user-defined function that will be + * used for writing the final PNG data. + * @param userPtr an opaque pointer provided as an argument to writeProc + * @param img the LuImage to encode + */ +int luPngWrite(PngWriteProc writeProc, void *userPtr, const LuImage *img); + +/** + * Encodes a LuImage struct to PNG and writes it out with the provided user + * context. 
+ * + * @param userCtx the LuUserContext to use + * @param img the LuImage to encode + */ +int luPngWriteUC(const LuUserContext *userCtx, const LuImage *img); + +#ifdef __cplusplus +} +#endif \ No newline at end of file diff --git a/tests/regression/tex/main.cpp b/tests/regression/tex/main.cpp new file mode 100644 index 00000000..39ffea0c --- /dev/null +++ b/tests/regression/tex/main.cpp @@ -0,0 +1,256 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include "common.h" +#include "utils.h" + +#define RT_CHECK(_expr) \ + do { \ + int _ret = _expr; \ + if (0 == _ret) \ + break; \ + printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \ + cleanup(); \ + exit(-1); \ + } while (false) + +/////////////////////////////////////////////////////////////////////////////// + +const char* kernel_file = "kernel.bin"; +const char* input_file = "palette64.png"; +const char* output_file = "output.png"; +int wrap = 0; +int filter = 0; +float scale = 1.0f; +int format = 0; +bool use_sw = false; +ePixelFormat eformat = FORMAT_A8R8G8B8; + +vx_device_h device = nullptr; +vx_buffer_h buffer = nullptr; + +static void show_usage() { + std::cout << "Vortex Texture Test." 
<< std::endl; + std::cout << "Usage: [-k: kernel] [-i image] [-o image] [-s scale] [-w wrap] [-f format] [-g filter] [-z no_hw] [-h: help]" << std::endl; +} + +static void parse_args(int argc, char **argv) { + int c; + while ((c = getopt(argc, argv, "zi:o:k:w:f:g:h?")) != -1) { + switch (c) { + case 'i': + input_file = optarg; + break; + case 'o': + output_file = optarg; + break; + case 's': + scale = std::stof(optarg, NULL); + break; + case 'w': + wrap = std::atoi(optarg); + break; + case 'z': + use_sw = true; + break; + case 'f': { + format = std::atoi(optarg); + switch (format) { + case 0: eformat = FORMAT_A8R8G8B8; break; + case 1: eformat = FORMAT_R5G6B5; break; + case 2: eformat = FORMAT_R4G4B4A4; break; + case 3: eformat = FORMAT_L8; break; + case 4: eformat = FORMAT_A8; break; + default: + std::cout << "Error: invalid format: " << format << std::endl; + exit(1); + } + } break; + case 'g': + filter = std::atoi(optarg); + break; + case 'k': + kernel_file = optarg; + break; + case 'h': + case '?': { + show_usage(); + exit(0); + } break; + default: + show_usage(); + exit(-1); + } + } +} + +void cleanup() { + if (buffer) { + vx_buf_release(buffer); + } + if (device) { + vx_dev_close(device); + } +} + +int run_test(const kernel_arg_t& kernel_arg, + uint32_t buf_size, + uint32_t width, + uint32_t height) { + auto time_start = std::chrono::high_resolution_clock::now(); + + // start device + std::cout << "start device" << std::endl; + RT_CHECK(vx_start(device)); + + // wait for completion + std::cout << "wait for completion" << std::endl; + RT_CHECK(vx_ready_wait(device, -1)); + + auto time_end = std::chrono::high_resolution_clock::now(); + double elapsed = std::chrono::duration_cast(time_end - time_start).count(); + printf("Elapsed time: %lg ms\n", elapsed); + + // download destination buffer + std::cout << "download destination buffer" << std::endl; + RT_CHECK(vx_copy_from_dev(buffer, kernel_arg.dst_ptr, buf_size, 0)); + + std::vector dst_pixels(buf_size); + auto 
buf_ptr = (uint8_t*)vx_host_ptr(buffer); + for (uint32_t i = 0; i < buf_size; ++i) { + dst_pixels[i] = buf_ptr[i]; + } + + // save output image + std::cout << "save output image" << std::endl; + //dump_image(dst_pixels, width, height, bpp); + RT_CHECK(SaveImage(output_file, FORMAT_A8R8G8B8, dst_pixels, width, height)); + + return 0; +} + +int main(int argc, char *argv[]) { + kernel_arg_t kernel_arg; + std::vector src_pixels; + uint32_t src_width; + uint32_t src_height; + + // parse command arguments + parse_args(argc, argv); + + RT_CHECK(LoadImage(input_file, eformat, src_pixels, &src_width, &src_height)); + + // check power of two support + if (!ISPOW2(src_width) || !ISPOW2(src_height)) { + std::cout << "Error: only power of two textures supported: width=" << src_width << ", heigth=" << src_height << std::endl; + return -1; + } + + uint32_t src_bpp = Format::GetInfo(eformat).BytePerPixel; + + //dump_image(src_pixels, src_width, src_height, src_bpp); + + uint32_t src_bufsize = src_bpp * src_width * src_height; + + uint32_t dst_width = (uint32_t)(src_width * scale); + uint32_t dst_height = (uint32_t)(src_height * scale); + uint32_t dst_bpp = 4; + uint32_t dst_bufsize = dst_bpp * dst_width * dst_height; + + // open device connection + std::cout << "open device connection" << std::endl; + RT_CHECK(vx_dev_open(&device)); + + unsigned max_cores, max_warps, max_threads; + RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_CORES, &max_cores)); + RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_WARPS, &max_warps)); + RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_THREADS, &max_threads)); + + uint32_t num_tasks = max_cores * max_warps * max_threads; + + std::cout << "number of tasks: " << std::dec << num_tasks << std::endl; + std::cout << "source buffer: width=" << src_width << ", heigth=" << src_height << ", size=" << src_bufsize << " bytes" << std::endl; + std::cout << "destination buffer: width=" << dst_width << ", heigth=" << dst_height << ", size=" << dst_bufsize << " bytes" << std::endl; + + 
// upload program + std::cout << "upload program" << std::endl; + RT_CHECK(vx_upload_kernel_file(device, kernel_file)); + + // allocate device memory + std::cout << "allocate device memory" << std::endl; + size_t src_addr, dst_addr; + RT_CHECK(vx_alloc_dev_mem(device, src_bufsize, &src_addr)); + RT_CHECK(vx_alloc_dev_mem(device, dst_bufsize, &dst_addr)); + + std::cout << "src_addr=0x" << std::hex << src_addr << std::endl; + std::cout << "dst_addr=0x" << std::hex << dst_addr << std::endl; + + // allocate staging shared memory + std::cout << "allocate shared memory" << std::endl; + uint32_t alloc_size = std::max(sizeof(kernel_arg_t), std::max(src_bufsize, dst_bufsize)); + RT_CHECK(vx_alloc_shared_mem(device, alloc_size, &buffer)); + + // upload kernel argument + std::cout << "upload kernel argument" << std::endl; + { + kernel_arg.num_tasks = std::min(num_tasks, dst_height); + kernel_arg.format = format; + kernel_arg.filter = filter; + kernel_arg.wrap = wrap; + kernel_arg.use_sw = use_sw; + kernel_arg.lod = 0x0; + + kernel_arg.src_logWidth = (uint32_t)std::log2(src_width); + kernel_arg.src_logHeight = (uint32_t)std::log2(src_height); + kernel_arg.src_stride = src_bpp; + kernel_arg.src_pitch = src_bpp * src_width; + kernel_arg.src_ptr = src_addr; + + kernel_arg.dst_width = dst_width; + kernel_arg.dst_height = dst_height; + kernel_arg.dst_stride = dst_bpp; + kernel_arg.dst_pitch = dst_bpp * dst_width; + kernel_arg.dst_ptr = dst_addr; + + auto buf_ptr = (int*)vx_host_ptr(buffer); + memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t)); + RT_CHECK(vx_copy_to_dev(buffer, KERNEL_ARG_DEV_MEM_ADDR, sizeof(kernel_arg_t), 0)); + } + + // upload source buffer + std::cout << "upload source buffer" << std::endl; + { + auto buf_ptr = (int8_t*)vx_host_ptr(buffer); + for (uint32_t i = 0; i < src_bufsize; ++i) { + buf_ptr[i] = src_pixels[i]; + } + RT_CHECK(vx_copy_to_dev(buffer, kernel_arg.src_ptr, src_bufsize, 0)); + } + + // clear destination buffer + std::cout << "clear destination 
buffer" << std::endl; + { + auto buf_ptr = (int32_t*)vx_host_ptr(buffer); + for (uint32_t i = 0; i < (dst_bufsize/4); ++i) { + buf_ptr[i] = 0xdeadbeef; + } + RT_CHECK(vx_copy_to_dev(buffer, kernel_arg.dst_ptr, dst_bufsize, 0)); + } + + // run tests + std::cout << "run tests" << std::endl; + RT_CHECK(run_test(kernel_arg, dst_bufsize, dst_width, dst_height)); + + // cleanup + std::cout << "cleanup" << std::endl; + cleanup(); + + std::cout << "PASSED!" << std::endl; + + return 0; +} \ No newline at end of file diff --git a/tests/regression/tex/palette16.png b/tests/regression/tex/palette16.png new file mode 100644 index 00000000..34ab80d5 Binary files /dev/null and b/tests/regression/tex/palette16.png differ diff --git a/tests/regression/tex/palette4.png b/tests/regression/tex/palette4.png new file mode 100644 index 00000000..be0979e1 Binary files /dev/null and b/tests/regression/tex/palette4.png differ diff --git a/tests/regression/tex/palette64.png b/tests/regression/tex/palette64.png new file mode 100644 index 00000000..aa497e3e Binary files /dev/null and b/tests/regression/tex/palette64.png differ diff --git a/tests/regression/tex/rainbow.png b/tests/regression/tex/rainbow.png new file mode 100644 index 00000000..ec0c117b Binary files /dev/null and b/tests/regression/tex/rainbow.png differ diff --git a/tests/regression/tex/soccer.png b/tests/regression/tex/soccer.png new file mode 100644 index 00000000..5d57ee82 Binary files /dev/null and b/tests/regression/tex/soccer.png differ diff --git a/tests/regression/tex/surfacedesc.h b/tests/regression/tex/surfacedesc.h new file mode 100644 index 00000000..cf303584 --- /dev/null +++ b/tests/regression/tex/surfacedesc.h @@ -0,0 +1,25 @@ +// +// Copyright (c) Blaise Tine. All rights reserved. +// +// +// Use of this sample source code is subject to the terms of the Microsoft +// license agreement under which you licensed this sample source code. 
If +// you did not accept the terms of the license agreement, you are not +// authorized to use this sample source code. For the terms of the license, +// please see the license agreement between you and Microsoft or, if applicable, +// see the LICENSE.RTF on your install media or the root of your tools +// installation. +// THE SAMPLE SOURCE CODE IS PROVIDED "AS IS", WITH NO WARRANTIES OR +// INDEMNITIES. +// +#pragma once + +#include "format.h" + +struct SurfaceDesc { + ePixelFormat Format; + uint8_t *pBits; + uint32_t Width; + uint32_t Height; + uint32_t Pitch; +}; \ No newline at end of file diff --git a/tests/regression/tex/texsw.h b/tests/regression/tex/texsw.h new file mode 100644 index 00000000..96b9a19e --- /dev/null +++ b/tests/regression/tex/texsw.h @@ -0,0 +1,167 @@ +#ifndef _TEXSW_H_ + +#include "common.h" + +#define TEX_LOD_MAX 11 + +#define MIN(x, y) ((x < y) ? (x) : (y)) + +#define MAX(x, y) ((x > y) ? (x) : (y)) + +inline int address(int wrap, int value) { + switch (wrap) { + case 1: return value & 0xfffff; + default: + case 0: return MIN(MAX(value, 0), 0xfffff); + } +} + +inline void unpack(int format, int value, int* l, int* h) { + switch (format) { + case 1: + case 2: + *l = value; + *h = 0; + break; + case 3: + *l = (value | (value << 8)) & 0x00ff00ff; + *h = 0; + break; + case 4: + *l = (value | (value << 16)) & 0x07e0f81f; + *h = 0; + break; + case 5: + *l = (value | (value << 12)) & 0x0f0f0f0f; + *h = 0; + break; + default: + case 0: + *l = value & 0x00ff00ff; + *h = (value >> 8) & 0x00ff00ff; + break; + } +} + +inline void lerp(int al, int ah, int bl, int bh, int frac, int* l, int* h) { + *l = (al + (((bl - al) * frac) >> 8)) & 0x00ff00ff; + *h = (ah + (((bh - ah) * frac) >> 8)) & 0x00ff00ff; +} + +inline int pack(int format, int l, int h) { + switch (format) { + case 1: + case 2: + return l; + case 3: + return (l | (l >> 8)) & 0xffff; + case 4: + return (l | (l >> 16)) & 0xffff; + case 5: + return (l | (l >> 12)) & 0xffff; + default: + 
case 0: + return (h << 8) | l; + } +} + +inline int tex_sw(kernel_arg_t* state, int stage, int u, int v, int lod) { + int base_addr = state->src_ptr; + int mip_offset = 0; + int log_width = state->src_logWidth; + int log_height = state->src_logHeight; + int format = state->format; + int wrap = state->wrap; + int filter = state->filter; + + int32_t* pBits = ((uint32_t*)base_addr) + mip_offset; + + if (filter) { + int u0 = address(wrap, u - (0x80000 >> log_width)); + int v0 = address(wrap, v - (0x80000 >> log_height)); + int u1 = address(wrap, u + (0x80000 >> log_width)); + int v1 = address(wrap, v + (0x80000 >> log_height)); + + int x0 = u0 >> (20 - log_width); + int y0 = v0 >> (20 - log_height); + int x1 = u1 >> (20 - log_width); + int y1 = v1 >> (20 - log_height); + + // memory lookup + + int c0 = pBits[x0 + (y0 << log_width)]; + int c1 = pBits[x1 + (y0 << log_width)]; + int c2 = pBits[x0 + (y1 << log_width)]; + int c3 = pBits[x1 + (y1 << log_width)]; + + // filtering + + int alpha = x0 & 0xff; + int beta = y0 & 0xff; + + int c0a, c0b; + int c1a, c1b; + int c01a, c01b; + + unpack(format, c0, &c0a, &c0b); + unpack(format, c1, &c1a, &c1b); + lerp(c0a, c0b, c1a, c1b, alpha, &c01a, &c01b); + + int c2a, c2b; + int c3a, c3b; + int c23a, c23b; + + unpack(format, c2, &c2a, &c2b); + unpack(format, c3, &c3a, &c3b); + lerp(c2a, c2b, c3a, c3b, alpha, &c23a, &c23b); + + int c4a, c4b; + lerp(c01a, c01b, c23a, c23b, beta, &c4a, &c4b); + return pack(format, c4a, c4b); + } else { + int u0 = address(wrap, u); + int v0 = address(wrap, v); + + int x0 = u0 >> (20 - log_width); + int y0 = v0 >> (20 - log_height); + + int c0 = pBits[x0 + (y0 <> 8) & 0x00ff00ff; + int bl = b & 0x00ff00ff; + int bh = (b >> 8) & 0x00ff00ff; + int frac = (lod >> 12) & 0xff; + int cl = (al + (((bl - al) * frac) >> 8)) & 0x00ff00ff; + int ch = (ah + (((bh - ah) * frac) >> 8)) & 0x00ff00ff; + int c = al | (ah << 8); + return c; +} + +inline int tex3_sw(kernel_arg_t* state, int stage, int u, int v, int lod) { + 
int lodn = MIN(lod + 0x10000, TEX_LOD_MAX); + int a = tex_sw(state, 0, u, v, lod); + int b = tex_sw(state, 0, u, v, lodn); + int al = a & 0x00ff00ff; + int ah = (a >> 8) & 0x00ff00ff; + + int bl = b & 0x00ff00ff; + int bh = (b >> 8) & 0x00ff00ff; + int frac = (lod >> 12) & 0xff; + int cl = (al + (((bl - al) * frac) >> 8)) & 0x00ff00ff; + int ch = (ah + (((bh - ah) * frac) >> 8)) & 0x00ff00ff; + int c = al | (ah << 8); + return c; +} + +#endif \ No newline at end of file diff --git a/tests/regression/tex/tga.cpp b/tests/regression/tex/tga.cpp new file mode 100644 index 00000000..62641587 --- /dev/null +++ b/tests/regression/tex/tga.cpp @@ -0,0 +1,122 @@ +#include "tga.h" +#include +#include +#include "format.h" + +struct __attribute__((__packed__)) tga_header_t { + int8_t idlength; + int8_t colormaptype; + int8_t imagetype; + int16_t colormaporigin; + int16_t colormaplength; + int8_t colormapdepth; + int16_t xoffset; + int16_t yoffset; + int16_t width; + int16_t height; + int8_t bitsperpixel; + int8_t imagedescriptor; +}; + +int LoadTGA(const char *filename, + std::vector &pixels, + uint32_t *width, + uint32_t *height, + uint32_t *bpp) { + std::ifstream ifs(filename, std::ios::in | std::ios::binary); + if (!ifs.is_open()) { + std::cerr << "couldn't open file: " << filename << "!" << std::endl; + return -1; + } + + tga_header_t header; + ifs.read(reinterpret_cast(&header), sizeof(tga_header_t)); + if (ifs.fail()) { + std::cerr << "invalid TGA file header!" << std::endl; + return -1; + } + + if (header.imagetype != 2) { + std::cerr << "unsupported TGA encoding format!" << std::endl; + return -1; + } + + ifs.seekg(header.idlength, std::ios::cur); // skip string + if (ifs.fail()) { + std::cerr << "invalid TGA file!" 
<< std::endl; + return -1; + } + + switch (header.bitsperpixel) { + case 16: + case 24: + case 32: { + // Read pixels data + auto stride = header.bitsperpixel / 8; + pixels.resize(stride * header.width * header.height); + ifs.read((char*)pixels.data(), pixels.size()); + if (ifs.fail()) { + std::cerr << "invalid TGA file!" << std::endl; + return -1; + } + *bpp = stride; + break; + } + default: + std::cerr << "unsupported TGA bitsperpixel!" << std::endl; + return -1; + } + + *width = header.width; + *height = header.height; + + return 0; +} + +int SaveTGA(const char *filename, + const std::vector &pixels, + uint32_t width, + uint32_t height, + uint32_t bpp) { + std::ofstream ofs(filename, std::ios::out | std::ios::binary); + if (!ofs.is_open()) { + std::cerr << "couldn't create file: " << filename << "!" << std::endl; + return -1; + } + + if (bpp < 2 || bpp > 4) { + std::cerr << "unsupported pixel stride: " << bpp << "!" << std::endl; + return -1; + } + + tga_header_t header; + header.idlength = 0; + header.colormaptype = 0; // no palette + header.imagetype = 2; // color mapped data + header.colormaporigin = 0; + header.colormaplength = 0; + header.colormapdepth = 0; + header.xoffset = 0; + header.yoffset = 0; + header.width = width; + header.height = height; + header.bitsperpixel = bpp * 8; + header.imagedescriptor = 0; + + // write header + ofs.write(reinterpret_cast(&header), sizeof(tga_header_t)); + + // write pixel data + uint32_t pitch = bpp * width; + const uint8_t* pixel_bytes = pixels.data() + (height - 1) * pitch; + for (uint32_t y = 0; y < height; ++y) { + const uint8_t* pixel_row = pixel_bytes; + for (uint32_t x = 0; x < width; ++x) { + ofs.write((const char*)pixel_row, bpp); + pixel_row += bpp; + } + pixel_bytes -= pitch; + } + + return 0; +} \ No newline at end of file diff --git a/tests/regression/tex/tga.h b/tests/regression/tex/tga.h new file mode 100644 index 00000000..24b92a75 --- /dev/null +++ b/tests/regression/tex/tga.h @@ -0,0 +1,14 @@ 
+#include +#include + +int LoadTGA(const char *filename, + std::vector &pixels, + uint32_t *width, + uint32_t *height, + uint32_t *bpp); + +int SaveTGA(const char *filename, + const std::vector &pixels, + uint32_t width, + uint32_t height, + uint32_t bpp); \ No newline at end of file diff --git a/tests/regression/tex/toad.png b/tests/regression/tex/toad.png new file mode 100644 index 00000000..fe82cad7 Binary files /dev/null and b/tests/regression/tex/toad.png differ diff --git a/tests/regression/tex/utils.cpp b/tests/regression/tex/utils.cpp new file mode 100644 index 00000000..8a2ff760 --- /dev/null +++ b/tests/regression/tex/utils.cpp @@ -0,0 +1,194 @@ +#include "utils.h" +#include +#include +#include "blitter.h" +#include "format.h" +#include "tga.h" +#include "lupng.h" + +std::string getFileExt(const std::string& str) { + auto i = str.rfind('.'); + if (i != std::string::npos) { + return str.substr(i+1); + } + return(""); +} + +bool iequals(const std::string& a, const std::string& b) { + auto sz = a.size(); + if (b.size() != sz) + return false; + for (size_t i = 0; i < sz; ++i) { + if (tolower(a[i]) != tolower(b[i])) + return false; + } + return true; +} + +int LoadImage(const char *filename, + ePixelFormat format, + std::vector &pixels, + uint32_t *width, + uint32_t *height) { + uint32_t img_width; + uint32_t img_height; + uint32_t img_bpp; + + auto ext = getFileExt(filename); + if (iequals(ext, "tga")) { + int ret = LoadTGA(filename, pixels, &img_width, &img_height, &img_bpp); + if (ret) + return ret; + } else + if (iequals(ext, "png")) { + auto image = luPngReadFile(filename, NULL); + if (image == NULL) + return -1; + if (image->depth != 8 + || (image->channels != 3 + && image->channels != 4)) { + luImageRelease(image, NULL); + std::cerr << "invalid png file format!" 
<< std::endl; + return -1; + } + pixels.resize(image->channels * image->width * image->height); + memcpy(pixels.data(), image->data, pixels.size()); + img_width = image->width; + img_height = image->height; + img_bpp = image->channels; + luImageRelease(image, NULL); + } else { + std::cerr << "invalid file extension: " << ext << "!" << std::endl; + return -1; + } + + ePixelFormat img_format; + switch (img_bpp) { + case 1: + img_format = FORMAT_A8; + break; + case 2: + img_format = FORMAT_A1R5G5B5; + break; + case 3: + img_format = FORMAT_R8G8B8; + break; + case 4: + img_format = FORMAT_A8R8G8B8; + break; + default: + std::abort(); + } + + if (img_format != format) { + // format conversion to RGBA + std::vector staging; + int ret = ConvertImage(staging, pixels, img_width, img_height, img_format, format); + if (ret) + return ret; + pixels.swap(staging); + } + + *width = img_width; + *height = img_height; + + return 0; +} + +int SaveImage(const char *filename, + ePixelFormat format, + const std::vector &pixels, + uint32_t width, + uint32_t height) { + uint32_t bpp = Format::GetInfo(format).BytePerPixel; + auto ext = getFileExt(filename); + if (iequals(ext, "tga")) { + return SaveTGA(filename, pixels, width, height, bpp); + } else + if (iequals(ext, "png")) { + LuImage image; + image.width = width; + image.height = height; + image.depth = 8; + image.channels = bpp; + image.data = (uint8_t*)pixels.data(); + return luPngWriteFile(filename, &image); + } else { + std::cerr << "invalid file extension: " << ext << "!" 
<< std::endl; + return -1; + } + + return 0; +} + +void dump_image(const std::vector& pixels, uint32_t width, uint32_t height, uint32_t bpp) { + assert(width * height * bpp == pixels.size()); + const uint8_t* pixel_bytes = pixels.data(); + for (uint32_t y = 0; y < height; ++y) { + for (uint32_t x = 0; x < width; ++x) { + uint32_t pixel32 = 0; + for (uint32_t b = 0; b < bpp; ++b) { + uint32_t pixel8 = *pixel_bytes++; + pixel32 |= pixel8 << (b * 8); + } + if (x) std::cout << ", "; + std::cout << std::hex << pixel32; + } + std::cout << std::endl; + } +} + +int CopyBuffers(SurfaceDesc &dstDesc, + int32_t dstOffsetX, + int32_t dstOffsetY, + uint32_t copyWidth, + uint32_t copyHeight, + const SurfaceDesc &srcDesc, + int32_t srcOffsetX, + int32_t srcOffsetY) { + + static const BlitTable s_blitTable; + + if ((srcOffsetX >= (int32_t)srcDesc.Width) || (srcOffsetY >= (int32_t)srcDesc.Height) || + (dstOffsetX >= (int32_t)dstDesc.Width) || (dstOffsetY >= (int32_t)dstDesc.Height)) { + return -1; + } + + if (copyWidth > dstDesc.Width) { + copyWidth = dstDesc.Width; + } + + if (copyWidth > srcDesc.Width) { + copyWidth = srcDesc.Width; + } + + if (copyHeight > dstDesc.Height) { + copyHeight = dstDesc.Height; + } + + if (copyHeight > srcDesc.Height) { + copyHeight = srcDesc.Height; + } + + return s_blitTable.get(srcDesc.Format, dstDesc.Format)( + dstDesc, dstOffsetX, dstOffsetY, copyWidth, copyHeight, srcDesc, + srcOffsetX, srcOffsetY); +} + +int ConvertImage(std::vector& dst_pixels, + const std::vector& src_pixels, + uint32_t width, + uint32_t height, + ePixelFormat src_format, + ePixelFormat dst_format) { + + uint32_t src_pitch = Format::GetInfo(src_format).BytePerPixel * width; + uint32_t dst_pitch = Format::GetInfo(dst_format).BytePerPixel * width; + + dst_pixels.resize(dst_pitch * height); + + SurfaceDesc srcDesc{src_format, (uint8_t*)src_pixels.data(), width, height, src_pitch}; + SurfaceDesc dstDesc{dst_format, dst_pixels.data(), width, height, dst_pitch}; + + return 
CopyBuffers(dstDesc, 0, 0, width, height, srcDesc, 0, 0); +} \ No newline at end of file diff --git a/tests/regression/tex/utils.h b/tests/regression/tex/utils.h new file mode 100644 index 00000000..48b1ad55 --- /dev/null +++ b/tests/regression/tex/utils.h @@ -0,0 +1,43 @@ +#include +#include +#include +#include "surfacedesc.h" + +#define ISPOW2(x) (((x) != 0) && (0 == ((x) & ((x) - 1)))) + +inline uint32_t ilog2 (uint32_t value) { + return (uint32_t)(sizeof(uint32_t) * 8UL) - (uint32_t)__builtin_clzl((value << 1) - 1UL) - 1; +} + +int LoadImage(const char *filename, + ePixelFormat format, + std::vector &pixels, + uint32_t *width, + uint32_t *height); + +int SaveImage(const char *filename, + ePixelFormat format, + const std::vector &pixels, + uint32_t width, + uint32_t height); + +int CopyBuffers(SurfaceDesc &dstDesc, + int32_t dstOffsetX, + int32_t dstOffsetY, + uint32_t copyWidth, + uint32_t copyHeight, + const SurfaceDesc &srcDesc, + int32_t srcOffsetX, + int32_t srcOffsetY); + +int ConvertImage(std::vector& dst_pixels, + const std::vector& src_pixels, + uint32_t width, + uint32_t height, + ePixelFormat src_format, + ePixelFormat dst_format); + +void dump_image(const std::vector& pixels, + uint32_t width, + uint32_t height, + uint32_t bpp); diff --git a/tests/runtime/simple/tests.cpp b/tests/runtime/simple/tests.cpp index 912df0b9..fd0e219b 100644 --- a/tests/runtime/simple/tests.cpp +++ b/tests/runtime/simple/tests.cpp @@ -208,7 +208,7 @@ int test_spawn_tasks() { st_buffer_src[i] = 65 + i; } - vx_spawn_tasks(ST_BUF_SZ, st_kernel, &arg); + vx_spawn_tasks(ST_BUF_SZ, (vx_spawn_tasks_cb)st_kernel, &arg); return check_error(st_buffer_dst, 0, ST_BUF_SZ); } @@ -230,7 +230,7 @@ void sr_kernel(const sr_args_t * arg) { void __attribute__ ((noinline)) do_serial() { sr_args_t arg; arg.buf = sr_buffer; - vx_serial(sr_kernel, &arg); + vx_serial((vx_serial_cb)sr_kernel, &arg); } int test_serial() {