Merge branch 'master' into assignment5

This commit is contained in:
tinebp
2021-10-19 18:54:09 -04:00
committed by GitHub
147 changed files with 6736 additions and 546 deletions

View File

@@ -30,25 +30,28 @@ jobs:
include: include:
- stage: test - stage: test
name: coverage name: coverage
script: cp -r $PWD ../build1 && cd ../build1 && ./ci/travis_run.py ./ci/regression.sh -coverage script: cp -r $PWD ../build_coverage && cd ../build_coverage && ./ci/travis_run.py ./ci/regression.sh -coverage
- stage: test
name: tex
script: cp -r $PWD ../build_tex && cd ../build_tex && ./ci/travis_run.py ./ci/regression.sh -tex
- stage: test - stage: test
name: cluster name: cluster
script: cp -r $PWD ../build2 && cd ../build2 && ./ci/travis_run.py ./ci/regression.sh -cluster script: cp -r $PWD ../build_cluster && cd ../build_cluster && ./ci/travis_run.py ./ci/regression.sh -cluster
- stage: test - stage: test
name: debug name: debug
script: cp -r $PWD ../build3 && cd ../build3 && ./ci/travis_run.py ./ci/regression.sh -debug script: cp -r $PWD ../build_debug && cd ../build_debug && ./ci/travis_run.py ./ci/regression.sh -debug
- stage: test - stage: test
name: config name: config
script: cp -r $PWD ../build4 && cd ../build4 && ./ci/travis_run.py ./ci/regression.sh -config script: cp -r $PWD ../build_config && cd ../build_config && ./ci/travis_run.py ./ci/regression.sh -config
- stage: test - stage: test
name: stress0 name: stress0
script: cp -r $PWD ../build5 && cd ../build5 && ./ci/travis_run.py ./ci/regression.sh -stress0 script: cp -r $PWD ../build_stress0 && cd ../build_stress0 && ./ci/travis_run.py ./ci/regression.sh -stress0
- stage: test - stage: test
name: stress1 name: stress1
script: cp -r $PWD ../build6 && cd ../build6 && ./ci/travis_run.py ./ci/regression.sh -stress1 script: cp -r $PWD ../build_stress1 && cd ../build_stress1 && ./ci/travis_run.py ./ci/regression.sh -stress1
- stage: test - stage: test
name: compiler name: compiler
script: cp -r $PWD ../build7 && cd ../build7 && ./ci/travis_run.py ./ci/test_compiler.sh script: cp -r $PWD ../build_compiler && cd ../build_compiler && ./ci/travis_run.py ./ci/test_compiler.sh
after_success: after_success:
# Gather code coverage # Gather code coverage

View File

@@ -21,59 +21,32 @@ Vortex is a full-system RISCV-based GPGPU processor.
## Directory structure ## Directory structure
- `doc`: [Documentation](doc/Vortex.md). - `doc`: [Documentation](doc/Vortex.md).
- `hw`: Hardware sources. - `hw`: Hardware sources.
- `driver`: Host drivers repository. - `driver`: Host drivers repository.
- `runtime`: Kernel Runtime software. - `runtime`: Kernel Runtime software.
- `sim`: Simulators repository. - `sim`: Simulators repository.
- `tests`: Tests repository. - `tests`: Tests repository.
- `ci`: Continuous integration scripts. - `ci`: Continuous integration scripts.
- `miscs`: Miscellaneous resources. - `miscs`: Miscellaneous resources.
## Basic Installation ## Build Instructions
### Supported OS Platforms
- Ubuntu 18.04
- Centos 7
### Toolchain Dependencies
- [POCL](http://portablecl.org/)
- [LLVM](https://llvm.org/)
- [RISCV-GNU-TOOLCHAIN](https://github.com/riscv-collab/riscv-gnu-toolchain)
- [Verilator](https://www.veripool.org/verilator)
### Install development tools ### Install development tools
$ sudo apt-get install build-essential $ sudo apt-get install build-essential
$ sudo apt-get install git $ sudo apt-get install git
### Install Vortex codebase
### Install gnu-riscv-tools
$ export RISCV_TOOLCHAIN_PATH=/opt/riscv-gnu-toolchain
$ sudo apt-get -y install \
binutils build-essential libtool texinfo \
gzip zip unzip patchutils curl git \
make cmake ninja-build automake bison flex gperf \
grep sed gawk python bc \
zlib1g-dev libexpat1-dev libmpc-dev \
libglib2.0-dev libfdt-dev libpixman-1-dev
$ git clone https://github.com/riscv/riscv-gnu-toolchain
$ cd riscv-gnu-toolchain
$ git submodule update --init --recursive
$ mkdir build
$ cd build
$ ../configure --prefix=$RISCV_TOOLCHAIN_PATH --with-arch=rv32im --with-abi=ilp32
$ make -j`nproc`
$ make -j`nproc` build-qemu
### Install Verilator
You need into build the latest version using the instructions on their website
$ https://www.veripool.org/projects/verilator/wiki/Installing
### Install Vortex
$ git clone --recursive https://github.com/vortexgpgpu/vortex.git $ git clone --recursive https://github.com/vortexgpgpu/vortex.git
$ cd Vortex $ cd Vortex
$ make ### Install prebuilt toolchain
$ ./ci/toolchain_install.sh -all
### Quick Test running OpenCL vecadd sample on 2 cores ### Build Vortex sources
$ make -s
$ ./ci/blackbox.sh --cores=2 --app=vecadd ### Quick demo running vecadd OpenCL kernel on 2 cores
$ ./ci/blackbox.sh --driver=rtlsim --cores=2 --app=vecadd

View File

@@ -12,7 +12,7 @@ VORTEX_HOME=$SCRIPT_DIR/..
DRIVER=vlsim DRIVER=vlsim
APP=sgemm APP=sgemm
CLUSTERS=1 CLUSTERS=1
CORES=2 CORES=1
WARPS=4 WARPS=4
THREADS=4 THREADS=4
L2=0 L2=0
@@ -132,9 +132,9 @@ if [ $DEBUG -eq 1 ]
then then
if [ $SCOPE -eq 1 ] if [ $SCOPE -eq 1 ]
then then
DEBUG=$DEBUG_LEVEL SCOPE=1 CONFIGS="$CONFIGS" make -s -C $DRIVER_PATH DEBUG=$DEBUG_LEVEL SCOPE=1 CONFIGS="$CONFIGS" make -C $DRIVER_PATH
else else
DEBUG=$DEBUG_LEVEL CONFIGS="$CONFIGS" make -s -C $DRIVER_PATH DEBUG=$DEBUG_LEVEL CONFIGS="$CONFIGS" make -C $DRIVER_PATH
fi fi
if [ $HAS_ARGS -eq 1 ] if [ $HAS_ARGS -eq 1 ]
@@ -153,9 +153,9 @@ then
else else
if [ $SCOPE -eq 1 ] if [ $SCOPE -eq 1 ]
then then
SCOPE=1 CONFIGS="$CONFIGS" make -s -C $DRIVER_PATH SCOPE=1 CONFIGS="$CONFIGS" make -C $DRIVER_PATH
else else
CONFIGS="$CONFIGS" make -s -C $DRIVER_PATH CONFIGS="$CONFIGS" make -C $DRIVER_PATH
fi fi
if [ $HAS_ARGS -eq 1 ] if [ $HAS_ARGS -eq 1 ]

View File

@@ -22,6 +22,17 @@ make -C tests/opencl run-simx
echo "coverage tests done!" echo "coverage tests done!"
} }
# Run the texture regression tests.
# Each invocation sets CONFIGS="-DEXT_TEX_ENABLE=1" for ./ci/blackbox.sh and
# runs the `tex` app against one input image, once on the vlsim driver and
# twice on the rtlsim driver.
tex()
{
    echo "begin texture tests..."

    CONFIGS="-DEXT_TEX_ENABLE=1" ./ci/blackbox.sh --driver=vlsim --app=tex --args="-isoccer.png -osoccer_result.png -g0"
    CONFIGS="-DEXT_TEX_ENABLE=1" ./ci/blackbox.sh --driver=rtlsim --app=tex --args="-itoad.png -otoad_result.png -g1"
    CONFIGS="-DEXT_TEX_ENABLE=1" ./ci/blackbox.sh --driver=rtlsim --app=tex --args="-irainbow.png -orainbow_result.png -g1"

    # Closing message matches the "begin texture tests..." banner above
    # (it previously read "coverage texture done!").
    echo "texture tests done!"
}
cluster() cluster()
{ {
echo "begin clustering tests..." echo "begin clustering tests..."
@@ -134,13 +145,15 @@ echo "stress1 tests done!"
usage() usage()
{ {
echo "usage: regression [-coverage] [-cluster] [-debug] [-config] [-stress[#n]] [-all] [-h|--help]" echo "usage: regression [-coverage] [-tex] [-cluster] [-debug] [-config] [-stress[#n]] [-all] [-h|--help]"
} }
while [ "$1" != "" ]; do while [ "$1" != "" ]; do
case $1 in case $1 in
-coverage ) coverage -coverage ) coverage
;; ;;
-tex ) tex
;;
-cluster ) cluster -cluster ) cluster
;; ;;
-debug ) debug -debug ) debug
@@ -155,6 +168,7 @@ while [ "$1" != "" ]; do
stress1 stress1
;; ;;
-all ) coverage -all ) coverage
tex
cluster cluster
debug debug
config config

View File

Before

Width:  |  Height:  |  Size: 60 KiB

After

Width:  |  Height:  |  Size: 60 KiB

View File

Before

Width:  |  Height:  |  Size: 77 KiB

After

Width:  |  Height:  |  Size: 77 KiB

View File

Before

Width:  |  Height:  |  Size: 67 KiB

After

Width:  |  Height:  |  Size: 67 KiB

View File

Before

Width:  |  Height:  |  Size: 517 KiB

After

Width:  |  Height:  |  Size: 517 KiB

View File

@@ -8,7 +8,7 @@ The Vortex Cache Sub-system has the following main properties:
### Cache Hierarchy ### Cache Hierarchy
![Image of Cache Hierarchy](./images/cache_hierarchy.png) ![Image of Cache Hierarchy](./assets/img/cache_hierarchy.png)
- Cache can be configured to be any level in the hierarchy - Cache can be configured to be any level in the hierarchy
- Caches communicate via snooping - Caches communicate via snooping
@@ -18,7 +18,7 @@ The Vortex Cache Sub-system has the following main properties:
VX.cache.v is the top module of the cache verilog code located in the `/hw/rtl/cache` directory. VX.cache.v is the top module of the cache verilog code located in the `/hw/rtl/cache` directory.
![Image of Vortex Cache](./images/vortex_cache_top_module.png) ![Image of Vortex Cache](./assets/img/vortex_cache_top_module.png)
- Configurable (Cache size, number of banks, bank line size, etc.) - Configurable (Cache size, number of banks, bank line size, etc.)
- I/O signals - I/O signals
@@ -44,7 +44,7 @@ VX.cache.v is the top module of the cache verilog code located in the `/hw/rtl/c
VX_bank.v is the verilog code that handles cache bank functionality and is located in the `/hw/rtl/cache` directory. VX_bank.v is the verilog code that handles cache bank functionality and is located in the `/hw/rtl/cache` directory.
![Image of Vortex Cache Bank](./images/vortex_bank.png) ![Image of Vortex Cache Bank](./assets/img/vortex_bank.png)
- Allows for high throughput - Allows for high throughput
- Each bank contains queues to hold requests to the cache - Each bank contains queues to hold requests to the cache

View File

@@ -0,0 +1,128 @@
# Execute OpenCL on Vortex backend
## Requirements
- [Vortex](https://github.com/vortexgpgpu/vortex)
- [POCL for Vortex](https://github.com/vortexgpgpu/pocl)
- [riscv-toolchain](https://github.com/riscv-collab/riscv-gnu-toolchain)
- [llvm-riscv](https://github.com/llvm-mirror/llvm)
For installation, please see [Build Instructions](../README.md) for more details.
**For Ubuntu 18.04 users, you can directly download pre-built toolchains with the [toolchain_install.sh](https://github.com/vortexgpgpu/vortex/blob/master/ci/toolchain_install.sh) script.**
```bash
# please modify the DESTDIR variable in the script before execution
bash toolchain_install.sh -all
```
Assuming we have installed all dependencies in `/opt` path, we can get the following environment:
```bash
tree -L 2 /opt
'''
/opt/
├── llvm-riscv
│ ├── bin
│ ├── include
│ ├── lib
│ ├── libexec
│ └── share
├── pocl
│ ├── compiler
│ └── runtime
├── riscv-gnu-toolchain
│ ├── bin
│ ├── drops
│ ├── include
│ ├── lib
│ ├── libexec
│ ├── riscv32-unknown-elf
│ ├── share
│ └── var
└── verilator
├── bin
├── examples
├── include
├── verilator-config.cmake
└── verilator-config-version.cmake
'''
```
## Execute OpenCL on Vortex
In this tutorial, we show an example of executing the vecadd program on the SIMX backend.
To execute an OpenCL program on Vortex, we follow these steps:
- Compile the [OpenCL kernels](https://github.com/vortexgpgpu/vortex/blob/master/tests/opencl/vecadd/kernel.cl) into a RISC-V binary using the POCL compiler.
- Compile the [OpenCL host](https://github.com/vortexgpgpu/vortex/blob/master/tests/opencl/vecadd/main.cc) and link it with the Vortex driver (```-lvortex```).
- Execute the compiled host programs on a backend.
Thus, we can write a Makefile as follows:
```Makefile
# Example Makefile: builds the vecadd OpenCL host program and compiles its
# kernel with poclcc, then runs the host program against a chosen Vortex driver.

# Toolchain locations; every `?=` value can be overridden from the environment
# or the make command line.
LLVM_PREFIX ?= /opt/llvm-riscv
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
SYSROOT ?= $(RISCV_TOOLCHAIN_PATH)/riscv32-unknown-elf
POCL_CC_PATH ?= /opt/pocl/compiler
POCL_RT_PATH ?= /opt/pocl/runtime

# Arguments passed to the host program by the run-* targets.
OPTS ?= -n64

# Please edit these two variables to match your environment.
VORTEX_DRV_PATH ?= $(realpath ../../../driver)
VORTEX_RT_PATH ?= $(realpath ../../../runtime)

# Kernel-side flags. Each value is wrapped in double quotes on purpose so that
# it reaches poclcc as a single argument after -LLCFLAGS / -CFLAGS / -LDFLAGS.
K_LLCFLAGS += "-O3 -march=riscv32 -target-abi=ilp32f -mcpu=generic-rv32 -mattr=+m,+f -mattr=+vortex -float-abi=hard -code-model=small"
K_CFLAGS += "-v -O3 --sysroot=$(SYSROOT) --gcc-toolchain=$(RISCV_TOOLCHAIN_PATH) -march=rv32imf -mabi=ilp32f -Xclang -target-feature -Xclang +vortex -I$(VORTEX_RT_PATH)/include -fno-rtti -fno-exceptions -ffreestanding -nostartfiles -fdata-sections -ffunction-sections"
K_LDFLAGS += "-Wl,-Bstatic,-T$(VORTEX_RT_PATH)/linker/vx_link.ld -Wl,--gc-sections $(VORTEX_RT_PATH)/libvortexrt.a -lm"

# Host-side flags.
CXXFLAGS += -std=c++11 -O2 -Wall -Wextra -Wfatal-errors
CXXFLAGS += -Wno-deprecated-declarations -Wno-unused-parameter
CXXFLAGS += -I$(POCL_RT_PATH)/include

LDFLAGS += -L$(POCL_RT_PATH)/lib -L$(VORTEX_DRV_PATH)/stub -lOpenCL -lvortex

PROJECT = vecadd

SRCS = main.cc

# These targets are commands, not files; declaring them phony keeps them
# working even if a file with the same name appears in the directory.
.PHONY: all run-fpga run-asesim run-vlsim run-simx run-rtlsim clean clean-all

all: $(PROJECT) kernel.pocl

# Compile the OpenCL kernel with poclcc.
kernel.pocl: kernel.cl
	LLVM_PREFIX=$(LLVM_PREFIX) POCL_DEBUG=all LD_LIBRARY_PATH=$(LLVM_PREFIX)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -LLCFLAGS $(K_LLCFLAGS) -CFLAGS $(K_CFLAGS) -LDFLAGS $(K_LDFLAGS) -o kernel.pocl kernel.cl

# Build the host program.
$(PROJECT): $(SRCS)
	$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@

# run-<driver>: run the host program with the matching driver directory
# placed on LD_LIBRARY_PATH.
run-fpga: $(PROJECT) kernel.pocl
	LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/fpga:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)

run-asesim: $(PROJECT) kernel.pocl
	LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/asesim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)

run-vlsim: $(PROJECT) kernel.pocl
	LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/vlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)

run-simx: $(PROJECT) kernel.pocl
	LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)

run-rtlsim: $(PROJECT) kernel.pocl
	LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)

# Header dependencies of the host sources, generated by the compiler.
.depend: $(SRCS)
	$(CXX) $(CXXFLAGS) -MM $^ > $@

clean:
	rm -rf $(PROJECT) *.o .depend

clean-all: clean
	rm -rf *.pocl *.dump

# Skip the dependency file when cleaning, so that `make clean` and
# `make clean-all` do not regenerate .depend only to delete it again.
ifeq ($(filter clean clean-all,$(MAKECMDGOALS)),)
-include .depend
endif
```
First, build the host program.
```bash
make all
```
If we want to execute on SIMX, we can execute the command below.
```bash
make run-simx
```

View File

@@ -13,17 +13,6 @@ OPAE Environment Setup
$ export PATH=:/opt/verilator/bin:$PATH $ export PATH=:/opt/verilator/bin:$PATH
$ export VERILATOR_ROOT=/opt/verilator $ export VERILATOR_ROOT=/opt/verilator
OPAE Build Configuration
------------------------
Within the `/hw/syn/opae` directory, there are source text files for each core-option for the fpga build (the 32 and 64 core options are not currently implemented) which have the following parameters that can be configured:
- NUM_CORES: the number of cores per cluster
- NUM_CLUSTERS: the number of clusters alotted to the processor
- L3_ENABLE: enable the use of the L3 cache
- PERF_ENABLE: enable the use of all profile counters
To enable L3 cache and profile counters for a build, simply uncomment the definition within the respective source file.
OPAE Build OPAE Build
------------------ ------------------
@@ -33,41 +22,58 @@ The FPGA has to following configuration options:
- 4 cores fpga (fpga-4c) - 4 cores fpga (fpga-4c)
- 8 cores fpga (fpga-8c) - 8 cores fpga (fpga-8c)
- 16 cores fpga (fpga-16c) - 16 cores fpga (fpga-16c)
- 32 cores fpga (fpga-32c)
- 64 cores fpga (fpga-64c)
Command line:
$ cd hw/syn/opae $ cd hw/syn/opae
$ make fpga- *# of cores* c $ make fpga-<num-of-cores>c
Example: `make fpga-4c` Example: `make fpga-4c`
A new folder (ex: `build_fpga_4c`) will be created and the build will start and take ~30-45 min to complete. A new folder (ex: `build_fpga_4c`) will be created and the build will start and take ~30-480 min to complete.
OPAE Build Configuration
------------------------
The hardware configuration file `/hw/rtl/VX_config.vh` defines all the hardware parameters that can be modified when building the processor. For example, the following parameters can be configured:
- `NUM_WARPS`: Number of warps per core
- `NUM_THREADS`: Number of threads per warp
- `PERF_ENABLE`: enable the use of all profile counters
You can configure the synthesis build from the command line:
$ CONFIGS="-DPERF_ENABLE -DNUM_THREADS=8" make fpga-4c
OPAE Build Progress OPAE Build Progress
------------------- -------------------
You could check the last 10 lines in the build log for possible errors until build completion. You could check the last 10 lines in the build log for possible errors until build completion.
$ tail -n 10 ./build_fpga_4c/build.log $ tail -n 10 ./build_fpga_<num-of-cores>c/build.log
Check if the build is still running by looking for quartus_sh, quartus_syn, or quartus_fit programs. Check if the build is still running by looking for quartus_sh, quartus_syn, or quartus_fit programs.
$ ps -u *username* $ ps -u <username>
If the build fails and you need to restart it, clean up the build folder using the following command: If the build fails and you need to restart it, clean up the build folder using the following command:
$ make clean-fpga- *# of cores* c $ make clean-fpga-<num-of-cores>c
Example: `make clean-fpga-4c` Example: `make clean-fpga-4c`
The file `vortex_afu.gbs` should exist when the build is done: The file `vortex_afu.gbs` should exist when the build is done:
$ ls -lsa ./build_fpga_ *# of cores* c/vortex_afu.gbs $ ls -lsa ./build_fpga_<num-of-cores>c/vortex_afu.gbs
Signing the bitstream and Programming the FPGA Signing the bitstream and Programming the FPGA
---------------------------------------------- ----------------------------------------------
$ cd ./build_fpga_`# of cores`c/ $ cd ./build_fpga_<num-of-cores>c
$ PACSign PR -t UPDATE -H openssl_manager -i vortex_afu.gbs -o vortex_afu_unsigned_ssl.gbs $ PACSign PR -t UPDATE -H openssl_manager -i vortex_afu.gbs -o vortex_afu_unsigned_ssl.gbs
$ fpgasupdate vortex_afu_unsigned_ssl.gbs $ fpgasupdate vortex_afu_unsigned_ssl.gbs

View File

@@ -14,17 +14,17 @@
## Installation ## Installation
- Refer to the install instructions in [README](../README.md). - Refer to the build instructions in [README](../README.md).
## Quick Start Scenarios ## Quick Start Scenarios
Running Vortex simulators with different configurations: Running Vortex simulators with different configurations:
- Run basic driver test with rtlsim driver and Vortex config of 2 clusters, 2 cores, 2 warps, 4 threads - Run basic driver test with rtlsim driver and Vortex config of 2 clusters, 2 cores, 2 warps, 4 threads
$ ./ci/blackbox.sh --clusters=2 --cores=2 --warps=2 --threads=4 --driver=rtlsim --app=basic $ ./ci/blackbox.sh --driver=rtlsim --clusters=2 --cores=2 --warps=2 --threads=4 --app=basic
- Run demo driver test with vlsim driver and Vortex config of 1 clusters, 4 cores, 4 warps, 2 threads - Run demo driver test with vlsim driver and Vortex config of 1 clusters, 4 cores, 4 warps, 2 threads
$ ./ci/blackbox.sh --clusters=1 --cores=4 --warps=4 --threads=2 --driver=vlsim --app=demo $ ./ci/blackbox.sh --driver=vlsim --clusters=1 --cores=4 --warps=4 --threads=2 --app=demo
- Run dogfood driver test with simx driver and Vortex config of 4 cluster, 4 cores, 8 warps, 6 threads - Run dogfood driver test with simx driver and Vortex config of 4 cluster, 4 cores, 8 warps, 6 threads
$ ./ci/blackbox.sh --clusters=4 --cores=4 --warps=8 --threads=6 --driver=simx --app=dogfood $ ./ci/blackbox.sh --driver=simx --clusters=4 --cores=4 --warps=8 --threads=6 --app=dogfood

View File

@@ -32,7 +32,7 @@ Vortex uses the SIMT (Single Instruction, Multiple Threads) execution model with
### Vortex Pipeline/Datapath ### Vortex Pipeline/Datapath
![Image of Vortex Microarchitecture](./images/vortex_microarchitecture_v2.png) ![Image of Vortex Microarchitecture](./assets/img/vortex_microarchitecture_v2.png)
Vortex has a 5-stage pipeline: FI | ID | Issue | EX | WB. Vortex has a 5-stage pipeline: FI | ID | Issue | EX | WB.

View File

@@ -63,12 +63,5 @@ scope: scope-defs.h
$(PROJECT): $(SRCS) $(OPAE_SYN_DIR)/vortex_afu.h $(SCOPE_H) $(PROJECT): $(SRCS) $(OPAE_SYN_DIR)/vortex_afu.h $(SCOPE_H)
$(CXX) $(CXXFLAGS) -DUSE_ASE $(SRCS) $(LDFLAGS) -o $(PROJECT) $(CXX) $(CXXFLAGS) -DUSE_ASE $(SRCS) $(LDFLAGS) -o $(PROJECT)
.depend: $(SRCS)
$(CXX) $(CXXFLAGS) -MM $(SRCS) > .depend;
clean: clean:
rm -rf $(PROJECT) *.o .depend scope-defs.h rm -rf $(PROJECT) *.o scope-defs.h
ifneq ($(MAKECMDGOALS),clean)
-include .depend
endif

View File

@@ -65,12 +65,5 @@ scope: scope-defs.h
$(PROJECT): $(SRCS) $(OPAE_SYN_DIR)/vortex_afu.h $(SCOPE_H) $(PROJECT): $(SRCS) $(OPAE_SYN_DIR)/vortex_afu.h $(SCOPE_H)
$(CXX) $(CXXFLAGS) -DUSE_FPGA $^ $(LDFLAGS) -o $(PROJECT) $(CXX) $(CXXFLAGS) -DUSE_FPGA $^ $(LDFLAGS) -o $(PROJECT)
.depend: $(SRCS)
$(CXX) $(CXXFLAGS) -MM $(SRCS) > .depend;
clean: clean:
rm -rf $(PROJECT) *.o .depend scope-defs.h rm -rf $(PROJECT) *.o scope-defs.h
ifneq ($(MAKECMDGOALS),clean)
-include .depend
endif

View File

@@ -35,4 +35,4 @@ $(PROJECT): $(SRCS)
clean: clean:
$(MAKE) -C $(RTLSIM_DIR) clean-static $(MAKE) -C $(RTLSIM_DIR) clean-static
rm -rf $(PROJECT) *.o .depend rm -rf $(PROJECT) *.o

View File

@@ -21,9 +21,6 @@ $(PROJECT): $(SRCS)
$(MAKE) -C $(SIMX_DIR) static $(MAKE) -C $(SIMX_DIR) static
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@ $(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
.depend: $(SRCS)
$(CXX) $(CXXFLAGS) -MM $^ > .depend;
clean: clean:
$(MAKE) -C $(SIMX_DIR) clean-static $(MAKE) -C $(SIMX_DIR) clean-static
rm -rf $(PROJECT) *.o .depend rm -rf $(PROJECT) *.o

View File

@@ -50,13 +50,6 @@ $(PROJECT): $(SRCS) $(SCOPE_H)
$(SCOPE_ENABLE) $(PERF_ENABLE) $(MAKE) -C $(VLSIM_DIR) static $(SCOPE_ENABLE) $(PERF_ENABLE) $(MAKE) -C $(VLSIM_DIR) static
$(CXX) $(CXXFLAGS) -DUSE_VLSIM $(SRCS) $(LDFLAGS) -o $(PROJECT) $(CXX) $(CXXFLAGS) -DUSE_VLSIM $(SRCS) $(LDFLAGS) -o $(PROJECT)
.depend: $(SRCS)
$(CXX) $(CXXFLAGS) -MM $(SRCS) > .depend;
clean: clean:
$(MAKE) -C $(VLSIM_DIR) clean-static $(MAKE) -C $(VLSIM_DIR) clean-static
rm -rf $(PROJECT) *.o .depend scope-defs.h rm -rf $(PROJECT) *.o scope-defs.h
ifneq ($(MAKECMDGOALS),clean)
-include .depend
endif

View File

@@ -217,7 +217,7 @@ module VX_alu_unit #(
// can accept new request? // can accept new request?
assign alu_req_if.ready = ready_in; assign alu_req_if.ready = ready_in;
`ifdef DBG_PRINT_PIPELINE `ifdef DBG_TRACE_PIPELINE
always @(posedge clk) begin always @(posedge clk) begin
if (branch_ctl_if.valid) begin if (branch_ctl_if.valid) begin
dpi_trace("%d: core%0d-branch: wid=%0d, PC=%0h, taken=%b, dest=%0h\n", dpi_trace("%d: core%0d-branch: wid=%0d, PC=%0h, taken=%b, dest=%0h\n",

159
hw/rtl/VX_cache_arb.sv Normal file
View File

@@ -0,0 +1,159 @@
`include "VX_define.vh"
// VX_cache_arb: merges NUM_REQS multi-lane cache request ports into a single
// request port and routes the single response port back to the requester that
// issued it.
//
// When NUM_REQS > 1, the index of the originating port is embedded into each
// request tag (LOG_NUM_REQS extra bits at position TAG_SEL_IDX) before the
// requests go through a VX_stream_arbiter. On the response side those bits are
// read back from the tag to steer a VX_stream_demux, and are removed from the
// tag returned to the requester.
// When NUM_REQS == 1, the module is a pure pass-through.
//
// Parameters:
//   NUM_REQS      - number of input request ports
//   LANES         - number of lanes per request port
//   DATA_SIZE     - data size per lane (DATA_WIDTH is 8 * DATA_SIZE bits;
//                   byteen carries DATA_SIZE bits per lane)
//   TAG_IN_WIDTH  - tag width on the requester side
//   TAG_SEL_IDX   - bit position in the outgoing tag where the port index is placed
//   BUFFERED_REQ  - forwarded as BUFFERED to the request arbiter
//   BUFFERED_RSP  - forwarded as BUFFERED to the response demux
//   TYPE          - forwarded as TYPE to VX_stream_arbiter
//                   (NOTE(review): "R" presumably selects round-robin - confirm
//                   against VX_stream_arbiter)
module VX_cache_arb #(
parameter NUM_REQS = 1,
parameter LANES = 1,
parameter DATA_SIZE = 1,
parameter TAG_IN_WIDTH = 1,
parameter TAG_SEL_IDX = 0,
parameter BUFFERED_REQ = 0,
parameter BUFFERED_RSP = 0,
parameter TYPE = "R",
// Derived widths: the address drops the CLOG2(DATA_SIZE) low bits of a 32-bit
// address, and the outgoing tag is widened by the port-index bits.
localparam ADDR_WIDTH = (32-`CLOG2(DATA_SIZE)),
localparam DATA_WIDTH = (8 * DATA_SIZE),
localparam LOG_NUM_REQS = `CLOG2(NUM_REQS),
localparam TAG_OUT_WIDTH = TAG_IN_WIDTH + LOG_NUM_REQS
) (
input wire clk,
input wire reset,
// input requests
input wire [NUM_REQS-1:0][LANES-1:0] req_valid_in,
input wire [NUM_REQS-1:0][LANES-1:0] req_rw_in,
input wire [NUM_REQS-1:0][LANES-1:0][DATA_SIZE-1:0] req_byteen_in,
input wire [NUM_REQS-1:0][LANES-1:0][ADDR_WIDTH-1:0] req_addr_in,
input wire [NUM_REQS-1:0][LANES-1:0][DATA_WIDTH-1:0] req_data_in,
input wire [NUM_REQS-1:0][LANES-1:0][TAG_IN_WIDTH-1:0] req_tag_in,
output wire [NUM_REQS-1:0][LANES-1:0] req_ready_in,
// output request
output wire [LANES-1:0] req_valid_out,
output wire [LANES-1:0] req_rw_out,
output wire [LANES-1:0][DATA_SIZE-1:0] req_byteen_out,
output wire [LANES-1:0][ADDR_WIDTH-1:0] req_addr_out,
output wire [LANES-1:0][DATA_WIDTH-1:0] req_data_out,
output wire [LANES-1:0][TAG_OUT_WIDTH-1:0] req_tag_out,
input wire [LANES-1:0] req_ready_out,
// input response
input wire rsp_valid_in,
input wire [LANES-1:0] rsp_tmask_in,
input wire [LANES-1:0][DATA_WIDTH-1:0] rsp_data_in,
input wire [TAG_OUT_WIDTH-1:0] rsp_tag_in,
output wire rsp_ready_in,
// output responses
output wire [NUM_REQS-1:0] rsp_valid_out,
output wire [NUM_REQS-1:0][LANES-1:0] rsp_tmask_out,
output wire [NUM_REQS-1:0][LANES-1:0][DATA_WIDTH-1:0] rsp_data_out,
output wire [NUM_REQS-1:0][TAG_IN_WIDTH-1:0] rsp_tag_out,
input wire [NUM_REQS-1:0] rsp_ready_out
);
// Per-lane request payload: {tag, addr, rw, byteen, data}
localparam REQ_DATAW = TAG_OUT_WIDTH + ADDR_WIDTH + 1 + DATA_SIZE + DATA_WIDTH;
// Whole-response payload: {tmask (1 bit per lane), tag (shared), data (per lane)}
localparam RSP_DATAW = LANES * (1 + DATA_WIDTH) + TAG_IN_WIDTH;
if (NUM_REQS > 1) begin
wire [NUM_REQS-1:0][LANES-1:0][REQ_DATAW-1:0] req_data_in_merged;
wire [LANES-1:0][REQ_DATAW-1:0] req_data_out_merged;
// Pack every lane of every input port into a single vector, with the port
// index i merged into the tag.
for (genvar i = 0; i < NUM_REQS; i++) begin
for (genvar j = 0; j < LANES; ++j) begin
wire [TAG_OUT_WIDTH-1:0] req_tag_in_w;
// NOTE(review): VX_bits_insert presumably inserts the S-bit sel_in value at
// bit position POS of data_in - confirm against its definition.
VX_bits_insert #(
.N (TAG_IN_WIDTH),
.S (LOG_NUM_REQS),
.POS (TAG_SEL_IDX)
) bits_insert (
.data_in (req_tag_in[i][j]),
.sel_in (LOG_NUM_REQS'(i)),
.data_out (req_tag_in_w)
);
assign req_data_in_merged[i][j] = {req_tag_in_w, req_addr_in[i][j], req_rw_in[i][j], req_byteen_in[i][j], req_data_in[i][j]};
end
end
// Arbitrate between the input ports on the packed request payloads.
VX_stream_arbiter #(
.NUM_REQS (NUM_REQS),
.LANES (LANES),
.DATAW (REQ_DATAW),
.BUFFERED (BUFFERED_REQ),
.TYPE (TYPE)
) req_arb (
.clk (clk),
.reset (reset),
.valid_in (req_valid_in),
.data_in (req_data_in_merged),
.ready_in (req_ready_in),
.valid_out (req_valid_out),
.data_out (req_data_out_merged),
.ready_out (req_ready_out)
);
// Unpack the arbiter output into the individual request fields, in the same
// order they were packed above.
for (genvar i = 0; i < LANES; ++i) begin
assign {req_tag_out[i], req_addr_out[i], req_rw_out[i], req_byteen_out[i], req_data_out[i]} = req_data_out_merged[i];
end
///////////////////////////////////////////////////////////////////////
wire [NUM_REQS-1:0][RSP_DATAW-1:0] rsp_data_out_merged;
// Destination port index, read from the same tag bits used on the request side.
wire [LOG_NUM_REQS-1:0] rsp_sel = rsp_tag_in[TAG_SEL_IDX +: LOG_NUM_REQS];
wire [TAG_IN_WIDTH-1:0] rsp_tag_in_w;
// Produce the requester-side tag (TAG_IN_WIDTH bits) from the wider response tag.
VX_bits_remove #(
.N (TAG_OUT_WIDTH),
.S (LOG_NUM_REQS),
.POS (TAG_SEL_IDX)
) bits_remove (
.data_in (rsp_tag_in),
.data_out (rsp_tag_in_w)
);
// Route the response to the port selected by rsp_sel. The response is handled
// as one unit (LANES = 1) with all lanes packed into the payload.
VX_stream_demux #(
.NUM_REQS (NUM_REQS),
.LANES (1),
.DATAW (RSP_DATAW),
.BUFFERED (BUFFERED_RSP)
) rsp_demux (
.clk (clk),
.reset (reset),
.sel_in (rsp_sel),
.valid_in (rsp_valid_in),
.data_in ({rsp_tmask_in, rsp_tag_in_w, rsp_data_in}),
.ready_in (rsp_ready_in),
.valid_out (rsp_valid_out),
.data_out (rsp_data_out_merged),
.ready_out (rsp_ready_out)
);
// Unpack each port's response payload, in the same order it was packed above.
for (genvar i = 0; i < NUM_REQS; i++) begin
assign {rsp_tmask_out[i], rsp_tag_out[i], rsp_data_out[i]} = rsp_data_out_merged[i];
end
end else begin
// Single requester: no arbitration needed, so every signal is wired straight
// through and the clock/reset inputs are unused.
`UNUSED_VAR (clk)
`UNUSED_VAR (reset)
assign req_valid_out = req_valid_in;
assign req_tag_out = req_tag_in;
assign req_addr_out = req_addr_in;
assign req_rw_out = req_rw_in;
assign req_byteen_out = req_byteen_in;
assign req_data_out = req_data_in;
assign req_ready_in = req_ready_out;
assign rsp_valid_out = rsp_valid_in;
assign rsp_tmask_out = rsp_tmask_in;
assign rsp_tag_out = rsp_tag_in;
assign rsp_data_out = rsp_data_in;
assign rsp_ready_in = rsp_ready_out;
end
endmodule

View File

@@ -78,14 +78,14 @@ module VX_commit #(
`ifdef EXT_F_ENABLE `ifdef EXT_F_ENABLE
.fpu_commit_if (fpu_commit_if), .fpu_commit_if (fpu_commit_if),
`endif `endif
.gpu_commit_if (gpu_commit_if),
.writeback_if (writeback_if) .writeback_if (writeback_if)
); );
// store and gpu commits don't writeback // store and gpu commits don't writeback
assign st_commit_if.ready = 1'b1; assign st_commit_if.ready = 1'b1;
assign gpu_commit_if.ready = 1'b1;
`ifdef DBG_PRINT_PIPELINE `ifdef DBG_TRACE_PIPELINE
always @(posedge clk) begin always @(posedge clk) begin
if (alu_commit_if.valid && alu_commit_if.ready) begin if (alu_commit_if.valid && alu_commit_if.ready) begin
dpi_trace("%d: core%0d-commit: wid=%0d, PC=%0h, ex=ALU, tmask=%b, wb=%0d, rd=%0d, data=", $time, CORE_ID, alu_commit_if.wid, alu_commit_if.PC, alu_commit_if.tmask, alu_commit_if.wb, alu_commit_if.rd); dpi_trace("%d: core%0d-commit: wid=%0d, PC=%0h, ex=ALU, tmask=%b, wb=%0d, rd=%0d, data=", $time, CORE_ID, alu_commit_if.wid, alu_commit_if.PC, alu_commit_if.tmask, alu_commit_if.wb, alu_commit_if.rd);

View File

@@ -230,6 +230,21 @@
`define CSR_NW 12'hFC1 `define CSR_NW 12'hFC1
`define CSR_NC 12'hFC2 `define CSR_NC 12'hFC2
////////// Texture Units //////////////////////////////////////////////////////
// Number of texture units
`define NUM_TEX_UNITS 2
// Number of CSR addresses per texture unit (matches the seven offsets 'h00..'h06 below)
`define CSR_TEX_STATES 7
// First CSR address of texture unit x; units are laid out back to back starting at 12'hFD0
`define CSR_TEX_BEGIN(x) (12'hFD0 + (x) * `CSR_TEX_STATES)
// Per-unit CSR addresses, each a fixed offset from CSR_TEX_BEGIN(x)
`define CSR_TEX_ADDR(x) (`CSR_TEX_BEGIN(x) + 12'h00)
`define CSR_TEX_FORMAT(x) (`CSR_TEX_BEGIN(x) + 12'h01)
`define CSR_TEX_WRAP(x) (`CSR_TEX_BEGIN(x) + 12'h02)
`define CSR_TEX_FILTER(x) (`CSR_TEX_BEGIN(x) + 12'h03)
`define CSR_TEX_MIPOFF(x) (`CSR_TEX_BEGIN(x) + 12'h04)
`define CSR_TEX_WIDTH(x) (`CSR_TEX_BEGIN(x) + 12'h05)
`define CSR_TEX_HEIGHT(x) (`CSR_TEX_BEGIN(x) + 12'h06)
// Pipeline Queues //////////////////////////////////////////////////////////// // Pipeline Queues ////////////////////////////////////////////////////////////
// Size of Instruction Buffer // Size of Instruction Buffer

View File

@@ -17,6 +17,9 @@ module VX_csr_data #(
`ifdef EXT_F_ENABLE `ifdef EXT_F_ENABLE
VX_fpu_to_csr_if.slave fpu_to_csr_if, VX_fpu_to_csr_if.slave fpu_to_csr_if,
`endif `endif
`ifdef EXT_TEX_ENABLE
VX_tex_csr_if.master tex_csr_if,
`endif
input wire read_enable, input wire read_enable,
input wire[`CSR_ADDR_BITS-1:0] read_addr, input wire[`CSR_ADDR_BITS-1:0] read_addr,
@@ -26,7 +29,7 @@ module VX_csr_data #(
input wire write_enable, input wire write_enable,
input wire[`CSR_ADDR_BITS-1:0] write_addr, input wire[`CSR_ADDR_BITS-1:0] write_addr,
input wire[`NW_BITS-1:0] write_wid, input wire[`NW_BITS-1:0] write_wid,
input wire[`CSR_WIDTH-1:0] write_data, input wire[31:0] write_data,
input wire busy input wire busy
); );
@@ -46,13 +49,13 @@ module VX_csr_data #(
reg [`NUM_WARPS-1:0][`INST_FRM_BITS+`FFLAGS_BITS-1:0] fcsr; reg [`NUM_WARPS-1:0][`INST_FRM_BITS+`FFLAGS_BITS-1:0] fcsr;
always @(posedge clk) begin always @(posedge clk) begin
`ifdef EXT_F_ENABLE `ifdef EXT_F_ENABLE
if (reset) begin if (reset) begin
fcsr <= '0; fcsr <= '0;
end end
if (fpu_to_csr_if.write_enable) begin if (fpu_to_csr_if.write_enable) begin
fcsr[fpu_to_csr_if.write_wid][`FFLAGS_BITS-1:0] <= fcsr[fpu_to_csr_if.write_wid][`FFLAGS_BITS-1:0] fcsr[fpu_to_csr_if.write_wid][`FFLAGS_BITS-1:0] <= fcsr[fpu_to_csr_if.write_wid][`FFLAGS_BITS-1:0]
| fpu_to_csr_if.write_fflags; | fpu_to_csr_if.write_fflags;
end end
`endif `endif
@@ -61,27 +64,33 @@ module VX_csr_data #(
`CSR_FFLAGS: fcsr[write_wid][`FFLAGS_BITS-1:0] <= write_data[`FFLAGS_BITS-1:0]; `CSR_FFLAGS: fcsr[write_wid][`FFLAGS_BITS-1:0] <= write_data[`FFLAGS_BITS-1:0];
`CSR_FRM: fcsr[write_wid][`INST_FRM_BITS+`FFLAGS_BITS-1:`FFLAGS_BITS] <= write_data[`INST_FRM_BITS-1:0]; `CSR_FRM: fcsr[write_wid][`INST_FRM_BITS+`FFLAGS_BITS-1:`FFLAGS_BITS] <= write_data[`INST_FRM_BITS-1:0];
`CSR_FCSR: fcsr[write_wid] <= write_data[`FFLAGS_BITS+`INST_FRM_BITS-1:0]; `CSR_FCSR: fcsr[write_wid] <= write_data[`FFLAGS_BITS+`INST_FRM_BITS-1:0];
`CSR_SATP: csr_satp <= write_data[`CSR_WIDTH-1:0];
`CSR_SATP: csr_satp <= write_data; `CSR_MSTATUS: csr_mstatus <= write_data[`CSR_WIDTH-1:0];
`CSR_MEDELEG: csr_medeleg <= write_data[`CSR_WIDTH-1:0];
`CSR_MSTATUS: csr_mstatus <= write_data; `CSR_MIDELEG: csr_mideleg <= write_data[`CSR_WIDTH-1:0];
`CSR_MEDELEG: csr_medeleg <= write_data; `CSR_MIE: csr_mie <= write_data[`CSR_WIDTH-1:0];
`CSR_MIDELEG: csr_mideleg <= write_data; `CSR_MTVEC: csr_mtvec <= write_data[`CSR_WIDTH-1:0];
`CSR_MIE: csr_mie <= write_data; `CSR_MEPC: csr_mepc <= write_data[`CSR_WIDTH-1:0];
`CSR_MTVEC: csr_mtvec <= write_data; `CSR_PMPCFG0: csr_pmpcfg[0] <= write_data[`CSR_WIDTH-1:0];
`CSR_PMPADDR0: csr_pmpaddr[0] <= write_data[`CSR_WIDTH-1:0];
`CSR_MEPC: csr_mepc <= write_data; default: begin
`ASSERT(write_addr >= `CSR_TEX_BEGIN(0)
`CSR_PMPCFG0: csr_pmpcfg[0] <= write_data; && write_addr < `CSR_TEX_BEGIN(`CSR_TEX_STATES),
`CSR_PMPADDR0: csr_pmpaddr[0] <= write_data; ("%t: invalid CSR write address: %0h", $time, write_addr));
default: begin
`ASSERT(~write_enable, ("%t: invalid CSR write address: %0h", $time, write_addr));
end end
endcase endcase
end end
end end
`UNUSED_VAR (write_data)
// TEX CSRs
`ifdef EXT_TEX_ENABLE
assign tex_csr_if.write_enable = write_enable;
assign tex_csr_if.write_addr = write_addr;
assign tex_csr_if.write_data = write_data;
`endif
always @(posedge clk) begin always @(posedge clk) begin
if (reset) begin if (reset) begin
csr_cycle <= 0; csr_cycle <= 0;
@@ -209,7 +218,8 @@ module VX_csr_data #(
default: begin default: begin
if (!((read_addr >= `CSR_MPM_BASE && read_addr < (`CSR_MPM_BASE + 32)) if (!((read_addr >= `CSR_MPM_BASE && read_addr < (`CSR_MPM_BASE + 32))
| (read_addr >= `CSR_MPM_BASE_H && read_addr < (`CSR_MPM_BASE_H + 32)))) begin || (read_addr >= `CSR_MPM_BASE_H && read_addr < (`CSR_MPM_BASE_H + 32)
|| (read_addr >= `CSR_TEX_BEGIN(0) && read_addr < `CSR_TEX_BEGIN(`CSR_TEX_STATES))))) begin
read_addr_valid_r = 0; read_addr_valid_r = 0;
end end
end end

View File

@@ -20,6 +20,9 @@ module VX_csr_unit #(
VX_fpu_to_csr_if.slave fpu_to_csr_if, VX_fpu_to_csr_if.slave fpu_to_csr_if,
input wire[`NUM_WARPS-1:0] fpu_pending, input wire[`NUM_WARPS-1:0] fpu_pending,
`endif `endif
`ifdef EXT_TEX_ENABLE
VX_tex_csr_if.master tex_csr_if,
`endif
output wire[`NUM_WARPS-1:0] pending, output wire[`NUM_WARPS-1:0] pending,
input wire busy input wire busy
@@ -46,6 +49,9 @@ module VX_csr_unit #(
.fetch_to_csr_if(fetch_to_csr_if), .fetch_to_csr_if(fetch_to_csr_if),
`ifdef EXT_F_ENABLE `ifdef EXT_F_ENABLE
.fpu_to_csr_if (fpu_to_csr_if), .fpu_to_csr_if (fpu_to_csr_if),
`endif
`ifdef EXT_TEX_ENABLE
.tex_csr_if (tex_csr_if),
`endif `endif
.read_enable (csr_req_if.valid), .read_enable (csr_req_if.valid),
.read_addr (csr_req_if.addr), .read_addr (csr_req_if.addr),
@@ -54,7 +60,7 @@ module VX_csr_unit #(
.write_enable (write_enable), .write_enable (write_enable),
.write_addr (csr_addr_s1), .write_addr (csr_addr_s1),
.write_wid (csr_commit_if.wid), .write_wid (csr_commit_if.wid),
.write_data (csr_updated_data_s1[`CSR_WIDTH-1:0]), .write_data (csr_updated_data_s1),
.busy (busy) .busy (busy)
); );

View File

@@ -1,6 +1,6 @@
`include "VX_define.vh" `include "VX_define.vh"
`ifdef DBG_PRINT_PIPELINE `ifdef DBG_TRACE_PIPELINE
`include "VX_print_instr.vh" `include "VX_trace_instr.vh"
`endif `endif
`ifdef EXT_F_ENABLE `ifdef EXT_F_ENABLE
@@ -42,6 +42,7 @@ module VX_decode #(
wire [31:0] instr = ifetch_rsp_if.data; wire [31:0] instr = ifetch_rsp_if.data;
wire [6:0] opcode = instr[6:0]; wire [6:0] opcode = instr[6:0];
wire [1:0] func2 = instr[26:25];
wire [2:0] func3 = instr[14:12]; wire [2:0] func3 = instr[14:12];
wire [6:0] func7 = instr[31:25]; wire [6:0] func7 = instr[31:25];
wire [11:0] u_12 = instr[31:20]; wire [11:0] u_12 = instr[31:20];
@@ -193,7 +194,6 @@ module VX_decode #(
end end
`INST_F: begin `INST_F: begin
ex_type = `EX_LSU; ex_type = `EX_LSU;
op_type = `INST_OP_BITS'(func3[0]);
op_mod = `INST_MOD_BITS'(1); op_mod = `INST_MOD_BITS'(1);
end end
`INST_SYS : begin `INST_SYS : begin
@@ -375,11 +375,21 @@ module VX_decode #(
`USED_IREG (rs1); `USED_IREG (rs1);
`USED_IREG (rs2); `USED_IREG (rs2);
end end
`ifdef EXT_TEX_ENABLE
3'h5: begin 3'h5: begin
op_type = `INST_OP_BITS'(`INST_GPU_TEX);
op_mod = `INST_MOD_BITS'(func2);
use_rd = 1;
`USED_IREG (rd);
`USED_IREG (rs1);
`USED_IREG (rs2);
`USED_IREG (rs3);
end
`endif
3'h6: begin
ex_type = `EX_LSU; ex_type = `EX_LSU;
op_type = `INST_OP_BITS'(`INST_GPU_PRED); op_type = `INST_OP_BITS'(`INST_LSU_LW);
imm = {{20{u_12[11]}}, u_12}; op_mod = `INST_MOD_BITS'(2);
use_rd = 0;
`USED_IREG (rs1); `USED_IREG (rs1);
end end
default:; default:;
@@ -389,6 +399,8 @@ module VX_decode #(
endcase endcase
end end
`UNUSED_VAR (func2)
// disable write to integer register r0 // disable write to integer register r0
wire wb = use_rd && (| rd_r); wire wb = use_rd && (| rd_r);
@@ -421,13 +433,13 @@ module VX_decode #(
assign ifetch_rsp_if.ready = decode_if.ready; assign ifetch_rsp_if.ready = decode_if.ready;
`ifdef DBG_PRINT_PIPELINE `ifdef DBG_TRACE_PIPELINE
always @(posedge clk) begin always @(posedge clk) begin
if (decode_if.valid && decode_if.ready) begin if (decode_if.valid && decode_if.ready) begin
dpi_trace("%d: core%0d-decode: wid=%0d, PC=%0h, ex=", $time, CORE_ID, decode_if.wid, decode_if.PC); dpi_trace("%d: core%0d-decode: wid=%0d, PC=%0h, ex=", $time, CORE_ID, decode_if.wid, decode_if.PC);
print_ex_type(decode_if.ex_type); trace_ex_type(decode_if.ex_type);
dpi_trace(", op="); dpi_trace(", op=");
print_ex_op(decode_if.ex_type, decode_if.op_type, decode_if.op_mod); trace_ex_op(decode_if.ex_type, decode_if.op_type, decode_if.op_mod);
dpi_trace(", mod=%0d, tmask=%b, wb=%b, rd=%0d, rs1=%0d, rs2=%0d, rs3=%0d, imm=%0h, use_pc=%b, use_imm=%b\n", decode_if.op_mod, decode_if.tmask, decode_if.wb, decode_if.rd, decode_if.rs1, decode_if.rs2, decode_if.rs3, decode_if.imm, decode_if.use_PC, decode_if.use_imm); dpi_trace(", mod=%0d, tmask=%b, wb=%b, rd=%0d, rs1=%0d, rs2=%0d, rs3=%0d, imm=%0h, use_pc=%b, use_imm=%b\n", decode_if.op_mod, decode_if.tmask, decode_if.wb, decode_if.rd, decode_if.rs1, decode_if.rs2, decode_if.rs3, decode_if.imm, decode_if.use_PC, decode_if.use_imm);
end end
end end

View File

@@ -18,6 +18,8 @@
`define NRI_BITS `LOG2UP(`NUM_IREGS) `define NRI_BITS `LOG2UP(`NUM_IREGS)
`define NTEX_BITS `LOG2UP(`NUM_TEX_UNITS)
`ifdef EXT_F_ENABLE `ifdef EXT_F_ENABLE
`define NUM_REGS (2 * `NUM_IREGS) `define NUM_REGS (2 * `NUM_IREGS)
`else `else
@@ -66,6 +68,8 @@
`define INST_GPU 7'b1101011 `define INST_GPU 7'b1101011
`define INST_TEX 7'b0101011
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
`define INST_FRM_RNE 3'b000 // round to nearest even `define INST_FRM_RNE 3'b000 // round to nearest even
@@ -150,8 +154,8 @@
`define INST_LSU_BITS 4 `define INST_LSU_BITS 4
`define INST_LSU_FMT(x) x[2:0] `define INST_LSU_FMT(x) x[2:0]
`define INST_LSU_WSIZE(x) x[1:0] `define INST_LSU_WSIZE(x) x[1:0]
`define INST_LSU_IS_FENCE(x) x[0] `define INST_LSU_IS_FENCE(x) (3'h1 == x)
`define INST_LSU_IS_PREF(x) (x==3'b111) `define INST_LSU_IS_PREFETCH(x) (3'h2 == x)
`define INST_FENCE_BITS 1 `define INST_FENCE_BITS 1
`define INST_FENCE_D 1'h0 `define INST_FENCE_D 1'h0
@@ -187,6 +191,7 @@
`define INST_GPU_JOIN 3'h3 `define INST_GPU_JOIN 3'h3
`define INST_GPU_BAR 3'h4 `define INST_GPU_BAR 3'h4
`define INST_GPU_PRED 3'h5 `define INST_GPU_PRED 3'h5
`define INST_GPU_TEX 3'h6
`define INST_GPU_BITS 3 `define INST_GPU_BITS 3
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
@@ -238,8 +243,11 @@
`define DBG_CACHE_REQ_MDATAW 0 `define DBG_CACHE_REQ_MDATAW 0
`endif `endif
// non-cacheable address bit // non-cacheable tag bits
`define NC_FLAG_BITS 1 `define NC_TAG_BIT 1
// texture tag bits
`define TEX_TAG_BIT 1
////////////////////////// Icache Configurable Knobs ////////////////////////// ////////////////////////// Icache Configurable Knobs //////////////////////////
@@ -278,12 +286,20 @@
// Block size in bytes // Block size in bytes
`define DCACHE_LINE_SIZE `L1_BLOCK_SIZE `define DCACHE_LINE_SIZE `L1_BLOCK_SIZE
// TAG sharing enable // Core request tag bits
`define LSUQ_ADDR_BITS `LOG2UP(`LSUQ_SIZE) `define LSUQ_ADDR_BITS `LOG2UP(`LSUQ_SIZE)
`define DCACHE_CORE_TAG_ID_BITS (`LSUQ_ADDR_BITS + `NC_FLAG_BITS + `SM_ENABLE) `ifdef EXT_TEX_ENABLE
`define LSU_TAG_ID_BITS (`LSUQ_ADDR_BITS + `NC_TAG_BIT + `SM_ENABLE)
// Input request tag bits `define TEX_TAG_ID_BITS (2)
`define DCACHE_CORE_TAG_WIDTH (`DBG_CACHE_REQ_MDATAW + `DCACHE_CORE_TAG_ID_BITS) `define LSU_TEX_TAG_ID_BITS `MAX(`LSU_TAG_ID_BITS, `TEX_TAG_ID_BITS)
`define DCACHE_CORE_TAG_ID_BITS (`LSU_TEX_TAG_ID_BITS + `TEX_TAG_BIT)
`define LSU_DCACHE_TAG_BITS (`DBG_CACHE_REQ_MDATAW + `LSU_TAG_ID_BITS)
`define TEX_DCACHE_TAG_BITS (`DBG_CACHE_REQ_MDATAW + `TEX_TAG_ID_BITS)
`define LSU_TEX_DCACHE_TAG_BITS (`DBG_CACHE_REQ_MDATAW + `LSU_TEX_TAG_ID_BITS)
`else
`define DCACHE_CORE_TAG_ID_BITS (`LSUQ_ADDR_BITS + `NC_TAG_BIT + `SM_ENABLE)
`endif
`define DCACHE_CORE_TAG_WIDTH (`DBG_CACHE_REQ_MDATAW + `DCACHE_CORE_TAG_ID_BITS)
// Memory request data bits // Memory request data bits
`define DCACHE_MEM_DATA_WIDTH (`DCACHE_LINE_SIZE * 8) `define DCACHE_MEM_DATA_WIDTH (`DCACHE_LINE_SIZE * 8)
@@ -300,7 +316,7 @@
// Memory request tag bits // Memory request tag bits
`define _DMEM_ADDR_RATIO_W $clog2(`DCACHE_LINE_SIZE / `DCACHE_WORD_SIZE) `define _DMEM_ADDR_RATIO_W $clog2(`DCACHE_LINE_SIZE / `DCACHE_WORD_SIZE)
`define _DNC_MEM_TAG_WIDTH ($clog2(`DCACHE_NUM_REQS) + `_DMEM_ADDR_RATIO_W + `DCACHE_CORE_TAG_WIDTH) `define _DNC_MEM_TAG_WIDTH ($clog2(`DCACHE_NUM_REQS) + `_DMEM_ADDR_RATIO_W + `DCACHE_CORE_TAG_WIDTH)
`define DCACHE_MEM_TAG_WIDTH `MAX((`CLOG2(`DCACHE_NUM_BANKS) + `CLOG2(`DCACHE_MSHR_SIZE) + `NC_FLAG_BITS), `_DNC_MEM_TAG_WIDTH) `define DCACHE_MEM_TAG_WIDTH `MAX((`CLOG2(`DCACHE_NUM_BANKS) + `CLOG2(`DCACHE_MSHR_SIZE) + `NC_TAG_BIT), `_DNC_MEM_TAG_WIDTH)
// Merged D-cache/I-cache memory tag // Merged D-cache/I-cache memory tag
`define L1_MEM_TAG_WIDTH (`MAX(`ICACHE_MEM_TAG_WIDTH, `DCACHE_MEM_TAG_WIDTH) + `CLOG2(2)) `define L1_MEM_TAG_WIDTH (`MAX(`ICACHE_MEM_TAG_WIDTH, `DCACHE_MEM_TAG_WIDTH) + `CLOG2(2))
@@ -348,7 +364,7 @@
// Memory request tag bits // Memory request tag bits
`define _L2_MEM_ADDR_RATIO_W $clog2(`L2_CACHE_LINE_SIZE / `L2_WORD_SIZE) `define _L2_MEM_ADDR_RATIO_W $clog2(`L2_CACHE_LINE_SIZE / `L2_WORD_SIZE)
`define _L2_NC_MEM_TAG_WIDTH ($clog2(`L2_NUM_REQS) + `_L2_MEM_ADDR_RATIO_W + `L1_MEM_TAG_WIDTH) `define _L2_NC_MEM_TAG_WIDTH ($clog2(`L2_NUM_REQS) + `_L2_MEM_ADDR_RATIO_W + `L1_MEM_TAG_WIDTH)
`define _L2_MEM_TAG_WIDTH `MAX((`CLOG2(`L2_NUM_BANKS) + `CLOG2(`L2_MSHR_SIZE) + `NC_FLAG_BITS), `_L2_NC_MEM_TAG_WIDTH) `define _L2_MEM_TAG_WIDTH `MAX((`CLOG2(`L2_NUM_BANKS) + `CLOG2(`L2_MSHR_SIZE) + `NC_TAG_BIT), `_L2_NC_MEM_TAG_WIDTH)
`define L2_MEM_TAG_WIDTH ((`L2_ENABLE) ? `_L2_MEM_TAG_WIDTH : (`L1_MEM_TAG_WIDTH + `CLOG2(`L2_NUM_REQS))) `define L2_MEM_TAG_WIDTH ((`L2_ENABLE) ? `_L2_MEM_TAG_WIDTH : (`L1_MEM_TAG_WIDTH + `CLOG2(`L2_NUM_REQS)))
////////////////////////// L3cache Configurable Knobs ///////////////////////// ////////////////////////// L3cache Configurable Knobs /////////////////////////
@@ -380,7 +396,7 @@
// Memory request tag bits // Memory request tag bits
`define _L3_MEM_ADDR_RATIO_W $clog2(`L3_CACHE_LINE_SIZE / `L3_WORD_SIZE) `define _L3_MEM_ADDR_RATIO_W $clog2(`L3_CACHE_LINE_SIZE / `L3_WORD_SIZE)
`define _L3_NC_MEM_TAG_WIDTH ($clog2(`L3_NUM_REQS) + `_L3_MEM_ADDR_RATIO_W + `L2_MEM_TAG_WIDTH) `define _L3_NC_MEM_TAG_WIDTH ($clog2(`L3_NUM_REQS) + `_L3_MEM_ADDR_RATIO_W + `L2_MEM_TAG_WIDTH)
`define _L3_MEM_TAG_WIDTH `MAX((`CLOG2(`L3_NUM_BANKS) + `CLOG2(`L3_MSHR_SIZE) + `NC_FLAG_BITS), `_L3_NC_MEM_TAG_WIDTH) `define _L3_MEM_TAG_WIDTH `MAX((`CLOG2(`L3_NUM_BANKS) + `CLOG2(`L3_MSHR_SIZE) + `NC_TAG_BIT), `_L3_NC_MEM_TAG_WIDTH)
`define L3_MEM_TAG_WIDTH ((`L3_ENABLE) ? `_L3_MEM_TAG_WIDTH : (`L2_MEM_TAG_WIDTH + `CLOG2(`L3_NUM_REQS))) `define L3_MEM_TAG_WIDTH ((`L3_ENABLE) ? `_L3_MEM_TAG_WIDTH : (`L2_MEM_TAG_WIDTH + `CLOG2(`L3_NUM_REQS)))
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////

View File

@@ -1,6 +1,6 @@
`include "VX_define.vh" `include "VX_define.vh"
module VX_instr_demux ( module VX_dispatch (
input wire clk, input wire clk,
input wire reset, input wire reset,
@@ -60,7 +60,7 @@ module VX_instr_demux (
wire lsu_req_valid = ibuffer_if.valid && (ibuffer_if.ex_type == `EX_LSU); wire lsu_req_valid = ibuffer_if.valid && (ibuffer_if.ex_type == `EX_LSU);
wire [`INST_LSU_BITS-1:0] lsu_op_type = `INST_LSU_BITS'(ibuffer_if.op_type); wire [`INST_LSU_BITS-1:0] lsu_op_type = `INST_LSU_BITS'(ibuffer_if.op_type);
wire lsu_is_fence = `INST_LSU_IS_FENCE(ibuffer_if.op_mod); wire lsu_is_fence = `INST_LSU_IS_FENCE(ibuffer_if.op_mod);
wire lsu_is_prefetch = (~ibuffer_if.wb) && ~(ibuffer_if.op_type[`INST_OP_BITS-1]); wire lsu_is_prefetch = `INST_LSU_IS_PREFETCH(ibuffer_if.op_mod);
VX_skid_buffer #( VX_skid_buffer #(
.DATAW (`NW_BITS + `NUM_THREADS + 32 + `INST_LSU_BITS + 1 + 32 + `NR_BITS + 1 + (2 * `NUM_THREADS * 32) + 1), .DATAW (`NW_BITS + `NUM_THREADS + 32 + `INST_LSU_BITS + 1 + 32 + `NR_BITS + 1 + (2 * `NUM_THREADS * 32) + 1),
@@ -125,18 +125,17 @@ module VX_instr_demux (
wire gpu_req_valid = ibuffer_if.valid && (ibuffer_if.ex_type == `EX_GPU); wire gpu_req_valid = ibuffer_if.valid && (ibuffer_if.ex_type == `EX_GPU);
wire [`INST_GPU_BITS-1:0] gpu_op_type = `INST_GPU_BITS'(ibuffer_if.op_type); wire [`INST_GPU_BITS-1:0] gpu_op_type = `INST_GPU_BITS'(ibuffer_if.op_type);
wire [31:0] gpu_rs2_data = gpr_rsp_if.rs2_data[tid];
VX_skid_buffer #( VX_skid_buffer #(
.DATAW (`NW_BITS + `NUM_THREADS + 32 + 32 + `INST_GPU_BITS + `NR_BITS + 1 + + `NT_BITS + (`NUM_THREADS * 32 + 32)), .DATAW (`NW_BITS + `NUM_THREADS + 32 + 32 + `INST_GPU_BITS + `INST_MOD_BITS + `NR_BITS + 1 + `NT_BITS + (3 * `NUM_THREADS * 32)),
.OUT_REG (1) .OUT_REG (1)
) gpu_buffer ( ) gpu_buffer (
.clk (clk), .clk (clk),
.reset (reset), .reset (reset),
.valid_in (gpu_req_valid), .valid_in (gpu_req_valid),
.ready_in (gpu_req_ready), .ready_in (gpu_req_ready),
.data_in ({ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, next_PC, gpu_op_type, ibuffer_if.rd, ibuffer_if.wb, tid, gpr_rsp_if.rs1_data, gpu_rs2_data}), .data_in ({ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, next_PC, gpu_op_type, ibuffer_if.op_mod, ibuffer_if.rd, ibuffer_if.wb, tid, gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data, gpr_rsp_if.rs3_data}),
.data_out ({gpu_req_if.wid, gpu_req_if.tmask, gpu_req_if.PC, gpu_req_if.next_PC, gpu_req_if.op_type, gpu_req_if.rd, gpu_req_if.wb, gpu_req_if.tid, gpu_req_if.rs1_data, gpu_req_if.rs2_data}), .data_out ({gpu_req_if.wid, gpu_req_if.tmask, gpu_req_if.PC, gpu_req_if.next_PC, gpu_req_if.op_type, gpu_req_if.op_mod, gpu_req_if.rd, gpu_req_if.wb, gpu_req_if.tid, gpu_req_if.rs1_data, gpu_req_if.rs2_data, gpu_req_if.rs3_data}),
.valid_out (gpu_req_if.valid), .valid_out (gpu_req_if.valid),
.ready_out (gpu_req_if.ready) .ready_out (gpu_req_if.ready)
); );

View File

@@ -45,12 +45,108 @@ module VX_execute #(
VX_commit_if.master gpu_commit_if, VX_commit_if.master gpu_commit_if,
input wire busy input wire busy
); );
`ifdef EXT_TEX_ENABLE
// Texture-extension data-cache plumbing (compiled only when EXT_TEX_ENABLE is set).
// Declares separate LSU-side and texture-side dcache request/response interfaces,
// maps both tag formats onto one common tag width, and wires the two requesters
// to the module's dcache_req_if/dcache_rsp_if through a 2-input VX_cache_arb.

// LSU-side request/response interfaces; tags are LSU_DCACHE_TAG_BITS wide.
VX_dcache_req_if #(
.NUM_REQS (`NUM_THREADS),
.WORD_SIZE (4),
.TAG_WIDTH (`LSU_DCACHE_TAG_BITS)
) lsu_dcache_req_if();
VX_dcache_rsp_if #(
.NUM_REQS (`NUM_THREADS),
.WORD_SIZE (4),
.TAG_WIDTH (`LSU_DCACHE_TAG_BITS)
) lsu_dcache_rsp_if();
// Texture-side request/response interfaces; tags are TEX_DCACHE_TAG_BITS wide.
VX_dcache_req_if #(
.NUM_REQS (`NUM_THREADS),
.WORD_SIZE (4),
.TAG_WIDTH (`TEX_DCACHE_TAG_BITS)
) tex_dcache_req_if();
VX_dcache_rsp_if #(
.NUM_REQS (`NUM_THREADS),
.WORD_SIZE (4),
.TAG_WIDTH (`TEX_DCACHE_TAG_BITS)
) tex_dcache_rsp_if();
// CSR link shared by the csr_unit and gpu_unit instances in this module.
VX_tex_csr_if tex_csr_if();
// Per-thread request tags and single response tags in the common
// LSU_TEX_DCACHE_TAG_BITS format fed to / returned by the arbiter.
wire [`NUM_THREADS-1:0][`LSU_TEX_DCACHE_TAG_BITS-1:0] tex_tag_in, lsu_tag_in;
wire [`LSU_TEX_DCACHE_TAG_BITS-1:0] tex_tag_out, lsu_tag_out;
// Only slices of the response tags are read below, so the remaining bits are
// marked unused (presumably to silence lint when the two widths differ).
`UNUSED_VAR (tex_tag_out)
`UNUSED_VAR (lsu_tag_out)
for (genvar i = 0; i < `NUM_THREADS; ++i) begin
// Low bits: each requester's tag-ID field size-cast to the common ID width.
assign tex_tag_in[i][`LSU_TEX_TAG_ID_BITS-1:0] = `LSU_TEX_TAG_ID_BITS'(tex_dcache_req_if.tag[i][`TEX_TAG_ID_BITS-1:0]);
assign lsu_tag_in[i][`LSU_TEX_TAG_ID_BITS-1:0] = `LSU_TEX_TAG_ID_BITS'(lsu_dcache_req_if.tag[i][`LSU_TAG_ID_BITS-1:0]);
`ifdef DBG_CACHE_REQ_INFO
// High bits: the tag bits above the ID field are copied through unchanged
// (presumably the DBG_CACHE_REQ_MDATAW debug metadata -- TODO confirm).
assign tex_tag_in[i][`LSU_TEX_DCACHE_TAG_BITS-1:`LSU_TEX_TAG_ID_BITS] = tex_dcache_req_if.tag[i][`TEX_DCACHE_TAG_BITS-1:`TEX_TAG_ID_BITS];
assign lsu_tag_in[i][`LSU_TEX_DCACHE_TAG_BITS-1:`LSU_TEX_TAG_ID_BITS] = lsu_dcache_req_if.tag[i][`LSU_DCACHE_TAG_BITS-1:`LSU_TAG_ID_BITS];
`endif
end
// Response path: slice the common-format tags back to each requester's native width.
assign tex_dcache_rsp_if.tag[`TEX_TAG_ID_BITS-1:0] = tex_tag_out[`TEX_TAG_ID_BITS-1:0];
assign lsu_dcache_rsp_if.tag[`LSU_TAG_ID_BITS-1:0] = lsu_tag_out[`LSU_TAG_ID_BITS-1:0];
`ifdef DBG_CACHE_REQ_INFO
assign tex_dcache_rsp_if.tag[`TEX_DCACHE_TAG_BITS-1:`TEX_TAG_ID_BITS] = tex_tag_out[`LSU_TEX_DCACHE_TAG_BITS-1:`LSU_TEX_TAG_ID_BITS];
assign lsu_dcache_rsp_if.tag[`LSU_DCACHE_TAG_BITS-1:`LSU_TAG_ID_BITS] = lsu_tag_out[`LSU_TEX_DCACHE_TAG_BITS-1:`LSU_TEX_TAG_ID_BITS];
`endif
// Two-requester arbiter in front of the dcache. In every concatenation below the
// texture signals occupy the upper half and the LSU signals the lower half.
// NOTE(review): TAG_SEL_IDX semantics are defined inside VX_cache_arb (not visible
// here) -- presumably the tag bit position used for the requester select; confirm.
VX_cache_arb #(
.NUM_REQS (2),
.LANES (`NUM_THREADS),
.DATA_SIZE (4),
.TAG_IN_WIDTH (`LSU_TEX_DCACHE_TAG_BITS),
.TAG_SEL_IDX (`NC_TAG_BIT + `SM_ENABLE)
) tex_lsu_arb (
.clk (clk),
.reset (reset),
// Tex/LSU request
.req_valid_in ({tex_dcache_req_if.valid, lsu_dcache_req_if.valid}),
.req_rw_in ({tex_dcache_req_if.rw, lsu_dcache_req_if.rw}),
.req_byteen_in ({tex_dcache_req_if.byteen, lsu_dcache_req_if.byteen}),
.req_addr_in ({tex_dcache_req_if.addr, lsu_dcache_req_if.addr}),
.req_data_in ({tex_dcache_req_if.data, lsu_dcache_req_if.data}),
.req_tag_in ({tex_tag_in, lsu_tag_in}),
.req_ready_in ({tex_dcache_req_if.ready, lsu_dcache_req_if.ready}),
// Dcache request
.req_valid_out (dcache_req_if.valid),
.req_rw_out (dcache_req_if.rw),
.req_byteen_out (dcache_req_if.byteen),
.req_addr_out (dcache_req_if.addr),
.req_data_out (dcache_req_if.data),
.req_tag_out (dcache_req_if.tag),
.req_ready_out (dcache_req_if.ready),
// Dcache response
.rsp_valid_in (dcache_rsp_if.valid),
.rsp_tmask_in (dcache_rsp_if.tmask),
.rsp_tag_in (dcache_rsp_if.tag),
.rsp_data_in (dcache_rsp_if.data),
.rsp_ready_in (dcache_rsp_if.ready),
// Tex/LSU response
.rsp_valid_out ({tex_dcache_rsp_if.valid, lsu_dcache_rsp_if.valid}),
.rsp_tmask_out ({tex_dcache_rsp_if.tmask, lsu_dcache_rsp_if.tmask}),
.rsp_data_out ({tex_dcache_rsp_if.data, lsu_dcache_rsp_if.data}),
.rsp_tag_out ({tex_tag_out, lsu_tag_out}),
.rsp_ready_out ({tex_dcache_rsp_if.ready, lsu_dcache_rsp_if.ready})
);
`endif
`ifdef EXT_F_ENABLE `ifdef EXT_F_ENABLE
VX_fpu_to_csr_if fpu_to_csr_if(); wire [`NUM_WARPS-1:0] csr_pending;
wire[`NUM_WARPS-1:0] fpu_pending; wire [`NUM_WARPS-1:0] fpu_pending;
wire[`NUM_WARPS-1:0] csr_pending; VX_fpu_to_csr_if fpu_to_csr_if();
`endif `endif
`RESET_RELAY (alu_reset); `RESET_RELAY (alu_reset);
`RESET_RELAY (lsu_reset); `RESET_RELAY (lsu_reset);
@@ -58,7 +154,7 @@ module VX_execute #(
`RESET_RELAY (gpu_reset); `RESET_RELAY (gpu_reset);
VX_alu_unit #( VX_alu_unit #(
.CORE_ID (CORE_ID) .CORE_ID(CORE_ID)
) alu_unit ( ) alu_unit (
.clk (clk), .clk (clk),
.reset (alu_reset), .reset (alu_reset),
@@ -68,20 +164,25 @@ module VX_execute #(
); );
VX_lsu_unit #( VX_lsu_unit #(
.CORE_ID (CORE_ID) .CORE_ID(CORE_ID)
) lsu_unit ( ) lsu_unit (
`SCOPE_BIND_VX_execute_lsu_unit `SCOPE_BIND_VX_execute_lsu_unit
.clk (clk), .clk (clk),
.reset (lsu_reset), .reset (lsu_reset),
`ifdef EXT_TEX_ENABLE
.dcache_req_if (lsu_dcache_req_if),
.dcache_rsp_if (lsu_dcache_rsp_if),
`else
.dcache_req_if (dcache_req_if), .dcache_req_if (dcache_req_if),
.dcache_rsp_if (dcache_rsp_if), .dcache_rsp_if (dcache_rsp_if),
`endif
.lsu_req_if (lsu_req_if), .lsu_req_if (lsu_req_if),
.ld_commit_if (ld_commit_if), .ld_commit_if (ld_commit_if),
.st_commit_if (st_commit_if) .st_commit_if (st_commit_if)
); );
VX_csr_unit #( VX_csr_unit #(
.CORE_ID (CORE_ID) .CORE_ID(CORE_ID)
) csr_unit ( ) csr_unit (
.clk (clk), .clk (clk),
.reset (csr_reset), .reset (csr_reset),
@@ -89,7 +190,7 @@ module VX_execute #(
.perf_memsys_if (perf_memsys_if), .perf_memsys_if (perf_memsys_if),
.perf_pipeline_if(perf_pipeline_if), .perf_pipeline_if(perf_pipeline_if),
`endif `endif
.cmt_to_csr_if (cmt_to_csr_if), .cmt_to_csr_if (cmt_to_csr_if),
.fetch_to_csr_if(fetch_to_csr_if), .fetch_to_csr_if(fetch_to_csr_if),
.csr_req_if (csr_req_if), .csr_req_if (csr_req_if),
.csr_commit_if (csr_commit_if), .csr_commit_if (csr_commit_if),
@@ -100,6 +201,9 @@ module VX_execute #(
`else `else
`UNUSED_PIN (pending), `UNUSED_PIN (pending),
`endif `endif
`ifdef EXT_TEX_ENABLE
.tex_csr_if (tex_csr_if),
`endif
.busy (busy) .busy (busy)
); );
@@ -107,7 +211,7 @@ module VX_execute #(
`RESET_RELAY (fpu_reset); `RESET_RELAY (fpu_reset);
VX_fpu_unit #( VX_fpu_unit #(
.CORE_ID (CORE_ID) .CORE_ID(CORE_ID)
) fpu_unit ( ) fpu_unit (
.clk (clk), .clk (clk),
.reset (fpu_reset), .reset (fpu_reset),
@@ -120,12 +224,17 @@ module VX_execute #(
`endif `endif
VX_gpu_unit #( VX_gpu_unit #(
.CORE_ID (CORE_ID) .CORE_ID(CORE_ID)
) gpu_unit ( ) gpu_unit (
`SCOPE_BIND_VX_execute_gpu_unit `SCOPE_BIND_VX_execute_gpu_unit
.clk (clk), .clk (clk),
.reset (gpu_reset), .reset (gpu_reset),
.gpu_req_if (gpu_req_if), .gpu_req_if (gpu_req_if),
`ifdef EXT_TEX_ENABLE
.tex_csr_if (tex_csr_if),
.dcache_req_if (tex_dcache_req_if),
.dcache_rsp_if (tex_dcache_rsp_if),
`endif
.warp_ctl_if (warp_ctl_if), .warp_ctl_if (warp_ctl_if),
.gpu_commit_if (gpu_commit_if) .gpu_commit_if (gpu_commit_if)
); );
@@ -137,4 +246,4 @@ module VX_execute #(
&& (`INST_BR_BITS'(alu_req_if.op_type) == `INST_BR_EBREAK && (`INST_BR_BITS'(alu_req_if.op_type) == `INST_BR_EBREAK
|| `INST_BR_BITS'(alu_req_if.op_type) == `INST_BR_ECALL); || `INST_BR_BITS'(alu_req_if.op_type) == `INST_BR_ECALL);
endmodule endmodule

View File

@@ -11,6 +11,12 @@ module VX_gpu_unit #(
// Inputs // Inputs
VX_gpu_req_if.slave gpu_req_if, VX_gpu_req_if.slave gpu_req_if,
`ifdef EXT_TEX_ENABLE
VX_dcache_req_if.master dcache_req_if,
VX_dcache_rsp_if.slave dcache_rsp_if,
VX_tex_csr_if.slave tex_csr_if,
`endif
// Outputs // Outputs
VX_warp_ctl_if.master warp_ctl_if, VX_warp_ctl_if.master warp_ctl_if,
VX_commit_if.master gpu_commit_if VX_commit_if.master gpu_commit_if
@@ -18,14 +24,29 @@ module VX_gpu_unit #(
import gpu_types::*; import gpu_types::*;
`UNUSED_PARAM (CORE_ID) `UNUSED_PARAM (CORE_ID)
`UNUSED_VAR (clk)
`UNUSED_VAR (reset) localparam WCTL_DATAW = `GPU_TMC_BITS + `GPU_WSPAWN_BITS + `GPU_SPLIT_BITS + `GPU_BARRIER_BITS;
localparam RSP_DATAW = `MAX(`NUM_THREADS * 32, WCTL_DATAW);
wire rsp_valid;
wire [`NW_BITS-1:0] rsp_wid;
wire [`NUM_THREADS-1:0] rsp_tmask;
wire [31:0] rsp_PC;
wire [`NR_BITS-1:0] rsp_rd;
wire rsp_wb;
wire [RSP_DATAW-1:0] rsp_data, rsp_data_r;
gpu_tmc_t tmc; gpu_tmc_t tmc;
gpu_wspawn_t wspawn; gpu_wspawn_t wspawn;
gpu_barrier_t barrier; gpu_barrier_t barrier;
gpu_split_t split; gpu_split_t split;
wire [WCTL_DATAW-1:0] warp_ctl_data;
wire is_warp_ctl;
wire stall_in, stall_out;
wire is_wspawn = (gpu_req_if.op_type == `INST_GPU_WSPAWN); wire is_wspawn = (gpu_req_if.op_type == `INST_GPU_WSPAWN);
wire is_tmc = (gpu_req_if.op_type == `INST_GPU_TMC); wire is_tmc = (gpu_req_if.op_type == `INST_GPU_TMC);
wire is_split = (gpu_req_if.op_type == `INST_GPU_SPLIT); wire is_split = (gpu_req_if.op_type == `INST_GPU_SPLIT);
@@ -33,7 +54,8 @@ module VX_gpu_unit #(
wire is_pred = (gpu_req_if.op_type == `INST_GPU_PRED); wire is_pred = (gpu_req_if.op_type == `INST_GPU_PRED);
wire [31:0] rs1_data = gpu_req_if.rs1_data[gpu_req_if.tid]; wire [31:0] rs1_data = gpu_req_if.rs1_data[gpu_req_if.tid];
wire [31:0] rs2_data = gpu_req_if.rs2_data[gpu_req_if.tid];
wire [`NUM_THREADS-1:0] taken_tmask; wire [`NUM_THREADS-1:0] taken_tmask;
wire [`NUM_THREADS-1:0] not_taken_tmask; wire [`NUM_THREADS-1:0] not_taken_tmask;
@@ -52,7 +74,7 @@ module VX_gpu_unit #(
// wspawn // wspawn
wire [31:0] wspawn_pc = gpu_req_if.rs2_data; wire [31:0] wspawn_pc = rs2_data;
wire [`NUM_WARPS-1:0] wspawn_wmask; wire [`NUM_WARPS-1:0] wspawn_wmask;
for (genvar i = 0; i < `NUM_WARPS; i++) begin for (genvar i = 0; i < `NUM_WARPS; i++) begin
assign wspawn_wmask[i] = (i < rs1_data); assign wspawn_wmask[i] = (i < rs1_data);
@@ -73,30 +95,109 @@ module VX_gpu_unit #(
assign barrier.valid = is_bar; assign barrier.valid = is_bar;
assign barrier.id = rs1_data[`NB_BITS-1:0]; assign barrier.id = rs1_data[`NB_BITS-1:0];
assign barrier.size_m1 = (`NW_BITS)'(gpu_req_if.rs2_data - 1); assign barrier.size_m1 = (`NW_BITS)'(rs2_data - 1);
// pack warp ctl result
assign warp_ctl_data = {tmc, wspawn, split, barrier};
// texture
`ifdef EXT_TEX_ENABLE
// Texture path: requests whose op_type is INST_GPU_TEX are forwarded to a
// VX_tex_unit instance; its responses are multiplexed with the warp-control
// result onto the rsp_* signals consumed by the output stage.
// NOTE(review): op_mod is flagged unused although its low NTEX_BITS bits are
// read below -- presumably this covers the remaining upper bits; confirm.
`UNUSED_VAR (gpu_req_if.op_mod)
VX_tex_req_if tex_req_if();
VX_tex_rsp_if tex_rsp_if();
wire is_tex = (gpu_req_if.op_type == `INST_GPU_TEX);
// Texture request: valid only for texture ops; the remaining fields are
// passed straight through from the GPU request.
assign tex_req_if.valid = gpu_req_if.valid && is_tex;
assign tex_req_if.wid = gpu_req_if.wid;
assign tex_req_if.tmask = gpu_req_if.tmask;
assign tex_req_if.PC = gpu_req_if.PC;
assign tex_req_if.rd = gpu_req_if.rd;
assign tex_req_if.wb = gpu_req_if.wb;
// The texture unit index is taken from the low NTEX_BITS bits of op_mod.
assign tex_req_if.unit = gpu_req_if.op_mod[`NTEX_BITS-1:0];
// rs1/rs2 carry the two texture coordinates, rs3 the level of detail.
assign tex_req_if.coords[0] = gpu_req_if.rs1_data;
assign tex_req_if.coords[1] = gpu_req_if.rs2_data;
assign tex_req_if.lod = gpu_req_if.rs3_data;
VX_tex_unit #(
.CORE_ID(CORE_ID)
) tex_unit (
.clk (clk),
.reset (reset),
.tex_req_if (tex_req_if),
.tex_csr_if (tex_csr_if),
.tex_rsp_if (tex_rsp_if),
.dcache_req_if (dcache_req_if),
.dcache_rsp_if (dcache_rsp_if)
);
// A texture response is accepted whenever the output stage is not stalled
// (stall_out is driven outside this section).
assign tex_rsp_if.ready = !stall_out;
// Input stall: a texture op waits for the texture unit to accept it; any other
// op waits while a texture response is pending or the output stage is stalled.
assign stall_in = (is_tex && ~tex_req_if.ready)
|| (~is_tex && (tex_rsp_if.valid || stall_out));
// The value entering the output stage is a warp-control result only when it is
// neither a texture request nor a texture response.
assign is_warp_ctl = !(is_tex || tex_rsp_if.valid);
// Response mux: a valid texture response takes priority over the incoming
// non-texture request (which is held off by stall_in in that case).
assign rsp_valid = tex_rsp_if.valid || (gpu_req_if.valid && ~is_tex);
assign rsp_wid = tex_rsp_if.valid ? tex_rsp_if.wid : gpu_req_if.wid;
assign rsp_tmask = tex_rsp_if.valid ? tex_rsp_if.tmask : gpu_req_if.tmask;
assign rsp_PC = tex_rsp_if.valid ? tex_rsp_if.PC : gpu_req_if.PC;
// rd is taken from the texture response unconditionally; write-back is only
// asserted for a valid texture response, so warp-control ops never write back.
assign rsp_rd = tex_rsp_if.rd;
assign rsp_wb = tex_rsp_if.valid && tex_rsp_if.wb;
assign rsp_data = tex_rsp_if.valid ? RSP_DATAW'(tex_rsp_if.data) : RSP_DATAW'(warp_ctl_data);
`else
// No texture unit: every request is a warp-control op that passes straight
// through with no destination register and no write-back.
`UNUSED_VAR (gpu_req_if.op_mod)
`UNUSED_VAR (gpu_req_if.rs3_data)
`UNUSED_VAR (gpu_req_if.wb)
`UNUSED_VAR (gpu_req_if.rd)
assign stall_in = stall_out;
assign is_warp_ctl = 1;
assign rsp_valid = gpu_req_if.valid;
assign rsp_wid = gpu_req_if.wid;
assign rsp_tmask = gpu_req_if.tmask;
assign rsp_PC = gpu_req_if.PC;
assign rsp_rd = 0;
assign rsp_wb = 0;
assign rsp_data = RSP_DATAW'(warp_ctl_data);
`endif
wire is_warp_ctl_r;
// output // output
assign stall_out = ~gpu_commit_if.ready && gpu_commit_if.valid;
wire stall = ~gpu_commit_if.ready && gpu_commit_if.valid;
VX_pipe_register #( VX_pipe_register #(
.DATAW (1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + `GPU_TMC_BITS + `GPU_WSPAWN_BITS + `GPU_SPLIT_BITS + `GPU_BARRIER_BITS), .DATAW (1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + RSP_DATAW + 1),
.RESETW (1) .RESETW (1)
) pipe_reg ( ) pipe_reg (
.clk (clk), .clk (clk),
.reset (reset), .reset (reset),
.enable (!stall), .enable (!stall_out),
.data_in ({gpu_req_if.valid, gpu_req_if.wid, gpu_req_if.tmask, gpu_req_if.PC, gpu_req_if.rd, gpu_req_if.wb, tmc, wspawn, split, barrier}), .data_in ({rsp_valid, rsp_wid, rsp_tmask, rsp_PC, rsp_rd, rsp_wb, rsp_data, is_warp_ctl}),
.data_out ({gpu_commit_if.valid, gpu_commit_if.wid, gpu_commit_if.tmask, gpu_commit_if.PC, gpu_commit_if.rd, gpu_commit_if.wb, warp_ctl_if.tmc, warp_ctl_if.wspawn, warp_ctl_if.split, warp_ctl_if.barrier}) .data_out ({gpu_commit_if.valid, gpu_commit_if.wid, gpu_commit_if.tmask, gpu_commit_if.PC, gpu_commit_if.rd, gpu_commit_if.wb, rsp_data_r, is_warp_ctl_r})
); );
assign gpu_commit_if.eop = 1'b1; assign gpu_commit_if.data = rsp_data_r[(`NUM_THREADS * 32)-1:0];
assign gpu_commit_if.eop = 1'b1;
assign warp_ctl_if.valid = gpu_commit_if.valid && gpu_commit_if.ready; // warp control response
assign warp_ctl_if.wid = gpu_commit_if.wid;
assign {warp_ctl_if.tmc, warp_ctl_if.wspawn, warp_ctl_if.split, warp_ctl_if.barrier} = rsp_data_r[WCTL_DATAW-1:0];
assign warp_ctl_if.valid = gpu_commit_if.valid && gpu_commit_if.ready && is_warp_ctl_r;
assign warp_ctl_if.wid = gpu_commit_if.wid;
// can accept new request? // can accept new request?
assign gpu_req_if.ready = ~stall; assign gpu_req_if.ready = ~stall_in;
`SCOPE_ASSIGN (gpu_rsp_valid, warp_ctl_if.valid); `SCOPE_ASSIGN (gpu_rsp_valid, warp_ctl_if.valid);
`SCOPE_ASSIGN (gpu_rsp_wid, warp_ctl_if.wid); `SCOPE_ASSIGN (gpu_rsp_wid, warp_ctl_if.wid);

View File

@@ -88,7 +88,7 @@ module VX_icache_stage #(
`SCOPE_ASSIGN (icache_rsp_data, icache_rsp_if.data); `SCOPE_ASSIGN (icache_rsp_data, icache_rsp_if.data);
`SCOPE_ASSIGN (icache_rsp_tag, rsp_tag); `SCOPE_ASSIGN (icache_rsp_tag, rsp_tag);
`ifdef DBG_PRINT_CORE_ICACHE `ifdef DBG_TRACE_CORE_ICACHE
always @(posedge clk) begin always @(posedge clk) begin
if (icache_req_if.valid && icache_req_if.ready) begin if (icache_req_if.valid && icache_req_if.ready) begin
dpi_trace("%d: I$%0d req: wid=%0d, PC=%0h\n", $time, CORE_ID, ifetch_req_if.wid, ifetch_req_if.PC); dpi_trace("%d: I$%0d req: wid=%0d, PC=%0h\n", $time, CORE_ID, ifetch_req_if.wid, ifetch_req_if.PC);

View File

@@ -23,56 +23,60 @@ module VX_issue #(
`endif `endif
VX_gpu_req_if.master gpu_req_if VX_gpu_req_if.master gpu_req_if
); );
VX_ibuffer_if ibuffer_if(); VX_ibuffer_if ibuffer_if();
VX_gpr_rsp_if gpr_rsp_if(); VX_gpr_req_if gpr_req_if();
VX_gpr_rsp_if gpr_rsp_if();
VX_gpr_req_if gpr_req_if();
assign gpr_req_if.wid = ibuffer_if.wid;
assign gpr_req_if.rs1 = ibuffer_if.rs1;
assign gpr_req_if.rs2 = ibuffer_if.rs2;
assign gpr_req_if.rs3 = ibuffer_if.rs3;
VX_writeback_if sboard_wb_if(); VX_writeback_if sboard_wb_if();
assign sboard_wb_if.valid = writeback_if.valid; VX_ibuffer_if scoreboard_if();
assign sboard_wb_if.wid = writeback_if.wid; VX_ibuffer_if dispatch_if();
assign sboard_wb_if.PC = writeback_if.PC;
assign sboard_wb_if.rd = writeback_if.rd;
assign sboard_wb_if.eop = writeback_if.eop;
assign sboard_wb_if.ready = writeback_if.ready;
VX_ibuffer_if sboard_ib_if();
assign sboard_ib_if.valid = ibuffer_if.valid && idmux_ib_if.ready;
assign sboard_ib_if.wid = ibuffer_if.wid;
assign sboard_ib_if.PC = ibuffer_if.PC;
assign sboard_ib_if.wb = ibuffer_if.wb;
assign sboard_ib_if.rd = ibuffer_if.rd;
assign sboard_ib_if.rd_n = ibuffer_if.rd_n;
assign sboard_ib_if.rs1_n = ibuffer_if.rs1_n;
assign sboard_ib_if.rs2_n = ibuffer_if.rs2_n;
assign sboard_ib_if.rs3_n = ibuffer_if.rs3_n;
assign sboard_ib_if.wid_n = ibuffer_if.wid_n;
VX_ibuffer_if idmux_ib_if(); // GPR request interface
assign idmux_ib_if.valid = ibuffer_if.valid && sboard_ib_if.ready; assign gpr_req_if.wid = ibuffer_if.wid;
assign idmux_ib_if.wid = ibuffer_if.wid; assign gpr_req_if.rs1 = ibuffer_if.rs1;
assign idmux_ib_if.tmask = ibuffer_if.tmask; assign gpr_req_if.rs2 = ibuffer_if.rs2;
assign idmux_ib_if.PC = ibuffer_if.PC; assign gpr_req_if.rs3 = ibuffer_if.rs3;
assign idmux_ib_if.ex_type = ibuffer_if.ex_type;
assign idmux_ib_if.op_type = ibuffer_if.op_type; // scoreboard writeback interface
assign idmux_ib_if.op_mod = ibuffer_if.op_mod; assign sboard_wb_if.valid = writeback_if.valid;
assign idmux_ib_if.wb = ibuffer_if.wb; assign sboard_wb_if.wid = writeback_if.wid;
assign idmux_ib_if.rd = ibuffer_if.rd; assign sboard_wb_if.PC = writeback_if.PC;
assign idmux_ib_if.rs1 = ibuffer_if.rs1; assign sboard_wb_if.rd = writeback_if.rd;
assign idmux_ib_if.imm = ibuffer_if.imm; assign sboard_wb_if.eop = writeback_if.eop;
assign idmux_ib_if.use_PC = ibuffer_if.use_PC;
assign idmux_ib_if.use_imm = ibuffer_if.use_imm; // scoreboard interface
assign scoreboard_if.valid = ibuffer_if.valid && dispatch_if.ready;
assign scoreboard_if.wid = ibuffer_if.wid;
assign scoreboard_if.PC = ibuffer_if.PC;
assign scoreboard_if.wb = ibuffer_if.wb;
assign scoreboard_if.rd = ibuffer_if.rd;
assign scoreboard_if.rd_n = ibuffer_if.rd_n;
assign scoreboard_if.rs1_n = ibuffer_if.rs1_n;
assign scoreboard_if.rs2_n = ibuffer_if.rs2_n;
assign scoreboard_if.rs3_n = ibuffer_if.rs3_n;
assign scoreboard_if.wid_n = ibuffer_if.wid_n;
// dispatch interface
assign dispatch_if.valid = ibuffer_if.valid && scoreboard_if.ready;
assign dispatch_if.wid = ibuffer_if.wid;
assign dispatch_if.tmask = ibuffer_if.tmask;
assign dispatch_if.PC = ibuffer_if.PC;
assign dispatch_if.ex_type = ibuffer_if.ex_type;
assign dispatch_if.op_type = ibuffer_if.op_type;
assign dispatch_if.op_mod = ibuffer_if.op_mod;
assign dispatch_if.wb = ibuffer_if.wb;
assign dispatch_if.rd = ibuffer_if.rd;
assign dispatch_if.rs1 = ibuffer_if.rs1;
assign dispatch_if.imm = ibuffer_if.imm;
assign dispatch_if.use_PC = ibuffer_if.use_PC;
assign dispatch_if.use_imm = ibuffer_if.use_imm;
// issue the instruction // issue the instruction
assign ibuffer_if.ready = sboard_ib_if.ready && idmux_ib_if.ready; assign ibuffer_if.ready = scoreboard_if.ready && dispatch_if.ready;
`RESET_RELAY (ibuf_reset); `RESET_RELAY (ibuf_reset);
`RESET_RELAY (scoreboard_reset);
`RESET_RELAY (gpr_reset); `RESET_RELAY (gpr_reset);
`RESET_RELAY (demux_reset); `RESET_RELAY (dispatch_reset);
VX_ibuffer #( VX_ibuffer #(
.CORE_ID(CORE_ID) .CORE_ID(CORE_ID)
@@ -87,9 +91,9 @@ module VX_issue #(
.CORE_ID(CORE_ID) .CORE_ID(CORE_ID)
) scoreboard ( ) scoreboard (
.clk (clk), .clk (clk),
.reset (reset), .reset (scoreboard_reset),
.ibuffer_if (sboard_ib_if), .writeback_if(sboard_wb_if),
.writeback_if(sboard_wb_if) .ibuffer_if (scoreboard_if)
); );
VX_gpr_stage #( VX_gpr_stage #(
@@ -102,10 +106,10 @@ module VX_issue #(
.gpr_rsp_if (gpr_rsp_if) .gpr_rsp_if (gpr_rsp_if)
); );
VX_instr_demux instr_demux ( VX_dispatch dispatch (
.clk (clk), .clk (clk),
.reset (demux_reset), .reset (dispatch_reset),
.ibuffer_if (idmux_ib_if), .ibuffer_if (dispatch_if),
.gpr_rsp_if (gpr_rsp_if), .gpr_rsp_if (gpr_rsp_if),
.alu_req_if (alu_req_if), .alu_req_if (alu_req_if),
.lsu_req_if (lsu_req_if), .lsu_req_if (lsu_req_if),
@@ -131,11 +135,11 @@ module VX_issue #(
`SCOPE_ASSIGN (issue_imm, ibuffer_if.imm); `SCOPE_ASSIGN (issue_imm, ibuffer_if.imm);
`SCOPE_ASSIGN (issue_use_pc, ibuffer_if.use_PC); `SCOPE_ASSIGN (issue_use_pc, ibuffer_if.use_PC);
`SCOPE_ASSIGN (issue_use_imm, ibuffer_if.use_imm); `SCOPE_ASSIGN (issue_use_imm, ibuffer_if.use_imm);
`SCOPE_ASSIGN (scoreboard_delay, !sboard_wb_if.ready); `SCOPE_ASSIGN (scoreboard_delay, !scoreboard_if.ready);
`SCOPE_ASSIGN (execute_delay, !idmux_ib_if.ready); `SCOPE_ASSIGN (dispatch_delay, !dispatch_if.ready);
`SCOPE_ASSIGN (gpr_rsp_a, gpr_rsp_if.rs1_data); `SCOPE_ASSIGN (gpr_rs1, gpr_rsp_if.rs1_data);
`SCOPE_ASSIGN (gpr_rsp_b, gpr_rsp_if.rs2_data); `SCOPE_ASSIGN (gpr_rs2, gpr_rsp_if.rs2_data);
`SCOPE_ASSIGN (gpr_rsp_c, gpr_rsp_if.rs3_data); `SCOPE_ASSIGN (gpr_rs3, gpr_rsp_if.rs3_data);
`SCOPE_ASSIGN (writeback_valid, writeback_if.valid); `SCOPE_ASSIGN (writeback_valid, writeback_if.valid);
`SCOPE_ASSIGN (writeback_tmask, writeback_if.tmask); `SCOPE_ASSIGN (writeback_tmask, writeback_if.tmask);
`SCOPE_ASSIGN (writeback_wid, writeback_if.wid); `SCOPE_ASSIGN (writeback_wid, writeback_if.wid);
@@ -170,7 +174,7 @@ module VX_issue #(
if (decode_if.valid & !decode_if.ready) begin if (decode_if.valid & !decode_if.ready) begin
perf_ibf_stalls <= perf_ibf_stalls + `PERF_CTR_BITS'd1; perf_ibf_stalls <= perf_ibf_stalls + `PERF_CTR_BITS'd1;
end end
if (ibuffer_if.valid & !sboard_wb_if.ready) begin if (scoreboard_if.valid & !scoreboard_if.ready) begin
perf_scb_stalls <= perf_scb_stalls + `PERF_CTR_BITS'd1; perf_scb_stalls <= perf_scb_stalls + `PERF_CTR_BITS'd1;
end end
if (alu_req_if.valid & !alu_req_if.ready) begin if (alu_req_if.valid & !alu_req_if.ready) begin
@@ -204,7 +208,7 @@ module VX_issue #(
`endif `endif
`endif `endif
`ifdef DBG_PRINT_PIPELINE `ifdef DBG_TRACE_PIPELINE
always @(posedge clk) begin always @(posedge clk) begin
if (alu_req_if.valid && alu_req_if.ready) begin if (alu_req_if.valid && alu_req_if.ready) begin
dpi_trace("%d: core%0d-issue: wid=%0d, PC=%0h, ex=ALU, tmask=%b, rd=%0d, rs1_data=", dpi_trace("%d: core%0d-issue: wid=%0d, PC=%0h, ex=ALU, tmask=%b, rd=%0d, rs1_data=",
@@ -246,6 +250,8 @@ module VX_issue #(
`TRACE_ARRAY1D(gpu_req_if.rs1_data, `NUM_THREADS); `TRACE_ARRAY1D(gpu_req_if.rs1_data, `NUM_THREADS);
dpi_trace(", rs2_data="); dpi_trace(", rs2_data=");
`TRACE_ARRAY1D(gpu_req_if.rs2_data, `NUM_THREADS); `TRACE_ARRAY1D(gpu_req_if.rs2_data, `NUM_THREADS);
dpi_trace(", rs3_data=");
`TRACE_ARRAY1D(gpu_req_if.rs3_data, `NUM_THREADS);
dpi_trace("\n"); dpi_trace("\n");
end end
end end

View File

@@ -24,7 +24,7 @@ module VX_lsu_unit #(
localparam REQ_ASHIFT = `CLOG2(`DCACHE_WORD_SIZE); localparam REQ_ASHIFT = `CLOG2(`DCACHE_WORD_SIZE);
localparam ADDR_TYPEW = `NC_FLAG_BITS + `SM_ENABLE; localparam ADDR_TYPEW = `NC_TAG_BIT + `SM_ENABLE;
`STATIC_ASSERT(0 == (`IO_BASE_ADDR % MEM_ASHIFT), ("invalid parameter")) `STATIC_ASSERT(0 == (`IO_BASE_ADDR % MEM_ASHIFT), ("invalid parameter"))
`STATIC_ASSERT(0 == (`SMEM_BASE_ADDR % MEM_ASHIFT), ("invalid parameter")) `STATIC_ASSERT(0 == (`SMEM_BASE_ADDR % MEM_ASHIFT), ("invalid parameter"))
@@ -80,6 +80,8 @@ module VX_lsu_unit #(
wire lsu_valid = lsu_req_if.valid && ~fence_wait; wire lsu_valid = lsu_req_if.valid && ~fence_wait;
wire lsu_wb = lsu_req_if.wb | lsu_req_if.is_prefetch;
VX_pipe_register #( VX_pipe_register #(
.DATAW (1 + 1 + 1 + `NW_BITS + `NUM_THREADS + 32 + (`NUM_THREADS * 32) + (`NUM_THREADS * ADDR_TYPEW) + `INST_LSU_BITS + `NR_BITS + 1 + (`NUM_THREADS * 32)), .DATAW (1 + 1 + 1 + `NW_BITS + `NUM_THREADS + 32 + (`NUM_THREADS * 32) + (`NUM_THREADS * ADDR_TYPEW) + `INST_LSU_BITS + `NR_BITS + 1 + (`NUM_THREADS * 32)),
.RESETW (1) .RESETW (1)
@@ -87,8 +89,8 @@ module VX_lsu_unit #(
.clk (clk), .clk (clk),
.reset (reset), .reset (reset),
.enable (!stall_in), .enable (!stall_in),
.data_in ({lsu_valid, lsu_is_dup, lsu_req_if.is_prefetch, lsu_req_if.wid, lsu_req_if.tmask, lsu_req_if.PC, full_addr, lsu_addr_type, lsu_req_if.op_type, lsu_req_if.rd, lsu_req_if.wb | lsu_req_if.is_prefetch, lsu_req_if.store_data}), .data_in ({lsu_valid, lsu_is_dup, lsu_req_if.is_prefetch, lsu_req_if.wid, lsu_req_if.tmask, lsu_req_if.PC, full_addr, lsu_addr_type, lsu_req_if.op_type, lsu_req_if.rd, lsu_wb, lsu_req_if.store_data}),
.data_out ({req_valid, req_is_dup, req_is_prefetch, req_wid, req_tmask, req_pc, req_addr, req_addr_type, req_type, req_rd, req_wb, req_data}) .data_out ({req_valid, req_is_dup, req_is_prefetch, req_wid, req_tmask, req_pc, req_addr, req_addr_type, req_type, req_rd, req_wb, req_data})
); );
// Can accept new request? // Can accept new request?
@@ -103,6 +105,7 @@ module VX_lsu_unit #(
wire rsp_is_prefetch; wire rsp_is_prefetch;
`UNUSED_VAR (rsp_type) `UNUSED_VAR (rsp_type)
`UNUSED_VAR (rsp_is_prefetch)
reg [`LSUQ_SIZE-1:0][`NUM_THREADS-1:0] rsp_rem_mask; reg [`LSUQ_SIZE-1:0][`NUM_THREADS-1:0] rsp_rem_mask;
wire [`NUM_THREADS-1:0] rsp_rem_mask_n; wire [`NUM_THREADS-1:0] rsp_rem_mask_n;
@@ -132,7 +135,11 @@ module VX_lsu_unit #(
wire mbuf_pop = dcache_rsp_fire && (0 == rsp_rem_mask_n); wire mbuf_pop = dcache_rsp_fire && (0 == rsp_rem_mask_n);
assign mbuf_raddr = dcache_rsp_if.tag[ADDR_TYPEW +: `LSUQ_ADDR_BITS]; assign mbuf_raddr = dcache_rsp_if.tag[ADDR_TYPEW +: `LSUQ_ADDR_BITS];
`UNUSED_VAR (dcache_rsp_if.tag)
// do not writeback from software prefetch
wire req_wb2 = req_wb && ~req_is_prefetch;
VX_index_buffer #( VX_index_buffer #(
.DATAW (`NW_BITS + 32 + `NUM_THREADS + `NR_BITS + 1 + `INST_LSU_BITS + (`NUM_THREADS * REQ_ASHIFT) + 1 + 1), .DATAW (`NW_BITS + 32 + `NUM_THREADS + `NR_BITS + 1 + `INST_LSU_BITS + (`NUM_THREADS * REQ_ASHIFT) + 1 + 1),
@@ -143,8 +150,8 @@ module VX_lsu_unit #(
.write_addr (mbuf_waddr), .write_addr (mbuf_waddr),
.acquire_slot (mbuf_push), .acquire_slot (mbuf_push),
.read_addr (mbuf_raddr), .read_addr (mbuf_raddr),
.write_data ({req_wid, req_pc, req_tmask, req_rd, req_wb, req_type, req_offset, req_is_dup, req_is_prefetch}), .write_data ({req_wid, req_pc, req_tmask, req_rd, req_wb2, req_type, req_offset, req_is_dup, req_is_prefetch}),
.read_data ({rsp_wid, rsp_pc, rsp_tmask, rsp_rd, rsp_wb, rsp_type, rsp_offset, rsp_is_dup, rsp_is_prefetch}), .read_data ({rsp_wid, rsp_pc, rsp_tmask, rsp_rd, rsp_wb, rsp_type, rsp_offset, rsp_is_dup, rsp_is_prefetch}),
.release_addr (mbuf_raddr), .release_addr (mbuf_raddr),
.release_slot (mbuf_pop), .release_slot (mbuf_pop),
.full (mbuf_full), .full (mbuf_full),
@@ -276,8 +283,6 @@ module VX_lsu_unit #(
// send load commit // send load commit
// ignore responce from software prefetch
wire rsp_valid = (rsp_is_prefetch)? 0:(| dcache_rsp_if.valid);
wire load_rsp_stall = ~ld_commit_if.ready && ld_commit_if.valid; wire load_rsp_stall = ~ld_commit_if.ready && ld_commit_if.valid;
VX_pipe_register #( VX_pipe_register #(
@@ -287,12 +292,12 @@ module VX_lsu_unit #(
.clk (clk), .clk (clk),
.reset (reset), .reset (reset),
.enable (!load_rsp_stall), .enable (!load_rsp_stall),
.data_in ({rsp_valid, rsp_wid, rsp_tmask_qual, rsp_pc, rsp_rd, rsp_wb, rsp_data, mbuf_pop}), .data_in ({dcache_rsp_if.valid, rsp_wid, rsp_tmask_qual, rsp_pc, rsp_rd, rsp_wb, rsp_data, mbuf_pop}),
.data_out ({ld_commit_if.valid, ld_commit_if.wid, ld_commit_if.tmask, ld_commit_if.PC, ld_commit_if.rd, ld_commit_if.wb, ld_commit_if.data, ld_commit_if.eop}) .data_out ({ld_commit_if.valid, ld_commit_if.wid, ld_commit_if.tmask, ld_commit_if.PC, ld_commit_if.rd, ld_commit_if.wb, ld_commit_if.data, ld_commit_if.eop})
); );
// Can accept new cache response? // Can accept new cache response?
assign dcache_rsp_if.ready = rsp_is_prefetch ? 1 : ~load_rsp_stall; assign dcache_rsp_if.ready = ~load_rsp_stall;
// scope registration // scope registration
`SCOPE_ASSIGN (dcache_req_fire, dcache_req_fire); `SCOPE_ASSIGN (dcache_req_fire, dcache_req_fire);
@@ -333,7 +338,7 @@ module VX_lsu_unit #(
end end
`endif `endif
`ifdef DBG_PRINT_CORE_DCACHE `ifdef DBG_TRACE_CORE_DCACHE
wire dcache_req_fire_any = (| dcache_req_fire); wire dcache_req_fire_any = (| dcache_req_fire);
always @(posedge clk) begin always @(posedge clk) begin
if (lsu_req_if.valid && fence_wait) begin if (lsu_req_if.valid && fence_wait) begin
@@ -349,7 +354,7 @@ module VX_lsu_unit #(
`TRACE_ARRAY1D(dcache_req_if.data, `NUM_THREADS); `TRACE_ARRAY1D(dcache_req_if.data, `NUM_THREADS);
dpi_trace("\n"); dpi_trace("\n");
end else begin end else begin
dpi_trace("%d: D$%0d Rd Req: req_is_prefetch=%b, wid=%0d, PC=%0h, tmask=%b, addr=", $time, CORE_ID, req_is_prefetch, req_wid, req_pc, dcache_req_fire); dpi_trace("%d: D$%0d Rd Req: prefetch=%b, wid=%0d, PC=%0h, tmask=%b, addr=", $time, CORE_ID, req_is_prefetch, req_wid, req_pc, dcache_req_fire);
`TRACE_ARRAY1D(req_addr, `NUM_THREADS); `TRACE_ARRAY1D(req_addr, `NUM_THREADS);
dpi_trace(", tag=%0h, byteen=%0h, type=", req_tag, dcache_req_if.byteen); dpi_trace(", tag=%0h, byteen=%0h, type=", req_tag, dcache_req_if.byteen);
`TRACE_ARRAY1D(req_addr_type, `NUM_THREADS); `TRACE_ARRAY1D(req_addr_type, `NUM_THREADS);
@@ -357,7 +362,7 @@ module VX_lsu_unit #(
end end
end end
if (dcache_rsp_fire) begin if (dcache_rsp_fire) begin
dpi_trace("%d: D$%0d Rsp: rsp_is_prefetch=%b, wid=%0d, PC=%0h, tmask=%b, tag=%0h, rd=%0d, data=", dpi_trace("%d: D$%0d Rsp: prefetch=%b, wid=%0d, PC=%0h, tmask=%b, tag=%0h, rd=%0d, data=",
$time, CORE_ID, rsp_is_prefetch, rsp_wid, rsp_pc, dcache_rsp_if.tmask, mbuf_raddr, rsp_rd); $time, CORE_ID, rsp_is_prefetch, rsp_wid, rsp_pc, dcache_rsp_if.tmask, mbuf_raddr, rsp_rd);
`TRACE_ARRAY1D(dcache_rsp_if.data, `NUM_THREADS); `TRACE_ARRAY1D(dcache_rsp_if.data, `NUM_THREADS);
dpi_trace(", is_dup=%b\n", rsp_is_dup); dpi_trace(", is_dup=%b\n", rsp_is_dup);

View File

@@ -206,6 +206,7 @@ module VX_mem_unit # (
.LANES (`NUM_THREADS), .LANES (`NUM_THREADS),
.DATA_SIZE (4), .DATA_SIZE (4),
.TAG_IN_WIDTH (`DCACHE_CORE_TAG_WIDTH), .TAG_IN_WIDTH (`DCACHE_CORE_TAG_WIDTH),
.TAG_SEL_IDX (0), // SM flag
.TYPE ("P"), .TYPE ("P"),
.BUFFERED_REQ (2), .BUFFERED_REQ (2),
.BUFFERED_RSP (1) .BUFFERED_RSP (1)

View File

@@ -119,9 +119,9 @@
`define UP(x) (((x) > 0) ? (x) : 1) `define UP(x) (((x) > 0) ? (x) : 1)
`define RTRIM(x,s) x[$bits(x)-1:($bits(x)-s)] `define RTRIM(x, s) x[$bits(x)-1:($bits(x)-s)]
`define LTRIM(x,s) x[s-1:0] `define LTRIM(x, s) x[s-1:0]
`define TRACE_ARRAY1D(a, m) \ `define TRACE_ARRAY1D(a, m) \
dpi_trace("{"); \ dpi_trace("{"); \

View File

@@ -6,8 +6,8 @@ module VX_scoreboard #(
input wire clk, input wire clk,
input wire reset, input wire reset,
VX_ibuffer_if.scoreboard ibuffer_if, VX_ibuffer_if.slave ibuffer_if,
VX_writeback_if.scoreboard writeback_if VX_writeback_if.slave writeback_if
); );
reg [`NUM_WARPS-1:0][`NUM_REGS-1:0] inuse_regs, inuse_regs_n; reg [`NUM_WARPS-1:0][`NUM_REGS-1:0] inuse_regs, inuse_regs_n;
@@ -53,11 +53,12 @@ module VX_scoreboard #(
reg [31:0] deadlock_ctr; reg [31:0] deadlock_ctr;
wire [31:0] deadlock_timeout = 10000 * (1 ** (`L2_ENABLE + `L3_ENABLE)); wire [31:0] deadlock_timeout = 10000 * (1 ** (`L2_ENABLE + `L3_ENABLE));
always @(posedge clk) begin always @(posedge clk) begin
if (reset) begin if (reset) begin
deadlock_ctr <= 0; deadlock_ctr <= 0;
end else begin end else begin
`ifdef DBG_PRINT_PIPELINE `ifdef DBG_TRACE_PIPELINE
if (ibuffer_if.valid && ~ibuffer_if.ready) begin if (ibuffer_if.valid && ~ibuffer_if.ready) begin
dpi_trace("%d: *** core%0d-stall: wid=%0d, PC=%0h, rd=%0d, wb=%0d, inuse=%b%b%b%b\n", dpi_trace("%d: *** core%0d-stall: wid=%0d, PC=%0h, rd=%0d, wb=%0d, inuse=%b%b%b%b\n",
$time, CORE_ID, ibuffer_if.wid, ibuffer_if.PC, ibuffer_if.rd, ibuffer_if.wb, $time, CORE_ID, ibuffer_if.wid, ibuffer_if.PC, ibuffer_if.rd, ibuffer_if.wb,

View File

@@ -1,9 +1,9 @@
`ifndef VX_PRINT_INSTR `ifndef VX_TRACE_INSTR
`define VX_PRINT_INSTR `define VX_TRACE_INSTR
`include "VX_define.vh" `include "VX_define.vh"
task print_ex_type ( task trace_ex_type (
input [`EX_BITS-1:0] ex_type input [`EX_BITS-1:0] ex_type
); );
case (ex_type) case (ex_type)
@@ -16,7 +16,7 @@ task print_ex_type (
endcase endcase
endtask endtask
task print_ex_op ( task trace_ex_op (
input [`EX_BITS-1:0] ex_type, input [`EX_BITS-1:0] ex_type,
input [`INST_OP_BITS-1:0] op_type, input [`INST_OP_BITS-1:0] op_type,
input [`INST_MOD_BITS-1:0] op_mod input [`INST_MOD_BITS-1:0] op_mod
@@ -137,6 +137,7 @@ task print_ex_op (
`INST_GPU_JOIN: dpi_trace("JOIN"); `INST_GPU_JOIN: dpi_trace("JOIN");
`INST_GPU_BAR: dpi_trace("BAR"); `INST_GPU_BAR: dpi_trace("BAR");
`INST_GPU_PRED: dpi_trace("PRED"); `INST_GPU_PRED: dpi_trace("PRED");
`INST_GPU_TEX: dpi_trace("TEX");
default: dpi_trace("?"); default: dpi_trace("?");
endcase endcase
end end

View File

@@ -71,8 +71,8 @@ module VX_warp_sched #(
// activate first warp // activate first warp
warp_pcs[0] <= `STARTUP_ADDR; warp_pcs[0] <= `STARTUP_ADDR;
active_warps[0] <= '1; active_warps[0] <= 1;
thread_masks[0] <= '1; thread_masks[0] <= 1;
end else begin end else begin
if (warp_ctl_if.valid && warp_ctl_if.wspawn.valid) begin if (warp_ctl_if.valid && warp_ctl_if.wspawn.valid) begin
use_wspawn <= warp_ctl_if.wspawn.wmask & (~`NUM_WARPS'(1)); use_wspawn <= warp_ctl_if.wspawn.wmask & (~`NUM_WARPS'(1));

View File

@@ -12,7 +12,8 @@ module VX_writeback #(
VX_commit_if.slave csr_commit_if, VX_commit_if.slave csr_commit_if,
`ifdef EXT_F_ENABLE `ifdef EXT_F_ENABLE
VX_commit_if.slave fpu_commit_if, VX_commit_if.slave fpu_commit_if,
`endif `endif
VX_commit_if.slave gpu_commit_if,
// outputs // outputs
VX_writeback_if.master writeback_if VX_writeback_if.master writeback_if
@@ -22,9 +23,17 @@ module VX_writeback #(
localparam DATAW = `NW_BITS + 32 + `NUM_THREADS + `NR_BITS + (`NUM_THREADS * 32) + 1; localparam DATAW = `NW_BITS + 32 + `NUM_THREADS + `NR_BITS + (`NUM_THREADS * 32) + 1;
`ifdef EXT_F_ENABLE `ifdef EXT_F_ENABLE
`ifdef EXT_TEX_ENABLE
localparam NUM_RSPS = 5;
`else
localparam NUM_RSPS = 4;
`endif
`else
`ifdef EXT_TEX_ENABLE
localparam NUM_RSPS = 4; localparam NUM_RSPS = 4;
`else `else
localparam NUM_RSPS = 3; localparam NUM_RSPS = 3;
`endif
`endif `endif
wire wb_valid; wire wb_valid;
@@ -40,22 +49,27 @@ module VX_writeback #(
wire [NUM_RSPS-1:0] rsp_ready; wire [NUM_RSPS-1:0] rsp_ready;
wire stall; wire stall;
assign rsp_valid = { assign rsp_valid = {
`ifdef EXT_TEX_ENABLE
gpu_commit_if.valid && gpu_commit_if.wb,
`endif
csr_commit_if.valid && csr_commit_if.wb, csr_commit_if.valid && csr_commit_if.wb,
alu_commit_if.valid && alu_commit_if.wb, alu_commit_if.valid && alu_commit_if.wb,
`ifdef EXT_F_ENABLE `ifdef EXT_F_ENABLE
fpu_commit_if.valid && fpu_commit_if.wb, fpu_commit_if.valid && fpu_commit_if.wb,
`endif `endif
ld_commit_if.valid && ld_commit_if.wb ld_commit_if.valid && ld_commit_if.wb
}; };
assign rsp_data = { assign rsp_data = {
`ifdef EXT_TEX_ENABLE
{gpu_commit_if.wid, gpu_commit_if.PC, gpu_commit_if.tmask, gpu_commit_if.rd, gpu_commit_if.data, gpu_commit_if.eop},
`endif
{csr_commit_if.wid, csr_commit_if.PC, csr_commit_if.tmask, csr_commit_if.rd, csr_commit_if.data, csr_commit_if.eop}, {csr_commit_if.wid, csr_commit_if.PC, csr_commit_if.tmask, csr_commit_if.rd, csr_commit_if.data, csr_commit_if.eop},
{alu_commit_if.wid, alu_commit_if.PC, alu_commit_if.tmask, alu_commit_if.rd, alu_commit_if.data, alu_commit_if.eop}, {alu_commit_if.wid, alu_commit_if.PC, alu_commit_if.tmask, alu_commit_if.rd, alu_commit_if.data, alu_commit_if.eop},
`ifdef EXT_F_ENABLE `ifdef EXT_F_ENABLE
{fpu_commit_if.wid, fpu_commit_if.PC, fpu_commit_if.tmask, fpu_commit_if.rd, fpu_commit_if.data, fpu_commit_if.eop}, {fpu_commit_if.wid, fpu_commit_if.PC, fpu_commit_if.tmask, fpu_commit_if.rd, fpu_commit_if.data, fpu_commit_if.eop},
`endif `endif
{ ld_commit_if.wid, ld_commit_if.PC, ld_commit_if.tmask, ld_commit_if.rd, ld_commit_if.data, ld_commit_if.eop} { ld_commit_if.wid, ld_commit_if.PC, ld_commit_if.tmask, ld_commit_if.rd, ld_commit_if.data, ld_commit_if.eop}
}; };
@@ -82,8 +96,20 @@ module VX_writeback #(
`else `else
assign alu_commit_if.ready = rsp_ready[1] || ~alu_commit_if.wb; assign alu_commit_if.ready = rsp_ready[1] || ~alu_commit_if.wb;
assign csr_commit_if.ready = rsp_ready[2] || ~csr_commit_if.wb; assign csr_commit_if.ready = rsp_ready[2] || ~csr_commit_if.wb;
`ifdef EXT_TEX_ENABLE
assign gpu_commit_if.ready = rsp_ready[3] || ~gpu_commit_if.wb;
`endif
`endif
`ifdef EXT_TEX_ENABLE
`ifdef EXT_F_ENABLE
assign gpu_commit_if.ready = rsp_ready[4] || ~gpu_commit_if.wb;
`else
assign gpu_commit_if.ready = rsp_ready[3] || ~gpu_commit_if.wb;
`endif
`else
assign gpu_commit_if.ready = 1;
`endif `endif
assign stall = ~writeback_if.ready && writeback_if.valid; assign stall = ~writeback_if.ready && writeback_if.valid;

View File

@@ -201,7 +201,7 @@ module Vortex (
`SCOPE_ASSIGN (mem_rsp_tag, mem_rsp_tag); `SCOPE_ASSIGN (mem_rsp_tag, mem_rsp_tag);
`SCOPE_ASSIGN (busy, busy); `SCOPE_ASSIGN (busy, busy);
`ifdef DBG_PRINT_MEM `ifdef DBG_TRACE_MEM
always @(posedge clk) begin always @(posedge clk) begin
if (mem_req_valid && mem_req_ready) begin if (mem_req_valid && mem_req_ready) begin
if (mem_req_rw) if (mem_req_rw)

View File

@@ -158,7 +158,7 @@ module VX_avs_wrapper #(
.ready_out (mem_rsp_ready) .ready_out (mem_rsp_ready)
); );
`ifdef DBG_PRINT_AVS `ifdef DBG_TRACE_AVS
always @(posedge clk) begin always @(posedge clk) begin
if (mem_req_valid && mem_req_ready) begin if (mem_req_valid && mem_req_ready) begin
if (mem_req_rw) begin if (mem_req_rw) begin

View File

@@ -45,12 +45,14 @@ localparam CCI_DATA_WIDTH = $bits(t_ccip_clData);
localparam CCI_DATA_SIZE = CCI_DATA_WIDTH / 8; localparam CCI_DATA_SIZE = CCI_DATA_WIDTH / 8;
localparam CCI_ADDR_WIDTH = 32 - $clog2(CCI_DATA_SIZE); localparam CCI_ADDR_WIDTH = 32 - $clog2(CCI_DATA_SIZE);
localparam AVS_RD_QUEUE_SIZE = 4; localparam AVS_RD_QUEUE_SIZE = 4;
localparam AVS_REQ_TAGW_VX_ = `VX_MEM_TAG_WIDTH + $clog2(LMEM_DATA_WIDTH) - $clog2(`VX_MEM_DATA_WIDTH); localparam _VX_MEM_TAG_WIDTH = `VX_MEM_TAG_WIDTH;
localparam AVS_REQ_TAGW_VX = `MAX(`VX_MEM_TAG_WIDTH, AVS_REQ_TAGW_VX_); localparam _AVS_REQ_TAGW_VX = _VX_MEM_TAG_WIDTH + $clog2(LMEM_DATA_WIDTH) - $clog2(`VX_MEM_DATA_WIDTH);
localparam AVS_REQ_TAGW_CCI_ = CCI_ADDR_WIDTH + $clog2(LMEM_DATA_WIDTH) - $clog2(CCI_DATA_WIDTH); localparam _AVS_REQ_TAGW_VX2 = `MAX(_VX_MEM_TAG_WIDTH, _AVS_REQ_TAGW_VX);
localparam AVS_REQ_TAGW_CCI = `MAX(CCI_ADDR_WIDTH, AVS_REQ_TAGW_CCI_); localparam _AVS_REQ_TAGW_CCI = CCI_ADDR_WIDTH + $clog2(LMEM_DATA_WIDTH) - $clog2(CCI_DATA_WIDTH);
localparam AVS_REQ_TAGW = `MAX(AVS_REQ_TAGW_VX, AVS_REQ_TAGW_CCI); localparam _AVS_REQ_TAGW_CCI2 = `MAX(CCI_ADDR_WIDTH, _AVS_REQ_TAGW_CCI);
localparam AVS_REQ_TAGW = `MAX(_AVS_REQ_TAGW_VX2, _AVS_REQ_TAGW_CCI2);
localparam CCI_RD_WINDOW_SIZE = 8; localparam CCI_RD_WINDOW_SIZE = 8;
localparam CCI_RW_PENDING_SIZE= 256; localparam CCI_RW_PENDING_SIZE= 256;
@@ -185,36 +187,36 @@ always @(posedge clk) begin
case (mmio_hdr.address) case (mmio_hdr.address)
MMIO_IO_ADDR: begin MMIO_IO_ADDR: begin
cmd_io_addr <= t_ccip_clAddr'(cp2af_sRxPort.c0.data); cmd_io_addr <= t_ccip_clAddr'(cp2af_sRxPort.c0.data);
`ifdef DBG_PRINT_OPAE `ifdef DBG_TRACE_OPAE
dpi_trace("%d: MMIO_IO_ADDR: addr=%0h, data=0x%0h\n", $time, mmio_hdr.address, t_ccip_clAddr'(cp2af_sRxPort.c0.data)); dpi_trace("%d: MMIO_IO_ADDR: addr=%0h, data=0x%0h\n", $time, mmio_hdr.address, t_ccip_clAddr'(cp2af_sRxPort.c0.data));
`endif `endif
end end
MMIO_MEM_ADDR: begin MMIO_MEM_ADDR: begin
cmd_mem_addr <= $bits(cmd_mem_addr)'(cp2af_sRxPort.c0.data); cmd_mem_addr <= $bits(cmd_mem_addr)'(cp2af_sRxPort.c0.data);
`ifdef DBG_PRINT_OPAE `ifdef DBG_TRACE_OPAE
dpi_trace("%d: MMIO_MEM_ADDR: addr=%0h, data=0x%0h\n", $time, mmio_hdr.address, $bits(cmd_mem_addr)'(cp2af_sRxPort.c0.data)); dpi_trace("%d: MMIO_MEM_ADDR: addr=%0h, data=0x%0h\n", $time, mmio_hdr.address, $bits(cmd_mem_addr)'(cp2af_sRxPort.c0.data));
`endif `endif
end end
MMIO_DATA_SIZE: begin MMIO_DATA_SIZE: begin
cmd_data_size <= $bits(cmd_data_size)'(cp2af_sRxPort.c0.data); cmd_data_size <= $bits(cmd_data_size)'(cp2af_sRxPort.c0.data);
`ifdef DBG_PRINT_OPAE `ifdef DBG_TRACE_OPAE
dpi_trace("%d: MMIO_DATA_SIZE: addr=%0h, data=%0d\n", $time, mmio_hdr.address, $bits(cmd_data_size)'(cp2af_sRxPort.c0.data)); dpi_trace("%d: MMIO_DATA_SIZE: addr=%0h, data=%0d\n", $time, mmio_hdr.address, $bits(cmd_data_size)'(cp2af_sRxPort.c0.data));
`endif `endif
end end
MMIO_CMD_TYPE: begin MMIO_CMD_TYPE: begin
`ifdef DBG_PRINT_OPAE `ifdef DBG_TRACE_OPAE
dpi_trace("%d: MMIO_CMD_TYPE: addr=%0h, data=%0d\n", $time, mmio_hdr.address, $bits(cmd_type)'(cp2af_sRxPort.c0.data)); dpi_trace("%d: MMIO_CMD_TYPE: addr=%0h, data=%0d\n", $time, mmio_hdr.address, $bits(cmd_type)'(cp2af_sRxPort.c0.data));
`endif `endif
end end
`ifdef SCOPE `ifdef SCOPE
MMIO_SCOPE_WRITE: begin MMIO_SCOPE_WRITE: begin
`ifdef DBG_PRINT_OPAE `ifdef DBG_TRACE_OPAE
dpi_trace("%d: MMIO_SCOPE_WRITE: addr=%0h, data=%0h\n", $time, mmio_hdr.address, 64'(cp2af_sRxPort.c0.data)); dpi_trace("%d: MMIO_SCOPE_WRITE: addr=%0h, data=%0h\n", $time, mmio_hdr.address, 64'(cp2af_sRxPort.c0.data));
`endif `endif
end end
`endif `endif
default: begin default: begin
`ifdef DBG_PRINT_OPAE `ifdef DBG_TRACE_OPAE
dpi_trace("%d: Unknown MMIO Wr: addr=%0h, data=%0h\n", $time, mmio_hdr.address, $bits(cmd_data_size)'(cp2af_sRxPort.c0.data)); dpi_trace("%d: Unknown MMIO Wr: addr=%0h, data=%0h\n", $time, mmio_hdr.address, $bits(cmd_data_size)'(cp2af_sRxPort.c0.data));
`endif `endif
end end
@@ -241,7 +243,7 @@ always @(posedge clk) begin
16'h0008: mmio_tx.data <= 64'h0; // reserved 16'h0008: mmio_tx.data <= 64'h0; // reserved
MMIO_STATUS: begin MMIO_STATUS: begin
mmio_tx.data <= 64'({cout_q_dout, !cout_q_empty, 8'(state)}); mmio_tx.data <= 64'({cout_q_dout, !cout_q_empty, 8'(state)});
`ifdef DBG_PRINT_OPAE `ifdef DBG_TRACE_OPAE
if (state != STATE_WIDTH'(mmio_tx.data)) begin if (state != STATE_WIDTH'(mmio_tx.data)) begin
dpi_trace("%d: MMIO_STATUS: addr=%0h, state=%0d\n", $time, mmio_hdr.address, state); dpi_trace("%d: MMIO_STATUS: addr=%0h, state=%0d\n", $time, mmio_hdr.address, state);
end end
@@ -250,20 +252,20 @@ always @(posedge clk) begin
`ifdef SCOPE `ifdef SCOPE
MMIO_SCOPE_READ: begin MMIO_SCOPE_READ: begin
mmio_tx.data <= cmd_scope_rdata; mmio_tx.data <= cmd_scope_rdata;
`ifdef DBG_PRINT_OPAE `ifdef DBG_TRACE_OPAE
dpi_trace("%d: MMIO_SCOPE_READ: addr=%0h, data=%0h\n", $time, mmio_hdr.address, cmd_scope_rdata); dpi_trace("%d: MMIO_SCOPE_READ: addr=%0h, data=%0h\n", $time, mmio_hdr.address, cmd_scope_rdata);
`endif `endif
end end
`endif `endif
MMIO_DEV_CAPS: begin MMIO_DEV_CAPS: begin
mmio_tx.data <= dev_caps; mmio_tx.data <= dev_caps;
`ifdef DBG_PRINT_OPAE `ifdef DBG_TRACE_OPAE
dpi_trace("%d: MMIO_DEV_CAPS: addr=%0h, data=%0h\n", $time, mmio_hdr.address, dev_caps); dpi_trace("%d: MMIO_DEV_CAPS: addr=%0h, data=%0h\n", $time, mmio_hdr.address, dev_caps);
`endif `endif
end end
default: begin default: begin
mmio_tx.data <= 64'h0; mmio_tx.data <= 64'h0;
`ifdef DBG_PRINT_OPAE `ifdef DBG_TRACE_OPAE
dpi_trace("%d: Unknown MMIO Rd: addr=%0h\n", $time, mmio_hdr.address); dpi_trace("%d: Unknown MMIO Rd: addr=%0h\n", $time, mmio_hdr.address);
`endif `endif
end end
@@ -297,19 +299,19 @@ always @(posedge clk) begin
STATE_IDLE: begin STATE_IDLE: begin
case (cmd_type) case (cmd_type)
CMD_MEM_READ: begin CMD_MEM_READ: begin
`ifdef DBG_PRINT_OPAE `ifdef DBG_TRACE_OPAE
dpi_trace("%d: STATE READ: ia=%0h addr=%0h size=%0d\n", $time, cmd_io_addr, cmd_mem_addr, cmd_data_size); dpi_trace("%d: STATE READ: ia=%0h addr=%0h size=%0d\n", $time, cmd_io_addr, cmd_mem_addr, cmd_data_size);
`endif `endif
state <= STATE_READ; state <= STATE_READ;
end end
CMD_MEM_WRITE: begin CMD_MEM_WRITE: begin
`ifdef DBG_PRINT_OPAE `ifdef DBG_TRACE_OPAE
dpi_trace("%d: STATE WRITE: ia=%0h addr=%0h size=%0d\n", $time, cmd_io_addr, cmd_mem_addr, cmd_data_size); dpi_trace("%d: STATE WRITE: ia=%0h addr=%0h size=%0d\n", $time, cmd_io_addr, cmd_mem_addr, cmd_data_size);
`endif `endif
state <= STATE_WRITE; state <= STATE_WRITE;
end end
CMD_RUN: begin CMD_RUN: begin
`ifdef DBG_PRINT_OPAE `ifdef DBG_TRACE_OPAE
dpi_trace("%d: STATE START\n", $time); dpi_trace("%d: STATE START\n", $time);
`endif `endif
vx_reset <= 1; vx_reset <= 1;
@@ -324,7 +326,7 @@ always @(posedge clk) begin
STATE_READ: begin STATE_READ: begin
if (cmd_read_done) begin if (cmd_read_done) begin
state <= STATE_IDLE; state <= STATE_IDLE;
`ifdef DBG_PRINT_OPAE `ifdef DBG_TRACE_OPAE
dpi_trace("%d: STATE IDLE\n", $time); dpi_trace("%d: STATE IDLE\n", $time);
`endif `endif
end end
@@ -333,7 +335,7 @@ always @(posedge clk) begin
STATE_WRITE: begin STATE_WRITE: begin
if (cmd_write_done) begin if (cmd_write_done) begin
state <= STATE_IDLE; state <= STATE_IDLE;
`ifdef DBG_PRINT_OPAE `ifdef DBG_TRACE_OPAE
dpi_trace("%d: STATE IDLE\n", $time); dpi_trace("%d: STATE IDLE\n", $time);
`endif `endif
end end
@@ -345,7 +347,7 @@ always @(posedge clk) begin
if (cmd_run_done) begin if (cmd_run_done) begin
vx_started <= 0; vx_started <= 0;
state <= STATE_IDLE; state <= STATE_IDLE;
`ifdef DBG_PRINT_OPAE `ifdef DBG_TRACE_OPAE
dpi_trace("%d: STATE IDLE\n", $time); dpi_trace("%d: STATE IDLE\n", $time);
`endif `endif
end end
@@ -699,7 +701,7 @@ always @(posedge clk) begin
if (cci_rd_req_fire) begin if (cci_rd_req_fire) begin
cci_rd_req_addr <= cci_rd_req_addr + 1; cci_rd_req_addr <= cci_rd_req_addr + 1;
cci_rd_req_ctr <= cci_rd_req_ctr + 1; cci_rd_req_ctr <= cci_rd_req_ctr + 1;
`ifdef DBG_PRINT_OPAE `ifdef DBG_TRACE_OPAE
dpi_trace("%d: CCI Rd Req: addr=%0h, tag=%0h, rem=%0d, pending=%0d\n", $time, cci_rd_req_addr, cci_rd_req_tag, (cmd_data_size - cci_rd_req_ctr - 1), cci_pending_reads); dpi_trace("%d: CCI Rd Req: addr=%0h, tag=%0h, rem=%0d, pending=%0d\n", $time, cci_rd_req_addr, cci_rd_req_tag, (cmd_data_size - cci_rd_req_ctr - 1), cci_pending_reads);
`endif `endif
end end
@@ -709,13 +711,13 @@ always @(posedge clk) begin
if (CCI_RD_QUEUE_TAGW'(cci_rd_rsp_ctr) == CCI_RD_QUEUE_TAGW'(CCI_RD_WINDOW_SIZE-1)) begin if (CCI_RD_QUEUE_TAGW'(cci_rd_rsp_ctr) == CCI_RD_QUEUE_TAGW'(CCI_RD_WINDOW_SIZE-1)) begin
cci_mem_wr_req_addr_base <= cci_mem_wr_req_addr_base + CCI_ADDR_WIDTH'(CCI_RD_WINDOW_SIZE); cci_mem_wr_req_addr_base <= cci_mem_wr_req_addr_base + CCI_ADDR_WIDTH'(CCI_RD_WINDOW_SIZE);
end end
`ifdef DBG_PRINT_OPAE `ifdef DBG_TRACE_OPAE
dpi_trace("%d: CCI Rd Rsp: idx=%0d, ctr=%0d, data=%0h\n", $time, cci_rd_rsp_tag, cci_rd_rsp_ctr, cp2af_sRxPort.c0.data); dpi_trace("%d: CCI Rd Rsp: idx=%0d, ctr=%0d, data=%0h\n", $time, cci_rd_rsp_tag, cci_rd_rsp_ctr, cp2af_sRxPort.c0.data);
`endif `endif
end end
if (cci_rdq_pop) begin if (cci_rdq_pop) begin
`ifdef DBG_PRINT_OPAE `ifdef DBG_TRACE_OPAE
dpi_trace("%d: CCI Rd Queue Pop: pending=%0d\n", $time, cci_pending_reads); dpi_trace("%d: CCI Rd Queue Pop: pending=%0d\n", $time, cci_pending_reads);
`endif `endif
end end
@@ -856,13 +858,13 @@ begin
if (cci_wr_req_ctr == CCI_ADDR_WIDTH'(1)) begin if (cci_wr_req_ctr == CCI_ADDR_WIDTH'(1)) begin
cci_wr_req_done <= 1; cci_wr_req_done <= 1;
end end
`ifdef DBG_PRINT_OPAE `ifdef DBG_TRACE_OPAE
dpi_trace("%d: CCI Wr Req: addr=%0h, rem=%0d, pending=%0d, data=%0h\n", $time, cci_wr_req_addr, (cci_wr_req_ctr - 1), cci_pending_writes, af2cp_sTxPort.c1.data); dpi_trace("%d: CCI Wr Req: addr=%0h, rem=%0d, pending=%0d, data=%0h\n", $time, cci_wr_req_addr, (cci_wr_req_ctr - 1), cci_pending_writes, af2cp_sTxPort.c1.data);
`endif `endif
end end
if (cci_wr_rsp_fire) begin if (cci_wr_rsp_fire) begin
`ifdef DBG_PRINT_OPAE `ifdef DBG_TRACE_OPAE
dpi_trace("%d: CCI Wr Rsp: pending=%0d\n", $time, cci_pending_writes); dpi_trace("%d: CCI Wr Rsp: pending=%0d\n", $time, cci_pending_writes);
`endif `endif
end end

View File

@@ -509,7 +509,7 @@ module VX_bank #(
assign perf_mshr_stalls = mshr_alm_full; assign perf_mshr_stalls = mshr_alm_full;
`endif `endif
`ifdef DBG_PRINT_CACHE_BANK `ifdef DBG_TRACE_CACHE_BANK
wire crsq_fire = crsq_valid && crsq_ready; wire crsq_fire = crsq_valid && crsq_ready;
wire pipeline_stall = (mshr_valid || mem_rsp_valid || creq_valid) wire pipeline_stall = (mshr_valid || mem_rsp_valid || creq_valid)
&& ~(mshr_fire || mem_rsp_fire || creq_fire); && ~(mshr_fire || mem_rsp_fire || creq_fire);

View File

@@ -53,7 +53,7 @@
`define LINE_TAG_ADDR(x) x[`LINE_ADDR_WIDTH-1 : `LINE_SELECT_BITS] `define LINE_TAG_ADDR(x) x[`LINE_ADDR_WIDTH-1 : `LINE_SELECT_BITS]
`define CACHE_REQ_INFO_RNG CORE_TAG_WIDTH-1:(CORE_TAG_WIDTH-`NW_BITS-32) `define CACHE_REQ_INFO_RNG CORE_TAG_WIDTH-1 : (CORE_TAG_WIDTH-`DBG_CACHE_REQ_MDATAW)
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////

View File

@@ -119,7 +119,7 @@ module VX_data_access #(
`UNUSED_VAR (stall) `UNUSED_VAR (stall)
`ifdef DBG_PRINT_CACHE_DATA `ifdef DBG_TRACE_CACHE_DATA
always @(posedge clk) begin always @(posedge clk) begin
if (fill && ~stall) begin if (fill && ~stall) begin
dpi_trace("%d: cache%0d:%0d data-fill: addr=%0h, blk_addr=%0d, data=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr, BANK_ID), line_addr, fill_data); dpi_trace("%d: cache%0d:%0d data-fill: addr=%0h, blk_addr=%0d, data=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr, BANK_ID), line_addr, fill_data);

View File

@@ -202,7 +202,7 @@ module VX_miss_resrv #(
`UNUSED_VAR (lookup_valid) `UNUSED_VAR (lookup_valid)
`ifdef DBG_PRINT_CACHE_MSHR `ifdef DBG_TRACE_CACHE_MSHR
always @(posedge clk) begin always @(posedge clk) begin
if (allocate_fire || fill_valid || dequeue_fire || lookup_replay || lookup_valid || release_valid) begin if (allocate_fire || fill_valid || dequeue_fire || lookup_replay || lookup_valid || release_valid) begin
if (allocate_fire) if (allocate_fire)

View File

@@ -229,7 +229,7 @@ module VX_shared_mem #(
core_rsp_data_in = 'x; core_rsp_data_in = 'x;
bank_rsp_sel_n = bank_rsp_sel_r; bank_rsp_sel_n = bank_rsp_sel_r;
for (integer i = 0; i < NUM_BANKS; i++) begin for (integer i = 0; i < NUM_BANKS; i++) begin
if (per_bank_core_req_valid[i] if (core_req_read_mask[i]
&& (core_rsp_tag_in[CORE_TAG_ID_BITS-1:0] == per_bank_core_req_tag[i][CORE_TAG_ID_BITS-1:0])) begin && (core_rsp_tag_in[CORE_TAG_ID_BITS-1:0] == per_bank_core_req_tag[i][CORE_TAG_ID_BITS-1:0])) begin
core_rsp_valids_in[per_bank_core_req_tid[i]] = 1; core_rsp_valids_in[per_bank_core_req_tid[i]] = 1;
core_rsp_data_in[per_bank_core_req_tid[i]] = per_bank_core_rsp_data[i]; core_rsp_data_in[per_bank_core_req_tid[i]] = per_bank_core_rsp_data[i];
@@ -271,7 +271,7 @@ module VX_shared_mem #(
end end
`endif `endif
`ifdef DBG_PRINT_CACHE_BANK `ifdef DBG_TRACE_CACHE_BANK
reg is_multi_tag_req; reg is_multi_tag_req;
`IGNORE_UNUSED_BEGIN `IGNORE_UNUSED_BEGIN

View File

@@ -61,7 +61,7 @@ module VX_tag_access #(
`UNUSED_VAR (stall) `UNUSED_VAR (stall)
`ifdef DBG_PRINT_CACHE_TAG `ifdef DBG_TRACE_CACHE_TAG
always @(posedge clk) begin always @(posedge clk) begin
if (fill && ~stall) begin if (fill && ~stall) begin
dpi_trace("%d: cache%0d:%0d tag-fill: addr=%0h, blk_addr=%0d, tag_id=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr, BANK_ID), line_addr, line_tag); dpi_trace("%d: cache%0d:%0d tag-fill: addr=%0h, blk_addr=%0d, tag_id=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr, BANK_ID), line_addr, line_tag);

View File

@@ -3,8 +3,7 @@
`include "defs_div_sqrt_mvp.sv" `include "defs_div_sqrt_mvp.sv"
`TRACING_OFF `TRACING_OFF
module VX_fpu_fpnew module VX_fpu_fpnew #(
#(
parameter TAGW = 1, parameter TAGW = 1,
parameter FMULADD = 1, parameter FMULADD = 1,
parameter FDIVSQRT = 1, parameter FDIVSQRT = 1,

View File

@@ -12,9 +12,11 @@ interface VX_gpu_req_if();
wire [31:0] PC; wire [31:0] PC;
wire [31:0] next_PC; wire [31:0] next_PC;
wire [`INST_GPU_BITS-1:0] op_type; wire [`INST_GPU_BITS-1:0] op_type;
wire [`INST_MOD_BITS-1:0] op_mod;
wire [`NT_BITS-1:0] tid; wire [`NT_BITS-1:0] tid;
wire [`NUM_THREADS-1:0][31:0] rs1_data; wire [`NUM_THREADS-1:0][31:0] rs1_data;
wire [31:0] rs2_data; wire [`NUM_THREADS-1:0][31:0] rs2_data;
wire [`NUM_THREADS-1:0][31:0] rs3_data;
wire [`NR_BITS-1:0] rd; wire [`NR_BITS-1:0] rd;
wire wb; wire wb;
@@ -27,9 +29,11 @@ interface VX_gpu_req_if();
output PC, output PC,
output next_PC, output next_PC,
output op_type, output op_type,
output op_mod,
output tid, output tid,
output rs1_data, output rs1_data,
output rs2_data, output rs2_data,
output rs3_data,
output rd, output rd,
output wb, output wb,
input ready input ready
@@ -42,9 +46,11 @@ interface VX_gpu_req_if();
input PC, input PC,
input next_PC, input next_PC,
input op_type, input op_type,
input op_mod,
input tid, input tid,
input rs1_data, input rs1_data,
input rs2_data, input rs2_data,
input rs3_data,
input rd, input rd,
input wb, input wb,
output ready output ready

View File

@@ -76,20 +76,6 @@ interface VX_ibuffer_if ();
input wid_n, input wid_n,
output ready output ready
); );
modport scoreboard (
input valid,
input wid,
input PC,
input wb,
input rd,
input rd_n,
input rs1_n,
input rs2_n,
input rs3_n,
input wid_n,
output ready
);
endinterface endinterface

View File

@@ -0,0 +1,26 @@
`ifndef VX_TEX_CSR_IF
`define VX_TEX_CSR_IF
`include "VX_define.vh"
// Write-only CSR channel used to program the texture unit.
// The master side drives an address/data pair qualified by write_enable;
// the slave side only observes them (there is no back-pressure signal).
interface VX_tex_csr_if ();

    wire                      write_enable; // qualifies write_addr/write_data
    wire [`CSR_ADDR_BITS-1:0] write_addr;   // CSR being written
    wire [31:0]               write_data;   // value to store

    // producer of CSR writes
    modport master (output write_enable, write_addr, write_data);

    // consumer of CSR writes
    modport slave (input write_enable, write_addr, write_data);

endinterface
`endif

View File

@@ -0,0 +1,51 @@
`ifndef VX_TEX_REQ_IF
`define VX_TEX_REQ_IF
`include "VX_define.vh"
// Texture sampling request channel (valid/ready handshake).
// The master presents one request covering up to `NUM_THREADS lanes;
// the slave accepts it by asserting ready.
interface VX_tex_req_if ();

    // handshake
    wire                               valid;
    wire                               ready;

    // per-request metadata
    wire [`NW_BITS-1:0]                wid;
    wire [`NUM_THREADS-1:0]            tmask;  // one bit per lane
    wire [31:0]                        PC;
    wire [`NR_BITS-1:0]                rd;
    wire                               wb;

    // sampling operands
    wire [`NTEX_BITS-1:0]              unit;   // texture unit selector
    wire [1:0][`NUM_THREADS-1:0][31:0] coords; // two coordinate planes, one word per lane
    wire [`NUM_THREADS-1:0][31:0]      lod;    // one word per lane

    // request producer
    modport master (
        output valid, wid, tmask, PC, rd, wb, unit, coords, lod,
        input  ready
    );

    // request consumer
    modport slave (
        input  valid, wid, tmask, PC, rd, wb, unit, coords, lod,
        output ready
    );

endinterface
`endif

View File

@@ -0,0 +1,43 @@
`ifndef VX_TEX_RSP_IF
`define VX_TEX_RSP_IF
`include "VX_define.vh"
// Texture sampling response channel (valid/ready handshake).
// The master returns one 32-bit result per lane together with the
// request metadata; the slave accepts it by asserting ready.
interface VX_tex_rsp_if ();

    // handshake
    wire                          valid;
    wire                          ready;

    // per-response metadata
    wire [`NW_BITS-1:0]           wid;
    wire [`NUM_THREADS-1:0]       tmask; // one bit per lane
    wire [31:0]                   PC;
    wire [`NR_BITS-1:0]           rd;
    wire                          wb;

    // sampled result, one word per lane
    wire [`NUM_THREADS-1:0][31:0] data;

    // response producer
    modport master (
        output valid, wid, tmask, PC, rd, wb, data,
        input  ready
    );

    // response consumer
    modport slave (
        input  valid, wid, tmask, PC, rd, wb, data,
        output ready
    );

endinterface
`endif

View File

@@ -36,15 +36,6 @@ interface VX_writeback_if ();
output ready output ready
); );
modport scoreboard (
input valid,
input wid,
input PC,
input rd,
input eop,
output ready
);
endinterface endinterface
`endif `endif

View File

@@ -93,13 +93,13 @@ module VX_scope #(
CMD_SET_START: begin CMD_SET_START: begin
delay_val <= $bits(delay_val)'(cmd_data); delay_val <= $bits(delay_val)'(cmd_data);
cmd_start <= 1; cmd_start <= 1;
`ifdef DBG_PRINT_SCOPE `ifdef DBG_TRACE_SCOPE
dpi_trace("%d: *** scope: CMD_SET_START: delay_val=%0d\n", $time, $bits(delay_val)'(cmd_data)); dpi_trace("%d: *** scope: CMD_SET_START: delay_val=%0d\n", $time, $bits(delay_val)'(cmd_data));
`endif `endif
end end
CMD_SET_STOP: begin CMD_SET_STOP: begin
waddr_end <= $bits(waddr)'(cmd_data); waddr_end <= $bits(waddr)'(cmd_data);
`ifdef DBG_PRINT_SCOPE `ifdef DBG_TRACE_SCOPE
dpi_trace("%d: *** scope: CMD_SET_STOP: waddr_end=%0d\n", $time, $bits(waddr)'(cmd_data)); dpi_trace("%d: *** scope: CMD_SET_STOP: waddr_end=%0d\n", $time, $bits(waddr)'(cmd_data));
`endif `endif
end end
@@ -116,7 +116,7 @@ module VX_scope #(
delta <= 0; delta <= 0;
delay_cntr <= 0; delay_cntr <= 0;
start_time <= timestamp; start_time <= timestamp;
`ifdef DBG_PRINT_SCOPE `ifdef DBG_TRACE_SCOPE
dpi_trace("%d: *** scope: recording start - start_time=%0d\n", $time, timestamp); dpi_trace("%d: *** scope: recording start - start_time=%0d\n", $time, timestamp);
`endif `endif
end else begin end else begin
@@ -132,7 +132,7 @@ module VX_scope #(
recording <= 1; recording <= 1;
delta <= 0; delta <= 0;
start_time <= timestamp; start_time <= timestamp;
`ifdef DBG_PRINT_SCOPE `ifdef DBG_TRACE_SCOPE
dpi_trace("%d: *** scope: recording start - start_time=%0d\n", $time, timestamp); dpi_trace("%d: *** scope: recording start - start_time=%0d\n", $time, timestamp);
`endif `endif
end end
@@ -161,7 +161,7 @@ module VX_scope #(
if (stop if (stop
|| (waddr >= waddr_end)) begin || (waddr >= waddr_end)) begin
`ifdef DBG_PRINT_SCOPE `ifdef DBG_TRACE_SCOPE
dpi_trace("%d: *** scope: recording stop - waddr=(%0d, %0d)\n", $time, waddr, waddr_end); dpi_trace("%d: *** scope: recording stop - waddr=(%0d, %0d)\n", $time, waddr, waddr_end);
`endif `endif
waddr <= waddr; // keep last address waddr <= waddr; // keep last address
@@ -229,7 +229,7 @@ module VX_scope #(
assign bus_out = bus_out_r; assign bus_out = bus_out_r;
`ifdef DBG_PRINT_SCOPE `ifdef DBG_TRACE_SCOPE
always @(posedge clk) begin always @(posedge clk) begin
if (bus_read) begin if (bus_read) begin
dpi_trace("%d: scope-read: cmd=%0d, addr=%0d, value=%0h\n", $time, get_cmd, raddr, bus_out); dpi_trace("%d: scope-read: cmd=%0d, addr=%0d, value=%0h\n", $time, get_cmd, raddr, bus_out);

View File

@@ -0,0 +1,178 @@
`include "VX_tex_define.vh"
// Texture address generator (two pipeline stages).
//
// Stage 0: applies the wrap mode to each lane's (u, v) coordinate pair
//          (offset by +/- half a texel when filtering is enabled), and
//          computes the per-lane mip base address and row pitch.
// Stage 1: scales the wrapped coordinates to texel indices, forms the four
//          texel byte addresses {lo/hi v} x {lo/hi u} per lane, and derives
//          the per-axis blend weights.
//
// Parameters:
//   CORE_ID   - only used by the debug trace.
//   REQ_INFOW - width of the opaque side-band word carried from req to rsp.
//   NUM_REQS  - number of lanes processed in parallel.
//
// Both stages share a single stall (rsp_valid && ~rsp_ready); req_ready is
// simply the inverse of that stall.
module VX_tex_addr #(
    parameter CORE_ID   = 0,
    parameter REQ_INFOW = 1,
    parameter NUM_REQS  = 1
) (
    input wire clk,
    input wire reset,

    // inputs
    input wire                                        req_valid,
    input wire [NUM_REQS-1:0]                         req_tmask,
    input wire [1:0][NUM_REQS-1:0][31:0]              req_coords,   // [axis][lane]
    input wire [`TEX_FORMAT_BITS-1:0]                 req_format,
    input wire [`TEX_FILTER_BITS-1:0]                 req_filter,
    input wire [1:0][`TEX_WRAP_BITS-1:0]              req_wraps,    // [axis]
    input wire [`TEX_ADDR_BITS-1:0]                   req_baseaddr,
    input wire [NUM_REQS-1:0][`TEX_MIPOFF_BITS-1:0]   req_mipoff,
    input wire [NUM_REQS-1:0][1:0][`TEX_DIM_BITS-1:0] req_logdims,  // [lane][axis], log2 sizes
    input wire [REQ_INFOW-1:0]                        req_info,
    output wire                                       req_ready,

    // outputs
    output wire                                       rsp_valid,
    output wire [NUM_REQS-1:0]                        rsp_tmask,
    output wire [`TEX_FILTER_BITS-1:0]                rsp_filter,
    output wire [`TEX_STRIDE_BITS-1:0]                rsp_stride,
    output wire [NUM_REQS-1:0][3:0][31:0]             rsp_addr,     // [lane][texel]
    output wire [NUM_REQS-1:0][1:0][`BLEND_FRAC-1:0]  rsp_blends,   // [lane][axis]
    output wire [REQ_INFOW-1:0]                       rsp_info,
    input wire                                        rsp_ready
);
    `UNUSED_PARAM (CORE_ID)

    localparam PITCH_BITS = `MAX(`TEX_DIM_BITS, `TEX_STRIDE_BITS) + 1;
    localparam SCALED_U_W = `FIXED_INT + `TEX_STRIDE_BITS;
    localparam SCALED_X_W = (2 * `FIXED_INT);
    localparam SCALED_V_W = SCALED_X_W + `TEX_STRIDE_BITS;

    wire                        valid_s0;
    wire [NUM_REQS-1:0]         tmask_s0;
    wire [`TEX_FILTER_BITS-1:0] filter_s0;
    wire [REQ_INFOW-1:0]        req_info_s0;

    wire [NUM_REQS-1:0][1:0][`FIXED_FRAC-1:0]   clamped_lo, clamped_lo_s0;
    wire [NUM_REQS-1:0][1:0][`FIXED_FRAC-1:0]   clamped_hi, clamped_hi_s0;
    wire [`TEX_STRIDE_BITS-1:0]                 log_stride, log_stride_s0;
    wire [NUM_REQS-1:0][31:0]                   mip_addr, mip_addr_s0;
    wire [NUM_REQS-1:0][1:0][`TEX_DIM_BITS-1:0] log_dims_s0;
    wire [NUM_REQS-1:0][PITCH_BITS-1:0]         log_pitch, log_pitch_s0;

    wire stall_out;

    // stride (log2 of bytes per texel, derived from the format)

    VX_tex_stride #(
        .CORE_ID (CORE_ID)
    ) tex_stride (
        .format     (req_format),
        .log_stride (log_stride)
    );

    // addressing mode

    for (genvar i = 0; i < NUM_REQS; ++i) begin
        for (genvar j = 0; j < 2; ++j) begin
            // half a texel expressed in normalized coordinates
            wire [`FIXED_FRAC-1:0] delta = (`FIXED_HALF >> req_logdims[i][j]);
            // with filtering, sample half a texel on either side of the coordinate
            wire [31:0] coord_lo = req_filter ? (req_coords[j][i] - 32'(delta)) : req_coords[j][i];
            wire [31:0] coord_hi = req_filter ? (req_coords[j][i] + 32'(delta)) : req_coords[j][i];

            VX_tex_wrap #(
                .CORE_ID (CORE_ID)
            ) tex_wrap_lo (
                .wrap_i  (req_wraps[j]),
                .coord_i (coord_lo),
                .coord_o (clamped_lo[i][j])
            );

            VX_tex_wrap #(
                .CORE_ID (CORE_ID)
            ) tex_wrap_hi (
                .wrap_i  (req_wraps[j]),
                .coord_i (coord_hi),
                .coord_o (clamped_hi[i][j])
            );
        end
        // row pitch = width * bytes-per-texel, kept in log2 form
        assign log_pitch[i] = PITCH_BITS'(req_logdims[i][0]) + PITCH_BITS'(log_stride);
        assign mip_addr[i]  = req_baseaddr + 32'(req_mipoff[i]);
    end

    VX_pipe_register #(
        .DATAW  (1 + NUM_REQS + `TEX_FILTER_BITS + `TEX_STRIDE_BITS + REQ_INFOW + NUM_REQS * (PITCH_BITS + 2 * `TEX_DIM_BITS + 32 + 2 * 2 * `FIXED_FRAC)),
        .RESETW (1)
    ) pipe_reg0 (
        .clk      (clk),
        .reset    (reset),
        .enable   (~stall_out),
        .data_in  ({req_valid, req_tmask, req_filter, log_stride,    req_info,    log_pitch,    req_logdims, mip_addr,    clamped_lo,    clamped_hi}),
        .data_out ({valid_s0,  tmask_s0,  filter_s0,  log_stride_s0, req_info_s0, log_pitch_s0, log_dims_s0, mip_addr_s0, clamped_lo_s0, clamped_hi_s0})
    );

    // addresses generation

    wire [NUM_REQS-1:0][1:0][`FIXED_INT-1:0]  scaled_lo;
    wire [NUM_REQS-1:0][1:0][`FIXED_INT-1:0]  scaled_hi;
    wire [NUM_REQS-1:0][1:0][`BLEND_FRAC-1:0] blends;
    wire [NUM_REQS-1:0][3:0][31:0]            addr;

    for (genvar i = 0; i < NUM_REQS; ++i) begin
        for (genvar j = 0; j < 2; ++j) begin
            assign scaled_lo[i][j] = scale_to_dim(clamped_lo_s0[i][j], log_dims_s0[i][j]);
            assign scaled_hi[i][j] = scale_to_dim(clamped_hi_s0[i][j], log_dims_s0[i][j]);
            // The blend weight is the sub-texel position of the low sample,
            // i.e. the fractional part of the coordinate *after* scaling to
            // the texture dimension (not the low bits of the unscaled value).
            assign blends[i][j] = filter_s0 ? scale_to_blend(clamped_lo_s0[i][j], log_dims_s0[i][j]) : `BLEND_FRAC'(0);
        end
    end

    `UNUSED_VAR (log_pitch_s0)

    for (genvar i = 0; i < NUM_REQS; ++i) begin
        // byte offsets within a row (u) and of the row itself (v)
        wire [SCALED_U_W-1:0] offset_u_lo = SCALED_U_W'(scaled_lo[i][0]) << log_stride_s0;
        wire [SCALED_U_W-1:0] offset_u_hi = SCALED_U_W'(scaled_hi[i][0]) << log_stride_s0;

        wire [SCALED_V_W-1:0] offset_v_lo = SCALED_V_W'(scaled_lo[i][1]) << log_pitch_s0[i];
        wire [SCALED_V_W-1:0] offset_v_hi = SCALED_V_W'(scaled_hi[i][1]) << log_pitch_s0[i];

        wire [31:0] base_addr_lo = mip_addr_s0[i] + 32'(offset_v_lo);
        wire [31:0] base_addr_hi = mip_addr_s0[i] + 32'(offset_v_hi);

        // texel order: 0=(v_lo,u_lo) 1=(v_lo,u_hi) 2=(v_hi,u_lo) 3=(v_hi,u_hi)
        assign addr[i][0] = base_addr_lo + 32'(offset_u_lo);
        assign addr[i][1] = base_addr_lo + 32'(offset_u_hi);
        assign addr[i][2] = base_addr_hi + 32'(offset_u_lo);
        assign addr[i][3] = base_addr_hi + 32'(offset_u_hi);
    end

    assign stall_out = rsp_valid && ~rsp_ready;

    VX_pipe_register #(
        .DATAW  (1 + NUM_REQS + `TEX_FILTER_BITS + `TEX_STRIDE_BITS + (NUM_REQS * 4 * 32) + (2 * NUM_REQS * `BLEND_FRAC) + REQ_INFOW),
        .RESETW (1)
    ) pipe_reg1 (
        .clk      (clk),
        .reset    (reset),
        .enable   (~stall_out),
        .data_in  ({valid_s0,  tmask_s0,  filter_s0,  log_stride_s0, addr,     blends,     req_info_s0}),
        .data_out ({rsp_valid, rsp_tmask, rsp_filter, rsp_stride,    rsp_addr, rsp_blends, rsp_info})
    );

    assign req_ready = ~stall_out;

`ifdef DBG_TRACE_TEX
    // the low bits of the side-band word are expected to hold {wid, PC}
    wire [`NW_BITS-1:0] rsp_wid;
    wire [31:0]         rsp_PC;
    assign {rsp_wid, rsp_PC} = rsp_info[`NW_BITS+32-1:0];

    always @(posedge clk) begin
        if (rsp_valid && rsp_ready) begin
            dpi_trace("%d: core%0d-tex-addr: wid=%0d, PC=%0h, tmask=%b, req_filter=%0d, stride=%0d, addr=",
                $time, CORE_ID, rsp_wid, rsp_PC, rsp_tmask, rsp_filter, rsp_stride);
            `TRACE_ARRAY2D(rsp_addr, 4, NUM_REQS);
            dpi_trace("\n");
        end
    end
`endif

    // Scales a normalized fractional coordinate by 2^dim and returns the
    // integer part (the texel index along that axis).
    function logic [`FIXED_INT-1:0] scale_to_dim (input logic [`FIXED_FRAC-1:0] src,
                                                  input logic [`TEX_DIM_BITS-1:0] dim);
    `IGNORE_WARNINGS_BEGIN
        logic [`FIXED_BITS-1:0] out;
    `IGNORE_WARNINGS_END
        out = `FIXED_BITS'(src) << dim;
        return out[`FIXED_FRAC +: `FIXED_INT];
    endfunction

    // Scales a normalized fractional coordinate by 2^dim and returns the top
    // `BLEND_FRAC bits of the remaining fraction (the sub-texel position).
    function logic [`BLEND_FRAC-1:0] scale_to_blend (input logic [`FIXED_FRAC-1:0] src,
                                                     input logic [`TEX_DIM_BITS-1:0] dim);
    `IGNORE_WARNINGS_BEGIN
        logic [`FIXED_BITS-1:0] out;
    `IGNORE_WARNINGS_END
        out = `FIXED_BITS'(src) << dim;
        return out[`FIXED_FRAC-1 -: `BLEND_FRAC];
    endfunction

endmodule

View File

@@ -0,0 +1,39 @@
`ifndef VX_TEX_DEFINE
`define VX_TEX_DEFINE
`include "VX_define.vh"
// ---- fixed-point coordinate format ----
// total width, fractional width and the derived integer width
`define FIXED_BITS 32
`define FIXED_FRAC 20
`define FIXED_INT (`FIXED_BITS - `FIXED_FRAC)
// 1.0, 0.5 and the fraction mask in that format
`define FIXED_ONE (2 ** `FIXED_FRAC)
`define FIXED_HALF (`FIXED_ONE >> 1)
`define FIXED_MASK (`FIXED_ONE - 1)
// ---- field widths of the texture state / request signals ----
`define TEX_ADDR_BITS 32
`define TEX_FORMAT_BITS 3
`define TEX_WRAP_BITS 2
`define TEX_DIM_BITS 4
`define TEX_FILTER_BITS 1
`define TEX_MIPOFF_BITS (2*12+1)
`define TEX_STRIDE_BITS 2
`define TEX_LOD_BITS 4
`define TEX_MIP_BITS (`NTEX_BITS + `TEX_LOD_BITS)
// ---- wrap mode encodings ----
`define TEX_WRAP_CLAMP 0
`define TEX_WRAP_REPEAT 1
`define TEX_WRAP_MIRROR 2
// ---- blend weight format: fractional width and the value representing 1.0 ----
`define BLEND_FRAC 8
`define BLEND_ONE (2 ** `BLEND_FRAC)
// ---- texel format encodings (sized to `TEX_FORMAT_BITS) ----
`define TEX_FORMAT_R8G8B8A8 `TEX_FORMAT_BITS'(0)
`define TEX_FORMAT_R5G6B5 `TEX_FORMAT_BITS'(1)
`define TEX_FORMAT_R4G4B4A4 `TEX_FORMAT_BITS'(2)
`define TEX_FORMAT_L8A8 `TEX_FORMAT_BITS'(3)
`define TEX_FORMAT_L8 `TEX_FORMAT_BITS'(4)
`define TEX_FORMAT_A8 `TEX_FORMAT_BITS'(5)
`endif

View File

@@ -0,0 +1,58 @@
`include "VX_tex_define.vh"
// Texel format decoder (purely combinational).
//
// Expands a raw texel, whose layout is selected by `format`, into four
// 8-bit channels packed as texel_out[7:0], [15:8], [23:16] and [31:24].
// Narrow channels are widened by replicating their most significant bits
// into the vacated low bits so that full-scale inputs map to 8'hff.
//
// Parameters:
//   CORE_ID - unused here; kept for a uniform instantiation interface.
module VX_tex_format #(
    parameter CORE_ID = 0
) (
    input wire [`TEX_FORMAT_BITS-1:0] format,
    input wire [31:0]                 texel_in,
    output wire [31:0]                texel_out
);
    `UNUSED_PARAM (CORE_ID)

    reg [31:0] texel_out_r;

    always @(*) begin
        case (format)
        `TEX_FORMAT_R8G8B8A8: begin
            // already 8 bits per channel: pass through
            texel_out_r[07:00] = texel_in[7:0];
            texel_out_r[15:08] = texel_in[15:8];
            texel_out_r[23:16] = texel_in[23:16];
            texel_out_r[31:24] = texel_in[31:24];
        end
        `TEX_FORMAT_R5G6B5: begin
            // 5/6/5-bit fields widened to 8 bits; no alpha stored -> opaque
            texel_out_r[07:00] = {texel_in[15:11], texel_in[15:13]};
            texel_out_r[15:08] = {texel_in[10:5], texel_in[10:9]};
            texel_out_r[23:16] = {texel_in[4:0], texel_in[4:2]};
            texel_out_r[31:24] = 8'hff;
        end
        `TEX_FORMAT_R4G4B4A4: begin
            // every 4-bit field is widened by duplicating its own nibble
            texel_out_r[07:00] = {2{texel_in[11:8]}};
            texel_out_r[15:08] = {2{texel_in[7:4]}};
            texel_out_r[23:16] = {2{texel_in[3:0]}};
            texel_out_r[31:24] = {2{texel_in[15:12]}};
        end
        `TEX_FORMAT_L8A8: begin
            // luminance copied to the three low channels, alpha from byte 1
            texel_out_r[07:00] = texel_in[7:0];
            texel_out_r[15:08] = texel_in[7:0];
            texel_out_r[23:16] = texel_in[7:0];
            texel_out_r[31:24] = texel_in[15:8];
        end
        `TEX_FORMAT_L8: begin
            // luminance only -> opaque
            texel_out_r[07:00] = texel_in[7:0];
            texel_out_r[15:08] = texel_in[7:0];
            texel_out_r[23:16] = texel_in[7:0];
            texel_out_r[31:24] = 8'hff;
        end
        //`TEX_FORMAT_A8
        default: begin
            // alpha only: low channels forced to zero
            texel_out_r[07:00] = 0;
            texel_out_r[15:08] = 0;
            texel_out_r[23:16] = 0;
            texel_out_r[31:24] = texel_in[7:0];
        end
        endcase
    end

    assign texel_out = texel_out_r;

endmodule

View File

@@ -0,0 +1,16 @@
`include "VX_tex_define.vh"
// Per-channel weighted blend of two 4x8-bit texels:
//   out[c] = (in1[c] * alpha + in2[c] * beta) >> 8
// Only bits [15:8] of the 17-bit accumulator are returned.
module VX_tex_lerp (
    input wire [3:0][7:0]  in1,
    input wire [3:0][7:0]  in2,
    input wire [8:0]       alpha,   // weight applied to in1
    input wire [7:0]       beta,    // weight applied to in2
    output wire [3:0][7:0] out
);
    for (genvar ch = 0; ch < 4; ++ch) begin
        // both products are evaluated at the accumulator width
        wire [16:0] weighted1 = in1[ch] * alpha;
        wire [16:0] weighted2 = in2[ch] * beta;
        wire [16:0] acc       = weighted1 + weighted2;
        `UNUSED_VAR (acc)
        assign out[ch] = acc[15:8];
    end

endmodule

View File

@@ -0,0 +1,295 @@
`include "VX_tex_define.vh"
// Texture memory stage.
//
// Accepts a request holding four texel byte addresses per lane, queues it,
// issues read-only dcache requests one texel index at a time (index 0 only,
// or indices 0..3 when the queued filter field is non-zero), gathers the
// dcache responses and emits one response with four 32-bit texels per lane.
//
// Parameters:
//   CORE_ID   - only used by the debug trace.
//   REQ_INFOW - width of the opaque side-band word carried from req to rsp.
//   NUM_REQS  - number of lanes.
module VX_tex_mem #(
parameter CORE_ID = 0,
parameter REQ_INFOW = 1,
parameter NUM_REQS = 1
) (
input wire clk,
input wire reset,
// memory interface
VX_dcache_req_if.master dcache_req_if,
VX_dcache_rsp_if.slave dcache_rsp_if,
// inputs
input wire req_valid,
input wire [NUM_REQS-1:0] req_tmask,
input wire [`TEX_FILTER_BITS-1:0] req_filter,
input wire [`TEX_STRIDE_BITS-1:0] req_stride,
input wire [NUM_REQS-1:0][3:0][31:0] req_addr,
input wire [REQ_INFOW-1:0] req_info,
output wire req_ready,
// outputs
output wire rsp_valid,
output wire [NUM_REQS-1:0] rsp_tmask,
output wire [NUM_REQS-1:0][3:0][31:0] rsp_data,
output wire [REQ_INFOW-1:0] rsp_info,
input wire rsp_ready
);
`UNUSED_PARAM (CORE_ID)
// wide enough to count one response per lane for each of the 4 texels
localparam RSP_CTR_W = $clog2(NUM_REQS * 4 + 1);
wire [3:0] dup_reqs;
wire [3:0][NUM_REQS-1:0][29:0] req_addr_w;
wire [3:0][NUM_REQS-1:0][1:0] align_offs;
// reorder address into quads
// ([lane][texel] -> [texel][lane]), split into a 30-bit word address and a
// 2-bit byte offset within the word
for (genvar i = 0; i < NUM_REQS; ++i) begin
for (genvar j = 0; j < 4; ++j) begin
assign req_addr_w[j][i] = req_addr[i][j][31:2];
assign align_offs[j][i] = req_addr[i][j][1:0];
end
end
// find duplicate addresses
// dup_reqs[i] is set when lane 0 is active and every active lane targets the
// same word as lane 0 for texel i (inactive lanes always count as matching)
for (genvar i = 0; i < 4; ++i) begin
wire [NUM_REQS-1:0] addr_matches;
for (genvar j = 0; j < NUM_REQS; j++) begin
assign addr_matches[j] = (req_addr_w[i][0] == req_addr_w[i][j]) || ~req_tmask[j];
end
assign dup_reqs[i] = req_tmask[0] && (& addr_matches);
end
// save request addresses into fifo
wire reqq_push, reqq_pop, reqq_empty, reqq_full;
wire [3:0][NUM_REQS-1:0][29:0] q_req_addr;
wire [NUM_REQS-1:0] q_req_tmask;
wire [`TEX_FILTER_BITS-1:0] q_req_filter;
wire [REQ_INFOW-1:0] q_req_info;
wire [`TEX_STRIDE_BITS-1:0] q_req_stride;
wire [3:0][NUM_REQS-1:0][1:0] q_align_offs;
wire [3:0] q_dup_reqs;
assign reqq_push = req_valid && req_ready;
// the head entry stays in the queue while its texels are fetched and is
// popped only once its last response is handed to the output register
VX_fifo_queue #(
.DATAW ((NUM_REQS * 4 * 30) + NUM_REQS + REQ_INFOW + `TEX_FILTER_BITS + `TEX_STRIDE_BITS + (4 * NUM_REQS * 2) + 4),
.SIZE (`LSUQ_SIZE),
.OUT_REG (1)
) req_queue (
.clk (clk),
.reset (reset),
.push (reqq_push),
.pop (reqq_pop),
.data_in ({req_addr_w, req_tmask, req_info, req_filter, req_stride, align_offs, dup_reqs}),
.data_out ({q_req_addr, q_req_tmask, q_req_info, q_req_filter, q_req_stride, q_align_offs, q_dup_reqs}),
.empty (reqq_empty),
.full (reqq_full),
`UNUSED_PIN (alm_full),
`UNUSED_PIN (alm_empty),
`UNUSED_PIN (size)
);
// can take more requests?
assign req_ready = ~reqq_full;
///////////////////////////////////////////////////////////////////////////
wire req_texel_valid;
wire sent_all_ready, last_texel_sent;
wire req_texel_dup;
wire [NUM_REQS-1:0][29:0] req_texel_addr;
reg [1:0] req_texel_idx;
reg req_texels_done;
// index of the texel currently being requested; advances once all lanes of
// the current texel have been sent and restarts after the last one
always @(posedge clk) begin
if (reset || last_texel_sent) begin
req_texel_idx <= 0;
end else if (req_texel_valid && sent_all_ready) begin
req_texel_idx <= req_texel_idx + 1;
end
end
// set after the last texel of the head entry was sent; blocks further
// requests until that entry is popped
always @(posedge clk) begin
if (reset || reqq_pop) begin
req_texels_done <= 0;
end else if (last_texel_sent) begin
req_texels_done <= 1;
end
end
assign req_texel_valid = ~reqq_empty && ~req_texels_done;
assign req_texel_addr = q_req_addr[req_texel_idx];
assign req_texel_dup = q_dup_reqs[req_texel_idx];
// four texels are fetched when the filter field is non-zero, otherwise one
wire is_last_texel = (req_texel_idx == (q_req_filter ? 3 : 0));
assign last_texel_sent = req_texel_valid && sent_all_ready && is_last_texel;
// DCache Request
reg [NUM_REQS-1:0] texel_sent_mask;
wire [NUM_REQS-1:0] dcache_req_fire = dcache_req_if.valid & dcache_req_if.ready;
wire dcache_req_fire_any = (| dcache_req_fire);
// true when every lane is either ready now, already sent, or masked off;
// for a duplicate texel only lane 0 has to be ready
assign sent_all_ready = (&(dcache_req_if.ready | texel_sent_mask | ~q_req_tmask))
|| (req_texel_dup & dcache_req_if.ready[0]);
// remembers which lanes were already accepted for the current texel
always @(posedge clk) begin
if (reset || sent_all_ready) begin
texel_sent_mask <= 0;
end else begin
texel_sent_mask <= texel_sent_mask | dcache_req_fire;
end
end
// for a duplicate texel only lane 0 issues a request
wire [NUM_REQS-1:0] req_dup_mask = {{(NUM_REQS-1){~req_texel_dup}}, 1'b1};
assign dcache_req_if.valid = {NUM_REQS{req_texel_valid}} & q_req_tmask & req_dup_mask & ~texel_sent_mask;
assign dcache_req_if.rw = {NUM_REQS{1'b0}};
assign dcache_req_if.addr = req_texel_addr;
assign dcache_req_if.byteen = {NUM_REQS{4'b1111}};
assign dcache_req_if.data = 'x;
// the texel index travels in the low bits of the tag so responses can be
// matched back to their texel slot
`ifdef DBG_CACHE_REQ_INFO
assign dcache_req_if.tag = {NUM_REQS{q_req_info[`DBG_CACHE_REQ_MDATAW-1:0], req_texel_idx}};
`else
assign dcache_req_if.tag = {NUM_REQS{req_texel_idx}};
`endif
// Dcache Response
reg [3:0][NUM_REQS-1:0][31:0] rsp_texels, rsp_texels_n;
wire [NUM_REQS-1:0][3:0][31:0] rsp_texels_qual;
reg [NUM_REQS-1:0][31:0] rsp_data_qual;
reg [RSP_CTR_W-1:0] rsp_rem_ctr, rsp_rem_ctr_init;
wire [RSP_CTR_W-1:0] rsp_rem_ctr_n;
wire dcache_rsp_fire;
wire [1:0] rsp_texel_idx;
wire rsp_texel_dup;
assign rsp_texel_idx = dcache_rsp_if.tag[1:0];
`UNUSED_VAR (dcache_rsp_if.tag)
assign rsp_texel_dup = q_dup_reqs[rsp_texel_idx];
assign dcache_rsp_fire = dcache_rsp_if.valid && dcache_rsp_if.ready;
for (genvar i = 0; i < NUM_REQS; i++) begin
// NOTE(review): for a duplicate texel every lane takes data[0], but the
// result is still gated by this lane's own response tmask bit - confirm the
// dcache sets tmask for all lanes in that case, otherwise lanes > 0 read 0.
wire [31:0] src_mask = {32{dcache_rsp_if.tmask[i]}};
wire [31:0] src_data = ((i == 0 || rsp_texel_dup) ? dcache_rsp_if.data[0] : dcache_rsp_if.data[i]) & src_mask;
reg [31:0] rsp_data_shifted;
// move the addressed byte/half-word down to bit 0 using the saved byte
// offset: bit 1 selects the upper half-word, bit 0 the upper byte of it
always @(*) begin
rsp_data_shifted[31:16] = src_data[31:16];
rsp_data_shifted[15:0] = q_align_offs[rsp_texel_idx][i][1] ? src_data[31:16] : src_data[15:0];
rsp_data_shifted[7:0] = q_align_offs[rsp_texel_idx][i][0] ? rsp_data_shifted[15:8] : rsp_data_shifted[7:0];
end
// keep only as many bits as the texel stride covers (1, 2 or 4 bytes)
always @(*) begin
case (q_req_stride)
0: rsp_data_qual[i] = 32'(rsp_data_shifted[7:0]);
1: rsp_data_qual[i] = 32'(rsp_data_shifted[15:0]);
default: rsp_data_qual[i] = rsp_data_shifted;
endcase
end
end
// merge the incoming lanes into the slot of the responding texel; lanes
// absent from this response contribute zero and leave earlier data intact
always @(*) begin
rsp_texels_n = rsp_texels;
rsp_texels_n[rsp_texel_idx] |= rsp_data_qual;
end
always @(posedge clk) begin
if (reset || reqq_pop) begin
rsp_texels <= '0;
end else if (dcache_rsp_fire) begin
rsp_texels <= rsp_texels_n;
end
end
// number of lane responses expected for the head entry: per texel either 1
// (duplicate) or the number of active lanes; texels 1..3 only when filtering
always @(*) begin
rsp_rem_ctr_init = RSP_CTR_W'($countones(q_dup_reqs[0] ? NUM_REQS'(1) : q_req_tmask));
if (q_req_filter) begin
for (integer i = 1; i < 4; ++i) begin
rsp_rem_ctr_init += RSP_CTR_W'($countones(q_dup_reqs[i] ? NUM_REQS'(1) : q_req_tmask));
end
end
end
assign rsp_rem_ctr_n = rsp_rem_ctr - RSP_CTR_W'($countones(dcache_rsp_if.tmask));
// loaded when a dcache request fires while the counter is zero, then
// decremented by the number of lanes in each response
always @(posedge clk) begin
if (reset) begin
rsp_rem_ctr <= 0;
end else begin
if (dcache_req_fire_any && 0 == rsp_rem_ctr) begin
rsp_rem_ctr <= rsp_rem_ctr_init;
end else if (dcache_rsp_fire) begin
rsp_rem_ctr <= rsp_rem_ctr_n;
end
end
end
// transpose back to [lane][texel] for the output
for (genvar i = 0; i < NUM_REQS; ++i) begin
for (genvar j = 0; j < 4; ++j) begin
assign rsp_texels_qual[i][j] = rsp_texels_n[j][i];
end
end
wire stall_out = rsp_valid && ~rsp_ready;
wire is_last_rsp = (0 == rsp_rem_ctr_n);
wire rsp_texels_done = dcache_rsp_fire && is_last_rsp;
assign reqq_pop = rsp_texels_done && ~stall_out;
VX_pipe_register #(
.DATAW (1 + NUM_REQS + REQ_INFOW + (4 * NUM_REQS * 32)),
.RESETW (1)
) rsp_pipe_reg (
.clk (clk),
.reset (reset),
.enable (~stall_out),
.data_in ({rsp_texels_done, q_req_tmask, q_req_info, rsp_texels_qual}),
.data_out ({rsp_valid, rsp_tmask, rsp_info, rsp_data})
);
// Can accept new cache response?
// only the final response of an entry is held back while the output stalls
assign dcache_rsp_if.ready = ~(is_last_rsp && stall_out);
`ifdef DBG_TRACE_TEX
// the low bits of each side-band word are expected to hold {wid, PC}
wire [`NW_BITS-1:0] q_req_wid, req_wid, rsp_wid;
wire [31:0] q_req_PC, req_PC, rsp_PC;
assign {q_req_wid, q_req_PC} = q_req_info[`NW_BITS+32-1:0];
assign {req_wid, req_PC} = req_info[`NW_BITS+32-1:0];
assign {rsp_wid, rsp_PC} = rsp_info[`NW_BITS+32-1:0];
always @(posedge clk) begin
if (dcache_req_fire_any) begin
dpi_trace("%d: core%0d-tex-cache-req: wid=%0d, PC=%0h, tmask=%b, tag=%0h, addr=",
$time, CORE_ID, q_req_wid, q_req_PC, dcache_req_fire, req_texel_idx);
`TRACE_ARRAY1D(req_texel_addr, NUM_REQS);
dpi_trace(", is_dup=%b\n", req_texel_dup);
end
if (dcache_rsp_fire) begin
dpi_trace("%d: core%0d-tex-cache-rsp: wid=%0d, PC=%0h, tmask=%b, tag=%0h, data=",
$time, CORE_ID, q_req_wid, q_req_PC, dcache_rsp_if.tmask, rsp_texel_idx);
`TRACE_ARRAY1D(dcache_rsp_if.data, NUM_REQS);
dpi_trace("\n");
end
if (req_valid && req_ready) begin
dpi_trace("%d: core%0d-tex-mem-req: wid=%0d, PC=%0h, tmask=%b, filter=%0d, stride=%0d, addr=",
$time, CORE_ID, req_wid, req_PC, req_tmask, req_filter, req_stride);
`TRACE_ARRAY2D(req_addr, 4, NUM_REQS);
dpi_trace("\n");
end
if (rsp_valid && rsp_ready) begin
dpi_trace("%d: core%0d-tex-mem-rsp: wid=%0d, PC=%0h, tmask=%b, data=",
$time, CORE_ID, rsp_wid, rsp_PC, rsp_tmask);
`TRACE_ARRAY2D(rsp_data, 4, NUM_REQS);
dpi_trace("\n");
end
end
`endif
endmodule

View File

@@ -0,0 +1,146 @@
`include "VX_tex_define.vh"
// Texture sampler (two pipeline stages).
//
// Stage 0: decodes the four fetched texels of every lane to 8-bit channels
//          and blends them pairwise along the first axis using
//          req_blends[lane][0] (texels 0/1 -> "ul", texels 2/3 -> "uh").
// Stage 1: blends the two intermediate results along the second axis using
//          req_blends[lane][1].
//
// Parameters:
//   CORE_ID   - forwarded to VX_tex_format and used by the debug trace.
//   REQ_INFOW - width of the opaque side-band word carried from req to rsp.
//   NUM_REQS  - number of lanes.
//
// Both stages share a single stall (rsp_valid && ~rsp_ready); req_ready is
// simply the inverse of that stall.
module VX_tex_sampler #(
    parameter CORE_ID   = 0,
    parameter REQ_INFOW = 1,
    parameter NUM_REQS  = 1
) (
    input wire clk,
    input wire reset,

    // inputs
    input wire                                       req_valid,
    input wire [NUM_REQS-1:0]                        req_tmask,
    input wire [`TEX_FORMAT_BITS-1:0]                req_format,
    input wire [NUM_REQS-1:0][1:0][`BLEND_FRAC-1:0]  req_blends,   // [lane][axis]
    input wire [NUM_REQS-1:0][3:0][31:0]             req_data,     // [lane][texel]
    input wire [REQ_INFOW-1:0]                       req_info,
    output wire                                      req_ready,

    // outputs
    output wire                                      rsp_valid,
    output wire [NUM_REQS-1:0]                       rsp_tmask,
    output wire [NUM_REQS-1:0][31:0]                 rsp_data,
    output wire [REQ_INFOW-1:0]                      rsp_info,
    input wire                                       rsp_ready
);
    `UNUSED_PARAM (CORE_ID)

    wire                                 valid_s0;
    wire [NUM_REQS-1:0]                  tmask_s0;
    wire [REQ_INFOW-1:0]                 req_info_s0;
    wire [NUM_REQS-1:0][31:0]            texel_ul, texel_uh;
    wire [NUM_REQS-1:0][31:0]            texel_ul_s0, texel_uh_s0;
    wire [NUM_REQS-1:0][`BLEND_FRAC-1:0] blend_v, blend_v_s0;
    wire [NUM_REQS-1:0][31:0]            texel_v;

    wire stall_out;

    for (genvar i = 0; i < NUM_REQS; ++i) begin

        wire [3:0][31:0] fmt_texels;

        // expand every fetched texel to 4 x 8-bit channels
        for (genvar j = 0; j < 4; ++j) begin
            VX_tex_format #(
                .CORE_ID (CORE_ID)
            ) tex_format (
                .format    (req_format),
                .texel_in  (req_data[i][j]),
                .texel_out (fmt_texels[j])
            );
        end

        // weights of the first blend: beta for the second texel of a pair,
        // alpha = 1.0 - beta for the first
        wire [7:0] beta  = req_blends[i][0];
        wire [8:0] alpha = `BLEND_ONE - beta;

        VX_tex_lerp #(
        ) tex_lerp_ul (
            .in1   (fmt_texels[0]),
            .in2   (fmt_texels[1]),
            .alpha (alpha),
            .beta  (beta),
            .out   (texel_ul[i])
        );

        VX_tex_lerp #(
        ) tex_lerp_uh (
            .in1   (fmt_texels[2]),
            .in2   (fmt_texels[3]),
            .alpha (alpha),
            .beta  (beta),
            .out   (texel_uh[i])
        );

        // second-axis weight is consumed in the next stage
        assign blend_v[i] = req_blends[i][1];
    end

    VX_pipe_register #(
        .DATAW  (1 + NUM_REQS + REQ_INFOW + (NUM_REQS * `BLEND_FRAC) + (2 * NUM_REQS * 32)),
        .RESETW (1)
    ) pipe_reg0 (
        .clk      (clk),
        .reset    (reset),
        .enable   (~stall_out),
        .data_in  ({req_valid, req_tmask, req_info,    blend_v,    texel_ul,    texel_uh}),
        .data_out ({valid_s0,  tmask_s0,  req_info_s0, blend_v_s0, texel_ul_s0, texel_uh_s0})
    );

    for (genvar i = 0; i < NUM_REQS; i++) begin
        wire [7:0] beta  = blend_v_s0[i];
        wire [8:0] alpha = `BLEND_ONE - beta;

        VX_tex_lerp #(
        ) tex_lerp_v (
            .in1   (texel_ul_s0[i]),
            .in2   (texel_uh_s0[i]),
            .alpha (alpha),
            .beta  (beta),
            .out   (texel_v[i])
        );
    end

    assign stall_out = rsp_valid && ~rsp_ready;

    VX_pipe_register #(
        .DATAW  (1 + NUM_REQS + REQ_INFOW + (NUM_REQS * 32)),
        .RESETW (1)
    ) pipe_reg1 (
        .clk      (clk),
        .reset    (reset),
        .enable   (~stall_out),
        .data_in  ({valid_s0,  tmask_s0,  req_info_s0, texel_v}),
        .data_out ({rsp_valid, rsp_tmask, rsp_info,    rsp_data})
    );

    // can accept new request?
    assign req_ready = ~stall_out;

`ifdef DBG_TRACE_TEX
    // the low bits of each side-band word are expected to hold {wid, PC}
    wire [`NW_BITS-1:0] req_wid, rsp_wid;
    wire [31:0]         req_PC, rsp_PC;
    assign {req_wid, req_PC} = req_info[`NW_BITS+32-1:0];
    assign {rsp_wid, rsp_PC} = rsp_info[`NW_BITS+32-1:0];

    // req_blends is indexed [lane][axis]; regroup it per axis so each trace
    // array holds exactly NUM_REQS per-lane entries
    wire [NUM_REQS-1:0][`BLEND_FRAC-1:0] trace_blend_u, trace_blend_v;
    for (genvar i = 0; i < NUM_REQS; ++i) begin
        assign trace_blend_u[i] = req_blends[i][0];
        assign trace_blend_v[i] = req_blends[i][1];
    end

    always @(posedge clk) begin
        if (req_valid && req_ready) begin
            dpi_trace("%d: core%0d-tex-sampler-req: wid=%0d, PC=%0h, tmask=%b, format=%0d, data=",
                $time, CORE_ID, req_wid, req_PC, req_tmask, req_format);
            `TRACE_ARRAY2D(req_data, 4, NUM_REQS);
            dpi_trace(", u0=");
            `TRACE_ARRAY1D(trace_blend_u, NUM_REQS);
            dpi_trace(", v0=");
            `TRACE_ARRAY1D(trace_blend_v, NUM_REQS);
            dpi_trace("\n");
        end
        if (rsp_valid && rsp_ready) begin
            dpi_trace("%d: core%0d-tex-sampler-rsp: wid=%0d, PC=%0h, tmask=%b, data=",
                $time, CORE_ID, rsp_wid, rsp_PC, rsp_tmask);
            `TRACE_ARRAY1D(rsp_data, NUM_REQS);
            dpi_trace("\n");
        end
    end
`endif

endmodule

View File

@@ -0,0 +1,21 @@
`include "VX_platform.vh"
// Saturating narrow of a signed IN_W-bit value to an unsigned OUT_W-bit
// value: negative inputs give 0, inputs above the OUT_W-bit maximum give
// all ones, everything else passes through.
//
// Parameters:
//   IN_W  - input width (MSB is treated as the sign bit).
//   OUT_W - output width; must satisfy OUT_W + 1 < IN_W.
//   MODEL - 1 selects the mask-based implementation, any other value the
//           compare/select implementation.
module VX_tex_sat #(
    parameter IN_W  = 1,
    parameter OUT_W = 1,
    parameter MODEL = 1
) (
    input wire [IN_W-1:0]   data_in,
    output wire [OUT_W-1:0] data_out
);
    `STATIC_ASSERT(((OUT_W+1) < IN_W), ("invalid parameter"))

    if (MODEL == 1) begin
        // non-negative inputs keep their bits, negative ones are zeroed
        wire is_non_negative = ~data_in[IN_W-1];
        // any magnitude bit above the output range forces all ones
        wire is_too_large    = (| data_in[IN_W-2:OUT_W]);
        assign data_out = (data_in[OUT_W-1:0] | {OUT_W{is_too_large}}) & {OUT_W{is_non_negative}};
    end else begin
        wire is_negative  = data_in[IN_W-1];
        wire is_too_large = (data_in > {OUT_W{1'b1}});
        assign data_out = is_negative ? OUT_W'(0)
                        : (is_too_large ? {OUT_W{1'b1}} : OUT_W'(data_in));
    end

endmodule

View File

@@ -0,0 +1,27 @@
`include "VX_tex_define.vh"
// Maps a texel format to log2 of its size in bytes (combinational):
//   A8, L8                     -> 0 (1 byte)
//   L8A8, R5G6B5, R4G4B4A4     -> 1 (2 bytes)
//   anything else (R8G8B8A8)   -> 2 (4 bytes)
//
// Parameters:
//   CORE_ID - unused here; kept for a uniform instantiation interface.
module VX_tex_stride #(
    parameter CORE_ID = 0
) (
    input wire [`TEX_FORMAT_BITS-1:0]  format,
    output wire [`TEX_STRIDE_BITS-1:0] log_stride
);
    `UNUSED_PARAM (CORE_ID)

    reg [`TEX_STRIDE_BITS-1:0] stride_sel;

    always @(*) begin
        case (format)
        // single-byte texels
        `TEX_FORMAT_A8,
        `TEX_FORMAT_L8:       stride_sel = 0;
        // two-byte texels
        `TEX_FORMAT_L8A8,
        `TEX_FORMAT_R5G6B5,
        `TEX_FORMAT_R4G4B4A4: stride_sel = 1;
        // `TEX_FORMAT_R8G8B8A8 and any unlisted encoding: four bytes
        default:              stride_sel = 2;
        endcase
    end

    assign log_stride = stride_sel;

endmodule

View File

@@ -0,0 +1,234 @@
`include "VX_tex_define.vh"
module VX_tex_unit #(
parameter CORE_ID = 0
) (
input wire clk,
input wire reset,
// Texture unit <-> Memory Unit
VX_dcache_req_if.master dcache_req_if,
VX_dcache_rsp_if.slave dcache_rsp_if,
// Inputs
VX_tex_req_if.slave tex_req_if,
VX_tex_csr_if.slave tex_csr_if,
// Outputs
VX_tex_rsp_if.master tex_rsp_if
);
localparam REQ_INFOW_S = `NR_BITS + 1 + `NW_BITS + 32;
localparam REQ_INFOW_A = `TEX_FORMAT_BITS + REQ_INFOW_S;
localparam REQ_INFOW_M = (2 * `NUM_THREADS * `BLEND_FRAC) + REQ_INFOW_A;
reg [`TEX_MIPOFF_BITS-1:0] tex_mipoff [`NUM_TEX_UNITS-1:0][(1 << `TEX_LOD_BITS)-1:0];
reg [1:0][`TEX_DIM_BITS-1:0] tex_dims [`NUM_TEX_UNITS-1:0][(1 << `TEX_LOD_BITS)-1:0];
reg [`TEX_ADDR_BITS-1:0] tex_baddr [`NUM_TEX_UNITS-1:0];
reg [`TEX_FORMAT_BITS-1:0] tex_format [`NUM_TEX_UNITS-1:0];
reg [1:0][`TEX_WRAP_BITS-1:0] tex_wraps [`NUM_TEX_UNITS-1:0];
reg [`TEX_FILTER_BITS-1:0] tex_filter [`NUM_TEX_UNITS-1:0];
// CSRs programming
reg [`NUM_TEX_UNITS-1:0] csrs_dirty;
`UNUSED_VAR (csrs_dirty)
for (genvar i = 0; i < `NUM_TEX_UNITS; ++i) begin
wire [`TEX_LOD_BITS-1:0] mip_level = tex_csr_if.write_data[28 +: `TEX_LOD_BITS];
always @(posedge clk) begin
if (tex_csr_if.write_enable) begin
case (tex_csr_if.write_addr)
`CSR_TEX_ADDR(i) : begin
tex_baddr[i] <= tex_csr_if.write_data[`TEX_ADDR_BITS-1:0];
csrs_dirty[i] <= 1;
end
`CSR_TEX_FORMAT(i) : begin
tex_format[i] <= tex_csr_if.write_data[`TEX_FORMAT_BITS-1:0];
csrs_dirty[i] <= 1;
end
`CSR_TEX_WRAP(i) : begin
tex_wraps[i][0] <= tex_csr_if.write_data[0 +: `TEX_WRAP_BITS];
tex_wraps[i][1] <= tex_csr_if.write_data[`TEX_WRAP_BITS +: `TEX_WRAP_BITS];
csrs_dirty[i] <= 1;
end
`CSR_TEX_FILTER(i) : begin
tex_filter[i] <= tex_csr_if.write_data[`TEX_FILTER_BITS-1:0];
csrs_dirty[i] <= 1;
end
`CSR_TEX_MIPOFF(i) : begin
tex_mipoff[i][mip_level] <= tex_csr_if.write_data[`TEX_MIPOFF_BITS-1:0];
csrs_dirty[i] <= 1;
end
`CSR_TEX_WIDTH(i) : begin
tex_dims[i][mip_level][0] <= tex_csr_if.write_data[`TEX_DIM_BITS-1:0];
csrs_dirty[i] <= 1;
end
`CSR_TEX_HEIGHT(i) : begin
tex_dims[i][mip_level][1] <= tex_csr_if.write_data[`TEX_DIM_BITS-1:0];
csrs_dirty[i] <= 1;
end
endcase
end
if (reset || (tex_req_if.valid && tex_req_if.ready)) begin
csrs_dirty[i] <= '0;
end
end
end
// mipmap attributes
wire [`NUM_THREADS-1:0][`TEX_MIPOFF_BITS-1:0] sel_mipoff;
wire [`NUM_THREADS-1:0][1:0][`TEX_DIM_BITS-1:0] sel_dims;
for (genvar i = 0; i < `NUM_THREADS; ++i) begin
wire [`NTEX_BITS-1:0] unit = tex_req_if.unit[`NTEX_BITS-1:0];
wire [`TEX_LOD_BITS-1:0] mip_level = tex_req_if.lod[i][20+:`TEX_LOD_BITS];
assign sel_mipoff[i] = tex_mipoff[unit][mip_level];
assign sel_dims[i] = tex_dims[unit][mip_level];
end
// address generation
wire mem_req_valid;
wire [`NUM_THREADS-1:0] mem_req_tmask;
wire [`TEX_FILTER_BITS-1:0] mem_req_filter;
wire [`TEX_STRIDE_BITS-1:0] mem_req_stride;
wire [`NUM_THREADS-1:0][1:0][`BLEND_FRAC-1:0] mem_req_blends;
wire [`NUM_THREADS-1:0][3:0][31:0] mem_req_addr;
wire [REQ_INFOW_A-1:0] mem_req_info;
wire mem_req_ready;
VX_tex_addr #(
.CORE_ID (CORE_ID),
.REQ_INFOW (REQ_INFOW_A),
.NUM_REQS (`NUM_THREADS)
) tex_addr (
.clk (clk),
.reset (reset),
.req_valid (tex_req_if.valid),
.req_tmask (tex_req_if.tmask),
.req_coords (tex_req_if.coords),
.req_format (tex_format[tex_req_if.unit]),
.req_filter (tex_filter[tex_req_if.unit]),
.req_wraps (tex_wraps[tex_req_if.unit]),
.req_baseaddr (tex_baddr[tex_req_if.unit]),
.req_mipoff (sel_mipoff),
.req_logdims (sel_dims),
.req_info ({tex_format[tex_req_if.unit], tex_req_if.rd, tex_req_if.wb, tex_req_if.wid, tex_req_if.PC}),
.req_ready (tex_req_if.ready),
.rsp_valid (mem_req_valid),
.rsp_tmask (mem_req_tmask),
.rsp_filter (mem_req_filter),
.rsp_stride (mem_req_stride),
.rsp_addr (mem_req_addr),
.rsp_blends (mem_req_blends),
.rsp_info (mem_req_info),
.rsp_ready (mem_req_ready)
);
// retrieve texel values from memory
wire mem_rsp_valid;
wire [`NUM_THREADS-1:0] mem_rsp_tmask;
wire [`NUM_THREADS-1:0][3:0][31:0] mem_rsp_data;
wire [REQ_INFOW_M-1:0] mem_rsp_info;
wire mem_rsp_ready;
VX_tex_mem #(
.CORE_ID (CORE_ID),
.REQ_INFOW (REQ_INFOW_M),
.NUM_REQS (`NUM_THREADS)
) tex_mem (
.clk (clk),
.reset (reset),
// memory interface
.dcache_req_if (dcache_req_if),
.dcache_rsp_if (dcache_rsp_if),
// inputs
.req_valid (mem_req_valid),
.req_tmask (mem_req_tmask),
.req_filter(mem_req_filter),
.req_stride(mem_req_stride),
.req_addr (mem_req_addr),
.req_info ({mem_req_blends, mem_req_info}),
.req_ready (mem_req_ready),
// outputs
.rsp_valid (mem_rsp_valid),
.rsp_tmask (mem_rsp_tmask),
.rsp_data (mem_rsp_data),
.rsp_info (mem_rsp_info),
.rsp_ready (mem_rsp_ready)
);
// apply sampler
// Unpack the info bundle returned by the memory stage: blend factors first,
// then the texture format, then the remaining request info.
wire [`NUM_THREADS-1:0][1:0][`BLEND_FRAC-1:0] rsp_blends;
wire [`TEX_FORMAT_BITS-1:0] rsp_format;
wire [REQ_INFOW_S-1:0] rsp_info;
assign {rsp_blends, rsp_format, rsp_info} = mem_rsp_info;
VX_tex_sampler #(
.CORE_ID (CORE_ID),
.REQ_INFOW (REQ_INFOW_S),
.NUM_REQS (`NUM_THREADS)
) tex_sampler (
.clk (clk),
.reset (reset),
// inputs
.req_valid (mem_rsp_valid),
.req_tmask (mem_rsp_tmask),
.req_data (mem_rsp_data),
.req_format (rsp_format),
.req_blends (rsp_blends),
.req_info (rsp_info),
.req_ready (mem_rsp_ready),
// outputs
.rsp_valid (tex_rsp_if.valid),
.rsp_tmask (tex_rsp_if.tmask),
.rsp_data (tex_rsp_if.data),
// the sampler's info output is split back into the response interface fields
.rsp_info ({tex_rsp_if.rd, tex_rsp_if.wb, tex_rsp_if.wid, tex_rsp_if.PC}),
.rsp_ready (tex_rsp_if.ready)
);
// Debug tracing, compiled in only when DBG_TRACE_TEX is defined.
// Logs every accepted request and every accepted response through dpi_trace.
`ifdef DBG_TRACE_TEX
always @(posedge clk) begin
// request handshake completed this cycle
if (tex_req_if.valid && tex_req_if.ready) begin
// dump the CSR state of every texture unit flagged in csrs_dirty
// NOTE(review): csrs_dirty is maintained outside this excerpt; presumably it
// marks units whose CSRs changed since the last dump - confirm.
for (integer i = 0; i < `NUM_TEX_UNITS; ++i) begin
if (csrs_dirty[i]) begin
dpi_trace("%d: core%0d-tex-csr: tex%0d_addr=%0h\n", $time, CORE_ID, i, tex_baddr[i]);
dpi_trace("%d: core%0d-tex-csr: tex%0d_format=%0h\n", $time, CORE_ID, i, tex_format[i]);
dpi_trace("%d: core%0d-tex-csr: tex%0d_wrap_u=%0h\n", $time, CORE_ID, i, tex_wraps[i][0]);
dpi_trace("%d: core%0d-tex-csr: tex%0d_wrap_v=%0h\n", $time, CORE_ID, i, tex_wraps[i][1]);
dpi_trace("%d: core%0d-tex-csr: tex%0d_filter=%0h\n", $time, CORE_ID, i, tex_filter[i]);
// only mip level 0 of the offset and dimensions is printed
dpi_trace("%d: core%0d-tex-csr: tex%0d_mipoff[0]=%0h\n", $time, CORE_ID, i, tex_mipoff[i][0]);
dpi_trace("%d: core%0d-tex-csr: tex%0d_width[0]=%0h\n", $time, CORE_ID, i, tex_dims[i][0][0]);
dpi_trace("%d: core%0d-tex-csr: tex%0d_height[0]=%0h\n", $time, CORE_ID, i, tex_dims[i][0][1]);
end
end
// request header, followed by the per-thread u and v coordinates
dpi_trace("%d: core%0d-tex-req: wid=%0d, PC=%0h, tmask=%b, unit=%0d, lod=%0h, u=",
$time, CORE_ID, tex_req_if.wid, tex_req_if.PC, tex_req_if.tmask, tex_req_if.unit, tex_req_if.lod);
`TRACE_ARRAY1D(tex_req_if.coords[0], `NUM_THREADS);
dpi_trace(", v=");
`TRACE_ARRAY1D(tex_req_if.coords[1], `NUM_THREADS);
dpi_trace("\n");
end
// response handshake completed this cycle
if (tex_rsp_if.valid && tex_rsp_if.ready) begin
dpi_trace("%d: core%0d-tex-rsp: wid=%0d, PC=%0h, tmask=%b, data=",
$time, CORE_ID, tex_rsp_if.wid, tex_rsp_if.PC, tex_rsp_if.tmask);
`TRACE_ARRAY1D(tex_rsp_if.data, `NUM_THREADS);
dpi_trace("\n");
end
end
`endif
endmodule

View File

@@ -0,0 +1,38 @@
`include "VX_tex_define.vh"
// Applies the wrap mode selected by wrap_i to a 32-bit texture coordinate and
// produces the `FIXED_FRAC-bit result on coord_o. Purely combinational.
module VX_tex_wrap #(
    parameter CORE_ID = 0
) (
    input wire [`TEX_WRAP_BITS-1:0] wrap_i,
    input wire [31:0]               coord_i,
    output wire [`FIXED_FRAC-1:0]   coord_o
);
    `UNUSED_PARAM (CORE_ID)

    // Clamp candidate: coord_i narrowed from 32 to `FIXED_FRAC bits by VX_tex_sat.
    wire [`FIXED_FRAC-1:0] coord_sat;

    VX_tex_sat #(
        .IN_W  (32),
        .OUT_W (`FIXED_FRAC)
    ) sat_fx (
        .data_in  (coord_i),
        .data_out (coord_sat)
    );

    // Repeat candidate: the low `FIXED_FRAC bits of the coordinate.
    wire [`FIXED_FRAC-1:0] coord_frac = coord_i[`FIXED_FRAC-1:0];

    // Mirror candidate: the low bits, inverted whenever bit `FIXED_FRAC is set.
    wire [`FIXED_FRAC-1:0] coord_mirror = coord_frac ^ {`FIXED_FRAC{coord_i[`FIXED_FRAC]}};

    reg [`FIXED_FRAC-1:0] coord_sel;

    always @(*) begin
        case (wrap_i)
        `TEX_WRAP_CLAMP:  coord_sel = coord_sat;
        `TEX_WRAP_MIRROR: coord_sel = coord_mirror;
        default:          coord_sel = coord_frac; // `TEX_WRAP_REPEAT
        endcase
    end

    assign coord_o = coord_sel;

endmodule

View File

@@ -194,9 +194,9 @@
"issue_imm": 32, "issue_imm": 32,
"issue_use_pc": 1, "issue_use_pc": 1,
"issue_use_imm": 1, "issue_use_imm": 1,
"gpr_rsp_a":"`NUM_THREADS * 32", "gpr_rs1":"`NUM_THREADS * 32",
"gpr_rsp_b":"`NUM_THREADS * 32", "gpr_rs2":"`NUM_THREADS * 32",
"gpr_rsp_c":"`NUM_THREADS * 32", "gpr_rs3":"`NUM_THREADS * 32",
"?writeback_valid": 1, "?writeback_valid": 1,
"writeback_wid":"`NW_BITS", "writeback_wid":"`NW_BITS",
"writeback_pc": 32, "writeback_pc": 32,
@@ -205,7 +205,7 @@
"writeback_data":"`NUM_THREADS * 32", "writeback_data":"`NUM_THREADS * 32",
"writeback_eop": 1, "writeback_eop": 1,
"!scoreboard_delay": 1, "!scoreboard_delay": 1,
"!execute_delay": 1 "!dispatch_delay": 1
}, },
"afu/vortex/l3cache/bank, afu/vortex/cluster/l2cache/bank, afu/vortex/cluster/core/mem_unit/dcache/bank, afu/vortex/cluster/core/mem_unit/icache/bank": { "afu/vortex/l3cache/bank, afu/vortex/cluster/l2cache/bank, afu/vortex/cluster/core/mem_unit/dcache/bank, afu/vortex/cluster/core/mem_unit/icache/bank": {
"?valid_st0": 1, "?valid_st0": 1,

View File

@@ -262,7 +262,7 @@ def expand_text(text, params):
has_func = do_repl.has_func has_func = do_repl.has_func
if not (params_updated or do_repl.expanded): if not (params_updated or do_repl.expanded):
break break
text = new_text text = new_text
changed = True changed = True
if not has_func: if not has_func:
break break

View File

@@ -8,20 +8,21 @@ else
RUN_SYNTH=qsub-synth RUN_SYNTH=qsub-synth
endif endif
# control RTL debug print states # control RTL debug tracing states
DBG_PRINT_FLAGS += -DDBG_PRINT_PIPELINE DBG_TRACE_FLAGS += -DDBG_TRACE_PIPELINE
DBG_PRINT_FLAGS += -DDBG_PRINT_CORE_ICACHE DBG_TRACE_FLAGS += -DDBG_TRACE_CORE_ICACHE
DBG_PRINT_FLAGS += -DDBG_PRINT_CORE_DCACHE DBG_TRACE_FLAGS += -DDBG_TRACE_CORE_DCACHE
DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_BANK DBG_TRACE_FLAGS += -DDBG_TRACE_CACHE_BANK
DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_MSHR DBG_TRACE_FLAGS += -DDBG_TRACE_CACHE_MSHR
DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_TAG DBG_TRACE_FLAGS += -DDBG_TRACE_CACHE_TAG
DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_DATA DBG_TRACE_FLAGS += -DDBG_TRACE_CACHE_DATA
DBG_PRINT_FLAGS += -DDBG_PRINT_MEM DBG_TRACE_FLAGS += -DDBG_TRACE_MEM
DBG_PRINT_FLAGS += -DDBG_PRINT_OPAE DBG_TRACE_FLAGS += -DDBG_TRACE_OPAE
DBG_PRINT_FLAGS += -DDBG_PRINT_AVS DBG_TRACE_FLAGS += -DDBG_TRACE_AVS
DBG_PRINT_FLAGS += -DDBG_PRINT_SCOPE DBG_TRACE_FLAGS += -DDBG_TRACE_SCOPE
DBG_TRACE_FLAGS += -DDBG_TRACE_TEX
DBG_FLAGS += $(DBG_PRINT_FLAGS) DBG_FLAGS += $(DBG_TRACE_FLAGS)
DBG_FLAGS += -DDBG_CACHE_REQ_INFO DBG_FLAGS += -DDBG_CACHE_REQ_INFO
CONFIG1 := -DNUM_CLUSTERS=1 -DNUM_CORES=1 -DL2_ENABLE=0 -DL3_ENABLE=0 $(CONFIGS) CONFIG1 := -DNUM_CLUSTERS=1 -DNUM_CORES=1 -DL2_ENABLE=0 -DL3_ENABLE=0 $(CONFIGS)
@@ -33,7 +34,8 @@ CONFIG32 := -DNUM_CLUSTERS=4 -DNUM_CORES=8 -DL2_ENABLE=0 -DL3_ENABLE=1 -DICACHE_
CONFIG64 := -DNUM_CLUSTERS=8 -DNUM_CORES=8 -DL2_ENABLE=0 -DL3_ENABLE=1 -DICACHE_SIZE=8192 -DDCACHE_SIZE=8192 -DL3_CACHE_SIZE=524288 $(CONFIGS) CONFIG64 := -DNUM_CLUSTERS=8 -DNUM_CORES=8 -DL2_ENABLE=0 -DL3_ENABLE=1 -DICACHE_SIZE=8192 -DDCACHE_SIZE=8192 -DL3_CACHE_SIZE=524288 $(CONFIGS)
FPU_INCLUDE = -I$(RTL_DIR)/fp_cores -I$(RTL_DIR)/fp_cores/altera/$(DEVICE_FAMILY) FPU_INCLUDE = -I$(RTL_DIR)/fp_cores -I$(RTL_DIR)/fp_cores/altera/$(DEVICE_FAMILY)
RTL_INCLUDE = -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/cache $(FPU_INCLUDE) -I$(RTL_DIR) -I$(RTL_DIR)/afu TEX_INCLUDE = -I$(RTL_DIR)/tex_unit
RTL_INCLUDE = -I$(RTL_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/cache -I$(RTL_DIR)/afu $(FPU_INCLUDE) $(TEX_INCLUDE)
CFLAGS += $(RTL_INCLUDE) CFLAGS += $(RTL_INCLUDE)

View File

@@ -12,7 +12,8 @@ FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/arria10
#FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10 #FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10
FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(FPU_INCLUDE) TEX_INCLUDE = $(RTL_DIR)/tex_unit
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(FPU_INCLUDE);$(TEX_INCLUDE)
PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf

View File

@@ -12,12 +12,12 @@ FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/arria10
#FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10 #FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10
FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src
RTL_INCLUDE = $(FPU_INCLUDE);$(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces TEX_INCLUDE = $(RTL_DIR)/tex_unit
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(FPU_INCLUDE);$(TEX_INCLUDE)
PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf
# Part, Family PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf
FAMILY = "Arria 10"
DEVICE = 10AX115N3F40E2SG
# Executable Configuration # Executable Configuration
SYN_ARGS = --parallel --read_settings_files=on SYN_ARGS = --parallel --read_settings_files=on

View File

@@ -12,7 +12,8 @@ FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/arria10
#FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10 #FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10
FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE) TEX_INCLUDE = $(RTL_DIR)/tex_unit
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE);$(TEX_INCLUDE)
PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf

View File

@@ -12,7 +12,8 @@ FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/arria10
#FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10 #FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10
FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE) TEX_INCLUDE = $(RTL_DIR)/tex_unit
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE);$(TEX_INCLUDE)
PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf

View File

@@ -12,7 +12,8 @@ FPU_CORE_PATH=$(RTL_DIR)/fp_cores/altera/arria10
#FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10 #FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10
FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE) TEX_INCLUDE = $(RTL_DIR)/tex_unit
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE);$(TEX_INCLUDE)
PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf

View File

@@ -12,7 +12,8 @@ FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/arria10
#FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10 #FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10
FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE) TEX_INCLUDE = $(RTL_DIR)/tex_unit
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE);$(TEX_INCLUDE)
PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf

View File

@@ -12,7 +12,8 @@ FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/arria10
#FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10 #FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10
FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE) TEX_INCLUDE = $(RTL_DIR)/tex_unit
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE);$(TEX_INCLUDE)
PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf

View File

@@ -12,7 +12,8 @@ DEVICE = 1SX280HN2F43E2VG
FPU_CORE_PATH=$(RTL_DIR)/fp_cores/altera/stratix10 FPU_CORE_PATH=$(RTL_DIR)/fp_cores/altera/stratix10
FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE) TEX_INCLUDE = $(RTL_DIR)/tex_unit
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE);$(TEX_INCLUDE)
PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf

View File

@@ -12,7 +12,8 @@ FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/arria10
#FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10 #FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10
FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE) TEX_INCLUDE = $(RTL_DIR)/tex_unit
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE);$(TEX_INCLUDE)
PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf

View File

@@ -12,7 +12,9 @@ FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/arria10
#FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10 #FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10
FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src
RTL_INCLUDE = $(FPU_INCLUDE);$(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache TEX_INCLUDE = $(RTL_DIR)/tex_unit
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(FPU_INCLUDE);$(TEX_INCLUDE)
PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf
# Executable Configuration # Executable Configuration

View File

@@ -12,7 +12,8 @@ FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/arria10
#FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10 #FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10
FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(FPU_INCLUDE) TEX_INCLUDE = $(RTL_DIR)/tex_unit
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(FPU_INCLUDE);$(TEX_INCLUDE)
PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf

View File

@@ -1,46 +1,42 @@
PARAM += -DCACHE_SIZE=4096 -DCACHE_WORD_SIZE=4 -DCACHE_LINE_SIZE=16 -DCACHE_NUM_BANKS=4 -DCACHE_CREQ_SIZE=4 -DMRVQ_SIZE=16 -DDFPQ_SIZE=16 -DSNRQ_SIZE=16 -DCWBQ_SIZE=4 -DDWBQ_SIZE=4 -DFQQ_SIZE=4 PARAMS += -DCACHE_SIZE=4096 -DCACHE_WORD_SIZE=4 -DCACHE_LINE_SIZE=16 -DCACHE_NUM_BANKS=4 -DCACHE_CREQ_SIZE=4 -DMRVQ_SIZE=16 -DDFPQ_SIZE=16 -DSNRQ_SIZE=16 -DCWBQ_SIZE=4 -DDWBQ_SIZE=4 -DFQQ_SIZE=4
# control RTL debug tracing states
DBG_TRACE_FLAGS = -DDBG_TRACE_CORE_ICACHE \
-DDBG_TRACE_CORE_DCACHE \
-DDBG_TRACE_CACHE_BANK \
-DDBG_TRACE_CACHE_SNP \
-DDBG_TRACE_CACHE_MSHR \
-DDBG_TRACE_CACHE_TAG \
-DDBG_TRACE_CACHE_DATA \
-DDBG_TRACE_MEM \
-DDBG_TRACE_OPAE \
-DDBG_TRACE_AVS
# control RTL debug print states #DBG_PRINT=$(DBG_TRACE_FLAGS)
DBG_PRINT_FLAGS = -DDBG_PRINT_CORE_ICACHE \
-DDBG_PRINT_CORE_DCACHE \
-DDBG_PRINT_CACHE_BANK \
-DDBG_PRINT_CACHE_SNP \
-DDBG_PRINT_CACHE_MSHR \
-DDBG_PRINT_CACHE_TAG \
-DDBG_PRINT_CACHE_DATA \
-DDBG_PRINT_MEM \
-DDBG_PRINT_OPAE \
-DDBG_PRINT_AVS
#DBG_PRINT=$(DBG_PRINT_FLAGS)
INCLUDE = -I../../rtl/ -I../../rtl/cache -I../../rtl/libs INCLUDE = -I../../rtl/ -I../../rtl/cache -I../../rtl/libs
SRCS = cachesim.cpp testbench.cpp SRCS = cachesim.cpp testbench.cpp
all: build all: build
CF += -std=c++11 -fms-extensions -I../.. CF += -std=c++11 -fms-extensions -I../..
CF += $(PARAMS)
VF += --language 1800-2009 --assert -Wall --trace #-Wpedantic VF += --language 1800-2009 --assert -Wall --trace #-Wpedantic
VF += -Wno-DECLFILENAME VF += -Wno-DECLFILENAME
VF += --x-initial unique VF += --x-initial unique
VF += -exe $(SRCS) $(INCLUDE) VF += -exe $(SRCS) $(INCLUDE)
VF += $(PARAMS)
DBG += -DVCD_OUTPUT $(DBG_PRINT)
gen: gen:
verilator $(VF) -DNDEBUG -cc VX_cache.v $(PARAM) -CFLAGS '$(CF) -DNDEBUG $(PARAM)' --exe $(SRCS) verilator $(VF) -cc $(TOP).v -CFLAGS '$(CF)' --exe $(SRCS)
build: gen build: gen
(cd obj_dir && make -j -f VVX_cache.mk) (cd obj_dir && make -j -f V$(TOP).mk)
run: build run: build
(cd obj_dir && ./VVX_cache) (cd obj_dir && ./V$(TOP))
clean: clean:
rm -rf obj_dir rm -rf obj_dir

View File

@@ -173,10 +173,10 @@ void CacheSim::stall_mem(){
} }
void CacheSim::send_snoop_req(){ void CacheSim::send_snoop_req(){
cache_->snp_req_valid = 1; /*cache_->snp_req_valid = 1;
cache_->snp_req_addr = 0x12222222; cache_->snp_req_addr = 0x12222222;
cache_->snp_req_invalidate = 1; cache_->snp_req_invalidate = 1;
cache_->snp_req_tag = 0xff; cache_->snp_req_tag = 0xff; */
} }
void CacheSim::eval_mem_bus() { void CacheSim::eval_mem_bus() {
@@ -274,9 +274,9 @@ bool CacheSim::assert_equal(unsigned int* data, unsigned int tag){
//DEBUG //DEBUG
void CacheSim::display_miss(){ void CacheSim::display_miss(){
int i = (unsigned int)cache_->miss_vec; //int i = (unsigned int)cache_->miss_vec;
std::bitset<8> x(i); //std::bitset<8> x(i);
if (i) std::cout << "Miss Vec " << x << std::endl; //if (i) std::cout << "Miss Vec " << x << std::endl;
//std::cout << "Miss Vec 0" << cache_->miss_vec[0] << std::endl; //std::cout << "Miss Vec 0" << cache_->miss_vec[0] << std::endl;
} }

View File

@@ -1,11 +1,30 @@
all: testbench.iv TOP = VX_fifo_queue
testbench.iv: testbench.v PARAMS ?=
iverilog testbench.v -o testbench.iv -I ../../rtl/
run: testbench.iv INCLUDE = -I../../rtl/ -I../../rtl/libs
! vvp testbench.iv | grep 'ERROR' || false
SRCS = main.cpp
all: build
CF += -std=c++11 -fms-extensions -I../..
VF += $(PARAMS)
VF += --language 1800-2009 --assert -Wall --trace
VF += -Wno-DECLFILENAME
VF += --x-initial unique
VF += -exe $(SRCS) $(INCLUDE)
VF += $(PARAMS)
gen:
verilator $(VF) -cc $(TOP).v -CFLAGS '$(CF)' --exe $(SRCS)
build: gen
(cd obj_dir && make -j -f V$(TOP).mk)
run: build
(cd obj_dir && ./V$(TOP))
clean: clean:
rm testbench.iv rm -rf obj_dir

View File

@@ -0,0 +1,93 @@
#include "vl_simulator.h"
#include "VVX_fifo_queue.h"
#include <iostream>
#define MAX_TICKS 20
// Test assertion: when `x` evaluates to false, print "FAILED: <expression>"
// to stdout and abort the process; otherwise do nothing.
// The do/while(false) wrapper lets CHECK(...) be used as a single statement.
#define CHECK(x) \
do { \
if (x) \
break; \
std::cout << "FAILED: " << #x << std::endl; \
std::abort(); \
} while (false)
// Global simulation time in ticks; main() updates it from the simulator's step()/reset().
uint64_t ticks = 0;
// Time callback expected by Verilator-generated code: reports the current tick count.
double sc_time_stamp() {
return ticks;
}
using Device = VVX_fifo_queue;
// Directed test for VX_fifo_queue: push two values, pop them back, and check
// data_out/full/empty after each step. Aborts through CHECK on the first
// mismatch; prints "PASSED!" and returns 0 otherwise.
int main(int argc, char **argv) {
  // Initialize Verilators variables
  Verilated::commandArgs(argc, argv);

  vl_simulator<Device> sim;

  // initial values
  // Drive the inputs idle before reset. sim.reset(0) steps the clock twice and
  // hands back ticks == 2, so a `case 0:` inside the loop below could never
  // execute and the inputs would be left at their construction-time values.
  sim->pop = 0;
  sim->push = 0;

  // run test
  ticks = sim.reset(0);
  while (ticks < MAX_TICKS) {
    switch (ticks) {
    case 2:
      // Verify outputs
      CHECK(sim->full == 0x0);
      CHECK(sim->empty == 0x1);
      // push 0xa
      sim->pop = 0;
      sim->push = 1;
      sim->data_in = 0xa;
      break;
    case 4:
      // verify outputs
      CHECK(sim->data_out == 0xa);
      CHECK(sim->full == 0x0);
      CHECK(sim->empty == 0x0);
      // push 0xb
      sim->pop = 0;
      sim->push = 1;
      sim->data_in = 0xb;
      break;
    case 6:
      // verify outputs: the queue is expected to report full after two pushes
      CHECK(sim->data_out == 0xa);
      CHECK(sim->full == 0x1);
      CHECK(sim->empty == 0x0);
      // pop
      sim->pop = 1;
      sim->push = 0;
      break;
    case 8:
      // verify outputs
      CHECK(sim->data_out == 0xb);
      CHECK(sim->full == 0x0);
      CHECK(sim->empty == 0x0);
      // pop
      sim->pop = 1;
      sim->push = 0;
      break;
    case 10:
      // verify outputs
      CHECK(sim->full == 0x0);
      CHECK(sim->empty == 0x1);
      sim->pop = 0;
      sim->push = 0;
      break;
    default:
      // no stimulus or checks scheduled for this tick
      break;
    }

    // advance clock
    ticks = sim.step(ticks, 2);
  }

  std::cout << "PASSED!" << std::endl;
  std::cout << "Simulation time: " << std::dec << ticks/2 << " cycles" << std::endl;

  return 0;
}

View File

@@ -0,0 +1,81 @@
#pragma once
#include <array>
#include <cstdint>
#include "verilated.h"
#ifdef VM_TRACE
#include <verilated_vcd_c.h> // Trace file format header
#endif
// Minimal clock/reset harness around a Verilated model.
//
// T must provide `clk` and `reset` members plus eval() and final(); when
// tracing is compiled in it must also provide trace(). Time is tracked by the
// caller: reset() and step() take the current tick count and return the new one.
//
// Tracing is keyed on the VALUE of VM_TRACE, not on whether it is defined:
// Verilator's generated build passes -DVM_TRACE=0 when tracing is off, and
// `#ifdef` would still pull in the VCD code in that case.
template <typename T>
class vl_simulator {
private:

  T top_;  // model under test
#if defined(VM_TRACE) && VM_TRACE
  VerilatedVcdC tfp_;  // VCD writer for trace.vcd
#endif

public:

  // Starts with clk and reset low; with tracing enabled, opens "trace.vcd".
  vl_simulator() {
    top_.clk = 0;
    top_.reset = 0;
#if defined(VM_TRACE) && VM_TRACE
    Verilated::traceEverOn(true);
    top_.trace(&tfp_, 99);
    tfp_.open("trace.vcd");
#endif
  }

  // Closes the trace file (if any) and runs the model's final().
  ~vl_simulator() {
#if defined(VM_TRACE) && VM_TRACE
    tfp_.close();
#endif
    top_.final();
  }

  // Holds reset high for two ticks, then releases it.
  // Returns the tick count after the reset sequence (ticks + 2).
  uint64_t reset(uint64_t ticks) {
    top_.reset = 1;
    ticks = this->step(ticks, 2);
    top_.reset = 0;
    return ticks;
  }

  // Runs `count` ticks. Each tick evaluates the model, dumps the trace sample
  // (when enabled), then toggles clk, so the new clock level is first seen by
  // the model on the following tick. Returns ticks + count.
  uint64_t step(uint64_t ticks, uint32_t count = 1) {
    while (count--) {
      top_.eval();
#if defined(VM_TRACE) && VM_TRACE
      tfp_.dump(ticks);
#endif
      top_.clk = !top_.clk;
      ++ticks;
    }
    return ticks;
  }

  // Direct access to the model's ports, e.g. sim->push = 1.
  T* operator->() {
    return &top_;
  }
};
// Stores the given values into consecutive 32-bit words of `sig`:
// the first argument goes to sig[0], the second to sig[1], and so on.
// Each argument is converted to uint32_t. `sig` must have room for one word
// per argument.
template <typename... Args>
void vl_setw(uint32_t* sig, Args&&... args) {
  const std::array<uint32_t, sizeof... (Args)> words{static_cast<uint32_t>(std::forward<Args>(args))...};
  uint32_t* dst = sig;
  for (uint32_t word : words) {
    *dst++ = word;
  }
}
// Compares consecutive 32-bit words of `sig` against the given values,
// starting at sig[0]. Returns 0 when every word matches; otherwise the first
// differing word decides: -1 if sig's word is smaller, 1 if it is larger.
// NOTE(review): because word 0 is examined first, the sign is an ordering by
// lowest index, not necessarily by numeric magnitude of a wide value - callers
// that only test for equality (== 0) are unaffected.
template <typename... Args>
int vl_cmpw(const uint32_t* sig, Args&&... args) {
  const std::array<uint32_t, sizeof... (Args)> expected{static_cast<uint32_t>(std::forward<Args>(args))...};
  size_t idx = 0;
  for (uint32_t want : expected) {
    const uint32_t have = sig[idx++];
    if (have != want)
      return (have < want) ? -1 : 1;
  }
  return 0;
}

View File

@@ -0,0 +1,30 @@
# Verilator unit test for the texture sampler.
#   make / make build : generate the C++ model with Verilator and compile it
#   make run          : build, then execute the simulation binary
#   make clean        : remove everything Verilator generated
TOP = VX_tex_sampler

# Extra Verilator flags (e.g. -DNAME=value); may be set on the command line.
PARAMS ?=

INCLUDE = -I../../../rtl/ -I../../../rtl/libs -I../../../rtl/tex_unit

SRCS = main.cpp

# None of these targets names a file this Makefile produces.
.PHONY: all gen build run clean

all: build

# C++ flags handed to Verilator via -CFLAGS. The compile runs inside obj_dir
# (see `build`), so -I../.. is resolved relative to obj_dir.
CF += -std=c++11 -fms-extensions -I../..

# Verilator flags. $(PARAMS) is added once, and $(SRCS) is passed only through
# --exe in the `gen` recipe rather than a second time here.
VF += $(PARAMS)
VF += --language 1800-2009 --assert -Wall --trace
VF += -Wno-DECLFILENAME
VF += --x-initial unique
VF += $(INCLUDE)

# Generate obj_dir/V$(TOP).mk and the C++ model sources.
gen:
	verilator $(VF) -cc $(TOP).v -CFLAGS '$(CF)' --exe $(SRCS)

# Compile the generated model. $(MAKE) keeps flags such as -n and -k working
# in the sub-make; -j preserves the original unbounded parallel build.
build: gen
	$(MAKE) -C obj_dir -j -f V$(TOP).mk

run: build
	cd obj_dir && ./V$(TOP)

clean:
	$(RM) -r obj_dir

View File

@@ -0,0 +1,215 @@
#include "vl_simulator.h"
#include "VVX_tex_sampler.h"
#include <iostream>
#include <map>
#define MAX_TICKS 20
#define MAX_UNIT_CYCLES 5
#define NUM_THREADS
// Test assertion: when `x` evaluates to false, print "FAILED: <expression>"
// to stdout and abort the process; otherwise do nothing.
// The do/while(false) wrapper lets CHECK(...) be used as a single statement.
#define CHECK(x) \
do { \
if (x) \
break; \
std::cout << "FAILED: " << #x << std::endl; \
std::abort(); \
} while (false)
uint64_t ticks = 0;
// using Device = VVX_tex_sampler;
// Table-driven testbench for a Verilated model T: unit tests are expanded into
// per-cycle input and expected-output maps (generate_test_vectors), which run()
// then applies to and checks against the simulator.
//
// NOTE(review): this class does not compile as written. The individual
// problems are flagged inline below; the notes describe what the code shows
// and do not change it.
template <typename T>
class testbench
{
private:
vl_simulator<T> sim;
// cycle number -> stimulus / expected outputs for that cycle
// NOTE(review): Input and Output are used here before the nested structs of
// the same name are defined further down.
std::map<int, struct Input> input_map;
std::map<int, struct Output> output_map;
public:
// One unit test: up to MAX_UNIT_CYCLES input and expected-output records.
// NOTE(review): the closing brace of this struct (and of Input and Output
// below) is not followed by a semicolon.
struct UnitTest {
bool use_reset;
unsigned int num_cycles;
bool use_cmodel;
struct Output outputs[MAX_UNIT_CYCLES];
struct Input inputs[MAX_UNIT_CYCLES];
unsigned int num_output_check;
unsigned int check_output_cycle[MAX_UNIT_CYCLES];
}
// Stimulus applied to the model for one cycle.
// NOTE(review): NUM_THREADS is #defined with no value earlier in this file,
// so the array bounds below expand to empty brackets.
struct Input {
bool req_valid;
unsigned int req_wid;
unsigned int req_tmask;
unsigned int req_PC;
unsigned int req_rd;
unsigned int req_wb;
unsigned int req_filter;
unsigned int req_format;
unsigned int req_u[NUM_THREADS];
unsigned int req_v[NUM_THREADS];
unsigned int req_texels[NUM_THREADS][4];
bool rsp_ready;
}
// Expected model outputs, checked on cycle `output_cycle` of the test.
struct Output {
int output_cycle;
// outputs
bool req_ready;
bool rsp_valid;
unsigned int rsp_wid;
unsigned int rsp_tmask;
unsigned int rsp_PC;
unsigned int rsp_rd;
bool rsp_wb;
unsigned int rsp_data[NUM_THREADS];
}
testbench(/* args */){
}
~testbench(){
}
// Fills in the expected outputs of `test` from a software model instead of
// hand-written values. Only the req_filter == 0 case is implemented; the
// other branch is commented out.
void unittest_Cmodel(struct UnitTest * test){
int cycles = test->num_cycles;
int num_outputs = test->num_output_check;
// struct Input* inputs = new (struct Input)[cycles];
// NOTE(review): this allocation is never released in this function.
struct Output* outputs = new (struct Output)[num_outputs];
// implement c model and assign outputs to struct
// NOTE(review): inputs[0] and outputs[0] are objects, not pointers, yet are
// accessed with `->`; `test->inputs->req_texels` applies `->` to an array.
if (test->inputs[0]->req_filter == 0){
for (int i = 0; i < NUM_THREADS; i++)
// NOTE(review): rsp_data[0] is overwritten on every iteration; the index
// was presumably meant to be i - confirm.
outputs[0]->rsp_data[0] = test->inputs->req_texels[i][0];
} else {
// for (int i = 0; i < NUM_THREADS; i++){
// uint32_t low[4], high[4];
// for (int j = 0; j < 4; j++){
// low[j] = test->inputs->req_texels[i][j] & 0x00ff00ff;
// high[j] = (test->inputs->req_texels[i][j] >> 8) & 0x00ff00ff;
// }
// }
}
outputs[0]->output_cycle = 1;
test->num_cycles = 1;
// NOTE(review): UnitTest::outputs is an array member, so it cannot be
// assigned from the address of a pointer.
test->outputs = &outputs;
}
// Expands `num_tests` unit tests into input_map/output_map, keyed by absolute
// cycle. With is_pipe false each test starts where the previous one ended;
// with is_pipe true it starts one cycle after the previous test's last input.
void generate_test_vectors(struct UnitTest * tests, int num_tests, bool is_pipe){
// for all unit tests create output test vectors (w w/o c-model)
int prev_test_cycle = 0;
for (int i = 0; i < num_tests; i++)
{
int op_counter = 0;
int ip_counter = 0;
int test_cycle = 0;
int last_ip_cycle = 0;
// NOTE(review): curr_test is a copy of tests[i] (an object), but is accessed
// with `->` throughout this loop body.
struct UnitTest curr_test = tests[i];
if (curr_test->use_cmodel){
unittest_Cmodel(&curr_test);
}
for (int j = 0; j < curr_test->num_cycles; j++)
{
// NOTE(review): Input declares no `input_cycle` member, and the record
// inserted is inputs[j] while the one tested is inputs[ip_counter].
if (curr_test->inputs[ip_counter]->input_cycle == test_cycle){
input_map.insert(std::make_pair(prev_test_cycle + test_cycle, curr_test->inputs[j]));
last_ip_cycle = prev_test_cycle + test_cycle;
ip_counter++;
}
if (curr_test->outputs[op_counter]->output_cycle == test_cycle){
output_map.insert(std::make_pair(prev_test_cycle + test_cycle, curr_test->outputs[op_counter]));
op_counter++;
}
test_cycle++;
}
if(!is_pipe){
prev_test_cycle += (test_cycle - 1);
}
else{
prev_test_cycle = last_ip_cycle + 1;
}
}
}
// Resets the model, then for each cycle (two ticks) applies the stimulus
// recorded for that cycle, if any, and checks the expected outputs, if any.
// Stops once the global `ticks` reaches MAX_TICKS.
void run(){
ticks = sim.reset(0);
int cycle = 0;
while (ticks < MAX_TICKS) {
auto input = input_map.find(cycle);
auto output = output_map.find(cycle);
// NOTE(review): `input` and `output` are std::map iterators, so the record
// fields are reached through ->second, not directly as written below.
// NOTE(review): confirm the sim-> port names against the current
// VX_tex_sampler module ports.
if (input != input_map.end()){
sim->req_valid = input->req_valid;
sim->req_wid = input->req_wid;
sim->req_tmask = input->req_tmask;
sim->req_PC = input->req_PC;
sim->req_rd = input->req_rd;
sim->req_wb = input->req_wb;
sim->req_filter = input->req_filter;
sim->req_format = input->req_format;
// sim->req_u = input->req_u[NUM_THREADS];
// sim->req_v = input->req_v[NUM_THREADS];
// NOTE(review): the next statement has no terminating semicolon, and vl_setw
// converts each argument to a single uint32_t word, so it cannot take a 2-D array.
vl_setw(sim->req_texels, input->req_texels)
// sim->req_texels = input->req_texels[NUM_THREADS][4];
sim->rsp_ready = input->rsp_ready;
} else{
std::cout << "Warning! No Input on Cycle " << cycle << std::endl;
}
if(output != output_map.end()){
CHECK(sim->req_ready == output->req_ready);
CHECK(sim->rsp_valid == output->rsp_valid);
CHECK(sim->rsp_wid == output->rsp_wid);
CHECK(sim->rsp_tmask == output->rsp_tmask);
CHECK(sim->rsp_PC == output->rsp_PC);
CHECK(sim->rsp_rd == output->rsp_rd);
CHECK(sim->rsp_wb == output->rsp_wb);
// NOTE(review): vl_cmpw returns 0 when the words match, so this CHECK passes
// only on a mismatch; a comparison against 0 was presumably intended.
CHECK(vl_cmpw(sim->rsp_data, output->rsp_data));
}
cycle++;
ticks = sim.step(ticks,2);
}
}
// NOTE(review): run() ends at the brace above, so the two statements below sit
// at class scope rather than inside a member function.
std::cout << "PASSED!" << std::endl;
std::cout << "Simulation time: " << std::dec << ticks/2 << " cycles" << std::endl;
};
// Time callback expected by Verilator-generated code: reports the global tick count.
double sc_time_stamp() {
return ticks;
}
// Entry point of the sampler unit test: builds the test vectors, then runs
// them against the Verilated VX_tex_sampler model.
int main(int argc, char **argv) {
  // Initialize Verilators variables
  Verilated::commandArgs(argc, argv);

  testbench<VVX_tex_sampler> sampler_testbench;

  // NOTE(review): `tests` is not declared anywhere in this file as shown; a
  // UnitTest array still has to be defined before this call can compile.
  sampler_testbench.generate_test_vectors(tests, 1, 0);

  // Use the object declared above; the previous spelling `sampler_test_bench`
  // named an identifier that does not exist.
  sampler_testbench.run();

  return 0;
}

View File

@@ -0,0 +1,81 @@
#pragma once
#include <array>
#include <cstdint>
#include "verilated.h"
#ifdef VM_TRACE
#include <verilated_vcd_c.h> // Trace file format header
#endif
// Minimal clock/reset harness around a Verilated model.
//
// T must provide `clk` and `reset` members plus eval() and final(); when
// tracing is compiled in it must also provide trace(). Time is tracked by the
// caller: reset() and step() take the current tick count and return the new one.
//
// Tracing is keyed on the VALUE of VM_TRACE, not on whether it is defined:
// Verilator's generated build passes -DVM_TRACE=0 when tracing is off, and
// `#ifdef` would still pull in the VCD code in that case.
template <typename T>
class vl_simulator {
private:

  T top_;  // model under test
#if defined(VM_TRACE) && VM_TRACE
  VerilatedVcdC tfp_;  // VCD writer for trace.vcd
#endif

public:

  // Starts with clk and reset low; with tracing enabled, opens "trace.vcd".
  vl_simulator() {
    top_.clk = 0;
    top_.reset = 0;
#if defined(VM_TRACE) && VM_TRACE
    Verilated::traceEverOn(true);
    top_.trace(&tfp_, 99);
    tfp_.open("trace.vcd");
#endif
  }

  // Closes the trace file (if any) and runs the model's final().
  ~vl_simulator() {
#if defined(VM_TRACE) && VM_TRACE
    tfp_.close();
#endif
    top_.final();
  }

  // Holds reset high for two ticks, then releases it.
  // Returns the tick count after the reset sequence (ticks + 2).
  uint64_t reset(uint64_t ticks) {
    top_.reset = 1;
    ticks = this->step(ticks, 2);
    top_.reset = 0;
    return ticks;
  }

  // Runs `count` ticks. Each tick evaluates the model, dumps the trace sample
  // (when enabled), then toggles clk, so the new clock level is first seen by
  // the model on the following tick. Returns ticks + count.
  uint64_t step(uint64_t ticks, uint32_t count = 1) {
    while (count--) {
      top_.eval();
#if defined(VM_TRACE) && VM_TRACE
      tfp_.dump(ticks);
#endif
      top_.clk = !top_.clk;
      ++ticks;
    }
    return ticks;
  }

  // Direct access to the model's ports, e.g. sim->rsp_ready = 1.
  T* operator->() {
    return &top_;
  }
};
// Stores the given values into consecutive 32-bit words of `sig`:
// the first argument goes to sig[0], the second to sig[1], and so on.
// Each argument is converted to uint32_t. `sig` must have room for one word
// per argument.
template <typename... Args>
void vl_setw(uint32_t* sig, Args&&... args) {
  const std::array<uint32_t, sizeof... (Args)> words{static_cast<uint32_t>(std::forward<Args>(args))...};
  uint32_t* dst = sig;
  for (uint32_t word : words) {
    *dst++ = word;
  }
}
// Compares consecutive 32-bit words of `sig` against the given values,
// starting at sig[0]. Returns 0 when every word matches; otherwise the first
// differing word decides: -1 if sig's word is smaller, 1 if it is larger.
// NOTE(review): because word 0 is examined first, the sign is an ordering by
// lowest index, not necessarily by numeric magnitude of a wide value - callers
// that only test for equality (== 0) are unaffected.
template <typename... Args>
int vl_cmpw(const uint32_t* sig, Args&&... args) {
  const std::array<uint32_t, sizeof... (Args)> expected{static_cast<uint32_t>(std::forward<Args>(args))...};
  size_t idx = 0;
  for (uint32_t want : expected) {
    const uint32_t have = sig[idx++];
    if (have != want)
      return (have < want) ? -1 : 1;
  }
  return 0;
}

View File

@@ -5,7 +5,62 @@
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C" {
#endif #endif
#ifdef __ASSEMBLY__
#define __ASM_STR(x) x
#else
#define __ASM_STR(x) #x
#endif
// CSR access helpers. Each is a GNU statement-expression wrapping one inline
// assembly instruction; `csr` is pasted into the instruction text through
// __ASM_STR, and `val` is converted to unsigned before use.

// csrrw: writes `val` to `csr`; evaluates to the instruction's result register.
#define vx_csr_swap(csr, val) ({ \
unsigned __v = (unsigned )(val); \
__asm__ __volatile__ ("csrrw %0, " __ASM_STR(csr) ", %1" : "=r" (__v) : "rK" (__v) : "memory"); \
__v; \
})
// csrr: evaluates to the current value of `csr`.
#define vx_csr_read(csr) ({ \
register unsigned __v; \
__asm__ __volatile__ ("csrr %0, " __ASM_STR(csr) : "=r" (__v) :: "memory"); \
__v; \
})
// csrw: writes `val` to `csr`; yields no value.
#define vx_csr_write(csr, val) ({ \
unsigned __v = (unsigned )(val); \
__asm__ __volatile__ ("csrw " __ASM_STR(csr) ", %0" :: "rK" (__v) : "memory"); \
})
// csrrs: applies `val` to `csr` with the set instruction; evaluates to the
// instruction's result register.
#define vx_csr_read_set(csr, val) ({ \
unsigned __v = (unsigned )(val); \
__asm__ __volatile__ ("csrrs %0, " __ASM_STR(csr) ", %1" : "=r" (__v) : "rK" (__v) : "memory"); \
__v; \
})
// csrs: applies `val` to `csr` with the set instruction; yields no value.
#define vx_csr_set(csr, val) ({ \
unsigned __v = (unsigned )(val); \
__asm__ __volatile__ ("csrs " __ASM_STR(csr) ", %0" :: "rK" (__v) : "memory"); \
})
// csrrc: applies `val` to `csr` with the clear instruction; evaluates to the
// instruction's result register.
#define vx_csr_read_clear(csr, val) ({ \
unsigned __v = (unsigned )(val); \
__asm__ __volatile__ ("csrrc %0, " __ASM_STR(csr) ", %1" : "=r" (__v) : "rK" (__v) : "memory"); \
__v; \
})
// csrc: applies `val` to `csr` with the clear instruction; yields no value.
#define vx_csr_clear(csr, val) ({ \
unsigned __v = (unsigned )(val); \
__asm__ __volatile__ ("csrc " __ASM_STR(csr) ", %0" :: "rK" (__v) : "memory"); \
})
// Texture load
// Emits one `.insn r4` instruction (opcode 0x6b, func3 5) whose remaining
// fields are `unit` followed by the result register and the three register
// operands u, v and l; the macro evaluates to the result register.
// `unit` is pasted into the instruction text through __ASM_STR, so it must be
// a token the assembler accepts literally, not a runtime variable.
#define vx_tex(unit, u, v, l) ({ \
unsigned __r; \
unsigned __u = u; \
unsigned __v = v; \
unsigned __l = l; \
__asm__ __volatile__ (".insn r4 0x6b, 5, " __ASM_STR(unit) ", %0, %1, %2, %3" : "=r"(__r) : "r"(__u), "r"(__v), "r"(__l)); \
__r; \
})
#ifdef __ASSEMBLY__ #ifdef __ASSEMBLY__
#define __ASM_STR(x) x #define __ASM_STR(x) x
@@ -52,6 +107,16 @@ extern "C" {
__asm__ __volatile__ ("csrc " __ASM_STR(csr) ", %0" :: "rK" (__v) : "memory"); \ __asm__ __volatile__ ("csrc " __ASM_STR(csr) ", %0" :: "rK" (__v) : "memory"); \
}) })
// Texture load
#define vx_tex(unit, u, v, l) ({ \
unsigned __r; \
unsigned __u = u; \
unsigned __v = v; \
unsigned __l = l; \
__asm__ __volatile__ (".insn r4 0x6b, 5, " __ASM_STR(unit) ", %0, %1, %2, %3" : "=r"(__r) : "r"(__u), "r"(__v), "r"(__l)); \
__r; \
})
// Set thread mask // Set thread mask
inline void vx_tmc(unsigned thread_mask) { inline void vx_tmc(unsigned thread_mask) {
asm volatile (".insn s 0x6b, 0, x0, 0(%0)" :: "r"(thread_mask)); asm volatile (".insn s 0x6b, 0, x0, 0(%0)" :: "r"(thread_mask));
@@ -86,7 +151,7 @@ inline void vx_barrier(unsigned barried_id, unsigned num_warps) {
// Prefetch // Prefetch
inline void vx_prefetch(unsigned addr) { inline void vx_prefetch(unsigned addr) {
asm volatile (".insn s 0x6b, 5, x0, 0(%0)" :: "r"(addr) ); asm volatile (".insn s 0x6b, 6, x0, 0(%0)" :: "r"(addr) );
} }
// Return active warp's thread id // Return active warp's thread id
@@ -170,6 +235,8 @@ inline void vx_fence() {
#define __endif vx_join(); #define __endif vx_join();
#define __DIVERGENT__ __attribute__((annotate("divergent")))
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif

View File

@@ -34,7 +34,7 @@ int vx_vprintf(const char* format, va_list va) {
printf_arg_t arg; printf_arg_t arg;
arg.format = format; arg.format = format;
arg.va = &va; arg.va = &va;
vx_serial(__printf_cb, &arg); vx_serial((vx_serial_cb)__printf_cb, &arg);
return arg.ret; return arg.ret;
} }
@@ -63,7 +63,7 @@ void vx_putint(int value, int base) {
putint_arg_t arg; putint_arg_t arg;
arg.value = value; arg.value = value;
arg.base = base; arg.base = base;
vx_serial(__putint_cb, &arg); vx_serial((vx_serial_cb)__putint_cb, &arg);
} }
static void __putfloat_cb(const putfloat_arg_t* arg) { static void __putfloat_cb(const putfloat_arg_t* arg) {
@@ -83,7 +83,7 @@ void vx_putfloat(float value, int precision) {
putfloat_arg_t arg; putfloat_arg_t arg;
arg.value = value; arg.value = value;
arg.precision = precision; arg.precision = precision;
vx_serial(__putfloat_cb, &arg); vx_serial((vx_serial_cb)__putfloat_cb, &arg);
} }
#ifdef __cplusplus #ifdef __cplusplus

View File

@@ -1,32 +1,34 @@
RTL_DIR=../../hw/rtl RTL_DIR=../../hw/rtl
DPI_DIR=../../hw/dpi DPI_DIR=../../hw/dpi
CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors -Wno-array-bounds
CXXFLAGS += -fPIC -Wno-maybe-uninitialized CXXFLAGS += -fPIC -Wno-maybe-uninitialized
CXXFLAGS += -I../../../hw -I../../common CXXFLAGS += -I../../../hw -I../../common
CXXFLAGS += -I../../common/softfloat/source/include CXXFLAGS += -I../../common/softfloat/source/include
LDFLAGS += ../../common/softfloat/build/Linux-x86_64-GCC/softfloat.a LDFLAGS += ../../common/softfloat/build/Linux-x86_64-GCC/softfloat.a
# control RTL debug print states # control RTL debug tracing states
DBG_PRINT_FLAGS += -DDBG_PRINT_PIPELINE DBG_TRACE_FLAGS += -DDBG_TRACE_PIPELINE
DBG_PRINT_FLAGS += -DDBG_PRINT_CORE_ICACHE DBG_TRACE_FLAGS += -DDBG_TRACE_CORE_ICACHE
DBG_PRINT_FLAGS += -DDBG_PRINT_CORE_DCACHE DBG_TRACE_FLAGS += -DDBG_TRACE_CORE_DCACHE
DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_BANK DBG_TRACE_FLAGS += -DDBG_TRACE_CACHE_BANK
DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_MSHR DBG_TRACE_FLAGS += -DDBG_TRACE_CACHE_MSHR
DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_TAG DBG_TRACE_FLAGS += -DDBG_TRACE_CACHE_TAG
DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_DATA DBG_TRACE_FLAGS += -DDBG_TRACE_CACHE_DATA
DBG_PRINT_FLAGS += -DDBG_PRINT_MEM DBG_TRACE_FLAGS += -DDBG_TRACE_MEM
DBG_PRINT_FLAGS += -DDBG_PRINT_OPAE DBG_TRACE_FLAGS += -DDBG_TRACE_OPAE
DBG_PRINT_FLAGS += -DDBG_PRINT_AVS DBG_TRACE_FLAGS += -DDBG_TRACE_AVS
DBG_PRINT_FLAGS += -DDBG_PRINT_SCOPE DBG_TRACE_FLAGS += -DDBG_TRACE_SCOPE
DBG_TRACE_FLAGS += -DDBG_TRACE_TEX
DBG_FLAGS += $(DBG_PRINT_FLAGS) DBG_FLAGS += $(DBG_TRACE_FLAGS)
DBG_FLAGS += -DDBG_CACHE_REQ_INFO DBG_FLAGS += -DDBG_CACHE_REQ_INFO
DBG_FLAGS += -DVCD_OUTPUT DBG_FLAGS += -DVCD_OUTPUT
FPU_INCLUDE = -I$(RTL_DIR)/fp_cores -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src -I$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl -I$(RTL_DIR)/fp_cores/fpnew/src FPU_INCLUDE = -I$(RTL_DIR)/fp_cores -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src -I$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl -I$(RTL_DIR)/fp_cores/fpnew/src
RTL_INCLUDE = -I$(RTL_DIR) -I$(DPI_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/cache -I$(RTL_DIR)/simulate $(FPU_INCLUDE) TEX_INCLUDE = -I$(RTL_DIR)/tex_unit
RTL_INCLUDE = -I$(RTL_DIR) -I$(DPI_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/cache -I$(RTL_DIR)/simulate $(FPU_INCLUDE) $(TEX_INCLUDE)
SRCS = ../common/util.cpp ../common/mem.cpp ../common/rvfloats.cpp SRCS = ../common/util.cpp ../common/mem.cpp ../common/rvfloats.cpp
SRCS += $(DPI_DIR)/util_dpi.cpp $(DPI_DIR)/float_dpi.cpp SRCS += $(DPI_DIR)/util_dpi.cpp $(DPI_DIR)/float_dpi.cpp

View File

@@ -182,7 +182,7 @@ static const char* op_string(const Instr &instr) {
case 2: return "SPLIT"; case 2: return "SPLIT";
case 3: return "JOIN"; case 3: return "JOIN";
case 4: return "BAR"; case 4: return "BAR";
case 5: return "PREFETCH"; case 6: return "PREFETCH";
default: default:
std::abort(); std::abort();
} }

View File

@@ -712,7 +712,7 @@ void Warp::execute(const Instr &instr, Pipeline *pipeline) {
pipeline->stall_warp = true; pipeline->stall_warp = true;
runOnce = true; runOnce = true;
} break; } break;
case 5: { case 6: {
// PREFETCH // PREFETCH
int addr = rsdata[0]; int addr = rsdata[0];
printf("*** PREFETCHED %d ***\n", addr); printf("*** PREFETCHED %d ***\n", addr);

View File

@@ -2,27 +2,28 @@ RTL_DIR = ../../hw/rtl
DPI_DIR = ../../hw/dpi DPI_DIR = ../../hw/dpi
SCRIPT_DIR=../../hw/scripts SCRIPT_DIR=../../hw/scripts
CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors -Wno-array-bounds
CXXFLAGS += -fPIC -Wno-maybe-uninitialized CXXFLAGS += -fPIC -Wno-maybe-uninitialized
CXXFLAGS += -I.. -I../../../hw -I../../common CXXFLAGS += -I.. -I../../../hw -I../../common
CXXFLAGS += -I../../common/softfloat/source/include CXXFLAGS += -I../../common/softfloat/source/include
LDFLAGS += -shared ../../common/softfloat/build/Linux-x86_64-GCC/softfloat.a LDFLAGS += -shared ../../common/softfloat/build/Linux-x86_64-GCC/softfloat.a
# control RTL debug print states # control RTL debug tracing states
DBG_PRINT_FLAGS += -DDBG_PRINT_PIPELINE DBG_TRACE_FLAGS += -DDBG_TRACE_PIPELINE
DBG_PRINT_FLAGS += -DDBG_PRINT_CORE_ICACHE DBG_TRACE_FLAGS += -DDBG_TRACE_CORE_ICACHE
DBG_PRINT_FLAGS += -DDBG_PRINT_CORE_DCACHE DBG_TRACE_FLAGS += -DDBG_TRACE_CORE_DCACHE
DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_BANK DBG_TRACE_FLAGS += -DDBG_TRACE_CACHE_BANK
DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_MSHR DBG_TRACE_FLAGS += -DDBG_TRACE_CACHE_MSHR
DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_TAG DBG_TRACE_FLAGS += -DDBG_TRACE_CACHE_TAG
DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_DATA DBG_TRACE_FLAGS += -DDBG_TRACE_CACHE_DATA
DBG_PRINT_FLAGS += -DDBG_PRINT_MEM DBG_TRACE_FLAGS += -DDBG_TRACE_MEM
DBG_PRINT_FLAGS += -DDBG_PRINT_OPAE DBG_TRACE_FLAGS += -DDBG_TRACE_OPAE
DBG_PRINT_FLAGS += -DDBG_PRINT_AVS DBG_TRACE_FLAGS += -DDBG_TRACE_AVS
DBG_PRINT_FLAGS += -DDBG_PRINT_SCOPE DBG_TRACE_FLAGS += -DDBG_TRACE_SCOPE
DBG_TRACE_FLAGS += -DDBG_TRACE_TEX
DBG_FLAGS += $(DBG_PRINT_FLAGS) DBG_FLAGS += $(DBG_TRACE_FLAGS)
DBG_FLAGS += -DDBG_CACHE_REQ_INFO DBG_FLAGS += -DDBG_CACHE_REQ_INFO
SRCS = ../common/util.cpp ../common/mem.cpp ../common/rvfloats.cpp SRCS = ../common/util.cpp ../common/mem.cpp ../common/rvfloats.cpp
@@ -30,7 +31,8 @@ SRCS += $(DPI_DIR)/util_dpi.cpp $(DPI_DIR)/float_dpi.cpp
SRCS += fpga.cpp opae_sim.cpp SRCS += fpga.cpp opae_sim.cpp
FPU_INCLUDE = -I$(RTL_DIR)/fp_cores -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src -I$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl -I$(RTL_DIR)/fp_cores/fpnew/src FPU_INCLUDE = -I$(RTL_DIR)/fp_cores -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src -I$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl -I$(RTL_DIR)/fp_cores/fpnew/src
RTL_INCLUDE = -I$(RTL_DIR) -I$(DPI_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/cache $(FPU_INCLUDE) TEX_INCLUDE = -I$(RTL_DIR)/tex_unit
RTL_INCLUDE = -I$(RTL_DIR) -I$(DPI_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/cache $(FPU_INCLUDE) $(TEX_INCLUDE)
RTL_INCLUDE += -I$(RTL_DIR)/afu -I$(RTL_DIR)/afu/ccip RTL_INCLUDE += -I$(RTL_DIR)/afu -I$(RTL_DIR)/afu/ccip
TOP = vortex_afu_shim TOP = vortex_afu_shim
@@ -84,12 +86,12 @@ VL_FLAGS += -D$(FPU_CORE)
PROJECT = libopae-c-vlsim PROJECT = libopae-c-vlsim
all: shared all: $(PROJECT).so
vortex_afu.h : $(RTL_DIR)/afu/vortex_afu.vh vortex_afu.h : $(RTL_DIR)/afu/vortex_afu.vh
$(SCRIPT_DIR)/gen_config.py -i $(RTL_DIR)/afu/vortex_afu.vh -o vortex_afu.h $(SCRIPT_DIR)/gen_config.py -i $(RTL_DIR)/afu/vortex_afu.vh -o vortex_afu.h
shared: $(SRCS) vortex_afu.h $(PROJECT).so: $(SRCS) vortex_afu.h
verilator --build $(VL_FLAGS) $(SRCS) -CFLAGS '$(CXXFLAGS)' -LDFLAGS '$(LDFLAGS)' -o ../$(PROJECT).so verilator --build $(VL_FLAGS) $(SRCS) -CFLAGS '$(CXXFLAGS)' -LDFLAGS '$(LDFLAGS)' -o ../$(PROJECT).so
static: $(SRCS) vortex_afu.h static: $(SRCS) vortex_afu.h

View File

@@ -421,7 +421,7 @@ cl_context cl_init_context(int platform, int dev,int quiet) {
#else #else
commandQueue = clCreateCommandQueue(context, commandQueue = clCreateCommandQueue(context,
devices[device_touse], NULL, &status); devices[device_touse], 0, &status);
#endif // PROFILING #endif // PROFILING
@@ -451,8 +451,8 @@ void cl_cleanup()
printf("clReleaseContext()\n"); printf("clReleaseContext()\n");
} }
for (int p = 0; p < numPlatforms; ++p) { for (cl_uint p = 0; p < numPlatforms; ++p) {
for (int d = 0; d < numDevices[p]; ++d) { for (cl_uint d = 0; d < numDevices[p]; ++d) {
status = clReleaseDevice(devices[d]); status = clReleaseDevice(devices[d]);
cl_errChk(status, "Oops!", true); cl_errChk(status, "Oops!", true);
printf("clReleaseDevice()\n"); printf("clReleaseDevice()\n");

View File

@@ -421,7 +421,7 @@ cl_context cl_init_context(int platform, int dev,int quiet) {
#else #else
commandQueue = clCreateCommandQueue(context, commandQueue = clCreateCommandQueue(context,
devices[device_touse], NULL, &status); devices[device_touse], 0, &status);
#endif // PROFILING #endif // PROFILING
@@ -451,8 +451,8 @@ void cl_cleanup()
printf("clReleaseContext()\n"); printf("clReleaseContext()\n");
} }
for (int p = 0; p < numPlatforms; ++p) { for (cl_uint p = 0; p < numPlatforms; ++p) {
for (int d = 0; d < numDevices[p]; ++d) { for (cl_uint d = 0; d < numDevices[p]; ++d) {
status = clReleaseDevice(devices[d]); status = clReleaseDevice(devices[d]);
cl_errChk(status, "Oops!", true); cl_errChk(status, "Oops!", true);
printf("clReleaseDevice()\n"); printf("clReleaseDevice()\n");

Some files were not shown because too many files have changed in this diff Show More