Merge remote-tracking branch 'upstream/master' into vortex2

This commit is contained in:
Hansung Kim
2024-02-01 23:35:58 -08:00
203 changed files with 4383 additions and 21981 deletions

View File

@@ -38,7 +38,7 @@ jobs:
- rm -rf $HOME/build32 && cp -r $PWD $HOME/build32 - rm -rf $HOME/build32 && cp -r $PWD $HOME/build32
- rm -rf $HOME/build64 && cp -r $PWD $HOME/build64 - rm -rf $HOME/build64 && cp -r $PWD $HOME/build64
- make -C $HOME/build32 - make -C $HOME/build32
- XLEN=64 RISCV_TOOLCHAIN_PATH=$TOOLDIR/riscv64-gnu-toolchain make -C $HOME/build64 - XLEN=64 make -C $HOME/build64
- stage: test - stage: test
name: unittest name: unittest
script: cp -r $HOME/build32 build && cd build && ./ci/travis_run.py ./ci/regression.sh --unittest script: cp -r $HOME/build32 build && cd build && ./ci/travis_run.py ./ci/regression.sh --unittest
@@ -47,13 +47,13 @@ jobs:
script: cp -r $HOME/build32 build && cd build && ./ci/travis_run.py ./ci/regression.sh --isa script: cp -r $HOME/build32 build && cd build && ./ci/travis_run.py ./ci/regression.sh --isa
- stage: test - stage: test
name: isa64 name: isa64
script: cp -r $HOME/build64 build && cd build && XLEN=64 RISCV_TOOLCHAIN_PATH=$TOOLDIR/riscv64-gnu-toolchain ./ci/travis_run.py ./ci/regression.sh --isa script: cp -r $HOME/build64 build && cd build && XLEN=64 ./ci/travis_run.py ./ci/regression.sh --isa
- stage: test - stage: test
name: regression name: regression
script: cp -r $HOME/build32 build && cd build && ./ci/travis_run.py ./ci/regression.sh --regression script: cp -r $HOME/build32 build && cd build && ./ci/travis_run.py ./ci/regression.sh --regression
- stage: test - stage: test
name: regression64 name: regression64
script: cp -r $HOME/build64 build && cd build && XLEN=64 RISCV_TOOLCHAIN_PATH=$TOOLDIR/riscv64-gnu-toolchain ./ci/travis_run.py ./ci/regression.sh --regression script: cp -r $HOME/build64 build && cd build && XLEN=64 ./ci/travis_run.py ./ci/regression.sh --regression
- stage: test - stage: test
name: opencl name: opencl
script: cp -r $HOME/build32 build && cd build && ./ci/travis_run.py ./ci/regression.sh --opencl script: cp -r $HOME/build32 build && cd build && ./ci/travis_run.py ./ci/regression.sh --opencl

View File

@@ -33,6 +33,7 @@ Vortex is a full-stack open-source RISC-V GPGPU.
- `miscs`: Miscellaneous resources. - `miscs`: Miscellaneous resources.
## Build Instructions ## Build Instructions
More detailed build instructions can be found [here](docs/install_vortex.md).
### Supported OS Platforms ### Supported OS Platforms
- Ubuntu 18.04 - Ubuntu 18.04
- Centos 7 - Centos 7

View File

@@ -16,7 +16,17 @@
show_usage() show_usage()
{ {
echo "Vortex BlackBox Test Driver v1.0" echo "Vortex BlackBox Test Driver v1.0"
echo "Usage: $0 [[--clusters=#n] [--cores=#n] [--warps=#n] [--threads=#n] [--l2cache] [--l3cache] [[--driver=#name] [--app=#app] [--args=#args] [--debug=#level] [--scope] [--perf=#class] [--rebuild=0|1] [--log=logfile] [--help]]" echo "Usage: $0 [[--clusters=#n] [--cores=#n] [--warps=#n] [--threads=#n] [--l2cache] [--l3cache] [[--driver=#name] [--app=#app] [--args=#args] [--debug=#level] [--scope] [--perf=#class] [--rebuild=#n] [--log=logfile] [--help]]"
}
show_help()
{
show_usage
echo " where"
echo "--driver: simx, rtlsim, oape, xrt"
echo "--app: any subfolder test under regression or opencl"
echo "--class: 0=disable, 1=pipeline, 2=memsys"
echo "--rebuild: 0=disable, 1=force, 2=auto, 3=temp"
} }
SCRIPT_DIR=$(dirname "$0") SCRIPT_DIR=$(dirname "$0")
@@ -36,6 +46,7 @@ SCOPE=0
HAS_ARGS=0 HAS_ARGS=0
PERF_CLASS=0 PERF_CLASS=0
REBUILD=2 REBUILD=2
TEMPBUILD=0
LOGFILE=run.log LOGFILE=run.log
for i in "$@" for i in "$@"
@@ -102,7 +113,7 @@ case $i in
shift shift
;; ;;
--help) --help)
show_usage show_help
exit 0 exit 0
;; ;;
*) *)
@@ -112,6 +123,12 @@ case $i in
esac esac
done done
if [ $REBUILD -eq 3 ];
then
REBUILD=1
TEMPBUILD=1
fi
case $DRIVER in case $DRIVER in
simx) simx)
DRIVER_PATH=$VORTEX_HOME/runtime/simx DRIVER_PATH=$VORTEX_HOME/runtime/simx
@@ -174,26 +191,59 @@ make -C $VORTEX_HOME/runtime/stub > /dev/null
if [ $DEBUG -ne 0 ] if [ $DEBUG -ne 0 ]
then then
# driver initialization
if [ $SCOPE -eq 1 ]
then
echo "running: DEBUG=$DEBUG_LEVEL SCOPE=1 CONFIGS="$CONFIGS" make -C $DRIVER_PATH"
DEBUG=$DEBUG_LEVEL SCOPE=1 CONFIGS="$CONFIGS" make -C $DRIVER_PATH > /dev/null
else
echo "running: DEBUG=$DEBUG_LEVEL CONFIGS="$CONFIGS" make -C $DRIVER_PATH"
DEBUG=$DEBUG_LEVEL CONFIGS="$CONFIGS" make -C $DRIVER_PATH > /dev/null
fi
# running application # running application
if [ $HAS_ARGS -eq 1 ] if [ $TEMPBUILD -eq 1 ]
then then
echo "running: OPTS=$ARGS make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1" # setup temp directory
OPTS=$ARGS make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1 TEMPDIR=$(mktemp -d)
status=$? mkdir -p "$TEMPDIR/$DRIVER"
# driver initialization
if [ $SCOPE -eq 1 ]
then
echo "running: DESTDIR=$TEMPDIR/$DRIVER DEBUG=$DEBUG_LEVEL SCOPE=1 CONFIGS=$CONFIGS make -C $DRIVER_PATH"
DESTDIR="$TEMPDIR/$DRIVER" DEBUG=$DEBUG_LEVEL SCOPE=1 CONFIGS="$CONFIGS" make -C $DRIVER_PATH > /dev/null
else
echo "running: DESTDIR=$TEMPDIR/$DRIVER DEBUG=$DEBUG_LEVEL CONFIGS=$CONFIGS make -C $DRIVER_PATH"
DESTDIR="$TEMPDIR/$DRIVER" DEBUG=$DEBUG_LEVEL CONFIGS="$CONFIGS" make -C $DRIVER_PATH > /dev/null
fi
# running application
if [ $HAS_ARGS -eq 1 ]
then
echo "running: VORTEX_RT_PATH=$TEMPDIR OPTS=$ARGS make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1"
VORTEX_RT_PATH=$TEMPDIR OPTS=$ARGS make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1
status=$?
else
echo "running: VORTEX_RT_PATH=$TEMPDIR make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1"
VORTEX_RT_PATH=$TEMPDIR make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1
status=$?
fi
# cleanup temp directory
trap "rm -rf $TEMPDIR" EXIT
else else
echo "running: make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1" # driver initialization
make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1 if [ $SCOPE -eq 1 ]
status=$? then
echo "running: DEBUG=$DEBUG_LEVEL SCOPE=1 CONFIGS=$CONFIGS make -C $DRIVER_PATH"
DEBUG=$DEBUG_LEVEL SCOPE=1 CONFIGS="$CONFIGS" make -C $DRIVER_PATH > /dev/null
else
echo "running: DEBUG=$DEBUG_LEVEL CONFIGS=$CONFIGS make -C $DRIVER_PATH"
DEBUG=$DEBUG_LEVEL CONFIGS="$CONFIGS" make -C $DRIVER_PATH > /dev/null
fi
# running application
if [ $HAS_ARGS -eq 1 ]
then
echo "running: OPTS=$ARGS make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1"
OPTS=$ARGS make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1
status=$?
else
echo "running: make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1"
make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1
status=$?
fi
fi fi
if [ -f "$APP_PATH/trace.vcd" ] if [ -f "$APP_PATH/trace.vcd" ]
@@ -201,26 +251,59 @@ then
mv -f $APP_PATH/trace.vcd . mv -f $APP_PATH/trace.vcd .
fi fi
else else
# driver initialization if [ $TEMPBUILD -eq 1 ]
if [ $SCOPE -eq 1 ]
then then
echo "running: SCOPE=1 CONFIGS="$CONFIGS" make -C $DRIVER_PATH" # setup temp directory
SCOPE=1 CONFIGS="$CONFIGS" make -C $DRIVER_PATH > /dev/null TEMPDIR=$(mktemp -d)
else mkdir -p "$TEMPDIR/$DRIVER"
echo "running: CONFIGS="$CONFIGS" make -C $DRIVER_PATH"
CONFIGS="$CONFIGS" make -C $DRIVER_PATH > /dev/null
fi
# running application # driver initialization
if [ $HAS_ARGS -eq 1 ] if [ $SCOPE -eq 1 ]
then then
echo "running: OPTS=$ARGS make -C $APP_PATH run-$DRIVER" echo "running: DESTDIR=$TEMPDIR/$DRIVER SCOPE=1 CONFIGS=$CONFIGS make -C $DRIVER_PATH"
OPTS=$ARGS make -C $APP_PATH run-$DRIVER DESTDIR="$TEMPDIR/$DRIVER" SCOPE=1 CONFIGS="$CONFIGS" make -C $DRIVER_PATH > /dev/null
status=$? else
echo "running: DESTDIR=$TEMPDIR/$DRIVER CONFIGS=$CONFIGS make -C $DRIVER_PATH"
DESTDIR="$TEMPDIR/$DRIVER" CONFIGS="$CONFIGS" make -C $DRIVER_PATH > /dev/null
fi
# running application
if [ $HAS_ARGS -eq 1 ]
then
echo "running: VORTEX_RT_PATH=$TEMPDIR OPTS=$ARGS make -C $APP_PATH run-$DRIVER"
VORTEX_RT_PATH=$TEMPDIR OPTS=$ARGS make -C $APP_PATH run-$DRIVER
status=$?
else
echo "running: VORTEX_RT_PATH=$TEMPDIR make -C $APP_PATH run-$DRIVER"
VORTEX_RT_PATH=$TEMPDIR make -C $APP_PATH run-$DRIVER
status=$?
fi
# cleanup temp directory
trap "rm -rf $TEMPDIR" EXIT
else else
echo "running: make -C $APP_PATH run-$DRIVER"
make -C $APP_PATH run-$DRIVER # driver initialization
status=$? if [ $SCOPE -eq 1 ]
then
echo "running: SCOPE=1 CONFIGS=$CONFIGS make -C $DRIVER_PATH"
SCOPE=1 CONFIGS="$CONFIGS" make -C $DRIVER_PATH > /dev/null
else
echo "running: CONFIGS=$CONFIGS make -C $DRIVER_PATH"
CONFIGS="$CONFIGS" make -C $DRIVER_PATH > /dev/null
fi
# running application
if [ $HAS_ARGS -eq 1 ]
then
echo "running: OPTS=$ARGS make -C $APP_PATH run-$DRIVER"
OPTS=$ARGS make -C $APP_PATH run-$DRIVER
status=$?
else
echo "running: make -C $APP_PATH run-$DRIVER"
make -C $APP_PATH run-$DRIVER
status=$?
fi
fi fi
fi fi

View File

@@ -16,14 +16,11 @@
TOOLDIR=${TOOLDIR:=/opt} TOOLDIR=${TOOLDIR:=/opt}
export RISCV_TOOLCHAIN_PATH=$TOOLDIR/riscv-gnu-toolchain
export LLVM_POCL=$TOOLDIR/llvm-pocl
export LLVM_VORTEX=$TOOLDIR/llvm-vortex
export VERILATOR_ROOT=$TOOLDIR/verilator export VERILATOR_ROOT=$TOOLDIR/verilator
export PATH=$VERILATOR_ROOT/bin:$PATH export PATH=$VERILATOR_ROOT/bin:$PATH
export SV2V_PATH=$TOOLDIR/sv2v export SV2V_PATH=$TOOLDIR/sv2v
export PATH=$SV2V_PATH/bin:$PATH export PATH=$SV2V_PATH/bin:$PATH
export YOSYS_PATH=$TOOLDIR/yosys export YOSYS_PATH=$TOOLDIR/yosys
export PATH=$YOSYS_PATH/bin:$PATH export PATH=$YOSYS_PATH/bin:$PATH
export POCL_CC_PATH=$TOOLDIR/pocl/compiler
export POCL_RT_PATH=$TOOLDIR/pocl/runtime

Binary file not shown.

Before

Width:  |  Height:  |  Size: 60 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 207 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 77 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 67 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 463 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 517 KiB

View File

@@ -2,69 +2,26 @@
The Vortex Cache Sub-system has the following main properties: The Vortex Cache Sub-system has the following main properties:
- High-bandwidth with bank parallelism - High-bandwidth transfer with Multi-bank parallelism
- Snoop protocol to flush data for CPU access - Non-blocking pipelined architecture with local MSHR
- Generic design: Dcache, Icache, Shared Memory, L2 cache, L3 cache - Configurable design: Dcache, Icache, L2 cache, L3 cache
### Cache Hierarchy ### Cache Microarchitecture
![Image of Cache Hierarchy](./assets/img/cache_hierarchy.png) ![Image of Cache Hierarchy](./assets/img/cache_microarchitecture.png)
- Cache can be configured to be any level in the hierarchy The Vortex cache is comprised of multiple parallel banks. It is comprised of the following modules:
- Caches communicate via snooping - **Bank request dispatch crossbar**: assign a bank to incoming requests and resolve collision using stalls.
- Cache flush from AFU is passed down the hierarchy - **Bank response merge crossbar**: merge result from banks and forward to the core response.
- **Memory request multiplexer**: arbitrate bank memory requests
- **Memory response demultiplexer**: forward memory response to the corresponding bank.
- **Flush Unit**: perform tag memory initialization.
### VX_cache.v (Top Module) Incoming requests entering the cache are sent to a dispatch crossbar that selects the corresponding bank for each request, resolving bank collisions with stalls. The result output of each bank is merged back into the outgoing response port via a merge crossbar. Each bank integrates a non-blocking pipeline with a local Miss Status Holding Register (MSHR) to reduce the miss rate. The bank pipeline consists of the following stages:
VX.cache.v is the top module of the cache verilog code located in the `/hw/rtl/cache` directory. - **Schedule**: Selects the next request into the pipeline from the incoming core request, memory fill, or the MSHR entry, with priority given to the latter.
- **Tag Access**: A single-port read/write access to the tag store.
- **Data Access**: Single-port read/write access to the data store.
- **Response Handling**: Core response back to the core.
![Image of Vortex Cache](./assets/img/vortex_cache_top_module.png) Deadlocks inside the cache can occur when the MSHR is full and a new request is already in the pipeline. It can also occur when the memory request queue is full, and there is an incoming memory response. The cache mitigates MSHR deadlocks by using an early full signal before a new request is issued and similarly mitigates memory deadlocks by ensuring that its request queue never fills up.
- Configurable (Cache size, number of banks, bank line size, etc.)
- I/O signals
- Core Request
- Core Rsp
- DRAM Req
- DRAM Rsp
- Snoop Rsp
- Snoop Rsp
- Snoop Forwarding Out
- Snoop Forwarding In
- Bank Select
- Assigns valid and ready signals for each bank
- Snoop Forwarder
- DRAM Request Arbiter
- Prepares cache response for communication with DRAM
- Snoop Response Arbiter
- Sends snoop response
- Core Response Merge
- Cache accesses one line at a time. As a result, each request may not come back in the same response. This module tries to recombine the responses by thread ID.
### VX_cache_bank.v
VX_cache_bank.v is the verilog code that handles cache bank functionality and is located in the `/hw/rtl/cache` directory.
![Image of Vortex Cache Bank](./assets/img/vortex_bank.png)
- Allows for high throughput
- Each bank contains queues to hold requests to the cache
- I/O signals
- Core request
- Core Response
- DRAM Fill Requests
- DRAM Fill Response
- DRAM WB Requests
- Snp Request
- Snp Response
- Request Priority: DRAM fill, miss reserve, core request, snoop request
- Snoop Request Queue
- DRAM Fill Queue
- Core Req Arbiter
- Requests to be processed by the bank
- Tag Data Store
- Registers for valid, dirty, dirtyb, tag, and data
- Length of registers determined by lines in the bank
- Tag Data Access:
- I/O: stall, snoop info, force request miss
- Writes to cache or sends read response; hit or miss determined here
- A missed request goes to the miss reserve if it is not a snoop request or DRAM fill

View File

@@ -0,0 +1,36 @@
# Continuous Integration
- Each time you push to the repo, the Continuous Integration pipeline will run
- This pipeline consists of creating the correct development environment, building your code, and running all tests
- This is an extensive pipeline so it might take some time to complete
## Protecting Master Branch
Navigate to your Repository:
Open your repository on GitHub.
Click on "Settings":
In the upper-right corner of your repository page, click on the "Settings" tab.
Select "Branches" in the left sidebar:
On the left sidebar, look for the "Branches" option and click on it.
Choose the Branch:
Under "Branch protection rules," select the branch you want to protect. In this case, choose the main branch.
Enable Branch Protection:
Check the box that says "Protect this branch."
Configure Protection Settings:
You can configure various protection settings. Some common settings include:
Require pull request reviews before merging: This ensures that changes are reviewed before being merged.
Require status checks to pass before merging: This ensures that automated tests and checks are passing.
Require signed commits: This enforces that commits are signed with a verified signature.
Restrict Who Can Push:
You can further restrict who can push directly to the branch. You might want to limit this privilege to specific people or teams.
Save Changes:
Once you've configured the protection settings, scroll down and click on the "Save changes" button.
Now, your main branch is protected, and certain criteria must be met before changes can be pushed directly to it. Contributors will need to create pull requests, have their changes reviewed, and meet other specified criteria before the changes can be merged into the main branch.

18
docs/contributing.md Normal file
View File

@@ -0,0 +1,18 @@
# Contributing to Vortex on Github
## Github Details
- There are two main repos, `vortex` (public, this one) and `vortex-dev` (private)
- todo: Most current development is on `vortex`
- If you have a legacy version of `vortex`, you can use the releases branch or tags to access the repo at that point in time
## Contribution Process
- You should create a new branch from develop that is clearly named with the feature that you want to add
- Avoid pushing directly to the `master` branch; instead, you will need to make a Pull Request (PR)
- There should be protections in place that prevent pushing directly to the main branch, but don't rely on it
- When you make a PR it will be tested against the continuous integration (ci) pipeline (see `continuous_integration.md`)
- It is not sufficient to just write some tests, they need to be incorporated into the ci pipeline to make sure they are run
- During a PR, you might receive feedback regarding your changes and you might need to make further commits to your branch
## Creating and Adding Tests
see `testing.md`

45
docs/environment_setup.md Normal file
View File

@@ -0,0 +1,45 @@
# Environment Setup
These instructions apply to the development vortex repo using the updated toolchain. The updated toolchain is considered to be any commit of `master` pulled from July 2, 2023 onwards. The toolchain update in question can be viewed in this [commit](https://github.com/vortexgpgpu/vortex-dev/commit/0048496ba28d7b9a209a0e569d52d60f2b68fc04). Therefore, if you are unsure whether you are using the new toolchain or not, then you should check the `ci` folder for the existence of the `toolchain_prebuilt.sh` script. Furthermore, you should notice that the `toolchain_install.sh` script has the legacy `llvm()` split into `llvm-vortex()` and `llvm-pocl()`.
## Set Up on Your Own System
The toolchain binaries provided with Vortex are built on Ubuntu-based systems. To install Vortex on your own system, [follow these instructions](install_vortex.md).
## Servers for Georgia Tech Students and Collaborators
### Volvo
Volvo is a 64-core server provided by HPArch. You need valid credentials to access it. If you don't already have access, you can get in contact with your mentor to ask about setting your account up.
Setup on Volvo:
1. Connect to Georgia Tech's VPN or ssh into another machine on campus
2. `ssh volvo.cc.gatech.edu`
3. Clone Vortex to your home directory: `git clone --recursive https://github.com/vortexgpgpu/vortex.git`
4. `source /nethome/software/set_vortex_env.sh` to set up the necessary environment variables.
5. `make -s` in the `vortex` root directory
6. Run a test program: `./ci/blackbox.sh --cores=2 --app=dogfood`
### Nio
Nio is a 20-core desktop server provided by HPArch. If you have access to Volvo, you also have access to Nio.
Setup on Nio:
1. Connect to Georgia Tech's VPN or ssh into another machine on campus
2. `ssh nio.cc.gatech.edu`
3. Clone Vortex to your home directory: `git clone --recursive https://github.com/vortexgpgpu/vortex.git`
4. `source /opt/set_vortex_env_dev.sh` to set up the necessary environment variables.
5. `make -s` in the `vortex` root directory
6. Run a test program: `./ci/blackbox.sh --cores=2 --app=dogfood`
## Docker (Experimental)
Docker allows for isolated pre-built environments to be created, shared and used. The emulation mode required for ARM-based processors will incur a decrease in performance. Currently, the dockerfile is not included with the official vortex repository and is not actively maintained or supported.
### Setup with Docker
1. Clone repo recursively onto your local machine: `git clone --recursive https://github.com/vortexgpgpu/vortex.git`
2. Download the dockerfile from [here](https://github.gatech.edu/gist/usubramanya3/f1bf3e953faa38a6372e1292ffd0b65c) and place it in the root of the repo.
3. Build the Dockerfile into an image: `docker build --platform=linux/amd64 -t vortex -f dockerfile .`
4. Run a container based on the image: `docker run --rm -v ./:/root/vortex/ -it --name vtx-dev --privileged=true --platform=linux/amd64 vortex`
5. Install the toolchain `./ci/toolchain_install.sh --all` (once per container)
6. `make -s` in `vortex` root directory
7. Run a test program: `./ci/blackbox.sh --cores=2 --app=dogfood`
You may exit from a container and resume a container you have exited, or start a second terminal session with `docker exec -it <container-name> bash`.

View File

@@ -9,9 +9,6 @@ OPAE Environment Setup
$ export C_INCLUDE_PATH=$OPAE_HOME/include:$C_INCLUDE_PATH $ export C_INCLUDE_PATH=$OPAE_HOME/include:$C_INCLUDE_PATH
$ export LIBRARY_PATH=$OPAE_HOME/lib:$LIBRARY_PATH $ export LIBRARY_PATH=$OPAE_HOME/lib:$LIBRARY_PATH
$ export LD_LIBRARY_PATH=$OPAE_HOME/lib:$LD_LIBRARY_PATH $ export LD_LIBRARY_PATH=$OPAE_HOME/lib:$LD_LIBRARY_PATH
$ export RISCV_TOOLCHAIN_PATH=/opt/riscv-gnu-toolchain
$ export PATH=:/opt/verilator/bin:$PATH
$ export VERILATOR_ROOT=/opt/verilator
OPAE Build OPAE Build
------------------ ------------------

View File

@@ -13,7 +13,8 @@
## Installation ## Installation
- Refer to the build instructions in [README](../README.md). - For the different environments Vortex supports, [read this document](environment_setup.md).
- To install on your own system, [follow this document](install_vortex.md).
## Quick Start Scenarios ## Quick Start Scenarios

124
docs/install_vortex.md Normal file
View File

@@ -0,0 +1,124 @@
# Installing and Setting Up the Vortex Environment
## Ubuntu 18.04, 20.04
1. Install the following dependencies:
```
sudo apt-get install build-essential zlib1g-dev libtinfo-dev libncurses5 uuid-dev libboost-serialization-dev libpng-dev libhwloc-dev
```
2. Upgrade gcc to 11:
```
sudo apt-get install gcc-11 g++-11
```
Multiple gcc versions on Ubuntu can be managed with update-alternatives, e.g.:
```
sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-9 9
sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-9 9
sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 11
sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-11 11
```
3. Download the Vortex codebase:
```
git clone --recursive https://github.com/vortexgpgpu/vortex.git
```
4. Install Vortex's prebuilt toolchain:
```
cd vortex
sudo ./ci/toolchain_install.sh --all
# By default, the toolchain will install to /opt folder. This is recommended, but you can install the toolchain to a different directory by setting DESTDIR.
DESTDIR=$TOOLDIR ./ci/toolchain_install.sh --all
```
5. Set up environment:
```
export VORTEX_HOME=$TOOLDIR/vortex
export LLVM_VORTEX=$TOOLDIR/llvm-vortex
export LLVM_POCL=$TOOLDIR/llvm-pocl
export POCL_CC_PATH=$TOOLDIR/pocl/compiler
export POCL_RT_PATH=$TOOLDIR/pocl/runtime
export RISCV_TOOLCHAIN_PATH=$TOOLDIR/riscv-gnu-toolchain
export VERILATOR_ROOT=$TOOLDIR/verilator
export SV2V_PATH=$TOOLDIR/sv2v
export YOSYS_PATH=$TOOLDIR/yosys
export PATH=$YOSYS_PATH/bin:$SV2V_PATH/bin:$VERILATOR_ROOT/bin:$PATH
```
6. Build Vortex
```
make
```
## RHEL 8
Note: depending on the system, some of the toolchain may need to be recompiled for non-Ubuntu Linux. The source for the tools can be found [here](https://github.com/vortexgpgpu/).
1. Install the following dependencies:
```
sudo yum install libpng-devel boost boost-devel boost-serialization libuuid-devel opencl-headers hwloc hwloc-devel gmp-devel compat-hwloc1
```
2. Upgrade gcc to 11:
```
sudo yum install gcc-toolset-11
```
Multiple gcc versions on Red Hat can be managed with scl
3. Install MPFR 4.2.0:
Download [the source](https://ftp.gnu.org/gnu/mpfr/) and follow [the installation documentation](https://www.mpfr.org/mpfr-current/mpfr.html#How-to-Install).
4. Download the Vortex codebase:
```
git clone --recursive https://github.com/vortexgpgpu/vortex.git
```
5. Install Vortex's prebuilt toolchain:
```
cd vortex
sudo ./ci/toolchain_install.sh --all
# By default, the toolchain will install to /opt folder. This is recommended, but you can install the toolchain to a different directory by setting DESTDIR.
DESTDIR=$TOOLDIR ./ci/toolchain_install.sh --all
```
6. Set up environment:
```
export VORTEX_HOME=$TOOLDIR/vortex
export LLVM_VORTEX=$TOOLDIR/llvm-vortex
export LLVM_POCL=$TOOLDIR/llvm-pocl
export POCL_CC_PATH=$TOOLDIR/pocl/compiler
export POCL_RT_PATH=$TOOLDIR/pocl/runtime
export RISCV_TOOLCHAIN_PATH=$TOOLDIR/riscv-gnu-toolchain
export VERILATOR_ROOT=$TOOLDIR/verilator
export SV2V_PATH=$TOOLDIR/sv2v
export YOSYS_PATH=$TOOLDIR/yosys
export PATH=$YOSYS_PATH/bin:$SV2V_PATH/bin:$VERILATOR_ROOT/bin:$PATH
export LD_LIBRARY_PATH=<path to mpfr>/src/.libs:$LD_LIBRARY_PATH
```
7. Build Vortex
```
make
```

View File

@@ -24,71 +24,57 @@ Vortex uses the SIMT (Single Instruction, Multiple Threads) execution model with
- Control the number of warps to activate during execution - Control the number of warps to activate during execution
- `WSPAWN` *count, addr*: activate count warps and jump to addr location - `WSPAWN` *count, addr*: activate count warps and jump to addr location
- **Control-Flow Divergence** - **Control-Flow Divergence**
- Control threads to activate when a branch diverges - Control threads activation when a branch diverges
- `SPLIT` *predicate*: apply 'taken' predicate thread mask adn save 'not-taken' into IPDOM stack - `SPLIT` *taken, predicate*: apply predicate thread mask and save current state into IPDOM stack
- `JOIN`: restore 'not-taken' thread mask - `JOIN`: pop IPDOM stack to restore thread mask
- `PRED` *predicate, restore_mask*: thread predicate instruction
- **Warp Synchronization** - **Warp Synchronization**
- `BAR` *id, count*: stall warps entering barrier *id* until count is reached - `BAR` *id, count*: stall warps entering barrier *id* until count is reached
### Vortex Pipeline/Datapath ### Vortex Pipeline/Datapath
![Image of Vortex Microarchitecture](./assets/img/vortex_microarchitecture_v2.png) ![Image of Vortex Microarchitecture](./assets/img/vortex_microarchitecture.png)
Vortex has a 5-stage pipeline: FI | ID | Issue | EX | WB. Vortex has a 6-stage pipeline:
- **Schedule**
- Warp Scheduler
- Schedule the next PC into the pipeline
- Track stalled, active warps
- IPDOM Stack
- Save split/join states for divergent threads
- Inflight Tracker
- Track in-flight instructions
- **Fetch** - **Fetch**
- Warp Scheduler - Retrieve instructions from memory
- Track stalled & active warps, resolve branches and barriers, maintain split/join IPDOM stack - Handle I-cache requests/responses
- Instruction Cache
- Retrieve instruction from cache, issue I-cache requests/responses
- **Decode** - **Decode**
- Decode fetched instructions, notify warp scheduler when the following instructions are decoded: - Decode fetched instructions
- Branch, tmc, split/join, wspawn - Notify warp scheduler on control instructions
- Precompute used_regs mask (needed for Issue stage)
- **Issue** - **Issue**
- Scheduling
- In-order issue (operands/execute unit ready), out-of-order commit
- IBuffer - IBuffer
- Store fetched instructions, separate queues per-warp, selects next warp through round-robin scheduling - Store decoded instructions in separate per-warp queues
- Scoreboard - Scoreboard
- Track in-use registers - Track in-use registers
- GPRs (General-Purpose Registers) stage - Check register use for decoded instructions
- Fetch issued instruction operands and send operands to execute unit - Operands Collector
- Fetch the operands for issued instructions from the register file
- **Execute** - **Execute**
- ALU Unit - ALU Unit
- Single-cycle operations (+,-,>>,<<,&,|,^), Branch instructions (Share ALU resources) - Handle arithmetic and branch operations
- MULDIV Unit
- Multiplier - done in 2 cycles
- Divider - division and remainder, done in 32 cycles
- Implements serial alogrithm (Stalls the pipeline)
- FPU Unit - FPU Unit
- Multi-cycle operations, uses `FPnew` Library on ASIC, uses hard DSPs on FPGA - Handle floating-point operations
- CSR Unit
- Store constant status registers - device caps, FPU status flags, performance counters
- Handle external CSR requests (requests from host CPU)
- LSU Unit - LSU Unit
- Handle load/store operations, issue D-cache requests, handle D-cache responses - Handle load/store operations
- Commit load responses - saves storage, Scoreboard tracks completion - SFU Unit
- GPGPU Unit - Handle warp control operations
- Handle GPGPU instructions - Handle Control Status Registers (CSRs) operations
- TMC, WSPAWN, SPLIT, BAR
- JOIN is handled by Warp Scheduler (upon SPLIT response)
- **Commit** - **Commit**
- Commit - Write result back to the register file and update the Scoreboard.
- Update CSR flags, update performance counters
- Writeback ### Vortex clustering architecture
- Write result back to GPRs, notify Scoreboard (release in-use register), select candidate instruction (ALU unit has highest priority) - Sockets
- **Clustering** - Grouping multiple cores sharing L1 cache
- Group mulitple cores into clusters (optionally share L2 cache) - Clusters
- Group multiple clusters (optionally share L3 cache) - Grouping of sockets sharing L2 cache
- Configurable at build time
- Default configuration:
- #Clusters = 1
- #Cores = 4
- #Warps = 4
- #Threads = 4
- **FPGA AFU Interface**
- Manage CPU-GPU comunication
- Query devices caps, load kernel instructions and resource buffers, start kernel execution, read destination buffers
- Local Memory - GPU access to local DRAM
- Reserved I/O addresses - redirect to host CPU, console output

View File

@@ -2,18 +2,18 @@
## Running a Vortex application ## Running a Vortex application
The framework provides a utility script: blakcbox.sh under the /ci/ folder for executing applications in the tests tree. The framework provides a utility script: blackbox.sh under the /ci/ folder for executing applications in the tests tree.
You can query the commandline options of the tool using: You can query the commandline options of the tool using:
$ ./ci/blakcbox.sh --help $ ./ci/blackbox.sh --help
To execute sgemm test program on the simx driver and passing "-n10" as argument to sgemm: To execute sgemm test program on the simx driver and passing "-n10" as argument to sgemm:
$ ./ci/blakcbox.sh --driver=simx --app=sgemm --args="-n10" $ ./ci/blackbox.sh --driver=simx --app=sgemm --args="-n10"
You can execute the same application of a GPU architecture with 2 cores: You can execute the same application of a GPU architecture with 2 cores:
$ ./ci/blakcbox.sh --core=2 --driver=simx --app=sgemm --args="-n10" $ ./ci/blackbox.sh --core=2 --driver=simx --app=sgemm --args="-n10"
When executing, Blackbox needs to recompile the driver if the desired architecture changes. When executing, Blackbox needs to recompile the driver if the desired architecture changes.
It tracks the latest configuration in a file under the current directory blackbox.<driver>.cache. It tracks the latest configuration in a file under the current directory blackbox.<driver>.cache.
@@ -31,3 +31,17 @@ You can execute the default opncl suite by running the following commands at the
$ make -C tests/opencl run-simx $ make -C tests/opencl run-simx
$ make -C tests/opencl run-rtlsim $ make -C tests/opencl run-rtlsim
## Creating Your Own Regression Tests
- Inside `tests/` you will find a series of folders which are named based on what they test
- You can view the tests to see which ones have tests similar to what you are trying to create new tests for
- Once you have found a similar baseline, you can copy the folder and rename it to what you are planning to test
- `testcases.h` contains each of the test case templates
- `main.cpp` contains the implementation of each of the test cases and builds a test suite of all the test cases you want
Compile the test case: `make -C tests/regression/<testcase-name>/ clean-all && make -C tests/regression/<testcase-name>/`
Run the test case: `./ci/blackbox.sh --driver=simx --cores=4 --app=<testcase-name> --debug`
## Adding Your Tests to the CI Pipeline
see `continuous_integration.md`

View File

@@ -45,6 +45,15 @@ module VX_cluster import VX_gpu_pkg::*; #(
`SCOPE_IO_SWITCH (scope_socket + `NUM_SOCKETS); `SCOPE_IO_SWITCH (scope_socket + `NUM_SOCKETS);
`endif `endif
`ifdef PERF_ENABLE
VX_mem_perf_if mem_perf_tmp_if();
assign mem_perf_tmp_if.icache = 'x;
assign mem_perf_tmp_if.dcache = 'x;
assign mem_perf_tmp_if.l3cache = mem_perf_if.l3cache;
assign mem_perf_tmp_if.smem = 'x;
assign mem_perf_tmp_if.mem = mem_perf_if.mem;
`endif
`ifdef GBAR_ENABLE `ifdef GBAR_ENABLE
VX_gbar_bus_if per_socket_gbar_bus_if[`NUM_SOCKETS](); VX_gbar_bus_if per_socket_gbar_bus_if[`NUM_SOCKETS]();
@@ -69,18 +78,7 @@ module VX_cluster import VX_gpu_pkg::*; #(
.reset (gbar_reset), .reset (gbar_reset),
.gbar_bus_if (gbar_bus_if) .gbar_bus_if (gbar_bus_if)
); );
`endif
`ifdef PERF_ENABLE
VX_mem_perf_if mem_perf_tmp_if();
cache_perf_t perf_l2cache;
assign mem_perf_tmp_if.icache = 'x;
assign mem_perf_tmp_if.dcache = 'x;
assign mem_perf_tmp_if.l2cache = perf_l2cache;
assign mem_perf_tmp_if.l3cache = mem_perf_if.l3cache;
assign mem_perf_tmp_if.smem = 'x;
assign mem_perf_tmp_if.mem = mem_perf_if.mem;
`endif `endif
VX_mem_bus_if #( VX_mem_bus_if #(
@@ -102,7 +100,7 @@ module VX_cluster import VX_gpu_pkg::*; #(
.MSHR_SIZE (`L2_MSHR_SIZE), .MSHR_SIZE (`L2_MSHR_SIZE),
.MRSQ_SIZE (`L2_MRSQ_SIZE), .MRSQ_SIZE (`L2_MRSQ_SIZE),
.MREQ_SIZE (`L2_MREQ_SIZE), .MREQ_SIZE (`L2_MREQ_SIZE),
.TAG_WIDTH (L1_MEM_ARB_TAG_WIDTH), .TAG_WIDTH (L2_TAG_WIDTH),
.WRITE_ENABLE (1), .WRITE_ENABLE (1),
.UUID_WIDTH (`UUID_WIDTH), .UUID_WIDTH (`UUID_WIDTH),
.CORE_OUT_REG (2), .CORE_OUT_REG (2),
@@ -113,7 +111,7 @@ module VX_cluster import VX_gpu_pkg::*; #(
.clk (clk), .clk (clk),
.reset (l2_reset), .reset (l2_reset),
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
.cache_perf (perf_l2cache), .cache_perf (mem_perf_tmp_if.l2cache),
`endif `endif
.core_bus_if (per_socket_mem_bus_if), .core_bus_if (per_socket_mem_bus_if),
.mem_bus_if (mem_bus_if) .mem_bus_if (mem_bus_if)
@@ -146,6 +144,7 @@ module VX_cluster import VX_gpu_pkg::*; #(
.SOCKET_ID ((CLUSTER_ID * `NUM_SOCKETS) + i) .SOCKET_ID ((CLUSTER_ID * `NUM_SOCKETS) + i)
) socket ( ) socket (
`SCOPE_IO_BIND (scope_socket+i) `SCOPE_IO_BIND (scope_socket+i)
.clk (clk), .clk (clk),
.reset (socket_reset), .reset (socket_reset),
@@ -167,6 +166,6 @@ module VX_cluster import VX_gpu_pkg::*; #(
); );
end end
`BUFFER_BUSY (busy, (| per_socket_busy), (`NUM_SOCKETS > 1)); `BUFFER_EX(busy, (| per_socket_busy), 1'b1, (`NUM_SOCKETS > 1));
endmodule endmodule

View File

@@ -191,13 +191,21 @@
`define STALL_TIMEOUT (100000 * (1 ** (`L2_ENABLED + `L3_ENABLED))) `define STALL_TIMEOUT (100000 * (1 ** (`L2_ENABLED + `L3_ENABLED)))
`endif `endif
`ifndef SV_DPI
`define DPI_DISABLE
`endif
`ifndef FPU_FPNEW `ifndef FPU_FPNEW
`ifndef FPU_DSP `ifndef FPU_DSP
`ifndef FPU_DPI `ifndef FPU_DPI
`ifdef SYNTHESIS `ifndef SYNTHESIS
`define FPU_DSP `ifndef DPI_DISABLE
`else
`define FPU_DPI `define FPU_DPI
`else
`define FPU_DSP
`endif
`else
`define FPU_DSP
`endif `endif
`endif `endif
`endif `endif
@@ -223,18 +231,18 @@
// Number of ALU units // Number of ALU units
`ifndef NUM_ALU_LANES `ifndef NUM_ALU_LANES
`define NUM_ALU_LANES `UP(`NUM_THREADS / 2) `define NUM_ALU_LANES `NUM_THREADS
`endif `endif
`ifndef NUM_ALU_BLOCKS `ifndef NUM_ALU_BLOCKS
`define NUM_ALU_BLOCKS `UP(`ISSUE_WIDTH / 1) `define NUM_ALU_BLOCKS `ISSUE_WIDTH
`endif `endif
// Number of FPU units // Number of FPU units
`ifndef NUM_FPU_LANES `ifndef NUM_FPU_LANES
`define NUM_FPU_LANES `UP(`NUM_THREADS / 2) `define NUM_FPU_LANES `NUM_THREADS
`endif `endif
`ifndef NUM_FPU_BLOCKS `ifndef NUM_FPU_BLOCKS
`define NUM_FPU_BLOCKS `UP(`ISSUE_WIDTH / 1) `define NUM_FPU_BLOCKS `ISSUE_WIDTH
`endif `endif
// Number of LSU units // Number of LSU units
@@ -258,7 +266,10 @@
`endif `endif
// LSU Duplicate Address Check // LSU Duplicate Address Check
`ifdef LSU_DUP `ifndef LSU_DUP_DISABLE
`define LSU_DUP_ENABLE
`endif
`ifdef LSU_DUP_ENABLE
`define LSU_DUP_ENABLED 1 `define LSU_DUP_ENABLED 1
`else `else
`define LSU_DUP_ENABLED 0 `define LSU_DUP_ENABLED 0
@@ -285,8 +296,8 @@
// Floating-Point Units /////////////////////////////////////////////////////// // Floating-Point Units ///////////////////////////////////////////////////////
// Size of FPU Request Queue // Size of FPU Request Queue
`ifndef FPU_REQ_QUEUE_SIZE `ifndef FPUQ_SIZE
`define FPU_REQ_QUEUE_SIZE (2 * (`NUM_THREADS / `NUM_FPU_LANES)) `define FPUQ_SIZE (2 * (`NUM_THREADS / `NUM_FPU_LANES))
`endif `endif
// FNCP Latency // FNCP Latency
@@ -377,7 +388,7 @@
// Number of Cache Units // Number of Cache Units
`ifndef NUM_ICACHES `ifndef NUM_ICACHES
`define NUM_ICACHES `UP(`NUM_CORES / 4) `define NUM_ICACHES `UP(`SOCKET_SIZE / 4)
`endif `endif
// Cache Size // Cache Size
@@ -407,7 +418,7 @@
// Number of Associative Ways // Number of Associative Ways
`ifndef ICACHE_NUM_WAYS `ifndef ICACHE_NUM_WAYS
`define ICACHE_NUM_WAYS 2 `define ICACHE_NUM_WAYS 1
`endif `endif
// Dcache Configurable Knobs ////////////////////////////////////////////////// // Dcache Configurable Knobs //////////////////////////////////////////////////
@@ -426,7 +437,7 @@
// Number of Cache Units // Number of Cache Units
`ifndef NUM_DCACHES `ifndef NUM_DCACHES
`define NUM_DCACHES `UP(`NUM_CORES / 4) `define NUM_DCACHES `UP(`SOCKET_SIZE / 4)
`endif `endif
// Cache Size // Cache Size
@@ -436,7 +447,7 @@
// Number of Banks // Number of Banks
`ifndef DCACHE_NUM_BANKS `ifndef DCACHE_NUM_BANKS
`define DCACHE_NUM_BANKS (`NUM_LSU_LANES) `define DCACHE_NUM_BANKS `MIN(`NUM_LSU_LANES, 4)
`endif `endif
// Core Response Queue Size // Core Response Queue Size
@@ -461,7 +472,7 @@
// Number of Associative Ways // Number of Associative Ways
`ifndef DCACHE_NUM_WAYS `ifndef DCACHE_NUM_WAYS
`define DCACHE_NUM_WAYS 2 `define DCACHE_NUM_WAYS 1
`endif `endif
// SM Configurable Knobs ////////////////////////////////////////////////////// // SM Configurable Knobs //////////////////////////////////////////////////////
@@ -520,7 +531,7 @@
// Number of Associative Ways // Number of Associative Ways
`ifndef L2_NUM_WAYS `ifndef L2_NUM_WAYS
`define L2_NUM_WAYS 4 `define L2_NUM_WAYS 2
`endif `endif
// L3cache Configurable Knobs ///////////////////////////////////////////////// // L3cache Configurable Knobs /////////////////////////////////////////////////

View File

@@ -57,10 +57,18 @@
`define EX_ALU 0 `define EX_ALU 0
`define EX_LSU 1 `define EX_LSU 1
`define EX_SFU 2 `define EX_SFU 2
`define EX_FPU 3 `define EX_FPU (`EX_SFU + `EXT_F_ENABLED)
`define NUM_EX_UNITS (3 + `EXT_F_ENABLED) `define NUM_EX_UNITS (3 + `EXT_F_ENABLED)
`define EX_BITS `CLOG2(`NUM_EX_UNITS) `define EX_BITS `CLOG2(`NUM_EX_UNITS)
`define EX_WIDTH `UP(`EX_BITS)
`define SFU_CSRS 0
`define SFU_WCTL 1
`define NUM_SFU_UNITS (2)
`define SFU_BITS `CLOG2(`NUM_SFU_UNITS)
`define SFU_WIDTH `UP(`SFU_BITS)
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
@@ -307,20 +315,20 @@
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
`define BUFFER_BUSY(dst, src, enable) \ `define BUFFER_EX(dst, src, ena, latency) \
logic __busy; \ VX_pipe_register #( \
if (enable) begin \ .DATAW ($bits(dst)), \
always @(posedge clk) begin \ .RESETW ($bits(dst)), \
if (reset) begin \ .DEPTH (latency) \
__busy <= 1'b0; \ ) __``dst ( \
end else begin \ .clk (clk), \
__busy <= src; \ .reset (reset), \
end \ .enable (ena), \
end \ .data_in (src), \
end else begin \ .data_out (dst) \
assign __busy = src; \ )
end \
assign dst = __busy `define BUFFER(dst, src) `BUFFER_EX(dst, src, 1'b1, 1)
`define POP_COUNT_EX(out, in, model) \ `define POP_COUNT_EX(out, in, model) \
VX_popcount #( \ VX_popcount #( \
@@ -369,35 +377,32 @@
VX_dcr_bus_if dst(); \ VX_dcr_bus_if dst(); \
assign {dst.write_valid, dst.write_addr, dst.write_data} = __``dst assign {dst.write_valid, dst.write_addr, dst.write_data} = __``dst
`define PERF_REDUCE(dst, src, field, width, count) \ `define PERF_COUNTER_ADD(dst, src, field, width, dst_count, src_count, reg_enable) \
wire [count-1:0][width-1:0] __reduce_add_i_``src``field; \ for (genvar __d = 0; __d < dst_count; ++__d) begin \
wire [width-1:0] __reduce_add_o_``dst``field; \ localparam __count = ((src_count > dst_count) ? ((src_count + dst_count - 1) / dst_count) : 1); \
reg [width-1:0] __reduce_add_r_``dst``field; \ wire [__count-1:0][width-1:0] __reduce_add_i_``src``field; \
for (genvar __i = 0; __i < count; ++__i) begin \ wire [width-1:0] __reduce_add_o_``dst``field; \
assign __reduce_add_i_``src``field[__i] = ``src[__i].``field; \ for (genvar __i = 0; __i < __count; ++__i) begin \
end \ assign __reduce_add_i_``src``field[__i] = ``src[__d * __count + __i].``field; \
VX_reduce #(.DATAW_IN(width), .N(count), .OP("+")) __reduce_add_``dst``field ( \ end \
__reduce_add_i_``src``field, \ VX_reduce #(.DATAW_IN(width), .N(__count), .OP("+")) __reduce_add_``dst``field ( \
__reduce_add_o_``dst``field \ __reduce_add_i_``src``field, \
); \ __reduce_add_o_``dst``field \
always @(posedge clk) begin \ ); \
if (reset) begin \ if (reg_enable) begin \
__reduce_add_r_``dst``field <= '0; \ reg [width-1:0] __reduce_add_r_``dst``field; \
end else begin \ always @(posedge clk) begin \
__reduce_add_r_``dst``field <= __reduce_add_o_``dst``field; \ if (reset) begin \
end \ __reduce_add_r_``dst``field <= '0; \
end \ end else begin \
assign ``dst.``field = __reduce_add_r_``dst``field __reduce_add_r_``dst``field <= __reduce_add_o_``dst``field; \
end \
`define PERF_CACHE_REDUCE(dst, src, count) \ end \
`PERF_REDUCE (dst, src, reads, `PERF_CTR_BITS, count); \ assign ``dst[__d].``field = __reduce_add_r_``dst``field; \
`PERF_REDUCE (dst, src, writes, `PERF_CTR_BITS, count); \ end else begin \
`PERF_REDUCE (dst, src, read_misses, `PERF_CTR_BITS, count); \ assign ``dst[__d].``field = __reduce_add_o_``dst``field; \
`PERF_REDUCE (dst, src, write_misses, `PERF_CTR_BITS, count); \ end \
`PERF_REDUCE (dst, src, bank_stalls, `PERF_CTR_BITS, count); \ end
`PERF_REDUCE (dst, src, mshr_stalls, `PERF_CTR_BITS, count); \
`PERF_REDUCE (dst, src, mem_stalls, `PERF_CTR_BITS, count); \
`PERF_REDUCE (dst, src, crsp_stalls, `PERF_CTR_BITS, count)
`define ASSIGN_BLOCKED_WID(dst, src, block_idx, block_size) \ `define ASSIGN_BLOCKED_WID(dst, src, block_idx, block_size) \
if (block_size != 1) begin \ if (block_size != 1) begin \
@@ -410,8 +415,22 @@
assign dst = src; \ assign dst = src; \
end end
`define TO_DISPATCH_DATA(data, tid) \ `define TO_DISPATCH_DATA(data, tid) { \
{data.uuid, data.wis, data.tmask, data.op_type, data.op_mod, data.wb, data.use_PC, data.use_imm, data.PC, data.imm, data.rd, tid, data.rs1_data, data.rs2_data, data.rs3_data} data.uuid, \
data.wis, \
data.tmask, \
data.op_type, \
data.op_mod, \
data.wb, \
data.use_PC, \
data.use_imm, \
data.PC, \
data.imm, \
data.rd, \
tid, \
data.rs1_data, \
data.rs2_data, \
data.rs3_data}
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////

View File

@@ -99,7 +99,7 @@ package VX_gpu_pkg;
`ifdef ICACHE_ENABLE `ifdef ICACHE_ENABLE
localparam ICACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_MEM_TAG_WIDTH(`ICACHE_MSHR_SIZE, 1, `NUM_ICACHES); localparam ICACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_MEM_TAG_WIDTH(`ICACHE_MSHR_SIZE, 1, `NUM_ICACHES);
`else `else
localparam ICACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_BYPASS_TAG_WIDTH(1, ICACHE_LINE_SIZE, ICACHE_WORD_SIZE, ICACHE_TAG_WIDTH, `NUM_SOCKETS, `NUM_ICACHES); localparam ICACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_BYPASS_TAG_WIDTH(1, ICACHE_LINE_SIZE, ICACHE_WORD_SIZE, ICACHE_TAG_WIDTH, `SOCKET_SIZE, `NUM_ICACHES);
`endif `endif
////////////////////////// Dcache Parameters ////////////////////////////// ////////////////////////// Dcache Parameters //////////////////////////////
@@ -142,10 +142,13 @@ package VX_gpu_pkg;
/////////////////////////////// L1 Parameters ///////////////////////////// /////////////////////////////// L1 Parameters /////////////////////////////
localparam L1_MEM_TAG_WIDTH = `MAX(ICACHE_MEM_TAG_WIDTH, DCACHE_MEM_TAG_WIDTH); localparam L1_MEM_TAG_WIDTH = `MAX(ICACHE_MEM_TAG_WIDTH, DCACHE_MEM_TAG_WIDTH);
localparam L1_MEM_ARB_TAG_WIDTH = (L1_MEM_TAG_WIDTH + `CLOG2(2)); localparam L1_MEM_ARB_TAG_WIDTH = (L1_MEM_TAG_WIDTH + `CLOG2(2));
/////////////////////////////// L2 Parameters ///////////////////////////// /////////////////////////////// L2 Parameters /////////////////////////////
localparam ICACHE_MEM_ARB_IDX = 0;
localparam DCACHE_MEM_ARB_IDX = ICACHE_MEM_ARB_IDX + 1;
// Word size in bytes // Word size in bytes
localparam L2_WORD_SIZE = `L1_LINE_SIZE; localparam L2_WORD_SIZE = `L1_LINE_SIZE;
@@ -190,42 +193,46 @@ package VX_gpu_pkg;
/////////////////////////////// Issue parameters ////////////////////////// /////////////////////////////// Issue parameters //////////////////////////
localparam ISSUE_IDX_W = `LOG2UP(`ISSUE_WIDTH); localparam ISSUE_ISW = `CLOG2(`ISSUE_WIDTH);
localparam ISSUE_ISW_W = `UP(ISSUE_ISW);
localparam ISSUE_RATIO = `NUM_WARPS / `ISSUE_WIDTH; localparam ISSUE_RATIO = `NUM_WARPS / `ISSUE_WIDTH;
localparam ISSUE_WIS_W = `LOG2UP(ISSUE_RATIO); localparam ISSUE_WIS = `CLOG2(ISSUE_RATIO);
localparam ISSUE_ADDRW = `LOG2UP(`NUM_REGS * (ISSUE_RATIO)); localparam ISSUE_WIS_W = `UP(ISSUE_WIS);
`IGNORE_UNUSED_BEGIN `IGNORE_UNUSED_BEGIN
function logic [ISSUE_IDX_W-1:0] wid_to_isw( function logic [`NW_WIDTH-1:0] wis_to_wid(
input logic [ISSUE_WIS_W-1:0] wis,
input logic [ISSUE_ISW_W-1:0] isw
);
if (ISSUE_WIS == 0) begin
wis_to_wid = `NW_WIDTH'(isw);
end else if (ISSUE_ISW == 0) begin
wis_to_wid = `NW_WIDTH'(wis);
end else begin
wis_to_wid = `NW_WIDTH'({wis, isw});
end
endfunction
function logic [ISSUE_ISW_W-1:0] wid_to_isw(
input logic [`NW_WIDTH-1:0] wid input logic [`NW_WIDTH-1:0] wid
); );
if (`ISSUE_WIDTH > 1) begin if (ISSUE_ISW != 0) begin
wid_to_isw = ISSUE_IDX_W'(wid); wid_to_isw = wid[ISSUE_ISW_W-1:0];
end else begin end else begin
wid_to_isw = 0; wid_to_isw = 0;
end end
endfunction endfunction
`IGNORE_UNUSED_END
function logic [`NW_WIDTH-1:0] wis_to_wid(
input logic [ISSUE_WIS_W-1:0] wis,
input logic [ISSUE_IDX_W-1:0] isw
);
wis_to_wid = `NW_WIDTH'({wis, isw} >> (ISSUE_IDX_W-`CLOG2(`ISSUE_WIDTH)));
endfunction
function logic [ISSUE_WIS_W-1:0] wid_to_wis( function logic [ISSUE_WIS_W-1:0] wid_to_wis(
input logic [`NW_WIDTH-1:0] wid input logic [`NW_WIDTH-1:0] wid
); );
wid_to_wis = ISSUE_WIS_W'(wid >> `CLOG2(`ISSUE_WIDTH)); if (ISSUE_WIS != 0) begin
endfunction wid_to_wis = ISSUE_WIS_W'(wid >> ISSUE_ISW);
end else begin
function logic [ISSUE_ADDRW-1:0] wis_to_addr( wid_to_wis = 0;
input logic [`NR_BITS-1:0] rid, end
input logic [ISSUE_WIS_W-1:0] wis
);
wis_to_addr = ISSUE_ADDRW'({rid, wis} >> (ISSUE_WIS_W-`CLOG2(ISSUE_RATIO)));
endfunction endfunction
`IGNORE_UNUSED_END
endpackage endpackage

View File

@@ -14,7 +14,7 @@
`ifndef VX_PLATFORM_VH `ifndef VX_PLATFORM_VH
`define VX_PLATFORM_VH `define VX_PLATFORM_VH
`ifndef SYNTHESIS `ifdef SV_DPI
`include "util_dpi.vh" `include "util_dpi.vh"
`endif `endif

View File

@@ -65,59 +65,12 @@ module VX_socket import VX_gpu_pkg::*; #(
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
VX_mem_perf_if mem_perf_tmp_if(); VX_mem_perf_if mem_perf_tmp_if();
cache_perf_t perf_icache;
cache_perf_t perf_dcache;
assign mem_perf_tmp_if.icache = perf_icache;
assign mem_perf_tmp_if.dcache = perf_dcache;
assign mem_perf_tmp_if.l2cache = mem_perf_if.l2cache; assign mem_perf_tmp_if.l2cache = mem_perf_if.l2cache;
assign mem_perf_tmp_if.l3cache = mem_perf_if.l3cache; assign mem_perf_tmp_if.l3cache = mem_perf_if.l3cache;
assign mem_perf_tmp_if.smem = 'x; assign mem_perf_tmp_if.smem = 'x;
assign mem_perf_tmp_if.mem = mem_perf_if.mem; assign mem_perf_tmp_if.mem = mem_perf_if.mem;
`endif `endif
VX_mem_bus_if #(
.DATA_SIZE (ICACHE_LINE_SIZE),
.TAG_WIDTH (ICACHE_MEM_TAG_WIDTH)
) icache_mem_bus_if();
VX_mem_bus_if #(
.DATA_SIZE (DCACHE_LINE_SIZE),
.TAG_WIDTH (DCACHE_MEM_TAG_WIDTH)
) dcache_mem_bus_if();
VX_mem_bus_if #(
.DATA_SIZE (`L1_LINE_SIZE),
.TAG_WIDTH (L1_MEM_TAG_WIDTH)
) cache_mem_bus_if[2]();
VX_mem_bus_if #(
.DATA_SIZE (`L1_LINE_SIZE),
.TAG_WIDTH (L1_MEM_ARB_TAG_WIDTH)
) mem_bus_tmp_if[1]();
`ASSIGN_VX_MEM_BUS_IF_X (cache_mem_bus_if[0], icache_mem_bus_if, L1_MEM_TAG_WIDTH, ICACHE_MEM_TAG_WIDTH);
`ASSIGN_VX_MEM_BUS_IF_X (cache_mem_bus_if[1], dcache_mem_bus_if, L1_MEM_TAG_WIDTH, DCACHE_MEM_TAG_WIDTH);
`RESET_RELAY (mem_arb_reset, reset);
VX_mem_arb #(
.NUM_INPUTS (2),
.DATA_SIZE (`L1_LINE_SIZE),
.TAG_WIDTH (L1_MEM_TAG_WIDTH),
.TAG_SEL_IDX (1), // Skip 0 for NC flag
.ARBITER ("R"),
.OUT_REG_REQ (2),
.OUT_REG_RSP (2)
) mem_arb (
.clk (clk),
.reset (mem_arb_reset),
.bus_in_if (cache_mem_bus_if),
.bus_out_if (mem_bus_tmp_if)
);
`ASSIGN_VX_MEM_BUS_IF (mem_bus_if, mem_bus_tmp_if[0]);
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
VX_mem_bus_if #( VX_mem_bus_if #(
@@ -125,6 +78,11 @@ module VX_socket import VX_gpu_pkg::*; #(
.TAG_WIDTH (ICACHE_TAG_WIDTH) .TAG_WIDTH (ICACHE_TAG_WIDTH)
) per_core_icache_bus_if[`SOCKET_SIZE](); ) per_core_icache_bus_if[`SOCKET_SIZE]();
VX_mem_bus_if #(
.DATA_SIZE (ICACHE_LINE_SIZE),
.TAG_WIDTH (ICACHE_MEM_TAG_WIDTH)
) icache_mem_bus_if();
`RESET_RELAY (icache_reset, reset); `RESET_RELAY (icache_reset, reset);
VX_cache_cluster #( VX_cache_cluster #(
@@ -149,7 +107,7 @@ module VX_socket import VX_gpu_pkg::*; #(
.MEM_OUT_REG (2) .MEM_OUT_REG (2)
) icache ( ) icache (
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
.cache_perf (perf_icache), .cache_perf (mem_perf_tmp_if.icache),
`endif `endif
.clk (clk), .clk (clk),
.reset (icache_reset), .reset (icache_reset),
@@ -164,6 +122,11 @@ module VX_socket import VX_gpu_pkg::*; #(
.TAG_WIDTH (DCACHE_NOSM_TAG_WIDTH) .TAG_WIDTH (DCACHE_NOSM_TAG_WIDTH)
) per_core_dcache_bus_if[`SOCKET_SIZE * DCACHE_NUM_REQS](); ) per_core_dcache_bus_if[`SOCKET_SIZE * DCACHE_NUM_REQS]();
VX_mem_bus_if #(
.DATA_SIZE (DCACHE_LINE_SIZE),
.TAG_WIDTH (DCACHE_MEM_TAG_WIDTH)
) dcache_mem_bus_if();
`RESET_RELAY (dcache_reset, reset); `RESET_RELAY (dcache_reset, reset);
VX_cache_cluster #( VX_cache_cluster #(
@@ -189,7 +152,7 @@ module VX_socket import VX_gpu_pkg::*; #(
.MEM_OUT_REG (2) .MEM_OUT_REG (2)
) dcache ( ) dcache (
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
.cache_perf (perf_dcache), .cache_perf (mem_perf_tmp_if.dcache),
`endif `endif
.clk (clk), .clk (clk),
.reset (dcache_reset), .reset (dcache_reset),
@@ -199,6 +162,40 @@ module VX_socket import VX_gpu_pkg::*; #(
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
VX_mem_bus_if #(
.DATA_SIZE (`L1_LINE_SIZE),
.TAG_WIDTH (L1_MEM_TAG_WIDTH)
) l1_mem_bus_if[2]();
VX_mem_bus_if #(
.DATA_SIZE (`L1_LINE_SIZE),
.TAG_WIDTH (L1_MEM_ARB_TAG_WIDTH)
) l1_mem_arb_bus_if[1]();
`ASSIGN_VX_MEM_BUS_IF_X (l1_mem_bus_if[0], icache_mem_bus_if, L1_MEM_TAG_WIDTH, ICACHE_MEM_TAG_WIDTH);
`ASSIGN_VX_MEM_BUS_IF_X (l1_mem_bus_if[1], dcache_mem_bus_if, L1_MEM_TAG_WIDTH, DCACHE_MEM_TAG_WIDTH);
`RESET_RELAY (mem_arb_reset, reset);
VX_mem_arb #(
.NUM_INPUTS (2),
.DATA_SIZE (`L1_LINE_SIZE),
.TAG_WIDTH (L1_MEM_TAG_WIDTH),
.TAG_SEL_IDX (1), // Skip 0 for NC flag
.ARBITER ("R"),
.OUT_REG_REQ (2),
.OUT_REG_RSP (2)
) mem_arb (
.clk (clk),
.reset (mem_arb_reset),
.bus_in_if (l1_mem_bus_if),
.bus_out_if (l1_mem_arb_bus_if)
);
`ASSIGN_VX_MEM_BUS_IF (mem_bus_if, l1_mem_arb_bus_if[0]);
///////////////////////////////////////////////////////////////////////////
wire [`SOCKET_SIZE-1:0] per_core_sim_ebreak; wire [`SOCKET_SIZE-1:0] per_core_sim_ebreak;
wire [`SOCKET_SIZE-1:0][`NUM_REGS-1:0][`XLEN-1:0] per_core_sim_wb_value; wire [`SOCKET_SIZE-1:0][`NUM_REGS-1:0][`XLEN-1:0] per_core_sim_wb_value;
assign sim_ebreak = per_core_sim_ebreak[0]; assign sim_ebreak = per_core_sim_ebreak[0];
@@ -245,6 +242,6 @@ module VX_socket import VX_gpu_pkg::*; #(
); );
end end
`BUFFER_BUSY (busy, (| per_core_busy), (`SOCKET_SIZE > 1)); `BUFFER_EX(busy, (| per_core_busy), 1'b1, (`SOCKET_SIZE > 1));
endmodule endmodule

View File

@@ -58,6 +58,8 @@
`define VX_CSR_MPM_BASE 12'hB00 `define VX_CSR_MPM_BASE 12'hB00
`define VX_CSR_MPM_BASE_H 12'hB80 `define VX_CSR_MPM_BASE_H 12'hB80
`define VX_CSR_MPM_USER 12'hB03
`define VX_CSR_MPM_USER_H 12'hB83
// Machine Performance-monitoring core counters // Machine Performance-monitoring core counters
// PERF: Standard // PERF: Standard
@@ -68,29 +70,38 @@
`define VX_CSR_MINSTRET 12'hB02 `define VX_CSR_MINSTRET 12'hB02
`define VX_CSR_MINSTRET_H 12'hB82 `define VX_CSR_MINSTRET_H 12'hB82
// PERF: pipeline // PERF: pipeline
`define VX_CSR_MPM_IBUF_ST 12'hB03 `define VX_CSR_MPM_SCHED_ID 12'hB03
`define VX_CSR_MPM_IBUF_ST_H 12'hB83 `define VX_CSR_MPM_SCHED_ID_H 12'hB83
`define VX_CSR_MPM_SCRB_ST 12'hB04 `define VX_CSR_MPM_SCHED_ST 12'hB04
`define VX_CSR_MPM_SCRB_ST_H 12'hB84 `define VX_CSR_MPM_SCHED_ST_H 12'hB84
`define VX_CSR_MPM_ALU_ST 12'hB05 `define VX_CSR_MPM_IBUF_ST 12'hB05
`define VX_CSR_MPM_ALU_ST_H 12'hB85 `define VX_CSR_MPM_IBUF_ST_H 12'hB85
`define VX_CSR_MPM_LSU_ST 12'hB06 `define VX_CSR_MPM_SCRB_ST 12'hB06
`define VX_CSR_MPM_LSU_ST_H 12'hB86 `define VX_CSR_MPM_SCRB_ST_H 12'hB86
`define VX_CSR_MPM_FPU_ST 12'hB07 `define VX_CSR_MPM_SCRB_ALU 12'hB07
`define VX_CSR_MPM_FPU_ST_H 12'hB87 `define VX_CSR_MPM_SCRB_ALU_H 12'hB87
`define VX_CSR_MPM_SFU_ST 12'hB08 `define VX_CSR_MPM_SCRB_FPU 12'hB08
`define VX_CSR_MPM_SFU_ST_H 12'hB88 `define VX_CSR_MPM_SCRB_FPU_H 12'hB88
`define VX_CSR_MPM_SCRB_LSU 12'hB09
`define VX_CSR_MPM_SCRB_LSU_H 12'hB89
`define VX_CSR_MPM_SCRB_SFU 12'hB0A
`define VX_CSR_MPM_SCRB_SFU_H 12'hB8A
// PERF: memory // PERF: memory
`define VX_CSR_MPM_IFETCHES 12'hB0A `define VX_CSR_MPM_IFETCHES 12'hB0B
`define VX_CSR_MPM_IFETCHES_H 12'hB8A `define VX_CSR_MPM_IFETCHES_H 12'hB8B
`define VX_CSR_MPM_LOADS 12'hB0B `define VX_CSR_MPM_LOADS 12'hB0C
`define VX_CSR_MPM_LOADS_H 12'hB8B `define VX_CSR_MPM_LOADS_H 12'hB8C
`define VX_CSR_MPM_STORES 12'hB0C `define VX_CSR_MPM_STORES 12'hB0D
`define VX_CSR_MPM_STORES_H 12'hB8C `define VX_CSR_MPM_STORES_H 12'hB8D
`define VX_CSR_MPM_IFETCH_LAT 12'hB0D `define VX_CSR_MPM_IFETCH_LT 12'hB0E
`define VX_CSR_MPM_IFETCH_LAT_H 12'hB8D `define VX_CSR_MPM_IFETCH_LT_H 12'hB8E
`define VX_CSR_MPM_LOAD_LAT 12'hB0E `define VX_CSR_MPM_LOAD_LT 12'hB0F
`define VX_CSR_MPM_LOAD_LAT_H 12'hB8E `define VX_CSR_MPM_LOAD_LT_H 12'hB8F
// SFU: scoreboard
`define VX_CSR_MPM_SCRB_WCTL 12'hB10
`define VX_CSR_MPM_SCRB_WCTL_H 12'hB90
`define VX_CSR_MPM_SCRB_CSRS 12'hB11
`define VX_CSR_MPM_SCRB_CSRS_H 12'hB91
// Machine Performance-monitoring memory counters // Machine Performance-monitoring memory counters
// PERF: icache // PERF: icache
@@ -98,59 +109,61 @@
`define VX_CSR_MPM_ICACHE_READS_H 12'hB83 `define VX_CSR_MPM_ICACHE_READS_H 12'hB83
`define VX_CSR_MPM_ICACHE_MISS_R 12'hB04 // read misses `define VX_CSR_MPM_ICACHE_MISS_R 12'hB04 // read misses
`define VX_CSR_MPM_ICACHE_MISS_R_H 12'hB84 `define VX_CSR_MPM_ICACHE_MISS_R_H 12'hB84
`define VX_CSR_MPM_ICACHE_MSHR_ST 12'hB05 // MSHR stalls
`define VX_CSR_MPM_ICACHE_MSHR_ST_H 12'hB85
// PERF: dcache // PERF: dcache
`define VX_CSR_MPM_DCACHE_READS 12'hB05 // total reads `define VX_CSR_MPM_DCACHE_READS 12'hB06 // total reads
`define VX_CSR_MPM_DCACHE_READS_H 12'hB85 `define VX_CSR_MPM_DCACHE_READS_H 12'hB86
`define VX_CSR_MPM_DCACHE_WRITES 12'hB06 // total writes `define VX_CSR_MPM_DCACHE_WRITES 12'hB07 // total writes
`define VX_CSR_MPM_DCACHE_WRITES_H 12'hB86 `define VX_CSR_MPM_DCACHE_WRITES_H 12'hB87
`define VX_CSR_MPM_DCACHE_MISS_R 12'hB07 // read misses `define VX_CSR_MPM_DCACHE_MISS_R 12'hB08 // read misses
`define VX_CSR_MPM_DCACHE_MISS_R_H 12'hB87 `define VX_CSR_MPM_DCACHE_MISS_R_H 12'hB88
`define VX_CSR_MPM_DCACHE_MISS_W 12'hB08 // write misses `define VX_CSR_MPM_DCACHE_MISS_W 12'hB09 // write misses
`define VX_CSR_MPM_DCACHE_MISS_W_H 12'hB88 `define VX_CSR_MPM_DCACHE_MISS_W_H 12'hB89
`define VX_CSR_MPM_DCACHE_BANK_ST 12'hB09 // bank conflicts `define VX_CSR_MPM_DCACHE_BANK_ST 12'hB0A // bank conflicts
`define VX_CSR_MPM_DCACHE_BANK_ST_H 12'hB89 `define VX_CSR_MPM_DCACHE_BANK_ST_H 12'hB8A
`define VX_CSR_MPM_DCACHE_MSHR_ST 12'hB0A // MSHR stalls `define VX_CSR_MPM_DCACHE_MSHR_ST 12'hB0B // MSHR stalls
`define VX_CSR_MPM_DCACHE_MSHR_ST_H 12'hB8A `define VX_CSR_MPM_DCACHE_MSHR_ST_H 12'hB8B
// PERF: smem
`define VX_CSR_MPM_SMEM_READS 12'hB0B // memory reads
`define VX_CSR_MPM_SMEM_READS_H 12'hB8B
`define VX_CSR_MPM_SMEM_WRITES 12'hB0C // memory writes
`define VX_CSR_MPM_SMEM_WRITES_H 12'hB8C
`define VX_CSR_MPM_SMEM_BANK_ST 12'hB0D // bank conflicts
`define VX_CSR_MPM_SMEM_BANK_ST_H 12'hB8D
// PERF: l2cache // PERF: l2cache
`define VX_CSR_MPM_L2CACHE_READS 12'hB0E // total reads `define VX_CSR_MPM_L2CACHE_READS 12'hB0C // total reads
`define VX_CSR_MPM_L2CACHE_READS_H 12'hB8E `define VX_CSR_MPM_L2CACHE_READS_H 12'hB8C
`define VX_CSR_MPM_L2CACHE_WRITES 12'hB0F // total writes `define VX_CSR_MPM_L2CACHE_WRITES 12'hB0D // total writes
`define VX_CSR_MPM_L2CACHE_WRITES_H 12'hB8F `define VX_CSR_MPM_L2CACHE_WRITES_H 12'hB8D
`define VX_CSR_MPM_L2CACHE_MISS_R 12'hB10 // read misses `define VX_CSR_MPM_L2CACHE_MISS_R 12'hB0E // read misses
`define VX_CSR_MPM_L2CACHE_MISS_R_H 12'hB90 `define VX_CSR_MPM_L2CACHE_MISS_R_H 12'hB8E
`define VX_CSR_MPM_L2CACHE_MISS_W 12'hB11 // write misses `define VX_CSR_MPM_L2CACHE_MISS_W 12'hB0F // write misses
`define VX_CSR_MPM_L2CACHE_MISS_W_H 12'hB91 `define VX_CSR_MPM_L2CACHE_MISS_W_H 12'hB8F
`define VX_CSR_MPM_L2CACHE_BANK_ST 12'hB12 // bank conflicts `define VX_CSR_MPM_L2CACHE_BANK_ST 12'hB10 // bank conflicts
`define VX_CSR_MPM_L2CACHE_BANK_ST_H 12'hB92 `define VX_CSR_MPM_L2CACHE_BANK_ST_H 12'hB90
`define VX_CSR_MPM_L2CACHE_MSHR_ST 12'hB13 // MSHR stalls `define VX_CSR_MPM_L2CACHE_MSHR_ST 12'hB11 // MSHR stalls
`define VX_CSR_MPM_L2CACHE_MSHR_ST_H 12'hB93 `define VX_CSR_MPM_L2CACHE_MSHR_ST_H 12'hB91
// PERF: l3cache // PERF: l3cache
`define VX_CSR_MPM_L3CACHE_READS 12'hB14 // total reads `define VX_CSR_MPM_L3CACHE_READS 12'hB12 // total reads
`define VX_CSR_MPM_L3CACHE_READS_H 12'hB94 `define VX_CSR_MPM_L3CACHE_READS_H 12'hB92
`define VX_CSR_MPM_L3CACHE_WRITES 12'hB15 // total writes `define VX_CSR_MPM_L3CACHE_WRITES 12'hB13 // total writes
`define VX_CSR_MPM_L3CACHE_WRITES_H 12'hB95 `define VX_CSR_MPM_L3CACHE_WRITES_H 12'hB93
`define VX_CSR_MPM_L3CACHE_MISS_R 12'hB16 // read misses `define VX_CSR_MPM_L3CACHE_MISS_R 12'hB14 // read misses
`define VX_CSR_MPM_L3CACHE_MISS_R_H 12'hB96 `define VX_CSR_MPM_L3CACHE_MISS_R_H 12'hB94
`define VX_CSR_MPM_L3CACHE_MISS_W 12'hB17 // write misses `define VX_CSR_MPM_L3CACHE_MISS_W 12'hB15 // write misses
`define VX_CSR_MPM_L3CACHE_MISS_W_H 12'hB97 `define VX_CSR_MPM_L3CACHE_MISS_W_H 12'hB95
`define VX_CSR_MPM_L3CACHE_BANK_ST 12'hB18 // bank conflicts `define VX_CSR_MPM_L3CACHE_BANK_ST 12'hB16 // bank conflicts
`define VX_CSR_MPM_L3CACHE_BANK_ST_H 12'hB98 `define VX_CSR_MPM_L3CACHE_BANK_ST_H 12'hB96
`define VX_CSR_MPM_L3CACHE_MSHR_ST 12'hB19 // MSHR stalls `define VX_CSR_MPM_L3CACHE_MSHR_ST 12'hB17 // MSHR stalls
`define VX_CSR_MPM_L3CACHE_MSHR_ST_H 12'hB99 `define VX_CSR_MPM_L3CACHE_MSHR_ST_H 12'hB97
// PERF: memory // PERF: memory
`define VX_CSR_MPM_MEM_READS 12'hB1A // total reads `define VX_CSR_MPM_MEM_READS 12'hB18 // total reads
`define VX_CSR_MPM_MEM_READS_H 12'hB9A `define VX_CSR_MPM_MEM_READS_H 12'hB98
`define VX_CSR_MPM_MEM_WRITES 12'hB1B // total writes `define VX_CSR_MPM_MEM_WRITES 12'hB19 // total writes
`define VX_CSR_MPM_MEM_WRITES_H 12'hB9B `define VX_CSR_MPM_MEM_WRITES_H 12'hB99
`define VX_CSR_MPM_MEM_LAT 12'hB1C // memory latency `define VX_CSR_MPM_MEM_LT 12'hB1A // memory latency
`define VX_CSR_MPM_MEM_LAT_H 12'hB9C `define VX_CSR_MPM_MEM_LT_H 12'hB9A
// PERF: smem
`define VX_CSR_MPM_SMEM_READS 12'hB1B // memory reads
`define VX_CSR_MPM_SMEM_READS_H 12'hB9B
`define VX_CSR_MPM_SMEM_WRITES 12'hB1C // memory writes
`define VX_CSR_MPM_SMEM_WRITES_H 12'hB9C
`define VX_CSR_MPM_SMEM_BANK_ST 12'hB1D // bank conflicts
`define VX_CSR_MPM_SMEM_BANK_ST_H 12'hB9D
// Machine Information Registers // Machine Information Registers

View File

@@ -46,15 +46,9 @@ module Vortex import VX_gpu_pkg::*; (
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
VX_mem_perf_if mem_perf_if(); VX_mem_perf_if mem_perf_if();
cache_perf_t perf_l3cache; assign mem_perf_if.icache = 'x;
mem_perf_t mem_perf; assign mem_perf_if.dcache = 'x;
assign mem_perf_if.icache = 'x;
assign mem_perf_if.dcache = 'x;
assign mem_perf_if.l2cache = 'x; assign mem_perf_if.l2cache = 'x;
assign mem_perf_if.l3cache = perf_l3cache;
assign mem_perf_if.smem = 'x;
assign mem_perf_if.mem = mem_perf;
`endif `endif
VX_mem_bus_if #( VX_mem_bus_if #(
@@ -93,7 +87,7 @@ module Vortex import VX_gpu_pkg::*; (
.reset (l3_reset), .reset (l3_reset),
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
.cache_perf (perf_l3cache), .cache_perf (mem_perf_if.l3cache),
`endif `endif
.core_bus_if (per_cluster_mem_bus_if), .core_bus_if (per_cluster_mem_bus_if),
@@ -166,11 +160,12 @@ module Vortex import VX_gpu_pkg::*; (
); );
end end
`BUFFER_BUSY (busy, (| per_cluster_busy), (`NUM_CLUSTERS > 1)); `BUFFER_EX(busy, (| per_cluster_busy), 1'b1, (`NUM_CLUSTERS > 1));
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
reg [`PERF_CTR_BITS-1:0] perf_mem_pending_reads; reg [`PERF_CTR_BITS-1:0] perf_mem_pending_reads;
mem_perf_t mem_perf;
always @(posedge clk) begin always @(posedge clk) begin
if (reset) begin if (reset) begin
@@ -181,19 +176,19 @@ module Vortex import VX_gpu_pkg::*; (
end end
end end
wire mem_rd_req_fire = mem_req_fire && ~mem_bus_if.req_data.rw;
wire mem_wr_req_fire = mem_req_fire && mem_bus_if.req_data.rw;
always @(posedge clk) begin always @(posedge clk) begin
if (reset) begin if (reset) begin
mem_perf <= '0; mem_perf <= '0;
end else begin end else begin
if (mem_req_fire && ~mem_bus_if.req_data.rw) begin mem_perf.reads <= mem_perf.reads + `PERF_CTR_BITS'(mem_rd_req_fire);
mem_perf.reads <= mem_perf.reads + `PERF_CTR_BITS'(1); mem_perf.writes <= mem_perf.writes + `PERF_CTR_BITS'(mem_wr_req_fire);
end
if (mem_req_fire && mem_bus_if.req_data.rw) begin
mem_perf.writes <= mem_perf.writes + `PERF_CTR_BITS'(1);
end
mem_perf.latency <= mem_perf.latency + perf_mem_pending_reads; mem_perf.latency <= mem_perf.latency + perf_mem_pending_reads;
end end
end end
assign mem_perf_if.mem = mem_perf;
`endif `endif

View File

@@ -262,7 +262,7 @@ module VX_afu_wrap #(
.m_axi_awready (m_axi_mem_awready_a), .m_axi_awready (m_axi_mem_awready_a),
.m_axi_awaddr (m_axi_mem_awaddr_w), .m_axi_awaddr (m_axi_mem_awaddr_w),
.m_axi_awid (m_axi_mem_awid_a), .m_axi_awid (m_axi_mem_awid_a),
`UNUSED_PIN (m_axi_awlen), .m_axi_awlen (m_axi_mem_awlen_a),
`UNUSED_PIN (m_axi_awsize), `UNUSED_PIN (m_axi_awsize),
`UNUSED_PIN (m_axi_awburst), `UNUSED_PIN (m_axi_awburst),
`UNUSED_PIN (m_axi_awlock), `UNUSED_PIN (m_axi_awlock),

View File

@@ -530,14 +530,17 @@ module VX_cache import VX_gpu_pkg::*; #(
wire [`CLOG2(NUM_REQS+1)-1:0] perf_core_reads_per_cycle; wire [`CLOG2(NUM_REQS+1)-1:0] perf_core_reads_per_cycle;
wire [`CLOG2(NUM_REQS+1)-1:0] perf_core_writes_per_cycle; wire [`CLOG2(NUM_REQS+1)-1:0] perf_core_writes_per_cycle;
wire [NUM_REQS-1:0] perf_core_reads_per_req = core_req_valid & core_req_ready & ~core_req_rw; wire [NUM_REQS-1:0] perf_core_reads_per_req;
wire [NUM_REQS-1:0] perf_core_writes_per_req = core_req_valid & core_req_ready & core_req_rw; wire [NUM_REQS-1:0] perf_core_writes_per_req;
// per cycle: read misses, write misses, msrq stalls, pipeline stalls // per cycle: read misses, write misses, msrq stalls, pipeline stalls
wire [`CLOG2(NUM_BANKS+1)-1:0] perf_read_miss_per_cycle; wire [`CLOG2(NUM_BANKS+1)-1:0] perf_read_miss_per_cycle;
wire [`CLOG2(NUM_BANKS+1)-1:0] perf_write_miss_per_cycle; wire [`CLOG2(NUM_BANKS+1)-1:0] perf_write_miss_per_cycle;
wire [`CLOG2(NUM_BANKS+1)-1:0] perf_mshr_stall_per_cycle; wire [`CLOG2(NUM_BANKS+1)-1:0] perf_mshr_stall_per_cycle;
wire [`CLOG2(NUM_BANKS+1)-1:0] perf_crsp_stall_per_cycle; wire [`CLOG2(NUM_REQS+1)-1:0] perf_crsp_stall_per_cycle;
`BUFFER(perf_core_reads_per_req, core_req_valid & core_req_ready & ~core_req_rw);
`BUFFER(perf_core_writes_per_req, core_req_valid & core_req_ready & core_req_rw);
`POP_COUNT(perf_core_reads_per_cycle, perf_core_reads_per_req); `POP_COUNT(perf_core_reads_per_cycle, perf_core_reads_per_req);
`POP_COUNT(perf_core_writes_per_cycle, perf_core_writes_per_req); `POP_COUNT(perf_core_writes_per_cycle, perf_core_writes_per_req);

View File

@@ -83,8 +83,9 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
`STATIC_ASSERT(NUM_INPUTS >= NUM_CACHES, ("invalid parameter")) `STATIC_ASSERT(NUM_INPUTS >= NUM_CACHES, ("invalid parameter"))
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
cache_perf_t perf_cache_unit[NUM_CACHES]; cache_perf_t perf_cache_tmp[1], perf_cache_unit[NUM_CACHES];
`PERF_CACHE_REDUCE (cache_perf, perf_cache_unit, NUM_CACHES); `PERF_CACHE_ADD (perf_cache_tmp, perf_cache_unit, 1, NUM_CACHES)
assign cache_perf = perf_cache_tmp[0];
`endif `endif
VX_mem_bus_if #( VX_mem_bus_if #(

View File

@@ -1,190 +0,0 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_cache_define.vh"
// Flat-port wrapper around VX_cache_cluster.
// The core-side and memory-side request/response channels are exposed as
// plain packed-array ports, copied field by field onto VX_mem_bus_if
// interface instances, and those interfaces are handed to the
// VX_cache_cluster instance at the bottom of this module.
module VX_cache_cluster_top import VX_gpu_pkg::*; #(
parameter `STRING INSTANCE_ID = "",
// The three parameters below are forwarded unchanged to VX_cache_cluster
parameter NUM_UNITS = 2,
parameter NUM_INPUTS = 4,
parameter TAG_SEL_IDX = 0,
// Number of Word requests per cycle
parameter NUM_REQS = 4,
// Size of cache in bytes
parameter CACHE_SIZE = 16384,
// Size of line inside a bank in bytes
parameter LINE_SIZE = 16,
// Number of banks
parameter NUM_BANKS = 4,
// Number of associative ways
parameter NUM_WAYS = 4,
// Size of a word in bytes
parameter WORD_SIZE = 4,
// Core Response Queue Size
parameter CRSQ_SIZE = 2,
// Miss Reserv Queue Knob
parameter MSHR_SIZE = 16,
// Memory Response Queue Size
parameter MRSQ_SIZE = 0,
// Memory Request Queue Size
parameter MREQ_SIZE = 4,
// Enable cache writeable
parameter WRITE_ENABLE = 1,
// Request debug identifier
parameter UUID_WIDTH = 0,
// core request tag size
parameter TAG_WIDTH = UUID_WIDTH + 16,
// enable bypass for non-cacheable addresses
parameter NC_ENABLE = 1,
// Core response output register
parameter CORE_OUT_REG = 2,
// Memory request output register
parameter MEM_OUT_REG = 2,
// Derived parameters: computed from the values above and used only to size
// the memory-side tag ports; none of them is passed to the VX_cache_cluster
// instance below.
// NOTE(review): `UP presumably clamps NUM_UNITS to at least 1 - confirm against its definition.
parameter NUM_CACHES = `UP(NUM_UNITS),
// NUM_UNITS == 0 selects the bypass tag-width macros in MEM_TAG_WIDTH below
parameter PASSTHRU = (NUM_UNITS == 0),
// Core tag widened by the selector bits of the NUM_INPUTS -> NUM_CACHES arbitration
parameter ARB_TAG_WIDTH = TAG_WIDTH + `ARB_SEL_BITS(NUM_INPUTS, NUM_CACHES),
// Memory tag width, chosen by (PASSTHRU, NC_ENABLE) from the four width macros
parameter MEM_TAG_WIDTH = PASSTHRU ? (NC_ENABLE ? `CACHE_NC_BYPASS_TAG_WIDTH(NUM_REQS, LINE_SIZE, WORD_SIZE, ARB_TAG_WIDTH) :
`CACHE_BYPASS_TAG_WIDTH(NUM_REQS, LINE_SIZE, WORD_SIZE, ARB_TAG_WIDTH)) :
(NC_ENABLE ? `CACHE_NC_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS, NUM_REQS, LINE_SIZE, WORD_SIZE, ARB_TAG_WIDTH) :
`CACHE_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS)),
// Memory tag widened by the selector bits of the NUM_CACHES -> 1 arbitration;
// this is the width of the mem_req_tag / mem_rsp_tag ports
parameter MEM_TAG_X_WIDTH = MEM_TAG_WIDTH + `ARB_SEL_BITS(NUM_CACHES, 1)
) (
input wire clk,
input wire reset,
// PERF
// (port exists only when PERF_ENABLE is defined)
`ifdef PERF_ENABLE
output cache_perf_t cache_perf,
`endif
// Core request
// (indexed [input][request]; one entry per core_bus_if element)
input wire [NUM_INPUTS-1:0][NUM_REQS-1:0] core_req_valid,
input wire [NUM_INPUTS-1:0][NUM_REQS-1:0] core_req_rw,
input wire [NUM_INPUTS-1:0][NUM_REQS-1:0][WORD_SIZE-1:0] core_req_byteen,
input wire [NUM_INPUTS-1:0][NUM_REQS-1:0][`CS_WORD_ADDR_WIDTH-1:0] core_req_addr,
input wire [NUM_INPUTS-1:0][NUM_REQS-1:0][`CS_WORD_WIDTH-1:0] core_req_data,
input wire [NUM_INPUTS-1:0][NUM_REQS-1:0][TAG_WIDTH-1:0] core_req_tag,
output wire [NUM_INPUTS-1:0][NUM_REQS-1:0] core_req_ready,
// Core response
output wire [NUM_INPUTS-1:0][NUM_REQS-1:0] core_rsp_valid,
output wire [NUM_INPUTS-1:0][NUM_REQS-1:0][`CS_WORD_WIDTH-1:0] core_rsp_data,
output wire [NUM_INPUTS-1:0][NUM_REQS-1:0][TAG_WIDTH-1:0] core_rsp_tag,
input wire [NUM_INPUTS-1:0][NUM_REQS-1:0] core_rsp_ready,
// Memory request
// (single channel, LINE_SIZE bytes wide)
output wire mem_req_valid,
output wire mem_req_rw,
output wire [LINE_SIZE-1:0] mem_req_byteen,
output wire [`CS_MEM_ADDR_WIDTH-1:0] mem_req_addr,
output wire [`CS_LINE_WIDTH-1:0] mem_req_data,
output wire [MEM_TAG_X_WIDTH-1:0] mem_req_tag,
input wire mem_req_ready,
// Memory response
input wire mem_rsp_valid,
input wire [`CS_LINE_WIDTH-1:0] mem_rsp_data,
input wire [MEM_TAG_X_WIDTH-1:0] mem_rsp_tag,
output wire mem_rsp_ready
);
// Core-side bus: one interface per (input, request) pair, stored as a flat
// array of NUM_INPUTS * NUM_REQS elements
VX_mem_bus_if #(
.DATA_SIZE (WORD_SIZE),
.TAG_WIDTH (TAG_WIDTH)
) core_bus_if[NUM_INPUTS * NUM_REQS]();
// Memory-side bus: a single line-wide interface
VX_mem_bus_if #(
.DATA_SIZE (LINE_SIZE),
.TAG_WIDTH (MEM_TAG_X_WIDTH)
) mem_bus_if();
// Core request
// Port element [i][r] maps to flat interface index i * NUM_REQS + r;
// req_ready is the only signal flowing back from the interface to the port.
for (genvar i = 0; i < NUM_INPUTS; ++i) begin
for (genvar r = 0; r < NUM_REQS; ++r) begin
assign core_bus_if[i * NUM_REQS + r].req_valid = core_req_valid[i][r];
assign core_bus_if[i * NUM_REQS + r].req_data.rw = core_req_rw[i][r];
assign core_bus_if[i * NUM_REQS + r].req_data.byteen = core_req_byteen[i][r];
assign core_bus_if[i * NUM_REQS + r].req_data.addr = core_req_addr[i][r];
assign core_bus_if[i * NUM_REQS + r].req_data.data = core_req_data[i][r];
assign core_bus_if[i * NUM_REQS + r].req_data.tag = core_req_tag[i][r];
assign core_req_ready[i][r] = core_bus_if[i * NUM_REQS + r].req_ready;
end
end
// Core response
// Same [i][r] -> i * NUM_REQS + r mapping; rsp_ready is the only signal
// driven from the port into the interface.
for (genvar i = 0; i < NUM_INPUTS; ++i) begin
for (genvar r = 0; r < NUM_REQS; ++r) begin
assign core_rsp_valid[i][r] = core_bus_if[i * NUM_REQS + r].rsp_valid;
assign core_rsp_data[i][r] = core_bus_if[i * NUM_REQS + r].rsp_data.data;
assign core_rsp_tag[i][r] = core_bus_if[i * NUM_REQS + r].rsp_data.tag;
assign core_bus_if[i * NUM_REQS + r].rsp_ready = core_rsp_ready[i][r];
end
end
// Memory request
// Interface fields are copied out to the flat ports; mem_req_ready is
// copied in.
assign mem_req_valid = mem_bus_if.req_valid;
assign mem_req_rw = mem_bus_if.req_data.rw;
assign mem_req_byteen = mem_bus_if.req_data.byteen;
assign mem_req_addr = mem_bus_if.req_data.addr;
assign mem_req_data = mem_bus_if.req_data.data;
assign mem_req_tag = mem_bus_if.req_data.tag;
assign mem_bus_if.req_ready = mem_req_ready;
// Memory response
// Flat ports are copied into the interface; rsp_ready is copied out.
assign mem_bus_if.rsp_valid = mem_rsp_valid;
assign mem_bus_if.rsp_data.data = mem_rsp_data;
assign mem_bus_if.rsp_data.tag = mem_rsp_tag;
assign mem_rsp_ready = mem_bus_if.rsp_ready;
// Wrapped cache cluster: receives the user-facing parameters by name and
// the two interface bundles built above.
VX_cache_cluster #(
.INSTANCE_ID (INSTANCE_ID),
.NUM_UNITS (NUM_UNITS),
.NUM_INPUTS (NUM_INPUTS),
.TAG_SEL_IDX (TAG_SEL_IDX),
.NUM_REQS (NUM_REQS),
.CACHE_SIZE (CACHE_SIZE),
.LINE_SIZE (LINE_SIZE),
.NUM_BANKS (NUM_BANKS),
.NUM_WAYS (NUM_WAYS),
.WORD_SIZE (WORD_SIZE),
.CRSQ_SIZE (CRSQ_SIZE),
.MSHR_SIZE (MSHR_SIZE),
.MRSQ_SIZE (MRSQ_SIZE),
.MREQ_SIZE (MREQ_SIZE),
.WRITE_ENABLE (WRITE_ENABLE),
.UUID_WIDTH (UUID_WIDTH),
.TAG_WIDTH (TAG_WIDTH),
.NC_ENABLE (NC_ENABLE),
.CORE_OUT_REG (CORE_OUT_REG),
.MEM_OUT_REG (MEM_OUT_REG)
) cache (
// perf output is connected only when PERF_ENABLE is defined
`ifdef PERF_ENABLE
.cache_perf (cache_perf),
`endif
.clk (clk),
.reset (reset),
.core_bus_if (core_bus_if),
.mem_bus_if (mem_bus_if)
);
endmodule

View File

@@ -93,13 +93,7 @@ module VX_cache_data #(
assign wren = fill; assign wren = fill;
end end
wire [`CLOG2(NUM_WAYS)-1:0] way_idx; wire [`LOG2UP(NUM_WAYS)-1:0] way_idx;
// generate if (NUM_WAYS == 1) begin
// wire [0:0] way_idx;
// end else begin
// wire [`CLOG2(NUM_WAYS)-1:0] way_idx;
// end
// endgenerate
VX_onehot_encoder #( VX_onehot_encoder #(
.N (NUM_WAYS) .N (NUM_WAYS)

View File

@@ -63,4 +63,16 @@
`define CS_LINE_TO_FULL_ADDR(x, i) {x, (`XLEN-$bits(x))'(i << (`XLEN-$bits(x)-`CS_BANK_SEL_BITS))} `define CS_LINE_TO_FULL_ADDR(x, i) {x, (`XLEN-$bits(x))'(i << (`XLEN-$bits(x)-`CS_BANK_SEL_BITS))}
`define CS_MEM_TO_FULL_ADDR(x) {x, (`XLEN-$bits(x))'(0)} `define CS_MEM_TO_FULL_ADDR(x) {x, (`XLEN-$bits(x))'(0)}
///////////////////////////////////////////////////////////////////////////////
// PERF_CACHE_ADD(dst, src, dcount, scount)
// Expands to one `PERF_COUNTER_ADD invocation per cache counter field:
// reads, writes, read_misses, write_misses, bank_stalls, mshr_stalls,
// mem_stalls and crsp_stalls, each `PERF_CTR_BITS wide.
//   dst, src       - destination and source operands, passed through unchanged
//   dcount, scount - destination and source element counts, passed through unchanged
// The last argument of every invocation is true when ceil(scount / dcount) > 1,
// i.e. when more than one source element maps to a destination element.
// NOTE(review): that flag presumably enables extra buffering inside
// `PERF_COUNTER_ADD - confirm against the macro's definition.
`define PERF_CACHE_ADD(dst, src, dcount, scount) \
`PERF_COUNTER_ADD (dst, src, reads, `PERF_CTR_BITS, dcount, scount, (((scount + dcount - 1) / dcount) > 1)) \
`PERF_COUNTER_ADD (dst, src, writes, `PERF_CTR_BITS, dcount, scount, (((scount + dcount - 1) / dcount) > 1)) \
`PERF_COUNTER_ADD (dst, src, read_misses, `PERF_CTR_BITS, dcount, scount, (((scount + dcount - 1) / dcount) > 1)) \
`PERF_COUNTER_ADD (dst, src, write_misses, `PERF_CTR_BITS, dcount, scount, (((scount + dcount - 1) / dcount) > 1)) \
`PERF_COUNTER_ADD (dst, src, bank_stalls, `PERF_CTR_BITS, dcount, scount, (((scount + dcount - 1) / dcount) > 1)) \
`PERF_COUNTER_ADD (dst, src, mshr_stalls, `PERF_CTR_BITS, dcount, scount, (((scount + dcount - 1) / dcount) > 1)) \
`PERF_COUNTER_ADD (dst, src, mem_stalls, `PERF_CTR_BITS, dcount, scount, (((scount + dcount - 1) / dcount) > 1)) \
`PERF_COUNTER_ADD (dst, src, crsp_stalls, `PERF_CTR_BITS, dcount, scount, (((scount + dcount - 1) / dcount) > 1))
`endif // VX_CACHE_DEFINE_VH `endif // VX_CACHE_DEFINE_VH

View File

@@ -13,7 +13,7 @@
`include "VX_cache_define.vh" `include "VX_cache_define.vh"
module VX_cache_top #( module VX_cache_top import VX_gpu_pkg::*; #(
parameter `STRING INSTANCE_ID = "", parameter `STRING INSTANCE_ID = "",
// Number of Word requests per cycle // Number of Word requests per cycle
@@ -22,7 +22,7 @@ module VX_cache_top #(
// Size of cache in bytes // Size of cache in bytes
parameter CACHE_SIZE = 16384, parameter CACHE_SIZE = 16384,
// Size of line inside a bank in bytes // Size of line inside a bank in bytes
parameter LINE_SIZE = 16, parameter LINE_SIZE = 64,
// Number of banks // Number of banks
parameter NUM_BANKS = 4, parameter NUM_BANKS = 4,
// Number of associative ways // Number of associative ways

View File

@@ -92,24 +92,24 @@ module VX_commit import VX_gpu_pkg::*; #(
`UNUSED_PIN (sel_out) `UNUSED_PIN (sel_out)
); );
assign commit_fire[i] = commit_if[i].valid && commit_if[i].ready; assign commit_fire[i] = commit_if[i].valid && commit_if[i].ready;
assign commit_tmask[i] = {`NUM_THREADS{commit_fire[i]}} & commit_if[i].data.tmask; assign commit_tmask[i]= {`NUM_THREADS{commit_fire[i]}} & commit_if[i].data.tmask;
assign commit_wid[i] = commit_if[i].data.wid; assign commit_wid[i] = commit_if[i].data.wid;
assign commit_eop[i] = commit_if[i].data.eop; assign commit_eop[i] = commit_if[i].data.eop;
end end
// CSRs update // CSRs update
wire [`ISSUE_WIDTH-1:0][COMMIT_SIZEW-1:0] commit_size, commit_size_r; wire [`ISSUE_WIDTH-1:0][COMMIT_SIZEW-1:0] commit_size, commit_size_r;
wire [COMMIT_ALL_SIZEW-1:0] commit_size_all, commit_size_all_r; wire [COMMIT_ALL_SIZEW-1:0] commit_size_all_r, commit_size_all_rr;
wire commit_fire_any, commit_fire_any_r, commit_fire_any_rr; wire commit_fire_any, commit_fire_any_r, commit_fire_any_rr;
assign commit_fire_any = (| commit_fire); assign commit_fire_any = (| commit_fire);
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
wire [COMMIT_SIZEW-1:0] pop_count; wire [COMMIT_SIZEW-1:0] count;
`POP_COUNT(pop_count, commit_tmask[i]); `POP_COUNT(count, commit_tmask[i]);
assign commit_size[i] = pop_count; assign commit_size[i] = count;
end end
VX_pipe_register #( VX_pipe_register #(
@@ -130,7 +130,7 @@ module VX_commit import VX_gpu_pkg::*; #(
.OP ("+") .OP ("+")
) commit_size_reduce ( ) commit_size_reduce (
.data_in (commit_size_r), .data_in (commit_size_r),
.data_out (commit_size_all) .data_out (commit_size_all_r)
); );
VX_pipe_register #( VX_pipe_register #(
@@ -140,26 +140,26 @@ module VX_commit import VX_gpu_pkg::*; #(
.clk (clk), .clk (clk),
.reset (reset), .reset (reset),
.enable (1'b1), .enable (1'b1),
.data_in ({commit_fire_any_r, commit_size_all}), .data_in ({commit_fire_any_r, commit_size_all_r}),
.data_out ({commit_fire_any_rr, commit_size_all_r}) .data_out ({commit_fire_any_rr, commit_size_all_rr})
); );
reg [`PERF_CTR_BITS-1:0] instret; reg [`PERF_CTR_BITS-1:0] instret;
always @(posedge clk) begin always @(posedge clk) begin
if (reset) begin if (reset) begin
instret <= '0; instret <= '0;
end else begin end else begin
if (commit_fire_any_rr) begin if (commit_fire_any_rr) begin
instret <= instret + `PERF_CTR_BITS'(commit_size_all_r); instret <= instret + `PERF_CTR_BITS'(commit_size_all_rr);
end end
end end
end end
assign commit_csr_if.instret = instret; assign commit_csr_if.instret = instret;
// Committed instructions // Committed instructions
wire [`ISSUE_WIDTH-1:0] committed = commit_fire & commit_eop;
VX_pipe_register #( VX_pipe_register #(
.DATAW (`ISSUE_WIDTH * (1 + `NW_WIDTH)), .DATAW (`ISSUE_WIDTH * (1 + `NW_WIDTH)),
.RESETW (`ISSUE_WIDTH) .RESETW (`ISSUE_WIDTH)
@@ -167,23 +167,23 @@ module VX_commit import VX_gpu_pkg::*; #(
.clk (clk), .clk (clk),
.reset (reset), .reset (reset),
.enable (1'b1), .enable (1'b1),
.data_in ({(commit_fire & commit_eop), commit_wid}), .data_in ({committed, commit_wid}),
.data_out ({commit_sched_if.committed, commit_sched_if.committed_wid}) .data_out ({commit_sched_if.committed, commit_sched_if.committed_wid})
); );
// Writeback // Writeback
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
assign writeback_if[i].valid = commit_if[i].valid && commit_if[i].data.wb; assign writeback_if[i].valid = commit_if[i].valid && commit_if[i].data.wb;
assign writeback_if[i].data.uuid = commit_if[i].data.uuid; assign writeback_if[i].data.uuid = commit_if[i].data.uuid;
assign writeback_if[i].data.wis = wid_to_wis(commit_if[i].data.wid); assign writeback_if[i].data.wis = wid_to_wis(commit_if[i].data.wid);
assign writeback_if[i].data.PC = commit_if[i].data.PC; assign writeback_if[i].data.PC = commit_if[i].data.PC;
assign writeback_if[i].data.tmask = commit_if[i].data.tmask; assign writeback_if[i].data.tmask= commit_if[i].data.tmask;
assign writeback_if[i].data.rd = commit_if[i].data.rd; assign writeback_if[i].data.rd = commit_if[i].data.rd;
assign writeback_if[i].data.data = commit_if[i].data.data; assign writeback_if[i].data.data = commit_if[i].data.data;
assign writeback_if[i].data.sop = commit_if[i].data.sop; assign writeback_if[i].data.sop = commit_if[i].data.sop;
assign writeback_if[i].data.eop = commit_if[i].data.eop; assign writeback_if[i].data.eop = commit_if[i].data.eop;
assign commit_if[i].ready = 1'b1; assign commit_if[i].ready = 1'b1; // writeback has no backpressure
end end
// simulation helper signal to get RISC-V tests Pass/Fail status // simulation helper signal to get RISC-V tests Pass/Fail status

View File

@@ -79,20 +79,14 @@ module VX_core import VX_gpu_pkg::*; #(
) dcache_bus_tmp_if[DCACHE_NUM_REQS](); ) dcache_bus_tmp_if[DCACHE_NUM_REQS]();
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
VX_pipeline_perf_if pipeline_perf_if();
VX_mem_perf_if mem_perf_tmp_if(); VX_mem_perf_if mem_perf_tmp_if();
VX_pipeline_perf_if pipeline_perf_if();
assign mem_perf_tmp_if.icache = mem_perf_if.icache; assign mem_perf_tmp_if.icache = mem_perf_if.icache;
assign mem_perf_tmp_if.dcache = mem_perf_if.dcache; assign mem_perf_tmp_if.dcache = mem_perf_if.dcache;
assign mem_perf_tmp_if.l2cache = mem_perf_if.l2cache; assign mem_perf_tmp_if.l2cache = mem_perf_if.l2cache;
assign mem_perf_tmp_if.l3cache = mem_perf_if.l3cache; assign mem_perf_tmp_if.l3cache = mem_perf_if.l3cache;
`ifdef SM_ENABLE assign mem_perf_tmp_if.mem = mem_perf_if.mem;
cache_perf_t smem_perf;
assign mem_perf_tmp_if.smem = smem_perf;
`else
assign mem_perf_tmp_if.smem = '0;
`endif
assign mem_perf_tmp_if.mem = mem_perf_if.mem;
`endif `endif
`RESET_RELAY (dcr_data_reset, reset); `RESET_RELAY (dcr_data_reset, reset);
@@ -120,6 +114,10 @@ module VX_core import VX_gpu_pkg::*; #(
.clk (clk), .clk (clk),
.reset (schedule_reset), .reset (schedule_reset),
`ifdef PERF_ENABLE
.perf_schedule_if (pipeline_perf_if.schedule),
`endif
.base_dcrs (base_dcrs), .base_dcrs (base_dcrs),
.warp_ctl_if (warp_ctl_if), .warp_ctl_if (warp_ctl_if),
@@ -248,7 +246,7 @@ module VX_core import VX_gpu_pkg::*; #(
.clk (clk), .clk (clk),
.reset (reset), .reset (reset),
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
.cache_perf (smem_perf), .cache_perf (mem_perf_tmp_if.smem),
`endif `endif
.dcache_bus_in_if (dcache_bus_tmp_if), .dcache_bus_in_if (dcache_bus_tmp_if),
.dcache_bus_out_if (dcache_bus_if), .dcache_bus_out_if (dcache_bus_if),
@@ -267,32 +265,36 @@ module VX_core import VX_gpu_pkg::*; #(
wire [`CLOG2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_rd_req_per_cycle; wire [`CLOG2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_rd_req_per_cycle;
wire [`CLOG2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_wr_req_per_cycle; wire [`CLOG2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_wr_req_per_cycle;
wire [`CLOG2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_rsp_per_cycle; wire [`CLOG2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_rsp_per_cycle;
wire perf_icache_pending_read_cycle; wire [1:0] perf_icache_pending_read_cycle;
wire [`CLOG2(DCACHE_NUM_REQS+1)+1-1:0] perf_dcache_pending_read_cycle; wire [`CLOG2(DCACHE_NUM_REQS+1)+1-1:0] perf_dcache_pending_read_cycle;
reg [`PERF_CTR_BITS-1:0] perf_icache_pending_reads; reg [`PERF_CTR_BITS-1:0] perf_icache_pending_reads;
reg [`PERF_CTR_BITS-1:0] perf_dcache_pending_reads; reg [`PERF_CTR_BITS-1:0] perf_dcache_pending_reads;
reg [`PERF_CTR_BITS-1:0] perf_ifetches; reg [`PERF_CTR_BITS-1:0] perf_ifetches;
reg [`PERF_CTR_BITS-1:0] perf_loads; reg [`PERF_CTR_BITS-1:0] perf_loads;
reg [`PERF_CTR_BITS-1:0] perf_stores; reg [`PERF_CTR_BITS-1:0] perf_stores;
wire perf_icache_req_fire = icache_bus_if.req_valid & icache_bus_if.req_ready; wire perf_icache_req_fire = icache_bus_if.req_valid && icache_bus_if.req_ready;
wire perf_icache_rsp_fire = icache_bus_if.rsp_valid & icache_bus_if.rsp_ready; wire perf_icache_rsp_fire = icache_bus_if.rsp_valid && icache_bus_if.rsp_ready;
wire [DCACHE_NUM_REQS-1:0] perf_dcache_rd_req_fire, perf_dcache_wr_req_fire, perf_dcache_rsp_fire; wire [DCACHE_NUM_REQS-1:0] perf_dcache_rd_req_fire, perf_dcache_rd_req_fire_r;
wire [DCACHE_NUM_REQS-1:0] perf_dcache_wr_req_fire, perf_dcache_wr_req_fire_r;
wire [DCACHE_NUM_REQS-1:0] perf_dcache_rsp_fire;
for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin
assign perf_dcache_rd_req_fire[i] = dcache_bus_if[i].req_valid && ~dcache_bus_if[i].req_data.rw && dcache_bus_if[i].req_ready; assign perf_dcache_rd_req_fire[i] = dcache_bus_if[i].req_valid && dcache_bus_if[i].req_ready && ~dcache_bus_if[i].req_data.rw;
assign perf_dcache_wr_req_fire[i] = dcache_bus_if[i].req_valid && dcache_bus_if[i].req_data.rw && dcache_bus_if[i].req_ready; assign perf_dcache_wr_req_fire[i] = dcache_bus_if[i].req_valid && dcache_bus_if[i].req_ready && dcache_bus_if[i].req_data.rw;
assign perf_dcache_rsp_fire[i] = dcache_bus_if[i].rsp_valid && dcache_bus_if[i].rsp_ready; assign perf_dcache_rsp_fire[i] = dcache_bus_if[i].rsp_valid && dcache_bus_if[i].rsp_ready;
end end
`POP_COUNT(perf_dcache_rd_req_per_cycle, perf_dcache_rd_req_fire); `BUFFER(perf_dcache_rd_req_fire_r, perf_dcache_rd_req_fire);
`POP_COUNT(perf_dcache_wr_req_per_cycle, perf_dcache_wr_req_fire); `BUFFER(perf_dcache_wr_req_fire_r, perf_dcache_wr_req_fire);
`POP_COUNT(perf_dcache_rd_req_per_cycle, perf_dcache_rd_req_fire_r);
`POP_COUNT(perf_dcache_wr_req_per_cycle, perf_dcache_wr_req_fire_r);
`POP_COUNT(perf_dcache_rsp_per_cycle, perf_dcache_rsp_fire); `POP_COUNT(perf_dcache_rsp_per_cycle, perf_dcache_rsp_fire);
assign perf_icache_pending_read_cycle = perf_icache_req_fire - perf_icache_rsp_fire; assign perf_icache_pending_read_cycle = perf_icache_req_fire - perf_icache_rsp_fire;

View File

@@ -130,6 +130,12 @@ module VX_core_top import VX_gpu_pkg::*; #(
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
VX_mem_perf_if mem_perf_if(); VX_mem_perf_if mem_perf_if();
assign mem_perf_if.icache = '0;
assign mem_perf_if.dcache = '0;
assign mem_perf_if.l2cache = '0;
assign mem_perf_if.l3cache = '0;
assign mem_perf_if.smem = '0;
assign mem_perf_if.mem = '0;
`endif `endif
`ifdef SCOPE `ifdef SCOPE

View File

@@ -35,7 +35,6 @@ import VX_fpu_pkg::*;
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
VX_mem_perf_if.slave mem_perf_if, VX_mem_perf_if.slave mem_perf_if,
VX_pipeline_perf_if.slave pipeline_perf_if, VX_pipeline_perf_if.slave pipeline_perf_if,
VX_sfu_perf_if.slave sfu_perf_if,
`endif `endif
VX_commit_csr_if.slave commit_csr_if, VX_commit_csr_if.slave commit_csr_if,
@@ -183,105 +182,115 @@ import VX_fpu_pkg::*;
default: begin default: begin
read_addr_valid_r = 0; read_addr_valid_r = 0;
if ((read_addr >= `VX_CSR_MPM_BASE && read_addr < (`VX_CSR_MPM_BASE + 32)) if ((read_addr >= `VX_CSR_MPM_USER && read_addr < (`VX_CSR_MPM_USER + 32))
|| (read_addr >= `VX_CSR_MPM_BASE_H && read_addr < (`VX_CSR_MPM_BASE_H + 32))) begin || (read_addr >= `VX_CSR_MPM_USER_H && read_addr < (`VX_CSR_MPM_USER_H + 32))) begin
read_addr_valid_r = 1; read_addr_valid_r = 1;
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
case (base_dcrs.mpm_class) case (base_dcrs.mpm_class)
`VX_DCR_MPM_CLASS_CORE: begin `VX_DCR_MPM_CLASS_CORE: begin
case (read_addr) case (read_addr)
// PERF: pipeline // PERF: pipeline
`VX_CSR_MPM_IBUF_ST : read_data_ro_r = pipeline_perf_if.ibf_stalls[31:0]; `VX_CSR_MPM_SCHED_ID : read_data_ro_r = pipeline_perf_if.sched_idles[31:0];
`VX_CSR_MPM_IBUF_ST_H : read_data_ro_r = 32'(pipeline_perf_if.ibf_stalls[`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_SCHED_ID_H : read_data_ro_r = 32'(pipeline_perf_if.sched_idles[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_SCRB_ST : read_data_ro_r = pipeline_perf_if.scb_stalls[31:0]; `VX_CSR_MPM_SCHED_ST : read_data_ro_r = pipeline_perf_if.sched_stalls[31:0];
`VX_CSR_MPM_SCRB_ST_H : read_data_ro_r = 32'(pipeline_perf_if.scb_stalls[`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_SCHED_ST_H : read_data_ro_r = 32'(pipeline_perf_if.sched_stalls[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_ALU_ST : read_data_ro_r = pipeline_perf_if.dsp_stalls[`EX_ALU][31:0]; `VX_CSR_MPM_IBUF_ST : read_data_ro_r = pipeline_perf_if.ibf_stalls[31:0];
`VX_CSR_MPM_ALU_ST_H : read_data_ro_r = 32'(pipeline_perf_if.dsp_stalls[`EX_ALU][`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_IBUF_ST_H : read_data_ro_r = 32'(pipeline_perf_if.ibf_stalls[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_LSU_ST : read_data_ro_r = pipeline_perf_if.dsp_stalls[`EX_LSU][31:0]; `VX_CSR_MPM_SCRB_ST : read_data_ro_r = pipeline_perf_if.scb_stalls[31:0];
`VX_CSR_MPM_LSU_ST_H : read_data_ro_r = 32'(pipeline_perf_if.dsp_stalls[`EX_LSU][`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_SCRB_ST_H : read_data_ro_r = 32'(pipeline_perf_if.scb_stalls[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_SCRB_ALU : read_data_ro_r = pipeline_perf_if.units_uses[`EX_ALU][31:0];
`VX_CSR_MPM_SCRB_ALU_H : read_data_ro_r = 32'(pipeline_perf_if.units_uses[`EX_ALU][`PERF_CTR_BITS-1:32]);
`ifdef EXT_F_ENABLE `ifdef EXT_F_ENABLE
`VX_CSR_MPM_FPU_ST : read_data_ro_r = pipeline_perf_if.dsp_stalls[`EX_FPU][31:0]; `VX_CSR_MPM_SCRB_FPU : read_data_ro_r = pipeline_perf_if.units_uses[`EX_FPU][31:0];
`VX_CSR_MPM_FPU_ST_H : read_data_ro_r = 32'(pipeline_perf_if.dsp_stalls[`EX_FPU][`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_SCRB_FPU_H : read_data_ro_r = 32'(pipeline_perf_if.units_uses[`EX_FPU][`PERF_CTR_BITS-1:32]);
`else `else
`VX_CSR_MPM_FPU_ST : read_data_ro_r = '0; `VX_CSR_MPM_SCRB_FPU : read_data_ro_r = '0;
`VX_CSR_MPM_FPU_ST_H : read_data_ro_r = '0; `VX_CSR_MPM_SCRB_FPU_H : read_data_ro_r = '0;
`endif `endif
`VX_CSR_MPM_SFU_ST : read_data_ro_r = pipeline_perf_if.dsp_stalls[`EX_SFU][31:0]; `VX_CSR_MPM_SCRB_LSU : read_data_ro_r = pipeline_perf_if.units_uses[`EX_LSU][31:0];
`VX_CSR_MPM_SFU_ST_H : read_data_ro_r = 32'(pipeline_perf_if.dsp_stalls[`EX_SFU][`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_SCRB_LSU_H : read_data_ro_r = 32'(pipeline_perf_if.units_uses[`EX_LSU][`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_SCRB_SFU : read_data_ro_r = pipeline_perf_if.units_uses[`EX_SFU][31:0];
`VX_CSR_MPM_SCRB_SFU_H : read_data_ro_r = 32'(pipeline_perf_if.units_uses[`EX_SFU][`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_SCRB_CSRS : read_data_ro_r = pipeline_perf_if.sfu_uses[`SFU_CSRS][31:0];
`VX_CSR_MPM_SCRB_CSRS_H : read_data_ro_r = 32'(pipeline_perf_if.sfu_uses[`SFU_CSRS][`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_SCRB_WCTL : read_data_ro_r = pipeline_perf_if.sfu_uses[`SFU_WCTL][31:0];
`VX_CSR_MPM_SCRB_WCTL_H : read_data_ro_r = 32'(pipeline_perf_if.sfu_uses[`SFU_WCTL][`PERF_CTR_BITS-1:32]);
// PERF: memory // PERF: memory
`VX_CSR_MPM_IFETCHES : read_data_ro_r = pipeline_perf_if.ifetches[31:0]; `VX_CSR_MPM_IFETCHES : read_data_ro_r = pipeline_perf_if.ifetches[31:0];
`VX_CSR_MPM_IFETCHES_H : read_data_ro_r = 32'(pipeline_perf_if.ifetches[`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_IFETCHES_H : read_data_ro_r = 32'(pipeline_perf_if.ifetches[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_LOADS : read_data_ro_r = pipeline_perf_if.loads[31:0]; `VX_CSR_MPM_LOADS : read_data_ro_r = pipeline_perf_if.loads[31:0];
`VX_CSR_MPM_LOADS_H : read_data_ro_r = 32'(pipeline_perf_if.loads[`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_LOADS_H : read_data_ro_r = 32'(pipeline_perf_if.loads[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_STORES : read_data_ro_r = pipeline_perf_if.stores[31:0]; `VX_CSR_MPM_STORES : read_data_ro_r = pipeline_perf_if.stores[31:0];
`VX_CSR_MPM_STORES_H : read_data_ro_r = 32'(pipeline_perf_if.stores[`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_STORES_H : read_data_ro_r = 32'(pipeline_perf_if.stores[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_IFETCH_LAT : read_data_ro_r = pipeline_perf_if.ifetch_latency[31:0]; `VX_CSR_MPM_IFETCH_LT : read_data_ro_r = pipeline_perf_if.ifetch_latency[31:0];
`VX_CSR_MPM_IFETCH_LAT_H : read_data_ro_r = 32'(pipeline_perf_if.ifetch_latency[`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_IFETCH_LT_H : read_data_ro_r = 32'(pipeline_perf_if.ifetch_latency[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_LOAD_LAT : read_data_ro_r = pipeline_perf_if.load_latency[31:0]; `VX_CSR_MPM_LOAD_LT : read_data_ro_r = pipeline_perf_if.load_latency[31:0];
`VX_CSR_MPM_LOAD_LAT_H : read_data_ro_r = 32'(pipeline_perf_if.load_latency[`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_LOAD_LT_H : read_data_ro_r = 32'(pipeline_perf_if.load_latency[`PERF_CTR_BITS-1:32]);
default:; default:;
endcase endcase
end end
`VX_DCR_MPM_CLASS_MEM: begin `VX_DCR_MPM_CLASS_MEM: begin
case (read_addr) case (read_addr)
// PERF: icache // PERF: icache
`VX_CSR_MPM_ICACHE_READS : read_data_ro_r = mem_perf_if.icache.reads[31:0]; `VX_CSR_MPM_ICACHE_READS : read_data_ro_r = mem_perf_if.icache.reads[31:0];
`VX_CSR_MPM_ICACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.icache.reads[`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_ICACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.icache.reads[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_ICACHE_MISS_R : read_data_ro_r = mem_perf_if.icache.read_misses[31:0]; `VX_CSR_MPM_ICACHE_MISS_R : read_data_ro_r = mem_perf_if.icache.read_misses[31:0];
`VX_CSR_MPM_ICACHE_MISS_R_H : read_data_ro_r = 32'(mem_perf_if.icache.read_misses[`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_ICACHE_MISS_R_H : read_data_ro_r = 32'(mem_perf_if.icache.read_misses[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_ICACHE_MSHR_ST : read_data_ro_r = mem_perf_if.icache.mshr_stalls[31:0];
`VX_CSR_MPM_ICACHE_MSHR_ST_H: read_data_ro_r = 32'(mem_perf_if.icache.mshr_stalls[`PERF_CTR_BITS-1:32]);
// PERF: dcache // PERF: dcache
`VX_CSR_MPM_DCACHE_READS : read_data_ro_r = mem_perf_if.dcache.reads[31:0]; `VX_CSR_MPM_DCACHE_READS : read_data_ro_r = mem_perf_if.dcache.reads[31:0];
`VX_CSR_MPM_DCACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.dcache.reads[`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_DCACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.dcache.reads[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_DCACHE_WRITES : read_data_ro_r = mem_perf_if.dcache.writes[31:0]; `VX_CSR_MPM_DCACHE_WRITES : read_data_ro_r = mem_perf_if.dcache.writes[31:0];
`VX_CSR_MPM_DCACHE_WRITES_H : read_data_ro_r = 32'(mem_perf_if.dcache.writes[`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_DCACHE_WRITES_H : read_data_ro_r = 32'(mem_perf_if.dcache.writes[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_DCACHE_MISS_R : read_data_ro_r = mem_perf_if.dcache.read_misses[31:0]; `VX_CSR_MPM_DCACHE_MISS_R : read_data_ro_r = mem_perf_if.dcache.read_misses[31:0];
`VX_CSR_MPM_DCACHE_MISS_R_H : read_data_ro_r = 32'(mem_perf_if.dcache.read_misses[`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_DCACHE_MISS_R_H : read_data_ro_r = 32'(mem_perf_if.dcache.read_misses[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_DCACHE_MISS_W : read_data_ro_r = mem_perf_if.dcache.write_misses[31:0]; `VX_CSR_MPM_DCACHE_MISS_W : read_data_ro_r = mem_perf_if.dcache.write_misses[31:0];
`VX_CSR_MPM_DCACHE_MISS_W_H : read_data_ro_r = 32'(mem_perf_if.dcache.write_misses[`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_DCACHE_MISS_W_H : read_data_ro_r = 32'(mem_perf_if.dcache.write_misses[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_DCACHE_BANK_ST : read_data_ro_r = mem_perf_if.dcache.bank_stalls[31:0]; `VX_CSR_MPM_DCACHE_BANK_ST : read_data_ro_r = mem_perf_if.dcache.bank_stalls[31:0];
`VX_CSR_MPM_DCACHE_BANK_ST_H : read_data_ro_r = 32'(mem_perf_if.dcache.bank_stalls[`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_DCACHE_BANK_ST_H: read_data_ro_r = 32'(mem_perf_if.dcache.bank_stalls[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_DCACHE_MSHR_ST : read_data_ro_r = mem_perf_if.dcache.mshr_stalls[31:0]; `VX_CSR_MPM_DCACHE_MSHR_ST : read_data_ro_r = mem_perf_if.dcache.mshr_stalls[31:0];
`VX_CSR_MPM_DCACHE_MSHR_ST_H : read_data_ro_r = 32'(mem_perf_if.dcache.mshr_stalls[`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_DCACHE_MSHR_ST_H: read_data_ro_r = 32'(mem_perf_if.dcache.mshr_stalls[`PERF_CTR_BITS-1:32]);
// PERF: smem // PERF: smem
`VX_CSR_MPM_SMEM_READS : read_data_ro_r = mem_perf_if.smem.reads[31:0]; `VX_CSR_MPM_SMEM_READS : read_data_ro_r = mem_perf_if.smem.reads[31:0];
`VX_CSR_MPM_SMEM_READS_H : read_data_ro_r = 32'(mem_perf_if.smem.reads[`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_SMEM_READS_H : read_data_ro_r = 32'(mem_perf_if.smem.reads[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_SMEM_WRITES : read_data_ro_r = mem_perf_if.smem.writes[31:0]; `VX_CSR_MPM_SMEM_WRITES : read_data_ro_r = mem_perf_if.smem.writes[31:0];
`VX_CSR_MPM_SMEM_WRITES_H : read_data_ro_r = 32'(mem_perf_if.smem.writes[`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_SMEM_WRITES_H : read_data_ro_r = 32'(mem_perf_if.smem.writes[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_SMEM_BANK_ST : read_data_ro_r = mem_perf_if.smem.bank_stalls[31:0]; `VX_CSR_MPM_SMEM_BANK_ST : read_data_ro_r = mem_perf_if.smem.bank_stalls[31:0];
`VX_CSR_MPM_SMEM_BANK_ST_H : read_data_ro_r = 32'(mem_perf_if.smem.bank_stalls[`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_SMEM_BANK_ST_H : read_data_ro_r = 32'(mem_perf_if.smem.bank_stalls[`PERF_CTR_BITS-1:32]);
// PERF: l2cache // PERF: l2cache
`VX_CSR_MPM_L2CACHE_READS : read_data_ro_r = mem_perf_if.l2cache.reads[31:0]; `VX_CSR_MPM_L2CACHE_READS : read_data_ro_r = mem_perf_if.l2cache.reads[31:0];
`VX_CSR_MPM_L2CACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.l2cache.reads[`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_L2CACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.l2cache.reads[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_L2CACHE_WRITES : read_data_ro_r = mem_perf_if.l2cache.writes[31:0]; `VX_CSR_MPM_L2CACHE_WRITES : read_data_ro_r = mem_perf_if.l2cache.writes[31:0];
`VX_CSR_MPM_L2CACHE_WRITES_H : read_data_ro_r = 32'(mem_perf_if.l2cache.writes[`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_L2CACHE_WRITES_H: read_data_ro_r = 32'(mem_perf_if.l2cache.writes[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_L2CACHE_MISS_R : read_data_ro_r = mem_perf_if.l2cache.read_misses[31:0]; `VX_CSR_MPM_L2CACHE_MISS_R : read_data_ro_r = mem_perf_if.l2cache.read_misses[31:0];
`VX_CSR_MPM_L2CACHE_MISS_R_H : read_data_ro_r = 32'(mem_perf_if.l2cache.read_misses[`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_L2CACHE_MISS_R_H: read_data_ro_r = 32'(mem_perf_if.l2cache.read_misses[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_L2CACHE_MISS_W : read_data_ro_r = mem_perf_if.l2cache.write_misses[31:0]; `VX_CSR_MPM_L2CACHE_MISS_W : read_data_ro_r = mem_perf_if.l2cache.write_misses[31:0];
`VX_CSR_MPM_L2CACHE_MISS_W_H : read_data_ro_r = 32'(mem_perf_if.l2cache.write_misses[`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_L2CACHE_MISS_W_H: read_data_ro_r = 32'(mem_perf_if.l2cache.write_misses[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_L2CACHE_BANK_ST : read_data_ro_r = mem_perf_if.l2cache.bank_stalls[31:0]; `VX_CSR_MPM_L2CACHE_BANK_ST : read_data_ro_r = mem_perf_if.l2cache.bank_stalls[31:0];
`VX_CSR_MPM_L2CACHE_BANK_ST_H : read_data_ro_r = 32'(mem_perf_if.l2cache.bank_stalls[`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_L2CACHE_BANK_ST_H: read_data_ro_r = 32'(mem_perf_if.l2cache.bank_stalls[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_L2CACHE_MSHR_ST : read_data_ro_r = mem_perf_if.l2cache.mshr_stalls[31:0]; `VX_CSR_MPM_L2CACHE_MSHR_ST : read_data_ro_r = mem_perf_if.l2cache.mshr_stalls[31:0];
`VX_CSR_MPM_L2CACHE_MSHR_ST_H : read_data_ro_r = 32'(mem_perf_if.l2cache.mshr_stalls[`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_L2CACHE_MSHR_ST_H: read_data_ro_r = 32'(mem_perf_if.l2cache.mshr_stalls[`PERF_CTR_BITS-1:32]);
// PERF: l3cache // PERF: l3cache
`VX_CSR_MPM_L3CACHE_READS : read_data_ro_r = mem_perf_if.l3cache.reads[31:0]; `VX_CSR_MPM_L3CACHE_READS : read_data_ro_r = mem_perf_if.l3cache.reads[31:0];
`VX_CSR_MPM_L3CACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.l3cache.reads[`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_L3CACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.l3cache.reads[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_L3CACHE_WRITES : read_data_ro_r = mem_perf_if.l3cache.writes[31:0]; `VX_CSR_MPM_L3CACHE_WRITES : read_data_ro_r = mem_perf_if.l3cache.writes[31:0];
`VX_CSR_MPM_L3CACHE_WRITES_H : read_data_ro_r = 32'(mem_perf_if.l3cache.writes[`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_L3CACHE_WRITES_H: read_data_ro_r = 32'(mem_perf_if.l3cache.writes[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_L3CACHE_MISS_R : read_data_ro_r = mem_perf_if.l3cache.read_misses[31:0]; `VX_CSR_MPM_L3CACHE_MISS_R : read_data_ro_r = mem_perf_if.l3cache.read_misses[31:0];
`VX_CSR_MPM_L3CACHE_MISS_R_H : read_data_ro_r = 32'(mem_perf_if.l3cache.read_misses[`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_L3CACHE_MISS_R_H: read_data_ro_r = 32'(mem_perf_if.l3cache.read_misses[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_L3CACHE_MISS_W : read_data_ro_r = mem_perf_if.l3cache.write_misses[31:0]; `VX_CSR_MPM_L3CACHE_MISS_W : read_data_ro_r = mem_perf_if.l3cache.write_misses[31:0];
`VX_CSR_MPM_L3CACHE_MISS_W_H : read_data_ro_r = 32'(mem_perf_if.l3cache.write_misses[`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_L3CACHE_MISS_W_H: read_data_ro_r = 32'(mem_perf_if.l3cache.write_misses[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_L3CACHE_BANK_ST : read_data_ro_r = mem_perf_if.l3cache.bank_stalls[31:0]; `VX_CSR_MPM_L3CACHE_BANK_ST : read_data_ro_r = mem_perf_if.l3cache.bank_stalls[31:0];
`VX_CSR_MPM_L3CACHE_BANK_ST_H : read_data_ro_r = 32'(mem_perf_if.l3cache.bank_stalls[`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_L3CACHE_BANK_ST_H: read_data_ro_r = 32'(mem_perf_if.l3cache.bank_stalls[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_L3CACHE_MSHR_ST : read_data_ro_r = mem_perf_if.l3cache.mshr_stalls[31:0]; `VX_CSR_MPM_L3CACHE_MSHR_ST : read_data_ro_r = mem_perf_if.l3cache.mshr_stalls[31:0];
`VX_CSR_MPM_L3CACHE_MSHR_ST_H : read_data_ro_r = 32'(mem_perf_if.l3cache.mshr_stalls[`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_L3CACHE_MSHR_ST_H: read_data_ro_r = 32'(mem_perf_if.l3cache.mshr_stalls[`PERF_CTR_BITS-1:32]);
// PERF: memory // PERF: memory
`VX_CSR_MPM_MEM_READS : read_data_ro_r = mem_perf_if.mem.reads[31:0]; `VX_CSR_MPM_MEM_READS : read_data_ro_r = mem_perf_if.mem.reads[31:0];
`VX_CSR_MPM_MEM_READS_H : read_data_ro_r = 32'(mem_perf_if.mem.reads[`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_MEM_READS_H : read_data_ro_r = 32'(mem_perf_if.mem.reads[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_MEM_WRITES : read_data_ro_r = mem_perf_if.mem.writes[31:0]; `VX_CSR_MPM_MEM_WRITES : read_data_ro_r = mem_perf_if.mem.writes[31:0];
`VX_CSR_MPM_MEM_WRITES_H : read_data_ro_r = 32'(mem_perf_if.mem.writes[`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_MEM_WRITES_H : read_data_ro_r = 32'(mem_perf_if.mem.writes[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_MEM_LAT : read_data_ro_r = mem_perf_if.mem.latency[31:0]; `VX_CSR_MPM_MEM_LT : read_data_ro_r = mem_perf_if.mem.latency[31:0];
`VX_CSR_MPM_MEM_LAT_H : read_data_ro_r = 32'(mem_perf_if.mem.latency[`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_MEM_LT_H : read_data_ro_r = 32'(mem_perf_if.mem.latency[`PERF_CTR_BITS-1:32]);
default:; default:;
endcase endcase
end end
@@ -301,8 +310,6 @@ import VX_fpu_pkg::*;
`RUNTIME_ASSERT(~read_enable || read_addr_valid_r, ("%t: *** invalid CSR read address: 0x%0h (#%0d)", $time, read_addr, read_uuid)) `RUNTIME_ASSERT(~read_enable || read_addr_valid_r, ("%t: *** invalid CSR read address: 0x%0h (#%0d)", $time, read_addr, read_uuid))
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
wire [`PERF_CTR_BITS-1:0] perf_wctl_stalls = sfu_perf_if.wctl_stalls;
`UNUSED_VAR (perf_wctl_stalls);
`UNUSED_VAR (mem_perf_if.icache); `UNUSED_VAR (mem_perf_if.icache);
`UNUSED_VAR (mem_perf_if.smem); `UNUSED_VAR (mem_perf_if.smem);
`endif `endif

View File

@@ -25,7 +25,6 @@ module VX_csr_unit import VX_gpu_pkg::*; #(
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
VX_mem_perf_if.slave mem_perf_if, VX_mem_perf_if.slave mem_perf_if,
VX_pipeline_perf_if.slave pipeline_perf_if, VX_pipeline_perf_if.slave pipeline_perf_if,
VX_sfu_perf_if.slave sfu_perf_if,
`endif `endif
`ifdef EXT_F_ENABLE `ifdef EXT_F_ENABLE
@@ -81,7 +80,6 @@ module VX_csr_unit import VX_gpu_pkg::*; #(
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
.mem_perf_if (mem_perf_if), .mem_perf_if (mem_perf_if),
.pipeline_perf_if(pipeline_perf_if), .pipeline_perf_if(pipeline_perf_if),
.sfu_perf_if (sfu_perf_if),
`endif `endif
.commit_csr_if (commit_csr_if), .commit_csr_if (commit_csr_if),

View File

@@ -12,6 +12,7 @@
// limitations under the License. // limitations under the License.
`include "VX_define.vh" `include "VX_define.vh"
`include "VX_trace.vh"
module VX_dispatch import VX_gpu_pkg::*; #( module VX_dispatch import VX_gpu_pkg::*; #(
parameter CORE_ID = 0 parameter CORE_ID = 0
@@ -175,29 +176,37 @@ module VX_dispatch import VX_gpu_pkg::*; #(
end end
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
reg [`NUM_EX_UNITS-1:0][`PERF_CTR_BITS-1:0] perf_stalls_n, perf_stalls_r; wire [`NUM_EX_UNITS-1:0] perf_unit_stalls_per_cycle, perf_unit_stalls_per_cycle_r;
wire [`ISSUE_WIDTH-1:0] operands_stall; reg [`ISSUE_WIDTH-1:0][`NUM_EX_UNITS-1:0] perf_issue_unit_stalls_per_cycle;
wire [`ISSUE_WIDTH-1:0][`EX_BITS-1:0] operands_ex_type; reg [`NUM_EX_UNITS-1:0][`PERF_CTR_BITS-1:0] perf_stalls_r;
for (genvar i=0; i < `ISSUE_WIDTH; ++i) begin for (genvar i=0; i < `ISSUE_WIDTH; ++i) begin
assign operands_stall[i] = operands_if[i].valid && ~operands_if[i].ready; always @(*) begin
assign operands_ex_type[i] = operands_if[i].data.ex_type; perf_issue_unit_stalls_per_cycle[i] = '0;
end if (operands_if[i].valid && ~operands_if[i].ready) begin
perf_issue_unit_stalls_per_cycle[i][operands_if[i].data.ex_type] = 1;
always @(*) begin
perf_stalls_n = perf_stalls_r;
for (integer i=0; i < `ISSUE_WIDTH; ++i) begin
if (operands_stall[i]) begin
perf_stalls_n[operands_ex_type[i]] += `PERF_CTR_BITS'(1);
end end
end end
end end
always @(posedge clk) begin VX_reduce #(
if (reset) begin .DATAW_IN (`NUM_EX_UNITS),
perf_stalls_r <= '0; .N (`ISSUE_WIDTH),
end else begin .OP ("|")
perf_stalls_r <= perf_stalls_n; ) reduce (
.data_in (perf_issue_unit_stalls_per_cycle),
.data_out (perf_unit_stalls_per_cycle)
);
`BUFFER(perf_unit_stalls_per_cycle_r, perf_unit_stalls_per_cycle);
for (genvar i = 0; i < `NUM_EX_UNITS; ++i) begin
always @(posedge clk) begin
if (reset) begin
perf_stalls_r[i] <= '0;
end else begin
perf_stalls_r[i] <= perf_stalls_r[i] + `PERF_CTR_BITS'(perf_unit_stalls_per_cycle_r[i]);
end
end end
end end

View File

@@ -70,8 +70,8 @@ module VX_dispatch_unit import VX_gpu_pkg::*; #(
always @(posedge clk) begin always @(posedge clk) begin
if (reset) begin if (reset) begin
batch_idx <= '0; batch_idx <= '0;
end else if (batch_done) begin end else begin
batch_idx <= batch_idx + BATCH_COUNT_W'(1); batch_idx <= batch_idx + BATCH_COUNT_W'(batch_done);
end end
end end
end else begin end else begin
@@ -203,20 +203,20 @@ module VX_dispatch_unit import VX_gpu_pkg::*; #(
assign block_done[block_idx] = ~valid_p || ready_p; assign block_done[block_idx] = ~valid_p || ready_p;
end end
wire [ISSUE_IDX_W-1:0] wsi; wire [ISSUE_ISW_W-1:0] isw;
if (BATCH_COUNT != 1) begin if (BATCH_COUNT != 1) begin
if (BLOCK_SIZE != 1) begin if (BLOCK_SIZE != 1) begin
assign wsi = {batch_idx, BLOCK_SIZE_W'(block_idx)}; assign isw = {batch_idx, BLOCK_SIZE_W'(block_idx)};
end else begin end else begin
assign wsi = batch_idx; assign isw = batch_idx;
end end
end else begin end else begin
assign wsi = block_idx; assign isw = block_idx;
end end
`RESET_RELAY(buf_out_reset, reset); `RESET_RELAY(buf_out_reset, reset);
wire [`NW_WIDTH-1:0] block_wid = wis_to_wid(dispatch_data[issue_idx][DATA_TMASK_OFF+`NUM_THREADS +: ISSUE_WIS_W], wsi); wire [`NW_WIDTH-1:0] block_wid = wis_to_wid(dispatch_data[issue_idx][DATA_TMASK_OFF+`NUM_THREADS +: ISSUE_WIS_W], isw);
VX_elastic_buffer #( VX_elastic_buffer #(
.DATAW (OUT_DATAW), .DATAW (OUT_DATAW),

View File

@@ -30,7 +30,7 @@ module VX_fpu_unit import VX_fpu_pkg::*; #(
localparam NUM_LANES = `NUM_FPU_LANES; localparam NUM_LANES = `NUM_FPU_LANES;
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES); localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
localparam PID_WIDTH = `UP(PID_BITS); localparam PID_WIDTH = `UP(PID_BITS);
localparam TAG_WIDTH = `LOG2UP(`FPU_REQ_QUEUE_SIZE); localparam TAG_WIDTH = `LOG2UP(`FPUQ_SIZE);
localparam PARTIAL_BW = (BLOCK_SIZE != `ISSUE_WIDTH) || (NUM_LANES != `NUM_THREADS); localparam PARTIAL_BW = (BLOCK_SIZE != `ISSUE_WIDTH) || (NUM_LANES != `NUM_THREADS);
VX_execute_if #( VX_execute_if #(
@@ -87,7 +87,7 @@ module VX_fpu_unit import VX_fpu_pkg::*; #(
VX_index_buffer #( VX_index_buffer #(
.DATAW (`UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS + PID_WIDTH + 1 + 1), .DATAW (`UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS + PID_WIDTH + 1 + 1),
.SIZE (`FPU_REQ_QUEUE_SIZE) .SIZE (`FPUQ_SIZE)
) tag_store ( ) tag_store (
.clk (clk), .clk (clk),
.reset (reset), .reset (reset),

View File

@@ -37,7 +37,7 @@ module VX_gather_unit import VX_gpu_pkg::*; #(
wire [BLOCK_SIZE-1:0] commit_in_valid; wire [BLOCK_SIZE-1:0] commit_in_valid;
wire [BLOCK_SIZE-1:0][DATAW-1:0] commit_in_data; wire [BLOCK_SIZE-1:0][DATAW-1:0] commit_in_data;
wire [BLOCK_SIZE-1:0] commit_in_ready; wire [BLOCK_SIZE-1:0] commit_in_ready;
wire [BLOCK_SIZE-1:0][ISSUE_IDX_W-1:0] commit_in_wsi; wire [BLOCK_SIZE-1:0][ISSUE_ISW_W-1:0] commit_in_isw;
for (genvar i = 0; i < BLOCK_SIZE; ++i) begin for (genvar i = 0; i < BLOCK_SIZE; ++i) begin
assign commit_in_valid[i] = commit_in_if[i].valid; assign commit_in_valid[i] = commit_in_if[i].valid;
@@ -45,12 +45,12 @@ module VX_gather_unit import VX_gpu_pkg::*; #(
assign commit_in_if[i].ready = commit_in_ready[i]; assign commit_in_if[i].ready = commit_in_ready[i];
if (BLOCK_SIZE != `ISSUE_WIDTH) begin if (BLOCK_SIZE != `ISSUE_WIDTH) begin
if (BLOCK_SIZE != 1) begin if (BLOCK_SIZE != 1) begin
assign commit_in_wsi[i] = {commit_in_data[i][DATA_WIS_OFF+BLOCK_SIZE_W +: (ISSUE_IDX_W-BLOCK_SIZE_W)], BLOCK_SIZE_W'(i)}; assign commit_in_isw[i] = {commit_in_data[i][DATA_WIS_OFF+BLOCK_SIZE_W +: (ISSUE_ISW_W-BLOCK_SIZE_W)], BLOCK_SIZE_W'(i)};
end else begin end else begin
assign commit_in_wsi[i] = commit_in_data[i][DATA_WIS_OFF +: ISSUE_IDX_W]; assign commit_in_isw[i] = commit_in_data[i][DATA_WIS_OFF +: ISSUE_ISW_W];
end end
end else begin end else begin
assign commit_in_wsi[i] = BLOCK_SIZE_W'(i); assign commit_in_isw[i] = BLOCK_SIZE_W'(i);
end end
end end
@@ -64,12 +64,12 @@ module VX_gather_unit import VX_gpu_pkg::*; #(
commit_out_data[i] = 'x; commit_out_data[i] = 'x;
end end
for (integer i = 0; i < BLOCK_SIZE; ++i) begin for (integer i = 0; i < BLOCK_SIZE; ++i) begin
commit_out_valid[commit_in_wsi[i]] = commit_in_valid[i]; commit_out_valid[commit_in_isw[i]] = commit_in_valid[i];
commit_out_data[commit_in_wsi[i]] = commit_in_data[i]; commit_out_data[commit_in_isw[i]] = commit_in_data[i];
end end
end end
for (genvar i = 0; i < BLOCK_SIZE; ++i) begin for (genvar i = 0; i < BLOCK_SIZE; ++i) begin
assign commit_in_ready[i] = commit_out_ready[commit_in_wsi[i]]; assign commit_in_ready[i] = commit_out_ready[commit_in_isw[i]];
end end
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin

View File

@@ -14,10 +14,10 @@
`include "VX_platform.vh" `include "VX_platform.vh"
module VX_ipdom_stack #( module VX_ipdom_stack #(
parameter WIDTH = 1, parameter WIDTH = 1,
parameter DEPTH = 1, parameter DEPTH = 1,
parameter OUT_REG = 0, parameter OUT_REG = 0,
parameter ADDRW = `LOG2UP(DEPTH) parameter ADDRW = `LOG2UP(DEPTH)
) ( ) (
input wire clk, input wire clk,
input wire reset, input wire reset,

View File

@@ -59,6 +59,11 @@ module VX_issue #(
) scoreboard ( ) scoreboard (
.clk (clk), .clk (clk),
.reset (scoreboard_reset), .reset (scoreboard_reset),
`ifdef PERF_ENABLE
.perf_scb_stalls(perf_issue_if.scb_stalls),
.perf_units_uses(perf_issue_if.units_uses),
.perf_sfu_uses (perf_issue_if.sfu_uses),
`endif
.writeback_if (writeback_if), .writeback_if (writeback_if),
.ibuffer_if (ibuffer_if), .ibuffer_if (ibuffer_if),
.scoreboard_if (scoreboard_if) .scoreboard_if (scoreboard_if)
@@ -80,7 +85,7 @@ module VX_issue #(
.clk (clk), .clk (clk),
.reset (dispatch_reset), .reset (dispatch_reset),
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
.perf_stalls (perf_issue_if.dsp_stalls), `UNUSED_PIN (perf_stalls),
`endif `endif
.operands_if (operands_if), .operands_if (operands_if),
.alu_dispatch_if(alu_dispatch_if), .alu_dispatch_if(alu_dispatch_if),
@@ -152,29 +157,18 @@ module VX_issue #(
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
reg [`PERF_CTR_BITS-1:0] perf_ibf_stalls; reg [`PERF_CTR_BITS-1:0] perf_ibf_stalls;
reg [`PERF_CTR_BITS-1:0] perf_scb_stalls;
wire [`CLOG2(`ISSUE_WIDTH+1)-1:0] scoreboard_stalls_per_cycle; wire decode_stall = decode_if.valid && ~decode_if.ready;
reg [`ISSUE_WIDTH-1:0] scoreboard_stalls;
for (genvar i=0; i < `ISSUE_WIDTH; ++i) begin
assign scoreboard_stalls[i] = ibuffer_if[i].valid && ~ibuffer_if[i].ready;
end
`POP_COUNT(scoreboard_stalls_per_cycle, scoreboard_stalls);
always @(posedge clk) begin always @(posedge clk) begin
if (reset) begin if (reset) begin
perf_ibf_stalls <= '0; perf_ibf_stalls <= '0;
perf_scb_stalls <= '0;
end else begin end else begin
if (decode_if.valid && ~decode_if.ready) begin perf_ibf_stalls <= perf_ibf_stalls + `PERF_CTR_BITS'(decode_stall);
perf_ibf_stalls <= perf_ibf_stalls + `PERF_CTR_BITS'(1);
end
perf_scb_stalls <= perf_scb_stalls + `PERF_CTR_BITS'(scoreboard_stalls_per_cycle);
end end
end end
assign perf_issue_if.ibf_stalls = perf_ibf_stalls; assign perf_issue_if.ibf_stalls = perf_ibf_stalls;
assign perf_issue_if.scb_stalls = perf_scb_stalls;
`endif `endif
endmodule endmodule

View File

@@ -96,7 +96,7 @@ module VX_lsu_unit import VX_gpu_pkg::*; #(
// detect duplicate addresses // detect duplicate addresses
wire lsu_is_dup; wire lsu_is_dup;
`ifdef LSU_DUP `ifdef LSU_DUP_ENABLE
if (NUM_LANES > 1) begin if (NUM_LANES > 1) begin
wire [NUM_LANES-2:0] addr_matches; wire [NUM_LANES-2:0] addr_matches;
for (genvar i = 0; i < (NUM_LANES-1); ++i) begin for (genvar i = 0; i < (NUM_LANES-1); ++i) begin
@@ -304,7 +304,7 @@ module VX_lsu_unit import VX_gpu_pkg::*; #(
assign mem_req_tag = { assign mem_req_tag = {
execute_if[0].data.uuid, lsu_addr_type, execute_if[0].data.wid, execute_if[0].data.tmask, execute_if[0].data.PC, execute_if[0].data.rd, execute_if[0].data.op_type, req_align, execute_if[0].data.pid, pkt_waddr execute_if[0].data.uuid, lsu_addr_type, execute_if[0].data.wid, execute_if[0].data.tmask, execute_if[0].data.PC, execute_if[0].data.rd, execute_if[0].data.op_type, req_align, execute_if[0].data.pid, pkt_waddr
`ifdef LSU_DUP `ifdef LSU_DUP_ENABLE
, lsu_is_dup , lsu_is_dup
`endif `endif
}; };
@@ -448,13 +448,13 @@ module VX_lsu_unit import VX_gpu_pkg::*; #(
wire [PID_WIDTH-1:0] rsp_pid; wire [PID_WIDTH-1:0] rsp_pid;
wire rsp_is_dup; wire rsp_is_dup;
`ifndef LSU_DUP `ifndef LSU_DUP_ENABLE
assign rsp_is_dup = 0; assign rsp_is_dup = 0;
`endif `endif
assign { assign {
rsp_uuid, rsp_addr_type, rsp_wid, rsp_tmask_uq, rsp_pc, rsp_rd, rsp_op_type, rsp_align, rsp_pid, pkt_raddr rsp_uuid, rsp_addr_type, rsp_wid, rsp_tmask_uq, rsp_pc, rsp_rd, rsp_op_type, rsp_align, rsp_pid, pkt_raddr
`ifdef LSU_DUP `ifdef LSU_DUP_ENABLE
, rsp_is_dup , rsp_is_dup
`endif `endif
} = mem_rsp_tag; } = mem_rsp_tag;
@@ -554,7 +554,7 @@ module VX_lsu_unit import VX_gpu_pkg::*; #(
VX_stream_arb #( VX_stream_arb #(
.NUM_INPUTS (2), .NUM_INPUTS (2),
.DATAW (RSP_ARB_DATAW), .DATAW (RSP_ARB_DATAW),
.OUT_REG (1) .OUT_REG (3)
) rsp_arb ( ) rsp_arb (
.clk (clk), .clk (clk),
.reset (commit_reset), .reset (commit_reset),

View File

@@ -220,8 +220,13 @@ module VX_muldiv_unit #(
wire [NUM_LANES-1:0][`XLEN-1:0] div_in2; wire [NUM_LANES-1:0][`XLEN-1:0] div_in2;
for (genvar i = 0; i < NUM_LANES; ++i) begin for (genvar i = 0; i < NUM_LANES; ++i) begin
`ifdef XLEN_64
assign div_in1[i] = is_alu_w ? {{(`XLEN-32){is_signed_op && execute_if.data.rs1_data[i][31]}}, execute_if.data.rs1_data[i][31:0]}: execute_if.data.rs1_data[i]; assign div_in1[i] = is_alu_w ? {{(`XLEN-32){is_signed_op && execute_if.data.rs1_data[i][31]}}, execute_if.data.rs1_data[i][31:0]}: execute_if.data.rs1_data[i];
assign div_in2[i] = is_alu_w ? {{(`XLEN-32){is_signed_op && execute_if.data.rs2_data[i][31]}}, execute_if.data.rs2_data[i][31:0]}: execute_if.data.rs2_data[i]; assign div_in2[i] = is_alu_w ? {{(`XLEN-32){is_signed_op && execute_if.data.rs2_data[i][31]}}, execute_if.data.rs2_data[i][31:0]}: execute_if.data.rs2_data[i];
`else
assign div_in1[i] = execute_if.data.rs1_data[i];
assign div_in2[i] = execute_if.data.rs2_data[i];
`endif
end end
`ifdef IDIV_DPI `ifdef IDIV_DPI

View File

@@ -26,6 +26,7 @@ module VX_operands import VX_gpu_pkg::*; #(
); );
`UNUSED_PARAM (CORE_ID) `UNUSED_PARAM (CORE_ID)
localparam DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `XLEN + 1 + `EX_BITS + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + `XLEN + `NR_BITS; localparam DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `XLEN + 1 + `EX_BITS + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + `XLEN + `NR_BITS;
localparam RAM_ADDRW = `LOG2UP(`NUM_REGS * ISSUE_RATIO);
localparam STATE_IDLE = 2'd0; localparam STATE_IDLE = 2'd0;
localparam STATE_FETCH1 = 2'd1; localparam STATE_FETCH1 = 2'd1;
@@ -38,11 +39,16 @@ module VX_operands import VX_gpu_pkg::*; #(
reg [`NR_BITS-1:0] gpr_rd_rid, gpr_rd_rid_n; reg [`NR_BITS-1:0] gpr_rd_rid, gpr_rd_rid_n;
reg [ISSUE_WIS_W-1:0] gpr_rd_wis, gpr_rd_wis_n; reg [ISSUE_WIS_W-1:0] gpr_rd_wis, gpr_rd_wis_n;
reg [ISSUE_RATIO-1:0][`NUM_THREADS-1:0][`XLEN-1:0] cache_data, cache_data_n; reg [`NUM_THREADS-1:0][`XLEN-1:0] cache_data [ISSUE_RATIO-1:0];
reg [ISSUE_RATIO-1:0][`NR_BITS-1:0] cache_reg, cache_reg_n; reg [`NUM_THREADS-1:0][`XLEN-1:0] cache_data_n [ISSUE_RATIO-1:0];
reg [ISSUE_RATIO-1:0][`NUM_THREADS-1:0] cache_tmask, cache_tmask_n; reg [`NR_BITS-1:0] cache_reg [ISSUE_RATIO-1:0];
reg [`NR_BITS-1:0] cache_reg_n [ISSUE_RATIO-1:0];
reg [`NUM_THREADS-1:0] cache_tmask [ISSUE_RATIO-1:0];
reg [`NUM_THREADS-1:0] cache_tmask_n [ISSUE_RATIO-1:0];
reg [ISSUE_RATIO-1:0] cache_eop, cache_eop_n; reg [ISSUE_RATIO-1:0] cache_eop, cache_eop_n;
reg valid_out_r;
reg [DATAW-1:0] data_out_r;
reg [`NUM_THREADS-1:0][`XLEN-1:0] rs1_data, rs1_data_n; reg [`NUM_THREADS-1:0][`XLEN-1:0] rs1_data, rs1_data_n;
reg [`NUM_THREADS-1:0][`XLEN-1:0] rs2_data, rs2_data_n; reg [`NUM_THREADS-1:0][`XLEN-1:0] rs2_data, rs2_data_n;
reg [`NUM_THREADS-1:0][`XLEN-1:0] rs3_data, rs3_data_n; reg [`NUM_THREADS-1:0][`XLEN-1:0] rs3_data, rs3_data_n;
@@ -54,12 +60,12 @@ module VX_operands import VX_gpu_pkg::*; #(
reg rs3_ready, rs3_ready_n; reg rs3_ready, rs3_ready_n;
reg data_ready, data_ready_n; reg data_ready, data_ready_n;
wire ready_out = operands_if[i].ready;
wire is_rs1_zero = (scoreboard_if[i].data.rs1 == 0); wire is_rs1_zero = (scoreboard_if[i].data.rs1 == 0);
wire is_rs2_zero = (scoreboard_if[i].data.rs2 == 0); wire is_rs2_zero = (scoreboard_if[i].data.rs2 == 0);
wire is_rs3_zero = (scoreboard_if[i].data.rs3 == 0); wire is_rs3_zero = (scoreboard_if[i].data.rs3 == 0);
VX_operands_if staging_if();
always @(*) begin always @(*) begin
state_n = state; state_n = state;
rs2_n = rs2; rs2_n = rs2;
@@ -79,7 +85,7 @@ module VX_operands import VX_gpu_pkg::*; #(
case (state) case (state)
STATE_IDLE: begin STATE_IDLE: begin
if (staging_if.valid && staging_if.ready) begin if (valid_out_r && ready_out) begin
data_ready_n = 0; data_ready_n = 0;
end end
if (scoreboard_if[i].valid && data_ready_n == 0) begin if (scoreboard_if[i].valid && data_ready_n == 0) begin
@@ -160,11 +166,8 @@ module VX_operands import VX_gpu_pkg::*; #(
end end
cache_reg_n[writeback_if[i].data.wis] = writeback_if[i].data.rd; cache_reg_n[writeback_if[i].data.wis] = writeback_if[i].data.rd;
cache_eop_n[writeback_if[i].data.wis] = writeback_if[i].data.eop; cache_eop_n[writeback_if[i].data.wis] = writeback_if[i].data.eop;
if (writeback_if[i].data.sop) begin cache_tmask_n[writeback_if[i].data.wis] = writeback_if[i].data.sop ? writeback_if[i].data.tmask :
cache_tmask_n[writeback_if[i].data.wis] = writeback_if[i].data.tmask; (cache_tmask_n[writeback_if[i].data.wis] | writeback_if[i].data.tmask);
end else begin
cache_tmask_n[writeback_if[i].data.wis] |= writeback_if[i].data.tmask;
end
end end
end end
end end
@@ -172,32 +175,84 @@ module VX_operands import VX_gpu_pkg::*; #(
always @(posedge clk) begin always @(posedge clk) begin
if (reset) begin if (reset) begin
state <= STATE_IDLE; state <= STATE_IDLE;
gpr_rd_rid <= '0;
gpr_rd_wis <= '0;
cache_eop <= {ISSUE_RATIO{1'b1}}; cache_eop <= {ISSUE_RATIO{1'b1}};
cache_reg <= '0;
data_ready <= 0; data_ready <= 0;
valid_out_r <= 0;
end else begin end else begin
state <= state_n; state <= state_n;
rs2 <= rs2_n;
rs3 <= rs3_n;
rs2_ready <= rs2_ready_n;
rs3_ready <= rs3_ready_n;
rs1_data <= rs1_data_n;
rs2_data <= rs2_data_n;
rs3_data <= rs3_data_n;
gpr_rd_rid <= gpr_rd_rid_n;
gpr_rd_wis <= gpr_rd_wis_n;
cache_data <= cache_data_n;
cache_reg <= cache_reg_n;
cache_tmask <= cache_tmask_n;
cache_eop <= cache_eop_n; cache_eop <= cache_eop_n;
data_ready <= data_ready_n; data_ready <= data_ready_n;
if (~valid_out_r) begin
valid_out_r <= scoreboard_if[i].valid && data_ready;
end else if (ready_out) begin
valid_out_r <= 0;
end
end end
if (~valid_out_r) begin
data_out_r <= {scoreboard_if[i].data.uuid,
scoreboard_if[i].data.wis,
scoreboard_if[i].data.tmask,
scoreboard_if[i].data.PC,
scoreboard_if[i].data.wb,
scoreboard_if[i].data.ex_type,
scoreboard_if[i].data.op_type,
scoreboard_if[i].data.op_mod,
scoreboard_if[i].data.use_PC,
scoreboard_if[i].data.use_imm,
scoreboard_if[i].data.imm,
scoreboard_if[i].data.rd};
end
gpr_rd_rid <= gpr_rd_rid_n;
gpr_rd_wis <= gpr_rd_wis_n;
rs2_ready <= rs2_ready_n;
rs3_ready <= rs3_ready_n;
rs2 <= rs2_n;
rs3 <= rs3_n;
rs1_data <= rs1_data_n;
rs2_data <= rs2_data_n;
rs3_data <= rs3_data_n;
cache_data <= cache_data_n;
cache_reg <= cache_reg_n;
cache_tmask <= cache_tmask_n;
end end
assign operands_if[i].valid = valid_out_r;
assign {operands_if[i].data.uuid,
operands_if[i].data.wis,
operands_if[i].data.tmask,
operands_if[i].data.PC,
operands_if[i].data.wb,
operands_if[i].data.ex_type,
operands_if[i].data.op_type,
operands_if[i].data.op_mod,
operands_if[i].data.use_PC,
operands_if[i].data.use_imm,
operands_if[i].data.imm,
operands_if[i].data.rd} = data_out_r;
assign operands_if[i].data.rs1_data = rs1_data;
assign operands_if[i].data.rs2_data = rs2_data;
assign operands_if[i].data.rs3_data = rs3_data;
assign scoreboard_if[i].ready = ~valid_out_r && data_ready;
// GPR banks // GPR banks
reg [RAM_ADDRW-1:0] gpr_rd_addr;
wire [RAM_ADDRW-1:0] gpr_wr_addr;
if (ISSUE_WIS != 0) begin
assign gpr_wr_addr = {writeback_if[i].data.wis, writeback_if[i].data.rd};
always @(posedge clk) begin
gpr_rd_addr <= {gpr_rd_wis_n, gpr_rd_rid_n};
end
end else begin
assign gpr_wr_addr = writeback_if[i].data.rd;
always @(posedge clk) begin
gpr_rd_addr <= gpr_rd_rid_n;
end
end
`ifdef GPR_RESET `ifdef GPR_RESET
reg wr_enabled = 0; reg wr_enabled = 0;
always @(posedge clk) begin always @(posedge clk) begin
@@ -205,8 +260,6 @@ module VX_operands import VX_gpu_pkg::*; #(
wr_enabled <= 1; wr_enabled <= 1;
end end
end end
`else
wire wr_enabled = 1;
`endif `endif
for (genvar j = 0; j < `NUM_THREADS; ++j) begin for (genvar j = 0; j < `NUM_THREADS; ++j) begin
@@ -222,81 +275,17 @@ module VX_operands import VX_gpu_pkg::*; #(
.clk (clk), .clk (clk),
.read (1'b1), .read (1'b1),
`UNUSED_PIN (wren), `UNUSED_PIN (wren),
`ifdef GPR_RESET
.write (wr_enabled && writeback_if[i].valid && writeback_if[i].data.tmask[j]), .write (wr_enabled && writeback_if[i].valid && writeback_if[i].data.tmask[j]),
.waddr (wis_to_addr(writeback_if[i].data.rd, writeback_if[i].data.wis)), `else
.write (writeback_if[i].valid && writeback_if[i].data.tmask[j]),
`endif
.waddr (gpr_wr_addr),
.wdata (writeback_if[i].data.data[j]), .wdata (writeback_if[i].data.data[j]),
.raddr (wis_to_addr(gpr_rd_rid, gpr_rd_wis)), .raddr (gpr_rd_addr),
.rdata (gpr_rd_data[j]) .rdata (gpr_rd_data[j])
); );
end end
// staging buffer
`RESET_RELAY (stg_buf_reset, reset);
VX_elastic_buffer #(
.DATAW (DATAW)
) stg_buf (
.clk (clk),
.reset (stg_buf_reset),
.valid_in (scoreboard_if[i].valid),
.ready_in (scoreboard_if[i].ready),
.data_in ({
scoreboard_if[i].data.uuid,
scoreboard_if[i].data.wis,
scoreboard_if[i].data.tmask,
scoreboard_if[i].data.PC,
scoreboard_if[i].data.wb,
scoreboard_if[i].data.ex_type,
scoreboard_if[i].data.op_type,
scoreboard_if[i].data.op_mod,
scoreboard_if[i].data.use_PC,
scoreboard_if[i].data.use_imm,
scoreboard_if[i].data.imm,
scoreboard_if[i].data.rd}),
.data_out ({
staging_if.data.uuid,
staging_if.data.wis,
staging_if.data.tmask,
staging_if.data.PC,
staging_if.data.wb,
staging_if.data.ex_type,
staging_if.data.op_type,
staging_if.data.op_mod,
staging_if.data.use_PC,
staging_if.data.use_imm,
staging_if.data.imm,
staging_if.data.rd}),
.valid_out (staging_if.valid),
.ready_out (staging_if.ready)
);
assign staging_if.data.rs1_data = rs1_data;
assign staging_if.data.rs2_data = rs2_data;
assign staging_if.data.rs3_data = rs3_data;
// output buffer
wire valid_stg, ready_stg;
assign valid_stg = staging_if.valid && data_ready;
assign staging_if.ready = ready_stg && data_ready;
`RESET_RELAY (out_buf_reset, reset);
VX_elastic_buffer #(
.DATAW (DATAW + (3 * `NUM_THREADS * `XLEN)),
.SIZE (2),
.OUT_REG (2)
) out_buf (
.clk (clk),
.reset (out_buf_reset),
.valid_in (valid_stg),
.ready_in (ready_stg),
.data_in (staging_if.data),
.data_out (operands_if[i].data),
.valid_out (operands_if[i].valid),
.ready_out (operands_if[i].ready)
);
end end
endmodule endmodule

View File

@@ -19,6 +19,10 @@ module VX_schedule import VX_gpu_pkg::*; #(
input wire clk, input wire clk,
input wire reset, input wire reset,
`ifdef PERF_ENABLE
VX_pipeline_perf_if.schedule perf_schedule_if,
`endif
// configuration // configuration
input base_dcrs_t base_dcrs, input base_dcrs_t base_dcrs,
@@ -304,6 +308,7 @@ module VX_schedule import VX_gpu_pkg::*; #(
localparam GNW_WIDTH = `LOG2UP(`NUM_CLUSTERS * `NUM_CORES * `NUM_WARPS); localparam GNW_WIDTH = `LOG2UP(`NUM_CLUSTERS * `NUM_CORES * `NUM_WARPS);
reg [`UUID_WIDTH-1:0] instr_uuid; reg [`UUID_WIDTH-1:0] instr_uuid;
wire [GNW_WIDTH-1:0] g_wid = (GNW_WIDTH'(CORE_ID) << `NW_BITS) + GNW_WIDTH'(schedule_wid); wire [GNW_WIDTH-1:0] g_wid = (GNW_WIDTH'(CORE_ID) << `NW_BITS) + GNW_WIDTH'(schedule_wid);
`ifdef SV_DPI
always @(posedge clk) begin always @(posedge clk) begin
if (reset) begin if (reset) begin
instr_uuid <= `UUID_WIDTH'(dpi_uuid_gen(1, 0, 0)); instr_uuid <= `UUID_WIDTH'(dpi_uuid_gen(1, 0, 0));
@@ -311,6 +316,12 @@ module VX_schedule import VX_gpu_pkg::*; #(
instr_uuid <= `UUID_WIDTH'(dpi_uuid_gen(0, 32'(g_wid), 64'(schedule_pc))); instr_uuid <= `UUID_WIDTH'(dpi_uuid_gen(0, 32'(g_wid), 64'(schedule_pc)));
end end
end end
`else
wire [GNW_WIDTH+16-1:0] w_uuid = {g_wid, 16'(schedule_pc)};
always @(*) begin
instr_uuid = `UUID_WIDTH'(w_uuid);
end
`endif
`else `else
wire [`UUID_WIDTH-1:0] instr_uuid = '0; wire [`UUID_WIDTH-1:0] instr_uuid = '0;
`endif `endif
@@ -349,7 +360,7 @@ module VX_schedule import VX_gpu_pkg::*; #(
.empty (no_pending_instr) .empty (no_pending_instr)
); );
`BUFFER_BUSY (busy, (active_warps != 0 || ~no_pending_instr), 1); `BUFFER_EX(busy, (active_warps != 0 || ~no_pending_instr), 1'b1, 1);
// export CSRs // export CSRs
assign sched_csr_if.cycles = cycles; assign sched_csr_if.cycles = cycles;
@@ -376,4 +387,25 @@ module VX_schedule import VX_gpu_pkg::*; #(
end end
`RUNTIME_ASSERT(timeout_ctr < `STALL_TIMEOUT, ("%t: *** core%0d-scheduler-timeout: stalled_warps=%b", $time, CORE_ID, stalled_warps)); `RUNTIME_ASSERT(timeout_ctr < `STALL_TIMEOUT, ("%t: *** core%0d-scheduler-timeout: stalled_warps=%b", $time, CORE_ID, stalled_warps));
`ifdef PERF_ENABLE
reg [`PERF_CTR_BITS-1:0] perf_sched_idles;
reg [`PERF_CTR_BITS-1:0] perf_sched_stalls;
wire schedule_idle = ~schedule_valid;
wire schedule_stall = schedule_if.valid && ~schedule_if.ready;
always @(posedge clk) begin
if (reset) begin
perf_sched_idles <= '0;
perf_sched_stalls <= '0;
end else begin
perf_sched_idles <= perf_sched_idles + `PERF_CTR_BITS'(schedule_idle);
perf_sched_stalls <= perf_sched_stalls + `PERF_CTR_BITS'(schedule_stall);
end
end
assign perf_schedule_if.sched_idles = perf_sched_idles;
assign perf_schedule_if.sched_stalls = perf_sched_stalls;
`endif
endmodule endmodule

View File

@@ -19,6 +19,12 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
input wire clk, input wire clk,
input wire reset, input wire reset,
`ifdef PERF_ENABLE
output reg [`PERF_CTR_BITS-1:0] perf_scb_stalls,
output reg [`PERF_CTR_BITS-1:0] perf_units_uses [`NUM_EX_UNITS],
output reg [`PERF_CTR_BITS-1:0] perf_sfu_uses [`NUM_SFU_UNITS],
`endif
VX_writeback_if.slave writeback_if [`ISSUE_WIDTH], VX_writeback_if.slave writeback_if [`ISSUE_WIDTH],
VX_ibuffer_if.slave ibuffer_if [`ISSUE_WIDTH], VX_ibuffer_if.slave ibuffer_if [`ISSUE_WIDTH],
VX_ibuffer_if.master scoreboard_if [`ISSUE_WIDTH] VX_ibuffer_if.master scoreboard_if [`ISSUE_WIDTH]
@@ -26,101 +32,186 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
`UNUSED_PARAM (CORE_ID) `UNUSED_PARAM (CORE_ID)
localparam DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `XLEN + `EX_BITS + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + `XLEN + (`NR_BITS * 4) + 1; localparam DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `XLEN + `EX_BITS + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + `XLEN + (`NR_BITS * 4) + 1;
`ifdef PERF_ENABLE
reg [`ISSUE_WIDTH-1:0][`NUM_EX_UNITS-1:0] perf_issue_units_per_cycle;
wire [`NUM_EX_UNITS-1:0] perf_units_per_cycle, perf_units_per_cycle_r;
reg [`ISSUE_WIDTH-1:0][`NUM_SFU_UNITS-1:0] perf_issue_sfu_per_cycle;
wire [`NUM_SFU_UNITS-1:0] perf_sfu_per_cycle, perf_sfu_per_cycle_r;
wire [`ISSUE_WIDTH-1:0] perf_issue_stalls_per_cycle;
wire [`CLOG2(`ISSUE_WIDTH+1)-1:0] perf_stalls_per_cycle, perf_stalls_per_cycle_r;
`POP_COUNT(perf_stalls_per_cycle, perf_issue_stalls_per_cycle);
VX_reduce #(
.DATAW_IN (`NUM_EX_UNITS),
.N (`ISSUE_WIDTH),
.OP ("|")
) perf_units_reduce (
.data_in (perf_issue_units_per_cycle),
.data_out (perf_units_per_cycle)
);
VX_reduce #(
.DATAW_IN (`NUM_SFU_UNITS),
.N (`ISSUE_WIDTH),
.OP ("|")
) perf_sfu_reduce (
.data_in (perf_issue_sfu_per_cycle),
.data_out (perf_sfu_per_cycle)
);
`BUFFER(perf_stalls_per_cycle_r, perf_stalls_per_cycle);
`BUFFER(perf_units_per_cycle_r, perf_units_per_cycle);
`BUFFER(perf_sfu_per_cycle_r, perf_sfu_per_cycle);
always @(posedge clk) begin
if (reset) begin
perf_scb_stalls <= '0;
end else begin
perf_scb_stalls <= perf_scb_stalls + `PERF_CTR_BITS'(perf_stalls_per_cycle_r);
end
end
for (genvar i = 0; i < `NUM_EX_UNITS; ++i) begin
always @(posedge clk) begin
if (reset) begin
perf_units_uses[i] <= '0;
end else begin
perf_units_uses[i] <= perf_units_uses[i] + `PERF_CTR_BITS'(perf_units_per_cycle_r[i]);
end
end
end
for (genvar i = 0; i < `NUM_SFU_UNITS; ++i) begin
always @(posedge clk) begin
if (reset) begin
perf_sfu_uses[i] <= '0;
end else begin
perf_sfu_uses[i] <= perf_sfu_uses[i] + `PERF_CTR_BITS'(perf_sfu_per_cycle_r[i]);
end
end
end
`endif
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
reg [`UP(ISSUE_RATIO)-1:0][`NUM_REGS-1:0] inuse_regs, inuse_regs_n; reg [`UP(ISSUE_RATIO)-1:0][`NUM_REGS-1:0] inuse_regs;
reg [3:0] ready_masks, ready_masks_n;
VX_ibuffer_if staging_if();
wire writeback_fire = writeback_if[i].valid && writeback_if[i].data.eop; wire writeback_fire = writeback_if[i].valid && writeback_if[i].data.eop;
wire inuse_rd = inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd];
wire inuse_rs1 = inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs1];
wire inuse_rs2 = inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs2];
wire inuse_rs3 = inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3];
`ifdef PERF_ENABLE
reg [`UP(ISSUE_RATIO)-1:0][`NUM_REGS-1:0][`EX_WIDTH-1:0] inuse_units;
reg [`UP(ISSUE_RATIO)-1:0][`NUM_REGS-1:0][`SFU_WIDTH-1:0] inuse_sfu;
reg [`SFU_WIDTH-1:0] sfu_type;
always @(*) begin always @(*) begin
inuse_regs_n = inuse_regs; case (scoreboard_if[i].data.op_type)
ready_masks_n = ready_masks; `INST_SFU_CSRRW,
if (writeback_fire) begin `INST_SFU_CSRRS,
inuse_regs_n[writeback_if[i].data.wis][writeback_if[i].data.rd] = 0; `INST_SFU_CSRRC: sfu_type = `SFU_CSRS;
ready_masks_n |= {4{(ISSUE_RATIO == 0) || writeback_if[i].data.wis == staging_if.data.wis}} default: sfu_type = `SFU_WCTL;
& {(writeback_if[i].data.rd == staging_if.data.rd), endcase
(writeback_if[i].data.rd == staging_if.data.rs1), end
(writeback_if[i].data.rd == staging_if.data.rs2),
(writeback_if[i].data.rd == staging_if.data.rs3)}; always @(*) begin
end perf_issue_units_per_cycle[i] = '0;
if (staging_if.valid && staging_if.ready && staging_if.data.wb) begin perf_issue_sfu_per_cycle[i] = '0;
inuse_regs_n[staging_if.data.wis][staging_if.data.rd] = 1; if (ibuffer_if[i].valid) begin
ready_masks_n = '0; if (inuse_rd) begin
end perf_issue_units_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd]] = 1;
if (ibuffer_if[i].valid && ibuffer_if[i].ready) begin if (inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd] == `EX_SFU) begin
ready_masks_n = ~{inuse_regs_n[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd], perf_issue_sfu_per_cycle[i][inuse_sfu[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd]] = 1;
inuse_regs_n[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs1], end
inuse_regs_n[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs2], end
inuse_regs_n[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3]}; if (inuse_rs1) begin
perf_issue_units_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs1]] = 1;
if (inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs1] == `EX_SFU) begin
perf_issue_sfu_per_cycle[i][inuse_sfu[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs1]] = 1;
end
end
if (inuse_rs2) begin
perf_issue_units_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs2]] = 1;
if (inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs2] == `EX_SFU) begin
perf_issue_sfu_per_cycle[i][inuse_sfu[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs2]] = 1;
end
end
if (inuse_rs3) begin
perf_issue_units_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3]] = 1;
if (inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3] == `EX_SFU) begin
perf_issue_sfu_per_cycle[i][inuse_sfu[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3]] = 1;
end
end
end end
end end
assign perf_issue_stalls_per_cycle[i] = ibuffer_if[i].valid && ~ibuffer_if[i].ready;
`endif
reg [DATAW-1:0] data_out_r;
reg valid_out_r;
wire ready_out;
wire [3:0] ready_masks = ~{inuse_rd, inuse_rs1, inuse_rs2, inuse_rs3};
wire deps_ready = (& ready_masks);
wire valid_in = ibuffer_if[i].valid && deps_ready;
wire ready_in = ~valid_out_r && deps_ready;
wire [DATAW-1:0] data_in = ibuffer_if[i].data;
assign ready_out = scoreboard_if[i].ready;
always @(posedge clk) begin always @(posedge clk) begin
if (reset) begin if (reset) begin
inuse_regs <= '0; valid_out_r <= 0;
ready_masks <= '0; inuse_regs <= '0;
end else begin end else begin
inuse_regs <= inuse_regs_n; if (writeback_fire) begin
ready_masks <= ready_masks_n; inuse_regs[writeback_if[i].data.wis][writeback_if[i].data.rd] <= 0;
end
if (~valid_out_r) begin
valid_out_r <= valid_in;
end else if (ready_out) begin
if (scoreboard_if[i].data.wb) begin
inuse_regs[scoreboard_if[i].data.wis][scoreboard_if[i].data.rd] <= 1;
`ifdef PERF_ENABLE
inuse_units[scoreboard_if[i].data.wis][scoreboard_if[i].data.rd] <= scoreboard_if[i].data.ex_type;
if (scoreboard_if[i].data.ex_type == `EX_SFU) begin
inuse_sfu[scoreboard_if[i].data.wis][scoreboard_if[i].data.rd] <= sfu_type;
end
`endif
end
valid_out_r <= 0;
end
end
if (~valid_out_r) begin
data_out_r <= data_in;
end end
end end
// staging buffer assign ibuffer_if[i].ready = ready_in;
assign scoreboard_if[i].valid = valid_out_r;
`RESET_RELAY (stg_buf_reset, reset); assign scoreboard_if[i].data = data_out_r;
VX_elastic_buffer #(
.DATAW (DATAW)
) stg_buf (
.clk (clk),
.reset (stg_buf_reset),
.valid_in (ibuffer_if[i].valid),
.ready_in (ibuffer_if[i].ready),
.data_in (ibuffer_if[i].data),
.data_out (staging_if.data),
.valid_out (staging_if.valid),
.ready_out (staging_if.ready)
);
// output buffer
wire valid_stg, ready_stg;
wire regs_ready = (& ready_masks);
assign valid_stg = staging_if.valid && regs_ready;
assign staging_if.ready = ready_stg && regs_ready;
`RESET_RELAY (out_buf_reset, reset);
VX_elastic_buffer #(
.DATAW (DATAW),
.SIZE (2),
.OUT_REG (2)
) out_buf (
.clk (clk),
.reset (out_buf_reset),
.valid_in (valid_stg),
.ready_in (ready_stg),
.data_in (staging_if.data),
.data_out (scoreboard_if[i].data),
.valid_out (scoreboard_if[i].valid),
.ready_out (scoreboard_if[i].ready)
);
`ifdef SIMULATION
reg [31:0] timeout_ctr; reg [31:0] timeout_ctr;
always @(posedge clk) begin always @(posedge clk) begin
if (reset) begin if (reset) begin
timeout_ctr <= '0; timeout_ctr <= '0;
end else begin end else begin
if (staging_if.valid && ~regs_ready) begin if (ibuffer_if[i].valid && ~ibuffer_if[i].ready) begin
`ifdef DBG_TRACE_CORE_PIPELINE `ifdef DBG_TRACE_CORE_PIPELINE
`TRACE(3, ("%d: *** core%0d-scoreboard-stall: wid=%0d, PC=0x%0h, tmask=%b, cycles=%0d, inuse=%b (#%0d)\n", `TRACE(3, ("%d: *** core%0d-scoreboard-stall: wid=%0d, PC=0x%0h, tmask=%b, cycles=%0d, inuse=%b (#%0d)\n",
$time, CORE_ID, wis_to_wid(staging_if.data.wis, i), staging_if.data.PC, staging_if.data.tmask, timeout_ctr, $time, CORE_ID, wis_to_wid(ibuffer_if[i].data.wis, i), ibuffer_if[i].data.PC, ibuffer_if[i].data.tmask, timeout_ctr,
~ready_masks, staging_if.data.uuid)); ~ready_masks, ibuffer_if[i].data.uuid));
`endif `endif
timeout_ctr <= timeout_ctr + 1; timeout_ctr <= timeout_ctr + 1;
end else if (staging_if.valid && staging_if.ready) begin end else if (ibuffer_if[i].valid && ibuffer_if[i].ready) begin
timeout_ctr <= '0; timeout_ctr <= '0;
end end
end end
@@ -128,12 +219,14 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
`RUNTIME_ASSERT((timeout_ctr < `STALL_TIMEOUT), `RUNTIME_ASSERT((timeout_ctr < `STALL_TIMEOUT),
("%t: *** core%0d-scoreboard-timeout: wid=%0d, PC=0x%0h, tmask=%b, cycles=%0d, inuse=%b (#%0d)", ("%t: *** core%0d-scoreboard-timeout: wid=%0d, PC=0x%0h, tmask=%b, cycles=%0d, inuse=%b (#%0d)",
$time, CORE_ID, wis_to_wid(staging_if.data.wis, i), staging_if.data.PC, staging_if.data.tmask, timeout_ctr, $time, CORE_ID, wis_to_wid(ibuffer_if[i].data.wis, i), ibuffer_if[i].data.PC, ibuffer_if[i].data.tmask, timeout_ctr,
~ready_masks, staging_if.data.uuid)); ~ready_masks, ibuffer_if[i].data.uuid));
`RUNTIME_ASSERT(~writeback_fire || inuse_regs[writeback_if[i].data.wis][writeback_if[i].data.rd] != 0, `RUNTIME_ASSERT(~writeback_fire || inuse_regs[writeback_if[i].data.wis][writeback_if[i].data.rd] != 0,
("%t: *** core%0d: invalid writeback register: wid=%0d, PC=0x%0h, tmask=%b, rd=%0d (#%0d)", ("%t: *** core%0d: invalid writeback register: wid=%0d, PC=0x%0h, tmask=%b, rd=%0d (#%0d)",
$time, CORE_ID, wis_to_wid(writeback_if[i].data.wis, i), writeback_if[i].data.PC, writeback_if[i].data.tmask, writeback_if[i].data.rd, writeback_if[i].data.uuid)); $time, CORE_ID, wis_to_wid(writeback_if[i].data.wis, i), writeback_if[i].data.PC, writeback_if[i].data.tmask, writeback_if[i].data.rd, writeback_if[i].data.uuid));
`endif
end end
endmodule endmodule

View File

@@ -48,7 +48,7 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
localparam RSP_ARB_DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + (NUM_LANES * `XLEN) + `NR_BITS + 1 + `XLEN + PID_WIDTH + 1 + 1; localparam RSP_ARB_DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + (NUM_LANES * `XLEN) + `NR_BITS + 1 + `XLEN + PID_WIDTH + 1 + 1;
localparam RSP_ARB_SIZE = 1 + 1; localparam RSP_ARB_SIZE = 1 + 1;
localparam RSP_ARB_IDX_WCTL = 0; localparam RSP_ARB_IDX_WCTL = 0;
localparam RSP_ARB_IDX_CSR = 1; localparam RSP_ARB_IDX_CSRS = 1;
VX_execute_if #( VX_execute_if #(
.NUM_LANES (NUM_LANES) .NUM_LANES (NUM_LANES)
@@ -71,9 +71,6 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
wire [RSP_ARB_SIZE-1:0] rsp_arb_ready_in; wire [RSP_ARB_SIZE-1:0] rsp_arb_ready_in;
wire [RSP_ARB_SIZE-1:0][RSP_ARB_DATAW-1:0] rsp_arb_data_in; wire [RSP_ARB_SIZE-1:0][RSP_ARB_DATAW-1:0] rsp_arb_data_in;
`ifdef PERF_ENABLE
VX_sfu_perf_if sfu_perf_if();
`endif
// Warp control block // Warp control block
VX_execute_if #( VX_execute_if #(
@@ -129,7 +126,6 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
.mem_perf_if (mem_perf_if), .mem_perf_if (mem_perf_if),
.pipeline_perf_if(pipeline_perf_if), .pipeline_perf_if(pipeline_perf_if),
.sfu_perf_if (sfu_perf_if),
`endif `endif
`ifdef EXT_F_ENABLE `ifdef EXT_F_ENABLE
@@ -141,18 +137,18 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
.commit_if (csr_commit_if) .commit_if (csr_commit_if)
); );
assign rsp_arb_valid_in[RSP_ARB_IDX_CSR] = csr_commit_if.valid; assign rsp_arb_valid_in[RSP_ARB_IDX_CSRS] = csr_commit_if.valid;
assign rsp_arb_data_in[RSP_ARB_IDX_CSR] = csr_commit_if.data; assign rsp_arb_data_in[RSP_ARB_IDX_CSRS] = csr_commit_if.data;
assign csr_commit_if.ready = rsp_arb_ready_in[RSP_ARB_IDX_CSR]; assign csr_commit_if.ready = rsp_arb_ready_in[RSP_ARB_IDX_CSRS];
// can accept new request? // can accept new request?
reg sfu_req_ready; reg sfu_req_ready;
always @(*) begin always @(*) begin
case (execute_if[0].data.op_type) case (execute_if[0].data.op_type)
`INST_SFU_CSRRW, `INST_SFU_CSRRW,
`INST_SFU_CSRRS, `INST_SFU_CSRRS,
`INST_SFU_CSRRC: sfu_req_ready = csr_execute_if.ready; `INST_SFU_CSRRC: sfu_req_ready = csr_execute_if.ready;
default: sfu_req_ready = wctl_execute_if.ready; default: sfu_req_ready = wctl_execute_if.ready;
endcase endcase
end end
@@ -170,7 +166,7 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
.NUM_INPUTS (RSP_ARB_SIZE), .NUM_INPUTS (RSP_ARB_SIZE),
.DATAW (RSP_ARB_DATAW), .DATAW (RSP_ARB_DATAW),
.ARBITER ("R"), .ARBITER ("R"),
.OUT_REG (1) .OUT_REG (3)
) rsp_arb ( ) rsp_arb (
.clk (clk), .clk (clk),
.reset (commit_reset), .reset (commit_reset),
@@ -186,7 +182,7 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
VX_gather_unit #( VX_gather_unit #(
.BLOCK_SIZE (BLOCK_SIZE), .BLOCK_SIZE (BLOCK_SIZE),
.NUM_LANES (NUM_LANES), .NUM_LANES (NUM_LANES),
.OUT_REG (3) .OUT_REG (1)
) gather_unit ( ) gather_unit (
.clk (clk), .clk (clk),
.reset (commit_reset), .reset (commit_reset),
@@ -194,16 +190,4 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
.commit_out_if (commit_if) .commit_out_if (commit_if)
); );
`ifdef PERF_ENABLE
reg [`PERF_CTR_BITS-1:0] perf_wctl_stalls;
always @(posedge clk) begin
if (reset) begin
perf_wctl_stalls <= '0;
end else begin
perf_wctl_stalls <= perf_wctl_stalls + `PERF_CTR_BITS'(wctl_execute_if.valid && ~wctl_execute_if.ready);
end
end
assign sfu_perf_if.wctl_stalls = perf_wctl_stalls;
`endif
endmodule endmodule

View File

@@ -14,9 +14,7 @@
`ifndef VX_TRACE_VH `ifndef VX_TRACE_VH
`define VX_TRACE_VH `define VX_TRACE_VH
`ifndef SYNTHESIS `ifdef SIMULATION
`include "VX_define.vh"
task trace_ex_type(input int level, input [`EX_BITS-1:0] ex_type); task trace_ex_type(input int level, input [`EX_BITS-1:0] ex_type);
case (ex_type) case (ex_type)

View File

@@ -29,7 +29,6 @@ module VX_wctl_unit import VX_gpu_pkg::*; #(
); );
`UNUSED_PARAM (CORE_ID) `UNUSED_PARAM (CORE_ID)
localparam LANE_BITS = `CLOG2(NUM_LANES); localparam LANE_BITS = `CLOG2(NUM_LANES);
localparam LANE_WIDTH = `UP(LANE_BITS);
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES); localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
localparam PID_WIDTH = `UP(PID_BITS); localparam PID_WIDTH = `UP(PID_BITS);
localparam WCTL_WIDTH = $bits(tmc_t) + $bits(wspawn_t) + $bits(split_t) + $bits(join_t) + $bits(barrier_t); localparam WCTL_WIDTH = $bits(tmc_t) + $bits(wspawn_t) + $bits(split_t) + $bits(join_t) + $bits(barrier_t);
@@ -50,7 +49,7 @@ module VX_wctl_unit import VX_gpu_pkg::*; #(
wire is_join = (execute_if.data.op_type == `INST_SFU_JOIN); wire is_join = (execute_if.data.op_type == `INST_SFU_JOIN);
wire is_bar = (execute_if.data.op_type == `INST_SFU_BAR); wire is_bar = (execute_if.data.op_type == `INST_SFU_BAR);
wire [LANE_WIDTH-1:0] tid; wire [`UP(LANE_BITS)-1:0] tid;
if (LANE_BITS != 0) begin if (LANE_BITS != 0) begin
assign tid = execute_if.data.tid[0 +: LANE_BITS]; assign tid = execute_if.data.tid[0 +: LANE_BITS];
end else begin end else begin

View File

@@ -54,28 +54,22 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
localparam EXP_BITS = 8; localparam EXP_BITS = 8;
localparam EXP_BIAS = 2**(EXP_BITS-1)-1; localparam EXP_BIAS = 2**(EXP_BITS-1)-1;
localparam logic [EXP_BITS-1:0] QNAN_EXPONENT = 2**EXP_BITS-1;
localparam logic [MAN_BITS-1:0] QNAN_MANTISSA = 2**(MAN_BITS-1);
// Use 32-bit integer // Use 32-bit integer
localparam MAX_INT_WIDTH = 32; localparam INT_WIDTH = 32;
// The internal mantissa includes normal bit or an entire integer // The internal mantissa includes normal bit or an entire integer
localparam INT_MAN_WIDTH = `MAX(MAN_BITS + 1, MAX_INT_WIDTH); localparam INT_MAN_WIDTH = `MAX(MAN_BITS + 1, INT_WIDTH);
// The lower 2p+3 bits of the internal FMA result will be needed for leading-zero detection // The lower 2p+3 bits of the internal FMA result will be needed for leading-zero detection
localparam LZC_RESULT_WIDTH = `CLOG2(INT_MAN_WIDTH); localparam LZC_RESULT_WIDTH = `CLOG2(INT_MAN_WIDTH);
// The internal exponent must be able to represent the smallest denormal input value as signed // The internal exponent must be able to represent the smallest denormal input value as signed
// or the number of bits in an integer // or the number of bits in an integer
localparam INT_EXP_WIDTH = `MAX(`CLOG2(MAX_INT_WIDTH), `MAX(EXP_BITS, `CLOG2(EXP_BIAS + MAN_BITS))) + 1; localparam INT_EXP_WIDTH = `MAX(`CLOG2(INT_WIDTH), `MAX(EXP_BITS, `CLOG2(EXP_BIAS + MAN_BITS))) + 1;
// shift amount for denormalization
localparam SHAMT_BITS = `CLOG2(INT_MAN_WIDTH+1);
localparam FMT_SHIFT_COMPENSATION = INT_MAN_WIDTH - 1 - MAN_BITS; localparam FMT_SHIFT_COMPENSATION = INT_MAN_WIDTH - 1 - MAN_BITS;
localparam NUM_FP_STICKY = 2 * INT_MAN_WIDTH - MAN_BITS - 1; // removed mantissa, 1. and R localparam NUM_FP_STICKY = 2 * INT_MAN_WIDTH - MAN_BITS - 1; // removed mantissa, 1. and R
localparam NUM_INT_STICKY = 2 * INT_MAN_WIDTH - MAX_INT_WIDTH; // removed int and R localparam NUM_INT_STICKY = 2 * INT_MAN_WIDTH - INT_WIDTH; // removed int and R
// Input processing // Input processing
@@ -86,8 +80,8 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
.EXP_BITS (EXP_BITS), .EXP_BITS (EXP_BITS),
.MAN_BITS (MAN_BITS) .MAN_BITS (MAN_BITS)
) fp_class ( ) fp_class (
.exp_i (dataa[i][30:23]), .exp_i (dataa[i][INT_WIDTH-2:MAN_BITS]),
.man_i (dataa[i][22:0]), .man_i (dataa[i][MAN_BITS-1:0]),
.clss_o (fclass[i]) .clss_o (fclass[i])
); );
end end
@@ -97,27 +91,25 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
wire [NUM_LANES-1:0] input_sign; wire [NUM_LANES-1:0] input_sign;
for (genvar i = 0; i < NUM_LANES; ++i) begin for (genvar i = 0; i < NUM_LANES; ++i) begin
wire [INT_MAN_WIDTH-1:0] int_mantissa; wire i2f_sign = dataa[i][INT_WIDTH-1];
wire [INT_MAN_WIDTH-1:0] fmt_mantissa; wire f2i_sign = dataa[i][INT_WIDTH-1] && is_signed;
wire fmt_sign = dataa[i][31]; wire [INT_MAN_WIDTH-1:0] f2i_mantissa = f2i_sign ? (-dataa[i]) : dataa[i];
wire int_sign = dataa[i][31] && is_signed; wire [INT_MAN_WIDTH-1:0] i2f_mantissa = INT_MAN_WIDTH'({fclass[i].is_normal, dataa[i][MAN_BITS-1:0]});
assign int_mantissa = int_sign ? (-dataa[i]) : dataa[i];
assign fmt_mantissa = INT_MAN_WIDTH'({fclass[i].is_normal, dataa[i][MAN_BITS-1:0]});
assign input_exp[i] = {1'b0, dataa[i][MAN_BITS +: EXP_BITS]} + INT_EXP_WIDTH'({1'b0, fclass[i].is_subnormal}); assign input_exp[i] = {1'b0, dataa[i][MAN_BITS +: EXP_BITS]} + INT_EXP_WIDTH'({1'b0, fclass[i].is_subnormal});
assign input_mant[i] = is_itof ? int_mantissa : fmt_mantissa; assign input_mant[i] = is_itof ? f2i_mantissa : i2f_mantissa;
assign input_sign[i] = is_itof ? int_sign : fmt_sign; assign input_sign[i] = is_itof ? f2i_sign : i2f_sign;
end end
// Pipeline stage0 // Pipeline stage0
wire valid_in_s0; wire valid_in_s0;
wire [NUM_LANES-1:0] lane_mask_s0; wire [NUM_LANES-1:0] lane_mask_s0;
wire [TAGW-1:0] tag_in_s0; wire [TAGW-1:0] tag_in_s0;
wire is_itof_s0; wire is_itof_s0;
wire unsigned_s0; wire is_signed_s0;
wire [2:0] rnd_mode_s0; wire [2:0] rnd_mode_s0;
fclass_t [NUM_LANES-1:0] fclass_s0; fclass_t [NUM_LANES-1:0] fclass_s0;
wire [NUM_LANES-1:0] input_sign_s0; wire [NUM_LANES-1:0] input_sign_s0;
wire [NUM_LANES-1:0][INT_EXP_WIDTH-1:0] fmt_exponent_s0; wire [NUM_LANES-1:0][INT_EXP_WIDTH-1:0] fmt_exponent_s0;
wire [NUM_LANES-1:0][INT_MAN_WIDTH-1:0] encoded_mant_s0; wire [NUM_LANES-1:0][INT_MAN_WIDTH-1:0] encoded_mant_s0;
@@ -130,8 +122,8 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
.clk (clk), .clk (clk),
.reset (reset), .reset (reset),
.enable (~stall), .enable (~stall),
.data_in ({valid_in, lane_mask, tag_in, is_itof, !is_signed, frm, fclass, input_sign, input_exp, input_mant}), .data_in ({valid_in, lane_mask, tag_in, is_itof, is_signed, frm, fclass, input_sign, input_exp, input_mant}),
.data_out ({valid_in_s0, lane_mask_s0, tag_in_s0, is_itof_s0, unsigned_s0, rnd_mode_s0, fclass_s0, input_sign_s0, fmt_exponent_s0, encoded_mant_s0}) .data_out ({valid_in_s0, lane_mask_s0, tag_in_s0, is_itof_s0, is_signed_s0, rnd_mode_s0, fclass_s0, input_sign_s0, fmt_exponent_s0, encoded_mant_s0})
); );
// Normalization // Normalization
@@ -159,22 +151,22 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
assign input_mant_n_s0[i] = encoded_mant_s0[i] << renorm_shamt_s0[i]; assign input_mant_n_s0[i] = encoded_mant_s0[i] << renorm_shamt_s0[i];
// Unbias exponent and compensate for shift // Unbias exponent and compensate for shift
wire [INT_EXP_WIDTH-1:0] fp_input_exp_s0 = fmt_exponent_s0[i] + INT_EXP_WIDTH'(FMT_SHIFT_COMPENSATION - EXP_BIAS) - INT_EXP_WIDTH'({1'b0, renorm_shamt_s0[i]}); wire [INT_EXP_WIDTH-1:0] i2f_input_exp_s0 = fmt_exponent_s0[i] + INT_EXP_WIDTH'(FMT_SHIFT_COMPENSATION - EXP_BIAS) - INT_EXP_WIDTH'({1'b0, renorm_shamt_s0[i]});
wire [INT_EXP_WIDTH-1:0] int_input_exp_s0 = INT_EXP_WIDTH'(INT_MAN_WIDTH-1) - INT_EXP_WIDTH'({1'b0, renorm_shamt_s0[i]}); wire [INT_EXP_WIDTH-1:0] f2i_input_exp_s0 = INT_EXP_WIDTH'(INT_MAN_WIDTH-1) - INT_EXP_WIDTH'({1'b0, renorm_shamt_s0[i]});
assign input_exp_n_s0[i] = is_itof_s0 ? int_input_exp_s0 : fp_input_exp_s0; assign input_exp_n_s0[i] = is_itof_s0 ? f2i_input_exp_s0 : i2f_input_exp_s0;
end end
// Pipeline stage1 // Pipeline stage1
wire valid_in_s1; wire valid_in_s1;
wire [NUM_LANES-1:0] lane_mask_s1; wire [NUM_LANES-1:0] lane_mask_s1;
wire [TAGW-1:0] tag_in_s1; wire [TAGW-1:0] tag_in_s1;
wire is_itof_s1; wire is_itof_s1;
wire unsigned_s1; wire is_signed_s1;
wire [2:0] rnd_mode_s1; wire [2:0] rnd_mode_s1;
fclass_t [NUM_LANES-1:0] fclass_s1; fclass_t [NUM_LANES-1:0] fclass_s1;
wire [NUM_LANES-1:0] input_sign_s1; wire [NUM_LANES-1:0] input_sign_s1;
wire [NUM_LANES-1:0] mant_is_zero_s1; wire [NUM_LANES-1:0] mant_is_zero_s1;
wire [NUM_LANES-1:0][INT_MAN_WIDTH-1:0] input_mant_s1; wire [NUM_LANES-1:0][INT_MAN_WIDTH-1:0] input_mant_s1;
wire [NUM_LANES-1:0][INT_EXP_WIDTH-1:0] input_exp_s1; wire [NUM_LANES-1:0][INT_EXP_WIDTH-1:0] input_exp_s1;
@@ -185,76 +177,49 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
.clk (clk), .clk (clk),
.reset (reset), .reset (reset),
.enable (~stall), .enable (~stall),
.data_in ({valid_in_s0, lane_mask_s0, tag_in_s0, is_itof_s0, unsigned_s0, rnd_mode_s0, fclass_s0, input_sign_s0, mant_is_zero_s0, input_mant_n_s0, input_exp_n_s0}), .data_in ({valid_in_s0, lane_mask_s0, tag_in_s0, is_itof_s0, is_signed_s0, rnd_mode_s0, fclass_s0, input_sign_s0, mant_is_zero_s0, input_mant_n_s0, input_exp_n_s0}),
.data_out ({valid_in_s1, lane_mask_s1, tag_in_s1, is_itof_s1, unsigned_s1, rnd_mode_s1, fclass_s1, input_sign_s1, mant_is_zero_s1, input_mant_s1, input_exp_s1}) .data_out ({valid_in_s1, lane_mask_s1, tag_in_s1, is_itof_s1, is_signed_s1, rnd_mode_s1, fclass_s1, input_sign_s1, mant_is_zero_s1, input_mant_s1, input_exp_s1})
); );
// Perform adjustments to mantissa and exponent // Perform adjustments to mantissa and exponent
wire [NUM_LANES-1:0][2*INT_MAN_WIDTH:0] destination_mant_s1; wire [NUM_LANES-1:0][2*INT_MAN_WIDTH:0] destination_mant_s1;
wire [NUM_LANES-1:0][INT_EXP_WIDTH-1:0] final_exp_s1; wire [NUM_LANES-1:0][INT_EXP_WIDTH-1:0] final_exp_s1;
wire [NUM_LANES-1:0] of_before_round_s1; wire [NUM_LANES-1:0] of_before_round_s1;
for (genvar i = 0; i < NUM_LANES; ++i) begin for (genvar i = 0; i < NUM_LANES; ++i) begin
reg [2*INT_MAN_WIDTH:0] preshift_mant_s1; // mantissa before final shift wire [INT_EXP_WIDTH-1:0] denorm_shamt = INT_EXP_WIDTH'(INT_WIDTH-1) - input_exp_s1[i];
reg [SHAMT_BITS-1:0] denorm_shamt_s1; // shift amount for denormalization wire overflow = ($signed(denorm_shamt) <= -$signed(INT_EXP_WIDTH'(!is_signed_s1)));
reg [INT_EXP_WIDTH-1:0] final_exp_tmp_s1; // after eventual adjustments wire underflow = ($signed(input_exp_s1[i]) < INT_EXP_WIDTH'($signed(-1)));
reg of_before_round_tmp_s1; reg [INT_EXP_WIDTH-1:0] denorm_shamt_q;
always @(*) begin always @(*) begin
final_exp_tmp_s1 = input_exp_s1[i] + INT_EXP_WIDTH'(EXP_BIAS); // take exponent as is, only look at lower bits if (overflow) begin
preshift_mant_s1 = {input_mant_s1[i], 33'b0}; denorm_shamt_q = '0;
denorm_shamt_s1 = '0; end else if (underflow) begin
of_before_round_tmp_s1 = 1'b0; denorm_shamt_q = INT_WIDTH+1;
if (is_itof_s1) begin
if ($signed(input_exp_s1[i]) >= INT_EXP_WIDTH'($signed(2**EXP_BITS-1-EXP_BIAS))) begin
// Overflow or infinities (for proper rounding)
final_exp_tmp_s1 = (2**EXP_BITS-2); // largest normal value
preshift_mant_s1 = ~0; // largest normal value and RS bits set
of_before_round_tmp_s1 = 1'b1;
end else if ($signed(input_exp_s1[i]) < INT_EXP_WIDTH'($signed(-MAN_BITS-EXP_BIAS))) begin
// Limit the shift to retain sticky bits
final_exp_tmp_s1 = '0; // denormal result
denorm_shamt_s1 = (2 + MAN_BITS); // to sticky
end else if ($signed(input_exp_s1[i]) < INT_EXP_WIDTH'($signed(1-EXP_BIAS))) begin
// Denormalize underflowing values
final_exp_tmp_s1 = '0; // denormal result
denorm_shamt_s1 = SHAMT_BITS'(1-EXP_BIAS) - SHAMT_BITS'(input_exp_s1[i]); // adjust right shifting
end
end else begin end else begin
if ($signed(input_exp_s1[i]) >= $signed(INT_EXP_WIDTH'(MAX_INT_WIDTH-1) + INT_EXP_WIDTH'(unsigned_s1))) begin denorm_shamt_q = denorm_shamt;
// overflow: when converting to unsigned the range is larger by one
of_before_round_tmp_s1 = 1'b1;
end else if ($signed(input_exp_s1[i]) < INT_EXP_WIDTH'($signed(-1))) begin
// underflow
denorm_shamt_s1 = MAX_INT_WIDTH+1; // all bits go to the sticky
end else begin
// By default right shift mantissa to be an integer
denorm_shamt_s1 = SHAMT_BITS'(MAX_INT_WIDTH-1) - SHAMT_BITS'(input_exp_s1[i]);
end
end end
end end
assign destination_mant_s1[i] = is_itof_s1 ? {input_mant_s1[i], 33'b0} : ({input_mant_s1[i], 33'b0} >> denorm_shamt_q);
assign destination_mant_s1[i] = preshift_mant_s1 >> denorm_shamt_s1; assign final_exp_s1[i] = input_exp_s1[i] + INT_EXP_WIDTH'(EXP_BIAS);
assign final_exp_s1[i] = final_exp_tmp_s1; assign of_before_round_s1[i] = overflow;
assign of_before_round_s1[i] = of_before_round_tmp_s1;
end end
// Pipeline stage2 // Pipeline stage2
wire valid_in_s2; wire valid_in_s2;
wire [NUM_LANES-1:0] lane_mask_s2; wire [NUM_LANES-1:0] lane_mask_s2;
wire [TAGW-1:0] tag_in_s2; wire [TAGW-1:0] tag_in_s2;
wire is_itof_s2; wire is_itof_s2;
wire unsigned_s2; wire is_signed_s2;
wire [2:0] rnd_mode_s2; wire [2:0] rnd_mode_s2;
fclass_t [NUM_LANES-1:0] fclass_s2; fclass_t [NUM_LANES-1:0] fclass_s2;
wire [NUM_LANES-1:0] mant_is_zero_s2; wire [NUM_LANES-1:0] mant_is_zero_s2;
wire [NUM_LANES-1:0] input_sign_s2; wire [NUM_LANES-1:0] input_sign_s2;
wire [NUM_LANES-1:0][2*INT_MAN_WIDTH:0] destination_mant_s2; wire [NUM_LANES-1:0][2*INT_MAN_WIDTH:0] destination_mant_s2;
wire [NUM_LANES-1:0][INT_EXP_WIDTH-1:0] final_exp_s2; wire [NUM_LANES-1:0][INT_EXP_WIDTH-1:0] final_exp_s2;
wire [NUM_LANES-1:0] of_before_round_s2; wire [NUM_LANES-1:0] of_before_round_s2;
VX_pipe_register #( VX_pipe_register #(
.DATAW (1 + NUM_LANES + TAGW + 1 + 1 + `INST_FRM_BITS + NUM_LANES * ($bits(fclass_t) + 1 + 1 + (2*INT_MAN_WIDTH+1) + INT_EXP_WIDTH + 1)), .DATAW (1 + NUM_LANES + TAGW + 1 + 1 + `INST_FRM_BITS + NUM_LANES * ($bits(fclass_t) + 1 + 1 + (2*INT_MAN_WIDTH+1) + INT_EXP_WIDTH + 1)),
@@ -263,37 +228,37 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
.clk (clk), .clk (clk),
.reset (reset), .reset (reset),
.enable (~stall), .enable (~stall),
.data_in ({valid_in_s1, lane_mask_s1, tag_in_s1, is_itof_s1, unsigned_s1, rnd_mode_s1, fclass_s1, mant_is_zero_s1, input_sign_s1, destination_mant_s1, final_exp_s1, of_before_round_s1}), .data_in ({valid_in_s1, lane_mask_s1, tag_in_s1, is_itof_s1, is_signed_s1, rnd_mode_s1, fclass_s1, mant_is_zero_s1, input_sign_s1, destination_mant_s1, final_exp_s1, of_before_round_s1}),
.data_out ({valid_in_s2, lane_mask_s2, tag_in_s2, is_itof_s2, unsigned_s2, rnd_mode_s2, fclass_s2, mant_is_zero_s2, input_sign_s2, destination_mant_s2, final_exp_s2, of_before_round_s2}) .data_out ({valid_in_s2, lane_mask_s2, tag_in_s2, is_itof_s2, is_signed_s2, rnd_mode_s2, fclass_s2, mant_is_zero_s2, input_sign_s2, destination_mant_s2, final_exp_s2, of_before_round_s2})
); );
wire [NUM_LANES-1:0] rounded_sign_s2; wire [NUM_LANES-1:0] rounded_sign_s2;
wire [NUM_LANES-1:0][31:0] rounded_abs_s2; // absolute value of result after rounding wire [NUM_LANES-1:0][INT_WIDTH-1:0] rounded_abs_s2; // absolute value of result after rounding
wire [NUM_LANES-1:0] int_round_has_sticky_s2; wire [NUM_LANES-1:0] f2i_round_has_sticky_s2;
wire [NUM_LANES-1:0] fp_round_has_sticky_s2; wire [NUM_LANES-1:0] i2f_round_has_sticky_s2;
// Rouding and classification // Rouding and classification
for (genvar i = 0; i < NUM_LANES; ++i) begin for (genvar i = 0; i < NUM_LANES; ++i) begin
wire [MAN_BITS-1:0] final_mant_s2; // mantissa after adjustments wire [MAN_BITS-1:0] final_mant_s2; // mantissa after adjustments
wire [MAX_INT_WIDTH-1:0] final_int_s2; // integer shifted in position wire [INT_WIDTH-1:0] final_int_s2; // integer shifted in position
wire [1:0] round_sticky_bits_s2; wire [1:0] round_sticky_bits_s2;
wire [31:0] fmt_pre_round_abs_s2; wire [INT_WIDTH-1:0] fmt_pre_round_abs_s2;
wire [31:0] pre_round_abs_s2; wire [INT_WIDTH-1:0] pre_round_abs_s2;
wire [1:0] int_round_sticky_bits_s2, fp_round_sticky_bits_s2; wire [1:0] f2i_round_sticky_bits_s2, i2f_round_sticky_bits_s2;
// Extract final mantissa and round bit, discard the normal bit (for FP) // Extract final mantissa and round bit, discard the normal bit (for FP)
assign {final_mant_s2, fp_round_sticky_bits_s2[1]} = destination_mant_s2[i][2*INT_MAN_WIDTH-1 : 2*INT_MAN_WIDTH-1 - (MAN_BITS+1) + 1]; assign {final_mant_s2, i2f_round_sticky_bits_s2[1]} = destination_mant_s2[i][2*INT_MAN_WIDTH-1 : 2*INT_MAN_WIDTH-1 - (MAN_BITS+1) + 1];
assign {final_int_s2, int_round_sticky_bits_s2[1]} = destination_mant_s2[i][2*INT_MAN_WIDTH : 2*INT_MAN_WIDTH - (MAX_INT_WIDTH+1) + 1]; assign {final_int_s2, f2i_round_sticky_bits_s2[1]} = destination_mant_s2[i][2*INT_MAN_WIDTH : 2*INT_MAN_WIDTH - (INT_WIDTH+1) + 1];
// Collapse sticky bits // Collapse sticky bits
assign fp_round_sticky_bits_s2[0] = (| destination_mant_s2[i][NUM_FP_STICKY-1:0]); assign i2f_round_sticky_bits_s2[0] = (| destination_mant_s2[i][NUM_FP_STICKY-1:0]);
assign int_round_sticky_bits_s2[0] = (| destination_mant_s2[i][NUM_INT_STICKY-1:0]); assign f2i_round_sticky_bits_s2[0] = (| destination_mant_s2[i][NUM_INT_STICKY-1:0]);
assign fp_round_has_sticky_s2[i] = (| fp_round_sticky_bits_s2); assign i2f_round_has_sticky_s2[i] = (| i2f_round_sticky_bits_s2);
assign int_round_has_sticky_s2[i] = (| int_round_sticky_bits_s2); assign f2i_round_has_sticky_s2[i] = (| f2i_round_sticky_bits_s2);
// select RS bits for destination operation // select RS bits for destination operation
assign round_sticky_bits_s2 = is_itof_s2 ? fp_round_sticky_bits_s2 : int_round_sticky_bits_s2; assign round_sticky_bits_s2 = is_itof_s2 ? i2f_round_sticky_bits_s2 : f2i_round_sticky_bits_s2;
// Pack exponent and mantissa into proper rounding form // Pack exponent and mantissa into proper rounding form
assign fmt_pre_round_abs_s2 = {1'b0, final_exp_s2[i][EXP_BITS-1:0], final_mant_s2[MAN_BITS-1:0]}; assign fmt_pre_round_abs_s2 = {1'b0, final_exp_s2[i][EXP_BITS-1:0], final_mant_s2[MAN_BITS-1:0]};
@@ -322,15 +287,15 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
wire [NUM_LANES-1:0] lane_mask_s3; wire [NUM_LANES-1:0] lane_mask_s3;
wire [TAGW-1:0] tag_in_s3; wire [TAGW-1:0] tag_in_s3;
wire is_itof_s3; wire is_itof_s3;
wire unsigned_s3; wire is_signed_s3;
fclass_t [NUM_LANES-1:0] fclass_s3; fclass_t [NUM_LANES-1:0] fclass_s3;
wire [NUM_LANES-1:0] mant_is_zero_s3; wire [NUM_LANES-1:0] mant_is_zero_s3;
wire [NUM_LANES-1:0] input_sign_s3; wire [NUM_LANES-1:0] input_sign_s3;
wire [NUM_LANES-1:0] rounded_sign_s3; wire [NUM_LANES-1:0] rounded_sign_s3;
wire [NUM_LANES-1:0][31:0] rounded_abs_s3; wire [NUM_LANES-1:0][INT_WIDTH-1:0] rounded_abs_s3;
wire [NUM_LANES-1:0] of_before_round_s3; wire [NUM_LANES-1:0] of_before_round_s3;
wire [NUM_LANES-1:0] int_round_has_sticky_s3; wire [NUM_LANES-1:0] f2i_round_has_sticky_s3;
wire [NUM_LANES-1:0] fp_round_has_sticky_s3; wire [NUM_LANES-1:0] i2f_round_has_sticky_s3;
VX_pipe_register #( VX_pipe_register #(
.DATAW (1 + NUM_LANES + TAGW + 1 + 1 + NUM_LANES * ($bits(fclass_t) + 1 + 1 + 32 + 1 + 1 + 1 + 1)), .DATAW (1 + NUM_LANES + TAGW + 1 + 1 + NUM_LANES * ($bits(fclass_t) + 1 + 1 + 32 + 1 + 1 + 1 + 1)),
@@ -339,105 +304,71 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
.clk (clk), .clk (clk),
.reset (reset), .reset (reset),
.enable (~stall), .enable (~stall),
.data_in ({valid_in_s2, lane_mask_s2, tag_in_s2, is_itof_s2, unsigned_s2, fclass_s2, mant_is_zero_s2, input_sign_s2, rounded_abs_s2, rounded_sign_s2, of_before_round_s2, int_round_has_sticky_s2, fp_round_has_sticky_s2}), .data_in ({valid_in_s2, lane_mask_s2, tag_in_s2, is_itof_s2, is_signed_s2, fclass_s2, mant_is_zero_s2, input_sign_s2, rounded_abs_s2, rounded_sign_s2, of_before_round_s2, f2i_round_has_sticky_s2, i2f_round_has_sticky_s2}),
.data_out ({valid_in_s3, lane_mask_s3, tag_in_s3, is_itof_s3, unsigned_s3, fclass_s3, mant_is_zero_s3, input_sign_s3, rounded_abs_s3, rounded_sign_s3, of_before_round_s3, int_round_has_sticky_s3, fp_round_has_sticky_s3}) .data_out ({valid_in_s3, lane_mask_s3, tag_in_s3, is_itof_s3, is_signed_s3, fclass_s3, mant_is_zero_s3, input_sign_s3, rounded_abs_s3, rounded_sign_s3, of_before_round_s3, f2i_round_has_sticky_s3, i2f_round_has_sticky_s3})
); );
wire [NUM_LANES-1:0] of_after_round_s3; wire [NUM_LANES-1:0][INT_WIDTH-1:0] fmt_result_s3;
wire [NUM_LANES-1:0] uf_after_round_s3; wire [NUM_LANES-1:0][INT_WIDTH-1:0] rounded_int_res_s3; // after possible inversion
wire [NUM_LANES-1:0][31:0] fmt_result_s3;
wire [NUM_LANES-1:0][31:0] rounded_int_res_s3; // after possible inversion
wire [NUM_LANES-1:0] rounded_int_res_zero_s3; // after rounding wire [NUM_LANES-1:0] rounded_int_res_zero_s3; // after rounding
for (genvar i = 0; i < NUM_LANES; ++i) begin for (genvar i = 0; i < NUM_LANES; ++i) begin
// Assemble regular result, nan box short ones. Int zeroes need to be detected // Assemble regular result, nan box short ones. Int zeroes need to be detected
assign fmt_result_s3[i] = (is_itof_s3 & mant_is_zero_s3[i]) ? 0 : {rounded_sign_s3[i], rounded_abs_s3[i][EXP_BITS+MAN_BITS-1:0]}; assign fmt_result_s3[i] = mant_is_zero_s3[i] ? 0 : {rounded_sign_s3[i], rounded_abs_s3[i][EXP_BITS+MAN_BITS-1:0]};
// Classification after rounding select by destination format
assign uf_after_round_s3[i] = (rounded_abs_s3[i][EXP_BITS+MAN_BITS-1:MAN_BITS] == 0); // denormal
assign of_after_round_s3[i] = (rounded_abs_s3[i][EXP_BITS+MAN_BITS-1:MAN_BITS] == ~0); // inf exp.
// Negative integer result needs to be brought into two's complement // Negative integer result needs to be brought into two's complement
assign rounded_int_res_s3[i] = rounded_sign_s3[i] ? (-rounded_abs_s3[i]) : rounded_abs_s3[i]; assign rounded_int_res_s3[i] = rounded_sign_s3[i] ? (-rounded_abs_s3[i]) : rounded_abs_s3[i];
assign rounded_int_res_zero_s3[i] = (rounded_int_res_s3[i] == 0); assign rounded_int_res_zero_s3[i] = (rounded_int_res_s3[i] == 0);
end end
// FP Special case handling // F2I Special case handling
wire [NUM_LANES-1:0][31:0] fp_special_result_s3; reg [NUM_LANES-1:0][INT_WIDTH-1:0] f2i_special_result_s3;
fflags_t [NUM_LANES-1:0] fp_special_status_s3; fflags_t [NUM_LANES-1:0] f2i_special_status_s3;
wire [NUM_LANES-1:0] fp_result_is_special_s3; wire [NUM_LANES-1:0] f2i_result_is_special_s3;
for (genvar i = 0; i < NUM_LANES; ++i) begin
// Detect special case from source format, I2F casts don't produce a special result
assign fp_result_is_special_s3[i] = ~is_itof_s3 & (fclass_s3[i].is_zero | fclass_s3[i].is_nan);
// Signalling input NaNs raise invalid flag, otherwise no flags set
assign fp_special_status_s3[i] = fclass_s3[i].is_signaling ? {1'b1, 4'h0} : 5'h0; // invalid operation
// Assemble result according to destination format
assign fp_special_result_s3[i] = fclass_s3[i].is_zero ? (32'(input_sign_s3) << 31) // signed zero
: {1'b0, QNAN_EXPONENT, QNAN_MANTISSA}; // qNaN
end
// INT Special case handling
reg [NUM_LANES-1:0][31:0] int_special_result_s3;
fflags_t [NUM_LANES-1:0] int_special_status_s3;
wire [NUM_LANES-1:0] int_result_is_special_s3;
for (genvar i = 0; i < NUM_LANES; ++i) begin for (genvar i = 0; i < NUM_LANES; ++i) begin
// Assemble result according to destination format // Assemble result according to destination format
always @(*) begin always @(*) begin
if (input_sign_s3[i] && !fclass_s3[i].is_nan) begin if (input_sign_s3[i] && !fclass_s3[i].is_nan) begin
int_special_result_s3[i][30:0] = '0; // alone yields 2**(31)-1 f2i_special_result_s3[i][INT_WIDTH-2:0] = '0; // alone yields 2**(31)-1
int_special_result_s3[i][31] = ~unsigned_s3; // for unsigned casts yields 2**31 f2i_special_result_s3[i][INT_WIDTH-1] = is_signed_s3; // for unsigned casts yields 2**31
end else begin end else begin
int_special_result_s3[i][30:0] = 2**(31) - 1; // alone yields 2**(31)-1 f2i_special_result_s3[i][INT_WIDTH-2:0] = 2**(INT_WIDTH-1) - 1; // alone yields 2**(31)-1
int_special_result_s3[i][31] = unsigned_s3; // for unsigned casts yields 2**31 f2i_special_result_s3[i][INT_WIDTH-1] = ~is_signed_s3; // for unsigned casts yields 2**31
end end
end end
// Detect special case from source format (inf, nan, overflow, nan-boxing or negative unsigned) // Detect special case from source format (inf, nan, overflow, nan-boxing or negative unsigned)
assign int_result_is_special_s3[i] = fclass_s3[i].is_nan assign f2i_result_is_special_s3[i] = fclass_s3[i].is_nan
| fclass_s3[i].is_inf | fclass_s3[i].is_inf
| of_before_round_s3[i] | of_before_round_s3[i]
| (input_sign_s3[i] & unsigned_s3 & ~rounded_int_res_zero_s3[i]); | (input_sign_s3[i] & ~is_signed_s3 & ~rounded_int_res_zero_s3[i]);
// All integer special cases are invalid // All integer special cases are invalid
assign int_special_status_s3[i] = {1'b1, 4'h0}; assign f2i_special_status_s3[i] = {1'b1, 4'h0};
end end
// Result selection and Output handshake // Result selection and Output handshake
fflags_t [NUM_LANES-1:0] tmp_fflags_s3; fflags_t [NUM_LANES-1:0] tmp_fflags_s3;
wire [NUM_LANES-1:0][31:0] tmp_result_s3; wire [NUM_LANES-1:0][INT_WIDTH-1:0] tmp_result_s3;
for (genvar i = 0; i < NUM_LANES; ++i) begin for (genvar i = 0; i < NUM_LANES; ++i) begin
fflags_t fp_regular_status_s3, int_regular_status_s3; fflags_t i2f_regular_status_s3, f2i_regular_status_s3;
fflags_t fp_status_s3, int_status_s3; fflags_t i2f_status_s3, f2i_status_s3;
wire [31:0] fp_result_s3, int_result_s3;
wire inexact_s3 = is_itof_s3 ? fp_round_has_sticky_s3[i] // overflow is invalid in i2f; assign i2f_regular_status_s3 = {4'h0, i2f_round_has_sticky_s3[i]};
: (fp_round_has_sticky_s3[i] || (~fclass_s3[i].is_inf && (of_before_round_s3[i] || of_after_round_s3[i]))); assign f2i_regular_status_s3 = {4'h0, f2i_round_has_sticky_s3[i]};
assign fp_regular_status_s3.NV = is_itof_s3 & (of_before_round_s3[i] | of_after_round_s3[i]); // overflow is invalid for I2F casts assign i2f_status_s3 = i2f_regular_status_s3;
assign fp_regular_status_s3.DZ = 1'b0; // no divisions assign f2i_status_s3 = f2i_result_is_special_s3[i] ? f2i_special_status_s3[i] : f2i_regular_status_s3;
assign fp_regular_status_s3.OF = ~is_itof_s3 & (~fclass_s3[i].is_inf & (of_before_round_s3[i] | of_after_round_s3[i])); // inf casts no OF
assign fp_regular_status_s3.UF = uf_after_round_s3[i] & inexact_s3;
assign fp_regular_status_s3.NX = inexact_s3;
assign int_regular_status_s3 = int_round_has_sticky_s3[i] ? {4'h0, 1'b1} : 5'h0; wire [INT_WIDTH-1:0] i2f_result_s3 = fmt_result_s3[i];
wire [INT_WIDTH-1:0] f2i_result_s3 = f2i_result_is_special_s3[i] ? f2i_special_result_s3[i] : rounded_int_res_s3[i];
assign fp_result_s3 = fp_result_is_special_s3[i] ? fp_special_result_s3[i] : fmt_result_s3[i]; assign tmp_result_s3[i] = is_itof_s3 ? i2f_result_s3 : f2i_result_s3;
assign int_result_s3 = int_result_is_special_s3[i] ? int_special_result_s3[i] : rounded_int_res_s3[i]; assign tmp_fflags_s3[i] = is_itof_s3 ? i2f_status_s3 : f2i_status_s3;
assign fp_status_s3 = fp_result_is_special_s3[i] ? fp_special_status_s3[i] : fp_regular_status_s3;
assign int_status_s3 = int_result_is_special_s3[i] ? int_special_status_s3[i] : int_regular_status_s3;
// Select output depending on special case detection
assign tmp_result_s3[i] = is_itof_s3 ? fp_result_s3 : int_result_s3;
assign tmp_fflags_s3[i] = is_itof_s3 ? fp_status_s3 : int_status_s3;
end end
assign stall = ~ready_out && valid_out; assign stall = ~ready_out && valid_out;
@@ -457,7 +388,6 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
); );
assign ready_in = ~stall; assign ready_in = ~stall;
assign has_fflags = 1'b1; assign has_fflags = 1'b1;
endmodule endmodule

View File

@@ -16,7 +16,7 @@
`include "VX_define.vh" `include "VX_define.vh"
`ifndef SYNTHESIS `ifdef SV_DPI
`include "float_dpi.vh" `include "float_dpi.vh"
`endif `endif

View File

@@ -54,7 +54,6 @@ module VX_fpu_rounding #(
2'b01: round_up = 1'b0; // < ulp/2 away, round down 2'b01: round_up = 1'b0; // < ulp/2 away, round down
2'b10: round_up = abs_value_i[0]; // = ulp/2 away, round towards even result 2'b10: round_up = abs_value_i[0]; // = ulp/2 away, round towards even result
2'b11: round_up = 1'b1; // > ulp/2 away, round up 2'b11: round_up = 1'b1; // > ulp/2 away, round up
default: round_up = 1'bx;
endcase endcase
`INST_FRM_RTZ: round_up = 1'b0; // always round down `INST_FRM_RTZ: round_up = 1'b0; // always round down
`INST_FRM_RDN: round_up = (| round_sticky_bits_i) & sign_i; // to 0 if +, away if - `INST_FRM_RDN: round_up = (| round_sticky_bits_i) & sign_i; // to 0 if +, away if -

View File

@@ -14,26 +14,38 @@
`include "VX_define.vh" `include "VX_define.vh"
interface VX_pipeline_perf_if (); interface VX_pipeline_perf_if ();
wire [`PERF_CTR_BITS-1:0] ibf_stalls; wire [`PERF_CTR_BITS-1:0] sched_idles;
wire [`PERF_CTR_BITS-1:0] scb_stalls; wire [`PERF_CTR_BITS-1:0] sched_stalls;
wire [`PERF_CTR_BITS-1:0] dsp_stalls [`NUM_EX_UNITS]; wire [`PERF_CTR_BITS-1:0] ibf_stalls;
wire [`PERF_CTR_BITS-1:0] scb_stalls;
wire [`PERF_CTR_BITS-1:0] units_uses [`NUM_EX_UNITS];
wire [`PERF_CTR_BITS-1:0] sfu_uses [`NUM_SFU_UNITS];
wire [`PERF_CTR_BITS-1:0] ifetches; wire [`PERF_CTR_BITS-1:0] ifetches;
wire [`PERF_CTR_BITS-1:0] loads; wire [`PERF_CTR_BITS-1:0] loads;
wire [`PERF_CTR_BITS-1:0] stores; wire [`PERF_CTR_BITS-1:0] stores;
wire [`PERF_CTR_BITS-1:0] ifetch_latency; wire [`PERF_CTR_BITS-1:0] ifetch_latency;
wire [`PERF_CTR_BITS-1:0] load_latency; wire [`PERF_CTR_BITS-1:0] load_latency;
modport schedule (
output sched_idles,
output sched_stalls
);
modport issue ( modport issue (
output ibf_stalls, output ibf_stalls,
output scb_stalls, output scb_stalls,
output dsp_stalls output units_uses,
output sfu_uses
); );
modport slave ( modport slave (
input sched_idles,
input sched_stalls,
input ibf_stalls, input ibf_stalls,
input scb_stalls, input scb_stalls,
input dsp_stalls, input units_uses,
input sfu_uses,
input ifetches, input ifetches,
input loads, input loads,
input stores, input stores,

View File

@@ -21,8 +21,8 @@ module VX_avs_adapter #(
parameter NUM_BANKS = 1, parameter NUM_BANKS = 1,
parameter TAG_WIDTH = 1, parameter TAG_WIDTH = 1,
parameter RD_QUEUE_SIZE = 1, parameter RD_QUEUE_SIZE = 1,
parameter OUT_REG_REQ = 0, parameter OUT_REG_REQ = 0,
parameter OUT_REG_RSP = 0 parameter OUT_REG_RSP = 0
) ( ) (
input wire clk, input wire clk,
input wire reset, input wire reset,

View File

@@ -20,7 +20,7 @@ module VX_axi_adapter #(
parameter TAG_WIDTH = 8, parameter TAG_WIDTH = 8,
parameter NUM_BANKS = 1, parameter NUM_BANKS = 1,
parameter AVS_ADDR_WIDTH = (ADDR_WIDTH - `CLOG2(DATA_WIDTH/8)), parameter AVS_ADDR_WIDTH = (ADDR_WIDTH - `CLOG2(DATA_WIDTH/8)),
parameter OUT_REG_RSP = 0 parameter OUT_REG_RSP = 0
) ( ) (
input wire clk, input wire clk,
input wire reset, input wire reset,

View File

@@ -201,9 +201,7 @@ module VX_fifo_queue #(
rd_ptr_r <= '0; rd_ptr_r <= '0;
rd_ptr_n_r <= 1; rd_ptr_n_r <= 1;
end else begin end else begin
if (push) begin wr_ptr_r <= wr_ptr_r + ADDRW'(push);
wr_ptr_r <= wr_ptr_r + ADDRW'(1);
end
if (pop) begin if (pop) begin
rd_ptr_r <= rd_ptr_n_r; rd_ptr_r <= rd_ptr_n_r;
if (DEPTH > 2) begin if (DEPTH > 2) begin

View File

@@ -21,8 +21,8 @@ module VX_mem_adapter #(
parameter DST_ADDR_WIDTH = 1, parameter DST_ADDR_WIDTH = 1,
parameter SRC_TAG_WIDTH = 1, parameter SRC_TAG_WIDTH = 1,
parameter DST_TAG_WIDTH = 1, parameter DST_TAG_WIDTH = 1,
parameter OUT_REG_REQ = 0, parameter OUT_REG_REQ = 0,
parameter OUT_REG_RSP = 0 parameter OUT_REG_RSP = 0
) ( ) (
input wire clk, input wire clk,
input wire reset, input wire reset,

View File

@@ -21,7 +21,7 @@ module VX_stream_arb #(
parameter `STRING ARBITER = "P", parameter `STRING ARBITER = "P",
parameter LOCK_ENABLE = 1, parameter LOCK_ENABLE = 1,
parameter MAX_FANOUT = `MAX_FANOUT, parameter MAX_FANOUT = `MAX_FANOUT,
parameter OUT_REG = 0 , parameter OUT_REG = 0 ,
parameter NUM_REQS = (NUM_INPUTS + NUM_OUTPUTS - 1) / NUM_OUTPUTS, parameter NUM_REQS = (NUM_INPUTS + NUM_OUTPUTS - 1) / NUM_OUTPUTS,
parameter LOG_NUM_REQS = `CLOG2(NUM_REQS), parameter LOG_NUM_REQS = `CLOG2(NUM_REQS),
parameter NUM_REQS_W = `UP(LOG_NUM_REQS) parameter NUM_REQS_W = `UP(LOG_NUM_REQS)

View File

@@ -173,25 +173,27 @@ module VX_stream_xbar #(
end end
// compute inputs collision // compute inputs collision
// we have a collision when there exists a valid transfer with mutiple input candicates // we have a collision when there exists a valid transfer with multiple input candicates
// we caount the unique duplicates each cycle. // we count the unique duplicates each cycle.
reg [NUM_INPUTS-1:0] per_cycle_collision, per_cycle_collision_r;
wire [`CLOG2(NUM_INPUTS+1)-1:0] collision_count;
reg [PERF_CTR_BITS-1:0] collisions_r; reg [PERF_CTR_BITS-1:0] collisions_r;
reg [NUM_INPUTS-1:0] per_cycle_collision;
always @(*) begin always @(*) begin
per_cycle_collision = 0; per_cycle_collision = 0;
for (integer i = 0; i < NUM_INPUTS; ++i) begin for (integer i = 0; i < NUM_INPUTS; ++i) begin
for (integer j = 1; j < (NUM_INPUTS-i); ++j) begin for (integer j = 1; j < (NUM_INPUTS-i); ++j) begin
if (valid_in[i] && valid_in[j+i] && sel_in[i] == sel_in[j+i]) begin per_cycle_collision[i] |= valid_in[i]
per_cycle_collision[i] |= ready_in[i] | ready_in[j+i]; && valid_in[j+i]
end && (sel_in[i] == sel_in[j+i])
&& (ready_in[i] | ready_in[j+i]);
end end
end end
end end
wire [`CLOG2(NUM_INPUTS+1)-1:0] collision_count; `BUFFER(per_cycle_collision_r, per_cycle_collision);
`POP_COUNT(collision_count, per_cycle_collision); `POP_COUNT(collision_count, per_cycle_collision_r);
always @(posedge clk) begin always @(posedge clk) begin
if (reset) begin if (reset) begin

View File

@@ -15,7 +15,7 @@
module VX_gbar_arb #( module VX_gbar_arb #(
parameter NUM_REQS = 1, parameter NUM_REQS = 1,
parameter OUT_REG = 0, parameter OUT_REG = 0,
parameter `STRING ARBITER = "R" parameter `STRING ARBITER = "R"
) ( ) (
input wire clk, input wire clk,

View File

@@ -21,8 +21,8 @@ module VX_mem_arb #(
parameter ADDR_WIDTH = (MEM_ADDR_WIDTH-`CLOG2(DATA_SIZE)), parameter ADDR_WIDTH = (MEM_ADDR_WIDTH-`CLOG2(DATA_SIZE)),
parameter TAG_WIDTH = 1, parameter TAG_WIDTH = 1,
parameter TAG_SEL_IDX = 0, parameter TAG_SEL_IDX = 0,
parameter OUT_REG_REQ = 0, parameter OUT_REG_REQ = 0,
parameter OUT_REG_RSP = 0, parameter OUT_REG_RSP = 0,
parameter `STRING ARBITER = "R" parameter `STRING ARBITER = "R"
) ( ) (
input wire clk, input wire clk,

View File

@@ -233,10 +233,12 @@ module VX_shared_mem import VX_gpu_pkg::*; #(
wire [`CLOG2(NUM_REQS+1)-1:0] perf_writes_per_cycle; wire [`CLOG2(NUM_REQS+1)-1:0] perf_writes_per_cycle;
wire [`CLOG2(NUM_REQS+1)-1:0] perf_crsp_stall_per_cycle; wire [`CLOG2(NUM_REQS+1)-1:0] perf_crsp_stall_per_cycle;
wire [NUM_REQS-1:0] perf_reads_per_req = req_valid & req_ready & ~req_rw; wire [NUM_REQS-1:0] perf_reads_per_req, perf_writes_per_req;
wire [NUM_REQS-1:0] perf_writes_per_req = req_valid & req_ready & req_rw;
wire [NUM_REQS-1:0] perf_crsp_stall_per_req = rsp_valid & ~rsp_ready; wire [NUM_REQS-1:0] perf_crsp_stall_per_req = rsp_valid & ~rsp_ready;
`BUFFER(perf_reads_per_req, req_valid & req_ready & ~req_rw);
`BUFFER(perf_writes_per_req, req_valid & req_ready & req_rw);
`POP_COUNT(perf_reads_per_cycle, perf_reads_per_req); `POP_COUNT(perf_reads_per_cycle, perf_reads_per_req);
`POP_COUNT(perf_writes_per_cycle, perf_writes_per_req); `POP_COUNT(perf_writes_per_cycle, perf_writes_per_req);
`POP_COUNT(perf_crsp_stall_per_cycle, perf_crsp_stall_per_req); `POP_COUNT(perf_crsp_stall_per_cycle, perf_crsp_stall_per_req);

View File

@@ -19,8 +19,8 @@ module VX_smem_switch #(
parameter TAG_WIDTH = 1, parameter TAG_WIDTH = 1,
parameter MEM_ADDR_WIDTH = `MEM_ADDR_WIDTH, parameter MEM_ADDR_WIDTH = `MEM_ADDR_WIDTH,
parameter TAG_SEL_IDX = 0, parameter TAG_SEL_IDX = 0,
parameter OUT_REG_REQ = 0, parameter OUT_REG_REQ = 0,
parameter OUT_REG_RSP = 0, parameter OUT_REG_RSP = 0,
parameter `STRING ARBITER = "R" parameter `STRING ARBITER = "R"
) ( ) (
input wire clk, input wire clk,

View File

@@ -56,17 +56,17 @@ TARGET=asesim make -C runtime/opae
PREFIX=build_base CONFIGS="-DEXT_F_DISABLE -DL1_DISABLE -DSM_DISABLE -DNUM_WARPS=2 -DNUM_THREADS=2" TARGET=asesim make PREFIX=build_base CONFIGS="-DEXT_F_DISABLE -DL1_DISABLE -DSM_DISABLE -DNUM_WARPS=2 -DNUM_THREADS=2" TARGET=asesim make
# ASE test runs # ASE test runs
./run_ase.sh build_base_arria10_asesim_1c/synth ../../../../tests/regression/basic/basic -n1 -t0 ./run_ase.sh build_base_arria10_asesim_1c ../../../../tests/regression/basic/basic -n1 -t0
./run_ase.sh build_base_arria10_asesim_1c/synth ../../../../tests/regression/basic/basic -n1 -t1 ./run_ase.sh build_base_arria10_asesim_1c ../../../../tests/regression/basic/basic -n1 -t1
./run_ase.sh build_base_arria10_asesim_1c/synth ../../../../tests/regression/basic/basic -n16 ./run_ase.sh build_base_arria10_asesim_1c ../../../../tests/regression/basic/basic -n16
./run_ase.sh build_base_arria10_asesim_1c/synth ../../../../tests/regression/demo/demo -n16 ./run_ase.sh build_base_arria10_asesim_1c ../../../../tests/regression/demo/demo -n16
./run_ase.sh build_base_arria10_asesim_1c/synth ../../../../tests/regression/dogfood/dogfood -n16 ./run_ase.sh build_base_arria10_asesim_1c ../../../../tests/regression/dogfood/dogfood -n16
./run_ase.sh build_base_arria10_asesim_1c/synth ../../../../tests/opencl/vecadd/vecadd ./run_ase.sh build_base_arria10_asesim_1c ../../../../tests/opencl/vecadd/vecadd
./run_ase.sh build_base_arria10_asesim_1c/synth ../../../../tests/opencl/sgemm/sgemm -n4 ./run_ase.sh build_base_arria10_asesim_1c ../../../../tests/opencl/sgemm/sgemm -n4
# modify "vsim_run.tcl" to dump VCD trace # modify "vsim_run.tcl" to dump VCD trace
vcd file trace.vcd vcd file trace.vcd
vcd add -r /*/Vortex/hw/rtl/* vcd add -r /*/afu/*
run -all run -all
# compress FPGA output files # compress FPGA output files

View File

@@ -15,27 +15,27 @@
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
BUILD_DIR=$1 BUILD_DIR=$(realpath $1)
PROGRAM=$(basename "$2") PROGRAM=$(basename "$2")
PROGRAM_DIR=`dirname $2` PROGRAM_DIR=`dirname $2`
POCL_RT_PATH=$TOOLDIR/pocl/runtime
VORTEX_RT_PATH=$SCRIPT_DIR/../../../../runtime VORTEX_RT_PATH=$SCRIPT_DIR/../../../../runtime
# Export ASE_WORKDIR variable # Export ASE_WORKDIR variable
export ASE_WORKDIR=$SCRIPT_DIR/$BUILD_DIR/work export ASE_WORKDIR=$BUILD_DIR/synth/work
shift 2
# cleanup incomplete runs # cleanup incomplete runs
rm -f $ASE_WORKDIR/.app_lock.pid rm -f $ASE_WORKDIR/.app_lock.pid
rm -f $ASE_WORKDIR/.ase_ready.pid rm -f $ASE_WORKDIR/.ase_ready.pid
rm -f $SCRIPT_DIR/$BUILD_DIR/nohup.out rm -f $BUILD_DIR/synth/nohup.out
# Start Simulator in background # Start Simulator in background (capture processs group pid)
pushd $SCRIPT_DIR/$BUILD_DIR pushd $BUILD_DIR/synth
echo " [DBG] starting ASE simnulator (stdout saved to '$SCRIPT_DIR/$BUILD_DIR/nohup.out')" echo " [DBG] starting ASE simnulator (stdout saved to '$BUILD_DIR/synth/nohup.out')"
nohup make sim & setsid make sim &> /dev/null &
SIM_PID=$!
popd popd
# Wait for simulator readiness # Wait for simulator readiness
@@ -47,6 +47,11 @@ done
# run application # run application
pushd $PROGRAM_DIR pushd $PROGRAM_DIR
shift 2
echo " [DBG] running ./$PROGRAM $*" echo " [DBG] running ./$PROGRAM $*"
ASE_LOG=0 LD_LIBRARY_PATH=$POCL_RT_PATH/lib:$VORTEX_RT_PATH/opae:$LD_LIBRARY_PATH ./$PROGRAM $* ASE_LOG=0 LD_LIBRARY_PATH=$POCL_RT_PATH/lib:$VORTEX_RT_PATH/opae:$LD_LIBRARY_PATH ./$PROGRAM $*
popd popd
# stop the simulator (kill process group)
kill -- -$(ps -o pgid= $SIM_PID | grep -o '[0-9]*')
wait $SIM_PID 2> /dev/null

View File

@@ -1,6 +1,6 @@
PROJECT = VX_cache_cluster_top PROJECT = VX_cache_top
TOP_LEVEL_ENTITY = $(PROJECT) TOP_LEVEL_ENTITY = $(PROJECT)
SRC_FILE = VX_cache_cluster.sv SRC_FILE = $(PROJECT).sv
include ../../common.mk include ../../common.mk

View File

@@ -1,6 +1,6 @@
PROJECT = VX_core_top PROJECT = VX_core_top
TOP_LEVEL_ENTITY = $(PROJECT) TOP_LEVEL_ENTITY = $(PROJECT)
SRC_FILE = VX_core.sv SRC_FILE = $(PROJECT).sv
include ../../common.mk include ../../common.mk

View File

@@ -1,10 +1,11 @@
XLEN ?= 32 XLEN ?= 32
TOOLDIR ?= /opt
ifeq ($(XLEN),64) ifeq ($(XLEN),64)
RISCV_TOOLCHAIN_PATH ?= /opt/riscv64-gnu-toolchain RISCV_TOOLCHAIN_PATH ?= $(TOOLDIR)/riscv64-gnu-toolchain
CFLAGS += -march=rv64imafd -mabi=lp64d CFLAGS += -march=rv64imafd -mabi=lp64d
else else
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain RISCV_TOOLCHAIN_PATH ?= $(TOOLDIR)/riscv-gnu-toolchain
CFLAGS += -march=rv32imaf -mabi=ilp32f CFLAGS += -march=rv32imaf -mabi=ilp32f
endif endif

View File

@@ -33,7 +33,7 @@ VL_FLAGS = --exe
VL_FLAGS += --language 1800-2009 --assert -Wall -Wpedantic VL_FLAGS += --language 1800-2009 --assert -Wall -Wpedantic
VL_FLAGS += -Wno-DECLFILENAME -Wno-REDEFMACRO VL_FLAGS += -Wno-DECLFILENAME -Wno-REDEFMACRO
VL_FLAGS += --x-initial unique --x-assign unique VL_FLAGS += --x-initial unique --x-assign unique
VL_FLAGS += -DSIMULATION VL_FLAGS += -DSIMULATION -DSV_DPI
VL_FLAGS += $(CONFIGS) VL_FLAGS += $(CONFIGS)
VL_FLAGS += $(PARAMS) VL_FLAGS += $(PARAMS)
VL_FLAGS += $(RTL_INCLUDE) VL_FLAGS += $(RTL_INCLUDE)

View File

@@ -27,7 +27,7 @@ VL_FLAGS = --exe
VL_FLAGS += --language 1800-2009 --assert -Wall -Wpedantic VL_FLAGS += --language 1800-2009 --assert -Wall -Wpedantic
VL_FLAGS += -Wno-DECLFILENAME -Wno-REDEFMACRO VL_FLAGS += -Wno-DECLFILENAME -Wno-REDEFMACRO
VL_FLAGS += --x-initial unique --x-assign unique VL_FLAGS += --x-initial unique --x-assign unique
VL_FLAGS += -DSIMULATION VL_FLAGS += -DSIMULATION -DSV_DPI
VL_FLAGS += $(CONFIGS) VL_FLAGS += $(CONFIGS)
VL_FLAGS += $(PARAMS) VL_FLAGS += $(PARAMS)
VL_FLAGS += $(RTL_INCLUDE) VL_FLAGS += $(RTL_INCLUDE)

View File

@@ -27,7 +27,7 @@ VL_FLAGS = --exe
VL_FLAGS += --language 1800-2009 --assert -Wall -Wpedantic VL_FLAGS += --language 1800-2009 --assert -Wall -Wpedantic
VL_FLAGS += -Wno-DECLFILENAME -Wno-REDEFMACRO VL_FLAGS += -Wno-DECLFILENAME -Wno-REDEFMACRO
VL_FLAGS += --x-initial unique --x-assign unique VL_FLAGS += --x-initial unique --x-assign unique
VL_FLAGS += -DSIMULATION VL_FLAGS += -DSIMULATION -DSV_DPI
VL_FLAGS += $(CONFIGS) VL_FLAGS += $(CONFIGS)
VL_FLAGS += $(PARAMS) VL_FLAGS += $(PARAMS)
VL_FLAGS += $(RTL_INCLUDE) VL_FLAGS += $(RTL_INCLUDE)

View File

@@ -25,7 +25,7 @@ VL_FLAGS = --exe
VL_FLAGS += --language 1800-2009 --assert -Wall -Wpedantic VL_FLAGS += --language 1800-2009 --assert -Wall -Wpedantic
VL_FLAGS += -Wno-DECLFILENAME -Wno-REDEFMACRO VL_FLAGS += -Wno-DECLFILENAME -Wno-REDEFMACRO
VL_FLAGS += --x-initial unique --x-assign unique VL_FLAGS += --x-initial unique --x-assign unique
VL_FLAGS += -DSIMULATION VL_FLAGS += -DSIMULATION -DSV_DPI
VL_FLAGS += $(CONFIGS) VL_FLAGS += $(CONFIGS)
VL_FLAGS += $(PARAMS) VL_FLAGS += $(PARAMS)
VL_FLAGS += $(RTL_INCLUDE) VL_FLAGS += $(RTL_INCLUDE)
@@ -56,7 +56,6 @@ PROJECT = top_modules
all: build all: build
build: $(SRCS) build: $(SRCS)
verilator --build $(VL_FLAGS) --cc VX_cache_cluster_top --top-module VX_cache_cluster_top $^ -CFLAGS '$(CXXFLAGS)'
verilator --build $(VL_FLAGS) --cc VX_cache_top --top-module VX_cache_top $^ -CFLAGS '$(CXXFLAGS)' verilator --build $(VL_FLAGS) --cc VX_cache_top --top-module VX_cache_top $^ -CFLAGS '$(CXXFLAGS)'
verilator --build $(VL_FLAGS) --cc VX_core_top --top-module VX_core_top $^ -CFLAGS '$(CXXFLAGS)' verilator --build $(VL_FLAGS) --cc VX_core_top --top-module VX_core_top $^ -CFLAGS '$(CXXFLAGS)'

View File

@@ -1,17 +1,18 @@
XLEN ?= 32 XLEN ?= 32
TOOLDIR ?= /opt
ifeq ($(XLEN),64) ifeq ($(XLEN),64)
RISCV_TOOLCHAIN_PATH ?= /opt/riscv64-gnu-toolchain RISCV_TOOLCHAIN_PATH ?= $(TOOLDIR)/riscv64-gnu-toolchain
CFLAGS += -march=rv64imafd -mabi=lp64d CFLAGS += -march=rv64imafd -mabi=lp64d
else else
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain RISCV_TOOLCHAIN_PATH ?= $(TOOLDIR)/riscv-gnu-toolchain
CFLAGS += -march=rv32imaf -mabi=ilp32f CFLAGS += -march=rv32imaf -mabi=ilp32f
endif endif
RISCV_PREFIX ?= riscv$(XLEN)-unknown-elf RISCV_PREFIX ?= riscv$(XLEN)-unknown-elf
RISCV_SYSROOT ?= $(RISCV_TOOLCHAIN_PATH)/$(RISCV_PREFIX) RISCV_SYSROOT ?= $(RISCV_TOOLCHAIN_PATH)/$(RISCV_PREFIX)
LLVM_VORTEX ?= /opt/llvm-vortex LLVM_VORTEX ?= $(TOOLDIR)/llvm-vortex
LLVM_CFLAGS += --sysroot=$(RISCV_SYSROOT) LLVM_CFLAGS += --sysroot=$(RISCV_SYSROOT)
LLVM_CFLAGS += --gcc-toolchain=$(RISCV_TOOLCHAIN_PATH) LLVM_CFLAGS += --gcc-toolchain=$(RISCV_TOOLCHAIN_PATH)

View File

@@ -51,9 +51,8 @@ inline char is_log2(int x) {
return ((x & (x-1)) == 0); return ((x & (x-1)) == 0);
} }
inline int fast_log2(int x) { inline int log2_fast(int x) {
float f = x; return 31 - __builtin_clz (x);
return (*(int*)(&f)>>23) - 127;
} }
static void __attribute__ ((noinline)) spawn_tasks_all_stub() { static void __attribute__ ((noinline)) spawn_tasks_all_stub() {
@@ -286,8 +285,8 @@ void vx_spawn_kernel(context_t * ctx, vx_spawn_kernel_cb callback, void * arg) {
// fast path handling // fast path handling
char isXYpow2 = is_log2(XY); char isXYpow2 = is_log2(XY);
char log2XY = fast_log2(XY); char log2XY = log2_fast(XY);
char log2X = fast_log2(X); char log2X = log2_fast(X);
wspawn_kernel_args_t wspawn_args = { wspawn_kernel_args_t wspawn_args = {
ctx, callback, arg, core_id * tasks_per_core, fW, rW, isXYpow2, log2XY, log2X ctx, callback, arg, core_id * tasks_per_core, fW, rW, isXYpow2, log2XY, log2X

View File

@@ -175,8 +175,9 @@ static uint64_t get_csr_64(const void* ptr, int addr) {
extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) { extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
int ret = 0; int ret = 0;
uint64_t instrs = 0; uint64_t total_instrs = 0;
uint64_t cycles = 0; uint64_t total_cycles = 0;
uint64_t max_cycles = 0;
#ifdef PERF_ENABLE #ifdef PERF_ENABLE
@@ -186,27 +187,29 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
return int((1.0 - (double(part) / double(total))) * 100); return int((1.0 - (double(part) / double(total))) * 100);
}; };
auto caclAvgLatency = [&](uint64_t sum, uint64_t requests)->int { auto caclAverage = [&](uint64_t part, uint64_t total)->double {
if (requests == 0) if (total == 0)
return 0; return 0;
return int(double(sum) / double(requests)); return double(part) / double(total);
}; };
auto calcUtilization = [&](uint64_t count, uint64_t stalls)->int { auto calcAvgPercent = [&](uint64_t part, uint64_t total)->int {
if (count == 0) return int(caclAverage(part, total) * 100);
return 0;
return int((double(count) / double(count + stalls)) * 100);
}; };
auto perf_class = gAutoPerfDump.get_perf_class(); auto perf_class = gAutoPerfDump.get_perf_class();
// PERF: pipeline stalls // PERF: pipeline stalls
uint64_t sched_idles = 0;
uint64_t sched_stalls = 0;
uint64_t ibuffer_stalls = 0; uint64_t ibuffer_stalls = 0;
uint64_t scoreboard_stalls = 0; uint64_t scrb_stalls = 0;
uint64_t lsu_stalls = 0; uint64_t scrb_alu = 0;
uint64_t fpu_stalls = 0; uint64_t scrb_fpu = 0;
uint64_t alu_stalls = 0; uint64_t scrb_lsu = 0;
uint64_t sfu_stalls = 0; uint64_t scrb_sfu = 0;
uint64_t scrb_wctl = 0;
uint64_t scrb_csrs = 0;
uint64_t ifetches = 0; uint64_t ifetches = 0;
uint64_t loads = 0; uint64_t loads = 0;
uint64_t stores = 0; uint64_t stores = 0;
@@ -258,61 +261,111 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
if (ret != 0) if (ret != 0)
return ret; return ret;
uint64_t cycles_per_core = get_csr_64(staging_buf.data(), VX_CSR_MCYCLE);
uint64_t instrs_per_core = get_csr_64(staging_buf.data(), VX_CSR_MINSTRET);
#ifdef PERF_ENABLE #ifdef PERF_ENABLE
switch (perf_class) { switch (perf_class) {
case VX_DCR_MPM_CLASS_CORE: { case VX_DCR_MPM_CLASS_CORE: {
// PERF: pipeline // PERF: pipeline
// ibuffer_stall // scheduler idles
uint64_t ibuffer_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_IBUF_ST); {
if (num_cores > 1) fprintf(stream, "PERF: core%d: ibuffer stalls=%ld\n", core_id, ibuffer_stalls_per_core); uint64_t sched_idles_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCHED_ID);
ibuffer_stalls += ibuffer_stalls_per_core; if (num_cores > 1) {
// scoreboard_stall int idles_percent_per_core = calcAvgPercent(sched_idles_per_core, cycles_per_core);
uint64_t scoreboard_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_ST); fprintf(stream, "PERF: core%d: scheduler idle=%ld (%d%%)\n", core_id, sched_idles_per_core, idles_percent_per_core);
if (num_cores > 1) fprintf(stream, "PERF: core%d: scoreboard stalls=%ld\n", core_id, scoreboard_stalls_per_core); }
scoreboard_stalls += scoreboard_stalls_per_core; sched_idles += sched_idles_per_core;
// alu_stall }
uint64_t alu_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_ALU_ST); // scheduler stalls
if (num_cores > 1) fprintf(stream, "PERF: core%d: alu unit stalls=%ld\n", core_id, alu_stalls_per_core); {
alu_stalls += alu_stalls_per_core; uint64_t sched_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCHED_ST);
// lsu_stall if (num_cores > 1) {
uint64_t lsu_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_LSU_ST); int stalls_percent_per_core = calcAvgPercent(sched_stalls_per_core, cycles_per_core);
if (num_cores > 1) fprintf(stream, "PERF: core%d: lsu unit stalls=%ld\n", core_id, lsu_stalls_per_core); fprintf(stream, "PERF: core%d: scheduler stalls=%ld (%d%%)\n", core_id, sched_stalls_per_core, stalls_percent_per_core);
lsu_stalls += lsu_stalls_per_core; }
// fpu_stall sched_stalls += sched_stalls_per_core;
uint64_t fpu_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_FPU_ST); }
if (num_cores > 1) fprintf(stream, "PERF: core%d: fpu unit stalls=%ld\n", core_id, fpu_stalls_per_core); // ibuffer_stalls
fpu_stalls += fpu_stalls_per_core; {
// sfu_stall uint64_t ibuffer_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_IBUF_ST);
uint64_t sfu_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SFU_ST); if (num_cores > 1) {
if (num_cores > 1) fprintf(stream, "PERF: core%d: sfu unit stalls=%ld\n", core_id, sfu_stalls_per_core); int ibuffer_percent_per_core = calcAvgPercent(ibuffer_stalls_per_core, cycles_per_core);
sfu_stalls += sfu_stalls_per_core; fprintf(stream, "PERF: core%d: ibuffer stalls=%ld (%d%%)\n", core_id, ibuffer_stalls_per_core, ibuffer_percent_per_core);
}
ibuffer_stalls += ibuffer_stalls_per_core;
}
// issue_stalls
{
uint64_t scrb_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_ST);
uint64_t scrb_alu_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_ALU);
uint64_t scrb_fpu_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_FPU);
uint64_t scrb_lsu_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_LSU);
uint64_t scrb_sfu_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_SFU);
scrb_alu += scrb_alu_per_core;
scrb_fpu += scrb_fpu_per_core;
scrb_lsu += scrb_lsu_per_core;
scrb_sfu += scrb_sfu_per_core;
if (num_cores > 1) {
uint64_t scrb_total = scrb_alu_per_core + scrb_fpu_per_core + scrb_lsu_per_core + scrb_sfu_per_core;
fprintf(stream, "PERF: core%d: issue stalls=%ld (alu=%d%%, fpu=%d%%, lsu=%d%%, sfu=%d%%)\n", core_id, scrb_stalls_per_core,
calcAvgPercent(scrb_alu_per_core, scrb_total),
calcAvgPercent(scrb_fpu_per_core, scrb_total),
calcAvgPercent(scrb_lsu_per_core, scrb_total),
calcAvgPercent(scrb_sfu_per_core, scrb_total));
}
scrb_stalls += scrb_stalls_per_core;
}
// sfu_stalls
{
uint64_t scrb_sfu_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_SFU);
uint64_t scrb_wctl_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_WCTL);
uint64_t scrb_csrs_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_CSRS);
if (num_cores > 1) {
uint64_t sfu_total = scrb_wctl_per_core + scrb_csrs_per_core;
fprintf(stream, "PERF: core%d: sfu stalls=%ld (scrs=%d%%, wctl=%d%%)\n"
, core_id
, scrb_sfu_per_core
, calcAvgPercent(scrb_csrs_per_core, sfu_total)
, calcAvgPercent(scrb_wctl_per_core, sfu_total)
);
}
scrb_wctl += scrb_wctl_per_core;
scrb_csrs += scrb_csrs_per_core;
}
// PERF: memory // PERF: memory
// ifetches // ifetches
uint64_t ifetches_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_LOADS); {
if (num_cores > 1) fprintf(stream, "PERF: core%d: ifetches=%ld\n", core_id, ifetches_per_core); uint64_t ifetches_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_IFETCHES);
ifetches += ifetches_per_core; if (num_cores > 1) fprintf(stream, "PERF: core%d: ifetches=%ld\n", core_id, ifetches_per_core);
ifetches += ifetches_per_core;
uint64_t ifetch_lat_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_IFETCH_LT);
if (num_cores > 1) {
int mem_avg_lat = caclAverage(ifetch_lat_per_core, ifetches_per_core);
fprintf(stream, "PERF: core%d: ifetch latency=%d cycles\n", core_id, mem_avg_lat);
}
ifetch_lat += ifetch_lat_per_core;
}
// loads // loads
uint64_t loads_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_LOADS); {
if (num_cores > 1) fprintf(stream, "PERF: core%d: loads=%ld\n", core_id, loads_per_core); uint64_t loads_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_LOADS);
loads += loads_per_core; if (num_cores > 1) fprintf(stream, "PERF: core%d: loads=%ld\n", core_id, loads_per_core);
loads += loads_per_core;
uint64_t load_lat_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_LOAD_LT);
if (num_cores > 1) {
int mem_avg_lat = caclAverage(load_lat_per_core, loads_per_core);
fprintf(stream, "PERF: core%d: load latency=%d cycles\n", core_id, mem_avg_lat);
}
load_lat += load_lat_per_core;
}
// stores // stores
uint64_t stores_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_STORES); {
if (num_cores > 1) fprintf(stream, "PERF: core%d: stores=%ld\n", core_id, stores_per_core); uint64_t stores_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_STORES);
stores += stores_per_core; if (num_cores > 1) fprintf(stream, "PERF: core%d: stores=%ld\n", core_id, stores_per_core);
// ifetch latency stores += stores_per_core;
uint64_t ifetch_lat_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_IFETCH_LAT);
if (num_cores > 1) {
int mem_avg_lat = caclAvgLatency(ifetch_lat_per_core, ifetches_per_core);
fprintf(stream, "PERF: core%d: ifetch latency=%d cycles\n", core_id, mem_avg_lat);
} }
ifetch_lat += ifetch_lat_per_core;
// load latency
uint64_t load_lat_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_LOAD_LAT);
if (num_cores > 1) {
int mem_avg_lat = caclAvgLatency(load_lat_per_core, loads_per_core);
fprintf(stream, "PERF: core%d: load latency=%d cycles\n", core_id, mem_avg_lat);
}
load_lat += load_lat_per_core;
} break; } break;
case VX_DCR_MPM_CLASS_MEM: { case VX_DCR_MPM_CLASS_MEM: {
if (smem_enable) { if (smem_enable) {
@@ -320,7 +373,7 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
uint64_t smem_reads = get_csr_64(staging_buf.data(), VX_CSR_MPM_SMEM_READS); uint64_t smem_reads = get_csr_64(staging_buf.data(), VX_CSR_MPM_SMEM_READS);
uint64_t smem_writes = get_csr_64(staging_buf.data(), VX_CSR_MPM_SMEM_WRITES); uint64_t smem_writes = get_csr_64(staging_buf.data(), VX_CSR_MPM_SMEM_WRITES);
uint64_t smem_bank_stalls = get_csr_64(staging_buf.data(), VX_CSR_MPM_SMEM_BANK_ST); uint64_t smem_bank_stalls = get_csr_64(staging_buf.data(), VX_CSR_MPM_SMEM_BANK_ST);
int smem_bank_utilization = calcUtilization(smem_reads + smem_writes, smem_bank_stalls); int smem_bank_utilization = calcAvgPercent(smem_reads + smem_writes, smem_reads + smem_writes + smem_bank_stalls);
fprintf(stream, "PERF: core%d: smem reads=%ld\n", core_id, smem_reads); fprintf(stream, "PERF: core%d: smem reads=%ld\n", core_id, smem_reads);
fprintf(stream, "PERF: core%d: smem writes=%ld\n", core_id, smem_writes); fprintf(stream, "PERF: core%d: smem writes=%ld\n", core_id, smem_writes);
fprintf(stream, "PERF: core%d: smem bank stalls=%ld (utilization=%d%%)\n", core_id, smem_bank_stalls, smem_bank_utilization); fprintf(stream, "PERF: core%d: smem bank stalls=%ld (utilization=%d%%)\n", core_id, smem_bank_stalls, smem_bank_utilization);
@@ -330,9 +383,12 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
// PERF: Icache // PERF: Icache
uint64_t icache_reads = get_csr_64(staging_buf.data(), VX_CSR_MPM_ICACHE_READS); uint64_t icache_reads = get_csr_64(staging_buf.data(), VX_CSR_MPM_ICACHE_READS);
uint64_t icache_read_misses = get_csr_64(staging_buf.data(), VX_CSR_MPM_ICACHE_MISS_R); uint64_t icache_read_misses = get_csr_64(staging_buf.data(), VX_CSR_MPM_ICACHE_MISS_R);
uint64_t icache_mshr_stalls = get_csr_64(staging_buf.data(), VX_CSR_MPM_ICACHE_MSHR_ST);
int icache_read_hit_ratio = calcRatio(icache_read_misses, icache_reads); int icache_read_hit_ratio = calcRatio(icache_read_misses, icache_reads);
int mshr_utilization = calcAvgPercent(icache_read_misses, icache_read_misses + icache_mshr_stalls);
fprintf(stream, "PERF: core%d: icache reads=%ld\n", core_id, icache_reads); fprintf(stream, "PERF: core%d: icache reads=%ld\n", core_id, icache_reads);
fprintf(stream, "PERF: core%d: icache read misses=%ld (hit ratio=%d%%)\n", core_id, icache_read_misses, icache_read_hit_ratio); fprintf(stream, "PERF: core%d: icache read misses=%ld (hit ratio=%d%%)\n", core_id, icache_read_misses, icache_read_hit_ratio);
fprintf(stream, "PERF: core%d: icache mshr stalls=%ld (utilization=%d%%)\n", core_id, icache_mshr_stalls, mshr_utilization);
} }
if (dcache_enable) { if (dcache_enable) {
@@ -345,13 +401,14 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
uint64_t dcache_mshr_stalls = get_csr_64(staging_buf.data(), VX_CSR_MPM_DCACHE_MSHR_ST); uint64_t dcache_mshr_stalls = get_csr_64(staging_buf.data(), VX_CSR_MPM_DCACHE_MSHR_ST);
int dcache_read_hit_ratio = calcRatio(dcache_read_misses, dcache_reads); int dcache_read_hit_ratio = calcRatio(dcache_read_misses, dcache_reads);
int dcache_write_hit_ratio = calcRatio(dcache_write_misses, dcache_writes); int dcache_write_hit_ratio = calcRatio(dcache_write_misses, dcache_writes);
int dcache_bank_utilization = calcUtilization(dcache_reads + dcache_writes, dcache_bank_stalls); int dcache_bank_utilization = calcAvgPercent(dcache_reads + dcache_writes, dcache_reads + dcache_writes + dcache_bank_stalls);
int mshr_utilization = calcAvgPercent(dcache_read_misses + dcache_write_misses, dcache_read_misses + dcache_write_misses + dcache_mshr_stalls);
fprintf(stream, "PERF: core%d: dcache reads=%ld\n", core_id, dcache_reads); fprintf(stream, "PERF: core%d: dcache reads=%ld\n", core_id, dcache_reads);
fprintf(stream, "PERF: core%d: dcache writes=%ld\n", core_id, dcache_writes); fprintf(stream, "PERF: core%d: dcache writes=%ld\n", core_id, dcache_writes);
fprintf(stream, "PERF: core%d: dcache read misses=%ld (hit ratio=%d%%)\n", core_id, dcache_read_misses, dcache_read_hit_ratio); fprintf(stream, "PERF: core%d: dcache read misses=%ld (hit ratio=%d%%)\n", core_id, dcache_read_misses, dcache_read_hit_ratio);
fprintf(stream, "PERF: core%d: dcache write misses=%ld (hit ratio=%d%%)\n", core_id, dcache_write_misses, dcache_write_hit_ratio); fprintf(stream, "PERF: core%d: dcache write misses=%ld (hit ratio=%d%%)\n", core_id, dcache_write_misses, dcache_write_hit_ratio);
fprintf(stream, "PERF: core%d: dcache bank stalls=%ld (utilization=%d%%)\n", core_id, dcache_bank_stalls, dcache_bank_utilization); fprintf(stream, "PERF: core%d: dcache bank stalls=%ld (utilization=%d%%)\n", core_id, dcache_bank_stalls, dcache_bank_utilization);
fprintf(stream, "PERF: core%d: dcache mshr stalls=%ld\n", core_id, dcache_mshr_stalls); fprintf(stream, "PERF: core%d: dcache mshr stalls=%ld (utilization=%d%%)\n", core_id, dcache_mshr_stalls, mshr_utilization);
} }
if (l2cache_enable) { if (l2cache_enable) {
@@ -378,7 +435,7 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
// PERF: memory // PERF: memory
mem_reads = get_csr_64(staging_buf.data(), VX_CSR_MPM_MEM_READS); mem_reads = get_csr_64(staging_buf.data(), VX_CSR_MPM_MEM_READS);
mem_writes = get_csr_64(staging_buf.data(), VX_CSR_MPM_MEM_WRITES); mem_writes = get_csr_64(staging_buf.data(), VX_CSR_MPM_MEM_WRITES);
mem_lat = get_csr_64(staging_buf.data(), VX_CSR_MPM_MEM_LAT); mem_lat = get_csr_64(staging_buf.data(), VX_CSR_MPM_MEM_LT);
} }
} break; } break;
default: default:
@@ -386,25 +443,36 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
} }
#endif #endif
uint64_t instrs_per_core = get_csr_64(staging_buf.data(), VX_CSR_MINSTRET);
uint64_t cycles_per_core = get_csr_64(staging_buf.data(), VX_CSR_MCYCLE);
float IPC = (float)(double(instrs_per_core) / double(cycles_per_core)); float IPC = (float)(double(instrs_per_core) / double(cycles_per_core));
if (num_cores > 1) fprintf(stream, "PERF: core%d: instrs=%ld, cycles=%ld, IPC=%f\n", core_id, instrs_per_core, cycles_per_core, IPC); if (num_cores > 1) fprintf(stream, "PERF: core%d: instrs=%ld, cycles=%ld, IPC=%f\n", core_id, instrs_per_core, cycles_per_core, IPC);
instrs += instrs_per_core; total_instrs += instrs_per_core;
cycles = std::max<uint64_t>(cycles_per_core, cycles); total_cycles += cycles_per_core;
max_cycles = std::max<uint64_t>(cycles_per_core, max_cycles);
} }
#ifdef PERF_ENABLE #ifdef PERF_ENABLE
switch (perf_class) { switch (perf_class) {
case VX_DCR_MPM_CLASS_CORE: { case VX_DCR_MPM_CLASS_CORE: {
int sched_idles_percent = calcAvgPercent(sched_idles, total_cycles);
int sched_stalls_percent = calcAvgPercent(sched_stalls, total_cycles);
int ibuffer_percent = calcAvgPercent(ibuffer_stalls, total_cycles);
int ifetch_avg_lat = (int)(double(ifetch_lat) / double(ifetches)); int ifetch_avg_lat = (int)(double(ifetch_lat) / double(ifetches));
int load_avg_lat = (int)(double(load_lat) / double(loads)); int load_avg_lat = (int)(double(load_lat) / double(loads));
fprintf(stream, "PERF: ibuffer stalls=%ld\n", ibuffer_stalls); uint64_t scrb_total = scrb_alu + scrb_fpu + scrb_lsu + scrb_sfu;
fprintf(stream, "PERF: scoreboard stalls=%ld\n", scoreboard_stalls); uint64_t sfu_total = scrb_wctl + scrb_csrs;
fprintf(stream, "PERF: alu unit stalls=%ld\n", alu_stalls); fprintf(stream, "PERF: scheduler idle=%ld (%d%%)\n", sched_idles, sched_idles_percent);
fprintf(stream, "PERF: lsu unit stalls=%ld\n", lsu_stalls); fprintf(stream, "PERF: scheduler stalls=%ld (%d%%)\n", sched_stalls, sched_stalls_percent);
fprintf(stream, "PERF: fpu unit stalls=%ld\n", fpu_stalls); fprintf(stream, "PERF: ibuffer stalls=%ld (%d%%)\n", ibuffer_stalls, ibuffer_percent);
fprintf(stream, "PERF: sfu unit stalls=%ld\n", sfu_stalls); fprintf(stream, "PERF: issue stalls=%ld (alu=%d%%, fpu=%d%%, lsu=%d%%, sfu=%d%%)\n", scrb_stalls,
calcAvgPercent(scrb_alu, scrb_total),
calcAvgPercent(scrb_fpu, scrb_total),
calcAvgPercent(scrb_lsu, scrb_total),
calcAvgPercent(scrb_sfu, scrb_total));
fprintf(stream, "PERF: sfu stalls=%ld (scrs=%d%%, wctl=%d%%)\n"
, scrb_sfu
, calcAvgPercent(scrb_csrs, sfu_total)
, calcAvgPercent(scrb_wctl, sfu_total)
);
fprintf(stream, "PERF: ifetches=%ld\n", ifetches); fprintf(stream, "PERF: ifetches=%ld\n", ifetches);
fprintf(stream, "PERF: loads=%ld\n", loads); fprintf(stream, "PERF: loads=%ld\n", loads);
fprintf(stream, "PERF: stores=%ld\n", stores); fprintf(stream, "PERF: stores=%ld\n", stores);
@@ -419,31 +487,32 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
l2cache_write_misses /= num_cores; l2cache_write_misses /= num_cores;
l2cache_bank_stalls /= num_cores; l2cache_bank_stalls /= num_cores;
l2cache_mshr_stalls /= num_cores; l2cache_mshr_stalls /= num_cores;
int l2cache_read_hit_ratio = calcRatio(l2cache_read_misses, l2cache_reads); int read_hit_ratio = calcRatio(l2cache_read_misses, l2cache_reads);
int l2cache_write_hit_ratio = calcRatio(l2cache_write_misses, l2cache_writes); int write_hit_ratio = calcRatio(l2cache_write_misses, l2cache_writes);
int l2cache_bank_utilization = calcUtilization(l2cache_reads + l2cache_writes, l2cache_bank_stalls); int bank_utilization = calcAvgPercent(l2cache_reads + l2cache_writes, l2cache_reads + l2cache_writes + l2cache_bank_stalls);
int mshr_utilization = calcAvgPercent(l2cache_read_misses + l2cache_write_misses, l2cache_read_misses + l2cache_write_misses + l2cache_mshr_stalls);
fprintf(stream, "PERF: l2cache reads=%ld\n", l2cache_reads); fprintf(stream, "PERF: l2cache reads=%ld\n", l2cache_reads);
fprintf(stream, "PERF: l2cache writes=%ld\n", l2cache_writes); fprintf(stream, "PERF: l2cache writes=%ld\n", l2cache_writes);
fprintf(stream, "PERF: l2cache read misses=%ld (hit ratio=%d%%)\n", l2cache_read_misses, l2cache_read_hit_ratio); fprintf(stream, "PERF: l2cache read misses=%ld (hit ratio=%d%%)\n", l2cache_read_misses, read_hit_ratio);
fprintf(stream, "PERF: l2cache write misses=%ld (hit ratio=%d%%)\n", l2cache_write_misses, l2cache_write_hit_ratio); fprintf(stream, "PERF: l2cache write misses=%ld (hit ratio=%d%%)\n", l2cache_write_misses, write_hit_ratio);
fprintf(stream, "PERF: l2cache bank stalls=%ld (utilization=%d%%)\n", l2cache_bank_stalls, l2cache_bank_utilization); fprintf(stream, "PERF: l2cache bank stalls=%ld (utilization=%d%%)\n", l2cache_bank_stalls, bank_utilization);
fprintf(stream, "PERF: l2cache mshr stalls=%ld\n", l2cache_mshr_stalls); fprintf(stream, "PERF: l2cache mshr stalls=%ld (utilization=%d%%)\n", l2cache_mshr_stalls, mshr_utilization);
} }
if (l3cache_enable) { if (l3cache_enable) {
int l3cache_read_hit_ratio = calcRatio(l3cache_read_misses, l3cache_reads); int read_hit_ratio = calcRatio(l3cache_read_misses, l3cache_reads);
int l3cache_write_hit_ratio = calcRatio(l3cache_write_misses, l3cache_writes); int write_hit_ratio = calcRatio(l3cache_write_misses, l3cache_writes);
int l3cache_bank_utilization = calcUtilization(l3cache_reads + l3cache_writes, l3cache_bank_stalls); int bank_utilization = calcAvgPercent(l3cache_reads + l3cache_writes, l3cache_reads + l3cache_writes + l3cache_bank_stalls);
int mshr_utilization = calcAvgPercent(l3cache_read_misses + l3cache_write_misses, l3cache_read_misses + l3cache_write_misses + l3cache_mshr_stalls);
fprintf(stream, "PERF: l3cache reads=%ld\n", l3cache_reads); fprintf(stream, "PERF: l3cache reads=%ld\n", l3cache_reads);
fprintf(stream, "PERF: l3cache writes=%ld\n", l3cache_writes); fprintf(stream, "PERF: l3cache writes=%ld\n", l3cache_writes);
fprintf(stream, "PERF: l3cache read misses=%ld (hit ratio=%d%%)\n", l3cache_read_misses, l3cache_read_hit_ratio); fprintf(stream, "PERF: l3cache read misses=%ld (hit ratio=%d%%)\n", l3cache_read_misses, read_hit_ratio);
fprintf(stream, "PERF: l3cache write misses=%ld (hit ratio=%d%%)\n", l3cache_write_misses, l3cache_write_hit_ratio); fprintf(stream, "PERF: l3cache write misses=%ld (hit ratio=%d%%)\n", l3cache_write_misses, write_hit_ratio);
fprintf(stream, "PERF: l3cache bank stalls=%ld (utilization=%d%%)\n", l3cache_bank_stalls, l3cache_bank_utilization); fprintf(stream, "PERF: l3cache bank stalls=%ld (utilization=%d%%)\n", l3cache_bank_stalls, bank_utilization);
fprintf(stream, "PERF: l3cache mshr stalls=%ld\n", l3cache_mshr_stalls); fprintf(stream, "PERF: l3cache mshr stalls=%ld (utilization=%d%%)\n", l3cache_mshr_stalls, mshr_utilization);
} }
int mem_avg_lat = caclAvgLatency(mem_lat, mem_reads); int mem_avg_lat = caclAverage(mem_lat, mem_reads);
fprintf(stream, "PERF: memory requests=%ld (reads=%ld, writes=%ld)\n", (mem_reads + mem_writes), mem_reads, mem_writes); fprintf(stream, "PERF: memory requests=%ld (reads=%ld, writes=%ld)\n", (mem_reads + mem_writes), mem_reads, mem_writes);
fprintf(stream, "PERF: memory latency=%d cycles\n", mem_avg_lat); fprintf(stream, "PERF: memory latency=%d cycles\n", mem_avg_lat);
} break; } break;
@@ -452,8 +521,8 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
} }
#endif #endif
float IPC = (float)(double(instrs) / double(cycles)); float IPC = (float)(double(total_instrs) / double(max_cycles));
fprintf(stream, "PERF: instrs=%ld, cycles=%ld, IPC=%f\n", instrs, cycles, IPC); fprintf(stream, "PERF: instrs=%ld, cycles=%ld, IPC=%f\n", total_instrs, max_cycles, IPC);
fflush(stream); fflush(stream);

1
runtime/opae/.gitignore vendored Normal file
View File

@@ -0,0 +1 @@
/obj_dir/*

View File

@@ -1,25 +1,14 @@
XLEN ?= 32 XLEN ?= 32
TARGET ?= opaesim TARGET ?= opaesim
DESTDIR ?= $(CURDIR)
OPAESIM_DIR = ../../sim/opaesim SIM_DIR = $(abspath ../../sim)
HW_DIR = $(abspath ../../hw)
RTL_DIR=../../hw/rtl SYN_DIR = $(HW_DIR)/syn/altera/opae
SYN_DIR=../../hw/syn/altera/opae
SCRIPT_DIR=../../hw/scripts
CXXFLAGS += -std=c++11 -Wall -Wextra -pedantic -Wfatal-errors CXXFLAGS += -std=c++11 -Wall -Wextra -pedantic -Wfatal-errors
CXXFLAGS += -I. -I../include -I../common/ -I../../hw CXXFLAGS += -I../include -I../common -I$(HW_DIR) -I$(DESTDIR)
CXXFLAGS += -DXLEN_$(XLEN) CXXFLAGS += -DXLEN_$(XLEN)
ifeq ($(TARGET), opaesim)
CXXFLAGS += -I$(OPAESIM_DIR)
else
CXXFLAGS += -I$(SYN_DIR)
endif
# Position independent code # Position independent code
CXXFLAGS += -fPIC CXXFLAGS += -fPIC
@@ -35,9 +24,11 @@ SRCS = vortex.cpp driver.cpp ../common/utils.cpp
# set up target types # set up target types
ifeq ($(TARGET), opaesim) ifeq ($(TARGET), opaesim)
CXXFLAGS += -DOPAESIM OPAESIM = $(DESTDIR)/libopae-c-sim.so
OPAESIM = libopae-c-sim.so CXXFLAGS += -DOPAESIM -I$(SIM_DIR)/opaesim
LDFLAGS += -L$(DESTDIR) -lopae-c-sim
else else
CXXFLAGS += -I$(SYN_DIR)
ifeq ($(TARGET), asesim) ifeq ($(TARGET), asesim)
CXXFLAGS += -DASESIM CXXFLAGS += -DASESIM
else else
@@ -65,14 +56,14 @@ endif
PROJECT = libvortex.so PROJECT = libvortex.so
all: $(PROJECT) all: $(DESTDIR)/$(PROJECT)
libopae-c-sim.so: $(DESTDIR)/libopae-c-sim.so:
DESTDIR=../../runtime/opae $(MAKE) -C $(OPAESIM_DIR) ../../runtime/opae/libopae-c-sim.so DESTDIR=$(DESTDIR) $(MAKE) -C $(SIM_DIR)/opaesim $(DESTDIR)/libopae-c-sim.so
$(PROJECT): $(SRCS) $(OPAESIM) $(DESTDIR)/$(PROJECT): $(SRCS) $(OPAESIM)
$(CXX) $(CXXFLAGS) $(SRCS) $(LDFLAGS) -o $(PROJECT) $(CXX) $(CXXFLAGS) $(SRCS) $(LDFLAGS) -o $@
clean: clean:
DESTDIR=../../runtime/opae $(MAKE) -C $(OPAESIM_DIR) clean DESTDIR=$(DESTDIR) $(MAKE) -C $(SIM_DIR)/opaesim clean
rm -rf $(PROJECT) rm -rf $(DESTDIR)/$(PROJECT)

View File

@@ -1,2 +1 @@
obj_dir /obj_dir/*
*.so

View File

@@ -1,9 +1,10 @@
XLEN ?= 32 XLEN ?= 32
DESTDIR ?= $(CURDIR)
RTLSIM_DIR = ../../sim/rtlsim SIM_DIR = $(abspath ../../sim)
HW_DIR = $(abspath ../../hw)
CXXFLAGS += -std=c++11 -Wall -Wextra -pedantic -Wfatal-errors CXXFLAGS += -std=c++11 -Wall -Wextra -pedantic -Wfatal-errors
CXXFLAGS += -I../include -I../common -I../../hw -I$(RTLSIM_DIR) -I$(RTLSIM_DIR)/../common CXXFLAGS += -I../include -I../common -I$(HW_DIR) -I$(SIM_DIR)/rtlsim -I$(SIM_DIR)/common
CXXFLAGS += -DXLEN_$(XLEN) CXXFLAGS += -DXLEN_$(XLEN)
# Position independent code # Position independent code
@@ -16,7 +17,7 @@ CXXFLAGS += $(CONFIGS)
CXXFLAGS += -DDUMP_PERF_STATS CXXFLAGS += -DDUMP_PERF_STATS
LDFLAGS += -shared -pthread LDFLAGS += -shared -pthread
LDFLAGS += -L. -lrtlsim LDFLAGS += -L$(DESTDIR) -lrtlsim
SRCS = vortex.cpp ../common/utils.cpp SRCS = vortex.cpp ../common/utils.cpp
@@ -34,12 +35,12 @@ endif
PROJECT = libvortex.so PROJECT = libvortex.so
all: $(PROJECT) all: $(DESTDIR)/$(PROJECT)
$(PROJECT): $(SRCS) $(DESTDIR)/$(PROJECT): $(SRCS)
DESTDIR=../../runtime/rtlsim $(MAKE) -C $(RTLSIM_DIR) ../../runtime/rtlsim/librtlsim.so DESTDIR=$(DESTDIR) $(MAKE) -C $(SIM_DIR)/rtlsim $(DESTDIR)/librtlsim.so
$(CXX) $(CXXFLAGS) $(SRCS) $(LDFLAGS) -o $(PROJECT) $(CXX) $(CXXFLAGS) $(SRCS) $(LDFLAGS) -o $@
clean: clean:
DESTDIR=../../runtime/rtlsim $(MAKE) -C $(RTLSIM_DIR) clean DESTDIR=$(DESTDIR) $(MAKE) -C $(SIM_DIR)/rtlsim clean
rm -rf $(PROJECT) *.o rm -rf $(DESTDIR)/$(PROJECT) *.o

View File

@@ -1,16 +1,17 @@
XLEN ?= 32 XLEN ?= 32
DESTDIR ?= $(CURDIR)
SIMX_DIR = ../../sim/simx SIM_DIR = $(abspath ../../sim)
HW_DIR = $(abspath ../../hw)
CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors
CXXFLAGS += -fPIC -Wno-maybe-uninitialized CXXFLAGS += -fPIC -Wno-maybe-uninitialized
CXXFLAGS += -I../include -I../common -I../../hw -I$(SIMX_DIR) -I$(SIMX_DIR)/../common CXXFLAGS += -I../include -I../common -I$(HW_DIR) -I$(SIM_DIR)/simx -I$(SIM_DIR)/common
CXXFLAGS += $(CONFIGS) CXXFLAGS += $(CONFIGS)
CXXFLAGS += -DDUMP_PERF_STATS CXXFLAGS += -DDUMP_PERF_STATS
CXXFLAGS += -DXLEN_$(XLEN) CXXFLAGS += -DXLEN_$(XLEN)
LDFLAGS += -shared -pthread LDFLAGS += -shared -pthread
LDFLAGS += -L. -lsimx LDFLAGS += -L$(DESTDIR) -lsimx
SRCS = vortex.cpp ../common/utils.cpp SRCS = vortex.cpp ../common/utils.cpp
@@ -23,12 +24,12 @@ endif
PROJECT = libvortex.so PROJECT = libvortex.so
all: $(PROJECT) all: $(DESTDIR)/$(PROJECT)
$(PROJECT): $(SRCS) $(DESTDIR)/$(PROJECT): $(SRCS)
DESTDIR=../../runtime/simx $(MAKE) -C $(SIMX_DIR) ../../runtime/simx/libsimx.so DESTDIR=$(DESTDIR) $(MAKE) -C $(SIM_DIR)/simx $(DESTDIR)/libsimx.so
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@ $(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
clean: clean:
DESTDIR=../../runtime/simx $(MAKE) -C $(SIMX_DIR) clean DESTDIR=$(DESTDIR) $(MAKE) -C $(SIM_DIR)/simx clean
rm -rf libsimx.so $(PROJECT) *.o rm -rf $(DESTDIR)/$(PROJECT) *.o

View File

@@ -87,7 +87,7 @@ private:
class vx_device { class vx_device {
public: public:
vx_device() vx_device()
: arch_(NUM_THREADS, NUM_WARPS, NUM_CORES, NUM_CLUSTERS) : arch_(NUM_THREADS, NUM_WARPS, NUM_CORES)
, ram_(RAM_PAGE_SIZE) , ram_(RAM_PAGE_SIZE)
, processor_(arch_) , processor_(arch_)
, global_mem_( , global_mem_(

View File

@@ -1,6 +1,11 @@
XLEN ?= 32
DESTDIR ?= $(CURDIR)
SIM_DIR = $(abspath ../../sim)
HW_DIR = $(abspath ../../hw)
CXXFLAGS += -std=c++11 -O2 -Wall -Wextra -pedantic -Wfatal-errors CXXFLAGS += -std=c++11 -O2 -Wall -Wextra -pedantic -Wfatal-errors
CXXFLAGS += -I../include -I../../runtime -I../../hw -I../../sim/common CXXFLAGS += -I../include -I../common -I$(HW_DIR) -I$(SIM_DIR)/common
CXXFLAGS += -fPIC CXXFLAGS += -fPIC

View File

@@ -1,20 +1,22 @@
XLEN ?= 32 XLEN ?= 32
DESTDIR ?= . DESTDIR ?= $(CURDIR)
RTL_DIR = ../../hw/rtl HW_DIR = $(abspath ../../hw)
DPI_DIR = ../../hw/dpi COMMON_DIR = $(abspath ../common)
THIRD_PARTY_DIR = $(abspath ../../third_party)
RTL_DIR = $(HW_DIR)/rtl
DPI_DIR = $(HW_DIR)/dpi
AFU_DIR = $(RTL_DIR)/afu/opae AFU_DIR = $(RTL_DIR)/afu/opae
SCRIPT_DIR = ../../hw/scripts SCRIPT_DIR = $(HW_DIR)/scripts
THIRD_PARTY_DIR = ../../third_party
CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors -Wno-array-bounds CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors -Wno-array-bounds
CXXFLAGS += -fPIC -Wno-maybe-uninitialized CXXFLAGS += -fPIC -Wno-maybe-uninitialized
CXXFLAGS += -I.. -I../../../hw -I../../common -I$(abspath $(DESTDIR)) CXXFLAGS += -I$(CURDIR) -I$(HW_DIR) -I$(COMMON_DIR) -I$(DESTDIR)
CXXFLAGS += -I../$(THIRD_PARTY_DIR)/softfloat/source/include CXXFLAGS += -I/$(THIRD_PARTY_DIR)/softfloat/source/include
CXXFLAGS += -I../$(THIRD_PARTY_DIR) CXXFLAGS += -I/$(THIRD_PARTY_DIR)
CXXFLAGS += -DXLEN_$(XLEN) CXXFLAGS += -DXLEN_$(XLEN)
LDFLAGS += -shared ../$(THIRD_PARTY_DIR)/softfloat/build/Linux-x86_64-GCC/softfloat.a LDFLAGS += -shared $(THIRD_PARTY_DIR)/softfloat/build/Linux-x86_64-GCC/softfloat.a
LDFLAGS += -L../$(THIRD_PARTY_DIR)/ramulator -lramulator -pthread LDFLAGS += -L$(THIRD_PARTY_DIR)/ramulator -lramulator -pthread
# control RTL debug tracing states # control RTL debug tracing states
DBG_TRACE_FLAGS += -DDBG_TRACE_CORE_PIPELINE DBG_TRACE_FLAGS += -DDBG_TRACE_CORE_PIPELINE
@@ -53,9 +55,9 @@ endif
DBG_FLAGS += -DDEBUG_LEVEL=$(DEBUG) -DVCD_OUTPUT $(DBG_TRACE_FLAGS) DBG_FLAGS += -DDEBUG_LEVEL=$(DEBUG) -DVCD_OUTPUT $(DBG_TRACE_FLAGS)
SRCS = ../common/util.cpp ../common/mem.cpp ../common/rvfloats.cpp SRCS = $(COMMON_DIR)/util.cpp $(COMMON_DIR)/mem.cpp $(COMMON_DIR)/rvfloats.cpp
SRCS += $(DPI_DIR)/util_dpi.cpp $(DPI_DIR)/float_dpi.cpp SRCS += $(DPI_DIR)/util_dpi.cpp $(DPI_DIR)/float_dpi.cpp
SRCS += fpga.cpp opae_sim.cpp SRCS += $(CURDIR)/fpga.cpp $(CURDIR)/opae_sim.cpp
RTL_PKGS = $(AFU_DIR)/local_mem_cfg_pkg.sv $(AFU_DIR)/ccip/ccip_if_pkg.sv RTL_PKGS = $(AFU_DIR)/local_mem_cfg_pkg.sv $(AFU_DIR)/ccip/ccip_if_pkg.sv
RTL_PKGS += $(RTL_DIR)/VX_gpu_pkg.sv $(RTL_DIR)/fpu/VX_fpu_pkg.sv RTL_PKGS += $(RTL_DIR)/VX_gpu_pkg.sv $(RTL_DIR)/fpu/VX_fpu_pkg.sv
@@ -73,7 +75,7 @@ TOP = vortex_afu_shim
VL_FLAGS += --language 1800-2009 --assert -Wall -Wpedantic VL_FLAGS += --language 1800-2009 --assert -Wall -Wpedantic
VL_FLAGS += -Wno-DECLFILENAME -Wno-REDEFMACRO VL_FLAGS += -Wno-DECLFILENAME -Wno-REDEFMACRO
VL_FLAGS += --x-initial unique --x-assign unique VL_FLAGS += --x-initial unique --x-assign unique
VL_FLAGS += -DSIMULATION VL_FLAGS += -DSIMULATION -DSV_DPI
VL_FLAGS += -DXLEN_$(XLEN) VL_FLAGS += -DXLEN_$(XLEN)
VL_FLAGS += $(CONFIGS) VL_FLAGS += $(CONFIGS)
VL_FLAGS += verilator.vlt VL_FLAGS += verilator.vlt
@@ -119,16 +121,16 @@ PROJECT = libopae-c-sim.so
all: $(DESTDIR)/$(PROJECT) all: $(DESTDIR)/$(PROJECT)
$(DESTDIR)/vortex.xml: $(DESTDIR)/vortex.xml:
verilator --xml-only -O0 $(VL_FLAGS) $(TOP) --xml-output $(DESTDIR)/vortex.xml verilator --xml-only -O0 $(VL_FLAGS) $(TOP) --xml-output $@
$(DESTDIR)/scope.json: $(DESTDIR)/vortex.xml $(DESTDIR)/scope.json: $(DESTDIR)/vortex.xml
$(SCRIPT_DIR)/scope.py $(DESTDIR)/vortex.xml -o $(DESTDIR)/scope.json $(SCRIPT_DIR)/scope.py $^ -o $@
$(DESTDIR)/vortex_afu.h : $(AFU_DIR)/vortex_afu.vh $(DESTDIR)/vortex_afu.h : $(AFU_DIR)/vortex_afu.vh
$(SCRIPT_DIR)/gen_config.py -i $(AFU_DIR)/vortex_afu.vh -o $(DESTDIR)/vortex_afu.h $(SCRIPT_DIR)/gen_config.py -i $^ -o $@
$(DESTDIR)/$(PROJECT): $(SRCS) $(DESTDIR)/vortex_afu.h $(SCOPE_JSON) $(DESTDIR)/$(PROJECT): $(SRCS) $(DESTDIR)/vortex_afu.h $(SCOPE_JSON)
verilator --build --exe -O3 $(VL_FLAGS) --cc $(TOP) --top-module $(TOP) $(SRCS) -CFLAGS '$(CXXFLAGS)' -LDFLAGS '$(LDFLAGS)' -o ../$(DESTDIR)/$(PROJECT) verilator --build --exe -O3 $(VL_FLAGS) --cc $(TOP) --top-module $(TOP) $(SRCS) -CFLAGS '$(CXXFLAGS)' -LDFLAGS '$(LDFLAGS)' --Mdir $(DESTDIR)/obj_dir -o $@
clean: clean:
rm -rf obj_dir $(DESTDIR)/vortex.xml $(DESTDIR)/scope.json $(DESTDIR)/vortex_afu.h $(DESTDIR)/$(PROJECT) rm -rf $(DESTDIR)/obj_dir $(DESTDIR)/vortex.xml $(DESTDIR)/scope.json $(DESTDIR)/vortex_afu.h $(DESTDIR)/$(PROJECT)

View File

@@ -1,2 +1 @@
VX_config.h
/obj_dir/* /obj_dir/*

View File

@@ -1,18 +1,20 @@
XLEN ?= 32 XLEN ?= 32
DESTDIR ?= . DESTDIR ?= $(CURDIR)
RTL_DIR = ../../hw/rtl HW_DIR = $(abspath ../../hw)
DPI_DIR = ../../hw/dpi COMMON_DIR = $(abspath ../common)
THIRD_PARTY_DIR = ../../third_party THIRD_PARTY_DIR = $(abspath ../../third_party)
RTL_DIR = $(HW_DIR)/rtl
DPI_DIR = $(HW_DIR)/dpi
CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors -Wno-array-bounds CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors -Wno-array-bounds
CXXFLAGS += -fPIC -Wno-maybe-uninitialized CXXFLAGS += -fPIC -Wno-maybe-uninitialized
CXXFLAGS += -I../../../hw -I../../common CXXFLAGS += -I$(HW_DIR) -I$(COMMON_DIR)
CXXFLAGS += -I../$(THIRD_PARTY_DIR)/softfloat/source/include CXXFLAGS += -I$(THIRD_PARTY_DIR)/softfloat/source/include
CXXFLAGS += -I../$(THIRD_PARTY_DIR) CXXFLAGS += -I$(THIRD_PARTY_DIR)
CXXFLAGS += -DXLEN_$(XLEN) CXXFLAGS += -DXLEN_$(XLEN)
LDFLAGS += ../$(THIRD_PARTY_DIR)/softfloat/build/Linux-x86_64-GCC/softfloat.a LDFLAGS += $(THIRD_PARTY_DIR)/softfloat/build/Linux-x86_64-GCC/softfloat.a
LDFLAGS += -L../$(THIRD_PARTY_DIR)/ramulator -lramulator LDFLAGS += -L$(THIRD_PARTY_DIR)/ramulator -lramulator
# control RTL debug tracing states # control RTL debug tracing states
DBG_TRACE_FLAGS += -DDBG_TRACE_CORE_PIPELINE DBG_TRACE_FLAGS += -DDBG_TRACE_CORE_PIPELINE
@@ -38,9 +40,9 @@ ifneq (,$(findstring FPU_FPNEW,$(CONFIGS)))
endif endif
RTL_INCLUDE = -I$(RTL_DIR) -I$(DPI_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/core -I$(RTL_DIR)/mem -I$(RTL_DIR)/cache $(FPU_INCLUDE) RTL_INCLUDE = -I$(RTL_DIR) -I$(DPI_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/core -I$(RTL_DIR)/mem -I$(RTL_DIR)/cache $(FPU_INCLUDE)
SRCS = ../common/util.cpp ../common/mem.cpp ../common/rvfloats.cpp SRCS = $(COMMON_DIR)/util.cpp $(COMMON_DIR)/mem.cpp $(COMMON_DIR)/rvfloats.cpp
SRCS += $(DPI_DIR)/util_dpi.cpp $(DPI_DIR)/float_dpi.cpp SRCS += $(DPI_DIR)/util_dpi.cpp $(DPI_DIR)/float_dpi.cpp
SRCS += processor.cpp SRCS += $(CURDIR)/processor.cpp
ifdef AXI_BUS ifdef AXI_BUS
TOP = Vortex_axi TOP = Vortex_axi
@@ -54,7 +56,7 @@ VL_FLAGS += --language 1800-2009 --assert -Wall -Wpedantic
VL_FLAGS += -Wno-DECLFILENAME -Wno-REDEFMACRO VL_FLAGS += -Wno-DECLFILENAME -Wno-REDEFMACRO
VL_FLAGS += --x-initial unique --x-assign unique VL_FLAGS += --x-initial unique --x-assign unique
VL_FLAGS += verilator.vlt VL_FLAGS += verilator.vlt
VL_FLAGS += -DSIMULATION VL_FLAGS += -DSIMULATION -DSV_DPI
VL_FLAGS += -DXLEN_$(XLEN) VL_FLAGS += -DXLEN_$(XLEN)
VL_FLAGS += $(CONFIGS) VL_FLAGS += $(CONFIGS)
VL_FLAGS += $(RTL_INCLUDE) VL_FLAGS += $(RTL_INCLUDE)
@@ -87,11 +89,11 @@ PROJECT = rtlsim
all: $(DESTDIR)/$(PROJECT) all: $(DESTDIR)/$(PROJECT)
$(DESTDIR)/$(PROJECT): $(SRCS) main.cpp $(DESTDIR)/$(PROJECT): $(SRCS) $(CURDIR)/main.cpp
verilator --build $(VL_FLAGS) $^ -CFLAGS '$(CXXFLAGS) -DSTARTUP_ADDR=0x80000000' -LDFLAGS '$(LDFLAGS)' -o ../$@ verilator --build $(VL_FLAGS) $^ -CFLAGS '$(CXXFLAGS) -DSTARTUP_ADDR=0x80000000' -LDFLAGS '$(LDFLAGS)' --Mdir $(DESTDIR)/obj_dir -o $@
$(DESTDIR)/lib$(PROJECT).so: $(SRCS) $(DESTDIR)/lib$(PROJECT).so: $(SRCS)
verilator --build $(VL_FLAGS) $^ -CFLAGS '$(CXXFLAGS)' -LDFLAGS '-shared $(LDFLAGS)' -o ../$@ verilator --build $(VL_FLAGS) $^ -CFLAGS '$(CXXFLAGS)' -LDFLAGS '-shared $(LDFLAGS)' --Mdir $(DESTDIR)/obj_dir -o $@
clean: clean:
rm -rf obj_dir $(DESTDIR)/$(PROJECT) $(DESTDIR)/lib$(PROJECT).so rm -rf $(DESTDIR)/obj_dir $(DESTDIR)/$(PROJECT) $(DESTDIR)/lib$(PROJECT).so

View File

@@ -1,11 +1,12 @@
XLEN ?= 32 XLEN ?= 32
DESTDIR ?= . DESTDIR ?= $(CURDIR)
RTL_DIR = ../hw/rtl HW_DIR = $(abspath ../../hw)
THIRD_PARTY_DIR = ../../third_party COMMON_DIR = $(abspath ../common)
THIRD_PARTY_DIR = $(abspath ../../third_party)
CXXFLAGS += -std=c++17 -Wall -Wextra -Wfatal-errors CXXFLAGS += -std=c++17 -Wall -Wextra -Wfatal-errors
CXXFLAGS += -fPIC -Wno-maybe-uninitialized CXXFLAGS += -fPIC -Wno-maybe-uninitialized
CXXFLAGS += -I. -I../common -I../../hw CXXFLAGS += -I$(CURDIR) -I$(COMMON_DIR) -I$(HW_DIR)
CXXFLAGS += -I$(THIRD_PARTY_DIR)/softfloat/source/include CXXFLAGS += -I$(THIRD_PARTY_DIR)/softfloat/source/include
CXXFLAGS += -I$(THIRD_PARTY_DIR) CXXFLAGS += -I$(THIRD_PARTY_DIR)
CXXFLAGS += -DXLEN_$(XLEN) CXXFLAGS += -DXLEN_$(XLEN)
@@ -14,8 +15,8 @@ CXXFLAGS += $(CONFIGS)
LDFLAGS += $(THIRD_PARTY_DIR)/softfloat/build/Linux-x86_64-GCC/softfloat.a LDFLAGS += $(THIRD_PARTY_DIR)/softfloat/build/Linux-x86_64-GCC/softfloat.a
LDFLAGS += -L$(THIRD_PARTY_DIR)/ramulator -lramulator LDFLAGS += -L$(THIRD_PARTY_DIR)/ramulator -lramulator
SRCS = ../common/util.cpp ../common/mem.cpp ../common/rvfloats.cpp SRCS = $(COMMON_DIR)/util.cpp $(COMMON_DIR)/mem.cpp $(COMMON_DIR)/rvfloats.cpp
SRCS += processor.cpp cluster.cpp core.cpp warp.cpp decode.cpp execute.cpp exe_unit.cpp cache_sim.cpp mem_sim.cpp shared_mem.cpp dcrs.cpp SRCS += processor.cpp cluster.cpp socket.cpp core.cpp warp.cpp decode.cpp execute.cpp exe_unit.cpp cache_sim.cpp mem_sim.cpp shared_mem.cpp dcrs.cpp
# Debugigng # Debugigng
ifdef DEBUG ifdef DEBUG

View File

@@ -28,6 +28,7 @@ private:
uint16_t num_warps_; uint16_t num_warps_;
uint16_t num_cores_; uint16_t num_cores_;
uint16_t num_clusters_; uint16_t num_clusters_;
uint16_t socket_size_;
uint16_t vsize_; uint16_t vsize_;
uint16_t num_regs_; uint16_t num_regs_;
uint16_t num_csrs_; uint16_t num_csrs_;
@@ -35,11 +36,12 @@ private:
uint16_t ipdom_size_; uint16_t ipdom_size_;
public: public:
Arch(uint16_t num_threads, uint16_t num_warps, uint16_t num_cores, uint16_t num_clusters) Arch(uint16_t num_threads, uint16_t num_warps, uint16_t num_cores)
: num_threads_(num_threads) : num_threads_(num_threads)
, num_warps_(num_warps) , num_warps_(num_warps)
, num_cores_(num_cores) , num_cores_(num_cores)
, num_clusters_(num_clusters) , num_clusters_(NUM_CLUSTERS)
, socket_size_(SOCKET_SIZE)
, vsize_(16) , vsize_(16)
, num_regs_(32) , num_regs_(32)
, num_csrs_(4096) , num_csrs_(4096)
@@ -82,6 +84,10 @@ public:
uint16_t num_clusters() const { uint16_t num_clusters() const {
return num_clusters_; return num_clusters_;
} }
uint16_t socket_size() const {
return socket_size_;
}
}; };
} }

View File

@@ -45,20 +45,20 @@ public:
char sname[100]; char sname[100];
std::vector<Switch<MemReq, MemRsp>::Ptr> unit_arbs(num_units); std::vector<MemSwitch::Ptr> unit_arbs(num_units);
for (uint32_t u = 0; u < num_units; ++u) { for (uint32_t u = 0; u < num_units; ++u) {
snprintf(sname, 100, "%s-unit-arb-%d", name, u); snprintf(sname, 100, "%s-unit-arb-%d", name, u);
unit_arbs.at(u) = Switch<MemReq, MemRsp>::Create(sname, ArbiterType::RoundRobin, num_requests, config.num_inputs); unit_arbs.at(u) = MemSwitch::Create(sname, ArbiterType::RoundRobin, num_requests, config.num_inputs);
for (uint32_t i = 0; i < num_requests; ++i) { for (uint32_t i = 0; i < num_requests; ++i) {
this->CoreReqPorts.at(u).at(i).bind(&unit_arbs.at(u)->ReqIn.at(i)); this->CoreReqPorts.at(u).at(i).bind(&unit_arbs.at(u)->ReqIn.at(i));
unit_arbs.at(u)->RspIn.at(i).bind(&this->CoreRspPorts.at(u).at(i)); unit_arbs.at(u)->RspIn.at(i).bind(&this->CoreRspPorts.at(u).at(i));
} }
} }
std::vector<Switch<MemReq, MemRsp>::Ptr> mem_arbs(config.num_inputs); std::vector<MemSwitch::Ptr> mem_arbs(config.num_inputs);
for (uint32_t i = 0; i < config.num_inputs; ++i) { for (uint32_t i = 0; i < config.num_inputs; ++i) {
snprintf(sname, 100, "%s-mem-arb-%d", name, i); snprintf(sname, 100, "%s-mem-arb-%d", name, i);
mem_arbs.at(i) = Switch<MemReq, MemRsp>::Create(sname, ArbiterType::RoundRobin, num_units, num_caches); mem_arbs.at(i) = MemSwitch::Create(sname, ArbiterType::RoundRobin, num_units, num_caches);
for (uint32_t u = 0; u < num_units; ++u) { for (uint32_t u = 0; u < num_units; ++u) {
unit_arbs.at(u)->ReqOut.at(i).bind(&mem_arbs.at(i)->ReqIn.at(u)); unit_arbs.at(u)->ReqOut.at(i).bind(&mem_arbs.at(i)->ReqIn.at(u));
mem_arbs.at(i)->RspIn.at(u).bind(&unit_arbs.at(u)->RspOut.at(i)); mem_arbs.at(i)->RspIn.at(u).bind(&unit_arbs.at(u)->RspOut.at(i));
@@ -66,7 +66,7 @@ public:
} }
snprintf(sname, 100, "%s-cache-arb", name); snprintf(sname, 100, "%s-cache-arb", name);
auto cache_arb = Switch<MemReq, MemRsp>::Create(sname, ArbiterType::RoundRobin, num_caches, 1); auto cache_arb = MemSwitch::Create(sname, ArbiterType::RoundRobin, num_caches, 1);
for (uint32_t i = 0; i < num_caches; ++i) { for (uint32_t i = 0; i < num_caches; ++i) {
snprintf(sname, 100, "%s-cache%d", name, i); snprintf(sname, 100, "%s-cache%d", name, i);

View File

@@ -41,19 +41,16 @@ struct params_t {
uint32_t tag_select_addr_end; uint32_t tag_select_addr_end;
params_t(const CacheSim::Config& config) { params_t(const CacheSim::Config& config) {
int32_t bank_bits = log2ceil(config.num_banks); int32_t offset_bits = config.L - config.W;
int32_t offset_bits = config.B - config.W; int32_t index_bits = config.C - (config.L + config.A + config.B);
int32_t log2_bank_size = config.C - bank_bits;
int32_t index_bits = log2_bank_size - (config.B + config.A);
assert(log2_bank_size > 0);
assert(offset_bits >= 0); assert(offset_bits >= 0);
assert(index_bits >= 0); assert(index_bits >= 0);
this->log2_num_inputs = log2ceil(config.num_inputs); this->log2_num_inputs = log2ceil(config.num_inputs);
this->words_per_line = 1 << offset_bits; this->sets_per_bank = 1 << index_bits;
this->lines_per_set = 1 << config.A; this->lines_per_set = 1 << config.A;
this->sets_per_bank = 1 << index_bits; this->words_per_line = 1 << offset_bits;
assert(config.ports_per_bank <= this->words_per_line); assert(config.ports_per_bank <= this->words_per_line);
@@ -63,7 +60,7 @@ struct params_t {
// Bank select // Bank select
this->bank_select_addr_start = (1+this->word_select_addr_end); this->bank_select_addr_start = (1+this->word_select_addr_end);
this->bank_select_addr_end = (this->bank_select_addr_start+bank_bits-1); this->bank_select_addr_end = (this->bank_select_addr_start+config.B-1);
// Set select // Set select
this->set_select_addr_start = (1+this->bank_select_addr_end); this->set_select_addr_start = (1+this->bank_select_addr_end);
@@ -74,23 +71,23 @@ struct params_t {
this->tag_select_addr_end = (config.addr_width-1); this->tag_select_addr_end = (config.addr_width-1);
} }
uint32_t addr_bank_id(uint64_t word_addr) const { uint32_t addr_bank_id(uint64_t addr) const {
if (bank_select_addr_end >= bank_select_addr_start) if (bank_select_addr_end >= bank_select_addr_start)
return (uint32_t)bit_getw(word_addr, bank_select_addr_start, bank_select_addr_end); return (uint32_t)bit_getw(addr, bank_select_addr_start, bank_select_addr_end);
else else
return 0; return 0;
} }
uint32_t addr_set_id(uint64_t word_addr) const { uint32_t addr_set_id(uint64_t addr) const {
if (set_select_addr_end >= set_select_addr_start) if (set_select_addr_end >= set_select_addr_start)
return (uint32_t)bit_getw(word_addr, set_select_addr_start, set_select_addr_end); return (uint32_t)bit_getw(addr, set_select_addr_start, set_select_addr_end);
else else
return 0; return 0;
} }
uint64_t addr_tag(uint64_t word_addr) const { uint64_t addr_tag(uint64_t addr) const {
if (tag_select_addr_end >= tag_select_addr_start) if (tag_select_addr_end >= tag_select_addr_start)
return bit_getw(word_addr, tag_select_addr_start, tag_select_addr_end); return bit_getw(addr, tag_select_addr_start, tag_select_addr_end);
else else
return 0; return 0;
} }
@@ -288,8 +285,8 @@ private:
Config config_; Config config_;
params_t params_; params_t params_;
std::vector<bank_t> banks_; std::vector<bank_t> banks_;
Switch<MemReq, MemRsp>::Ptr bank_switch_; MemSwitch::Ptr bank_switch_;
Switch<MemReq, MemRsp>::Ptr bypass_switch_; MemSwitch::Ptr bypass_switch_;
std::vector<SimPort<MemReq>> mem_req_ports_; std::vector<SimPort<MemReq>> mem_req_ports_;
std::vector<SimPort<MemRsp>> mem_rsp_ports_; std::vector<SimPort<MemRsp>> mem_rsp_ports_;
std::vector<bank_req_t> pipeline_reqs_; std::vector<bank_req_t> pipeline_reqs_;
@@ -304,16 +301,16 @@ public:
: simobject_(simobject) : simobject_(simobject)
, config_(config) , config_(config)
, params_(config) , params_(config)
, banks_(config.num_banks, {config, params_}) , banks_((1 << config.B), {config, params_})
, mem_req_ports_(config.num_banks, simobject) , mem_req_ports_((1 << config.B), simobject)
, mem_rsp_ports_(config.num_banks, simobject) , mem_rsp_ports_((1 << config.B), simobject)
, pipeline_reqs_(config.num_banks, config.ports_per_bank) , pipeline_reqs_((1 << config.B), config.ports_per_bank)
{ {
char sname[100]; char sname[100];
snprintf(sname, 100, "%s-bypass-arb", simobject->name().c_str()); snprintf(sname, 100, "%s-bypass-arb", simobject->name().c_str());
if (config_.bypass) { if (config_.bypass) {
bypass_switch_ = Switch<MemReq, MemRsp>::Create(sname, ArbiterType::RoundRobin, config_.num_inputs); bypass_switch_ = MemSwitch::Create(sname, ArbiterType::RoundRobin, config_.num_inputs);
for (uint32_t i = 0; i < config_.num_inputs; ++i) { for (uint32_t i = 0; i < config_.num_inputs; ++i) {
simobject->CoreReqPorts.at(i).bind(&bypass_switch_->ReqIn.at(i)); simobject->CoreReqPorts.at(i).bind(&bypass_switch_->ReqIn.at(i));
bypass_switch_->RspIn.at(i).bind(&simobject->CoreRspPorts.at(i)); bypass_switch_->RspIn.at(i).bind(&simobject->CoreRspPorts.at(i));
@@ -323,14 +320,14 @@ public:
return; return;
} }
bypass_switch_ = Switch<MemReq, MemRsp>::Create(sname, ArbiterType::Priority, 2); bypass_switch_ = MemSwitch::Create(sname, ArbiterType::Priority, 2);
bypass_switch_->ReqOut.at(0).bind(&simobject->MemReqPort); bypass_switch_->ReqOut.at(0).bind(&simobject->MemReqPort);
simobject->MemRspPort.bind(&bypass_switch_->RspOut.at(0)); simobject->MemRspPort.bind(&bypass_switch_->RspOut.at(0));
if (config.num_banks > 1) { if (config.B != 0) {
snprintf(sname, 100, "%s-bank-arb", simobject->name().c_str()); snprintf(sname, 100, "%s-bank-arb", simobject->name().c_str());
bank_switch_ = Switch<MemReq, MemRsp>::Create(sname, ArbiterType::RoundRobin, config.num_banks); bank_switch_ = MemSwitch::Create(sname, ArbiterType::RoundRobin, (1 << config.B));
for (uint32_t i = 0, n = config.num_banks; i < n; ++i) { for (uint32_t i = 0, n = (1 << config.B); i < n; ++i) {
mem_req_ports_.at(i).bind(&bank_switch_->ReqIn.at(i)); mem_req_ports_.at(i).bind(&bank_switch_->ReqIn.at(i));
bank_switch_->RspIn.at(i).bind(&mem_rsp_ports_.at(i)); bank_switch_->RspIn.at(i).bind(&mem_rsp_ports_.at(i));
} }
@@ -383,20 +380,22 @@ public:
pipeline_req.clear(); pipeline_req.clear();
} }
// schedule MSHR replay // first: schedule MSHR replay (flush MSHR queue)
for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) { for (uint32_t bank_id = 0, n = (1 << config_.B); bank_id < n; ++bank_id) {
auto& bank = banks_.at(bank_id); auto& bank = banks_.at(bank_id);
auto& pipeline_req = pipeline_reqs_.at(bank_id); auto& pipeline_req = pipeline_reqs_.at(bank_id);
bank.mshr.pop(&pipeline_req); bank.mshr.pop(&pipeline_req);
} }
// schedule memory fill // second: schedule memory fill (flush memory queue)
for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) { for (uint32_t bank_id = 0, n = (1 << config_.B); bank_id < n; ++bank_id) {
auto& mem_rsp_port = mem_rsp_ports_.at(bank_id); auto& mem_rsp_port = mem_rsp_ports_.at(bank_id);
if (mem_rsp_port.empty()) if (mem_rsp_port.empty())
continue; continue;
auto& pipeline_req = pipeline_reqs_.at(bank_id); auto& pipeline_req = pipeline_reqs_.at(bank_id);
// skip if bank already busy
if (pipeline_req.type != bank_req_t::None) if (pipeline_req.type != bank_req_t::None)
continue; continue;
@@ -407,7 +406,7 @@ public:
mem_rsp_port.pop(); mem_rsp_port.pop();
} }
// schedule core requests // last: schedule core requests (flush core queue)
for (uint32_t req_id = 0, n = config_.num_inputs; req_id < n; ++req_id) { for (uint32_t req_id = 0, n = config_.num_inputs; req_id < n; ++req_id) {
auto& core_req_port = simobject_->CoreReqPorts.at(req_id); auto& core_req_port = simobject_->CoreReqPorts.at(req_id);
if (core_req_port.empty()) if (core_req_port.empty())
@@ -425,18 +424,21 @@ public:
} }
auto bank_id = params_.addr_bank_id(core_req.addr); auto bank_id = params_.addr_bank_id(core_req.addr);
auto& bank = banks_.at(bank_id);
auto& pipeline_req = pipeline_reqs_.at(bank_id);
// skip if bank already busy
if (pipeline_req.type != bank_req_t::None)
continue;
auto set_id = params_.addr_set_id(core_req.addr); auto set_id = params_.addr_set_id(core_req.addr);
auto tag = params_.addr_tag(core_req.addr); auto tag = params_.addr_tag(core_req.addr);
auto port_id = req_id % config_.ports_per_bank; auto port_id = req_id % config_.ports_per_bank;
auto& bank = banks_.at(bank_id);
auto& pipeline_req = pipeline_reqs_.at(bank_id);
// check MSHR capacity // check MSHR capacity
if ((!core_req.write || !config_.write_through) if ((!core_req.write || !config_.write_through)
&& bank.mshr.full()) { && bank.mshr.full()) {
++perf_stats_.mshr_stalls; ++perf_stats_.mshr_stalls;
++perf_stats_.bank_stalls;
continue; continue;
} }
@@ -452,7 +454,7 @@ public:
} }
// extend request ports // extend request ports
pipeline_req.ports.at(port_id) = bank_req_port_t{req_id, core_req.tag, true}; pipeline_req.ports.at(port_id) = bank_req_port_t{req_id, core_req.tag, true};
} else if (pipeline_req.type == bank_req_t::None) { } else {
// schedule new request // schedule new request
bank_req_t bank_req(config_.ports_per_bank); bank_req_t bank_req(config_.ports_per_bank);
bank_req.ports.at(port_id) = bank_req_port_t{req_id, core_req.tag, true}; bank_req.ports.at(port_id) = bank_req_port_t{req_id, core_req.tag, true};
@@ -463,10 +465,6 @@ public:
bank_req.type = bank_req_t::Core; bank_req.type = bank_req_t::Core;
bank_req.write = core_req.write; bank_req.write = core_req.write;
pipeline_req = bank_req; pipeline_req = bank_req;
} else {
// bank in use
++perf_stats_.bank_stalls;
continue;
} }
if (core_req.write) if (core_req.write)
@@ -516,7 +514,7 @@ private:
} }
void processBankRequests() { void processBankRequests() {
for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) { for (uint32_t bank_id = 0, n = (1 << config_.B); bank_id < n; ++bank_id) {
auto& bank = banks_.at(bank_id); auto& bank = banks_.at(bank_id);
auto pipeline_req = pipeline_reqs_.at(bank_id); auto pipeline_req = pipeline_reqs_.at(bank_id);
@@ -546,10 +544,9 @@ private:
} }
} break; } break;
case bank_req_t::Core: { case bank_req_t::Core: {
bool hit = false; int32_t hit_line_id = -1;
bool found_free_line = false; int32_t free_line_id = -1;
uint32_t hit_line_id = 0; int32_t repl_line_id = 0;
uint32_t repl_line_id = 0;
uint32_t max_cnt = 0; uint32_t max_cnt = 0;
auto& set = bank.sets.at(pipeline_req.set_id); auto& set = bank.sets.at(pipeline_req.set_id);
@@ -557,38 +554,34 @@ private:
// tag lookup // tag lookup
for (uint32_t i = 0, n = set.lines.size(); i < n; ++i) { for (uint32_t i = 0, n = set.lines.size(); i < n; ++i) {
auto& line = set.lines.at(i); auto& line = set.lines.at(i);
if (max_cnt < line.lru_ctr) {
max_cnt = line.lru_ctr;
repl_line_id = i;
}
if (line.valid) { if (line.valid) {
if (line.tag == pipeline_req.tag) { if (line.tag == pipeline_req.tag) {
line.lru_ctr = 0;
hit_line_id = i; hit_line_id = i;
hit = true; line.lru_ctr = 0;
} else { } else {
++line.lru_ctr; ++line.lru_ctr;
} }
if (max_cnt < line.lru_ctr) {
max_cnt = line.lru_ctr;
repl_line_id = i;
}
} else { } else {
found_free_line = true; free_line_id = i;
repl_line_id = i;
} }
} }
if (hit) { if (hit_line_id != -1) {
//
// Hit handling // Hit handling
//
if (pipeline_req.write) { if (pipeline_req.write) {
// handle write hit // handle write has_hit
auto& hit_line = set.lines.at(hit_line_id); auto& hit_line = set.lines.at(hit_line_id);
if (config_.write_through) { if (config_.write_through) {
// forward write request to memory // forward write request to memory
MemReq mem_req; MemReq mem_req;
mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, hit_line.tag); mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, pipeline_req.tag);
mem_req.write = true; mem_req.write = true;
mem_req.cid = pipeline_req.cid; mem_req.cid = pipeline_req.cid;
mem_req.uuid = pipeline_req.uuid; mem_req.uuid = pipeline_req.uuid;
mem_req_ports_.at(bank_id).send(mem_req, 1); mem_req_ports_.at(bank_id).send(mem_req, 1);
DT(3, simobject_->name() << "-dram-" << mem_req); DT(3, simobject_->name() << "-dram-" << mem_req);
} else { } else {
@@ -607,22 +600,20 @@ private:
} }
} }
} else { } else {
//
// Miss handling // Miss handling
//
if (pipeline_req.write) if (pipeline_req.write)
++perf_stats_.write_misses; ++perf_stats_.write_misses;
else else
++perf_stats_.read_misses; ++perf_stats_.read_misses;
if (!found_free_line && !config_.write_through) { if (free_line_id == -1 && !config_.write_through) {
// write back dirty line // write back dirty line
auto& repl_line = set.lines.at(repl_line_id); auto& repl_line = set.lines.at(repl_line_id);
if (repl_line.dirty) { if (repl_line.dirty) {
MemReq mem_req; MemReq mem_req;
mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, repl_line.tag); mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, repl_line.tag);
mem_req.write = true; mem_req.write = true;
mem_req.cid = pipeline_req.cid; mem_req.cid = pipeline_req.cid;
mem_req_ports_.at(bank_id).send(mem_req, 1); mem_req_ports_.at(bank_id).send(mem_req, 1);
DT(3, simobject_->name() << "-dram-" << mem_req); DT(3, simobject_->name() << "-dram-" << mem_req);
++perf_stats_.evictions; ++perf_stats_.evictions;
@@ -635,8 +626,8 @@ private:
MemReq mem_req; MemReq mem_req;
mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, pipeline_req.tag); mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, pipeline_req.tag);
mem_req.write = true; mem_req.write = true;
mem_req.cid = pipeline_req.cid; mem_req.cid = pipeline_req.cid;
mem_req.uuid = pipeline_req.uuid; mem_req.uuid = pipeline_req.uuid;
mem_req_ports_.at(bank_id).send(mem_req, 1); mem_req_ports_.at(bank_id).send(mem_req, 1);
DT(3, simobject_->name() << "-dram-" << mem_req); DT(3, simobject_->name() << "-dram-" << mem_req);
} }
@@ -655,7 +646,7 @@ private:
auto mshr_pending = bank.mshr.lookup(pipeline_req); auto mshr_pending = bank.mshr.lookup(pipeline_req);
// allocate MSHR // allocate MSHR
auto mshr_id = bank.mshr.allocate(pipeline_req, repl_line_id); auto mshr_id = bank.mshr.allocate(pipeline_req, (free_line_id != -1) ? free_line_id : repl_line_id);
// send fill request // send fill request
if (!mshr_pending) { if (!mshr_pending) {
@@ -663,8 +654,8 @@ private:
mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, pipeline_req.tag); mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, pipeline_req.tag);
mem_req.write = false; mem_req.write = false;
mem_req.tag = mshr_id; mem_req.tag = mshr_id;
mem_req.cid = pipeline_req.cid; mem_req.cid = pipeline_req.cid;
mem_req.uuid = pipeline_req.uuid; mem_req.uuid = pipeline_req.uuid;
mem_req_ports_.at(bank_id).send(mem_req, 1); mem_req_ports_.at(bank_id).send(mem_req, 1);
DT(3, simobject_->name() << "-dram-" << mem_req); DT(3, simobject_->name() << "-dram-" << mem_req);
++pending_fill_reqs_; ++pending_fill_reqs_;

View File

@@ -23,16 +23,15 @@ public:
struct Config { struct Config {
bool bypass; // cache bypass bool bypass; // cache bypass
uint8_t C; // log2 cache size uint8_t C; // log2 cache size
uint8_t B; // log2 block size uint8_t L; // log2 line size
uint8_t W; // log2 word size uint8_t W; // log2 word size
uint8_t A; // log2 associativity uint8_t A; // log2 associativity
uint8_t B; // log2 number of banks
uint8_t addr_width; // word address bits uint8_t addr_width; // word address bits
uint8_t num_banks; // number of banks
uint8_t ports_per_bank; // number of ports per bank uint8_t ports_per_bank; // number of ports per bank
uint8_t num_inputs; // number of inputs uint8_t num_inputs; // number of inputs
bool write_through; // is write-through bool write_through; // is write-through
bool write_reponse; // enable write response bool write_reponse; // enable write response
uint16_t victim_size; // victim cache size
uint16_t mshr_size; // MSHR buffer size uint16_t mshr_size; // MSHR buffer size
uint8_t latency; // pipeline latency uint8_t latency; // pipeline latency
}; };

View File

@@ -18,34 +18,60 @@ using namespace vortex;
Cluster::Cluster(const SimContext& ctx, Cluster::Cluster(const SimContext& ctx,
uint32_t cluster_id, uint32_t cluster_id,
ProcessorImpl* processor, ProcessorImpl* processor,
const Arch &arch, const const Arch &arch,
DCRS &dcrs) const DCRS &dcrs)
: SimObject(ctx, "cluster") : SimObject(ctx, "cluster")
, mem_req_port(this) , mem_req_port(this)
, mem_rsp_port(this) , mem_rsp_port(this)
, cluster_id_(cluster_id) , cluster_id_(cluster_id)
, cores_(arch.num_cores())
, barriers_(arch.num_barriers(), 0)
, sharedmems_(arch.num_cores())
, processor_(processor) , processor_(processor)
, sockets_(NUM_SOCKETS)
, barriers_(arch.num_barriers(), 0)
, cores_per_socket_(arch.socket_size())
{ {
auto num_cores = arch.num_cores();
char sname[100]; char sname[100];
uint32_t sockets_per_cluster = sockets_.size();
// create sockets
snprintf(sname, 100, "cluster%d-icache-arb", cluster_id);
auto icache_switch = MemSwitch::Create(sname, ArbiterType::RoundRobin, sockets_per_cluster);
snprintf(sname, 100, "cluster%d-dcache-arb", cluster_id);
auto dcache_switch = MemSwitch::Create(sname, ArbiterType::RoundRobin, sockets_per_cluster);
for (uint32_t i = 0; i < sockets_per_cluster; ++i) {
uint32_t socket_id = cluster_id * sockets_per_cluster + i;
auto socket = Socket::Create(socket_id,
this,
arch,
dcrs);
socket->icache_mem_req_port.bind(&icache_switch->ReqIn.at(i));
icache_switch->RspIn.at(i).bind(&socket->icache_mem_rsp_port);
socket->dcache_mem_req_port.bind(&dcache_switch->ReqIn.at(i));
dcache_switch->RspIn.at(i).bind(&socket->dcache_mem_rsp_port);
sockets_.at(i) = socket;
}
// Create l2cache
snprintf(sname, 100, "cluster%d-l2cache", cluster_id); snprintf(sname, 100, "cluster%d-l2cache", cluster_id);
l2cache_ = CacheSim::Create(sname, CacheSim::Config{ l2cache_ = CacheSim::Create(sname, CacheSim::Config{
!L2_ENABLED, !L2_ENABLED,
log2ceil(L2_CACHE_SIZE), // C log2ceil(L2_CACHE_SIZE), // C
log2ceil(MEM_BLOCK_SIZE), // B log2ceil(MEM_BLOCK_SIZE), // L
log2ceil(L2_NUM_WAYS), // W log2ceil(L2_NUM_WAYS), // W
0, // A 0, // A
log2ceil(L2_NUM_BANKS), // B
XLEN, // address bits XLEN, // address bits
L2_NUM_BANKS, // number of banks
1, // number of ports 1, // number of ports
5, // request size 2, // request size
true, // write-through true, // write-through
false, // write response false, // write response
0, // victim size
L2_MSHR_SIZE, // mshr L2_MSHR_SIZE, // mshr
2, // pipeline latency 2, // pipeline latency
}); });
@@ -53,89 +79,11 @@ Cluster::Cluster(const SimContext& ctx,
l2cache_->MemReqPort.bind(&this->mem_req_port); l2cache_->MemReqPort.bind(&this->mem_req_port);
this->mem_rsp_port.bind(&l2cache_->MemRspPort); this->mem_rsp_port.bind(&l2cache_->MemRspPort);
snprintf(sname, 100, "cluster%d-icaches", cluster_id); icache_switch->ReqOut.at(0).bind(&l2cache_->CoreReqPorts.at(0));
icaches_ = CacheCluster::Create(sname, num_cores, NUM_ICACHES, 1, CacheSim::Config{ l2cache_->CoreRspPorts.at(0).bind(&icache_switch->RspOut.at(0));
!ICACHE_ENABLED,
log2ceil(ICACHE_SIZE), // C
log2ceil(L1_LINE_SIZE), // B
log2ceil(sizeof(uint32_t)), // W
log2ceil(ICACHE_NUM_WAYS),// A
XLEN, // address bits
1, // number of banks
1, // number of ports
1, // number of inputs
true, // write-through
false, // write response
0, // victim size
(uint8_t)arch.num_warps(), // mshr
2, // pipeline latency
});
icaches_->MemReqPort.bind(&l2cache_->CoreReqPorts.at(0)); dcache_switch->ReqOut.at(0).bind(&l2cache_->CoreReqPorts.at(1));
l2cache_->CoreRspPorts.at(0).bind(&icaches_->MemRspPort); l2cache_->CoreRspPorts.at(1).bind(&dcache_switch->RspOut.at(0));
snprintf(sname, 100, "cluster%d-dcaches", cluster_id);
dcaches_ = CacheCluster::Create(sname, num_cores, NUM_DCACHES, NUM_LSU_LANES, CacheSim::Config{
!DCACHE_ENABLED,
log2ceil(DCACHE_SIZE), // C
log2ceil(L1_LINE_SIZE), // B
log2ceil(sizeof(Word)), // W
log2ceil(DCACHE_NUM_WAYS),// A
XLEN, // address bits
DCACHE_NUM_BANKS, // number of banks
1, // number of ports
DCACHE_NUM_BANKS, // number of inputs
true, // write-through
false, // write response
0, // victim size
DCACHE_MSHR_SIZE, // mshr
4, // pipeline latency
});
dcaches_->MemReqPort.bind(&l2cache_->CoreReqPorts.at(1));
l2cache_->CoreRspPorts.at(1).bind(&dcaches_->MemRspPort);
///////////////////////////////////////////////////////////////////////////
// create shared memory blocks
for (uint32_t i = 0; i < num_cores; ++i) {
snprintf(sname, 100, "cluster%d-shared_mem%d", cluster_id, i);
sharedmems_.at(i) = SharedMem::Create(sname, SharedMem::Config{
(1 << SMEM_LOG_SIZE),
sizeof(Word),
NUM_LSU_LANES,
NUM_LSU_LANES,
false
});
}
// create cores
for (uint32_t i = 0; i < num_cores; ++i) {
uint32_t core_id = cluster_id * num_cores + i;
cores_.at(i) = Core::Create(core_id,
this,
arch,
dcrs,
sharedmems_.at(i));
cores_.at(i)->icache_req_ports.at(0).bind(&icaches_->CoreReqPorts.at(i).at(0));
icaches_->CoreRspPorts.at(i).at(0).bind(&cores_.at(i)->icache_rsp_ports.at(0));
for (uint32_t j = 0; j < NUM_LSU_LANES; ++j) {
snprintf(sname, 100, "cluster%d-smem_demux%d_%d", cluster_id, i, j);
auto smem_demux = SMemDemux::Create(sname);
cores_.at(i)->dcache_req_ports.at(j).bind(&smem_demux->ReqIn);
smem_demux->RspIn.bind(&cores_.at(i)->dcache_rsp_ports.at(j));
smem_demux->ReqDc.bind(&dcaches_->CoreReqPorts.at(i).at(j));
dcaches_->CoreRspPorts.at(i).at(j).bind(&smem_demux->RspDc);
smem_demux->ReqSm.bind(&sharedmems_.at(i)->Inputs.at(j));
sharedmems_.at(i)->Outputs.at(j).bind(&smem_demux->RspSm);
}
}
} }
Cluster::~Cluster() { Cluster::~Cluster() {
@@ -153,14 +101,14 @@ void Cluster::tick() {
} }
void Cluster::attach_ram(RAM* ram) { void Cluster::attach_ram(RAM* ram) {
for (auto core : cores_) { for (auto& socket : sockets_) {
core->attach_ram(ram); socket->attach_ram(ram);
} }
} }
bool Cluster::running() const { bool Cluster::running() const {
for (auto& core : cores_) { for (auto& socket : sockets_) {
if (core->running()) if (socket->running())
return true; return true;
} }
return false; return false;
@@ -169,9 +117,9 @@ bool Cluster::running() const {
bool Cluster::check_exit(Word* exitcode, bool riscv_test) const { bool Cluster::check_exit(Word* exitcode, bool riscv_test) const {
bool done = true; bool done = true;
Word exitcode_ = 0; Word exitcode_ = 0;
for (auto& core : cores_) { for (auto& socket : sockets_) {
Word ec; Word ec;
if (core->check_exit(&ec, riscv_test)) { if (socket->check_exit(&ec, riscv_test)) {
exitcode_ |= ec; exitcode_ |= ec;
} else { } else {
done = false; done = false;
@@ -184,36 +132,32 @@ bool Cluster::check_exit(Word* exitcode, bool riscv_test) const {
void Cluster::barrier(uint32_t bar_id, uint32_t count, uint32_t core_id) { void Cluster::barrier(uint32_t bar_id, uint32_t count, uint32_t core_id) {
auto& barrier = barriers_.at(bar_id); auto& barrier = barriers_.at(bar_id);
uint32_t local_core_id = core_id % cores_.size(); auto sockets_per_cluster = sockets_.size();
auto cores_per_socket = cores_per_socket_;
uint32_t cores_per_cluster = sockets_per_cluster * cores_per_socket;
uint32_t local_core_id = core_id % cores_per_cluster;
barrier.set(local_core_id); barrier.set(local_core_id);
DP(3, "*** Suspend core #" << core_id << " at barrier #" << bar_id); DP(3, "*** Suspend core #" << core_id << " at barrier #" << bar_id);
if (barrier.count() == (size_t)count) { if (barrier.count() == (size_t)count) {
// resume all suspended cores // resume all suspended cores
for (uint32_t i = 0; i < cores_.size(); ++i) { for (uint32_t s = 0; s < sockets_per_cluster; ++s) {
if (barrier.test(i)) { for (uint32_t c = 0; c < cores_per_socket; ++c) {
DP(3, "*** Resume core #" << i << " at barrier #" << bar_id); uint32_t i = s * cores_per_socket + c;
cores_.at(i)->resume(); if (barrier.test(i)) {
DP(3, "*** Resume core #" << i << " at barrier #" << bar_id);
sockets_.at(s)->resume(c);
}
} }
} }
barrier.reset(); barrier.reset();
} }
} }
ProcessorImpl* Cluster::processor() const {
return processor_;
}
Cluster::PerfStats Cluster::perf_stats() const { Cluster::PerfStats Cluster::perf_stats() const {
Cluster::PerfStats perf; PerfStats perf_stats;
perf.icache = icaches_->perf_stats(); perf_stats.l2cache = l2cache_->perf_stats();
perf.dcache = dcaches_->perf_stats(); return perf_stats;
perf.l2cache = l2cache_->perf_stats();
for (auto sharedmem : sharedmems_) {
perf.sharedmem += sharedmem->perf_stats();
}
return perf;
} }

View File

@@ -19,6 +19,7 @@
#include "cache_cluster.h" #include "cache_cluster.h"
#include "shared_mem.h" #include "shared_mem.h"
#include "core.h" #include "core.h"
#include "socket.h"
#include "constants.h" #include "constants.h"
namespace vortex { namespace vortex {
@@ -28,18 +29,7 @@ class ProcessorImpl;
class Cluster : public SimObject<Cluster> { class Cluster : public SimObject<Cluster> {
public: public:
struct PerfStats { struct PerfStats {
CacheSim::PerfStats icache; CacheSim::PerfStats l2cache;
CacheSim::PerfStats dcache;
SharedMem::PerfStats sharedmem;
CacheSim::PerfStats l2cache;
PerfStats& operator+=(const PerfStats& rhs) {
this->icache += rhs.icache;
this->dcache += rhs.dcache;
this->sharedmem += rhs.sharedmem;
this->l2cache += rhs.l2cache;
return *this;
}
}; };
SimPort<MemReq> mem_req_port; SimPort<MemReq> mem_req_port;
@@ -53,6 +43,14 @@ public:
~Cluster(); ~Cluster();
uint32_t id() const {
return cluster_id_;
}
ProcessorImpl* processor() const {
return processor_;
}
void reset(); void reset();
void tick(); void tick();
@@ -65,22 +63,15 @@ public:
void barrier(uint32_t bar_id, uint32_t count, uint32_t core_id); void barrier(uint32_t bar_id, uint32_t count, uint32_t core_id);
ProcessorImpl* processor() const; PerfStats perf_stats() const;
Cluster::PerfStats perf_stats() const;
private: private:
uint32_t cluster_id_; uint32_t cluster_id_;
std::vector<Core::Ptr> cores_; ProcessorImpl* processor_;
std::vector<CoreMask> barriers_; std::vector<Socket::Ptr> sockets_;
CacheSim::Ptr l2cache_; std::vector<CoreMask> barriers_;
CacheCluster::Ptr icaches_; CacheSim::Ptr l2cache_;
CacheCluster::Ptr dcaches_; uint32_t cores_per_socket_;
std::vector<SharedMem::Ptr> sharedmems_;
CacheCluster::Ptr tcaches_;
CacheCluster::Ptr ocaches_;
CacheCluster::Ptr rcaches_;
ProcessorImpl* processor_;
}; };
} // namespace vortex } // namespace vortex

View File

@@ -21,6 +21,7 @@
#include "mem.h" #include "mem.h"
#include "decode.h" #include "decode.h"
#include "core.h" #include "core.h"
#include "socket.h"
#include "debug.h" #include "debug.h"
#include "constants.h" #include "constants.h"
#include "processor_impl.h" #include "processor_impl.h"
@@ -29,35 +30,36 @@ using namespace vortex;
Core::Core(const SimContext& ctx, Core::Core(const SimContext& ctx,
uint32_t core_id, uint32_t core_id,
Cluster* cluster, Socket* socket,
const Arch &arch, const Arch &arch,
const DCRS &dcrs, const DCRS &dcrs)
SharedMem::Ptr sharedmem)
: SimObject(ctx, "core") : SimObject(ctx, "core")
, icache_req_ports(1, this) , icache_req_ports(1, this)
, icache_rsp_ports(1, this) , icache_rsp_ports(1, this)
, dcache_req_ports(NUM_LSU_LANES, this) , dcache_req_ports(NUM_LSU_LANES, this)
, dcache_rsp_ports(NUM_LSU_LANES, this) , dcache_rsp_ports(NUM_LSU_LANES, this)
, core_id_(core_id) , core_id_(core_id)
, socket_(socket)
, arch_(arch) , arch_(arch)
, dcrs_(dcrs) , dcrs_(dcrs)
, decoder_(arch) , decoder_(arch)
, warps_(arch.num_warps()) , warps_(arch.num_warps())
, barriers_(arch.num_barriers(), 0) , barriers_(arch.num_barriers(), 0)
, fcsrs_(arch.num_warps(), 0) , fcsrs_(arch.num_warps(), 0)
, ibuffers_(ISSUE_WIDTH, IBUF_SIZE) , ibuffers_(arch.num_warps(), IBUF_SIZE)
, scoreboard_(arch_) , scoreboard_(arch_)
, operands_(ISSUE_WIDTH) , operands_(ISSUE_WIDTH)
, dispatchers_((uint32_t)ExeType::MAX) , dispatchers_((uint32_t)ExeType::ExeTypeCount)
, exe_units_((uint32_t)ExeType::MAX) , exe_units_((uint32_t)ExeType::ExeTypeCount)
, sharedmem_(sharedmem) , smem_demuxs_(NUM_LSU_LANES)
, fetch_latch_("fetch") , fetch_latch_("fetch")
, decode_latch_("decode") , decode_latch_("decode")
, pending_icache_(arch_.num_warps()) , pending_icache_(arch_.num_warps())
, committed_traces_(ISSUE_WIDTH, nullptr)
, csrs_(arch.num_warps()) , csrs_(arch.num_warps())
, cluster_(cluster) , commit_arbs_(ISSUE_WIDTH)
{ {
char sname[100];
for (uint32_t i = 0; i < arch_.num_warps(); ++i) { for (uint32_t i = 0; i < arch_.num_warps(); ++i) {
csrs_.at(i).resize(arch.num_threads()); csrs_.at(i).resize(arch.num_threads());
} }
@@ -70,6 +72,28 @@ Core::Core(const SimContext& ctx,
operands_.at(i) = SimPlatform::instance().create_object<Operand>(); operands_.at(i) = SimPlatform::instance().create_object<Operand>();
} }
// initialize shared memory
snprintf(sname, 100, "core%d-shared_mem", core_id);
shared_mem_ = SharedMem::Create(sname, SharedMem::Config{
(1 << SMEM_LOG_SIZE),
sizeof(Word),
NUM_LSU_LANES,
NUM_LSU_LANES,
false
});
for (uint32_t i = 0; i < NUM_LSU_LANES; ++i) {
snprintf(sname, 100, "core%d-smem_demux%d", core_id, i);
auto smem_demux = SMemDemux::Create(sname);
smem_demux->ReqDC.bind(&dcache_req_ports.at(i));
dcache_rsp_ports.at(i).bind(&smem_demux->RspDC);
smem_demux->ReqSM.bind(&shared_mem_->Inputs.at(i));
shared_mem_->Outputs.at(i).bind(&smem_demux->RspSM);
smem_demuxs_.at(i) = smem_demux;
}
// initialize dispatchers // initialize dispatchers
dispatchers_.at((int)ExeType::ALU) = SimPlatform::instance().create_object<Dispatcher>(arch, 2, NUM_ALU_BLOCKS, NUM_ALU_LANES); dispatchers_.at((int)ExeType::ALU) = SimPlatform::instance().create_object<Dispatcher>(arch, 2, NUM_ALU_BLOCKS, NUM_ALU_LANES);
dispatchers_.at((int)ExeType::FPU) = SimPlatform::instance().create_object<Dispatcher>(arch, 2, NUM_FPU_BLOCKS, NUM_FPU_LANES); dispatchers_.at((int)ExeType::FPU) = SimPlatform::instance().create_object<Dispatcher>(arch, 2, NUM_FPU_BLOCKS, NUM_FPU_LANES);
@@ -82,6 +106,16 @@ Core::Core(const SimContext& ctx,
exe_units_.at((int)ExeType::LSU) = SimPlatform::instance().create_object<LsuUnit>(this); exe_units_.at((int)ExeType::LSU) = SimPlatform::instance().create_object<LsuUnit>(this);
exe_units_.at((int)ExeType::SFU) = SimPlatform::instance().create_object<SfuUnit>(this); exe_units_.at((int)ExeType::SFU) = SimPlatform::instance().create_object<SfuUnit>(this);
// bind commit arbiters
for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
snprintf(sname, 100, "core%d-commit-arb%d", core_id, i);
auto arbiter = TraceSwitch::Create(sname, ArbiterType::RoundRobin, (uint32_t)ExeType::ExeTypeCount, 1);
for (uint32_t j = 0; j < (uint32_t)ExeType::ExeTypeCount; ++j) {
exe_units_.at(j)->Outputs.at(i).bind(&arbiter->Inputs.at(j));
}
commit_arbs_.at(i) = arbiter;
}
this->reset(); this->reset();
} }
@@ -100,7 +134,11 @@ void Core::reset() {
exe_unit->reset(); exe_unit->reset();
} }
for ( auto& barrier : barriers_) { for (auto& commit_arb : commit_arbs_) {
commit_arb->reset();
}
for (auto& barrier : barriers_) {
barrier.reset(); barrier.reset();
} }
@@ -112,7 +150,7 @@ void Core::reset() {
ibuf.clear(); ibuf.clear();
} }
commit_exe_= 0; ibuffer_idx_ = 0;
scoreboard_.clear(); scoreboard_.clear();
fetch_latch_.clear(); fetch_latch_.clear();
@@ -150,8 +188,10 @@ void Core::schedule() {
break; break;
} }
} }
if (scheduled_warp == -1) if (scheduled_warp == -1) {
++perf_stats_.sched_idle;
return; return;
}
// suspend warp until decode // suspend warp until decode
stalled_warps_.set(scheduled_warp); stalled_warps_.set(scheduled_warp);
@@ -192,11 +232,11 @@ void Core::fetch() {
mem_req.tag = pending_icache_.allocate(trace); mem_req.tag = pending_icache_.allocate(trace);
mem_req.cid = trace->cid; mem_req.cid = trace->cid;
mem_req.uuid = trace->uuid; mem_req.uuid = trace->uuid;
icache_req_ports.at(0).send(mem_req, 1); icache_req_ports.at(0).send(mem_req, 2);
DT(3, "icache-req: addr=0x" << std::hex << mem_req.addr << ", tag=" << mem_req.tag << ", " << *trace); DT(3, "icache-req: addr=0x" << std::hex << mem_req.addr << ", tag=" << mem_req.tag << ", " << *trace);
fetch_latch_.pop(); fetch_latch_.pop();
++pending_ifetches_;
++perf_stats_.ifetches; ++perf_stats_.ifetches;
++pending_ifetches_;
} }
void Core::decode() { void Core::decode() {
@@ -206,7 +246,7 @@ void Core::decode() {
auto trace = decode_latch_.front(); auto trace = decode_latch_.front();
// check ibuffer capacity // check ibuffer capacity
auto& ibuffer = ibuffers_.at(trace->wid % ISSUE_WIDTH); auto& ibuffer = ibuffers_.at(trace->wid);
if (ibuffer.full()) { if (ibuffer.full()) {
if (!trace->log_once(true)) { if (!trace->log_once(true)) {
DT(3, "*** ibuffer-stall: " << *trace); DT(3, "*** ibuffer-stall: " << *trace);
@@ -223,13 +263,6 @@ void Core::decode() {
stalled_warps_.reset(trace->wid); stalled_warps_.reset(trace->wid);
} }
// update perf counters
uint32_t active_threads = trace->tmask.count();
if (trace->exe_type == ExeType::LSU && trace->lsu_type == LsuType::LOAD)
perf_stats_.loads += active_threads;
if (trace->exe_type == ExeType::LSU && trace->lsu_type == LsuType::STORE)
perf_stats_.stores += active_threads;
DT(3, "pipeline-decode: " << *trace); DT(3, "pipeline-decode: " << *trace);
// insert to ibuffer // insert to ibuffer
@@ -239,7 +272,7 @@ void Core::decode() {
} }
void Core::issue() { void Core::issue() {
// operands to dispatch // operands to dispatchers
for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) { for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
auto& operand = operands_.at(i); auto& operand = operands_.at(i);
if (operand->Output.empty()) if (operand->Output.empty())
@@ -257,7 +290,8 @@ void Core::issue() {
// issue ibuffer instructions // issue ibuffer instructions
for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) { for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
auto& ibuffer = ibuffers_.at(i); uint32_t ii = (ibuffer_idx_ + i) % ibuffers_.size();
auto& ibuffer = ibuffers_.at(ii);
if (ibuffer.empty()) if (ibuffer.empty())
continue; continue;
@@ -265,17 +299,41 @@ void Core::issue() {
// check scoreboard // check scoreboard
if (scoreboard_.in_use(trace)) { if (scoreboard_.in_use(trace)) {
auto uses = scoreboard_.get_uses(trace);
if (!trace->log_once(true)) { if (!trace->log_once(true)) {
DTH(3, "*** scoreboard-stall: dependents={"); DTH(3, "*** scoreboard-stall: dependents={");
auto uses = scoreboard_.get_uses(trace);
for (uint32_t j = 0, n = uses.size(); j < n; ++j) { for (uint32_t j = 0, n = uses.size(); j < n; ++j) {
auto& use = uses.at(j); auto& use = uses.at(j);
__unused (use); __unused (use);
if (j) DTN(3, ", "); if (j) DTN(3, ", ");
DTN(3, use.type << use.reg << "(#" << use.owner << ")"); DTN(3, use.reg_type << use.reg_id << "(#" << use.uuid << ")");
} }
DTN(3, "}, " << *trace << std::endl); DTN(3, "}, " << *trace << std::endl);
} }
for (uint32_t j = 0, n = uses.size(); j < n; ++j) {
auto& use = uses.at(j);
switch (use.exe_type) {
case ExeType::ALU: ++perf_stats_.scrb_alu; break;
case ExeType::FPU: ++perf_stats_.scrb_fpu; break;
case ExeType::LSU: ++perf_stats_.scrb_lsu; break;
case ExeType::SFU: {
++perf_stats_.scrb_sfu;
switch (use.sfu_type) {
case SfuType::TMC:
case SfuType::WSPAWN:
case SfuType::SPLIT:
case SfuType::JOIN:
case SfuType::BAR:
case SfuType::PRED: ++perf_stats_.scrb_wctl; break;
case SfuType::CSRRW:
case SfuType::CSRRS:
case SfuType::CSRRC: ++perf_stats_.scrb_csrs; break;
default: assert(false);
}
} break;
default: assert(false);
}
}
++perf_stats_.scrb_stalls; ++perf_stats_.scrb_stalls;
continue; continue;
} else { } else {
@@ -294,10 +352,11 @@ void Core::issue() {
ibuffer.pop(); ibuffer.pop();
} }
ibuffer_idx_ += ISSUE_WIDTH;
} }
void Core::execute() { void Core::execute() {
for (uint32_t i = 0; i < (uint32_t)ExeType::MAX; ++i) { for (uint32_t i = 0; i < (uint32_t)ExeType::ExeTypeCount; ++i) {
auto& dispatch = dispatchers_.at(i); auto& dispatch = dispatchers_.at(i);
auto& exe_unit = exe_units_.at(i); auto& exe_unit = exe_units_.at(i);
for (uint32_t j = 0; j < ISSUE_WIDTH; ++j) { for (uint32_t j = 0; j < ISSUE_WIDTH; ++j) {
@@ -313,10 +372,10 @@ void Core::execute() {
void Core::commit() { void Core::commit() {
// process completed instructions // process completed instructions
for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) { for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
auto trace = committed_traces_.at(i); auto& commit_arb = commit_arbs_.at(i);
if (!trace) if (commit_arb->Outputs.at(0).empty())
continue; continue;
committed_traces_.at(i) = nullptr; auto trace = commit_arb->Outputs.at(0).front();
// advance to commit stage // advance to commit stage
DT(3, "pipeline-commit: " << *trace); DT(3, "pipeline-commit: " << *trace);
@@ -334,27 +393,11 @@ void Core::commit() {
perf_stats_.instrs += trace->tmask.count(); perf_stats_.instrs += trace->tmask.count();
} }
commit_arb->Outputs.at(0).pop();
// delete the trace // delete the trace
delete trace; delete trace;
} }
// select completed instructions
for (uint32_t i = 0; i < (uint32_t)ExeType::MAX; ++i) {
uint32_t ii = (commit_exe_ + i) % (uint32_t)ExeType::MAX;
auto& exe_unit = exe_units_.at(ii);
for (uint32_t j = 0; j < ISSUE_WIDTH; ++j) {
auto committed_trace = committed_traces_.at(j);
if (committed_trace)
continue;
auto& output = exe_unit->Outputs.at(j);
if (output.empty())
continue;
auto trace = output.front();
committed_traces_.at(j) = trace;
output.pop();
}
}
++commit_exe_;
} }
void Core::wspawn(uint32_t num_warps, Word nextPC) { void Core::wspawn(uint32_t num_warps, Word nextPC) {
@@ -379,7 +422,7 @@ void Core::barrier(uint32_t bar_id, uint32_t count, uint32_t warp_id) {
if (is_global) { if (is_global) {
// global barrier handling // global barrier handling
if (barrier.count() == active_warps_.count()) { if (barrier.count() == active_warps_.count()) {
cluster_->barrier(bar_idx, count, core_id_); socket_->barrier(bar_idx, count, core_id_);
barrier.reset(); barrier.reset();
} }
} else { } else {
@@ -416,7 +459,7 @@ AddrType Core::get_addr_type(uint64_t addr) {
void Core::dcache_read(void *data, uint64_t addr, uint32_t size) { void Core::dcache_read(void *data, uint64_t addr, uint32_t size) {
auto type = this->get_addr_type(addr); auto type = this->get_addr_type(addr);
if (type == AddrType::Shared) { if (type == AddrType::Shared) {
sharedmem_->read(data, addr, size); shared_mem_->read(data, addr, size);
} else { } else {
mmu_.read(data, addr, size, 0); mmu_.read(data, addr, size, 0);
} }
@@ -431,7 +474,7 @@ void Core::dcache_write(const void* data, uint64_t addr, uint32_t size) {
this->writeToStdOut(data, addr, size); this->writeToStdOut(data, addr, size);
} else { } else {
if (type == AddrType::Shared) { if (type == AddrType::Shared) {
sharedmem_->write(data, addr, size); shared_mem_->write(data, addr, size);
} else { } else {
mmu_.write(data, addr, size, 0); mmu_.write(data, addr, size, 0);
} }
@@ -533,71 +576,76 @@ uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
break; break;
case VX_DCR_MPM_CLASS_CORE: { case VX_DCR_MPM_CLASS_CORE: {
switch (addr) { switch (addr) {
case VX_CSR_MPM_SCHED_ID: return perf_stats_.sched_idle & 0xffffffff;
case VX_CSR_MPM_SCHED_ID_H:return perf_stats_.sched_idle >> 32;
case VX_CSR_MPM_SCHED_ST: return perf_stats_.sched_stalls & 0xffffffff;
case VX_CSR_MPM_SCHED_ST_H:return perf_stats_.sched_stalls >> 32;
case VX_CSR_MPM_IBUF_ST: return perf_stats_.ibuf_stalls & 0xffffffff; case VX_CSR_MPM_IBUF_ST: return perf_stats_.ibuf_stalls & 0xffffffff;
case VX_CSR_MPM_IBUF_ST_H: return perf_stats_.ibuf_stalls >> 32; case VX_CSR_MPM_IBUF_ST_H: return perf_stats_.ibuf_stalls >> 32;
case VX_CSR_MPM_SCRB_ST: return perf_stats_.scrb_stalls & 0xffffffff; case VX_CSR_MPM_SCRB_ST: return perf_stats_.scrb_stalls & 0xffffffff;
case VX_CSR_MPM_SCRB_ST_H: return perf_stats_.scrb_stalls >> 32; case VX_CSR_MPM_SCRB_ST_H: return perf_stats_.scrb_stalls >> 32;
case VX_CSR_MPM_ALU_ST: return perf_stats_.alu_stalls & 0xffffffff; case VX_CSR_MPM_SCRB_ALU: return perf_stats_.scrb_alu & 0xffffffff;
case VX_CSR_MPM_ALU_ST_H: return perf_stats_.alu_stalls >> 32; case VX_CSR_MPM_SCRB_ALU_H:return perf_stats_.scrb_alu >> 32;
case VX_CSR_MPM_LSU_ST: return perf_stats_.lsu_stalls & 0xffffffff; case VX_CSR_MPM_SCRB_FPU: return perf_stats_.scrb_fpu & 0xffffffff;
case VX_CSR_MPM_LSU_ST_H: return perf_stats_.lsu_stalls >> 32; case VX_CSR_MPM_SCRB_FPU_H:return perf_stats_.scrb_fpu >> 32;
case VX_CSR_MPM_FPU_ST: return perf_stats_.fpu_stalls & 0xffffffff; case VX_CSR_MPM_SCRB_LSU: return perf_stats_.scrb_lsu & 0xffffffff;
case VX_CSR_MPM_FPU_ST_H: return perf_stats_.fpu_stalls >> 32; case VX_CSR_MPM_SCRB_LSU_H:return perf_stats_.scrb_lsu >> 32;
case VX_CSR_MPM_SFU_ST: return perf_stats_.sfu_stalls & 0xffffffff; case VX_CSR_MPM_SCRB_SFU: return perf_stats_.scrb_sfu & 0xffffffff;
case VX_CSR_MPM_SFU_ST_H: return perf_stats_.sfu_stalls >> 32; case VX_CSR_MPM_SCRB_SFU_H:return perf_stats_.scrb_sfu >> 32;
case VX_CSR_MPM_SCRB_WCTL: return perf_stats_.scrb_wctl & 0xffffffff;
case VX_CSR_MPM_SCRB_WCTL_H: return perf_stats_.scrb_wctl >> 32;
case VX_CSR_MPM_SCRB_CSRS: return perf_stats_.scrb_csrs & 0xffffffff;
case VX_CSR_MPM_SCRB_CSRS_H: return perf_stats_.scrb_csrs >> 32;
case VX_CSR_MPM_IFETCHES: return perf_stats_.ifetches & 0xffffffff; case VX_CSR_MPM_IFETCHES: return perf_stats_.ifetches & 0xffffffff;
case VX_CSR_MPM_IFETCHES_H: return perf_stats_.ifetches >> 32; case VX_CSR_MPM_IFETCHES_H: return perf_stats_.ifetches >> 32;
case VX_CSR_MPM_LOADS: return perf_stats_.loads & 0xffffffff; case VX_CSR_MPM_LOADS: return perf_stats_.loads & 0xffffffff;
case VX_CSR_MPM_LOADS_H: return perf_stats_.loads >> 32; case VX_CSR_MPM_LOADS_H: return perf_stats_.loads >> 32;
case VX_CSR_MPM_STORES: return perf_stats_.stores & 0xffffffff; case VX_CSR_MPM_STORES: return perf_stats_.stores & 0xffffffff;
case VX_CSR_MPM_STORES_H: return perf_stats_.stores >> 32; case VX_CSR_MPM_STORES_H: return perf_stats_.stores >> 32;
case VX_CSR_MPM_IFETCH_LAT: return perf_stats_.ifetch_latency & 0xffffffff; case VX_CSR_MPM_IFETCH_LT: return perf_stats_.ifetch_latency & 0xffffffff;
case VX_CSR_MPM_IFETCH_LAT_H: return perf_stats_.ifetch_latency >> 32; case VX_CSR_MPM_IFETCH_LT_H: return perf_stats_.ifetch_latency >> 32;
case VX_CSR_MPM_LOAD_LAT: return perf_stats_.load_latency & 0xffffffff; case VX_CSR_MPM_LOAD_LT: return perf_stats_.load_latency & 0xffffffff;
case VX_CSR_MPM_LOAD_LAT_H: return perf_stats_.load_latency >> 32; case VX_CSR_MPM_LOAD_LT_H: return perf_stats_.load_latency >> 32;
} }
} break; } break;
case VX_DCR_MPM_CLASS_MEM: { case VX_DCR_MPM_CLASS_MEM: {
auto proc_perf = cluster_->processor()->perf_stats(); auto proc_perf = socket_->cluster()->processor()->perf_stats();
auto cluster_perf = socket_->cluster()->perf_stats();
auto socket_perf = socket_->perf_stats();
auto smem_perf = shared_mem_->perf_stats();
switch (addr) { switch (addr) {
case VX_CSR_MPM_ICACHE_READS: return proc_perf.clusters.icache.reads & 0xffffffff; case VX_CSR_MPM_ICACHE_READS: return socket_perf.icache.reads & 0xffffffff;
case VX_CSR_MPM_ICACHE_READS_H: return proc_perf.clusters.icache.reads >> 32; case VX_CSR_MPM_ICACHE_READS_H: return socket_perf.icache.reads >> 32;
case VX_CSR_MPM_ICACHE_MISS_R: return proc_perf.clusters.icache.read_misses & 0xffffffff; case VX_CSR_MPM_ICACHE_MISS_R: return socket_perf.icache.read_misses & 0xffffffff;
case VX_CSR_MPM_ICACHE_MISS_R_H: return proc_perf.clusters.icache.read_misses >> 32; case VX_CSR_MPM_ICACHE_MISS_R_H: return socket_perf.icache.read_misses >> 32;
case VX_CSR_MPM_ICACHE_MSHR_ST: return socket_perf.icache.mshr_stalls & 0xffffffff;
case VX_CSR_MPM_ICACHE_MSHR_ST_H: return socket_perf.icache.mshr_stalls >> 32;
case VX_CSR_MPM_DCACHE_READS: return proc_perf.clusters.dcache.reads & 0xffffffff; case VX_CSR_MPM_DCACHE_READS: return socket_perf.dcache.reads & 0xffffffff;
case VX_CSR_MPM_DCACHE_READS_H: return proc_perf.clusters.dcache.reads >> 32; case VX_CSR_MPM_DCACHE_READS_H: return socket_perf.dcache.reads >> 32;
case VX_CSR_MPM_DCACHE_WRITES: return proc_perf.clusters.dcache.writes & 0xffffffff; case VX_CSR_MPM_DCACHE_WRITES: return socket_perf.dcache.writes & 0xffffffff;
case VX_CSR_MPM_DCACHE_WRITES_H: return proc_perf.clusters.dcache.writes >> 32; case VX_CSR_MPM_DCACHE_WRITES_H: return socket_perf.dcache.writes >> 32;
case VX_CSR_MPM_DCACHE_MISS_R: return proc_perf.clusters.dcache.read_misses & 0xffffffff; case VX_CSR_MPM_DCACHE_MISS_R: return socket_perf.dcache.read_misses & 0xffffffff;
case VX_CSR_MPM_DCACHE_MISS_R_H: return proc_perf.clusters.dcache.read_misses >> 32; case VX_CSR_MPM_DCACHE_MISS_R_H: return socket_perf.dcache.read_misses >> 32;
case VX_CSR_MPM_DCACHE_MISS_W: return proc_perf.clusters.dcache.write_misses & 0xffffffff; case VX_CSR_MPM_DCACHE_MISS_W: return socket_perf.dcache.write_misses & 0xffffffff;
case VX_CSR_MPM_DCACHE_MISS_W_H: return proc_perf.clusters.dcache.write_misses >> 32; case VX_CSR_MPM_DCACHE_MISS_W_H: return socket_perf.dcache.write_misses >> 32;
case VX_CSR_MPM_DCACHE_BANK_ST: return proc_perf.clusters.dcache.bank_stalls & 0xffffffff; case VX_CSR_MPM_DCACHE_BANK_ST: return socket_perf.dcache.bank_stalls & 0xffffffff;
case VX_CSR_MPM_DCACHE_BANK_ST_H:return proc_perf.clusters.dcache.bank_stalls >> 32; case VX_CSR_MPM_DCACHE_BANK_ST_H: return socket_perf.dcache.bank_stalls >> 32;
case VX_CSR_MPM_DCACHE_MSHR_ST: return proc_perf.clusters.dcache.mshr_stalls & 0xffffffff; case VX_CSR_MPM_DCACHE_MSHR_ST: return socket_perf.dcache.mshr_stalls & 0xffffffff;
case VX_CSR_MPM_DCACHE_MSHR_ST_H:return proc_perf.clusters.dcache.mshr_stalls >> 32; case VX_CSR_MPM_DCACHE_MSHR_ST_H: return socket_perf.dcache.mshr_stalls >> 32;
case VX_CSR_MPM_SMEM_READS: return proc_perf.clusters.sharedmem.reads & 0xffffffff; case VX_CSR_MPM_L2CACHE_READS: return cluster_perf.l2cache.reads & 0xffffffff;
case VX_CSR_MPM_SMEM_READS_H: return proc_perf.clusters.sharedmem.reads >> 32; case VX_CSR_MPM_L2CACHE_READS_H: return cluster_perf.l2cache.reads >> 32;
case VX_CSR_MPM_SMEM_WRITES: return proc_perf.clusters.sharedmem.writes & 0xffffffff; case VX_CSR_MPM_L2CACHE_WRITES: return cluster_perf.l2cache.writes & 0xffffffff;
case VX_CSR_MPM_SMEM_WRITES_H: return proc_perf.clusters.sharedmem.writes >> 32; case VX_CSR_MPM_L2CACHE_WRITES_H: return cluster_perf.l2cache.writes >> 32;
case VX_CSR_MPM_SMEM_BANK_ST: return proc_perf.clusters.sharedmem.bank_stalls & 0xffffffff; case VX_CSR_MPM_L2CACHE_MISS_R: return cluster_perf.l2cache.read_misses & 0xffffffff;
case VX_CSR_MPM_SMEM_BANK_ST_H:return proc_perf.clusters.sharedmem.bank_stalls >> 32; case VX_CSR_MPM_L2CACHE_MISS_R_H: return cluster_perf.l2cache.read_misses >> 32;
case VX_CSR_MPM_L2CACHE_MISS_W: return cluster_perf.l2cache.write_misses & 0xffffffff;
case VX_CSR_MPM_L2CACHE_READS: return proc_perf.clusters.l2cache.reads & 0xffffffff; case VX_CSR_MPM_L2CACHE_MISS_W_H: return cluster_perf.l2cache.write_misses >> 32;
case VX_CSR_MPM_L2CACHE_READS_H: return proc_perf.clusters.l2cache.reads >> 32; case VX_CSR_MPM_L2CACHE_BANK_ST: return cluster_perf.l2cache.bank_stalls & 0xffffffff;
case VX_CSR_MPM_L2CACHE_WRITES: return proc_perf.clusters.l2cache.writes & 0xffffffff; case VX_CSR_MPM_L2CACHE_BANK_ST_H:return cluster_perf.l2cache.bank_stalls >> 32;
case VX_CSR_MPM_L2CACHE_WRITES_H: return proc_perf.clusters.l2cache.writes >> 32; case VX_CSR_MPM_L2CACHE_MSHR_ST: return cluster_perf.l2cache.mshr_stalls & 0xffffffff;
case VX_CSR_MPM_L2CACHE_MISS_R: return proc_perf.clusters.l2cache.read_misses & 0xffffffff; case VX_CSR_MPM_L2CACHE_MSHR_ST_H:return cluster_perf.l2cache.mshr_stalls >> 32;
case VX_CSR_MPM_L2CACHE_MISS_R_H: return proc_perf.clusters.l2cache.read_misses >> 32;
case VX_CSR_MPM_L2CACHE_MISS_W: return proc_perf.clusters.l2cache.write_misses & 0xffffffff;
case VX_CSR_MPM_L2CACHE_MISS_W_H: return proc_perf.clusters.l2cache.write_misses >> 32;
case VX_CSR_MPM_L2CACHE_BANK_ST: return proc_perf.clusters.l2cache.bank_stalls & 0xffffffff;
case VX_CSR_MPM_L2CACHE_BANK_ST_H:return proc_perf.clusters.l2cache.bank_stalls >> 32;
case VX_CSR_MPM_L2CACHE_MSHR_ST: return proc_perf.clusters.l2cache.mshr_stalls & 0xffffffff;
case VX_CSR_MPM_L2CACHE_MSHR_ST_H:return proc_perf.clusters.l2cache.mshr_stalls >> 32;
case VX_CSR_MPM_L3CACHE_READS: return proc_perf.l3cache.reads & 0xffffffff; case VX_CSR_MPM_L3CACHE_READS: return proc_perf.l3cache.reads & 0xffffffff;
case VX_CSR_MPM_L3CACHE_READS_H: return proc_perf.l3cache.reads >> 32; case VX_CSR_MPM_L3CACHE_READS_H: return proc_perf.l3cache.reads >> 32;
@@ -612,14 +660,25 @@ uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
case VX_CSR_MPM_L3CACHE_MSHR_ST: return proc_perf.l3cache.mshr_stalls & 0xffffffff; case VX_CSR_MPM_L3CACHE_MSHR_ST: return proc_perf.l3cache.mshr_stalls & 0xffffffff;
case VX_CSR_MPM_L3CACHE_MSHR_ST_H:return proc_perf.l3cache.mshr_stalls >> 32; case VX_CSR_MPM_L3CACHE_MSHR_ST_H:return proc_perf.l3cache.mshr_stalls >> 32;
case VX_CSR_MPM_MEM_READS: return proc_perf.mem_reads & 0xffffffff; case VX_CSR_MPM_MEM_READS: return proc_perf.mem_reads & 0xffffffff;
case VX_CSR_MPM_MEM_READS_H: return proc_perf.mem_reads >> 32; case VX_CSR_MPM_MEM_READS_H: return proc_perf.mem_reads >> 32;
case VX_CSR_MPM_MEM_WRITES: return proc_perf.mem_writes & 0xffffffff; case VX_CSR_MPM_MEM_WRITES: return proc_perf.mem_writes & 0xffffffff;
case VX_CSR_MPM_MEM_WRITES_H:return proc_perf.mem_writes >> 32; case VX_CSR_MPM_MEM_WRITES_H: return proc_perf.mem_writes >> 32;
case VX_CSR_MPM_MEM_LAT: return proc_perf.mem_latency & 0xffffffff; case VX_CSR_MPM_MEM_LT: return proc_perf.mem_latency & 0xffffffff;
case VX_CSR_MPM_MEM_LAT_H: return proc_perf.mem_latency >> 32; case VX_CSR_MPM_MEM_LT_H : return proc_perf.mem_latency >> 32;
case VX_CSR_MPM_SMEM_READS: return smem_perf.reads & 0xffffffff;
case VX_CSR_MPM_SMEM_READS_H: return smem_perf.reads >> 32;
case VX_CSR_MPM_SMEM_WRITES: return smem_perf.writes & 0xffffffff;
case VX_CSR_MPM_SMEM_WRITES_H: return smem_perf.writes >> 32;
case VX_CSR_MPM_SMEM_BANK_ST: return smem_perf.bank_stalls & 0xffffffff;
case VX_CSR_MPM_SMEM_BANK_ST_H: return smem_perf.bank_stalls >> 32;
} }
} break; } break;
default: {
std::cout << std::dec << "Error: invalid MPM CLASS: value=" << perf_class << std::endl;
std::abort();
} break;
} }
} else { } else {
std::cout << std::hex << "Error: invalid CSR read addr=0x" << addr << std::endl; std::cout << std::hex << "Error: invalid CSR read addr=0x" << addr << std::endl;

View File

@@ -22,11 +22,11 @@
#include <memory> #include <memory>
#include <set> #include <set>
#include <simobject.h> #include <simobject.h>
#include <mem.h>
#include "debug.h" #include "debug.h"
#include "types.h" #include "types.h"
#include "arch.h" #include "arch.h"
#include "decode.h" #include "decode.h"
#include "mem.h"
#include "warp.h" #include "warp.h"
#include "pipeline.h" #include "pipeline.h"
#include "cache_sim.h" #include "cache_sim.h"
@@ -40,19 +40,25 @@
namespace vortex { namespace vortex {
class Cluster; class Socket;
using TraceSwitch = Mux<pipeline_trace_t*>;
class Core : public SimObject<Core> { class Core : public SimObject<Core> {
public: public:
struct PerfStats { struct PerfStats {
uint64_t cycles; uint64_t cycles;
uint64_t instrs; uint64_t instrs;
uint64_t sched_idle;
uint64_t sched_stalls;
uint64_t ibuf_stalls; uint64_t ibuf_stalls;
uint64_t scrb_stalls; uint64_t scrb_stalls;
uint64_t alu_stalls; uint64_t scrb_alu;
uint64_t lsu_stalls; uint64_t scrb_fpu;
uint64_t fpu_stalls; uint64_t scrb_lsu;
uint64_t sfu_stalls; uint64_t scrb_sfu;
uint64_t scrb_wctl;
uint64_t scrb_csrs;
uint64_t ifetches; uint64_t ifetches;
uint64_t loads; uint64_t loads;
uint64_t stores; uint64_t stores;
@@ -62,12 +68,16 @@ public:
PerfStats() PerfStats()
: cycles(0) : cycles(0)
, instrs(0) , instrs(0)
, sched_idle(0)
, sched_stalls(0)
, ibuf_stalls(0) , ibuf_stalls(0)
, scrb_stalls(0) , scrb_stalls(0)
, alu_stalls(0) , scrb_alu(0)
, lsu_stalls(0) , scrb_fpu(0)
, fpu_stalls(0) , scrb_lsu(0)
, sfu_stalls(0) , scrb_sfu(0)
, scrb_wctl(0)
, scrb_csrs(0)
, ifetches(0) , ifetches(0)
, loads(0) , loads(0)
, stores(0) , stores(0)
@@ -84,10 +94,9 @@ public:
Core(const SimContext& ctx, Core(const SimContext& ctx,
uint32_t core_id, uint32_t core_id,
Cluster* cluster, Socket* socket,
const Arch &arch, const Arch &arch,
const DCRS &dcrs, const DCRS &dcrs);
SharedMem::Ptr sharedmem);
~Core(); ~Core();
@@ -105,6 +114,10 @@ public:
return core_id_; return core_id_;
} }
Socket* socket() const {
return socket_;
}
const Arch& arch() const { const Arch& arch() const {
return arch_; return arch_;
} }
@@ -153,6 +166,7 @@ private:
void cout_flush(); void cout_flush();
uint32_t core_id_; uint32_t core_id_;
Socket* socket_;
const Arch& arch_; const Arch& arch_;
const DCRS &dcrs_; const DCRS &dcrs_;
@@ -167,13 +181,13 @@ private:
std::vector<Operand::Ptr> operands_; std::vector<Operand::Ptr> operands_;
std::vector<Dispatcher::Ptr> dispatchers_; std::vector<Dispatcher::Ptr> dispatchers_;
std::vector<ExeUnit::Ptr> exe_units_; std::vector<ExeUnit::Ptr> exe_units_;
SharedMem::Ptr sharedmem_; SharedMem::Ptr shared_mem_;
std::vector<SMemDemux::Ptr> smem_demuxs_;
PipelineLatch fetch_latch_; PipelineLatch fetch_latch_;
PipelineLatch decode_latch_; PipelineLatch decode_latch_;
HashTable<pipeline_trace_t*> pending_icache_; HashTable<pipeline_trace_t*> pending_icache_;
std::vector<pipeline_trace_t*> committed_traces_;
WarpMask active_warps_; WarpMask active_warps_;
WarpMask stalled_warps_; WarpMask stalled_warps_;
uint64_t issued_instrs_; uint64_t issued_instrs_;
@@ -188,9 +202,10 @@ private:
PerfStats perf_stats_; PerfStats perf_stats_;
Cluster* cluster_; std::vector<TraceSwitch::Ptr> commit_arbs_;
uint32_t commit_exe_; uint32_t commit_exe_;
uint32_t ibuffer_idx_;
friend class Warp; friend class Warp;
friend class LsuUnit; friend class LsuUnit;

View File

@@ -66,6 +66,7 @@ public:
} }
auto& output = Outputs.at(i); auto& output = Outputs.at(i);
auto trace = input.front(); auto trace = input.front();
auto new_trace = trace;
if (pid_count_ != 1) { if (pid_count_ != 1) {
auto start_p = start_p_.at(b); auto start_p = start_p_.at(b);
if (start_p == -1) { if (start_p == -1) {
@@ -82,32 +83,29 @@ public:
} }
start /= num_lanes_; start /= num_lanes_;
end /= num_lanes_; end /= num_lanes_;
auto new_trace = new pipeline_trace_t(*trace); if (start != end) {
new_trace->tmask.reset(); new_trace = new pipeline_trace_t(*trace);
for (int j = start * num_lanes_, n = j + num_lanes_; j < n; ++j) { new_trace->eop = false;
new_trace->tmask[j] = trace->tmask[j]; start_p_.at(b) = start + 1;
} } else {
new_trace->pid = start;
new_trace->sop = (start_p == 0);
if (start == end) {
new_trace->eop = 1;
start_p_.at(b) = -1; start_p_.at(b) = -1;
input.pop(); input.pop();
++block_sent; ++block_sent;
delete trace;
} else {
new_trace->eop = 0;
start_p_.at(b) = start + 1;
} }
output.send(new_trace, 1); new_trace->pid = start;
DT(3, "pipeline-dispatch: " << *new_trace); new_trace->sop = (0 == start_p);
ThreadMask tmask;
for (int j = start * num_lanes_, n = j + num_lanes_; j < n; ++j) {
tmask[j] = trace->tmask[j];
}
new_trace->tmask = tmask;
} else { } else {
trace->pid = 0; new_trace->pid = 0;
input.pop(); input.pop();
output.send(trace, 1);
DT(3, "pipeline-dispatch: " << *trace);
++block_sent; ++block_sent;
} }
DT(3, "pipeline-dispatch: " << *new_trace);
output.send(new_trace, 1);
} }
if (block_sent == block_size_) { if (block_sent == block_size_) {
batch_idx_ = (batch_idx_ + 1) % batch_count_; batch_idx_ = (batch_idx_ + 1) % batch_count_;

View File

@@ -51,8 +51,7 @@ void AluUnit::tick() {
assert(core_->stalled_warps_.test(trace->wid)); assert(core_->stalled_warps_.test(trace->wid));
core_->stalled_warps_.reset(trace->wid); core_->stalled_warps_.reset(trace->wid);
} }
auto time = input.pop(); input.pop();
core_->perf_stats_.alu_stalls += (SimPlatform::instance().cycles() - time);
} }
} }
@@ -87,8 +86,7 @@ void FpuUnit::tick() {
std::abort(); std::abort();
} }
DT(3, "pipeline-execute: op=" << trace->fpu_type << ", " << *trace); DT(3, "pipeline-execute: op=" << trace->fpu_type << ", " << *trace);
auto time = input.pop(); input.pop();
core_->perf_stats_.fpu_stalls += (SimPlatform::instance().cycles() - time);
} }
} }
@@ -114,7 +112,7 @@ void LsuUnit::tick() {
// handle dcache response // handle dcache response
for (uint32_t t = 0; t < num_lanes_; ++t) { for (uint32_t t = 0; t < num_lanes_; ++t) {
auto& dcache_rsp_port = core_->dcache_rsp_ports.at(t); auto& dcache_rsp_port = core_->smem_demuxs_.at(t)->RspIn;
if (dcache_rsp_port.empty()) if (dcache_rsp_port.empty())
continue; continue;
auto& mem_rsp = dcache_rsp_port.front(); auto& mem_rsp = dcache_rsp_port.front();
@@ -136,7 +134,7 @@ void LsuUnit::tick() {
// handle shared memory response // handle shared memory response
for (uint32_t t = 0; t < num_lanes_; ++t) { for (uint32_t t = 0; t < num_lanes_; ++t) {
auto& smem_rsp_port = core_->sharedmem_->Outputs.at(t); auto& smem_rsp_port = core_->shared_mem_->Outputs.at(t);
if (smem_rsp_port.empty()) if (smem_rsp_port.empty())
continue; continue;
auto& mem_rsp = smem_rsp_port.front(); auto& mem_rsp = smem_rsp_port.front();
@@ -184,8 +182,7 @@ void LsuUnit::tick() {
fence_lock_ = true; fence_lock_ = true;
DT(3, "fence-lock: " << *trace); DT(3, "fence-lock: " << *trace);
// remove input // remove input
auto time = input.pop(); input.pop();
core_->perf_stats_.lsu_stalls += (SimPlatform::instance().cycles() - time);
break; break;
} }
@@ -213,7 +210,9 @@ void LsuUnit::tick() {
auto mem_addr = trace_data->mem_addrs.at(t).addr & ~addr_mask; auto mem_addr = trace_data->mem_addrs.at(t).addr & ~addr_mask;
matches += (addr0 == mem_addr); matches += (addr0 == mem_addr);
} }
#ifdef LSU_DUP_ENABLE
is_dup = (matches == trace->tmask.count()); is_dup = (matches == trace->tmask.count());
#endif
} }
uint32_t addr_count; uint32_t addr_count;
@@ -229,7 +228,7 @@ void LsuUnit::tick() {
if (!trace->tmask.test(t0 + t)) if (!trace->tmask.test(t0 + t))
continue; continue;
auto& dcache_req_port = core_->dcache_req_ports.at(t); auto& dcache_req_port = core_->smem_demuxs_.at(t)->ReqIn;
auto mem_addr = trace_data->mem_addrs.at(t); auto mem_addr = trace_data->mem_addrs.at(t);
auto type = core_->get_addr_type(mem_addr.addr); auto type = core_->get_addr_type(mem_addr.addr);
@@ -241,12 +240,16 @@ void LsuUnit::tick() {
mem_req.cid = trace->cid; mem_req.cid = trace->cid;
mem_req.uuid = trace->uuid; mem_req.uuid = trace->uuid;
dcache_req_port.send(mem_req, 2); dcache_req_port.send(mem_req, 1);
DT(3, "dcache-req: addr=0x" << std::hex << mem_req.addr << ", tag=" << tag DT(3, "dcache-req: addr=0x" << std::hex << mem_req.addr << ", tag=" << tag
<< ", lsu_type=" << trace->lsu_type << ", tid=" << t << ", addr_type=" << mem_req.type << ", " << *trace); << ", lsu_type=" << trace->lsu_type << ", tid=" << t << ", addr_type=" << mem_req.type << ", " << *trace);
++pending_loads_; if (is_write) {
++core_->perf_stats_.loads; ++core_->perf_stats_.stores;
} else {
++core_->perf_stats_.loads;
++pending_loads_;
}
if (is_dup) if (is_dup)
break; break;
} }
@@ -255,12 +258,10 @@ void LsuUnit::tick() {
if (is_write) { if (is_write) {
pending_rd_reqs_.release(tag); pending_rd_reqs_.release(tag);
output.send(trace, 1); output.send(trace, 1);
++core_->perf_stats_.stores;
} }
// remove input // remove input
auto time = input.pop(); input.pop();
core_->perf_stats_.lsu_stalls += (SimPlatform::instance().cycles() - time);
break; // single block break; // single block
} }
@@ -318,10 +319,7 @@ void SfuUnit::tick() {
core_->stalled_warps_.reset(trace->wid); core_->stalled_warps_.reset(trace->wid);
} }
auto time = input.pop(); input.pop();
auto stalls = (SimPlatform::instance().cycles() - time);
core_->perf_stats_.sfu_stalls += stalls;
break; // single block break; // single block
} }

Some files were not shown because too many files have changed in this diff Show More