Merge remote-tracking branch 'upstream/master' into kernels
This commit is contained in:
@@ -37,8 +37,8 @@ jobs:
|
|||||||
script:
|
script:
|
||||||
- rm -rf $HOME/build32 && cp -r $PWD $HOME/build32
|
- rm -rf $HOME/build32 && cp -r $PWD $HOME/build32
|
||||||
- rm -rf $HOME/build64 && cp -r $PWD $HOME/build64
|
- rm -rf $HOME/build64 && cp -r $PWD $HOME/build64
|
||||||
- make -C $HOME/build32
|
- make -C $HOME/build32 > /dev/null
|
||||||
- XLEN=64 RISCV_TOOLCHAIN_PATH=$TOOLDIR/riscv64-gnu-toolchain make -C $HOME/build64
|
- XLEN=64 make -C $HOME/build64 > /dev/null
|
||||||
- stage: test
|
- stage: test
|
||||||
name: unittest
|
name: unittest
|
||||||
script: cp -r $HOME/build32 build && cd build && ./ci/travis_run.py ./ci/regression.sh --unittest
|
script: cp -r $HOME/build32 build && cd build && ./ci/travis_run.py ./ci/regression.sh --unittest
|
||||||
@@ -47,13 +47,13 @@ jobs:
|
|||||||
script: cp -r $HOME/build32 build && cd build && ./ci/travis_run.py ./ci/regression.sh --isa
|
script: cp -r $HOME/build32 build && cd build && ./ci/travis_run.py ./ci/regression.sh --isa
|
||||||
- stage: test
|
- stage: test
|
||||||
name: isa64
|
name: isa64
|
||||||
script: cp -r $HOME/build64 build && cd build && XLEN=64 RISCV_TOOLCHAIN_PATH=$TOOLDIR/riscv64-gnu-toolchain ./ci/travis_run.py ./ci/regression.sh --isa
|
script: cp -r $HOME/build64 build && cd build && XLEN=64 ./ci/travis_run.py ./ci/regression.sh --isa
|
||||||
- stage: test
|
- stage: test
|
||||||
name: regression
|
name: regression
|
||||||
script: cp -r $HOME/build32 build && cd build && ./ci/travis_run.py ./ci/regression.sh --regression
|
script: cp -r $HOME/build32 build && cd build && ./ci/travis_run.py ./ci/regression.sh --regression
|
||||||
- stage: test
|
- stage: test
|
||||||
name: regression64
|
name: regression64
|
||||||
script: cp -r $HOME/build64 build && cd build && XLEN=64 RISCV_TOOLCHAIN_PATH=$TOOLDIR/riscv64-gnu-toolchain ./ci/travis_run.py ./ci/regression.sh --regression
|
script: cp -r $HOME/build64 build && cd build && XLEN=64 ./ci/travis_run.py ./ci/regression.sh --regression
|
||||||
- stage: test
|
- stage: test
|
||||||
name: opencl
|
name: opencl
|
||||||
script: cp -r $HOME/build32 build && cd build && ./ci/travis_run.py ./ci/regression.sh --opencl
|
script: cp -r $HOME/build32 build && cd build && ./ci/travis_run.py ./ci/regression.sh --opencl
|
||||||
|
|||||||
@@ -33,8 +33,9 @@ Vortex is a full-stack open-source RISC-V GPGPU.
|
|||||||
- `miscs`: Miscellaneous resources.
|
- `miscs`: Miscellaneous resources.
|
||||||
|
|
||||||
## Build Instructions
|
## Build Instructions
|
||||||
|
More detailed build instructions can be found [here](docs/install_vortex.md).
|
||||||
### Supported OS Platforms
|
### Supported OS Platforms
|
||||||
- Ubuntu 18.04
|
- Ubuntu 18.04, 20.04
|
||||||
- Centos 7
|
- Centos 7
|
||||||
### Toolchain Dependencies
|
### Toolchain Dependencies
|
||||||
- [POCL](http://portablecl.org/)
|
- [POCL](http://portablecl.org/)
|
||||||
@@ -53,9 +54,9 @@ Vortex is a full-stack open-source RISC-V GPGPU.
|
|||||||
$ git clone --recursive https://github.com/vortexgpgpu/vortex.git
|
$ git clone --recursive https://github.com/vortexgpgpu/vortex.git
|
||||||
$ cd vortex
|
$ cd vortex
|
||||||
### Install prebuilt toolchain
|
### Install prebuilt toolchain
|
||||||
By default, the toolchain will install to /opt folder.
|
By default, the toolchain installs to the /opt folder, which requires sudo access.
|
||||||
You can install the toolchain to a different directory by overriding TOOLDIR (e.g. export TOOLDIR=$HOME/tools).
|
You can install the toolchain to a different location of your choice by setting TOOLDIR (e.g. export TOOLDIR=$HOME/tools).
|
||||||
|
$ export TOOLDIR=/opt
|
||||||
$ ./ci/toolchain_install.sh --all
|
$ ./ci/toolchain_install.sh --all
|
||||||
$ source ./ci/toolchain_env.sh
|
$ source ./ci/toolchain_env.sh
|
||||||
### Build Vortex sources
|
### Build Vortex sources
|
||||||
|
|||||||
4
RELEASE
4
RELEASE
@@ -1,4 +0,0 @@
|
|||||||
|
|
||||||
Release Notes!
|
|
||||||
|
|
||||||
* 07/01/2020 - LKG FPGA build - Passed basic, demo, vecadd kernels.
|
|
||||||
23
TODO
23
TODO
@@ -1,23 +0,0 @@
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
Functionality:
|
|
||||||
1) vx_cl_warpSpawn()
|
|
||||||
-> To be used by pocl->ops->run
|
|
||||||
|
|
||||||
2) newlib Integration (LoadFile(""))
|
|
||||||
-> To be used by the Rhinio benchmarks
|
|
||||||
|
|
||||||
3) POCL OPS Vortex Suite
|
|
||||||
|
|
||||||
Performance:
|
|
||||||
1) Icache doesn't need SEND_MEM_REQUEST Stage
|
|
||||||
-> Blocks are never dirty, so why not evict right away
|
|
||||||
|
|
||||||
2) Branch not taken speculation
|
|
||||||
|
|
||||||
3) Runtime -02 not running on RTL, and -03 not running on RTL and Emulator
|
|
||||||
|
|
||||||
|
|
||||||
Vector:
|
|
||||||
1) Cycle accurate simulator (would require Cache Simulator)
|
|
||||||
159
ci/blackbox.sh
159
ci/blackbox.sh
@@ -16,7 +16,17 @@
|
|||||||
show_usage()
|
show_usage()
|
||||||
{
|
{
|
||||||
echo "Vortex BlackBox Test Driver v1.0"
|
echo "Vortex BlackBox Test Driver v1.0"
|
||||||
echo "Usage: $0 [[--clusters=#n] [--cores=#n] [--warps=#n] [--threads=#n] [--l2cache] [--l3cache] [[--driver=#name] [--app=#app] [--args=#args] [--debug=#level] [--scope] [--perf=#class] [--rebuild=0|1] [--log=logfile] [--help]]"
|
echo "Usage: $0 [[--clusters=#n] [--cores=#n] [--warps=#n] [--threads=#n] [--l2cache] [--l3cache] [[--driver=#name] [--app=#app] [--args=#args] [--debug=#level] [--scope] [--perf=#class] [--rebuild=#n] [--log=logfile] [--help]]"
|
||||||
|
}
|
||||||
|
|
||||||
|
show_help()
|
||||||
|
{
|
||||||
|
show_usage
|
||||||
|
echo " where"
|
||||||
|
echo "--driver: simx, rtlsim, opae, xrt"
|
||||||
|
echo "--app: any subfolder test under regression or opencl"
|
||||||
|
echo "--perf: 0=disable, 1=pipeline, 2=memsys"
|
||||||
|
echo "--rebuild: 0=disable, 1=force, 2=auto, 3=temp"
|
||||||
}
|
}
|
||||||
|
|
||||||
SCRIPT_DIR=$(dirname "$0")
|
SCRIPT_DIR=$(dirname "$0")
|
||||||
@@ -36,6 +46,7 @@ SCOPE=0
|
|||||||
HAS_ARGS=0
|
HAS_ARGS=0
|
||||||
PERF_CLASS=0
|
PERF_CLASS=0
|
||||||
REBUILD=2
|
REBUILD=2
|
||||||
|
TEMPBUILD=0
|
||||||
LOGFILE=run.log
|
LOGFILE=run.log
|
||||||
|
|
||||||
for i in "$@"
|
for i in "$@"
|
||||||
@@ -102,7 +113,7 @@ case $i in
|
|||||||
shift
|
shift
|
||||||
;;
|
;;
|
||||||
--help)
|
--help)
|
||||||
show_usage
|
show_help
|
||||||
exit 0
|
exit 0
|
||||||
;;
|
;;
|
||||||
*)
|
*)
|
||||||
@@ -112,6 +123,12 @@ case $i in
|
|||||||
esac
|
esac
|
||||||
done
|
done
|
||||||
|
|
||||||
|
if [ $REBUILD -eq 3 ];
|
||||||
|
then
|
||||||
|
REBUILD=1
|
||||||
|
TEMPBUILD=1
|
||||||
|
fi
|
||||||
|
|
||||||
case $DRIVER in
|
case $DRIVER in
|
||||||
simx)
|
simx)
|
||||||
DRIVER_PATH=$VORTEX_HOME/runtime/simx
|
DRIVER_PATH=$VORTEX_HOME/runtime/simx
|
||||||
@@ -174,53 +191,119 @@ make -C $VORTEX_HOME/runtime/stub > /dev/null
|
|||||||
|
|
||||||
if [ $DEBUG -ne 0 ]
|
if [ $DEBUG -ne 0 ]
|
||||||
then
|
then
|
||||||
# driver initialization
|
|
||||||
if [ $SCOPE -eq 1 ]
|
|
||||||
then
|
|
||||||
echo "running: DEBUG=$DEBUG_LEVEL SCOPE=1 CONFIGS="$CONFIGS" make -C $DRIVER_PATH"
|
|
||||||
DEBUG=$DEBUG_LEVEL SCOPE=1 CONFIGS="$CONFIGS" make -C $DRIVER_PATH > /dev/null
|
|
||||||
else
|
|
||||||
echo "running: DEBUG=$DEBUG_LEVEL CONFIGS="$CONFIGS" make -C $DRIVER_PATH"
|
|
||||||
DEBUG=$DEBUG_LEVEL CONFIGS="$CONFIGS" make -C $DRIVER_PATH > /dev/null
|
|
||||||
fi
|
|
||||||
|
|
||||||
# running application
|
# running application
|
||||||
if [ $HAS_ARGS -eq 1 ]
|
if [ $TEMPBUILD -eq 1 ]
|
||||||
then
|
then
|
||||||
echo "running: OPTS=$ARGS make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1"
|
# setup temp directory
|
||||||
OPTS=$ARGS make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1
|
TEMPDIR=$(mktemp -d)
|
||||||
status=$?
|
mkdir -p "$TEMPDIR/$DRIVER"
|
||||||
|
|
||||||
|
# driver initialization
|
||||||
|
if [ $SCOPE -eq 1 ]
|
||||||
|
then
|
||||||
|
echo "running: DESTDIR=$TEMPDIR/$DRIVER DEBUG=$DEBUG_LEVEL SCOPE=1 CONFIGS=$CONFIGS make -C $DRIVER_PATH"
|
||||||
|
DESTDIR="$TEMPDIR/$DRIVER" DEBUG=$DEBUG_LEVEL SCOPE=1 CONFIGS="$CONFIGS" make -C $DRIVER_PATH > /dev/null
|
||||||
|
else
|
||||||
|
echo "running: DESTDIR=$TEMPDIR/$DRIVER DEBUG=$DEBUG_LEVEL CONFIGS=$CONFIGS make -C $DRIVER_PATH"
|
||||||
|
DESTDIR="$TEMPDIR/$DRIVER" DEBUG=$DEBUG_LEVEL CONFIGS="$CONFIGS" make -C $DRIVER_PATH > /dev/null
|
||||||
|
fi
|
||||||
|
|
||||||
|
# running application
|
||||||
|
if [ $HAS_ARGS -eq 1 ]
|
||||||
|
then
|
||||||
|
echo "running: VORTEX_RT_PATH=$TEMPDIR OPTS=$ARGS make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1"
|
||||||
|
VORTEX_RT_PATH=$TEMPDIR OPTS=$ARGS make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1
|
||||||
|
status=$?
|
||||||
|
else
|
||||||
|
echo "running: VORTEX_RT_PATH=$TEMPDIR make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1"
|
||||||
|
VORTEX_RT_PATH=$TEMPDIR make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1
|
||||||
|
status=$?
|
||||||
|
fi
|
||||||
|
|
||||||
|
# cleanup temp directory
|
||||||
|
trap "rm -rf $TEMPDIR" EXIT
|
||||||
else
|
else
|
||||||
echo "running: make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1"
|
# driver initialization
|
||||||
make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1
|
if [ $SCOPE -eq 1 ]
|
||||||
status=$?
|
then
|
||||||
|
echo "running: DEBUG=$DEBUG_LEVEL SCOPE=1 CONFIGS=$CONFIGS make -C $DRIVER_PATH"
|
||||||
|
DEBUG=$DEBUG_LEVEL SCOPE=1 CONFIGS="$CONFIGS" make -C $DRIVER_PATH > /dev/null
|
||||||
|
else
|
||||||
|
echo "running: DEBUG=$DEBUG_LEVEL CONFIGS=$CONFIGS make -C $DRIVER_PATH"
|
||||||
|
DEBUG=$DEBUG_LEVEL CONFIGS="$CONFIGS" make -C $DRIVER_PATH > /dev/null
|
||||||
|
fi
|
||||||
|
|
||||||
|
# running application
|
||||||
|
if [ $HAS_ARGS -eq 1 ]
|
||||||
|
then
|
||||||
|
echo "running: OPTS=$ARGS make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1"
|
||||||
|
OPTS=$ARGS make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1
|
||||||
|
status=$?
|
||||||
|
else
|
||||||
|
echo "running: make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1"
|
||||||
|
make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1
|
||||||
|
status=$?
|
||||||
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [ -f "$APP_PATH/trace.vcd" ]
|
if [ -f "$APP_PATH/trace.vcd" ]
|
||||||
then
|
then
|
||||||
mv -f $APP_PATH/trace.vcd .
|
mv -f $APP_PATH/trace.vcd .
|
||||||
fi
|
fi
|
||||||
else
|
else
|
||||||
# driver initialization
|
if [ $TEMPBUILD -eq 1 ]
|
||||||
if [ $SCOPE -eq 1 ]
|
|
||||||
then
|
then
|
||||||
echo "running: SCOPE=1 CONFIGS="$CONFIGS" make -C $DRIVER_PATH"
|
# setup temp directory
|
||||||
SCOPE=1 CONFIGS="$CONFIGS" make -C $DRIVER_PATH > /dev/null
|
TEMPDIR=$(mktemp -d)
|
||||||
|
mkdir -p "$TEMPDIR/$DRIVER"
|
||||||
|
|
||||||
|
# driver initialization
|
||||||
|
if [ $SCOPE -eq 1 ]
|
||||||
|
then
|
||||||
|
echo "running: DESTDIR=$TEMPDIR/$DRIVER SCOPE=1 CONFIGS=$CONFIGS make -C $DRIVER_PATH"
|
||||||
|
DESTDIR="$TEMPDIR/$DRIVER" SCOPE=1 CONFIGS="$CONFIGS" make -C $DRIVER_PATH > /dev/null
|
||||||
|
else
|
||||||
|
echo "running: DESTDIR=$TEMPDIR/$DRIVER CONFIGS=$CONFIGS make -C $DRIVER_PATH"
|
||||||
|
DESTDIR="$TEMPDIR/$DRIVER" CONFIGS="$CONFIGS" make -C $DRIVER_PATH > /dev/null
|
||||||
|
fi
|
||||||
|
|
||||||
|
# running application
|
||||||
|
if [ $HAS_ARGS -eq 1 ]
|
||||||
|
then
|
||||||
|
echo "running: VORTEX_RT_PATH=$TEMPDIR OPTS=$ARGS make -C $APP_PATH run-$DRIVER"
|
||||||
|
VORTEX_RT_PATH=$TEMPDIR OPTS=$ARGS make -C $APP_PATH run-$DRIVER
|
||||||
|
status=$?
|
||||||
|
else
|
||||||
|
echo "running: VORTEX_RT_PATH=$TEMPDIR make -C $APP_PATH run-$DRIVER"
|
||||||
|
VORTEX_RT_PATH=$TEMPDIR make -C $APP_PATH run-$DRIVER
|
||||||
|
status=$?
|
||||||
|
fi
|
||||||
|
|
||||||
|
# cleanup temp directory
|
||||||
|
trap "rm -rf $TEMPDIR" EXIT
|
||||||
else
|
else
|
||||||
echo "running: CONFIGS="$CONFIGS" make -C $DRIVER_PATH"
|
|
||||||
CONFIGS="$CONFIGS" make -C $DRIVER_PATH > /dev/null
|
# driver initialization
|
||||||
fi
|
if [ $SCOPE -eq 1 ]
|
||||||
|
then
|
||||||
# running application
|
echo "running: SCOPE=1 CONFIGS=$CONFIGS make -C $DRIVER_PATH"
|
||||||
if [ $HAS_ARGS -eq 1 ]
|
SCOPE=1 CONFIGS="$CONFIGS" make -C $DRIVER_PATH > /dev/null
|
||||||
then
|
else
|
||||||
echo "running: OPTS=$ARGS make -C $APP_PATH run-$DRIVER"
|
echo "running: CONFIGS=$CONFIGS make -C $DRIVER_PATH"
|
||||||
OPTS=$ARGS make -C $APP_PATH run-$DRIVER
|
CONFIGS="$CONFIGS" make -C $DRIVER_PATH > /dev/null
|
||||||
status=$?
|
fi
|
||||||
else
|
|
||||||
echo "running: make -C $APP_PATH run-$DRIVER"
|
# running application
|
||||||
make -C $APP_PATH run-$DRIVER
|
if [ $HAS_ARGS -eq 1 ]
|
||||||
status=$?
|
then
|
||||||
|
echo "running: OPTS=$ARGS make -C $APP_PATH run-$DRIVER"
|
||||||
|
OPTS=$ARGS make -C $APP_PATH run-$DRIVER
|
||||||
|
status=$?
|
||||||
|
else
|
||||||
|
echo "running: make -C $APP_PATH run-$DRIVER"
|
||||||
|
make -C $APP_PATH run-$DRIVER
|
||||||
|
status=$?
|
||||||
|
fi
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
|||||||
@@ -22,7 +22,7 @@ rm -f blackbox.*.cache
|
|||||||
unittest()
|
unittest()
|
||||||
{
|
{
|
||||||
make -C tests/unittest run
|
make -C tests/unittest run
|
||||||
make -C hw/unittest
|
make -C hw/unittest > /dev/null
|
||||||
}
|
}
|
||||||
|
|
||||||
isa()
|
isa()
|
||||||
@@ -31,33 +31,36 @@ echo "begin isa tests..."
|
|||||||
|
|
||||||
make -C tests/riscv/isa run-simx
|
make -C tests/riscv/isa run-simx
|
||||||
make -C tests/riscv/isa run-rtlsim
|
make -C tests/riscv/isa run-rtlsim
|
||||||
CONFIGS="-DDPI_DISABLE" make -C tests/riscv/isa run-rtlsim
|
|
||||||
|
|
||||||
make -C sim/rtlsim clean && CONFIGS="-DFPU_FPNEW" make -C sim/rtlsim
|
make -C sim/rtlsim clean && CONFIGS="-DDPI_DISABLE" make -C sim/rtlsim > /dev/null
|
||||||
|
make -C tests/riscv/isa run-rtlsim
|
||||||
|
|
||||||
|
make -C sim/rtlsim clean && CONFIGS="-DFPU_FPNEW" make -C sim/rtlsim > /dev/null
|
||||||
make -C tests/riscv/isa run-rtlsim-32f
|
make -C tests/riscv/isa run-rtlsim-32f
|
||||||
|
|
||||||
make -C sim/rtlsim clean && CONFIGS="-DFPU_DPI" make -C sim/rtlsim
|
make -C sim/rtlsim clean && CONFIGS="-DFPU_DPI" make -C sim/rtlsim > /dev/null
|
||||||
make -C tests/riscv/isa run-rtlsim-32f
|
make -C tests/riscv/isa run-rtlsim-32f
|
||||||
|
|
||||||
make -C sim/rtlsim clean && CONFIGS="-DFPU_DSP" make -C sim/rtlsim
|
make -C sim/rtlsim clean && CONFIGS="-DFPU_DSP" make -C sim/rtlsim > /dev/null
|
||||||
make -C tests/riscv/isa run-rtlsim-32f
|
make -C tests/riscv/isa run-rtlsim-32f
|
||||||
|
|
||||||
if [ "$XLEN" == "64" ]
|
if [ "$XLEN" == "64" ]
|
||||||
then
|
then
|
||||||
make -C sim/rtlsim clean && CONFIGS="-DFPU_FPNEW" make -C sim/rtlsim
|
make -C sim/rtlsim clean && CONFIGS="-DFPU_FPNEW" make -C sim/rtlsim > /dev/null
|
||||||
make -C tests/riscv/isa run-rtlsim-64f
|
make -C tests/riscv/isa run-rtlsim-64f
|
||||||
|
|
||||||
make -C sim/rtlsim clean && CONFIGS="-DEXT_D_ENABLE -DFPU_FPNEW" make -C sim/rtlsim
|
make -C sim/rtlsim clean && CONFIGS="-DEXT_D_ENABLE -DFPU_FPNEW" make -C sim/rtlsim > /dev/null
|
||||||
make -C tests/riscv/isa run-rtlsim-64d || true
|
make -C tests/riscv/isa run-rtlsim-64d || true
|
||||||
|
|
||||||
make -C sim/rtlsim clean && CONFIGS="-DFPU_DPI" make -C sim/rtlsim
|
make -C sim/rtlsim clean && CONFIGS="-DFPU_DPI" make -C sim/rtlsim > /dev/null
|
||||||
make -C tests/riscv/isa run-rtlsim-64f
|
make -C tests/riscv/isa run-rtlsim-64f
|
||||||
|
|
||||||
make -C sim/rtlsim clean && CONFIGS="-DFPU_DSP" make -C sim/rtlsim
|
make -C sim/rtlsim clean && CONFIGS="-DFPU_DSP" make -C sim/rtlsim > /dev/null
|
||||||
make -C tests/riscv/isa run-rtlsim-64fx
|
make -C tests/riscv/isa run-rtlsim-64fx
|
||||||
fi
|
fi
|
||||||
|
|
||||||
make -C sim/rtlsim clean && make -C sim/rtlsim
|
# restore default prebuilt configuration
|
||||||
|
make -C sim/rtlsim clean && make -C sim/rtlsim > /dev/null
|
||||||
|
|
||||||
echo "isa tests done!"
|
echo "isa tests done!"
|
||||||
}
|
}
|
||||||
@@ -134,15 +137,16 @@ debug()
|
|||||||
echo "begin debugging tests..."
|
echo "begin debugging tests..."
|
||||||
|
|
||||||
# test CSV trace generation
|
# test CSV trace generation
|
||||||
make -C sim/simx clean && DEBUG=3 make -C sim/simx
|
make -C sim/simx clean && DEBUG=3 make -C sim/simx > /dev/null
|
||||||
make -C sim/rtlsim clean && DEBUG=3 CONFIGS="-DGPR_RESET" make -C sim/rtlsim
|
make -C sim/rtlsim clean && DEBUG=3 CONFIGS="-DGPR_RESET" make -C sim/rtlsim > /dev/null
|
||||||
make -C tests/riscv/isa run-simx-32im > run_simx.log
|
make -C tests/riscv/isa run-simx-32im > run_simx.log
|
||||||
make -C tests/riscv/isa run-rtlsim-32im > run_rtlsim.log
|
make -C tests/riscv/isa run-rtlsim-32im > run_rtlsim.log
|
||||||
./ci/trace_csv.py -trtlsim run_rtlsim.log -otrace_rtlsim.csv
|
./ci/trace_csv.py -trtlsim run_rtlsim.log -otrace_rtlsim.csv
|
||||||
./ci/trace_csv.py -tsimx run_simx.log -otrace_simx.csv
|
./ci/trace_csv.py -tsimx run_simx.log -otrace_simx.csv
|
||||||
diff trace_rtlsim.csv trace_simx.csv
|
diff trace_rtlsim.csv trace_simx.csv
|
||||||
make -C sim/simx clean && make -C sim/simx
|
# restore default prebuilt configuration
|
||||||
make -C sim/rtlsim clean && make -C sim/rtlsim
|
make -C sim/simx clean && make -C sim/simx > /dev/null
|
||||||
|
make -C sim/rtlsim clean && make -C sim/rtlsim > /dev/null
|
||||||
|
|
||||||
./ci/blackbox.sh --driver=opae --cores=2 --clusters=2 --l2cache --perf=1 --app=demo --args="-n1"
|
./ci/blackbox.sh --driver=opae --cores=2 --clusters=2 --l2cache --perf=1 --app=demo --args="-n1"
|
||||||
./ci/blackbox.sh --driver=simx --cores=2 --clusters=2 --l2cache --perf=1 --app=demo --args="-n1"
|
./ci/blackbox.sh --driver=simx --cores=2 --clusters=2 --l2cache --perf=1 --app=demo --args="-n1"
|
||||||
|
|||||||
@@ -16,14 +16,11 @@
|
|||||||
|
|
||||||
TOOLDIR=${TOOLDIR:=/opt}
|
TOOLDIR=${TOOLDIR:=/opt}
|
||||||
|
|
||||||
export RISCV_TOOLCHAIN_PATH=$TOOLDIR/riscv-gnu-toolchain
|
|
||||||
export LLVM_POCL=$TOOLDIR/llvm-pocl
|
|
||||||
export LLVM_VORTEX=$TOOLDIR/llvm-vortex
|
|
||||||
export VERILATOR_ROOT=$TOOLDIR/verilator
|
export VERILATOR_ROOT=$TOOLDIR/verilator
|
||||||
export PATH=$VERILATOR_ROOT/bin:$PATH
|
export PATH=$VERILATOR_ROOT/bin:$PATH
|
||||||
|
|
||||||
export SV2V_PATH=$TOOLDIR/sv2v
|
export SV2V_PATH=$TOOLDIR/sv2v
|
||||||
export PATH=$SV2V_PATH/bin:$PATH
|
export PATH=$SV2V_PATH/bin:$PATH
|
||||||
|
|
||||||
export YOSYS_PATH=$TOOLDIR/yosys
|
export YOSYS_PATH=$TOOLDIR/yosys
|
||||||
export PATH=$YOSYS_PATH/bin:$PATH
|
export PATH=$YOSYS_PATH/bin:$PATH
|
||||||
export POCL_CC_PATH=$TOOLDIR/pocl/compiler
|
|
||||||
export POCL_RT_PATH=$TOOLDIR/pocl/runtime
|
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
|
|
||||||
# Copyright © 2019-2023
|
# Copyright 2019-2023
|
||||||
#
|
#
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
# you may not use this file except in compliance with the License.
|
# you may not use this file except in compliance with the License.
|
||||||
# You may obtain a copy of the License at
|
# You may obtain a copy of the License at
|
||||||
@@ -34,11 +34,11 @@ def monitor(stop):
|
|||||||
break
|
break
|
||||||
|
|
||||||
def execute(command):
|
def execute(command):
|
||||||
process = subprocess.Popen(command, stdout=subprocess.PIPE)
|
process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
|
||||||
while True:
|
while True:
|
||||||
output = process.stdout.readline()
|
output = process.stdout.readline()
|
||||||
if output:
|
if output:
|
||||||
line = output.decode('ascii').rstrip()
|
line = output.decode('utf-8').rstrip()
|
||||||
print(">>> " + line)
|
print(">>> " + line)
|
||||||
process.stdout.flush()
|
process.stdout.flush()
|
||||||
ret = process.poll()
|
ret = process.poll()
|
||||||
|
|||||||
Binary file not shown.
|
Before Width: | Height: | Size: 60 KiB |
BIN
docs/assets/img/cache_microarchitecture.png
Normal file
BIN
docs/assets/img/cache_microarchitecture.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 207 KiB |
Binary file not shown.
|
Before Width: | Height: | Size: 77 KiB |
Binary file not shown.
|
Before Width: | Height: | Size: 67 KiB |
BIN
docs/assets/img/vortex_microarchitecture.png
Normal file
BIN
docs/assets/img/vortex_microarchitecture.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 463 KiB |
Binary file not shown.
|
Before Width: | Height: | Size: 517 KiB |
@@ -2,69 +2,26 @@
|
|||||||
|
|
||||||
The Vortex Cache Sub-system has the following main properties:
|
The Vortex Cache Sub-system has the following main properties:
|
||||||
|
|
||||||
- High-bandwidth with bank parallelism
|
- High-bandwidth transfer with Multi-bank parallelism
|
||||||
- Snoop protocol to flush data for CPU access
|
- Non-blocking pipelined architecture with local MSHR
|
||||||
- Generic design: Dcache, Icache, Shared Memory, L2 cache, L3 cache
|
- Configurable design: Dcache, Icache, L2 cache, L3 cache
|
||||||
|
|
||||||
### Cache Hierarchy
|
### Cache Microarchitecture
|
||||||
|
|
||||||

|

|
||||||
|
|
||||||
- Cache can be configured to be any level in the hierarchy
|
The Vortex cache comprises multiple parallel banks and consists of the following modules:
|
||||||
- Caches communicate via snooping
|
- **Bank request dispatch crossbar**: assigns a bank to each incoming request and resolves collisions using stalls.
|
||||||
- Cache flush from AFU is passed down the hierarchy
|
- **Bank response merge crossbar**: merges the results from the banks and forwards them to the core response.
|
||||||
|
- **Memory request multiplexer**: arbitrates between bank memory requests.
|
||||||
|
- **Memory response demultiplexer**: forwards each memory response to the corresponding bank.
|
||||||
|
- **Flush Unit**: performs tag memory initialization.
|
||||||
|
|
||||||
### VX_cache.v (Top Module)
|
Incoming requests entering the cache are sent to a dispatch crossbar that selects the corresponding bank for each request, resolving bank collisions with stalls. The result output of each bank is merged back into the outgoing response port via a merge crossbar. Each bank integrates a non-blocking pipeline with a local Miss Status Holding Register (MSHR) to reduce the miss rate. The bank pipeline consists of the following stages:
|
||||||
|
|
||||||
VX.cache.v is the top module of the cache verilog code located in the `/hw/rtl/cache` directory.
|
- **Schedule**: Selects the next request into the pipeline from the incoming core request, memory fill, or the MSHR entry, with priority given to the latter.
|
||||||
|
- **Tag Access**: A single-port read/write access to the tag store.
|
||||||
|
- **Data Access**: Single-port read/write access to the data store.
|
||||||
|
- **Response Handling**: Sends the core response back to the core.
|
||||||
|
|
||||||

|
Deadlocks inside the cache can occur when the MSHR is full and a new request is already in the pipeline. It can also occur when the memory request queue is full, and there is an incoming memory response. The cache mitigates MSHR deadlocks by using an early full signal before a new request is issued and similarly mitigates memory deadlocks by ensuring that its request queue never fills up.
|
||||||
|
|
||||||
- Configurable (Cache size, number of banks, bank line size, etc.)
|
|
||||||
- I/O signals
|
|
||||||
- Core Request
|
|
||||||
- Core Rsp
|
|
||||||
- DRAM Req
|
|
||||||
- DRAM Rsp
|
|
||||||
- Snoop Rsp
|
|
||||||
- Snoop Rsp
|
|
||||||
- Snoop Forwarding Out
|
|
||||||
- Snoop Forwarding In
|
|
||||||
- Bank Select
|
|
||||||
- Assigns valid and ready signals for each bank
|
|
||||||
- Snoop Forwarder
|
|
||||||
- DRAM Request Arbiter
|
|
||||||
- Prepares cache response for communication with DRAM
|
|
||||||
- Snoop Response Arbiter
|
|
||||||
- Sends snoop response
|
|
||||||
- Core Response Merge
|
|
||||||
- Cache accesses one line at a time. As a result, each request may not come back in the same response. This module tries to recombine the responses by thread ID.
|
|
||||||
|
|
||||||
### VX_cache_bank.v
|
|
||||||
|
|
||||||
VX_cache_bank.v is the verilog code that handles cache bank functionality and is located in the `/hw/rtl/cache` directory.
|
|
||||||
|
|
||||||

|
|
||||||
|
|
||||||
- Allows for high throughput
|
|
||||||
- Each bank contains queues to hold requests to the cache
|
|
||||||
- I/O signals
|
|
||||||
- Core request
|
|
||||||
- Core Response
|
|
||||||
- DRAM Fill Requests
|
|
||||||
- DRAM Fill Response
|
|
||||||
- DRAM WB Requests
|
|
||||||
- Snp Request
|
|
||||||
- Snp Response
|
|
||||||
- Request Priority: DRAM fill, miss reserve, core request, snoop request
|
|
||||||
- Snoop Request Queue
|
|
||||||
- DRAM Fill Queue
|
|
||||||
- Core Req Arbiter
|
|
||||||
- Requests to be processed by the bank
|
|
||||||
- Tag Data Store
|
|
||||||
- Registers for valid, dirty, dirtyb, tag, and data
|
|
||||||
- Length of registers determined by lines in the bank
|
|
||||||
- Tag Data Access:
|
|
||||||
- I/O: stall, snoop info, force request miss
|
|
||||||
- Writes to cache or sends read response; hit or miss determined here
|
|
||||||
- A missed request goes to the miss reserve if it is not a snoop request or DRAM fill
|
|
||||||
|
|||||||
36
docs/continuous_integration.md
Normal file
36
docs/continuous_integration.md
Normal file
@@ -0,0 +1,36 @@
|
|||||||
|
# Continuous Integration
|
||||||
|
- Each time you push to the repo, the Continuous Integration pipeline will run
|
||||||
|
- This pipeline consists of creating the correct development environment, building your code, and running all tests
|
||||||
|
- This is an extensive pipeline so it might take some time to complete
|
||||||
|
|
||||||
|
|
||||||
|
## Protecting Master Branch
|
||||||
|
Navigate to your Repository:
|
||||||
|
Open your repository on GitHub.
|
||||||
|
|
||||||
|
Click on "Settings":
|
||||||
|
In the upper-right corner of your repository page, click on the "Settings" tab.
|
||||||
|
|
||||||
|
Select "Branches" in the left sidebar:
|
||||||
|
On the left sidebar, look for the "Branches" option and click on it.
|
||||||
|
|
||||||
|
Choose the Branch:
|
||||||
|
Under "Branch protection rules," select the branch you want to protect. In this case, choose the main branch.
|
||||||
|
|
||||||
|
Enable Branch Protection:
|
||||||
|
Check the box that says "Protect this branch."
|
||||||
|
|
||||||
|
Configure Protection Settings:
|
||||||
|
You can configure various protection settings. Some common settings include:
|
||||||
|
|
||||||
|
Require pull request reviews before merging: This ensures that changes are reviewed before being merged.
|
||||||
|
Require status checks to pass before merging: This ensures that automated tests and checks are passing.
|
||||||
|
Require signed commits: This enforces that commits are signed with a verified signature.
|
||||||
|
Restrict Who Can Push:
|
||||||
|
You can further restrict who can push directly to the branch. You might want to limit this privilege to specific people or teams.
|
||||||
|
|
||||||
|
Save Changes:
|
||||||
|
Once you've configured the protection settings, scroll down and click on the "Save changes" button.
|
||||||
|
|
||||||
|
Now, your main branch is protected, and certain criteria must be met before changes can be pushed directly to it. Contributors will need to create pull requests, have their changes reviewed, and meet other specified criteria before the changes can be merged into the main branch.
|
||||||
|
|
||||||
18
docs/contributing.md
Normal file
18
docs/contributing.md
Normal file
@@ -0,0 +1,18 @@
|
|||||||
|
# Contributing to Vortex on Github
|
||||||
|
|
||||||
|
## Github Details
|
||||||
|
- There are two main repos, `vortex` (public, this one) and `vortex-dev` (private)
|
||||||
|
- todo: Most current development is on `vortex`
|
||||||
|
- If you have a legacy version of `vortex`, you can use the releases branch or tags to access the repo at that point in time
|
||||||
|
|
||||||
|
## Contribution Process
|
||||||
|
- You should create a new branch from develop that is clearly named with the feature that you want to add
|
||||||
|
- Avoid pushing directly to the `master` branch; instead, you will need to make a Pull Request (PR)
|
||||||
|
- There should be protections in place that prevent pushing directly to the main branch, but don't rely on it
|
||||||
|
- When you make a PR it will be tested against the continuous integration (ci) pipeline (see `continuous_integration.md`)
|
||||||
|
- It is not sufficient to just write some tests; they need to be incorporated into the ci pipeline to make sure they are run
|
||||||
|
- During a PR, you might receive feedback regarding your changes and you might need to make further commits to your branch
|
||||||
|
|
||||||
|
|
||||||
|
## Creating and Adding Tests
|
||||||
|
see `testing.md`
|
||||||
45
docs/environment_setup.md
Normal file
45
docs/environment_setup.md
Normal file
@@ -0,0 +1,45 @@
|
|||||||
|
# Environment Setup
|
||||||
|
These instructions apply to the development vortex repo using the updated toolchain. The updated toolchain is considered to be any commit of `master` pulled from July 2, 2023 onwards. The toolchain update in question can be viewed in this [commit](https://github.com/vortexgpgpu/vortex-dev/commit/0048496ba28d7b9a209a0e569d52d60f2b68fc04). Therefore, if you are unsure whether you are using the new toolchain or not, then you should check the `ci` folder for the existence of the `toolchain_prebuilt.sh` script. Furthermore, you should notice that the `toolchain_install.sh` script has the legacy `llvm()` split into `llvm-vortex()` and `llvm-pocl()`.
|
||||||
|
|
||||||
|
|
||||||
|
## Set Up on Your Own System
|
||||||
|
The toolchain binaries provided with Vortex are built on Ubuntu-based systems. To install Vortex on your own system, [follow these instructions](install_vortex.md).
|
||||||
|
|
||||||
|
|
||||||
|
## Servers for Georgia Tech Students and Collaborators
|
||||||
|
### Volvo
|
||||||
|
Volvo is a 64-core server provided by HPArch. You need valid credentials to access it. If you don't already have access, you can get in contact with your mentor to ask about setting your account up.
|
||||||
|
|
||||||
|
Setup on Volvo:
|
||||||
|
1. Connect to Georgia Tech's VPN or ssh into another machine on campus
|
||||||
|
2. `ssh volvo.cc.gatech.edu`
|
||||||
|
3. Clone Vortex to your home directory: `git clone --recursive https://github.com/vortexgpgpu/vortex.git`
|
||||||
|
4. `source /nethome/software/set_vortex_env.sh` to set up the necessary environment variables.
|
||||||
|
5. `make -s` in the `vortex` root directory
|
||||||
|
6. Run a test program: `./ci/blackbox.sh --cores=2 --app=dogfood`
|
||||||
|
|
||||||
|
### Nio
|
||||||
|
Nio is a 20-core desktop server provided by HPArch. If you have access to Volvo, you also have access to Nio.
|
||||||
|
|
||||||
|
Setup on Nio:
|
||||||
|
1. Connect to Georgia Tech's VPN or ssh into another machine on campus
|
||||||
|
2. `ssh nio.cc.gatech.edu`
|
||||||
|
3. Clone Vortex to your home directory: `git clone --recursive https://github.com/vortexgpgpu/vortex.git`
|
||||||
|
4. `source /opt/set_vortex_env_dev.sh` to set up the necessary environment variables.
|
||||||
|
5. `make -s` in the `vortex` root directory
|
||||||
|
6. Run a test program: `./ci/blackbox.sh --cores=2 --app=dogfood`
|
||||||
|
|
||||||
|
|
||||||
|
## Docker (Experimental)
|
||||||
|
Docker allows for isolated pre-built environments to be created, shared and used. The emulation mode required for ARM-based processors will incur a decrease in performance. Currently, the dockerfile is not included with the official vortex repository and is not actively maintained or supported.
|
||||||
|
|
||||||
|
### Setup with Docker
|
||||||
|
1. Clone repo recursively onto your local machine: `git clone --recursive https://github.com/vortexgpgpu/vortex.git`
|
||||||
|
2. Download the dockerfile from [here](https://github.gatech.edu/gist/usubramanya3/f1bf3e953faa38a6372e1292ffd0b65c) and place it in the root of the repo.
|
||||||
|
3. Build the Dockerfile into an image: `docker build --platform=linux/amd64 -t vortex -f dockerfile .`
|
||||||
|
4. Run a container based on the image: `docker run --rm -v ./:/root/vortex/ -it --name vtx-dev --privileged=true --platform=linux/amd64 vortex`
|
||||||
|
5. Install the toolchain: `./ci/toolchain_install.sh --all` (once per container)
|
||||||
|
6. `make -s` in `vortex` root directory
|
||||||
|
7. Run a test program: `./ci/blackbox.sh --cores=2 --app=dogfood`
|
||||||
|
|
||||||
|
You may exit from a container and later resume it, or start a second terminal session in a running container with `docker exec -it <container-name> bash`.
|
||||||
@@ -9,9 +9,6 @@ OPAE Environment Setup
|
|||||||
$ export C_INCLUDE_PATH=$OPAE_HOME/include:$C_INCLUDE_PATH
|
$ export C_INCLUDE_PATH=$OPAE_HOME/include:$C_INCLUDE_PATH
|
||||||
$ export LIBRARY_PATH=$OPAE_HOME/lib:$LIBRARY_PATH
|
$ export LIBRARY_PATH=$OPAE_HOME/lib:$LIBRARY_PATH
|
||||||
$ export LD_LIBRARY_PATH=$OPAE_HOME/lib:$LD_LIBRARY_PATH
|
$ export LD_LIBRARY_PATH=$OPAE_HOME/lib:$LD_LIBRARY_PATH
|
||||||
$ export RISCV_TOOLCHAIN_PATH=/opt/riscv-gnu-toolchain
|
|
||||||
$ export PATH=:/opt/verilator/bin:$PATH
|
|
||||||
$ export VERILATOR_ROOT=/opt/verilator
|
|
||||||
|
|
||||||
OPAE Build
|
OPAE Build
|
||||||
------------------
|
------------------
|
||||||
|
|||||||
@@ -13,7 +13,8 @@
|
|||||||
|
|
||||||
## Installation
|
## Installation
|
||||||
|
|
||||||
- Refer to the build instructions in [README](../README.md).
|
- For the different environments Vortex supports, [read this document](environment_setup.md).
|
||||||
|
- To install on your own system, [follow this document](install_vortex.md).
|
||||||
|
|
||||||
## Quick Start Scenarios
|
## Quick Start Scenarios
|
||||||
|
|
||||||
@@ -28,4 +29,4 @@ Running Vortex simulators with different configurations:
|
|||||||
|
|
||||||
- Run dogfood driver test with simx driver and Vortex config of 4 cluster, 4 cores, 8 warps, 6 threads
|
- Run dogfood driver test with simx driver and Vortex config of 4 cluster, 4 cores, 8 warps, 6 threads
|
||||||
|
|
||||||
$ ./ci/blackbox.sh --driver=simx --clusters=4 --cores=4 --warps=8 --threads=6 --app=dogfood
|
$ ./ci/blackbox.sh --driver=simx --clusters=4 --cores=4 --warps=8 --threads=6 --app=dogfood
|
||||||
|
|||||||
124
docs/install_vortex.md
Normal file
124
docs/install_vortex.md
Normal file
@@ -0,0 +1,124 @@
|
|||||||
|
# Installing and Setting Up the Vortex Environment
|
||||||
|
|
||||||
|
## Ubuntu 18.04, 20.04
|
||||||
|
|
||||||
|
1. Install the following dependencies:
|
||||||
|
|
||||||
|
```
|
||||||
|
sudo apt-get install build-essential zlib1g-dev libtinfo-dev libncurses5 uuid-dev libboost-serialization-dev libpng-dev libhwloc-dev
|
||||||
|
```
|
||||||
|
|
||||||
|
2. Upgrade gcc to 11:
|
||||||
|
|
||||||
|
```
|
||||||
|
sudo apt-get install gcc-11 g++-11
|
||||||
|
```
|
||||||
|
|
||||||
|
Multiple gcc versions on Ubuntu can be managed with update-alternatives, e.g.:
|
||||||
|
|
||||||
|
```
|
||||||
|
sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-9 9
|
||||||
|
sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-9 9
|
||||||
|
sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 11
|
||||||
|
sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-11 11
|
||||||
|
```
|
||||||
|
|
||||||
|
3. Download the Vortex codebase:
|
||||||
|
|
||||||
|
```
|
||||||
|
git clone --recursive https://github.com/vortexgpgpu/vortex.git
|
||||||
|
```
|
||||||
|
|
||||||
|
4. Install Vortex's prebuilt toolchain:
|
||||||
|
|
||||||
|
```
|
||||||
|
cd vortex
|
||||||
|
sudo ./ci/toolchain_install.sh -all
|
||||||
|
|
||||||
|
# By default, the toolchain will install to the /opt folder. This is recommended, but you can install the toolchain to a different directory by setting DESTDIR.
|
||||||
|
DESTDIR=$TOOLDIR ./ci/toolchain_install.sh -all
|
||||||
|
```
|
||||||
|
|
||||||
|
5. Set up environment:
|
||||||
|
|
||||||
|
```
|
||||||
|
export VORTEX_HOME=$TOOLDIR/vortex
|
||||||
|
export LLVM_VORTEX=$TOOLDIR/llvm-vortex
|
||||||
|
export LLVM_POCL=$TOOLDIR/llvm-pocl
|
||||||
|
export POCL_CC_PATH=$TOOLDIR/pocl/compiler
|
||||||
|
export POCL_RT_PATH=$TOOLDIR/pocl/runtime
|
||||||
|
export RISCV_TOOLCHAIN_PATH=$TOOLDIR/riscv-gnu-toolchain
|
||||||
|
export VERILATOR_ROOT=$TOOLDIR/verilator
|
||||||
|
export SV2V_PATH=$TOOLDIR/sv2v
|
||||||
|
export YOSYS_PATH=$TOOLDIR/yosys
|
||||||
|
|
||||||
|
export PATH=$YOSYS_PATH/bin:$SV2V_PATH/bin:$VERILATOR_ROOT/bin:$PATH
|
||||||
|
```
|
||||||
|
|
||||||
|
6. Build Vortex
|
||||||
|
|
||||||
|
```
|
||||||
|
make
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
## RHEL 8
|
||||||
|
Note: depending on the system, some of the toolchain may need to be recompiled for non-Ubuntu Linux. The source for the tools can be found [here](https://github.com/vortexgpgpu/).
|
||||||
|
|
||||||
|
1. Install the following dependencies:
|
||||||
|
|
||||||
|
```
|
||||||
|
sudo yum install libpng-devel boost boost-devel boost-serialization libuuid-devel opencl-headers hwloc hwloc-devel gmp-devel compat-hwloc1
|
||||||
|
```
|
||||||
|
|
||||||
|
2. Upgrade gcc to 11:
|
||||||
|
|
||||||
|
```
|
||||||
|
sudo yum install gcc-toolset-11
|
||||||
|
```
|
||||||
|
|
||||||
|
Multiple gcc versions on Red Hat can be managed with scl
|
||||||
|
|
||||||
|
3. Install MPFR 4.2.0:
|
||||||
|
|
||||||
|
Download [the source](https://ftp.gnu.org/gnu/mpfr/) and follow [the installation documentation](https://www.mpfr.org/mpfr-current/mpfr.html#How-to-Install).
|
||||||
|
|
||||||
|
4. Download the Vortex codebase:
|
||||||
|
|
||||||
|
```
|
||||||
|
git clone --recursive https://github.com/vortexgpgpu/vortex.git
|
||||||
|
```
|
||||||
|
|
||||||
|
5. Install Vortex's prebuilt toolchain:
|
||||||
|
|
||||||
|
```
|
||||||
|
cd vortex
|
||||||
|
sudo ./ci/toolchain_install.sh -all
|
||||||
|
|
||||||
|
# By default, the toolchain will install to the /opt folder. This is recommended, but you can install the toolchain to a different directory by setting DESTDIR.
|
||||||
|
DESTDIR=$TOOLDIR ./ci/toolchain_install.sh -all
|
||||||
|
```
|
||||||
|
|
||||||
|
6. Set up environment:
|
||||||
|
|
||||||
|
```
|
||||||
|
export VORTEX_HOME=$TOOLDIR/vortex
|
||||||
|
export LLVM_VORTEX=$TOOLDIR/llvm-vortex
|
||||||
|
export LLVM_POCL=$TOOLDIR/llvm-pocl
|
||||||
|
export POCL_CC_PATH=$TOOLDIR/pocl/compiler
|
||||||
|
export POCL_RT_PATH=$TOOLDIR/pocl/runtime
|
||||||
|
export RISCV_TOOLCHAIN_PATH=$TOOLDIR/riscv-gnu-toolchain
|
||||||
|
export VERILATOR_ROOT=$TOOLDIR/verilator
|
||||||
|
export SV2V_PATH=$TOOLDIR/sv2v
|
||||||
|
export YOSYS_PATH=$TOOLDIR/yosys
|
||||||
|
|
||||||
|
export PATH=$YOSYS_PATH/bin:$SV2V_PATH/bin:$VERILATOR_ROOT/bin:$PATH
|
||||||
|
|
||||||
|
export LD_LIBRARY_PATH=<path to mpfr>/src/.libs:$LD_LIBRARY_PATH
|
||||||
|
```
|
||||||
|
|
||||||
|
7. Build Vortex
|
||||||
|
|
||||||
|
```
|
||||||
|
make
|
||||||
|
```
|
||||||
@@ -24,71 +24,57 @@ Vortex uses the SIMT (Single Instruction, Multiple Threads) execution model with
|
|||||||
- Control the number of warps to activate during execution
|
- Control the number of warps to activate during execution
|
||||||
- `WSPAWN` *count, addr*: activate count warps and jump to addr location
|
- `WSPAWN` *count, addr*: activate count warps and jump to addr location
|
||||||
- **Control-Flow Divergence**
|
- **Control-Flow Divergence**
|
||||||
- Control threads to activate when a branch diverges
|
- Control thread activation when a branch diverges
|
||||||
- `SPLIT` *predicate*: apply 'taken' predicate thread mask adn save 'not-taken' into IPDOM stack
|
- `SPLIT` *taken, predicate*: apply predicate thread mask and save current state into IPDOM stack
|
||||||
- `JOIN`: restore 'not-taken' thread mask
|
- `JOIN`: pop IPDOM stack to restore thread mask
|
||||||
|
- `PRED` *predicate, restore_mask*: thread predicate instruction
|
||||||
- **Warp Synchronization**
|
- **Warp Synchronization**
|
||||||
- `BAR` *id, count*: stall warps entering barrier *id* until count is reached
|
- `BAR` *id, count*: stall warps entering barrier *id* until count is reached
|
||||||
|
|
||||||
### Vortex Pipeline/Datapath
|
### Vortex Pipeline/Datapath
|
||||||
|
|
||||||

|

|
||||||
|
|
||||||
Vortex has a 5-stage pipeline: FI | ID | Issue | EX | WB.
|
Vortex has a 6-stage pipeline:
|
||||||
|
|
||||||
|
- **Schedule**
|
||||||
|
- Warp Scheduler
|
||||||
|
- Schedule the next PC into the pipeline
|
||||||
|
- Track stalled, active warps
|
||||||
|
- IPDOM Stack
|
||||||
|
- Save split/join states for divergent threads
|
||||||
|
- Inflight Tracker
|
||||||
|
- Track in-flight instructions
|
||||||
|
|
||||||
- **Fetch**
|
- **Fetch**
|
||||||
- Warp Scheduler
|
- Retrieve instructions from memory
|
||||||
- Track stalled & active warps, resolve branches and barriers, maintain split/join IPDOM stack
|
- Handle I-cache requests/responses
|
||||||
- Instruction Cache
|
|
||||||
- Retrieve instruction from cache, issue I-cache requests/responses
|
|
||||||
- **Decode**
|
- **Decode**
|
||||||
- Decode fetched instructions, notify warp scheduler when the following instructions are decoded:
|
- Decode fetched instructions
|
||||||
- Branch, tmc, split/join, wspawn
|
- Notify warp scheduler on control instructions
|
||||||
- Precompute used_regs mask (needed for Issue stage)
|
|
||||||
- **Issue**
|
- **Issue**
|
||||||
- Scheduling
|
|
||||||
- In-order issue (operands/execute unit ready), out-of-order commit
|
|
||||||
- IBuffer
|
- IBuffer
|
||||||
- Store fetched instructions, separate queues per-warp, selects next warp through round-robin scheduling
|
- Store decoded instructions in separate per-warp queues
|
||||||
- Scoreboard
|
- Scoreboard
|
||||||
- Track in-use registers
|
- Track in-use registers
|
||||||
- GPRs (General-Purpose Registers) stage
|
- Check register use for decoded instructions
|
||||||
- Fetch issued instruction operands and send operands to execute unit
|
- Operands Collector
|
||||||
|
- Fetch the operands for issued instructions from the register file
|
||||||
- **Execute**
|
- **Execute**
|
||||||
- ALU Unit
|
- ALU Unit
|
||||||
- Single-cycle operations (+,-,>>,<<,&,|,^), Branch instructions (Share ALU resources)
|
- Handle arithmetic and branch operations
|
||||||
- MULDIV Unit
|
|
||||||
- Multiplier - done in 2 cycles
|
|
||||||
- Divider - division and remainder, done in 32 cycles
|
|
||||||
- Implements serial alogrithm (Stalls the pipeline)
|
|
||||||
- FPU Unit
|
- FPU Unit
|
||||||
- Multi-cycle operations, uses `FPnew` Library on ASIC, uses hard DSPs on FPGA
|
- Handle floating-point operations
|
||||||
- CSR Unit
|
|
||||||
- Store constant status registers - device caps, FPU status flags, performance counters
|
|
||||||
- Handle external CSR requests (requests from host CPU)
|
|
||||||
- LSU Unit
|
- LSU Unit
|
||||||
- Handle load/store operations, issue D-cache requests, handle D-cache responses
|
- Handle load/store operations
|
||||||
- Commit load responses - saves storage, Scoreboard tracks completion
|
- SFU Unit
|
||||||
- GPGPU Unit
|
- Handle warp control operations
|
||||||
- Handle GPGPU instructions
|
- Handle Control and Status Register (CSR) operations
|
||||||
- TMC, WSPAWN, SPLIT, BAR
|
|
||||||
- JOIN is handled by Warp Scheduler (upon SPLIT response)
|
|
||||||
- **Commit**
|
- **Commit**
|
||||||
- Commit
|
- Write result back to the register file and update the Scoreboard.
|
||||||
- Update CSR flags, update performance counters
|
|
||||||
- Writeback
|
### Vortex clustering architecture
|
||||||
- Write result back to GPRs, notify Scoreboard (release in-use register), select candidate instruction (ALU unit has highest priority)
|
- Sockets
|
||||||
- **Clustering**
|
- Grouping multiple cores sharing L1 cache
|
||||||
- Group mulitple cores into clusters (optionally share L2 cache)
|
- Clusters
|
||||||
- Group multiple clusters (optionally share L3 cache)
|
- Grouping of sockets sharing L2 cache
|
||||||
- Configurable at build time
|
|
||||||
- Default configuration:
|
|
||||||
- #Clusters = 1
|
|
||||||
- #Cores = 4
|
|
||||||
- #Warps = 4
|
|
||||||
- #Threads = 4
|
|
||||||
- **FPGA AFU Interface**
|
|
||||||
- Manage CPU-GPU comunication
|
|
||||||
- Query devices caps, load kernel instructions and resource buffers, start kernel execution, read destination buffers
|
|
||||||
- Local Memory - GPU access to local DRAM
|
|
||||||
- Reserved I/O addresses - redirect to host CPU, console output
|
|
||||||
@@ -2,18 +2,18 @@
|
|||||||
|
|
||||||
## Running a Vortex application
|
## Running a Vortex application
|
||||||
|
|
||||||
The framework provides a utility script: blakcbox.sh under the /ci/ folder for executing applications in the tests tree.
|
The framework provides a utility script: blackbox.sh under the /ci/ folder for executing applications in the tests tree.
|
||||||
You can query the commandline options of the tool using:
|
You can query the commandline options of the tool using:
|
||||||
|
|
||||||
$ ./ci/blakcbox.sh --help
|
$ ./ci/blackbox.sh --help
|
||||||
|
|
||||||
To execute the sgemm test program on the simx driver, passing "-n10" as an argument to sgemm:
|
To execute the sgemm test program on the simx driver, passing "-n10" as an argument to sgemm:
|
||||||
|
|
||||||
$ ./ci/blakcbox.sh --driver=simx --app=sgemm --args="-n10"
|
$ ./ci/blackbox.sh --driver=simx --app=sgemm --args="-n10"
|
||||||
|
|
||||||
You can execute the same application on a GPU architecture with 2 cores:
|
You can execute the same application on a GPU architecture with 2 cores:
|
||||||
|
|
||||||
$ ./ci/blakcbox.sh --core=2 --driver=simx --app=sgemm --args="-n10"
|
$ ./ci/blackbox.sh --cores=2 --driver=simx --app=sgemm --args="-n10"
|
||||||
|
|
||||||
When executing, Blackbox needs to recompile the driver if the desired architecture changes.
|
When executing, Blackbox needs to recompile the driver if the desired architecture changes.
|
||||||
It tracks the latest configuration in a file under the current directory blackbox.<driver>.cache.
|
It tracks the latest configuration in a file under the current directory blackbox.<driver>.cache.
|
||||||
@@ -30,4 +30,18 @@ You can execute the default regression suite by running the following commands a
|
|||||||
You can execute the default OpenCL suite by running the following commands at the root folder.
|
You can execute the default OpenCL suite by running the following commands at the root folder.
|
||||||
|
|
||||||
$ make -C tests/opencl run-simx
|
$ make -C tests/opencl run-simx
|
||||||
$ make -C tests/opencl run-rtlsim
|
$ make -C tests/opencl run-rtlsim
|
||||||
|
|
||||||
|
## Creating Your Own Regression Tests
|
||||||
|
- Inside `tests/` you will find a series of folders which are named based on what they test
|
||||||
|
- You can view the tests to see which ones have tests similar to what you are trying to create new tests for
|
||||||
|
- Once you have found a similar baseline, you can copy the folder and rename it to what you are planning to test
|
||||||
|
- `testcases.h` contains each of the test case templates
|
||||||
|
- `main.cpp` contains the implementation of each of the test cases and builds a test suite of all the test cases you want
|
||||||
|
|
||||||
|
Compile the test case: `make -C tests/regression/<testcase-name>/ clean-all && make -C tests/regression/<testcase-name>/`
|
||||||
|
|
||||||
|
Run the test case: `./ci/blackbox.sh --driver=simx --cores=4 --app=<testcase-name> --debug`
|
||||||
|
|
||||||
|
## Adding Your Tests to the CI Pipeline
|
||||||
|
See `continuous_integration.md`.
|
||||||
@@ -43,7 +43,16 @@ module VX_cluster import VX_gpu_pkg::*; #(
|
|||||||
`ifdef SCOPE
|
`ifdef SCOPE
|
||||||
localparam scope_socket = 0;
|
localparam scope_socket = 0;
|
||||||
`SCOPE_IO_SWITCH (scope_socket + `NUM_SOCKETS);
|
`SCOPE_IO_SWITCH (scope_socket + `NUM_SOCKETS);
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
|
`ifdef PERF_ENABLE
|
||||||
|
VX_mem_perf_if mem_perf_tmp_if();
|
||||||
|
assign mem_perf_tmp_if.icache = 'x;
|
||||||
|
assign mem_perf_tmp_if.dcache = 'x;
|
||||||
|
assign mem_perf_tmp_if.l3cache = mem_perf_if.l3cache;
|
||||||
|
assign mem_perf_tmp_if.smem = 'x;
|
||||||
|
assign mem_perf_tmp_if.mem = mem_perf_if.mem;
|
||||||
|
`endif
|
||||||
|
|
||||||
`ifdef GBAR_ENABLE
|
`ifdef GBAR_ENABLE
|
||||||
|
|
||||||
@@ -69,18 +78,7 @@ module VX_cluster import VX_gpu_pkg::*; #(
|
|||||||
.reset (gbar_reset),
|
.reset (gbar_reset),
|
||||||
.gbar_bus_if (gbar_bus_if)
|
.gbar_bus_if (gbar_bus_if)
|
||||||
);
|
);
|
||||||
`endif
|
|
||||||
|
|
||||||
`ifdef PERF_ENABLE
|
|
||||||
VX_mem_perf_if mem_perf_tmp_if();
|
|
||||||
cache_perf_t perf_l2cache;
|
|
||||||
|
|
||||||
assign mem_perf_tmp_if.icache = 'x;
|
|
||||||
assign mem_perf_tmp_if.dcache = 'x;
|
|
||||||
assign mem_perf_tmp_if.l2cache = perf_l2cache;
|
|
||||||
assign mem_perf_tmp_if.l3cache = mem_perf_if.l3cache;
|
|
||||||
assign mem_perf_tmp_if.smem = 'x;
|
|
||||||
assign mem_perf_tmp_if.mem = mem_perf_if.mem;
|
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
VX_mem_bus_if #(
|
VX_mem_bus_if #(
|
||||||
@@ -102,7 +100,7 @@ module VX_cluster import VX_gpu_pkg::*; #(
|
|||||||
.MSHR_SIZE (`L2_MSHR_SIZE),
|
.MSHR_SIZE (`L2_MSHR_SIZE),
|
||||||
.MRSQ_SIZE (`L2_MRSQ_SIZE),
|
.MRSQ_SIZE (`L2_MRSQ_SIZE),
|
||||||
.MREQ_SIZE (`L2_MREQ_SIZE),
|
.MREQ_SIZE (`L2_MREQ_SIZE),
|
||||||
.TAG_WIDTH (L1_MEM_ARB_TAG_WIDTH),
|
.TAG_WIDTH (L2_TAG_WIDTH),
|
||||||
.WRITE_ENABLE (1),
|
.WRITE_ENABLE (1),
|
||||||
.UUID_WIDTH (`UUID_WIDTH),
|
.UUID_WIDTH (`UUID_WIDTH),
|
||||||
.CORE_OUT_REG (2),
|
.CORE_OUT_REG (2),
|
||||||
@@ -113,7 +111,7 @@ module VX_cluster import VX_gpu_pkg::*; #(
|
|||||||
.clk (clk),
|
.clk (clk),
|
||||||
.reset (l2_reset),
|
.reset (l2_reset),
|
||||||
`ifdef PERF_ENABLE
|
`ifdef PERF_ENABLE
|
||||||
.cache_perf (perf_l2cache),
|
.cache_perf (mem_perf_tmp_if.l2cache),
|
||||||
`endif
|
`endif
|
||||||
.core_bus_if (per_socket_mem_bus_if),
|
.core_bus_if (per_socket_mem_bus_if),
|
||||||
.mem_bus_if (mem_bus_if)
|
.mem_bus_if (mem_bus_if)
|
||||||
@@ -146,6 +144,7 @@ module VX_cluster import VX_gpu_pkg::*; #(
|
|||||||
.SOCKET_ID ((CLUSTER_ID * `NUM_SOCKETS) + i)
|
.SOCKET_ID ((CLUSTER_ID * `NUM_SOCKETS) + i)
|
||||||
) socket (
|
) socket (
|
||||||
`SCOPE_IO_BIND (scope_socket+i)
|
`SCOPE_IO_BIND (scope_socket+i)
|
||||||
|
|
||||||
.clk (clk),
|
.clk (clk),
|
||||||
.reset (socket_reset),
|
.reset (socket_reset),
|
||||||
|
|
||||||
@@ -156,7 +155,7 @@ module VX_cluster import VX_gpu_pkg::*; #(
|
|||||||
.dcr_bus_if (socket_dcr_bus_if),
|
.dcr_bus_if (socket_dcr_bus_if),
|
||||||
|
|
||||||
.mem_bus_if (per_socket_mem_bus_if[i]),
|
.mem_bus_if (per_socket_mem_bus_if[i]),
|
||||||
|
|
||||||
`ifdef GBAR_ENABLE
|
`ifdef GBAR_ENABLE
|
||||||
.gbar_bus_if (per_socket_gbar_bus_if[i]),
|
.gbar_bus_if (per_socket_gbar_bus_if[i]),
|
||||||
`endif
|
`endif
|
||||||
@@ -167,6 +166,6 @@ module VX_cluster import VX_gpu_pkg::*; #(
|
|||||||
);
|
);
|
||||||
end
|
end
|
||||||
|
|
||||||
`BUFFER_BUSY (busy, (| per_socket_busy), (`NUM_SOCKETS > 1));
|
`BUFFER_EX(busy, (| per_socket_busy), 1'b1, (`NUM_SOCKETS > 1));
|
||||||
|
|
||||||
endmodule
|
endmodule
|
||||||
|
|||||||
@@ -136,6 +136,18 @@
|
|||||||
`endif
|
`endif
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
|
`ifdef L2_ENABLE
|
||||||
|
`define L2_LINE_SIZE `MEM_BLOCK_SIZE
|
||||||
|
`else
|
||||||
|
`define L2_LINE_SIZE `L1_LINE_SIZE
|
||||||
|
`endif
|
||||||
|
|
||||||
|
`ifdef L3_ENABLE
|
||||||
|
`define L3_LINE_SIZE `MEM_BLOCK_SIZE
|
||||||
|
`else
|
||||||
|
`define L3_LINE_SIZE `L2_LINE_SIZE
|
||||||
|
`endif
|
||||||
|
|
||||||
`ifdef XLEN_64
|
`ifdef XLEN_64
|
||||||
|
|
||||||
`ifndef STARTUP_ADDR
|
`ifndef STARTUP_ADDR
|
||||||
@@ -191,13 +203,21 @@
|
|||||||
`define STALL_TIMEOUT (100000 * (1 ** (`L2_ENABLED + `L3_ENABLED)))
|
`define STALL_TIMEOUT (100000 * (1 ** (`L2_ENABLED + `L3_ENABLED)))
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
|
`ifndef SV_DPI
|
||||||
|
`define DPI_DISABLE
|
||||||
|
`endif
|
||||||
|
|
||||||
`ifndef FPU_FPNEW
|
`ifndef FPU_FPNEW
|
||||||
`ifndef FPU_DSP
|
`ifndef FPU_DSP
|
||||||
`ifndef FPU_DPI
|
`ifndef FPU_DPI
|
||||||
`ifdef SYNTHESIS
|
`ifndef SYNTHESIS
|
||||||
`define FPU_DSP
|
`ifndef DPI_DISABLE
|
||||||
`else
|
|
||||||
`define FPU_DPI
|
`define FPU_DPI
|
||||||
|
`else
|
||||||
|
`define FPU_DSP
|
||||||
|
`endif
|
||||||
|
`else
|
||||||
|
`define FPU_DSP
|
||||||
`endif
|
`endif
|
||||||
`endif
|
`endif
|
||||||
`endif
|
`endif
|
||||||
@@ -223,18 +243,18 @@
|
|||||||
|
|
||||||
// Number of ALU units
|
// Number of ALU units
|
||||||
`ifndef NUM_ALU_LANES
|
`ifndef NUM_ALU_LANES
|
||||||
`define NUM_ALU_LANES `UP(`NUM_THREADS / 2)
|
`define NUM_ALU_LANES `NUM_THREADS
|
||||||
`endif
|
`endif
|
||||||
`ifndef NUM_ALU_BLOCKS
|
`ifndef NUM_ALU_BLOCKS
|
||||||
`define NUM_ALU_BLOCKS `UP(`ISSUE_WIDTH / 1)
|
`define NUM_ALU_BLOCKS `ISSUE_WIDTH
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
// Number of FPU units
|
// Number of FPU units
|
||||||
`ifndef NUM_FPU_LANES
|
`ifndef NUM_FPU_LANES
|
||||||
`define NUM_FPU_LANES `UP(`NUM_THREADS / 2)
|
`define NUM_FPU_LANES `NUM_THREADS
|
||||||
`endif
|
`endif
|
||||||
`ifndef NUM_FPU_BLOCKS
|
`ifndef NUM_FPU_BLOCKS
|
||||||
`define NUM_FPU_BLOCKS `UP(`ISSUE_WIDTH / 1)
|
`define NUM_FPU_BLOCKS `ISSUE_WIDTH
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
// Number of LSU units
|
// Number of LSU units
|
||||||
@@ -258,7 +278,10 @@
|
|||||||
`endif
|
`endif
|
||||||
|
|
||||||
// LSU Duplicate Address Check
|
// LSU Duplicate Address Check
|
||||||
`ifdef LSU_DUP
|
`ifndef LSU_DUP_DISABLE
|
||||||
|
`define LSU_DUP_ENABLE
|
||||||
|
`endif
|
||||||
|
`ifdef LSU_DUP_ENABLE
|
||||||
`define LSU_DUP_ENABLED 1
|
`define LSU_DUP_ENABLED 1
|
||||||
`else
|
`else
|
||||||
`define LSU_DUP_ENABLED 0
|
`define LSU_DUP_ENABLED 0
|
||||||
@@ -285,8 +308,8 @@
|
|||||||
// Floating-Point Units ///////////////////////////////////////////////////////
|
// Floating-Point Units ///////////////////////////////////////////////////////
|
||||||
|
|
||||||
// Size of FPU Request Queue
|
// Size of FPU Request Queue
|
||||||
`ifndef FPU_REQ_QUEUE_SIZE
|
`ifndef FPUQ_SIZE
|
||||||
`define FPU_REQ_QUEUE_SIZE (2 * (`NUM_THREADS / `NUM_FPU_LANES))
|
`define FPUQ_SIZE (2 * (`NUM_THREADS / `NUM_FPU_LANES))
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
// FNCP Latency
|
// FNCP Latency
|
||||||
@@ -377,7 +400,7 @@
|
|||||||
|
|
||||||
// Number of Cache Units
|
// Number of Cache Units
|
||||||
`ifndef NUM_ICACHES
|
`ifndef NUM_ICACHES
|
||||||
`define NUM_ICACHES `UP(`NUM_CORES / 4)
|
`define NUM_ICACHES `UP(`SOCKET_SIZE / 4)
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
// Cache Size
|
// Cache Size
|
||||||
@@ -407,7 +430,7 @@
|
|||||||
|
|
||||||
// Number of Associative Ways
|
// Number of Associative Ways
|
||||||
`ifndef ICACHE_NUM_WAYS
|
`ifndef ICACHE_NUM_WAYS
|
||||||
`define ICACHE_NUM_WAYS 2
|
`define ICACHE_NUM_WAYS 1
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
// Dcache Configurable Knobs //////////////////////////////////////////////////
|
// Dcache Configurable Knobs //////////////////////////////////////////////////
|
||||||
@@ -426,7 +449,7 @@
|
|||||||
|
|
||||||
// Number of Cache Units
|
// Number of Cache Units
|
||||||
`ifndef NUM_DCACHES
|
`ifndef NUM_DCACHES
|
||||||
`define NUM_DCACHES `UP(`NUM_CORES / 4)
|
`define NUM_DCACHES `UP(`SOCKET_SIZE / 4)
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
// Cache Size
|
// Cache Size
|
||||||
@@ -436,7 +459,7 @@
|
|||||||
|
|
||||||
// Number of Banks
|
// Number of Banks
|
||||||
`ifndef DCACHE_NUM_BANKS
|
`ifndef DCACHE_NUM_BANKS
|
||||||
`define DCACHE_NUM_BANKS (`NUM_LSU_LANES)
|
`define DCACHE_NUM_BANKS `MIN(`NUM_LSU_LANES, 4)
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
// Core Response Queue Size
|
// Core Response Queue Size
|
||||||
@@ -461,7 +484,7 @@
|
|||||||
|
|
||||||
// Number of Associative Ways
|
// Number of Associative Ways
|
||||||
`ifndef DCACHE_NUM_WAYS
|
`ifndef DCACHE_NUM_WAYS
|
||||||
`define DCACHE_NUM_WAYS 2
|
`define DCACHE_NUM_WAYS 1
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
// SM Configurable Knobs //////////////////////////////////////////////////////
|
// SM Configurable Knobs //////////////////////////////////////////////////////
|
||||||
@@ -520,7 +543,7 @@
|
|||||||
|
|
||||||
// Number of Associative Ways
|
// Number of Associative Ways
|
||||||
`ifndef L2_NUM_WAYS
|
`ifndef L2_NUM_WAYS
|
||||||
`define L2_NUM_WAYS 4
|
`define L2_NUM_WAYS 2
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
// L3cache Configurable Knobs /////////////////////////////////////////////////
|
// L3cache Configurable Knobs /////////////////////////////////////////////////
|
||||||
|
|||||||
@@ -1,418 +1,432 @@
|
|||||||
// Copyright © 2019-2023
|
// Copyright © 2019-2023
|
||||||
//
|
//
|
||||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
// you may not use this file except in compliance with the License.
|
// you may not use this file except in compliance with the License.
|
||||||
// You may obtain a copy of the License at
|
// You may obtain a copy of the License at
|
||||||
// http://www.apache.org/licenses/LICENSE-2.0
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
//
|
//
|
||||||
// Unless required by applicable law or agreed to in writing, software
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
// See the License for the specific language governing permissions and
|
// See the License for the specific language governing permissions and
|
||||||
// limitations under the License.
|
// limitations under the License.
|
||||||
|
|
||||||
`ifndef VX_DEFINE_VH
|
`ifndef VX_DEFINE_VH
|
||||||
`define VX_DEFINE_VH
|
`define VX_DEFINE_VH
|
||||||
|
|
||||||
`include "VX_platform.vh"
|
`include "VX_platform.vh"
|
||||||
`include "VX_config.vh"
|
`include "VX_config.vh"
|
||||||
`include "VX_types.vh"
|
`include "VX_types.vh"
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
`define NW_BITS `CLOG2(`NUM_WARPS)
|
`define NW_BITS `CLOG2(`NUM_WARPS)
|
||||||
`define NC_WIDTH `UP(`NC_BITS)
|
`define NC_WIDTH `UP(`NC_BITS)
|
||||||
|
|
||||||
`define NT_BITS `CLOG2(`NUM_THREADS)
|
`define NT_BITS `CLOG2(`NUM_THREADS)
|
||||||
`define NW_WIDTH `UP(`NW_BITS)
|
`define NW_WIDTH `UP(`NW_BITS)
|
||||||
|
|
||||||
`define NC_BITS `CLOG2(`NUM_CORES)
|
`define NC_BITS `CLOG2(`NUM_CORES)
|
||||||
`define NT_WIDTH `UP(`NT_BITS)
|
`define NT_WIDTH `UP(`NT_BITS)
|
||||||
|
|
||||||
`define NB_BITS `CLOG2(`NUM_BARRIERS)
|
`define NB_BITS `CLOG2(`NUM_BARRIERS)
|
||||||
`define NB_WIDTH `UP(`NB_BITS)
|
`define NB_WIDTH `UP(`NB_BITS)
|
||||||
|
|
||||||
`define NUM_IREGS 32
|
`define NUM_IREGS 32
|
||||||
|
|
||||||
`define NRI_BITS `CLOG2(`NUM_IREGS)
|
`define NRI_BITS `CLOG2(`NUM_IREGS)
|
||||||
|
|
||||||
`ifdef EXT_F_ENABLE
|
`ifdef EXT_F_ENABLE
|
||||||
`define NUM_REGS (2 * `NUM_IREGS)
|
`define NUM_REGS (2 * `NUM_IREGS)
|
||||||
`else
|
`else
|
||||||
`define NUM_REGS `NUM_IREGS
|
`define NUM_REGS `NUM_IREGS
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
`define NR_BITS `CLOG2(`NUM_REGS)
|
`define NR_BITS `CLOG2(`NUM_REGS)
|
||||||
|
|
||||||
`define PERF_CTR_BITS 44
|
`define PERF_CTR_BITS 44
|
||||||
|
|
||||||
`ifndef NDEBUG
|
`ifndef NDEBUG
|
||||||
`define UUID_WIDTH 44
|
`define UUID_WIDTH 44
|
||||||
`else
|
`else
|
||||||
`define UUID_WIDTH 1
|
`define UUID_WIDTH 1
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
`define EX_ALU 0
|
`define EX_ALU 0
|
||||||
`define EX_LSU 1
|
`define EX_LSU 1
|
||||||
`define EX_SFU 2
|
`define EX_SFU 2
|
||||||
`define EX_FPU 3
|
`define EX_FPU (`EX_SFU + `EXT_F_ENABLED)
|
||||||
|
|
||||||
`define NUM_EX_UNITS (3 + `EXT_F_ENABLED)
|
`define NUM_EX_UNITS (3 + `EXT_F_ENABLED)
|
||||||
`define EX_BITS `CLOG2(`NUM_EX_UNITS)
|
`define EX_BITS `CLOG2(`NUM_EX_UNITS)
|
||||||
|
`define EX_WIDTH `UP(`EX_BITS)
|
||||||
///////////////////////////////////////////////////////////////////////////////
|
|
||||||
|
`define SFU_CSRS 0
|
||||||
`define INST_LUI 7'b0110111
|
`define SFU_WCTL 1
|
||||||
`define INST_AUIPC 7'b0010111
|
|
||||||
`define INST_JAL 7'b1101111
|
`define NUM_SFU_UNITS (2)
|
||||||
`define INST_JALR 7'b1100111
|
`define SFU_BITS `CLOG2(`NUM_SFU_UNITS)
|
||||||
`define INST_B 7'b1100011 // branch instructions
|
`define SFU_WIDTH `UP(`SFU_BITS)
|
||||||
`define INST_L 7'b0000011 // load instructions
|
|
||||||
`define INST_S 7'b0100011 // store instructions
|
///////////////////////////////////////////////////////////////////////////////
|
||||||
`define INST_I 7'b0010011 // immediate instructions
|
|
||||||
`define INST_R 7'b0110011 // register instructions
|
`define INST_LUI 7'b0110111
|
||||||
`define INST_FENCE 7'b0001111 // Fence instructions
|
`define INST_AUIPC 7'b0010111
|
||||||
`define INST_SYS 7'b1110011 // system instructions
|
`define INST_JAL 7'b1101111
|
||||||
|
`define INST_JALR 7'b1100111
|
||||||
// RV64I instruction specific opcodes (for any W instruction)
|
`define INST_B 7'b1100011 // branch instructions
|
||||||
`define INST_I_W 7'b0011011 // W type immediate instructions
|
`define INST_L 7'b0000011 // load instructions
|
||||||
`define INST_R_W 7'b0111011 // W type register instructions
|
`define INST_S 7'b0100011 // store instructions
|
||||||
|
`define INST_I 7'b0010011 // immediate instructions
|
||||||
`define INST_FL 7'b0000111 // float load instruction
|
`define INST_R 7'b0110011 // register instructions
|
||||||
`define INST_FS 7'b0100111 // float store instruction
|
`define INST_FENCE 7'b0001111 // Fence instructions
|
||||||
`define INST_FMADD 7'b1000011
|
`define INST_SYS 7'b1110011 // system instructions
|
||||||
`define INST_FMSUB 7'b1000111
|
|
||||||
`define INST_FNMSUB 7'b1001011
|
// RV64I instruction specific opcodes (for any W instruction)
|
||||||
`define INST_FNMADD 7'b1001111
|
`define INST_I_W 7'b0011011 // W type immediate instructions
|
||||||
`define INST_FCI 7'b1010011 // float common instructions
|
`define INST_R_W 7'b0111011 // W type register instructions
|
||||||
|
|
||||||
// Custom extension opcodes
|
`define INST_FL 7'b0000111 // float load instruction
|
||||||
`define INST_EXT1 7'b0001011 // 0x0B
|
`define INST_FS 7'b0100111 // float store instruction
|
||||||
`define INST_EXT2 7'b0101011 // 0x2B
|
`define INST_FMADD 7'b1000011
|
||||||
`define INST_EXT3 7'b1011011 // 0x5B
|
`define INST_FMSUB 7'b1000111
|
||||||
`define INST_EXT4 7'b1111011 // 0x7B
|
`define INST_FNMSUB 7'b1001011
|
||||||
|
`define INST_FNMADD 7'b1001111
|
||||||
///////////////////////////////////////////////////////////////////////////////
|
`define INST_FCI 7'b1010011 // float common instructions
|
||||||
|
|
||||||
`define INST_FRM_RNE 3'b000 // round to nearest even
|
// Custom extension opcodes
|
||||||
`define INST_FRM_RTZ 3'b001 // round to zero
|
`define INST_EXT1 7'b0001011 // 0x0B
|
||||||
`define INST_FRM_RDN 3'b010 // round to -inf
|
`define INST_EXT2 7'b0101011 // 0x2B
|
||||||
`define INST_FRM_RUP 3'b011 // round to +inf
|
`define INST_EXT3 7'b1011011 // 0x5B
|
||||||
`define INST_FRM_RMM 3'b100 // round to nearest max magnitude
|
`define INST_EXT4 7'b1111011 // 0x7B
|
||||||
`define INST_FRM_DYN 3'b111 // dynamic mode
|
|
||||||
`define INST_FRM_BITS 3
|
///////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////////////////////
|
`define INST_FRM_RNE 3'b000 // round to nearest even
|
||||||
|
`define INST_FRM_RTZ 3'b001 // round to zero
|
||||||
`define INST_OP_BITS 4
|
`define INST_FRM_RDN 3'b010 // round to -inf
|
||||||
`define INST_MOD_BITS 3
|
`define INST_FRM_RUP 3'b011 // round to +inf
|
||||||
`define INST_FMT_BITS 2
|
`define INST_FRM_RMM 3'b100 // round to nearest max magnitude
|
||||||
|
`define INST_FRM_DYN 3'b111 // dynamic mode
|
||||||
///////////////////////////////////////////////////////////////////////////////
|
`define INST_FRM_BITS 3
|
||||||
|
|
||||||
`define INST_ALU_ADD 4'b0000
|
///////////////////////////////////////////////////////////////////////////////
|
||||||
`define INST_ALU_LUI 4'b0010
|
|
||||||
`define INST_ALU_AUIPC 4'b0011
|
`define INST_OP_BITS 4
|
||||||
`define INST_ALU_SLTU 4'b0100
|
`define INST_MOD_BITS 3
|
||||||
`define INST_ALU_SLT 4'b0101
|
`define INST_FMT_BITS 2
|
||||||
`define INST_ALU_SUB 4'b0111
|
|
||||||
`define INST_ALU_SRL 4'b1000
|
///////////////////////////////////////////////////////////////////////////////
|
||||||
`define INST_ALU_SRA 4'b1001
|
|
||||||
`define INST_ALU_AND 4'b1100
|
`define INST_ALU_ADD 4'b0000
|
||||||
`define INST_ALU_OR 4'b1101
|
`define INST_ALU_LUI 4'b0010
|
||||||
`define INST_ALU_XOR 4'b1110
|
`define INST_ALU_AUIPC 4'b0011
|
||||||
`define INST_ALU_SLL 4'b1111
|
`define INST_ALU_SLTU 4'b0100
|
||||||
`define INST_ALU_OTHER 4'b0111
|
`define INST_ALU_SLT 4'b0101
|
||||||
`define INST_ALU_BITS 4
|
`define INST_ALU_SUB 4'b0111
|
||||||
`define INST_ALU_CLASS(op) op[3:2]
|
`define INST_ALU_SRL 4'b1000
|
||||||
`define INST_ALU_SIGNED(op) op[0]
|
`define INST_ALU_SRA 4'b1001
|
||||||
`define INST_ALU_IS_SUB(op) op[1]
|
`define INST_ALU_AND 4'b1100
|
||||||
`define INST_ALU_IS_BR(mod) mod[0]
|
`define INST_ALU_OR 4'b1101
|
||||||
`define INST_ALU_IS_M(mod) mod[1]
|
`define INST_ALU_XOR 4'b1110
|
||||||
`define INST_ALU_IS_W(mod) mod[2]
|
`define INST_ALU_SLL 4'b1111
|
||||||
|
`define INST_ALU_OTHER 4'b0111
|
||||||
`define INST_BR_EQ 4'b0000
|
`define INST_ALU_BITS 4
|
||||||
`define INST_BR_NE 4'b0010
|
`define INST_ALU_CLASS(op) op[3:2]
|
||||||
`define INST_BR_LTU 4'b0100
|
`define INST_ALU_SIGNED(op) op[0]
|
||||||
`define INST_BR_GEU 4'b0110
|
`define INST_ALU_IS_SUB(op) op[1]
|
||||||
`define INST_BR_LT 4'b0101
|
`define INST_ALU_IS_BR(mod) mod[0]
|
||||||
`define INST_BR_GE 4'b0111
|
`define INST_ALU_IS_M(mod) mod[1]
|
||||||
`define INST_BR_JAL 4'b1000
|
`define INST_ALU_IS_W(mod) mod[2]
|
||||||
`define INST_BR_JALR 4'b1001
|
|
||||||
`define INST_BR_ECALL 4'b1010
|
`define INST_BR_EQ 4'b0000
|
||||||
`define INST_BR_EBREAK 4'b1011
|
`define INST_BR_NE 4'b0010
|
||||||
`define INST_BR_URET 4'b1100
|
`define INST_BR_LTU 4'b0100
|
||||||
`define INST_BR_SRET 4'b1101
|
`define INST_BR_GEU 4'b0110
|
||||||
`define INST_BR_MRET 4'b1110
|
`define INST_BR_LT 4'b0101
|
||||||
`define INST_BR_OTHER 4'b1111
|
`define INST_BR_GE 4'b0111
|
||||||
`define INST_BR_BITS 4
|
`define INST_BR_JAL 4'b1000
|
||||||
`define INST_BR_CLASS(op) {1'b0, ~op[3]}
|
`define INST_BR_JALR 4'b1001
|
||||||
`define INST_BR_IS_NEG(op) op[1]
|
`define INST_BR_ECALL 4'b1010
|
||||||
`define INST_BR_IS_LESS(op) op[2]
|
`define INST_BR_EBREAK 4'b1011
|
||||||
`define INST_BR_IS_STATIC(op) op[3]
|
`define INST_BR_URET 4'b1100
|
||||||
|
`define INST_BR_SRET 4'b1101
|
||||||
`define INST_M_MUL 3'b000
|
`define INST_BR_MRET 4'b1110
|
||||||
`define INST_M_MULHU 3'b001
|
`define INST_BR_OTHER 4'b1111
|
||||||
`define INST_M_MULH 3'b010
|
`define INST_BR_BITS 4
|
||||||
`define INST_M_MULHSU 3'b011
|
`define INST_BR_CLASS(op) {1'b0, ~op[3]}
|
||||||
`define INST_M_DIV 3'b100
|
`define INST_BR_IS_NEG(op) op[1]
|
||||||
`define INST_M_DIVU 3'b101
|
`define INST_BR_IS_LESS(op) op[2]
|
||||||
`define INST_M_REM 3'b110
|
`define INST_BR_IS_STATIC(op) op[3]
|
||||||
`define INST_M_REMU 3'b111
|
|
||||||
`define INST_M_BITS 3
|
`define INST_M_MUL 3'b000
|
||||||
`define INST_M_SIGNED(op) (~op[0])
|
`define INST_M_MULHU 3'b001
|
||||||
`define INST_M_IS_MULX(op) (~op[2])
|
`define INST_M_MULH 3'b010
|
||||||
`define INST_M_IS_MULH(op) (op[1:0] != 0)
|
`define INST_M_MULHSU 3'b011
|
||||||
`define INST_M_SIGNED_A(op) (op[1:0] != 1)
|
`define INST_M_DIV 3'b100
|
||||||
`define INST_M_IS_REM(op) op[1]
|
`define INST_M_DIVU 3'b101
|
||||||
|
`define INST_M_REM 3'b110
|
||||||
`define INST_FMT_B 3'b000
|
`define INST_M_REMU 3'b111
|
||||||
`define INST_FMT_H 3'b001
|
`define INST_M_BITS 3
|
||||||
`define INST_FMT_W 3'b010
|
`define INST_M_SIGNED(op) (~op[0])
|
||||||
`define INST_FMT_D 3'b011
|
`define INST_M_IS_MULX(op) (~op[2])
|
||||||
`define INST_FMT_BU 3'b100
|
`define INST_M_IS_MULH(op) (op[1:0] != 0)
|
||||||
`define INST_FMT_HU 3'b101
|
`define INST_M_SIGNED_A(op) (op[1:0] != 1)
|
||||||
`define INST_FMT_WU 3'b110
|
`define INST_M_IS_REM(op) op[1]
|
||||||
|
|
||||||
`define INST_LSU_LB 4'b0000
|
`define INST_FMT_B 3'b000
|
||||||
`define INST_LSU_LH 4'b0001
|
`define INST_FMT_H 3'b001
|
||||||
`define INST_LSU_LW 4'b0010
|
`define INST_FMT_W 3'b010
|
||||||
`define INST_LSU_LD 4'b0011 // new for RV64I LD
|
`define INST_FMT_D 3'b011
|
||||||
`define INST_LSU_LBU 4'b0100
|
`define INST_FMT_BU 3'b100
|
||||||
`define INST_LSU_LHU 4'b0101
|
`define INST_FMT_HU 3'b101
|
||||||
`define INST_LSU_LWU 4'b0110 // new for RV64I LWU
|
`define INST_FMT_WU 3'b110
|
||||||
`define INST_LSU_SB 4'b1000
|
|
||||||
`define INST_LSU_SH 4'b1001
|
`define INST_LSU_LB 4'b0000
|
||||||
`define INST_LSU_SW 4'b1010
|
`define INST_LSU_LH 4'b0001
|
||||||
`define INST_LSU_SD 4'b1011 // new for RV64I SD
|
`define INST_LSU_LW 4'b0010
|
||||||
`define INST_LSU_FENCE 4'b1111
|
`define INST_LSU_LD 4'b0011 // new for RV64I LD
|
||||||
`define INST_LSU_BITS 4
|
`define INST_LSU_LBU 4'b0100
|
||||||
`define INST_LSU_FMT(op) op[2:0]
|
`define INST_LSU_LHU 4'b0101
|
||||||
`define INST_LSU_WSIZE(op) op[1:0]
|
`define INST_LSU_LWU 4'b0110 // new for RV64I LWU
|
||||||
`define INST_LSU_IS_FENCE(op) (op[3:2] == 3)
|
`define INST_LSU_SB 4'b1000
|
||||||
|
`define INST_LSU_SH 4'b1001
|
||||||
`define INST_FENCE_BITS 1
|
`define INST_LSU_SW 4'b1010
|
||||||
`define INST_FENCE_D 1'h0
|
`define INST_LSU_SD 4'b1011 // new for RV64I SD
|
||||||
`define INST_FENCE_I 1'h1
|
`define INST_LSU_FENCE 4'b1111
|
||||||
|
`define INST_LSU_BITS 4
|
||||||
`define INST_FPU_ADD 4'b0000
|
`define INST_LSU_FMT(op) op[2:0]
|
||||||
`define INST_FPU_SUB 4'b0001
|
`define INST_LSU_WSIZE(op) op[1:0]
|
||||||
`define INST_FPU_MUL 4'b0010
|
`define INST_LSU_IS_FENCE(op) (op[3:2] == 3)
|
||||||
`define INST_FPU_DIV 4'b0011
|
|
||||||
`define INST_FPU_SQRT 4'b0100
|
`define INST_FENCE_BITS 1
|
||||||
`define INST_FPU_CMP 4'b0101 // mod: LE=0, LT=1, EQ=2
|
`define INST_FENCE_D 1'h0
|
||||||
`define INST_FPU_F2F 4'b0110
|
`define INST_FENCE_I 1'h1
|
||||||
`define INST_FPU_MISC 4'b0111 // mod: SGNJ=0, SGNJN=1, SGNJX=2, CLASS=3, MVXW=4, MVWX=5, FMIN=6, FMAX=7
|
|
||||||
`define INST_FPU_F2I 4'b1000
|
`define INST_FPU_ADD 4'b0000
|
||||||
`define INST_FPU_F2U 4'b1001
|
`define INST_FPU_SUB 4'b0001
|
||||||
`define INST_FPU_I2F 4'b1010
|
`define INST_FPU_MUL 4'b0010
|
||||||
`define INST_FPU_U2F 4'b1011
|
`define INST_FPU_DIV 4'b0011
|
||||||
`define INST_FPU_MADD 4'b1100
|
`define INST_FPU_SQRT 4'b0100
|
||||||
`define INST_FPU_MSUB 4'b1101
|
`define INST_FPU_CMP 4'b0101 // mod: LE=0, LT=1, EQ=2
|
||||||
`define INST_FPU_NMSUB 4'b1110
|
`define INST_FPU_F2F 4'b0110
|
||||||
`define INST_FPU_NMADD 4'b1111
|
`define INST_FPU_MISC 4'b0111 // mod: SGNJ=0, SGNJN=1, SGNJX=2, CLASS=3, MVXW=4, MVWX=5, FMIN=6, FMAX=7
|
||||||
`define INST_FPU_BITS 4
|
`define INST_FPU_F2I 4'b1000
|
||||||
`define INST_FPU_IS_W(mod) (mod[4])
|
`define INST_FPU_F2U 4'b1001
|
||||||
`define INST_FPU_IS_CLASS(op, mod) (op == `INST_FPU_MISC && mod == 3)
|
`define INST_FPU_I2F 4'b1010
|
||||||
`define INST_FPU_IS_MVXW(op, mod) (op == `INST_FPU_MISC && mod == 4)
|
`define INST_FPU_U2F 4'b1011
|
||||||
|
`define INST_FPU_MADD 4'b1100
|
||||||
`define INST_SFU_TMC 4'h0
|
`define INST_FPU_MSUB 4'b1101
|
||||||
`define INST_SFU_WSPAWN 4'h1
|
`define INST_FPU_NMSUB 4'b1110
|
||||||
`define INST_SFU_SPLIT 4'h2
|
`define INST_FPU_NMADD 4'b1111
|
||||||
`define INST_SFU_JOIN 4'h3
|
`define INST_FPU_BITS 4
|
||||||
`define INST_SFU_BAR 4'h4
|
`define INST_FPU_IS_W(mod) (mod[4])
|
||||||
`define INST_SFU_PRED 4'h5
|
`define INST_FPU_IS_CLASS(op, mod) (op == `INST_FPU_MISC && mod == 3)
|
||||||
`define INST_SFU_CSRRW 4'h6
|
`define INST_FPU_IS_MVXW(op, mod) (op == `INST_FPU_MISC && mod == 4)
|
||||||
`define INST_SFU_CSRRS 4'h7
|
|
||||||
`define INST_SFU_CSRRC 4'h8
|
`define INST_SFU_TMC 4'h0
|
||||||
`define INST_SFU_CMOV 4'h9
|
`define INST_SFU_WSPAWN 4'h1
|
||||||
`define INST_SFU_BITS 4
|
`define INST_SFU_SPLIT 4'h2
|
||||||
`define INST_SFU_CSR(f3) (4'h6 + 4'(f3) - 4'h1)
|
`define INST_SFU_JOIN 4'h3
|
||||||
`define INST_SFU_IS_WCTL(op) (op <= 5)
|
`define INST_SFU_BAR 4'h4
|
||||||
`define INST_SFU_IS_CSR(op) (op >= 6 && op <= 8)
|
`define INST_SFU_PRED 4'h5
|
||||||
|
`define INST_SFU_CSRRW 4'h6
|
||||||
///////////////////////////////////////////////////////////////////////////////
|
`define INST_SFU_CSRRS 4'h7
|
||||||
|
`define INST_SFU_CSRRC 4'h8
|
||||||
// non-cacheable tag bits
|
`define INST_SFU_CMOV 4'h9
|
||||||
`define NC_TAG_BITS 1
|
`define INST_SFU_BITS 4
|
||||||
|
`define INST_SFU_CSR(f3) (4'h6 + 4'(f3) - 4'h1)
|
||||||
// cache address type bits
|
`define INST_SFU_IS_WCTL(op) (op <= 5)
|
||||||
`ifdef SM_ENABLE
|
`define INST_SFU_IS_CSR(op) (op >= 6 && op <= 8)
|
||||||
`define CACHE_ADDR_TYPE_BITS (`NC_TAG_BITS + 1)
|
|
||||||
`else
|
///////////////////////////////////////////////////////////////////////////////
|
||||||
`define CACHE_ADDR_TYPE_BITS `NC_TAG_BITS
|
|
||||||
`endif
|
// non-cacheable tag bits
|
||||||
|
`define NC_TAG_BITS 1
|
||||||
`define ARB_SEL_BITS(I, O) ((I > O) ? `CLOG2((I + O - 1) / O) : 0)
|
|
||||||
|
// cache address type bits
|
||||||
///////////////////////////////////////////////////////////////////////////////
|
`ifdef SM_ENABLE
|
||||||
|
`define CACHE_ADDR_TYPE_BITS (`NC_TAG_BITS + 1)
|
||||||
`define CACHE_MEM_TAG_WIDTH(mshr_size, num_banks) \
|
`else
|
||||||
(`CLOG2(mshr_size) + `CLOG2(num_banks) + `NC_TAG_BITS)
|
`define CACHE_ADDR_TYPE_BITS `NC_TAG_BITS
|
||||||
|
`endif
|
||||||
`define CACHE_NC_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width) \
|
|
||||||
(`CLOG2(num_reqs) + `CLOG2(line_size / word_size) + tag_width)
|
`define ARB_SEL_BITS(I, O) ((I > O) ? `CLOG2((I + O - 1) / O) : 0)
|
||||||
|
|
||||||
`define CACHE_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width) \
|
///////////////////////////////////////////////////////////////////////////////
|
||||||
(`CACHE_NC_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width) + `NC_TAG_BITS)
|
|
||||||
|
`define CACHE_MEM_TAG_WIDTH(mshr_size, num_banks) \
|
||||||
`define CACHE_NC_MEM_TAG_WIDTH(mshr_size, num_banks, num_reqs, line_size, word_size, tag_width) \
|
(`CLOG2(mshr_size) + `CLOG2(num_banks) + `NC_TAG_BITS)
|
||||||
`MAX(`CACHE_MEM_TAG_WIDTH(mshr_size, num_banks), `CACHE_NC_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width))
|
|
||||||
|
`define CACHE_NC_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width) \
|
||||||
///////////////////////////////////////////////////////////////////////////////
|
(`CLOG2(num_reqs) + `CLOG2(line_size / word_size) + tag_width)
|
||||||
|
|
||||||
`define CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches) \
|
`define CACHE_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width) \
|
||||||
(tag_width + `ARB_SEL_BITS(num_inputs, `UP(num_caches)))
|
(`CACHE_NC_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width) + `NC_TAG_BITS)
|
||||||
|
|
||||||
`define CACHE_CLUSTER_MEM_ARB_TAG(tag_width, num_caches) \
|
`define CACHE_NC_MEM_TAG_WIDTH(mshr_size, num_banks, num_reqs, line_size, word_size, tag_width) \
|
||||||
(tag_width + `ARB_SEL_BITS(`UP(num_caches), 1))
|
`MAX(`CACHE_MEM_TAG_WIDTH(mshr_size, num_banks), `CACHE_NC_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width))
|
||||||
|
|
||||||
`define CACHE_CLUSTER_MEM_TAG_WIDTH(mshr_size, num_banks, num_caches) \
|
///////////////////////////////////////////////////////////////////////////////
|
||||||
`CACHE_CLUSTER_MEM_ARB_TAG(`CACHE_MEM_TAG_WIDTH(mshr_size, num_banks), num_caches)
|
|
||||||
|
`define CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches) \
|
||||||
`define CACHE_CLUSTER_NC_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width, num_inputs, num_caches) \
|
(tag_width + `ARB_SEL_BITS(num_inputs, `UP(num_caches)))
|
||||||
`CACHE_CLUSTER_MEM_ARB_TAG((`CLOG2(num_reqs) + `CLOG2(line_size / word_size) + `CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches)), num_caches)
|
|
||||||
|
`define CACHE_CLUSTER_MEM_ARB_TAG(tag_width, num_caches) \
|
||||||
`define CACHE_CLUSTER_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width, num_inputs, num_caches) \
|
(tag_width + `ARB_SEL_BITS(`UP(num_caches), 1))
|
||||||
`CACHE_CLUSTER_MEM_ARB_TAG((`CACHE_NC_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, `CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches)) + `NC_TAG_BITS), num_caches)
|
|
||||||
|
`define CACHE_CLUSTER_MEM_TAG_WIDTH(mshr_size, num_banks, num_caches) \
|
||||||
`define CACHE_CLUSTER_NC_MEM_TAG_WIDTH(mshr_size, num_banks, num_reqs, line_size, word_size, tag_width, num_inputs, num_caches) \
|
`CACHE_CLUSTER_MEM_ARB_TAG(`CACHE_MEM_TAG_WIDTH(mshr_size, num_banks), num_caches)
|
||||||
`CACHE_CLUSTER_MEM_ARB_TAG(`MAX(`CACHE_MEM_TAG_WIDTH(mshr_size, num_banks), `CACHE_NC_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, `CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches))), num_caches)
|
|
||||||
|
`define CACHE_CLUSTER_NC_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width, num_inputs, num_caches) \
|
||||||
///////////////////////////////////////////////////////////////////////////////
|
`CACHE_CLUSTER_MEM_ARB_TAG((`CLOG2(num_reqs) + `CLOG2(line_size / word_size) + `CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches)), num_caches)
|
||||||
|
|
||||||
`ifdef L2_ENABLE
|
`define CACHE_CLUSTER_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width, num_inputs, num_caches) \
|
||||||
`define L2_LINE_SIZE `MEM_BLOCK_SIZE
|
`CACHE_CLUSTER_MEM_ARB_TAG((`CACHE_NC_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, `CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches)) + `NC_TAG_BITS), num_caches)
|
||||||
`else
|
|
||||||
`define L2_LINE_SIZE `L1_LINE_SIZE
|
`define CACHE_CLUSTER_NC_MEM_TAG_WIDTH(mshr_size, num_banks, num_reqs, line_size, word_size, tag_width, num_inputs, num_caches) \
|
||||||
`endif
|
`CACHE_CLUSTER_MEM_ARB_TAG(`MAX(`CACHE_MEM_TAG_WIDTH(mshr_size, num_banks), `CACHE_NC_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, `CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches))), num_caches)
|
||||||
|
|
||||||
`ifdef L3_ENABLE
|
///////////////////////////////////////////////////////////////////////////////
|
||||||
`define L3_LINE_SIZE `MEM_BLOCK_SIZE
|
|
||||||
`else
|
`ifdef ICACHE_ENABLE
|
||||||
`define L3_LINE_SIZE `L2_LINE_SIZE
|
`define L1_ENABLE
|
||||||
`endif
|
`endif
|
||||||
|
`ifdef DCACHE_ENABLE
|
||||||
`define VX_MEM_BYTEEN_WIDTH `L3_LINE_SIZE
|
`define L1_ENABLE
|
||||||
`define VX_MEM_ADDR_WIDTH (`MEM_ADDR_WIDTH - `CLOG2(`L3_LINE_SIZE))
|
`endif
|
||||||
`define VX_MEM_DATA_WIDTH (`L3_LINE_SIZE * 8)
|
|
||||||
`define VX_MEM_TAG_WIDTH L3_MEM_TAG_WIDTH
|
`define VX_MEM_BYTEEN_WIDTH `L3_LINE_SIZE
|
||||||
|
`define VX_MEM_ADDR_WIDTH (`MEM_ADDR_WIDTH - `CLOG2(`L3_LINE_SIZE))
|
||||||
`define VX_DCR_ADDR_WIDTH `VX_DCR_ADDR_BITS
|
`define VX_MEM_DATA_WIDTH (`L3_LINE_SIZE * 8)
|
||||||
`define VX_DCR_DATA_WIDTH 32
|
`define VX_MEM_TAG_WIDTH L3_MEM_TAG_WIDTH
|
||||||
|
|
||||||
`define TO_FULL_ADDR(x) {x, (`MEM_ADDR_WIDTH-$bits(x))'(0)}
|
`define VX_DCR_ADDR_WIDTH `VX_DCR_ADDR_BITS
|
||||||
|
`define VX_DCR_DATA_WIDTH 32
|
||||||
///////////////////////////////////////////////////////////////////////////////
|
|
||||||
|
`define TO_FULL_ADDR(x) {x, (`MEM_ADDR_WIDTH-$bits(x))'(0)}
|
||||||
`define BUFFER_BUSY(dst, src, enable) \
|
|
||||||
logic __busy; \
|
///////////////////////////////////////////////////////////////////////////////
|
||||||
if (enable) begin \
|
|
||||||
always @(posedge clk) begin \
|
`define BUFFER_EX(dst, src, ena, latency) \
|
||||||
if (reset) begin \
|
VX_pipe_register #( \
|
||||||
__busy <= 1'b0; \
|
.DATAW ($bits(dst)), \
|
||||||
end else begin \
|
.RESETW ($bits(dst)), \
|
||||||
__busy <= src; \
|
.DEPTH (latency) \
|
||||||
end \
|
) __``dst ( \
|
||||||
end \
|
.clk (clk), \
|
||||||
end else begin \
|
.reset (reset), \
|
||||||
assign __busy = src; \
|
.enable (ena), \
|
||||||
end \
|
.data_in (src), \
|
||||||
assign dst = __busy
|
.data_out (dst) \
|
||||||
|
)
|
||||||
`define POP_COUNT_EX(out, in, model) \
|
|
||||||
VX_popcount #( \
|
`define BUFFER(dst, src) `BUFFER_EX(dst, src, 1'b1, 1)
|
||||||
.N ($bits(in)), \
|
|
||||||
.MODEL (model) \
|
`define POP_COUNT_EX(out, in, model) \
|
||||||
) __``out ( \
|
VX_popcount #( \
|
||||||
.data_in (in), \
|
.N ($bits(in)), \
|
||||||
.data_out (out) \
|
.MODEL (model) \
|
||||||
)
|
) __``out ( \
|
||||||
|
.data_in (in), \
|
||||||
`define POP_COUNT(out, in) `POP_COUNT_EX(out, in, 1)
|
.data_out (out) \
|
||||||
|
)
|
||||||
`define ASSIGN_VX_MEM_BUS_IF(dst, src) \
|
|
||||||
assign dst.req_valid = src.req_valid; \
|
`define POP_COUNT(out, in) `POP_COUNT_EX(out, in, 1)
|
||||||
assign dst.req_data = src.req_data; \
|
|
||||||
assign src.req_ready = dst.req_ready; \
|
`define ASSIGN_VX_MEM_BUS_IF(dst, src) \
|
||||||
assign src.rsp_valid = dst.rsp_valid; \
|
assign dst.req_valid = src.req_valid; \
|
||||||
assign src.rsp_data = dst.rsp_data; \
|
assign dst.req_data = src.req_data; \
|
||||||
assign dst.rsp_ready = src.rsp_ready
|
assign src.req_ready = dst.req_ready; \
|
||||||
|
assign src.rsp_valid = dst.rsp_valid; \
|
||||||
`define ASSIGN_VX_MEM_BUS_IF_X(dst, src, TD, TS) \
|
assign src.rsp_data = dst.rsp_data; \
|
||||||
assign dst.req_valid = src.req_valid; \
|
assign dst.rsp_ready = src.rsp_ready
|
||||||
assign dst.req_data.rw = src.req_data.rw; \
|
|
||||||
assign dst.req_data.byteen = src.req_data.byteen; \
|
`define ASSIGN_VX_MEM_BUS_IF_X(dst, src, TD, TS) \
|
||||||
assign dst.req_data.addr = src.req_data.addr; \
|
assign dst.req_valid = src.req_valid; \
|
||||||
assign dst.req_data.data = src.req_data.data; \
|
assign dst.req_data.rw = src.req_data.rw; \
|
||||||
if (TD != TS) \
|
assign dst.req_data.byteen = src.req_data.byteen; \
|
||||||
assign dst.req_data.tag = {src.req_data.tag, {(TD-TS){1'b0}}}; \
|
assign dst.req_data.addr = src.req_data.addr; \
|
||||||
else \
|
assign dst.req_data.data = src.req_data.data; \
|
||||||
assign dst.req_data.tag = src.req_data.tag; \
|
if (TD != TS) \
|
||||||
assign src.req_ready = dst.req_ready; \
|
assign dst.req_data.tag = {src.req_data.tag, {(TD-TS){1'b0}}}; \
|
||||||
assign src.rsp_valid = dst.rsp_valid; \
|
else \
|
||||||
assign src.rsp_data.data = dst.rsp_data.data; \
|
assign dst.req_data.tag = src.req_data.tag; \
|
||||||
assign src.rsp_data.tag = dst.rsp_data.tag[TD-1 -: TS]; \
|
assign src.req_ready = dst.req_ready; \
|
||||||
assign dst.rsp_ready = src.rsp_ready
|
assign src.rsp_valid = dst.rsp_valid; \
|
||||||
|
assign src.rsp_data.data = dst.rsp_data.data; \
|
||||||
`define BUFFER_DCR_BUS_IF(dst, src, enable) \
|
assign src.rsp_data.tag = dst.rsp_data.tag[TD-1 -: TS]; \
|
||||||
logic [(1 + `VX_DCR_ADDR_WIDTH + `VX_DCR_DATA_WIDTH)-1:0] __``dst; \
|
assign dst.rsp_ready = src.rsp_ready
|
||||||
if (enable) begin \
|
|
||||||
always @(posedge clk) begin \
|
`define BUFFER_DCR_BUS_IF(dst, src, enable) \
|
||||||
__``dst <= {src.write_valid, src.write_addr, src.write_data}; \
|
logic [(1 + `VX_DCR_ADDR_WIDTH + `VX_DCR_DATA_WIDTH)-1:0] __``dst; \
|
||||||
end \
|
if (enable) begin \
|
||||||
end else begin \
|
always @(posedge clk) begin \
|
||||||
assign __``dst = {src.write_valid, src.write_addr, src.write_data}; \
|
__``dst <= {src.write_valid, src.write_addr, src.write_data}; \
|
||||||
end \
|
end \
|
||||||
VX_dcr_bus_if dst(); \
|
end else begin \
|
||||||
assign {dst.write_valid, dst.write_addr, dst.write_data} = __``dst
|
assign __``dst = {src.write_valid, src.write_addr, src.write_data}; \
|
||||||
|
end \
|
||||||
`define PERF_REDUCE(dst, src, field, width, count) \
|
VX_dcr_bus_if dst(); \
|
||||||
wire [count-1:0][width-1:0] __reduce_add_i_``src``field; \
|
assign {dst.write_valid, dst.write_addr, dst.write_data} = __``dst
|
||||||
wire [width-1:0] __reduce_add_o_``dst``field; \
|
|
||||||
reg [width-1:0] __reduce_add_r_``dst``field; \
|
`define PERF_COUNTER_ADD(dst, src, field, width, dst_count, src_count, reg_enable) \
|
||||||
for (genvar __i = 0; __i < count; ++__i) begin \
|
for (genvar __d = 0; __d < dst_count; ++__d) begin \
|
||||||
assign __reduce_add_i_``src``field[__i] = ``src[__i].``field; \
|
localparam __count = ((src_count > dst_count) ? ((src_count + dst_count - 1) / dst_count) : 1); \
|
||||||
end \
|
wire [__count-1:0][width-1:0] __reduce_add_i_``src``field; \
|
||||||
VX_reduce #(.DATAW_IN(width), .N(count), .OP("+")) __reduce_add_``dst``field ( \
|
wire [width-1:0] __reduce_add_o_``dst``field; \
|
||||||
__reduce_add_i_``src``field, \
|
for (genvar __i = 0; __i < __count; ++__i) begin \
|
||||||
__reduce_add_o_``dst``field \
|
assign __reduce_add_i_``src``field[__i] = ``src[__d * __count + __i].``field; \
|
||||||
); \
|
end \
|
||||||
always @(posedge clk) begin \
|
VX_reduce #(.DATAW_IN(width), .N(__count), .OP("+")) __reduce_add_``dst``field ( \
|
||||||
if (reset) begin \
|
__reduce_add_i_``src``field, \
|
||||||
__reduce_add_r_``dst``field <= '0; \
|
__reduce_add_o_``dst``field \
|
||||||
end else begin \
|
); \
|
||||||
__reduce_add_r_``dst``field <= __reduce_add_o_``dst``field; \
|
if (reg_enable) begin \
|
||||||
end \
|
reg [width-1:0] __reduce_add_r_``dst``field; \
|
||||||
end \
|
always @(posedge clk) begin \
|
||||||
assign ``dst.``field = __reduce_add_r_``dst``field
|
if (reset) begin \
|
||||||
|
__reduce_add_r_``dst``field <= '0; \
|
||||||
`define PERF_CACHE_REDUCE(dst, src, count) \
|
end else begin \
|
||||||
`PERF_REDUCE (dst, src, reads, `PERF_CTR_BITS, count); \
|
__reduce_add_r_``dst``field <= __reduce_add_o_``dst``field; \
|
||||||
`PERF_REDUCE (dst, src, writes, `PERF_CTR_BITS, count); \
|
end \
|
||||||
`PERF_REDUCE (dst, src, read_misses, `PERF_CTR_BITS, count); \
|
end \
|
||||||
`PERF_REDUCE (dst, src, write_misses, `PERF_CTR_BITS, count); \
|
assign ``dst[__d].``field = __reduce_add_r_``dst``field; \
|
||||||
`PERF_REDUCE (dst, src, bank_stalls, `PERF_CTR_BITS, count); \
|
end else begin \
|
||||||
`PERF_REDUCE (dst, src, mshr_stalls, `PERF_CTR_BITS, count); \
|
assign ``dst[__d].``field = __reduce_add_o_``dst``field; \
|
||||||
`PERF_REDUCE (dst, src, mem_stalls, `PERF_CTR_BITS, count); \
|
end \
|
||||||
`PERF_REDUCE (dst, src, crsp_stalls, `PERF_CTR_BITS, count)
|
end
|
||||||
|
|
||||||
`define ASSIGN_BLOCKED_WID(dst, src, block_idx, block_size) \
|
`define ASSIGN_BLOCKED_WID(dst, src, block_idx, block_size) \
|
||||||
if (block_size != 1) begin \
|
if (block_size != 1) begin \
|
||||||
if (block_size != `NUM_WARPS) begin \
|
if (block_size != `NUM_WARPS) begin \
|
||||||
assign dst = {src[`NW_WIDTH-1:`CLOG2(block_size)], `CLOG2(block_size)'(block_idx)}; \
|
assign dst = {src[`NW_WIDTH-1:`CLOG2(block_size)], `CLOG2(block_size)'(block_idx)}; \
|
||||||
end else begin \
|
end else begin \
|
||||||
assign dst = `NW_WIDTH'(block_idx); \
|
assign dst = `NW_WIDTH'(block_idx); \
|
||||||
end \
|
end \
|
||||||
end else begin \
|
end else begin \
|
||||||
assign dst = src; \
|
assign dst = src; \
|
||||||
end
|
end
|
||||||
|
|
||||||
`define TO_DISPATCH_DATA(data, tid) \
|
`define TO_DISPATCH_DATA(data, tid) { \
|
||||||
{data.uuid, data.wis, data.tmask, data.op_type, data.op_mod, data.wb, data.use_PC, data.use_imm, data.PC, data.imm, data.rd, tid, data.rs1_data, data.rs2_data, data.rs3_data}
|
data.uuid, \
|
||||||
|
data.wis, \
|
||||||
///////////////////////////////////////////////////////////////////////////////
|
data.tmask, \
|
||||||
|
data.op_type, \
|
||||||
`endif // VX_DEFINE_VH
|
data.op_mod, \
|
||||||
|
data.wb, \
|
||||||
|
data.use_PC, \
|
||||||
|
data.use_imm, \
|
||||||
|
data.PC, \
|
||||||
|
data.imm, \
|
||||||
|
data.rd, \
|
||||||
|
tid, \
|
||||||
|
data.rs1_data, \
|
||||||
|
data.rs2_data, \
|
||||||
|
data.rs3_data}
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
`endif // VX_DEFINE_VH
|
||||||
|
|||||||
@@ -99,7 +99,7 @@ package VX_gpu_pkg;
|
|||||||
`ifdef ICACHE_ENABLE
|
`ifdef ICACHE_ENABLE
|
||||||
localparam ICACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_MEM_TAG_WIDTH(`ICACHE_MSHR_SIZE, 1, `NUM_ICACHES);
|
localparam ICACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_MEM_TAG_WIDTH(`ICACHE_MSHR_SIZE, 1, `NUM_ICACHES);
|
||||||
`else
|
`else
|
||||||
localparam ICACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_BYPASS_TAG_WIDTH(1, ICACHE_LINE_SIZE, ICACHE_WORD_SIZE, ICACHE_TAG_WIDTH, `NUM_SOCKETS, `NUM_ICACHES);
|
localparam ICACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_BYPASS_TAG_WIDTH(1, ICACHE_LINE_SIZE, ICACHE_WORD_SIZE, ICACHE_TAG_WIDTH, `SOCKET_SIZE, `NUM_ICACHES);
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
////////////////////////// Dcache Parameters //////////////////////////////
|
////////////////////////// Dcache Parameters //////////////////////////////
|
||||||
@@ -142,10 +142,13 @@ package VX_gpu_pkg;
|
|||||||
/////////////////////////////// L1 Parameters /////////////////////////////
|
/////////////////////////////// L1 Parameters /////////////////////////////
|
||||||
|
|
||||||
localparam L1_MEM_TAG_WIDTH = `MAX(ICACHE_MEM_TAG_WIDTH, DCACHE_MEM_TAG_WIDTH);
|
localparam L1_MEM_TAG_WIDTH = `MAX(ICACHE_MEM_TAG_WIDTH, DCACHE_MEM_TAG_WIDTH);
|
||||||
localparam L1_MEM_ARB_TAG_WIDTH = (L1_MEM_TAG_WIDTH + `CLOG2(2));
|
localparam L1_MEM_ARB_TAG_WIDTH = (L1_MEM_TAG_WIDTH + `CLOG2(2));
|
||||||
|
|
||||||
/////////////////////////////// L2 Parameters /////////////////////////////
|
/////////////////////////////// L2 Parameters /////////////////////////////
|
||||||
|
|
||||||
|
localparam ICACHE_MEM_ARB_IDX = 0;
|
||||||
|
localparam DCACHE_MEM_ARB_IDX = ICACHE_MEM_ARB_IDX + 1;
|
||||||
|
|
||||||
// Word size in bytes
|
// Word size in bytes
|
||||||
localparam L2_WORD_SIZE = `L1_LINE_SIZE;
|
localparam L2_WORD_SIZE = `L1_LINE_SIZE;
|
||||||
|
|
||||||
@@ -190,42 +193,46 @@ package VX_gpu_pkg;
|
|||||||
|
|
||||||
/////////////////////////////// Issue parameters //////////////////////////
|
/////////////////////////////// Issue parameters //////////////////////////
|
||||||
|
|
||||||
localparam ISSUE_IDX_W = `LOG2UP(`ISSUE_WIDTH);
|
localparam ISSUE_ISW = `CLOG2(`ISSUE_WIDTH);
|
||||||
|
localparam ISSUE_ISW_W = `UP(ISSUE_ISW);
|
||||||
localparam ISSUE_RATIO = `NUM_WARPS / `ISSUE_WIDTH;
|
localparam ISSUE_RATIO = `NUM_WARPS / `ISSUE_WIDTH;
|
||||||
localparam ISSUE_WIS_W = `LOG2UP(ISSUE_RATIO);
|
localparam ISSUE_WIS = `CLOG2(ISSUE_RATIO);
|
||||||
localparam ISSUE_ADDRW = `LOG2UP(`NUM_REGS * (ISSUE_RATIO));
|
localparam ISSUE_WIS_W = `UP(ISSUE_WIS);
|
||||||
|
|
||||||
`IGNORE_UNUSED_BEGIN
|
`IGNORE_UNUSED_BEGIN
|
||||||
function logic [ISSUE_IDX_W-1:0] wid_to_isw(
|
function logic [`NW_WIDTH-1:0] wis_to_wid(
|
||||||
|
input logic [ISSUE_WIS_W-1:0] wis,
|
||||||
|
input logic [ISSUE_ISW_W-1:0] isw
|
||||||
|
);
|
||||||
|
if (ISSUE_WIS == 0) begin
|
||||||
|
wis_to_wid = `NW_WIDTH'(isw);
|
||||||
|
end else if (ISSUE_ISW == 0) begin
|
||||||
|
wis_to_wid = `NW_WIDTH'(wis);
|
||||||
|
end else begin
|
||||||
|
wis_to_wid = `NW_WIDTH'({wis, isw});
|
||||||
|
end
|
||||||
|
endfunction
|
||||||
|
|
||||||
|
function logic [ISSUE_ISW_W-1:0] wid_to_isw(
|
||||||
input logic [`NW_WIDTH-1:0] wid
|
input logic [`NW_WIDTH-1:0] wid
|
||||||
);
|
);
|
||||||
if (`ISSUE_WIDTH > 1) begin
|
if (ISSUE_ISW != 0) begin
|
||||||
wid_to_isw = ISSUE_IDX_W'(wid);
|
wid_to_isw = wid[ISSUE_ISW_W-1:0];
|
||||||
end else begin
|
end else begin
|
||||||
wid_to_isw = 0;
|
wid_to_isw = 0;
|
||||||
end
|
end
|
||||||
endfunction
|
endfunction
|
||||||
`IGNORE_UNUSED_END
|
|
||||||
|
|
||||||
function logic [`NW_WIDTH-1:0] wis_to_wid(
|
|
||||||
input logic [ISSUE_WIS_W-1:0] wis,
|
|
||||||
input logic [ISSUE_IDX_W-1:0] isw
|
|
||||||
);
|
|
||||||
wis_to_wid = `NW_WIDTH'({wis, isw} >> (ISSUE_IDX_W-`CLOG2(`ISSUE_WIDTH)));
|
|
||||||
endfunction
|
|
||||||
|
|
||||||
function logic [ISSUE_WIS_W-1:0] wid_to_wis(
|
function logic [ISSUE_WIS_W-1:0] wid_to_wis(
|
||||||
input logic [`NW_WIDTH-1:0] wid
|
input logic [`NW_WIDTH-1:0] wid
|
||||||
);
|
);
|
||||||
wid_to_wis = ISSUE_WIS_W'(wid >> `CLOG2(`ISSUE_WIDTH));
|
if (ISSUE_WIS != 0) begin
|
||||||
endfunction
|
wid_to_wis = ISSUE_WIS_W'(wid >> ISSUE_ISW);
|
||||||
|
end else begin
|
||||||
function logic [ISSUE_ADDRW-1:0] wis_to_addr(
|
wid_to_wis = 0;
|
||||||
input logic [`NR_BITS-1:0] rid,
|
end
|
||||||
input logic [ISSUE_WIS_W-1:0] wis
|
|
||||||
);
|
|
||||||
wis_to_addr = ISSUE_ADDRW'({rid, wis} >> (ISSUE_WIS_W-`CLOG2(ISSUE_RATIO)));
|
|
||||||
endfunction
|
endfunction
|
||||||
|
`IGNORE_UNUSED_END
|
||||||
|
|
||||||
endpackage
|
endpackage
|
||||||
|
|
||||||
|
|||||||
@@ -14,7 +14,7 @@
|
|||||||
`ifndef VX_PLATFORM_VH
|
`ifndef VX_PLATFORM_VH
|
||||||
`define VX_PLATFORM_VH
|
`define VX_PLATFORM_VH
|
||||||
|
|
||||||
`ifndef SYNTHESIS
|
`ifdef SV_DPI
|
||||||
`include "util_dpi.vh"
|
`include "util_dpi.vh"
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
|
|||||||
@@ -65,58 +65,11 @@ module VX_socket import VX_gpu_pkg::*; #(
|
|||||||
|
|
||||||
`ifdef PERF_ENABLE
|
`ifdef PERF_ENABLE
|
||||||
VX_mem_perf_if mem_perf_tmp_if();
|
VX_mem_perf_if mem_perf_tmp_if();
|
||||||
cache_perf_t perf_icache;
|
|
||||||
cache_perf_t perf_dcache;
|
|
||||||
|
|
||||||
assign mem_perf_tmp_if.icache = perf_icache;
|
|
||||||
assign mem_perf_tmp_if.dcache = perf_dcache;
|
|
||||||
assign mem_perf_tmp_if.l2cache = mem_perf_if.l2cache;
|
assign mem_perf_tmp_if.l2cache = mem_perf_if.l2cache;
|
||||||
assign mem_perf_tmp_if.l3cache = mem_perf_if.l3cache;
|
assign mem_perf_tmp_if.l3cache = mem_perf_if.l3cache;
|
||||||
assign mem_perf_tmp_if.smem = 'x;
|
assign mem_perf_tmp_if.smem = 'x;
|
||||||
assign mem_perf_tmp_if.mem = mem_perf_if.mem;
|
assign mem_perf_tmp_if.mem = mem_perf_if.mem;
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
VX_mem_bus_if #(
|
|
||||||
.DATA_SIZE (ICACHE_LINE_SIZE),
|
|
||||||
.TAG_WIDTH (ICACHE_MEM_TAG_WIDTH)
|
|
||||||
) icache_mem_bus_if();
|
|
||||||
|
|
||||||
VX_mem_bus_if #(
|
|
||||||
.DATA_SIZE (DCACHE_LINE_SIZE),
|
|
||||||
.TAG_WIDTH (DCACHE_MEM_TAG_WIDTH)
|
|
||||||
) dcache_mem_bus_if();
|
|
||||||
|
|
||||||
VX_mem_bus_if #(
|
|
||||||
.DATA_SIZE (`L1_LINE_SIZE),
|
|
||||||
.TAG_WIDTH (L1_MEM_TAG_WIDTH)
|
|
||||||
) cache_mem_bus_if[2]();
|
|
||||||
|
|
||||||
VX_mem_bus_if #(
|
|
||||||
.DATA_SIZE (`L1_LINE_SIZE),
|
|
||||||
.TAG_WIDTH (L1_MEM_ARB_TAG_WIDTH)
|
|
||||||
) mem_bus_tmp_if[1]();
|
|
||||||
|
|
||||||
`ASSIGN_VX_MEM_BUS_IF_X (cache_mem_bus_if[0], icache_mem_bus_if, L1_MEM_TAG_WIDTH, ICACHE_MEM_TAG_WIDTH);
|
|
||||||
`ASSIGN_VX_MEM_BUS_IF_X (cache_mem_bus_if[1], dcache_mem_bus_if, L1_MEM_TAG_WIDTH, DCACHE_MEM_TAG_WIDTH);
|
|
||||||
|
|
||||||
`RESET_RELAY (mem_arb_reset, reset);
|
|
||||||
|
|
||||||
VX_mem_arb #(
|
|
||||||
.NUM_INPUTS (2),
|
|
||||||
.DATA_SIZE (`L1_LINE_SIZE),
|
|
||||||
.TAG_WIDTH (L1_MEM_TAG_WIDTH),
|
|
||||||
.TAG_SEL_IDX (1), // Skip 0 for NC flag
|
|
||||||
.ARBITER ("R"),
|
|
||||||
.OUT_REG_REQ (2),
|
|
||||||
.OUT_REG_RSP (2)
|
|
||||||
) mem_arb (
|
|
||||||
.clk (clk),
|
|
||||||
.reset (mem_arb_reset),
|
|
||||||
.bus_in_if (cache_mem_bus_if),
|
|
||||||
.bus_out_if (mem_bus_tmp_if)
|
|
||||||
);
|
|
||||||
|
|
||||||
`ASSIGN_VX_MEM_BUS_IF (mem_bus_if, mem_bus_tmp_if[0]);
|
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
@@ -125,6 +78,11 @@ module VX_socket import VX_gpu_pkg::*; #(
|
|||||||
.TAG_WIDTH (ICACHE_TAG_WIDTH)
|
.TAG_WIDTH (ICACHE_TAG_WIDTH)
|
||||||
) per_core_icache_bus_if[`SOCKET_SIZE]();
|
) per_core_icache_bus_if[`SOCKET_SIZE]();
|
||||||
|
|
||||||
|
VX_mem_bus_if #(
|
||||||
|
.DATA_SIZE (ICACHE_LINE_SIZE),
|
||||||
|
.TAG_WIDTH (ICACHE_MEM_TAG_WIDTH)
|
||||||
|
) icache_mem_bus_if();
|
||||||
|
|
||||||
`RESET_RELAY (icache_reset, reset);
|
`RESET_RELAY (icache_reset, reset);
|
||||||
|
|
||||||
VX_cache_cluster #(
|
VX_cache_cluster #(
|
||||||
@@ -149,7 +107,7 @@ module VX_socket import VX_gpu_pkg::*; #(
|
|||||||
.MEM_OUT_REG (2)
|
.MEM_OUT_REG (2)
|
||||||
) icache (
|
) icache (
|
||||||
`ifdef PERF_ENABLE
|
`ifdef PERF_ENABLE
|
||||||
.cache_perf (perf_icache),
|
.cache_perf (mem_perf_tmp_if.icache),
|
||||||
`endif
|
`endif
|
||||||
.clk (clk),
|
.clk (clk),
|
||||||
.reset (icache_reset),
|
.reset (icache_reset),
|
||||||
@@ -160,9 +118,14 @@ module VX_socket import VX_gpu_pkg::*; #(
|
|||||||
///////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
VX_mem_bus_if #(
|
VX_mem_bus_if #(
|
||||||
.DATA_SIZE (DCACHE_WORD_SIZE),
|
.DATA_SIZE (DCACHE_WORD_SIZE),
|
||||||
.TAG_WIDTH (DCACHE_NOSM_TAG_WIDTH)
|
.TAG_WIDTH (DCACHE_NOSM_TAG_WIDTH)
|
||||||
) per_core_dcache_bus_if[`SOCKET_SIZE * DCACHE_NUM_REQS]();
|
) per_core_dcache_bus_if[`SOCKET_SIZE * DCACHE_NUM_REQS]();
|
||||||
|
|
||||||
|
VX_mem_bus_if #(
|
||||||
|
.DATA_SIZE (DCACHE_LINE_SIZE),
|
||||||
|
.TAG_WIDTH (DCACHE_MEM_TAG_WIDTH)
|
||||||
|
) dcache_mem_bus_if();
|
||||||
|
|
||||||
`RESET_RELAY (dcache_reset, reset);
|
`RESET_RELAY (dcache_reset, reset);
|
||||||
|
|
||||||
@@ -189,7 +152,7 @@ module VX_socket import VX_gpu_pkg::*; #(
|
|||||||
.MEM_OUT_REG (2)
|
.MEM_OUT_REG (2)
|
||||||
) dcache (
|
) dcache (
|
||||||
`ifdef PERF_ENABLE
|
`ifdef PERF_ENABLE
|
||||||
.cache_perf (perf_dcache),
|
.cache_perf (mem_perf_tmp_if.dcache),
|
||||||
`endif
|
`endif
|
||||||
.clk (clk),
|
.clk (clk),
|
||||||
.reset (dcache_reset),
|
.reset (dcache_reset),
|
||||||
@@ -197,6 +160,40 @@ module VX_socket import VX_gpu_pkg::*; #(
|
|||||||
.mem_bus_if (dcache_mem_bus_if)
|
.mem_bus_if (dcache_mem_bus_if)
|
||||||
);
|
);
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
VX_mem_bus_if #(
|
||||||
|
.DATA_SIZE (`L1_LINE_SIZE),
|
||||||
|
.TAG_WIDTH (L1_MEM_TAG_WIDTH)
|
||||||
|
) l1_mem_bus_if[2]();
|
||||||
|
|
||||||
|
VX_mem_bus_if #(
|
||||||
|
.DATA_SIZE (`L1_LINE_SIZE),
|
||||||
|
.TAG_WIDTH (L1_MEM_ARB_TAG_WIDTH)
|
||||||
|
) l1_mem_arb_bus_if[1]();
|
||||||
|
|
||||||
|
`ASSIGN_VX_MEM_BUS_IF_X (l1_mem_bus_if[0], icache_mem_bus_if, L1_MEM_TAG_WIDTH, ICACHE_MEM_TAG_WIDTH);
|
||||||
|
`ASSIGN_VX_MEM_BUS_IF_X (l1_mem_bus_if[1], dcache_mem_bus_if, L1_MEM_TAG_WIDTH, DCACHE_MEM_TAG_WIDTH);
|
||||||
|
|
||||||
|
`RESET_RELAY (mem_arb_reset, reset);
|
||||||
|
|
||||||
|
VX_mem_arb #(
|
||||||
|
.NUM_INPUTS (2),
|
||||||
|
.DATA_SIZE (`L1_LINE_SIZE),
|
||||||
|
.TAG_WIDTH (L1_MEM_TAG_WIDTH),
|
||||||
|
.TAG_SEL_IDX (1), // Skip 0 for NC flag
|
||||||
|
.ARBITER ("R"),
|
||||||
|
.OUT_REG_REQ (2),
|
||||||
|
.OUT_REG_RSP (2)
|
||||||
|
) mem_arb (
|
||||||
|
.clk (clk),
|
||||||
|
.reset (mem_arb_reset),
|
||||||
|
.bus_in_if (l1_mem_bus_if),
|
||||||
|
.bus_out_if (l1_mem_arb_bus_if)
|
||||||
|
);
|
||||||
|
|
||||||
|
`ASSIGN_VX_MEM_BUS_IF (mem_bus_if, l1_mem_arb_bus_if[0]);
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
wire [`SOCKET_SIZE-1:0] per_core_sim_ebreak;
|
wire [`SOCKET_SIZE-1:0] per_core_sim_ebreak;
|
||||||
@@ -245,6 +242,6 @@ module VX_socket import VX_gpu_pkg::*; #(
|
|||||||
);
|
);
|
||||||
end
|
end
|
||||||
|
|
||||||
`BUFFER_BUSY (busy, (| per_core_busy), (`SOCKET_SIZE > 1));
|
`BUFFER_EX(busy, (| per_core_busy), 1'b1, (`SOCKET_SIZE > 1));
|
||||||
|
|
||||||
endmodule
|
endmodule
|
||||||
|
|||||||
@@ -58,6 +58,8 @@
|
|||||||
|
|
||||||
`define VX_CSR_MPM_BASE 12'hB00
|
`define VX_CSR_MPM_BASE 12'hB00
|
||||||
`define VX_CSR_MPM_BASE_H 12'hB80
|
`define VX_CSR_MPM_BASE_H 12'hB80
|
||||||
|
`define VX_CSR_MPM_USER 12'hB03
|
||||||
|
`define VX_CSR_MPM_USER_H 12'hB83
|
||||||
|
|
||||||
// Machine Performance-monitoring core counters
|
// Machine Performance-monitoring core counters
|
||||||
// PERF: Standard
|
// PERF: Standard
|
||||||
@@ -68,29 +70,38 @@
|
|||||||
`define VX_CSR_MINSTRET 12'hB02
|
`define VX_CSR_MINSTRET 12'hB02
|
||||||
`define VX_CSR_MINSTRET_H 12'hB82
|
`define VX_CSR_MINSTRET_H 12'hB82
|
||||||
// PERF: pipeline
|
// PERF: pipeline
|
||||||
`define VX_CSR_MPM_IBUF_ST 12'hB03
|
`define VX_CSR_MPM_SCHED_ID 12'hB03
|
||||||
`define VX_CSR_MPM_IBUF_ST_H 12'hB83
|
`define VX_CSR_MPM_SCHED_ID_H 12'hB83
|
||||||
`define VX_CSR_MPM_SCRB_ST 12'hB04
|
`define VX_CSR_MPM_SCHED_ST 12'hB04
|
||||||
`define VX_CSR_MPM_SCRB_ST_H 12'hB84
|
`define VX_CSR_MPM_SCHED_ST_H 12'hB84
|
||||||
`define VX_CSR_MPM_ALU_ST 12'hB05
|
`define VX_CSR_MPM_IBUF_ST 12'hB05
|
||||||
`define VX_CSR_MPM_ALU_ST_H 12'hB85
|
`define VX_CSR_MPM_IBUF_ST_H 12'hB85
|
||||||
`define VX_CSR_MPM_LSU_ST 12'hB06
|
`define VX_CSR_MPM_SCRB_ST 12'hB06
|
||||||
`define VX_CSR_MPM_LSU_ST_H 12'hB86
|
`define VX_CSR_MPM_SCRB_ST_H 12'hB86
|
||||||
`define VX_CSR_MPM_FPU_ST 12'hB07
|
`define VX_CSR_MPM_SCRB_ALU 12'hB07
|
||||||
`define VX_CSR_MPM_FPU_ST_H 12'hB87
|
`define VX_CSR_MPM_SCRB_ALU_H 12'hB87
|
||||||
`define VX_CSR_MPM_SFU_ST 12'hB08
|
`define VX_CSR_MPM_SCRB_FPU 12'hB08
|
||||||
`define VX_CSR_MPM_SFU_ST_H 12'hB88
|
`define VX_CSR_MPM_SCRB_FPU_H 12'hB88
|
||||||
|
`define VX_CSR_MPM_SCRB_LSU 12'hB09
|
||||||
|
`define VX_CSR_MPM_SCRB_LSU_H 12'hB89
|
||||||
|
`define VX_CSR_MPM_SCRB_SFU 12'hB0A
|
||||||
|
`define VX_CSR_MPM_SCRB_SFU_H 12'hB8A
|
||||||
// PERF: memory
|
// PERF: memory
|
||||||
`define VX_CSR_MPM_IFETCHES 12'hB0A
|
`define VX_CSR_MPM_IFETCHES 12'hB0B
|
||||||
`define VX_CSR_MPM_IFETCHES_H 12'hB8A
|
`define VX_CSR_MPM_IFETCHES_H 12'hB8B
|
||||||
`define VX_CSR_MPM_LOADS 12'hB0B
|
`define VX_CSR_MPM_LOADS 12'hB0C
|
||||||
`define VX_CSR_MPM_LOADS_H 12'hB8B
|
`define VX_CSR_MPM_LOADS_H 12'hB8C
|
||||||
`define VX_CSR_MPM_STORES 12'hB0C
|
`define VX_CSR_MPM_STORES 12'hB0D
|
||||||
`define VX_CSR_MPM_STORES_H 12'hB8C
|
`define VX_CSR_MPM_STORES_H 12'hB8D
|
||||||
`define VX_CSR_MPM_IFETCH_LAT 12'hB0D
|
`define VX_CSR_MPM_IFETCH_LT 12'hB0E
|
||||||
`define VX_CSR_MPM_IFETCH_LAT_H 12'hB8D
|
`define VX_CSR_MPM_IFETCH_LT_H 12'hB8E
|
||||||
`define VX_CSR_MPM_LOAD_LAT 12'hB0E
|
`define VX_CSR_MPM_LOAD_LT 12'hB0F
|
||||||
`define VX_CSR_MPM_LOAD_LAT_H 12'hB8E
|
`define VX_CSR_MPM_LOAD_LT_H 12'hB8F
|
||||||
|
// SFU: scoreboard
|
||||||
|
`define VX_CSR_MPM_SCRB_WCTL 12'hB10
|
||||||
|
`define VX_CSR_MPM_SCRB_WCTL_H 12'hB90
|
||||||
|
`define VX_CSR_MPM_SCRB_CSRS 12'hB11
|
||||||
|
`define VX_CSR_MPM_SCRB_CSRS_H 12'hB91
|
||||||
|
|
||||||
// Machine Performance-monitoring memory counters
|
// Machine Performance-monitoring memory counters
|
||||||
// PERF: icache
|
// PERF: icache
|
||||||
@@ -98,59 +109,61 @@
|
|||||||
`define VX_CSR_MPM_ICACHE_READS_H 12'hB83
|
`define VX_CSR_MPM_ICACHE_READS_H 12'hB83
|
||||||
`define VX_CSR_MPM_ICACHE_MISS_R 12'hB04 // read misses
|
`define VX_CSR_MPM_ICACHE_MISS_R 12'hB04 // read misses
|
||||||
`define VX_CSR_MPM_ICACHE_MISS_R_H 12'hB84
|
`define VX_CSR_MPM_ICACHE_MISS_R_H 12'hB84
|
||||||
|
`define VX_CSR_MPM_ICACHE_MSHR_ST 12'hB05 // MSHR stalls
|
||||||
|
`define VX_CSR_MPM_ICACHE_MSHR_ST_H 12'hB85
|
||||||
// PERF: dcache
|
// PERF: dcache
|
||||||
`define VX_CSR_MPM_DCACHE_READS 12'hB05 // total reads
|
`define VX_CSR_MPM_DCACHE_READS 12'hB06 // total reads
|
||||||
`define VX_CSR_MPM_DCACHE_READS_H 12'hB85
|
`define VX_CSR_MPM_DCACHE_READS_H 12'hB86
|
||||||
`define VX_CSR_MPM_DCACHE_WRITES 12'hB06 // total writes
|
`define VX_CSR_MPM_DCACHE_WRITES 12'hB07 // total writes
|
||||||
`define VX_CSR_MPM_DCACHE_WRITES_H 12'hB86
|
`define VX_CSR_MPM_DCACHE_WRITES_H 12'hB87
|
||||||
`define VX_CSR_MPM_DCACHE_MISS_R 12'hB07 // read misses
|
`define VX_CSR_MPM_DCACHE_MISS_R 12'hB08 // read misses
|
||||||
`define VX_CSR_MPM_DCACHE_MISS_R_H 12'hB87
|
`define VX_CSR_MPM_DCACHE_MISS_R_H 12'hB88
|
||||||
`define VX_CSR_MPM_DCACHE_MISS_W 12'hB08 // write misses
|
`define VX_CSR_MPM_DCACHE_MISS_W 12'hB09 // write misses
|
||||||
`define VX_CSR_MPM_DCACHE_MISS_W_H 12'hB88
|
`define VX_CSR_MPM_DCACHE_MISS_W_H 12'hB89
|
||||||
`define VX_CSR_MPM_DCACHE_BANK_ST 12'hB09 // bank conflicts
|
`define VX_CSR_MPM_DCACHE_BANK_ST 12'hB0A // bank conflicts
|
||||||
`define VX_CSR_MPM_DCACHE_BANK_ST_H 12'hB89
|
`define VX_CSR_MPM_DCACHE_BANK_ST_H 12'hB8A
|
||||||
`define VX_CSR_MPM_DCACHE_MSHR_ST 12'hB0A // MSHR stalls
|
`define VX_CSR_MPM_DCACHE_MSHR_ST 12'hB0B // MSHR stalls
|
||||||
`define VX_CSR_MPM_DCACHE_MSHR_ST_H 12'hB8A
|
`define VX_CSR_MPM_DCACHE_MSHR_ST_H 12'hB8B
|
||||||
// PERF: smem
|
|
||||||
`define VX_CSR_MPM_SMEM_READS 12'hB0B // memory reads
|
|
||||||
`define VX_CSR_MPM_SMEM_READS_H 12'hB8B
|
|
||||||
`define VX_CSR_MPM_SMEM_WRITES 12'hB0C // memory writes
|
|
||||||
`define VX_CSR_MPM_SMEM_WRITES_H 12'hB8C
|
|
||||||
`define VX_CSR_MPM_SMEM_BANK_ST 12'hB0D // bank conflicts
|
|
||||||
`define VX_CSR_MPM_SMEM_BANK_ST_H 12'hB8D
|
|
||||||
// PERF: l2cache
|
// PERF: l2cache
|
||||||
`define VX_CSR_MPM_L2CACHE_READS 12'hB0E // total reads
|
`define VX_CSR_MPM_L2CACHE_READS 12'hB0C // total reads
|
||||||
`define VX_CSR_MPM_L2CACHE_READS_H 12'hB8E
|
`define VX_CSR_MPM_L2CACHE_READS_H 12'hB8C
|
||||||
`define VX_CSR_MPM_L2CACHE_WRITES 12'hB0F // total writes
|
`define VX_CSR_MPM_L2CACHE_WRITES 12'hB0D // total writes
|
||||||
`define VX_CSR_MPM_L2CACHE_WRITES_H 12'hB8F
|
`define VX_CSR_MPM_L2CACHE_WRITES_H 12'hB8D
|
||||||
`define VX_CSR_MPM_L2CACHE_MISS_R 12'hB10 // read misses
|
`define VX_CSR_MPM_L2CACHE_MISS_R 12'hB0E // read misses
|
||||||
`define VX_CSR_MPM_L2CACHE_MISS_R_H 12'hB90
|
`define VX_CSR_MPM_L2CACHE_MISS_R_H 12'hB8E
|
||||||
`define VX_CSR_MPM_L2CACHE_MISS_W 12'hB11 // write misses
|
`define VX_CSR_MPM_L2CACHE_MISS_W 12'hB0F // write misses
|
||||||
`define VX_CSR_MPM_L2CACHE_MISS_W_H 12'hB91
|
`define VX_CSR_MPM_L2CACHE_MISS_W_H 12'hB8F
|
||||||
`define VX_CSR_MPM_L2CACHE_BANK_ST 12'hB12 // bank conflicts
|
`define VX_CSR_MPM_L2CACHE_BANK_ST 12'hB10 // bank conflicts
|
||||||
`define VX_CSR_MPM_L2CACHE_BANK_ST_H 12'hB92
|
`define VX_CSR_MPM_L2CACHE_BANK_ST_H 12'hB90
|
||||||
`define VX_CSR_MPM_L2CACHE_MSHR_ST 12'hB13 // MSHR stalls
|
`define VX_CSR_MPM_L2CACHE_MSHR_ST 12'hB11 // MSHR stalls
|
||||||
`define VX_CSR_MPM_L2CACHE_MSHR_ST_H 12'hB93
|
`define VX_CSR_MPM_L2CACHE_MSHR_ST_H 12'hB91
|
||||||
// PERF: l3cache
|
// PERF: l3cache
|
||||||
`define VX_CSR_MPM_L3CACHE_READS 12'hB14 // total reads
|
`define VX_CSR_MPM_L3CACHE_READS 12'hB12 // total reads
|
||||||
`define VX_CSR_MPM_L3CACHE_READS_H 12'hB94
|
`define VX_CSR_MPM_L3CACHE_READS_H 12'hB92
|
||||||
`define VX_CSR_MPM_L3CACHE_WRITES 12'hB15 // total writes
|
`define VX_CSR_MPM_L3CACHE_WRITES 12'hB13 // total writes
|
||||||
`define VX_CSR_MPM_L3CACHE_WRITES_H 12'hB95
|
`define VX_CSR_MPM_L3CACHE_WRITES_H 12'hB93
|
||||||
`define VX_CSR_MPM_L3CACHE_MISS_R 12'hB16 // read misses
|
`define VX_CSR_MPM_L3CACHE_MISS_R 12'hB14 // read misses
|
||||||
`define VX_CSR_MPM_L3CACHE_MISS_R_H 12'hB96
|
`define VX_CSR_MPM_L3CACHE_MISS_R_H 12'hB94
|
||||||
`define VX_CSR_MPM_L3CACHE_MISS_W 12'hB17 // write misses
|
`define VX_CSR_MPM_L3CACHE_MISS_W 12'hB15 // write misses
|
||||||
`define VX_CSR_MPM_L3CACHE_MISS_W_H 12'hB97
|
`define VX_CSR_MPM_L3CACHE_MISS_W_H 12'hB95
|
||||||
`define VX_CSR_MPM_L3CACHE_BANK_ST 12'hB18 // bank conflicts
|
`define VX_CSR_MPM_L3CACHE_BANK_ST 12'hB16 // bank conflicts
|
||||||
`define VX_CSR_MPM_L3CACHE_BANK_ST_H 12'hB98
|
`define VX_CSR_MPM_L3CACHE_BANK_ST_H 12'hB96
|
||||||
`define VX_CSR_MPM_L3CACHE_MSHR_ST 12'hB19 // MSHR stalls
|
`define VX_CSR_MPM_L3CACHE_MSHR_ST 12'hB17 // MSHR stalls
|
||||||
`define VX_CSR_MPM_L3CACHE_MSHR_ST_H 12'hB99
|
`define VX_CSR_MPM_L3CACHE_MSHR_ST_H 12'hB97
|
||||||
// PERF: memory
|
// PERF: memory
|
||||||
`define VX_CSR_MPM_MEM_READS 12'hB1A // total reads
|
`define VX_CSR_MPM_MEM_READS 12'hB18 // total reads
|
||||||
`define VX_CSR_MPM_MEM_READS_H 12'hB9A
|
`define VX_CSR_MPM_MEM_READS_H 12'hB98
|
||||||
`define VX_CSR_MPM_MEM_WRITES 12'hB1B // total writes
|
`define VX_CSR_MPM_MEM_WRITES 12'hB19 // total writes
|
||||||
`define VX_CSR_MPM_MEM_WRITES_H 12'hB9B
|
`define VX_CSR_MPM_MEM_WRITES_H 12'hB99
|
||||||
`define VX_CSR_MPM_MEM_LAT 12'hB1C // memory latency
|
`define VX_CSR_MPM_MEM_LT 12'hB1A // memory latency
|
||||||
`define VX_CSR_MPM_MEM_LAT_H 12'hB9C
|
`define VX_CSR_MPM_MEM_LT_H 12'hB9A
|
||||||
|
// PERF: smem
|
||||||
|
`define VX_CSR_MPM_SMEM_READS 12'hB1B // memory reads
|
||||||
|
`define VX_CSR_MPM_SMEM_READS_H 12'hB9B
|
||||||
|
`define VX_CSR_MPM_SMEM_WRITES 12'hB1C // memory writes
|
||||||
|
`define VX_CSR_MPM_SMEM_WRITES_H 12'hB9C
|
||||||
|
`define VX_CSR_MPM_SMEM_BANK_ST 12'hB1D // bank conflicts
|
||||||
|
`define VX_CSR_MPM_SMEM_BANK_ST_H 12'hB9D
|
||||||
|
|
||||||
// Machine Information Registers
|
// Machine Information Registers
|
||||||
|
|
||||||
|
|||||||
@@ -22,15 +22,15 @@ module Vortex import VX_gpu_pkg::*; (
|
|||||||
|
|
||||||
// Memory request
|
// Memory request
|
||||||
output wire mem_req_valid,
|
output wire mem_req_valid,
|
||||||
output wire mem_req_rw,
|
output wire mem_req_rw,
|
||||||
output wire [`VX_MEM_BYTEEN_WIDTH-1:0] mem_req_byteen,
|
output wire [`VX_MEM_BYTEEN_WIDTH-1:0] mem_req_byteen,
|
||||||
output wire [`VX_MEM_ADDR_WIDTH-1:0] mem_req_addr,
|
output wire [`VX_MEM_ADDR_WIDTH-1:0] mem_req_addr,
|
||||||
output wire [`VX_MEM_DATA_WIDTH-1:0] mem_req_data,
|
output wire [`VX_MEM_DATA_WIDTH-1:0] mem_req_data,
|
||||||
output wire [`VX_MEM_TAG_WIDTH-1:0] mem_req_tag,
|
output wire [`VX_MEM_TAG_WIDTH-1:0] mem_req_tag,
|
||||||
input wire mem_req_ready,
|
input wire mem_req_ready,
|
||||||
|
|
||||||
// Memory response
|
// Memory response
|
||||||
input wire mem_rsp_valid,
|
input wire mem_rsp_valid,
|
||||||
input wire [`VX_MEM_DATA_WIDTH-1:0] mem_rsp_data,
|
input wire [`VX_MEM_DATA_WIDTH-1:0] mem_rsp_data,
|
||||||
input wire [`VX_MEM_TAG_WIDTH-1:0] mem_rsp_tag,
|
input wire [`VX_MEM_TAG_WIDTH-1:0] mem_rsp_tag,
|
||||||
output wire mem_rsp_ready,
|
output wire mem_rsp_ready,
|
||||||
@@ -45,17 +45,11 @@ module Vortex import VX_gpu_pkg::*; (
|
|||||||
);
|
);
|
||||||
|
|
||||||
`ifdef PERF_ENABLE
|
`ifdef PERF_ENABLE
|
||||||
VX_mem_perf_if mem_perf_if();
|
VX_mem_perf_if mem_perf_if();
|
||||||
cache_perf_t perf_l3cache;
|
assign mem_perf_if.icache = 'x;
|
||||||
mem_perf_t mem_perf;
|
assign mem_perf_if.dcache = 'x;
|
||||||
|
|
||||||
assign mem_perf_if.icache = 'x;
|
|
||||||
assign mem_perf_if.dcache = 'x;
|
|
||||||
assign mem_perf_if.l2cache = 'x;
|
assign mem_perf_if.l2cache = 'x;
|
||||||
assign mem_perf_if.l3cache = perf_l3cache;
|
`endif
|
||||||
assign mem_perf_if.smem = 'x;
|
|
||||||
assign mem_perf_if.mem = mem_perf;
|
|
||||||
`endif
|
|
||||||
|
|
||||||
VX_mem_bus_if #(
|
VX_mem_bus_if #(
|
||||||
.DATA_SIZE (`L2_LINE_SIZE),
|
.DATA_SIZE (`L2_LINE_SIZE),
|
||||||
@@ -93,7 +87,7 @@ module Vortex import VX_gpu_pkg::*; (
|
|||||||
.reset (l3_reset),
|
.reset (l3_reset),
|
||||||
|
|
||||||
`ifdef PERF_ENABLE
|
`ifdef PERF_ENABLE
|
||||||
.cache_perf (perf_l3cache),
|
.cache_perf (mem_perf_if.l3cache),
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
.core_bus_if (per_cluster_mem_bus_if),
|
.core_bus_if (per_cluster_mem_bus_if),
|
||||||
@@ -166,11 +160,12 @@ module Vortex import VX_gpu_pkg::*; (
|
|||||||
);
|
);
|
||||||
end
|
end
|
||||||
|
|
||||||
`BUFFER_BUSY (busy, (| per_cluster_busy), (`NUM_CLUSTERS > 1));
|
`BUFFER_EX(busy, (| per_cluster_busy), 1'b1, (`NUM_CLUSTERS > 1));
|
||||||
|
|
||||||
`ifdef PERF_ENABLE
|
`ifdef PERF_ENABLE
|
||||||
|
|
||||||
reg [`PERF_CTR_BITS-1:0] perf_mem_pending_reads;
|
reg [`PERF_CTR_BITS-1:0] perf_mem_pending_reads;
|
||||||
|
mem_perf_t mem_perf;
|
||||||
|
|
||||||
always @(posedge clk) begin
|
always @(posedge clk) begin
|
||||||
if (reset) begin
|
if (reset) begin
|
||||||
@@ -181,19 +176,19 @@ module Vortex import VX_gpu_pkg::*; (
|
|||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
wire mem_rd_req_fire = mem_req_fire && ~mem_bus_if.req_data.rw;
|
||||||
|
wire mem_wr_req_fire = mem_req_fire && mem_bus_if.req_data.rw;
|
||||||
|
|
||||||
always @(posedge clk) begin
|
always @(posedge clk) begin
|
||||||
if (reset) begin
|
if (reset) begin
|
||||||
mem_perf <= '0;
|
mem_perf <= '0;
|
||||||
end else begin
|
end else begin
|
||||||
if (mem_req_fire && ~mem_bus_if.req_data.rw) begin
|
mem_perf.reads <= mem_perf.reads + `PERF_CTR_BITS'(mem_rd_req_fire);
|
||||||
mem_perf.reads <= mem_perf.reads + `PERF_CTR_BITS'(1);
|
mem_perf.writes <= mem_perf.writes + `PERF_CTR_BITS'(mem_wr_req_fire);
|
||||||
end
|
|
||||||
if (mem_req_fire && mem_bus_if.req_data.rw) begin
|
|
||||||
mem_perf.writes <= mem_perf.writes + `PERF_CTR_BITS'(1);
|
|
||||||
end
|
|
||||||
mem_perf.latency <= mem_perf.latency + perf_mem_pending_reads;
|
mem_perf.latency <= mem_perf.latency + perf_mem_pending_reads;
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
assign mem_perf_if.mem = mem_perf;
|
||||||
|
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
|
|||||||
@@ -262,7 +262,7 @@ module VX_afu_wrap #(
|
|||||||
.m_axi_awready (m_axi_mem_awready_a),
|
.m_axi_awready (m_axi_mem_awready_a),
|
||||||
.m_axi_awaddr (m_axi_mem_awaddr_w),
|
.m_axi_awaddr (m_axi_mem_awaddr_w),
|
||||||
.m_axi_awid (m_axi_mem_awid_a),
|
.m_axi_awid (m_axi_mem_awid_a),
|
||||||
`UNUSED_PIN (m_axi_awlen),
|
.m_axi_awlen (m_axi_mem_awlen_a),
|
||||||
`UNUSED_PIN (m_axi_awsize),
|
`UNUSED_PIN (m_axi_awsize),
|
||||||
`UNUSED_PIN (m_axi_awburst),
|
`UNUSED_PIN (m_axi_awburst),
|
||||||
`UNUSED_PIN (m_axi_awlock),
|
`UNUSED_PIN (m_axi_awlock),
|
||||||
|
|||||||
11
hw/rtl/cache/VX_cache.sv
vendored
11
hw/rtl/cache/VX_cache.sv
vendored
@@ -530,14 +530,17 @@ module VX_cache import VX_gpu_pkg::*; #(
|
|||||||
wire [`CLOG2(NUM_REQS+1)-1:0] perf_core_reads_per_cycle;
|
wire [`CLOG2(NUM_REQS+1)-1:0] perf_core_reads_per_cycle;
|
||||||
wire [`CLOG2(NUM_REQS+1)-1:0] perf_core_writes_per_cycle;
|
wire [`CLOG2(NUM_REQS+1)-1:0] perf_core_writes_per_cycle;
|
||||||
|
|
||||||
wire [NUM_REQS-1:0] perf_core_reads_per_req = core_req_valid & core_req_ready & ~core_req_rw;
|
wire [NUM_REQS-1:0] perf_core_reads_per_req;
|
||||||
wire [NUM_REQS-1:0] perf_core_writes_per_req = core_req_valid & core_req_ready & core_req_rw;
|
wire [NUM_REQS-1:0] perf_core_writes_per_req;
|
||||||
|
|
||||||
// per cycle: read misses, write misses, msrq stalls, pipeline stalls
|
// per cycle: read misses, write misses, msrq stalls, pipeline stalls
|
||||||
wire [`CLOG2(NUM_BANKS+1)-1:0] perf_read_miss_per_cycle;
|
wire [`CLOG2(NUM_BANKS+1)-1:0] perf_read_miss_per_cycle;
|
||||||
wire [`CLOG2(NUM_BANKS+1)-1:0] perf_write_miss_per_cycle;
|
wire [`CLOG2(NUM_BANKS+1)-1:0] perf_write_miss_per_cycle;
|
||||||
wire [`CLOG2(NUM_BANKS+1)-1:0] perf_mshr_stall_per_cycle;
|
wire [`CLOG2(NUM_BANKS+1)-1:0] perf_mshr_stall_per_cycle;
|
||||||
wire [`CLOG2(NUM_BANKS+1)-1:0] perf_crsp_stall_per_cycle;
|
wire [`CLOG2(NUM_REQS+1)-1:0] perf_crsp_stall_per_cycle;
|
||||||
|
|
||||||
|
`BUFFER(perf_core_reads_per_req, core_req_valid & core_req_ready & ~core_req_rw);
|
||||||
|
`BUFFER(perf_core_writes_per_req, core_req_valid & core_req_ready & core_req_rw);
|
||||||
|
|
||||||
`POP_COUNT(perf_core_reads_per_cycle, perf_core_reads_per_req);
|
`POP_COUNT(perf_core_reads_per_cycle, perf_core_reads_per_req);
|
||||||
`POP_COUNT(perf_core_writes_per_cycle, perf_core_writes_per_req);
|
`POP_COUNT(perf_core_writes_per_cycle, perf_core_writes_per_req);
|
||||||
@@ -560,7 +563,7 @@ module VX_cache import VX_gpu_pkg::*; #(
|
|||||||
reg [`PERF_CTR_BITS-1:0] perf_write_misses;
|
reg [`PERF_CTR_BITS-1:0] perf_write_misses;
|
||||||
reg [`PERF_CTR_BITS-1:0] perf_mshr_stalls;
|
reg [`PERF_CTR_BITS-1:0] perf_mshr_stalls;
|
||||||
reg [`PERF_CTR_BITS-1:0] perf_mem_stalls;
|
reg [`PERF_CTR_BITS-1:0] perf_mem_stalls;
|
||||||
reg [`PERF_CTR_BITS-1:0] perf_crsp_stalls;
|
reg [`PERF_CTR_BITS-1:0] perf_crsp_stalls;
|
||||||
|
|
||||||
always @(posedge clk) begin
|
always @(posedge clk) begin
|
||||||
if (reset) begin
|
if (reset) begin
|
||||||
|
|||||||
12
hw/rtl/cache/VX_cache_bypass.sv
vendored
12
hw/rtl/cache/VX_cache_bypass.sv
vendored
@@ -130,20 +130,20 @@ module VX_cache_bypass #(
|
|||||||
|
|
||||||
assign core_req_valid_in_nc = core_req_valid_in & core_req_nc_idxs;
|
assign core_req_valid_in_nc = core_req_valid_in & core_req_nc_idxs;
|
||||||
|
|
||||||
wire core_req_in_fire = | (core_req_valid_in & core_req_ready_in);
|
wire core_req_nc_ready = ~mem_req_valid_in && mem_req_ready_out;
|
||||||
|
|
||||||
VX_generic_arbiter #(
|
VX_generic_arbiter #(
|
||||||
.NUM_REQS (NUM_REQS),
|
.NUM_REQS (NUM_REQS),
|
||||||
.TYPE (PASSTHRU ? "R" : "P"),
|
.TYPE (PASSTHRU ? "R" : "P"),
|
||||||
.LOCK_ENABLE (1)
|
.LOCK_ENABLE (1)
|
||||||
) req_arb (
|
) core_req_nc_arb (
|
||||||
.clk (clk),
|
.clk (clk),
|
||||||
.reset (reset),
|
.reset (reset),
|
||||||
.unlock (core_req_in_fire),
|
|
||||||
.requests (core_req_valid_in_nc),
|
.requests (core_req_valid_in_nc),
|
||||||
.grant_index (core_req_nc_idx),
|
.grant_index (core_req_nc_idx),
|
||||||
.grant_onehot (core_req_nc_sel),
|
.grant_onehot (core_req_nc_sel),
|
||||||
.grant_valid (core_req_nc_valid)
|
.grant_valid (core_req_nc_valid),
|
||||||
|
.grant_unlock (core_req_nc_ready)
|
||||||
);
|
);
|
||||||
|
|
||||||
assign core_req_valid_out = core_req_valid_in & ~core_req_nc_idxs;
|
assign core_req_valid_out = core_req_valid_in & ~core_req_nc_idxs;
|
||||||
@@ -164,7 +164,7 @@ module VX_cache_bypass #(
|
|||||||
end
|
end
|
||||||
|
|
||||||
for (genvar i = 0; i < NUM_REQS; ++i) begin
|
for (genvar i = 0; i < NUM_REQS; ++i) begin
|
||||||
assign core_req_ready_in[i] = core_req_valid_in_nc[i] ? (~mem_req_valid_in && mem_req_ready_out && core_req_nc_sel[i])
|
assign core_req_ready_in[i] = core_req_valid_in_nc[i] ? (core_req_nc_ready && core_req_nc_sel[i])
|
||||||
: core_req_ready_out[i];
|
: core_req_ready_out[i];
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|||||||
5
hw/rtl/cache/VX_cache_cluster.sv
vendored
5
hw/rtl/cache/VX_cache_cluster.sv
vendored
@@ -83,8 +83,9 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
|
|||||||
`STATIC_ASSERT(NUM_INPUTS >= NUM_CACHES, ("invalid parameter"))
|
`STATIC_ASSERT(NUM_INPUTS >= NUM_CACHES, ("invalid parameter"))
|
||||||
|
|
||||||
`ifdef PERF_ENABLE
|
`ifdef PERF_ENABLE
|
||||||
cache_perf_t perf_cache_unit[NUM_CACHES];
|
cache_perf_t perf_cache_tmp[1], perf_cache_unit[NUM_CACHES];
|
||||||
`PERF_CACHE_REDUCE (cache_perf, perf_cache_unit, NUM_CACHES);
|
`PERF_CACHE_ADD (perf_cache_tmp, perf_cache_unit, 1, NUM_CACHES)
|
||||||
|
assign cache_perf = perf_cache_tmp[0];
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
VX_mem_bus_if #(
|
VX_mem_bus_if #(
|
||||||
|
|||||||
190
hw/rtl/cache/VX_cache_cluster_top.sv
vendored
190
hw/rtl/cache/VX_cache_cluster_top.sv
vendored
@@ -1,190 +0,0 @@
|
|||||||
// Copyright © 2019-2023
|
|
||||||
//
|
|
||||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
// you may not use this file except in compliance with the License.
|
|
||||||
// You may obtain a copy of the License at
|
|
||||||
// http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
//
|
|
||||||
// Unless required by applicable law or agreed to in writing, software
|
|
||||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
// See the License for the specific language governing permissions and
|
|
||||||
// limitations under the License.
|
|
||||||
|
|
||||||
`include "VX_cache_define.vh"
|
|
||||||
|
|
||||||
module VX_cache_cluster_top import VX_gpu_pkg::*; #(
|
|
||||||
parameter `STRING INSTANCE_ID = "",
|
|
||||||
|
|
||||||
parameter NUM_UNITS = 2,
|
|
||||||
parameter NUM_INPUTS = 4,
|
|
||||||
parameter TAG_SEL_IDX = 0,
|
|
||||||
|
|
||||||
// Number of Word requests per cycle
|
|
||||||
parameter NUM_REQS = 4,
|
|
||||||
|
|
||||||
// Size of cache in bytes
|
|
||||||
parameter CACHE_SIZE = 16384,
|
|
||||||
// Size of line inside a bank in bytes
|
|
||||||
parameter LINE_SIZE = 16,
|
|
||||||
// Number of banks
|
|
||||||
parameter NUM_BANKS = 4,
|
|
||||||
// Number of associative ways
|
|
||||||
parameter NUM_WAYS = 4,
|
|
||||||
// Size of a word in bytes
|
|
||||||
parameter WORD_SIZE = 4,
|
|
||||||
|
|
||||||
// Core Response Queue Size
|
|
||||||
parameter CRSQ_SIZE = 2,
|
|
||||||
// Miss Reserv Queue Knob
|
|
||||||
parameter MSHR_SIZE = 16,
|
|
||||||
// Memory Response Queue Size
|
|
||||||
parameter MRSQ_SIZE = 0,
|
|
||||||
// Memory Request Queue Size
|
|
||||||
parameter MREQ_SIZE = 4,
|
|
||||||
|
|
||||||
// Enable cache writeable
|
|
||||||
parameter WRITE_ENABLE = 1,
|
|
||||||
|
|
||||||
// Request debug identifier
|
|
||||||
parameter UUID_WIDTH = 0,
|
|
||||||
|
|
||||||
// core request tag size
|
|
||||||
parameter TAG_WIDTH = UUID_WIDTH + 16,
|
|
||||||
|
|
||||||
// enable bypass for non-cacheable addresses
|
|
||||||
parameter NC_ENABLE = 1,
|
|
||||||
|
|
||||||
// Core response output register
|
|
||||||
parameter CORE_OUT_REG = 2,
|
|
||||||
|
|
||||||
// Memory request output register
|
|
||||||
parameter MEM_OUT_REG = 2,
|
|
||||||
|
|
||||||
parameter NUM_CACHES = `UP(NUM_UNITS),
|
|
||||||
parameter PASSTHRU = (NUM_UNITS == 0),
|
|
||||||
parameter ARB_TAG_WIDTH = TAG_WIDTH + `ARB_SEL_BITS(NUM_INPUTS, NUM_CACHES),
|
|
||||||
parameter MEM_TAG_WIDTH = PASSTHRU ? (NC_ENABLE ? `CACHE_NC_BYPASS_TAG_WIDTH(NUM_REQS, LINE_SIZE, WORD_SIZE, ARB_TAG_WIDTH) :
|
|
||||||
`CACHE_BYPASS_TAG_WIDTH(NUM_REQS, LINE_SIZE, WORD_SIZE, ARB_TAG_WIDTH)) :
|
|
||||||
(NC_ENABLE ? `CACHE_NC_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS, NUM_REQS, LINE_SIZE, WORD_SIZE, ARB_TAG_WIDTH) :
|
|
||||||
`CACHE_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS)),
|
|
||||||
parameter MEM_TAG_X_WIDTH = MEM_TAG_WIDTH + `ARB_SEL_BITS(NUM_CACHES, 1)
|
|
||||||
) (
|
|
||||||
input wire clk,
|
|
||||||
input wire reset,
|
|
||||||
|
|
||||||
// PERF
|
|
||||||
`ifdef PERF_ENABLE
|
|
||||||
output cache_perf_t cache_perf,
|
|
||||||
`endif
|
|
||||||
|
|
||||||
// Core request
|
|
||||||
input wire [NUM_INPUTS-1:0][NUM_REQS-1:0] core_req_valid,
|
|
||||||
input wire [NUM_INPUTS-1:0][NUM_REQS-1:0] core_req_rw,
|
|
||||||
input wire [NUM_INPUTS-1:0][NUM_REQS-1:0][WORD_SIZE-1:0] core_req_byteen,
|
|
||||||
input wire [NUM_INPUTS-1:0][NUM_REQS-1:0][`CS_WORD_ADDR_WIDTH-1:0] core_req_addr,
|
|
||||||
input wire [NUM_INPUTS-1:0][NUM_REQS-1:0][`CS_WORD_WIDTH-1:0] core_req_data,
|
|
||||||
input wire [NUM_INPUTS-1:0][NUM_REQS-1:0][TAG_WIDTH-1:0] core_req_tag,
|
|
||||||
output wire [NUM_INPUTS-1:0][NUM_REQS-1:0] core_req_ready,
|
|
||||||
|
|
||||||
// Core response
|
|
||||||
output wire [NUM_INPUTS-1:0][NUM_REQS-1:0] core_rsp_valid,
|
|
||||||
output wire [NUM_INPUTS-1:0][NUM_REQS-1:0][`CS_WORD_WIDTH-1:0] core_rsp_data,
|
|
||||||
output wire [NUM_INPUTS-1:0][NUM_REQS-1:0][TAG_WIDTH-1:0] core_rsp_tag,
|
|
||||||
input wire [NUM_INPUTS-1:0][NUM_REQS-1:0] core_rsp_ready,
|
|
||||||
|
|
||||||
// Memory request
|
|
||||||
output wire mem_req_valid,
|
|
||||||
output wire mem_req_rw,
|
|
||||||
output wire [LINE_SIZE-1:0] mem_req_byteen,
|
|
||||||
output wire [`CS_MEM_ADDR_WIDTH-1:0] mem_req_addr,
|
|
||||||
output wire [`CS_LINE_WIDTH-1:0] mem_req_data,
|
|
||||||
output wire [MEM_TAG_X_WIDTH-1:0] mem_req_tag,
|
|
||||||
input wire mem_req_ready,
|
|
||||||
|
|
||||||
// Memory response
|
|
||||||
input wire mem_rsp_valid,
|
|
||||||
input wire [`CS_LINE_WIDTH-1:0] mem_rsp_data,
|
|
||||||
input wire [MEM_TAG_X_WIDTH-1:0] mem_rsp_tag,
|
|
||||||
output wire mem_rsp_ready
|
|
||||||
);
|
|
||||||
VX_mem_bus_if #(
|
|
||||||
.DATA_SIZE (WORD_SIZE),
|
|
||||||
.TAG_WIDTH (TAG_WIDTH)
|
|
||||||
) core_bus_if[NUM_INPUTS * NUM_REQS]();
|
|
||||||
|
|
||||||
VX_mem_bus_if #(
|
|
||||||
.DATA_SIZE (LINE_SIZE),
|
|
||||||
.TAG_WIDTH (MEM_TAG_X_WIDTH)
|
|
||||||
) mem_bus_if();
|
|
||||||
|
|
||||||
// Core request
|
|
||||||
for (genvar i = 0; i < NUM_INPUTS; ++i) begin
|
|
||||||
for (genvar r = 0; r < NUM_REQS; ++r) begin
|
|
||||||
assign core_bus_if[i * NUM_REQS + r].req_valid = core_req_valid[i][r];
|
|
||||||
assign core_bus_if[i * NUM_REQS + r].req_data.rw = core_req_rw[i][r];
|
|
||||||
assign core_bus_if[i * NUM_REQS + r].req_data.byteen = core_req_byteen[i][r];
|
|
||||||
assign core_bus_if[i * NUM_REQS + r].req_data.addr = core_req_addr[i][r];
|
|
||||||
assign core_bus_if[i * NUM_REQS + r].req_data.data = core_req_data[i][r];
|
|
||||||
assign core_bus_if[i * NUM_REQS + r].req_data.tag = core_req_tag[i][r];
|
|
||||||
assign core_req_ready[i][r] = core_bus_if[i * NUM_REQS + r].req_ready;
|
|
||||||
end
|
|
||||||
end
|
|
||||||
|
|
||||||
// Core response
|
|
||||||
for (genvar i = 0; i < NUM_INPUTS; ++i) begin
|
|
||||||
for (genvar r = 0; r < NUM_REQS; ++r) begin
|
|
||||||
assign core_rsp_valid[i][r] = core_bus_if[i * NUM_REQS + r].rsp_valid;
|
|
||||||
assign core_rsp_data[i][r] = core_bus_if[i * NUM_REQS + r].rsp_data.data;
|
|
||||||
assign core_rsp_tag[i][r] = core_bus_if[i * NUM_REQS + r].rsp_data.tag;
|
|
||||||
assign core_bus_if[i * NUM_REQS + r].rsp_ready = core_rsp_ready[i][r];
|
|
||||||
end
|
|
||||||
end
|
|
||||||
|
|
||||||
// Memory request
|
|
||||||
assign mem_req_valid = mem_bus_if.req_valid;
|
|
||||||
assign mem_req_rw = mem_bus_if.req_data.rw;
|
|
||||||
assign mem_req_byteen = mem_bus_if.req_data.byteen;
|
|
||||||
assign mem_req_addr = mem_bus_if.req_data.addr;
|
|
||||||
assign mem_req_data = mem_bus_if.req_data.data;
|
|
||||||
assign mem_req_tag = mem_bus_if.req_data.tag;
|
|
||||||
assign mem_bus_if.req_ready = mem_req_ready;
|
|
||||||
|
|
||||||
// Memory response
|
|
||||||
assign mem_bus_if.rsp_valid = mem_rsp_valid;
|
|
||||||
assign mem_bus_if.rsp_data.data = mem_rsp_data;
|
|
||||||
assign mem_bus_if.rsp_data.tag = mem_rsp_tag;
|
|
||||||
assign mem_rsp_ready = mem_bus_if.rsp_ready;
|
|
||||||
|
|
||||||
VX_cache_cluster #(
|
|
||||||
.INSTANCE_ID (INSTANCE_ID),
|
|
||||||
.NUM_UNITS (NUM_UNITS),
|
|
||||||
.NUM_INPUTS (NUM_INPUTS),
|
|
||||||
.TAG_SEL_IDX (TAG_SEL_IDX),
|
|
||||||
.NUM_REQS (NUM_REQS),
|
|
||||||
.CACHE_SIZE (CACHE_SIZE),
|
|
||||||
.LINE_SIZE (LINE_SIZE),
|
|
||||||
.NUM_BANKS (NUM_BANKS),
|
|
||||||
.NUM_WAYS (NUM_WAYS),
|
|
||||||
.WORD_SIZE (WORD_SIZE),
|
|
||||||
.CRSQ_SIZE (CRSQ_SIZE),
|
|
||||||
.MSHR_SIZE (MSHR_SIZE),
|
|
||||||
.MRSQ_SIZE (MRSQ_SIZE),
|
|
||||||
.MREQ_SIZE (MREQ_SIZE),
|
|
||||||
.WRITE_ENABLE (WRITE_ENABLE),
|
|
||||||
.UUID_WIDTH (UUID_WIDTH),
|
|
||||||
.TAG_WIDTH (TAG_WIDTH),
|
|
||||||
.NC_ENABLE (NC_ENABLE),
|
|
||||||
.CORE_OUT_REG (CORE_OUT_REG),
|
|
||||||
.MEM_OUT_REG (MEM_OUT_REG)
|
|
||||||
) cache (
|
|
||||||
`ifdef PERF_ENABLE
|
|
||||||
.cache_perf (cache_perf),
|
|
||||||
`endif
|
|
||||||
.clk (clk),
|
|
||||||
.reset (reset),
|
|
||||||
.core_bus_if (core_bus_if),
|
|
||||||
.mem_bus_if (mem_bus_if)
|
|
||||||
);
|
|
||||||
|
|
||||||
endmodule
|
|
||||||
2
hw/rtl/cache/VX_cache_data.sv
vendored
2
hw/rtl/cache/VX_cache_data.sv
vendored
@@ -93,7 +93,7 @@ module VX_cache_data #(
|
|||||||
assign wren = fill;
|
assign wren = fill;
|
||||||
end
|
end
|
||||||
|
|
||||||
wire [`CLOG2(NUM_WAYS)-1:0] way_idx;
|
wire [`LOG2UP(NUM_WAYS)-1:0] way_idx;
|
||||||
|
|
||||||
VX_onehot_encoder #(
|
VX_onehot_encoder #(
|
||||||
.N (NUM_WAYS)
|
.N (NUM_WAYS)
|
||||||
|
|||||||
12
hw/rtl/cache/VX_cache_define.vh
vendored
12
hw/rtl/cache/VX_cache_define.vh
vendored
@@ -62,4 +62,16 @@
|
|||||||
`define CS_LINE_TO_FULL_ADDR(x, i) {x, (`XLEN-$bits(x))'(i << (`XLEN-$bits(x)-`CS_BANK_SEL_BITS))}
|
`define CS_LINE_TO_FULL_ADDR(x, i) {x, (`XLEN-$bits(x))'(i << (`XLEN-$bits(x)-`CS_BANK_SEL_BITS))}
|
||||||
`define CS_MEM_TO_FULL_ADDR(x) {x, (`XLEN-$bits(x))'(0)}
|
`define CS_MEM_TO_FULL_ADDR(x) {x, (`XLEN-$bits(x))'(0)}
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
`define PERF_CACHE_ADD(dst, src, dcount, scount) \
|
||||||
|
`PERF_COUNTER_ADD (dst, src, reads, `PERF_CTR_BITS, dcount, scount, (((scount + dcount - 1) / dcount) > 1)) \
|
||||||
|
`PERF_COUNTER_ADD (dst, src, writes, `PERF_CTR_BITS, dcount, scount, (((scount + dcount - 1) / dcount) > 1)) \
|
||||||
|
`PERF_COUNTER_ADD (dst, src, read_misses, `PERF_CTR_BITS, dcount, scount, (((scount + dcount - 1) / dcount) > 1)) \
|
||||||
|
`PERF_COUNTER_ADD (dst, src, write_misses, `PERF_CTR_BITS, dcount, scount, (((scount + dcount - 1) / dcount) > 1)) \
|
||||||
|
`PERF_COUNTER_ADD (dst, src, bank_stalls, `PERF_CTR_BITS, dcount, scount, (((scount + dcount - 1) / dcount) > 1)) \
|
||||||
|
`PERF_COUNTER_ADD (dst, src, mshr_stalls, `PERF_CTR_BITS, dcount, scount, (((scount + dcount - 1) / dcount) > 1)) \
|
||||||
|
`PERF_COUNTER_ADD (dst, src, mem_stalls, `PERF_CTR_BITS, dcount, scount, (((scount + dcount - 1) / dcount) > 1)) \
|
||||||
|
`PERF_COUNTER_ADD (dst, src, crsp_stalls, `PERF_CTR_BITS, dcount, scount, (((scount + dcount - 1) / dcount) > 1))
|
||||||
|
|
||||||
`endif // VX_CACHE_DEFINE_VH
|
`endif // VX_CACHE_DEFINE_VH
|
||||||
|
|||||||
4
hw/rtl/cache/VX_cache_top.sv
vendored
4
hw/rtl/cache/VX_cache_top.sv
vendored
@@ -13,7 +13,7 @@
|
|||||||
|
|
||||||
`include "VX_cache_define.vh"
|
`include "VX_cache_define.vh"
|
||||||
|
|
||||||
module VX_cache_top #(
|
module VX_cache_top import VX_gpu_pkg::*; #(
|
||||||
parameter `STRING INSTANCE_ID = "",
|
parameter `STRING INSTANCE_ID = "",
|
||||||
|
|
||||||
// Number of Word requests per cycle
|
// Number of Word requests per cycle
|
||||||
@@ -22,7 +22,7 @@ module VX_cache_top #(
|
|||||||
// Size of cache in bytes
|
// Size of cache in bytes
|
||||||
parameter CACHE_SIZE = 16384,
|
parameter CACHE_SIZE = 16384,
|
||||||
// Size of line inside a bank in bytes
|
// Size of line inside a bank in bytes
|
||||||
parameter LINE_SIZE = 16,
|
parameter LINE_SIZE = 64,
|
||||||
// Number of banks
|
// Number of banks
|
||||||
parameter NUM_BANKS = 4,
|
parameter NUM_BANKS = 4,
|
||||||
// Number of associative ways
|
// Number of associative ways
|
||||||
|
|||||||
@@ -44,7 +44,7 @@ module VX_commit import VX_gpu_pkg::*; #(
|
|||||||
|
|
||||||
VX_commit_if commit_if[`ISSUE_WIDTH]();
|
VX_commit_if commit_if[`ISSUE_WIDTH]();
|
||||||
|
|
||||||
wire [`ISSUE_WIDTH-1:0] commit_fire;
|
wire [`ISSUE_WIDTH-1:0] commit_fire;
|
||||||
wire [`ISSUE_WIDTH-1:0][`NW_WIDTH-1:0] commit_wid;
|
wire [`ISSUE_WIDTH-1:0][`NW_WIDTH-1:0] commit_wid;
|
||||||
wire [`ISSUE_WIDTH-1:0][`NUM_THREADS-1:0] commit_tmask;
|
wire [`ISSUE_WIDTH-1:0][`NUM_THREADS-1:0] commit_tmask;
|
||||||
wire [`ISSUE_WIDTH-1:0] commit_eop;
|
wire [`ISSUE_WIDTH-1:0] commit_eop;
|
||||||
@@ -91,24 +91,24 @@ module VX_commit import VX_gpu_pkg::*; #(
|
|||||||
`UNUSED_PIN (sel_out)
|
`UNUSED_PIN (sel_out)
|
||||||
);
|
);
|
||||||
|
|
||||||
assign commit_fire[i] = commit_if[i].valid && commit_if[i].ready;
|
assign commit_fire[i] = commit_if[i].valid && commit_if[i].ready;
|
||||||
assign commit_tmask[i] = {`NUM_THREADS{commit_fire[i]}} & commit_if[i].data.tmask;
|
assign commit_tmask[i]= {`NUM_THREADS{commit_fire[i]}} & commit_if[i].data.tmask;
|
||||||
assign commit_wid[i] = commit_if[i].data.wid;
|
assign commit_wid[i] = commit_if[i].data.wid;
|
||||||
assign commit_eop[i] = commit_if[i].data.eop;
|
assign commit_eop[i] = commit_if[i].data.eop;
|
||||||
end
|
end
|
||||||
|
|
||||||
// CSRs update
|
// CSRs update
|
||||||
|
|
||||||
wire [`ISSUE_WIDTH-1:0][COMMIT_SIZEW-1:0] commit_size, commit_size_r;
|
wire [`ISSUE_WIDTH-1:0][COMMIT_SIZEW-1:0] commit_size, commit_size_r;
|
||||||
wire [COMMIT_ALL_SIZEW-1:0] commit_size_all, commit_size_all_r;
|
wire [COMMIT_ALL_SIZEW-1:0] commit_size_all_r, commit_size_all_rr;
|
||||||
wire commit_fire_any, commit_fire_any_r, commit_fire_any_rr;
|
wire commit_fire_any, commit_fire_any_r, commit_fire_any_rr;
|
||||||
|
|
||||||
assign commit_fire_any = (| commit_fire);
|
assign commit_fire_any = (| commit_fire);
|
||||||
|
|
||||||
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
|
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
|
||||||
wire [COMMIT_SIZEW-1:0] pop_count;
|
wire [COMMIT_SIZEW-1:0] count;
|
||||||
`POP_COUNT(pop_count, commit_tmask[i]);
|
`POP_COUNT(count, commit_tmask[i]);
|
||||||
assign commit_size[i] = pop_count;
|
assign commit_size[i] = count;
|
||||||
end
|
end
|
||||||
|
|
||||||
VX_pipe_register #(
|
VX_pipe_register #(
|
||||||
@@ -129,7 +129,7 @@ module VX_commit import VX_gpu_pkg::*; #(
|
|||||||
.OP ("+")
|
.OP ("+")
|
||||||
) commit_size_reduce (
|
) commit_size_reduce (
|
||||||
.data_in (commit_size_r),
|
.data_in (commit_size_r),
|
||||||
.data_out (commit_size_all)
|
.data_out (commit_size_all_r)
|
||||||
);
|
);
|
||||||
|
|
||||||
VX_pipe_register #(
|
VX_pipe_register #(
|
||||||
@@ -139,26 +139,26 @@ module VX_commit import VX_gpu_pkg::*; #(
|
|||||||
.clk (clk),
|
.clk (clk),
|
||||||
.reset (reset),
|
.reset (reset),
|
||||||
.enable (1'b1),
|
.enable (1'b1),
|
||||||
.data_in ({commit_fire_any_r, commit_size_all}),
|
.data_in ({commit_fire_any_r, commit_size_all_r}),
|
||||||
.data_out ({commit_fire_any_rr, commit_size_all_r})
|
.data_out ({commit_fire_any_rr, commit_size_all_rr})
|
||||||
);
|
);
|
||||||
|
|
||||||
reg [`PERF_CTR_BITS-1:0] instret;
|
reg [`PERF_CTR_BITS-1:0] instret;
|
||||||
|
|
||||||
always @(posedge clk) begin
|
always @(posedge clk) begin
|
||||||
if (reset) begin
|
if (reset) begin
|
||||||
instret <= '0;
|
instret <= '0;
|
||||||
end else begin
|
end else begin
|
||||||
if (commit_fire_any_rr) begin
|
if (commit_fire_any_rr) begin
|
||||||
instret <= instret + `PERF_CTR_BITS'(commit_size_all_r);
|
instret <= instret + `PERF_CTR_BITS'(commit_size_all_rr);
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
assign commit_csr_if.instret = instret;
|
assign commit_csr_if.instret = instret;
|
||||||
|
|
||||||
// Committed instructions
|
// Committed instructions
|
||||||
|
|
||||||
|
wire [`ISSUE_WIDTH-1:0] committed = commit_fire & commit_eop;
|
||||||
|
|
||||||
VX_pipe_register #(
|
VX_pipe_register #(
|
||||||
.DATAW (`ISSUE_WIDTH * (1 + `NW_WIDTH)),
|
.DATAW (`ISSUE_WIDTH * (1 + `NW_WIDTH)),
|
||||||
.RESETW (`ISSUE_WIDTH)
|
.RESETW (`ISSUE_WIDTH)
|
||||||
@@ -166,23 +166,23 @@ module VX_commit import VX_gpu_pkg::*; #(
|
|||||||
.clk (clk),
|
.clk (clk),
|
||||||
.reset (reset),
|
.reset (reset),
|
||||||
.enable (1'b1),
|
.enable (1'b1),
|
||||||
.data_in ({(commit_fire & commit_eop), commit_wid}),
|
.data_in ({committed, commit_wid}),
|
||||||
.data_out ({commit_sched_if.committed, commit_sched_if.committed_wid})
|
.data_out ({commit_sched_if.committed, commit_sched_if.committed_wid})
|
||||||
);
|
);
|
||||||
|
|
||||||
// Writeback
|
// Writeback
|
||||||
|
|
||||||
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
|
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
|
||||||
assign writeback_if[i].valid = commit_if[i].valid && commit_if[i].data.wb;
|
assign writeback_if[i].valid = commit_if[i].valid && commit_if[i].data.wb;
|
||||||
assign writeback_if[i].data.uuid = commit_if[i].data.uuid;
|
assign writeback_if[i].data.uuid = commit_if[i].data.uuid;
|
||||||
assign writeback_if[i].data.wis = wid_to_wis(commit_if[i].data.wid);
|
assign writeback_if[i].data.wis = wid_to_wis(commit_if[i].data.wid);
|
||||||
assign writeback_if[i].data.PC = commit_if[i].data.PC;
|
assign writeback_if[i].data.PC = commit_if[i].data.PC;
|
||||||
assign writeback_if[i].data.tmask = commit_if[i].data.tmask;
|
assign writeback_if[i].data.tmask= commit_if[i].data.tmask;
|
||||||
assign writeback_if[i].data.rd = commit_if[i].data.rd;
|
assign writeback_if[i].data.rd = commit_if[i].data.rd;
|
||||||
assign writeback_if[i].data.data = commit_if[i].data.data;
|
assign writeback_if[i].data.data = commit_if[i].data.data;
|
||||||
assign writeback_if[i].data.sop = commit_if[i].data.sop;
|
assign writeback_if[i].data.sop = commit_if[i].data.sop;
|
||||||
assign writeback_if[i].data.eop = commit_if[i].data.eop;
|
assign writeback_if[i].data.eop = commit_if[i].data.eop;
|
||||||
assign commit_if[i].ready = 1'b1;
|
assign commit_if[i].ready = 1'b1; // writeback has no backpressure
|
||||||
end
|
end
|
||||||
|
|
||||||
// simulation helper signal to get RISC-V tests Pass/Fail status
|
// simulation helper signal to get RISC-V tests Pass/Fail status
|
||||||
|
|||||||
@@ -1,336 +1,338 @@
|
|||||||
// Copyright © 2019-2023
|
// Copyright © 2019-2023
|
||||||
//
|
//
|
||||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
// you may not use this file except in compliance with the License.
|
// you may not use this file except in compliance with the License.
|
||||||
// You may obtain a copy of the License at
|
// You may obtain a copy of the License at
|
||||||
// http://www.apache.org/licenses/LICENSE-2.0
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
//
|
//
|
||||||
// Unless required by applicable law or agreed to in writing, software
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
// See the License for the specific language governing permissions and
|
// See the License for the specific language governing permissions and
|
||||||
// limitations under the License.
|
// limitations under the License.
|
||||||
|
|
||||||
`include "VX_define.vh"
|
`include "VX_define.vh"
|
||||||
|
|
||||||
`ifdef EXT_F_ENABLE
|
`ifdef EXT_F_ENABLE
|
||||||
`include "VX_fpu_define.vh"
|
`include "VX_fpu_define.vh"
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
module VX_core import VX_gpu_pkg::*; #(
|
module VX_core import VX_gpu_pkg::*; #(
|
||||||
parameter CORE_ID = 0
|
parameter CORE_ID = 0
|
||||||
) (
|
) (
|
||||||
`SCOPE_IO_DECL
|
`SCOPE_IO_DECL
|
||||||
|
|
||||||
// Clock
|
// Clock
|
||||||
input wire clk,
|
input wire clk,
|
||||||
input wire reset,
|
input wire reset,
|
||||||
|
|
||||||
`ifdef PERF_ENABLE
|
`ifdef PERF_ENABLE
|
||||||
VX_mem_perf_if.slave mem_perf_if,
|
VX_mem_perf_if.slave mem_perf_if,
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
VX_dcr_bus_if.slave dcr_bus_if,
|
VX_dcr_bus_if.slave dcr_bus_if,
|
||||||
|
|
||||||
VX_mem_bus_if.master dcache_bus_if [DCACHE_NUM_REQS],
|
VX_mem_bus_if.master dcache_bus_if [DCACHE_NUM_REQS],
|
||||||
|
|
||||||
VX_mem_bus_if.master icache_bus_if,
|
VX_mem_bus_if.master icache_bus_if,
|
||||||
|
|
||||||
`ifdef GBAR_ENABLE
|
`ifdef GBAR_ENABLE
|
||||||
VX_gbar_bus_if.master gbar_bus_if,
|
VX_gbar_bus_if.master gbar_bus_if,
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
// simulation helper signals
|
// simulation helper signals
|
||||||
output wire sim_ebreak,
|
output wire sim_ebreak,
|
||||||
output wire [`NUM_REGS-1:0][`XLEN-1:0] sim_wb_value,
|
output wire [`NUM_REGS-1:0][`XLEN-1:0] sim_wb_value,
|
||||||
|
|
||||||
// Status
|
// Status
|
||||||
output wire busy
|
output wire busy
|
||||||
);
|
);
|
||||||
VX_schedule_if schedule_if();
|
VX_schedule_if schedule_if();
|
||||||
VX_fetch_if fetch_if();
|
VX_fetch_if fetch_if();
|
||||||
VX_decode_if decode_if();
|
VX_decode_if decode_if();
|
||||||
VX_sched_csr_if sched_csr_if();
|
VX_sched_csr_if sched_csr_if();
|
||||||
VX_decode_sched_if decode_sched_if();
|
VX_decode_sched_if decode_sched_if();
|
||||||
VX_commit_sched_if commit_sched_if();
|
VX_commit_sched_if commit_sched_if();
|
||||||
VX_commit_csr_if commit_csr_if();
|
VX_commit_csr_if commit_csr_if();
|
||||||
VX_branch_ctl_if branch_ctl_if[`NUM_ALU_BLOCKS]();
|
VX_branch_ctl_if branch_ctl_if[`NUM_ALU_BLOCKS]();
|
||||||
VX_warp_ctl_if warp_ctl_if();
|
VX_warp_ctl_if warp_ctl_if();
|
||||||
|
|
||||||
VX_dispatch_if alu_dispatch_if[`ISSUE_WIDTH]();
|
VX_dispatch_if alu_dispatch_if[`ISSUE_WIDTH]();
|
||||||
VX_commit_if alu_commit_if[`ISSUE_WIDTH]();
|
VX_commit_if alu_commit_if[`ISSUE_WIDTH]();
|
||||||
|
|
||||||
VX_dispatch_if lsu_dispatch_if[`ISSUE_WIDTH]();
|
VX_dispatch_if lsu_dispatch_if[`ISSUE_WIDTH]();
|
||||||
VX_commit_if lsu_commit_if[`ISSUE_WIDTH]();
|
VX_commit_if lsu_commit_if[`ISSUE_WIDTH]();
|
||||||
`ifdef EXT_F_ENABLE
|
`ifdef EXT_F_ENABLE
|
||||||
VX_dispatch_if fpu_dispatch_if[`ISSUE_WIDTH]();
|
VX_dispatch_if fpu_dispatch_if[`ISSUE_WIDTH]();
|
||||||
VX_commit_if fpu_commit_if[`ISSUE_WIDTH]();
|
VX_commit_if fpu_commit_if[`ISSUE_WIDTH]();
|
||||||
`endif
|
`endif
|
||||||
VX_dispatch_if sfu_dispatch_if[`ISSUE_WIDTH]();
|
VX_dispatch_if sfu_dispatch_if[`ISSUE_WIDTH]();
|
||||||
VX_commit_if sfu_commit_if[`ISSUE_WIDTH]();
|
VX_commit_if sfu_commit_if[`ISSUE_WIDTH]();
|
||||||
|
|
||||||
VX_writeback_if writeback_if[`ISSUE_WIDTH]();
|
VX_writeback_if writeback_if[`ISSUE_WIDTH]();
|
||||||
|
|
||||||
VX_mem_bus_if #(
|
VX_mem_bus_if #(
|
||||||
.DATA_SIZE (DCACHE_WORD_SIZE),
|
.DATA_SIZE (DCACHE_WORD_SIZE),
|
||||||
.TAG_WIDTH (DCACHE_TAG_WIDTH)
|
.TAG_WIDTH (DCACHE_TAG_WIDTH)
|
||||||
) dcache_bus_tmp_if[DCACHE_NUM_REQS]();
|
) dcache_bus_tmp_if[DCACHE_NUM_REQS]();
|
||||||
|
|
||||||
`ifdef PERF_ENABLE
|
`ifdef PERF_ENABLE
|
||||||
VX_pipeline_perf_if pipeline_perf_if();
|
VX_mem_perf_if mem_perf_tmp_if();
|
||||||
VX_mem_perf_if mem_perf_tmp_if();
|
VX_pipeline_perf_if pipeline_perf_if();
|
||||||
|
|
||||||
assign mem_perf_tmp_if.icache = mem_perf_if.icache;
|
assign mem_perf_tmp_if.icache = mem_perf_if.icache;
|
||||||
assign mem_perf_tmp_if.dcache = mem_perf_if.dcache;
|
assign mem_perf_tmp_if.dcache = mem_perf_if.dcache;
|
||||||
assign mem_perf_tmp_if.l2cache = mem_perf_if.l2cache;
|
assign mem_perf_tmp_if.l2cache = mem_perf_if.l2cache;
|
||||||
assign mem_perf_tmp_if.l3cache = mem_perf_if.l3cache;
|
assign mem_perf_tmp_if.l3cache = mem_perf_if.l3cache;
|
||||||
`ifdef SM_ENABLE
|
assign mem_perf_tmp_if.mem = mem_perf_if.mem;
|
||||||
cache_perf_t smem_perf;
|
`endif
|
||||||
assign mem_perf_tmp_if.smem = smem_perf;
|
|
||||||
`else
|
`RESET_RELAY (dcr_data_reset, reset);
|
||||||
assign mem_perf_tmp_if.smem = '0;
|
`RESET_RELAY (schedule_reset, reset);
|
||||||
`endif
|
`RESET_RELAY (fetch_reset, reset);
|
||||||
assign mem_perf_tmp_if.mem = mem_perf_if.mem;
|
`RESET_RELAY (decode_reset, reset);
|
||||||
`endif
|
`RESET_RELAY (issue_reset, reset);
|
||||||
|
`RESET_RELAY (execute_reset, reset);
|
||||||
`RESET_RELAY (dcr_data_reset, reset);
|
`RESET_RELAY (commit_reset, reset);
|
||||||
`RESET_RELAY (schedule_reset, reset);
|
|
||||||
`RESET_RELAY (fetch_reset, reset);
|
base_dcrs_t base_dcrs;
|
||||||
`RESET_RELAY (decode_reset, reset);
|
|
||||||
`RESET_RELAY (issue_reset, reset);
|
VX_dcr_data dcr_data (
|
||||||
`RESET_RELAY (execute_reset, reset);
|
.clk (clk),
|
||||||
`RESET_RELAY (commit_reset, reset);
|
.reset (dcr_data_reset),
|
||||||
|
.dcr_bus_if (dcr_bus_if),
|
||||||
base_dcrs_t base_dcrs;
|
.base_dcrs (base_dcrs)
|
||||||
|
);
|
||||||
VX_dcr_data dcr_data (
|
|
||||||
.clk (clk),
|
`SCOPE_IO_SWITCH (3)
|
||||||
.reset (dcr_data_reset),
|
|
||||||
.dcr_bus_if (dcr_bus_if),
|
VX_schedule #(
|
||||||
.base_dcrs (base_dcrs)
|
.CORE_ID (CORE_ID)
|
||||||
);
|
) schedule (
|
||||||
|
.clk (clk),
|
||||||
`SCOPE_IO_SWITCH (3)
|
.reset (schedule_reset),
|
||||||
|
|
||||||
VX_schedule #(
|
`ifdef PERF_ENABLE
|
||||||
.CORE_ID (CORE_ID)
|
.perf_schedule_if (pipeline_perf_if.schedule),
|
||||||
) schedule (
|
`endif
|
||||||
.clk (clk),
|
|
||||||
.reset (schedule_reset),
|
.base_dcrs (base_dcrs),
|
||||||
|
|
||||||
.base_dcrs (base_dcrs),
|
.warp_ctl_if (warp_ctl_if),
|
||||||
|
.branch_ctl_if (branch_ctl_if),
|
||||||
.warp_ctl_if (warp_ctl_if),
|
.decode_sched_if(decode_sched_if),
|
||||||
.branch_ctl_if (branch_ctl_if),
|
.commit_sched_if(commit_sched_if),
|
||||||
.decode_sched_if(decode_sched_if),
|
|
||||||
.commit_sched_if(commit_sched_if),
|
.schedule_if (schedule_if),
|
||||||
|
`ifdef GBAR_ENABLE
|
||||||
.schedule_if (schedule_if),
|
.gbar_bus_if (gbar_bus_if),
|
||||||
`ifdef GBAR_ENABLE
|
`endif
|
||||||
.gbar_bus_if (gbar_bus_if),
|
.sched_csr_if (sched_csr_if),
|
||||||
`endif
|
|
||||||
.sched_csr_if (sched_csr_if),
|
.busy (busy)
|
||||||
|
);
|
||||||
.busy (busy)
|
|
||||||
);
|
VX_fetch #(
|
||||||
|
.CORE_ID (CORE_ID)
|
||||||
VX_fetch #(
|
) fetch (
|
||||||
.CORE_ID (CORE_ID)
|
`SCOPE_IO_BIND (0)
|
||||||
) fetch (
|
.clk (clk),
|
||||||
`SCOPE_IO_BIND (0)
|
.reset (fetch_reset),
|
||||||
.clk (clk),
|
.icache_bus_if (icache_bus_if),
|
||||||
.reset (fetch_reset),
|
.schedule_if (schedule_if),
|
||||||
.icache_bus_if (icache_bus_if),
|
.fetch_if (fetch_if)
|
||||||
.schedule_if (schedule_if),
|
);
|
||||||
.fetch_if (fetch_if)
|
|
||||||
);
|
VX_decode #(
|
||||||
|
.CORE_ID (CORE_ID)
|
||||||
VX_decode #(
|
) decode (
|
||||||
.CORE_ID (CORE_ID)
|
.clk (clk),
|
||||||
) decode (
|
.reset (decode_reset),
|
||||||
.clk (clk),
|
.fetch_if (fetch_if),
|
||||||
.reset (decode_reset),
|
.decode_if (decode_if),
|
||||||
.fetch_if (fetch_if),
|
.decode_sched_if(decode_sched_if)
|
||||||
.decode_if (decode_if),
|
);
|
||||||
.decode_sched_if(decode_sched_if)
|
|
||||||
);
|
VX_issue #(
|
||||||
|
.CORE_ID (CORE_ID)
|
||||||
VX_issue #(
|
) issue (
|
||||||
.CORE_ID (CORE_ID)
|
`SCOPE_IO_BIND (1)
|
||||||
) issue (
|
|
||||||
`SCOPE_IO_BIND (1)
|
.clk (clk),
|
||||||
|
.reset (issue_reset),
|
||||||
.clk (clk),
|
|
||||||
.reset (issue_reset),
|
`ifdef PERF_ENABLE
|
||||||
|
.perf_issue_if (pipeline_perf_if.issue),
|
||||||
`ifdef PERF_ENABLE
|
`endif
|
||||||
.perf_issue_if (pipeline_perf_if.issue),
|
|
||||||
`endif
|
.decode_if (decode_if),
|
||||||
|
.writeback_if (writeback_if),
|
||||||
.decode_if (decode_if),
|
|
||||||
.writeback_if (writeback_if),
|
.alu_dispatch_if(alu_dispatch_if),
|
||||||
|
.lsu_dispatch_if(lsu_dispatch_if),
|
||||||
.alu_dispatch_if(alu_dispatch_if),
|
`ifdef EXT_F_ENABLE
|
||||||
.lsu_dispatch_if(lsu_dispatch_if),
|
.fpu_dispatch_if(fpu_dispatch_if),
|
||||||
`ifdef EXT_F_ENABLE
|
`endif
|
||||||
.fpu_dispatch_if(fpu_dispatch_if),
|
.sfu_dispatch_if(sfu_dispatch_if)
|
||||||
`endif
|
);
|
||||||
.sfu_dispatch_if(sfu_dispatch_if)
|
|
||||||
);
|
VX_execute #(
|
||||||
|
.CORE_ID (CORE_ID)
|
||||||
VX_execute #(
|
) execute (
|
||||||
.CORE_ID (CORE_ID)
|
`SCOPE_IO_BIND (2)
|
||||||
) execute (
|
|
||||||
`SCOPE_IO_BIND (2)
|
.clk (clk),
|
||||||
|
.reset (execute_reset),
|
||||||
.clk (clk),
|
|
||||||
.reset (execute_reset),
|
.base_dcrs (base_dcrs),
|
||||||
|
|
||||||
.base_dcrs (base_dcrs),
|
`ifdef PERF_ENABLE
|
||||||
|
.mem_perf_if (mem_perf_tmp_if),
|
||||||
`ifdef PERF_ENABLE
|
.pipeline_perf_if(pipeline_perf_if),
|
||||||
.mem_perf_if (mem_perf_tmp_if),
|
`endif
|
||||||
.pipeline_perf_if(pipeline_perf_if),
|
|
||||||
`endif
|
.dcache_bus_if (dcache_bus_tmp_if),
|
||||||
|
|
||||||
.dcache_bus_if (dcache_bus_tmp_if),
|
`ifdef EXT_F_ENABLE
|
||||||
|
.fpu_dispatch_if(fpu_dispatch_if),
|
||||||
`ifdef EXT_F_ENABLE
|
.fpu_commit_if (fpu_commit_if),
|
||||||
.fpu_dispatch_if(fpu_dispatch_if),
|
`endif
|
||||||
.fpu_commit_if (fpu_commit_if),
|
|
||||||
`endif
|
.commit_csr_if (commit_csr_if),
|
||||||
|
.sched_csr_if (sched_csr_if),
|
||||||
.commit_csr_if (commit_csr_if),
|
|
||||||
.sched_csr_if (sched_csr_if),
|
.alu_dispatch_if(alu_dispatch_if),
|
||||||
|
.lsu_dispatch_if(lsu_dispatch_if),
|
||||||
.alu_dispatch_if(alu_dispatch_if),
|
.sfu_dispatch_if(sfu_dispatch_if),
|
||||||
.lsu_dispatch_if(lsu_dispatch_if),
|
|
||||||
.sfu_dispatch_if(sfu_dispatch_if),
|
.warp_ctl_if (warp_ctl_if),
|
||||||
|
.branch_ctl_if (branch_ctl_if),
|
||||||
.warp_ctl_if (warp_ctl_if),
|
|
||||||
.branch_ctl_if (branch_ctl_if),
|
.alu_commit_if (alu_commit_if),
|
||||||
|
.lsu_commit_if (lsu_commit_if),
|
||||||
.alu_commit_if (alu_commit_if),
|
.sfu_commit_if (sfu_commit_if),
|
||||||
.lsu_commit_if (lsu_commit_if),
|
|
||||||
.sfu_commit_if (sfu_commit_if),
|
.sim_ebreak (sim_ebreak)
|
||||||
|
);
|
||||||
.sim_ebreak (sim_ebreak)
|
|
||||||
);
|
VX_commit #(
|
||||||
|
.CORE_ID (CORE_ID)
|
||||||
VX_commit #(
|
) commit (
|
||||||
.CORE_ID (CORE_ID)
|
.clk (clk),
|
||||||
) commit (
|
.reset (commit_reset),
|
||||||
.clk (clk),
|
|
||||||
.reset (commit_reset),
|
.alu_commit_if (alu_commit_if),
|
||||||
|
.lsu_commit_if (lsu_commit_if),
|
||||||
.alu_commit_if (alu_commit_if),
|
`ifdef EXT_F_ENABLE
|
||||||
.lsu_commit_if (lsu_commit_if),
|
.fpu_commit_if (fpu_commit_if),
|
||||||
`ifdef EXT_F_ENABLE
|
`endif
|
||||||
.fpu_commit_if (fpu_commit_if),
|
.sfu_commit_if (sfu_commit_if),
|
||||||
`endif
|
|
||||||
.sfu_commit_if (sfu_commit_if),
|
.writeback_if (writeback_if),
|
||||||
|
|
||||||
.writeback_if (writeback_if),
|
.commit_csr_if (commit_csr_if),
|
||||||
|
.commit_sched_if(commit_sched_if),
|
||||||
.commit_csr_if (commit_csr_if),
|
|
||||||
.commit_sched_if(commit_sched_if),
|
.sim_wb_value (sim_wb_value)
|
||||||
|
);
|
||||||
.sim_wb_value (sim_wb_value)
|
|
||||||
);
|
`ifdef SM_ENABLE
|
||||||
|
|
||||||
`ifdef SM_ENABLE
|
VX_smem_unit #(
|
||||||
|
.CORE_ID (CORE_ID)
|
||||||
VX_smem_unit #(
|
) smem_unit (
|
||||||
.CORE_ID (CORE_ID)
|
.clk (clk),
|
||||||
) smem_unit (
|
.reset (reset),
|
||||||
.clk (clk),
|
`ifdef PERF_ENABLE
|
||||||
.reset (reset),
|
.cache_perf (mem_perf_tmp_if.smem),
|
||||||
`ifdef PERF_ENABLE
|
`endif
|
||||||
.cache_perf (smem_perf),
|
.dcache_bus_in_if (dcache_bus_tmp_if),
|
||||||
`endif
|
.dcache_bus_out_if (dcache_bus_if)
|
||||||
.dcache_bus_in_if (dcache_bus_tmp_if),
|
);
|
||||||
.dcache_bus_out_if (dcache_bus_if)
|
|
||||||
);
|
`else
|
||||||
|
|
||||||
`else
|
for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin
|
||||||
|
`ASSIGN_VX_MEM_BUS_IF (dcache_bus_if[i], dcache_bus_tmp_if[i]);
|
||||||
for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin
|
end
|
||||||
`ASSIGN_VX_MEM_BUS_IF (dcache_bus_if[i], dcache_bus_tmp_if[i]);
|
|
||||||
end
|
`endif
|
||||||
|
|
||||||
`endif
|
`ifdef PERF_ENABLE
|
||||||
|
|
||||||
`ifdef PERF_ENABLE
|
wire [`CLOG2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_rd_req_per_cycle;
|
||||||
|
wire [`CLOG2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_wr_req_per_cycle;
|
||||||
wire [`CLOG2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_rd_req_per_cycle;
|
wire [`CLOG2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_rsp_per_cycle;
|
||||||
wire [`CLOG2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_wr_req_per_cycle;
|
|
||||||
|
wire [1:0] perf_icache_pending_read_cycle;
|
||||||
wire [`CLOG2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_rsp_per_cycle;
|
wire [`CLOG2(DCACHE_NUM_REQS+1)+1-1:0] perf_dcache_pending_read_cycle;
|
||||||
|
|
||||||
wire perf_icache_pending_read_cycle;
|
reg [`PERF_CTR_BITS-1:0] perf_icache_pending_reads;
|
||||||
wire [`CLOG2(DCACHE_NUM_REQS+1)+1-1:0] perf_dcache_pending_read_cycle;
|
reg [`PERF_CTR_BITS-1:0] perf_dcache_pending_reads;
|
||||||
|
|
||||||
reg [`PERF_CTR_BITS-1:0] perf_icache_pending_reads;
|
reg [`PERF_CTR_BITS-1:0] perf_ifetches;
|
||||||
reg [`PERF_CTR_BITS-1:0] perf_dcache_pending_reads;
|
reg [`PERF_CTR_BITS-1:0] perf_loads;
|
||||||
|
reg [`PERF_CTR_BITS-1:0] perf_stores;
|
||||||
reg [`PERF_CTR_BITS-1:0] perf_ifetches;
|
|
||||||
reg [`PERF_CTR_BITS-1:0] perf_loads;
|
wire perf_icache_req_fire = icache_bus_if.req_valid && icache_bus_if.req_ready;
|
||||||
reg [`PERF_CTR_BITS-1:0] perf_stores;
|
wire perf_icache_rsp_fire = icache_bus_if.rsp_valid && icache_bus_if.rsp_ready;
|
||||||
|
|
||||||
wire perf_icache_req_fire = icache_bus_if.req_valid & icache_bus_if.req_ready;
|
wire [DCACHE_NUM_REQS-1:0] perf_dcache_rd_req_fire, perf_dcache_rd_req_fire_r;
|
||||||
wire perf_icache_rsp_fire = icache_bus_if.rsp_valid & icache_bus_if.rsp_ready;
|
wire [DCACHE_NUM_REQS-1:0] perf_dcache_wr_req_fire, perf_dcache_wr_req_fire_r;
|
||||||
|
wire [DCACHE_NUM_REQS-1:0] perf_dcache_rsp_fire;
|
||||||
wire [DCACHE_NUM_REQS-1:0] perf_dcache_rd_req_fire, perf_dcache_wr_req_fire, perf_dcache_rsp_fire;
|
|
||||||
|
for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin
|
||||||
for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin
|
assign perf_dcache_rd_req_fire[i] = dcache_bus_if[i].req_valid && dcache_bus_if[i].req_ready && ~dcache_bus_if[i].req_data.rw;
|
||||||
assign perf_dcache_rd_req_fire[i] = dcache_bus_if[i].req_valid && ~dcache_bus_if[i].req_data.rw && dcache_bus_if[i].req_ready;
|
assign perf_dcache_wr_req_fire[i] = dcache_bus_if[i].req_valid && dcache_bus_if[i].req_ready && dcache_bus_if[i].req_data.rw;
|
||||||
assign perf_dcache_wr_req_fire[i] = dcache_bus_if[i].req_valid && dcache_bus_if[i].req_data.rw && dcache_bus_if[i].req_ready;
|
assign perf_dcache_rsp_fire[i] = dcache_bus_if[i].rsp_valid && dcache_bus_if[i].rsp_ready;
|
||||||
assign perf_dcache_rsp_fire[i] = dcache_bus_if[i].rsp_valid && dcache_bus_if[i].rsp_ready;
|
end
|
||||||
end
|
|
||||||
|
`BUFFER(perf_dcache_rd_req_fire_r, perf_dcache_rd_req_fire);
|
||||||
`POP_COUNT(perf_dcache_rd_req_per_cycle, perf_dcache_rd_req_fire);
|
`BUFFER(perf_dcache_wr_req_fire_r, perf_dcache_wr_req_fire);
|
||||||
`POP_COUNT(perf_dcache_wr_req_per_cycle, perf_dcache_wr_req_fire);
|
|
||||||
`POP_COUNT(perf_dcache_rsp_per_cycle, perf_dcache_rsp_fire);
|
`POP_COUNT(perf_dcache_rd_req_per_cycle, perf_dcache_rd_req_fire_r);
|
||||||
|
`POP_COUNT(perf_dcache_wr_req_per_cycle, perf_dcache_wr_req_fire_r);
|
||||||
assign perf_icache_pending_read_cycle = perf_icache_req_fire - perf_icache_rsp_fire;
|
`POP_COUNT(perf_dcache_rsp_per_cycle, perf_dcache_rsp_fire);
|
||||||
assign perf_dcache_pending_read_cycle = perf_dcache_rd_req_per_cycle - perf_dcache_rsp_per_cycle;
|
|
||||||
|
assign perf_icache_pending_read_cycle = perf_icache_req_fire - perf_icache_rsp_fire;
|
||||||
always @(posedge clk) begin
|
assign perf_dcache_pending_read_cycle = perf_dcache_rd_req_per_cycle - perf_dcache_rsp_per_cycle;
|
||||||
if (reset) begin
|
|
||||||
perf_icache_pending_reads <= '0;
|
always @(posedge clk) begin
|
||||||
perf_dcache_pending_reads <= '0;
|
if (reset) begin
|
||||||
end else begin
|
perf_icache_pending_reads <= '0;
|
||||||
perf_icache_pending_reads <= $signed(perf_icache_pending_reads) + `PERF_CTR_BITS'($signed(perf_icache_pending_read_cycle));
|
perf_dcache_pending_reads <= '0;
|
||||||
perf_dcache_pending_reads <= $signed(perf_dcache_pending_reads) + `PERF_CTR_BITS'($signed(perf_dcache_pending_read_cycle));
|
end else begin
|
||||||
end
|
perf_icache_pending_reads <= $signed(perf_icache_pending_reads) + `PERF_CTR_BITS'($signed(perf_icache_pending_read_cycle));
|
||||||
end
|
perf_dcache_pending_reads <= $signed(perf_dcache_pending_reads) + `PERF_CTR_BITS'($signed(perf_dcache_pending_read_cycle));
|
||||||
|
end
|
||||||
reg [`PERF_CTR_BITS-1:0] perf_icache_lat;
|
end
|
||||||
reg [`PERF_CTR_BITS-1:0] perf_dcache_lat;
|
|
||||||
|
reg [`PERF_CTR_BITS-1:0] perf_icache_lat;
|
||||||
always @(posedge clk) begin
|
reg [`PERF_CTR_BITS-1:0] perf_dcache_lat;
|
||||||
if (reset) begin
|
|
||||||
perf_ifetches <= '0;
|
always @(posedge clk) begin
|
||||||
perf_loads <= '0;
|
if (reset) begin
|
||||||
perf_stores <= '0;
|
perf_ifetches <= '0;
|
||||||
perf_icache_lat <= '0;
|
perf_loads <= '0;
|
||||||
perf_dcache_lat <= '0;
|
perf_stores <= '0;
|
||||||
end else begin
|
perf_icache_lat <= '0;
|
||||||
perf_ifetches <= perf_ifetches + `PERF_CTR_BITS'(perf_icache_req_fire);
|
perf_dcache_lat <= '0;
|
||||||
perf_loads <= perf_loads + `PERF_CTR_BITS'(perf_dcache_rd_req_per_cycle);
|
end else begin
|
||||||
perf_stores <= perf_stores + `PERF_CTR_BITS'(perf_dcache_wr_req_per_cycle);
|
perf_ifetches <= perf_ifetches + `PERF_CTR_BITS'(perf_icache_req_fire);
|
||||||
perf_icache_lat <= perf_icache_lat + perf_icache_pending_reads;
|
perf_loads <= perf_loads + `PERF_CTR_BITS'(perf_dcache_rd_req_per_cycle);
|
||||||
perf_dcache_lat <= perf_dcache_lat + perf_dcache_pending_reads;
|
perf_stores <= perf_stores + `PERF_CTR_BITS'(perf_dcache_wr_req_per_cycle);
|
||||||
end
|
perf_icache_lat <= perf_icache_lat + perf_icache_pending_reads;
|
||||||
end
|
perf_dcache_lat <= perf_dcache_lat + perf_dcache_pending_reads;
|
||||||
|
end
|
||||||
assign pipeline_perf_if.ifetches = perf_ifetches;
|
end
|
||||||
assign pipeline_perf_if.loads = perf_loads;
|
|
||||||
assign pipeline_perf_if.stores = perf_stores;
|
assign pipeline_perf_if.ifetches = perf_ifetches;
|
||||||
assign pipeline_perf_if.load_latency = perf_dcache_lat;
|
assign pipeline_perf_if.loads = perf_loads;
|
||||||
assign pipeline_perf_if.ifetch_latency = perf_icache_lat;
|
assign pipeline_perf_if.stores = perf_stores;
|
||||||
assign pipeline_perf_if.load_latency = perf_dcache_lat;
|
assign pipeline_perf_if.load_latency = perf_dcache_lat;
|
||||||
|
assign pipeline_perf_if.ifetch_latency = perf_icache_lat;
|
||||||
`endif
|
assign pipeline_perf_if.load_latency = perf_dcache_lat;
|
||||||
|
|
||||||
endmodule
|
`endif
|
||||||
|
|
||||||
|
endmodule
|
||||||
|
|||||||
@@ -129,7 +129,13 @@ module VX_core_top import VX_gpu_pkg::*; #(
|
|||||||
assign icache_rsp_ready = icache_bus_if.rsp_ready;
|
assign icache_rsp_ready = icache_bus_if.rsp_ready;
|
||||||
|
|
||||||
`ifdef PERF_ENABLE
|
`ifdef PERF_ENABLE
|
||||||
VX_mem_perf_if mem_perf_if();
|
VX_mem_perf_if mem_perf_if();
|
||||||
|
assign mem_perf_if.icache = '0;
|
||||||
|
assign mem_perf_if.dcache = '0;
|
||||||
|
assign mem_perf_if.l2cache = '0;
|
||||||
|
assign mem_perf_if.l3cache = '0;
|
||||||
|
assign mem_perf_if.smem = '0;
|
||||||
|
assign mem_perf_if.mem = '0;
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
`ifdef SCOPE
|
`ifdef SCOPE
|
||||||
|
|||||||
@@ -33,7 +33,6 @@ import VX_fpu_pkg::*;
|
|||||||
`ifdef PERF_ENABLE
|
`ifdef PERF_ENABLE
|
||||||
VX_mem_perf_if.slave mem_perf_if,
|
VX_mem_perf_if.slave mem_perf_if,
|
||||||
VX_pipeline_perf_if.slave pipeline_perf_if,
|
VX_pipeline_perf_if.slave pipeline_perf_if,
|
||||||
VX_sfu_perf_if.slave sfu_perf_if,
|
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
VX_commit_csr_if.slave commit_csr_if,
|
VX_commit_csr_if.slave commit_csr_if,
|
||||||
@@ -179,105 +178,115 @@ import VX_fpu_pkg::*;
|
|||||||
|
|
||||||
default: begin
|
default: begin
|
||||||
read_addr_valid_r = 0;
|
read_addr_valid_r = 0;
|
||||||
if ((read_addr >= `VX_CSR_MPM_BASE && read_addr < (`VX_CSR_MPM_BASE + 32))
|
if ((read_addr >= `VX_CSR_MPM_USER && read_addr < (`VX_CSR_MPM_USER + 32))
|
||||||
|| (read_addr >= `VX_CSR_MPM_BASE_H && read_addr < (`VX_CSR_MPM_BASE_H + 32))) begin
|
|| (read_addr >= `VX_CSR_MPM_USER_H && read_addr < (`VX_CSR_MPM_USER_H + 32))) begin
|
||||||
read_addr_valid_r = 1;
|
read_addr_valid_r = 1;
|
||||||
`ifdef PERF_ENABLE
|
`ifdef PERF_ENABLE
|
||||||
case (base_dcrs.mpm_class)
|
case (base_dcrs.mpm_class)
|
||||||
`VX_DCR_MPM_CLASS_CORE: begin
|
`VX_DCR_MPM_CLASS_CORE: begin
|
||||||
case (read_addr)
|
case (read_addr)
|
||||||
// PERF: pipeline
|
// PERF: pipeline
|
||||||
`VX_CSR_MPM_IBUF_ST : read_data_ro_r = pipeline_perf_if.ibf_stalls[31:0];
|
`VX_CSR_MPM_SCHED_ID : read_data_ro_r = pipeline_perf_if.sched_idles[31:0];
|
||||||
`VX_CSR_MPM_IBUF_ST_H : read_data_ro_r = 32'(pipeline_perf_if.ibf_stalls[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_SCHED_ID_H : read_data_ro_r = 32'(pipeline_perf_if.sched_idles[`PERF_CTR_BITS-1:32]);
|
||||||
`VX_CSR_MPM_SCRB_ST : read_data_ro_r = pipeline_perf_if.scb_stalls[31:0];
|
`VX_CSR_MPM_SCHED_ST : read_data_ro_r = pipeline_perf_if.sched_stalls[31:0];
|
||||||
`VX_CSR_MPM_SCRB_ST_H : read_data_ro_r = 32'(pipeline_perf_if.scb_stalls[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_SCHED_ST_H : read_data_ro_r = 32'(pipeline_perf_if.sched_stalls[`PERF_CTR_BITS-1:32]);
|
||||||
`VX_CSR_MPM_ALU_ST : read_data_ro_r = pipeline_perf_if.dsp_stalls[`EX_ALU][31:0];
|
`VX_CSR_MPM_IBUF_ST : read_data_ro_r = pipeline_perf_if.ibf_stalls[31:0];
|
||||||
`VX_CSR_MPM_ALU_ST_H : read_data_ro_r = 32'(pipeline_perf_if.dsp_stalls[`EX_ALU][`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_IBUF_ST_H : read_data_ro_r = 32'(pipeline_perf_if.ibf_stalls[`PERF_CTR_BITS-1:32]);
|
||||||
`VX_CSR_MPM_LSU_ST : read_data_ro_r = pipeline_perf_if.dsp_stalls[`EX_LSU][31:0];
|
`VX_CSR_MPM_SCRB_ST : read_data_ro_r = pipeline_perf_if.scb_stalls[31:0];
|
||||||
`VX_CSR_MPM_LSU_ST_H : read_data_ro_r = 32'(pipeline_perf_if.dsp_stalls[`EX_LSU][`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_SCRB_ST_H : read_data_ro_r = 32'(pipeline_perf_if.scb_stalls[`PERF_CTR_BITS-1:32]);
|
||||||
|
`VX_CSR_MPM_SCRB_ALU : read_data_ro_r = pipeline_perf_if.units_uses[`EX_ALU][31:0];
|
||||||
|
`VX_CSR_MPM_SCRB_ALU_H : read_data_ro_r = 32'(pipeline_perf_if.units_uses[`EX_ALU][`PERF_CTR_BITS-1:32]);
|
||||||
`ifdef EXT_F_ENABLE
|
`ifdef EXT_F_ENABLE
|
||||||
`VX_CSR_MPM_FPU_ST : read_data_ro_r = pipeline_perf_if.dsp_stalls[`EX_FPU][31:0];
|
`VX_CSR_MPM_SCRB_FPU : read_data_ro_r = pipeline_perf_if.units_uses[`EX_FPU][31:0];
|
||||||
`VX_CSR_MPM_FPU_ST_H : read_data_ro_r = 32'(pipeline_perf_if.dsp_stalls[`EX_FPU][`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_SCRB_FPU_H : read_data_ro_r = 32'(pipeline_perf_if.units_uses[`EX_FPU][`PERF_CTR_BITS-1:32]);
|
||||||
`else
|
`else
|
||||||
`VX_CSR_MPM_FPU_ST : read_data_ro_r = '0;
|
`VX_CSR_MPM_SCRB_FPU : read_data_ro_r = '0;
|
||||||
`VX_CSR_MPM_FPU_ST_H : read_data_ro_r = '0;
|
`VX_CSR_MPM_SCRB_FPU_H : read_data_ro_r = '0;
|
||||||
`endif
|
`endif
|
||||||
`VX_CSR_MPM_SFU_ST : read_data_ro_r = pipeline_perf_if.dsp_stalls[`EX_SFU][31:0];
|
`VX_CSR_MPM_SCRB_LSU : read_data_ro_r = pipeline_perf_if.units_uses[`EX_LSU][31:0];
|
||||||
`VX_CSR_MPM_SFU_ST_H : read_data_ro_r = 32'(pipeline_perf_if.dsp_stalls[`EX_SFU][`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_SCRB_LSU_H : read_data_ro_r = 32'(pipeline_perf_if.units_uses[`EX_LSU][`PERF_CTR_BITS-1:32]);
|
||||||
|
`VX_CSR_MPM_SCRB_SFU : read_data_ro_r = pipeline_perf_if.units_uses[`EX_SFU][31:0];
|
||||||
|
`VX_CSR_MPM_SCRB_SFU_H : read_data_ro_r = 32'(pipeline_perf_if.units_uses[`EX_SFU][`PERF_CTR_BITS-1:32]);
|
||||||
|
`VX_CSR_MPM_SCRB_CSRS : read_data_ro_r = pipeline_perf_if.sfu_uses[`SFU_CSRS][31:0];
|
||||||
|
`VX_CSR_MPM_SCRB_CSRS_H : read_data_ro_r = 32'(pipeline_perf_if.sfu_uses[`SFU_CSRS][`PERF_CTR_BITS-1:32]);
|
||||||
|
`VX_CSR_MPM_SCRB_WCTL : read_data_ro_r = pipeline_perf_if.sfu_uses[`SFU_WCTL][31:0];
|
||||||
|
`VX_CSR_MPM_SCRB_WCTL_H : read_data_ro_r = 32'(pipeline_perf_if.sfu_uses[`SFU_WCTL][`PERF_CTR_BITS-1:32]);
|
||||||
// PERF: memory
|
// PERF: memory
|
||||||
`VX_CSR_MPM_IFETCHES : read_data_ro_r = pipeline_perf_if.ifetches[31:0];
|
`VX_CSR_MPM_IFETCHES : read_data_ro_r = pipeline_perf_if.ifetches[31:0];
|
||||||
`VX_CSR_MPM_IFETCHES_H : read_data_ro_r = 32'(pipeline_perf_if.ifetches[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_IFETCHES_H : read_data_ro_r = 32'(pipeline_perf_if.ifetches[`PERF_CTR_BITS-1:32]);
|
||||||
`VX_CSR_MPM_LOADS : read_data_ro_r = pipeline_perf_if.loads[31:0];
|
`VX_CSR_MPM_LOADS : read_data_ro_r = pipeline_perf_if.loads[31:0];
|
||||||
`VX_CSR_MPM_LOADS_H : read_data_ro_r = 32'(pipeline_perf_if.loads[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_LOADS_H : read_data_ro_r = 32'(pipeline_perf_if.loads[`PERF_CTR_BITS-1:32]);
|
||||||
`VX_CSR_MPM_STORES : read_data_ro_r = pipeline_perf_if.stores[31:0];
|
`VX_CSR_MPM_STORES : read_data_ro_r = pipeline_perf_if.stores[31:0];
|
||||||
`VX_CSR_MPM_STORES_H : read_data_ro_r = 32'(pipeline_perf_if.stores[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_STORES_H : read_data_ro_r = 32'(pipeline_perf_if.stores[`PERF_CTR_BITS-1:32]);
|
||||||
`VX_CSR_MPM_IFETCH_LAT : read_data_ro_r = pipeline_perf_if.ifetch_latency[31:0];
|
`VX_CSR_MPM_IFETCH_LT : read_data_ro_r = pipeline_perf_if.ifetch_latency[31:0];
|
||||||
`VX_CSR_MPM_IFETCH_LAT_H : read_data_ro_r = 32'(pipeline_perf_if.ifetch_latency[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_IFETCH_LT_H : read_data_ro_r = 32'(pipeline_perf_if.ifetch_latency[`PERF_CTR_BITS-1:32]);
|
||||||
`VX_CSR_MPM_LOAD_LAT : read_data_ro_r = pipeline_perf_if.load_latency[31:0];
|
`VX_CSR_MPM_LOAD_LT : read_data_ro_r = pipeline_perf_if.load_latency[31:0];
|
||||||
`VX_CSR_MPM_LOAD_LAT_H : read_data_ro_r = 32'(pipeline_perf_if.load_latency[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_LOAD_LT_H : read_data_ro_r = 32'(pipeline_perf_if.load_latency[`PERF_CTR_BITS-1:32]);
|
||||||
default:;
|
default:;
|
||||||
endcase
|
endcase
|
||||||
end
|
end
|
||||||
`VX_DCR_MPM_CLASS_MEM: begin
|
`VX_DCR_MPM_CLASS_MEM: begin
|
||||||
case (read_addr)
|
case (read_addr)
|
||||||
// PERF: icache
|
// PERF: icache
|
||||||
`VX_CSR_MPM_ICACHE_READS : read_data_ro_r = mem_perf_if.icache.reads[31:0];
|
`VX_CSR_MPM_ICACHE_READS : read_data_ro_r = mem_perf_if.icache.reads[31:0];
|
||||||
`VX_CSR_MPM_ICACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.icache.reads[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_ICACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.icache.reads[`PERF_CTR_BITS-1:32]);
|
||||||
`VX_CSR_MPM_ICACHE_MISS_R : read_data_ro_r = mem_perf_if.icache.read_misses[31:0];
|
`VX_CSR_MPM_ICACHE_MISS_R : read_data_ro_r = mem_perf_if.icache.read_misses[31:0];
|
||||||
`VX_CSR_MPM_ICACHE_MISS_R_H : read_data_ro_r = 32'(mem_perf_if.icache.read_misses[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_ICACHE_MISS_R_H : read_data_ro_r = 32'(mem_perf_if.icache.read_misses[`PERF_CTR_BITS-1:32]);
|
||||||
|
`VX_CSR_MPM_ICACHE_MSHR_ST : read_data_ro_r = mem_perf_if.icache.mshr_stalls[31:0];
|
||||||
|
`VX_CSR_MPM_ICACHE_MSHR_ST_H: read_data_ro_r = 32'(mem_perf_if.icache.mshr_stalls[`PERF_CTR_BITS-1:32]);
|
||||||
// PERF: dcache
|
// PERF: dcache
|
||||||
`VX_CSR_MPM_DCACHE_READS : read_data_ro_r = mem_perf_if.dcache.reads[31:0];
|
`VX_CSR_MPM_DCACHE_READS : read_data_ro_r = mem_perf_if.dcache.reads[31:0];
|
||||||
`VX_CSR_MPM_DCACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.dcache.reads[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_DCACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.dcache.reads[`PERF_CTR_BITS-1:32]);
|
||||||
`VX_CSR_MPM_DCACHE_WRITES : read_data_ro_r = mem_perf_if.dcache.writes[31:0];
|
`VX_CSR_MPM_DCACHE_WRITES : read_data_ro_r = mem_perf_if.dcache.writes[31:0];
|
||||||
`VX_CSR_MPM_DCACHE_WRITES_H : read_data_ro_r = 32'(mem_perf_if.dcache.writes[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_DCACHE_WRITES_H : read_data_ro_r = 32'(mem_perf_if.dcache.writes[`PERF_CTR_BITS-1:32]);
|
||||||
`VX_CSR_MPM_DCACHE_MISS_R : read_data_ro_r = mem_perf_if.dcache.read_misses[31:0];
|
`VX_CSR_MPM_DCACHE_MISS_R : read_data_ro_r = mem_perf_if.dcache.read_misses[31:0];
|
||||||
`VX_CSR_MPM_DCACHE_MISS_R_H : read_data_ro_r = 32'(mem_perf_if.dcache.read_misses[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_DCACHE_MISS_R_H : read_data_ro_r = 32'(mem_perf_if.dcache.read_misses[`PERF_CTR_BITS-1:32]);
|
||||||
`VX_CSR_MPM_DCACHE_MISS_W : read_data_ro_r = mem_perf_if.dcache.write_misses[31:0];
|
`VX_CSR_MPM_DCACHE_MISS_W : read_data_ro_r = mem_perf_if.dcache.write_misses[31:0];
|
||||||
`VX_CSR_MPM_DCACHE_MISS_W_H : read_data_ro_r = 32'(mem_perf_if.dcache.write_misses[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_DCACHE_MISS_W_H : read_data_ro_r = 32'(mem_perf_if.dcache.write_misses[`PERF_CTR_BITS-1:32]);
|
||||||
`VX_CSR_MPM_DCACHE_BANK_ST : read_data_ro_r = mem_perf_if.dcache.bank_stalls[31:0];
|
`VX_CSR_MPM_DCACHE_BANK_ST : read_data_ro_r = mem_perf_if.dcache.bank_stalls[31:0];
|
||||||
`VX_CSR_MPM_DCACHE_BANK_ST_H : read_data_ro_r = 32'(mem_perf_if.dcache.bank_stalls[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_DCACHE_BANK_ST_H: read_data_ro_r = 32'(mem_perf_if.dcache.bank_stalls[`PERF_CTR_BITS-1:32]);
|
||||||
`VX_CSR_MPM_DCACHE_MSHR_ST : read_data_ro_r = mem_perf_if.dcache.mshr_stalls[31:0];
|
`VX_CSR_MPM_DCACHE_MSHR_ST : read_data_ro_r = mem_perf_if.dcache.mshr_stalls[31:0];
|
||||||
`VX_CSR_MPM_DCACHE_MSHR_ST_H : read_data_ro_r = 32'(mem_perf_if.dcache.mshr_stalls[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_DCACHE_MSHR_ST_H: read_data_ro_r = 32'(mem_perf_if.dcache.mshr_stalls[`PERF_CTR_BITS-1:32]);
|
||||||
// PERF: smem
|
// PERF: smem
|
||||||
`VX_CSR_MPM_SMEM_READS : read_data_ro_r = mem_perf_if.smem.reads[31:0];
|
`VX_CSR_MPM_SMEM_READS : read_data_ro_r = mem_perf_if.smem.reads[31:0];
|
||||||
`VX_CSR_MPM_SMEM_READS_H : read_data_ro_r = 32'(mem_perf_if.smem.reads[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_SMEM_READS_H : read_data_ro_r = 32'(mem_perf_if.smem.reads[`PERF_CTR_BITS-1:32]);
|
||||||
`VX_CSR_MPM_SMEM_WRITES : read_data_ro_r = mem_perf_if.smem.writes[31:0];
|
`VX_CSR_MPM_SMEM_WRITES : read_data_ro_r = mem_perf_if.smem.writes[31:0];
|
||||||
`VX_CSR_MPM_SMEM_WRITES_H : read_data_ro_r = 32'(mem_perf_if.smem.writes[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_SMEM_WRITES_H : read_data_ro_r = 32'(mem_perf_if.smem.writes[`PERF_CTR_BITS-1:32]);
|
||||||
`VX_CSR_MPM_SMEM_BANK_ST : read_data_ro_r = mem_perf_if.smem.bank_stalls[31:0];
|
`VX_CSR_MPM_SMEM_BANK_ST : read_data_ro_r = mem_perf_if.smem.bank_stalls[31:0];
|
||||||
`VX_CSR_MPM_SMEM_BANK_ST_H : read_data_ro_r = 32'(mem_perf_if.smem.bank_stalls[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_SMEM_BANK_ST_H : read_data_ro_r = 32'(mem_perf_if.smem.bank_stalls[`PERF_CTR_BITS-1:32]);
|
||||||
// PERF: l2cache
|
// PERF: l2cache
|
||||||
`VX_CSR_MPM_L2CACHE_READS : read_data_ro_r = mem_perf_if.l2cache.reads[31:0];
|
`VX_CSR_MPM_L2CACHE_READS : read_data_ro_r = mem_perf_if.l2cache.reads[31:0];
|
||||||
`VX_CSR_MPM_L2CACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.l2cache.reads[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_L2CACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.l2cache.reads[`PERF_CTR_BITS-1:32]);
|
||||||
`VX_CSR_MPM_L2CACHE_WRITES : read_data_ro_r = mem_perf_if.l2cache.writes[31:0];
|
`VX_CSR_MPM_L2CACHE_WRITES : read_data_ro_r = mem_perf_if.l2cache.writes[31:0];
|
||||||
`VX_CSR_MPM_L2CACHE_WRITES_H : read_data_ro_r = 32'(mem_perf_if.l2cache.writes[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_L2CACHE_WRITES_H: read_data_ro_r = 32'(mem_perf_if.l2cache.writes[`PERF_CTR_BITS-1:32]);
|
||||||
`VX_CSR_MPM_L2CACHE_MISS_R : read_data_ro_r = mem_perf_if.l2cache.read_misses[31:0];
|
`VX_CSR_MPM_L2CACHE_MISS_R : read_data_ro_r = mem_perf_if.l2cache.read_misses[31:0];
|
||||||
`VX_CSR_MPM_L2CACHE_MISS_R_H : read_data_ro_r = 32'(mem_perf_if.l2cache.read_misses[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_L2CACHE_MISS_R_H: read_data_ro_r = 32'(mem_perf_if.l2cache.read_misses[`PERF_CTR_BITS-1:32]);
|
||||||
`VX_CSR_MPM_L2CACHE_MISS_W : read_data_ro_r = mem_perf_if.l2cache.write_misses[31:0];
|
`VX_CSR_MPM_L2CACHE_MISS_W : read_data_ro_r = mem_perf_if.l2cache.write_misses[31:0];
|
||||||
`VX_CSR_MPM_L2CACHE_MISS_W_H : read_data_ro_r = 32'(mem_perf_if.l2cache.write_misses[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_L2CACHE_MISS_W_H: read_data_ro_r = 32'(mem_perf_if.l2cache.write_misses[`PERF_CTR_BITS-1:32]);
|
||||||
`VX_CSR_MPM_L2CACHE_BANK_ST : read_data_ro_r = mem_perf_if.l2cache.bank_stalls[31:0];
|
`VX_CSR_MPM_L2CACHE_BANK_ST : read_data_ro_r = mem_perf_if.l2cache.bank_stalls[31:0];
|
||||||
`VX_CSR_MPM_L2CACHE_BANK_ST_H : read_data_ro_r = 32'(mem_perf_if.l2cache.bank_stalls[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_L2CACHE_BANK_ST_H: read_data_ro_r = 32'(mem_perf_if.l2cache.bank_stalls[`PERF_CTR_BITS-1:32]);
|
||||||
`VX_CSR_MPM_L2CACHE_MSHR_ST : read_data_ro_r = mem_perf_if.l2cache.mshr_stalls[31:0];
|
`VX_CSR_MPM_L2CACHE_MSHR_ST : read_data_ro_r = mem_perf_if.l2cache.mshr_stalls[31:0];
|
||||||
`VX_CSR_MPM_L2CACHE_MSHR_ST_H : read_data_ro_r = 32'(mem_perf_if.l2cache.mshr_stalls[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_L2CACHE_MSHR_ST_H: read_data_ro_r = 32'(mem_perf_if.l2cache.mshr_stalls[`PERF_CTR_BITS-1:32]);
|
||||||
// PERF: l3cache
|
// PERF: l3cache
|
||||||
`VX_CSR_MPM_L3CACHE_READS : read_data_ro_r = mem_perf_if.l3cache.reads[31:0];
|
`VX_CSR_MPM_L3CACHE_READS : read_data_ro_r = mem_perf_if.l3cache.reads[31:0];
|
||||||
`VX_CSR_MPM_L3CACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.l3cache.reads[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_L3CACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.l3cache.reads[`PERF_CTR_BITS-1:32]);
|
||||||
`VX_CSR_MPM_L3CACHE_WRITES : read_data_ro_r = mem_perf_if.l3cache.writes[31:0];
|
`VX_CSR_MPM_L3CACHE_WRITES : read_data_ro_r = mem_perf_if.l3cache.writes[31:0];
|
||||||
`VX_CSR_MPM_L3CACHE_WRITES_H : read_data_ro_r = 32'(mem_perf_if.l3cache.writes[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_L3CACHE_WRITES_H: read_data_ro_r = 32'(mem_perf_if.l3cache.writes[`PERF_CTR_BITS-1:32]);
|
||||||
`VX_CSR_MPM_L3CACHE_MISS_R : read_data_ro_r = mem_perf_if.l3cache.read_misses[31:0];
|
`VX_CSR_MPM_L3CACHE_MISS_R : read_data_ro_r = mem_perf_if.l3cache.read_misses[31:0];
|
||||||
`VX_CSR_MPM_L3CACHE_MISS_R_H : read_data_ro_r = 32'(mem_perf_if.l3cache.read_misses[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_L3CACHE_MISS_R_H: read_data_ro_r = 32'(mem_perf_if.l3cache.read_misses[`PERF_CTR_BITS-1:32]);
|
||||||
`VX_CSR_MPM_L3CACHE_MISS_W : read_data_ro_r = mem_perf_if.l3cache.write_misses[31:0];
|
`VX_CSR_MPM_L3CACHE_MISS_W : read_data_ro_r = mem_perf_if.l3cache.write_misses[31:0];
|
||||||
`VX_CSR_MPM_L3CACHE_MISS_W_H : read_data_ro_r = 32'(mem_perf_if.l3cache.write_misses[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_L3CACHE_MISS_W_H: read_data_ro_r = 32'(mem_perf_if.l3cache.write_misses[`PERF_CTR_BITS-1:32]);
|
||||||
`VX_CSR_MPM_L3CACHE_BANK_ST : read_data_ro_r = mem_perf_if.l3cache.bank_stalls[31:0];
|
`VX_CSR_MPM_L3CACHE_BANK_ST : read_data_ro_r = mem_perf_if.l3cache.bank_stalls[31:0];
|
||||||
`VX_CSR_MPM_L3CACHE_BANK_ST_H : read_data_ro_r = 32'(mem_perf_if.l3cache.bank_stalls[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_L3CACHE_BANK_ST_H: read_data_ro_r = 32'(mem_perf_if.l3cache.bank_stalls[`PERF_CTR_BITS-1:32]);
|
||||||
`VX_CSR_MPM_L3CACHE_MSHR_ST : read_data_ro_r = mem_perf_if.l3cache.mshr_stalls[31:0];
|
`VX_CSR_MPM_L3CACHE_MSHR_ST : read_data_ro_r = mem_perf_if.l3cache.mshr_stalls[31:0];
|
||||||
`VX_CSR_MPM_L3CACHE_MSHR_ST_H : read_data_ro_r = 32'(mem_perf_if.l3cache.mshr_stalls[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_L3CACHE_MSHR_ST_H: read_data_ro_r = 32'(mem_perf_if.l3cache.mshr_stalls[`PERF_CTR_BITS-1:32]);
|
||||||
// PERF: memory
|
// PERF: memory
|
||||||
`VX_CSR_MPM_MEM_READS : read_data_ro_r = mem_perf_if.mem.reads[31:0];
|
`VX_CSR_MPM_MEM_READS : read_data_ro_r = mem_perf_if.mem.reads[31:0];
|
||||||
`VX_CSR_MPM_MEM_READS_H : read_data_ro_r = 32'(mem_perf_if.mem.reads[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_MEM_READS_H : read_data_ro_r = 32'(mem_perf_if.mem.reads[`PERF_CTR_BITS-1:32]);
|
||||||
`VX_CSR_MPM_MEM_WRITES : read_data_ro_r = mem_perf_if.mem.writes[31:0];
|
`VX_CSR_MPM_MEM_WRITES : read_data_ro_r = mem_perf_if.mem.writes[31:0];
|
||||||
`VX_CSR_MPM_MEM_WRITES_H : read_data_ro_r = 32'(mem_perf_if.mem.writes[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_MEM_WRITES_H : read_data_ro_r = 32'(mem_perf_if.mem.writes[`PERF_CTR_BITS-1:32]);
|
||||||
`VX_CSR_MPM_MEM_LAT : read_data_ro_r = mem_perf_if.mem.latency[31:0];
|
`VX_CSR_MPM_MEM_LT : read_data_ro_r = mem_perf_if.mem.latency[31:0];
|
||||||
`VX_CSR_MPM_MEM_LAT_H : read_data_ro_r = 32'(mem_perf_if.mem.latency[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_MEM_LT_H : read_data_ro_r = 32'(mem_perf_if.mem.latency[`PERF_CTR_BITS-1:32]);
|
||||||
default:;
|
default:;
|
||||||
endcase
|
endcase
|
||||||
end
|
end
|
||||||
@@ -297,8 +306,6 @@ import VX_fpu_pkg::*;
|
|||||||
`RUNTIME_ASSERT(~read_enable || read_addr_valid_r, ("%t: *** invalid CSR read address: 0x%0h (#%0d)", $time, read_addr, read_uuid))
|
`RUNTIME_ASSERT(~read_enable || read_addr_valid_r, ("%t: *** invalid CSR read address: 0x%0h (#%0d)", $time, read_addr, read_uuid))
|
||||||
|
|
||||||
`ifdef PERF_ENABLE
|
`ifdef PERF_ENABLE
|
||||||
wire [`PERF_CTR_BITS-1:0] perf_wctl_stalls = sfu_perf_if.wctl_stalls;
|
|
||||||
`UNUSED_VAR (perf_wctl_stalls);
|
|
||||||
`UNUSED_VAR (mem_perf_if.icache);
|
`UNUSED_VAR (mem_perf_if.icache);
|
||||||
`UNUSED_VAR (mem_perf_if.smem);
|
`UNUSED_VAR (mem_perf_if.smem);
|
||||||
`endif
|
`endif
|
||||||
|
|||||||
@@ -25,7 +25,6 @@ module VX_csr_unit import VX_gpu_pkg::*; #(
|
|||||||
`ifdef PERF_ENABLE
|
`ifdef PERF_ENABLE
|
||||||
VX_mem_perf_if.slave mem_perf_if,
|
VX_mem_perf_if.slave mem_perf_if,
|
||||||
VX_pipeline_perf_if.slave pipeline_perf_if,
|
VX_pipeline_perf_if.slave pipeline_perf_if,
|
||||||
VX_sfu_perf_if.slave sfu_perf_if,
|
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
`ifdef EXT_F_ENABLE
|
`ifdef EXT_F_ENABLE
|
||||||
@@ -81,7 +80,6 @@ module VX_csr_unit import VX_gpu_pkg::*; #(
|
|||||||
`ifdef PERF_ENABLE
|
`ifdef PERF_ENABLE
|
||||||
.mem_perf_if (mem_perf_if),
|
.mem_perf_if (mem_perf_if),
|
||||||
.pipeline_perf_if(pipeline_perf_if),
|
.pipeline_perf_if(pipeline_perf_if),
|
||||||
.sfu_perf_if (sfu_perf_if),
|
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
.commit_csr_if (commit_csr_if),
|
.commit_csr_if (commit_csr_if),
|
||||||
|
|||||||
@@ -533,8 +533,9 @@ module VX_decode #(
|
|||||||
assign decode_sched_if.valid = fetch_fire;
|
assign decode_sched_if.valid = fetch_fire;
|
||||||
assign decode_sched_if.wid = fetch_if.data.wid;
|
assign decode_sched_if.wid = fetch_if.data.wid;
|
||||||
assign decode_sched_if.is_wstall = is_wstall;
|
assign decode_sched_if.is_wstall = is_wstall;
|
||||||
|
`ifndef L1_ENABLE
|
||||||
assign fetch_if.ibuf_pop = decode_if.ibuf_pop;
|
assign fetch_if.ibuf_pop = decode_if.ibuf_pop;
|
||||||
|
`endif
|
||||||
|
|
||||||
`ifdef DBG_TRACE_CORE_PIPELINE
|
`ifdef DBG_TRACE_CORE_PIPELINE
|
||||||
always @(posedge clk) begin
|
always @(posedge clk) begin
|
||||||
|
|||||||
@@ -12,6 +12,7 @@
|
|||||||
// limitations under the License.
|
// limitations under the License.
|
||||||
|
|
||||||
`include "VX_define.vh"
|
`include "VX_define.vh"
|
||||||
|
`include "VX_trace.vh"
|
||||||
|
|
||||||
module VX_dispatch import VX_gpu_pkg::*; #(
|
module VX_dispatch import VX_gpu_pkg::*; #(
|
||||||
parameter CORE_ID = 0
|
parameter CORE_ID = 0
|
||||||
@@ -174,30 +175,38 @@ module VX_dispatch import VX_gpu_pkg::*; #(
|
|||||||
|| (sfu_operands_if[i].ready && (operands_if[i].data.ex_type == `EX_SFU));
|
|| (sfu_operands_if[i].ready && (operands_if[i].data.ex_type == `EX_SFU));
|
||||||
end
|
end
|
||||||
|
|
||||||
`ifdef PERF_ENABLE
|
`ifdef PERF_ENABLE
|
||||||
reg [`NUM_EX_UNITS-1:0][`PERF_CTR_BITS-1:0] perf_stalls_n, perf_stalls_r;
|
wire [`NUM_EX_UNITS-1:0] perf_unit_stalls_per_cycle, perf_unit_stalls_per_cycle_r;
|
||||||
wire [`ISSUE_WIDTH-1:0] operands_stall;
|
reg [`ISSUE_WIDTH-1:0][`NUM_EX_UNITS-1:0] perf_issue_unit_stalls_per_cycle;
|
||||||
wire [`ISSUE_WIDTH-1:0][`EX_BITS-1:0] operands_ex_type;
|
reg [`NUM_EX_UNITS-1:0][`PERF_CTR_BITS-1:0] perf_stalls_r;
|
||||||
|
|
||||||
for (genvar i=0; i < `ISSUE_WIDTH; ++i) begin
|
for (genvar i=0; i < `ISSUE_WIDTH; ++i) begin
|
||||||
assign operands_stall[i] = operands_if[i].valid && ~operands_if[i].ready;
|
always @(*) begin
|
||||||
assign operands_ex_type[i] = operands_if[i].data.ex_type;
|
perf_issue_unit_stalls_per_cycle[i] = '0;
|
||||||
end
|
if (operands_if[i].valid && ~operands_if[i].ready) begin
|
||||||
|
perf_issue_unit_stalls_per_cycle[i][operands_if[i].data.ex_type] = 1;
|
||||||
always @(*) begin
|
|
||||||
perf_stalls_n = perf_stalls_r;
|
|
||||||
for (integer i=0; i < `ISSUE_WIDTH; ++i) begin
|
|
||||||
if (operands_stall[i]) begin
|
|
||||||
perf_stalls_n[operands_ex_type[i]] += `PERF_CTR_BITS'(1);
|
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
always @(posedge clk) begin
|
VX_reduce #(
|
||||||
if (reset) begin
|
.DATAW_IN (`NUM_EX_UNITS),
|
||||||
perf_stalls_r <= '0;
|
.N (`ISSUE_WIDTH),
|
||||||
end else begin
|
.OP ("|")
|
||||||
perf_stalls_r <= perf_stalls_n;
|
) reduce (
|
||||||
|
.data_in (perf_issue_unit_stalls_per_cycle),
|
||||||
|
.data_out (perf_unit_stalls_per_cycle)
|
||||||
|
);
|
||||||
|
|
||||||
|
`BUFFER(perf_unit_stalls_per_cycle_r, perf_unit_stalls_per_cycle);
|
||||||
|
|
||||||
|
for (genvar i = 0; i < `NUM_EX_UNITS; ++i) begin
|
||||||
|
always @(posedge clk) begin
|
||||||
|
if (reset) begin
|
||||||
|
perf_stalls_r[i] <= '0;
|
||||||
|
end else begin
|
||||||
|
perf_stalls_r[i] <= perf_stalls_r[i] + `PERF_CTR_BITS'(perf_unit_stalls_per_cycle_r[i]);
|
||||||
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|||||||
@@ -70,8 +70,8 @@ module VX_dispatch_unit import VX_gpu_pkg::*; #(
|
|||||||
always @(posedge clk) begin
|
always @(posedge clk) begin
|
||||||
if (reset) begin
|
if (reset) begin
|
||||||
batch_idx <= '0;
|
batch_idx <= '0;
|
||||||
end else if (batch_done) begin
|
end else begin
|
||||||
batch_idx <= batch_idx + BATCH_COUNT_W'(1);
|
batch_idx <= batch_idx + BATCH_COUNT_W'(batch_done);
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
end else begin
|
end else begin
|
||||||
@@ -203,20 +203,20 @@ module VX_dispatch_unit import VX_gpu_pkg::*; #(
|
|||||||
assign block_done[block_idx] = ~valid_p || ready_p;
|
assign block_done[block_idx] = ~valid_p || ready_p;
|
||||||
end
|
end
|
||||||
|
|
||||||
wire [ISSUE_IDX_W-1:0] wsi;
|
wire [ISSUE_ISW_W-1:0] isw;
|
||||||
if (BATCH_COUNT != 1) begin
|
if (BATCH_COUNT != 1) begin
|
||||||
if (BLOCK_SIZE != 1) begin
|
if (BLOCK_SIZE != 1) begin
|
||||||
assign wsi = {batch_idx, BLOCK_SIZE_W'(block_idx)};
|
assign isw = {batch_idx, BLOCK_SIZE_W'(block_idx)};
|
||||||
end else begin
|
end else begin
|
||||||
assign wsi = batch_idx;
|
assign isw = batch_idx;
|
||||||
end
|
end
|
||||||
end else begin
|
end else begin
|
||||||
assign wsi = block_idx;
|
assign isw = block_idx;
|
||||||
end
|
end
|
||||||
|
|
||||||
`RESET_RELAY(buf_out_reset, reset);
|
`RESET_RELAY(buf_out_reset, reset);
|
||||||
|
|
||||||
wire [`NW_WIDTH-1:0] block_wid = wis_to_wid(dispatch_data[issue_idx][DATA_TMASK_OFF+`NUM_THREADS +: ISSUE_WIS_W], wsi);
|
wire [`NW_WIDTH-1:0] block_wid = wis_to_wid(dispatch_data[issue_idx][DATA_TMASK_OFF+`NUM_THREADS +: ISSUE_WIS_W], isw);
|
||||||
|
|
||||||
VX_elastic_buffer #(
|
VX_elastic_buffer #(
|
||||||
.DATAW (OUT_DATAW),
|
.DATAW (OUT_DATAW),
|
||||||
|
|||||||
@@ -32,7 +32,6 @@ module VX_fetch import VX_gpu_pkg::*; #(
|
|||||||
);
|
);
|
||||||
`UNUSED_PARAM (CORE_ID)
|
`UNUSED_PARAM (CORE_ID)
|
||||||
`UNUSED_VAR (reset)
|
`UNUSED_VAR (reset)
|
||||||
localparam ISW_WIDTH = `LOG2UP(`ISSUE_WIDTH);
|
|
||||||
|
|
||||||
wire icache_req_valid;
|
wire icache_req_valid;
|
||||||
wire [ICACHE_ADDR_WIDTH-1:0] icache_req_addr;
|
wire [ICACHE_ADDR_WIDTH-1:0] icache_req_addr;
|
||||||
@@ -44,8 +43,6 @@ module VX_fetch import VX_gpu_pkg::*; #(
|
|||||||
|
|
||||||
wire icache_req_fire = icache_req_valid && icache_req_ready;
|
wire icache_req_fire = icache_req_valid && icache_req_ready;
|
||||||
|
|
||||||
wire [ISW_WIDTH-1:0] schedule_isw = wid_to_isw(schedule_if.data.wid);
|
|
||||||
|
|
||||||
assign req_tag = schedule_if.data.wid;
|
assign req_tag = schedule_if.data.wid;
|
||||||
|
|
||||||
assign {rsp_uuid, rsp_tag} = icache_bus_if.rsp_data.tag;
|
assign {rsp_uuid, rsp_tag} = icache_bus_if.rsp_data.tag;
|
||||||
@@ -68,9 +65,12 @@ module VX_fetch import VX_gpu_pkg::*; #(
|
|||||||
.rdata ({rsp_PC, rsp_tmask})
|
.rdata ({rsp_PC, rsp_tmask})
|
||||||
);
|
);
|
||||||
|
|
||||||
|
`ifndef L1_ENABLE
|
||||||
// Ensure that the ibuffer doesn't fill up.
|
// Ensure that the ibuffer doesn't fill up.
|
||||||
// This resolves potential deadlock if ibuffer fills and the LSU stalls the execute stage due to pending dcache request.
|
// This resolves potential deadlock if ibuffer fills and the LSU stalls the execute stage due to pending dcache request.
|
||||||
// This issue is particularly prevalent when the icache and dcache is disabled and both requests share the same bus.
|
// This issue is particularly prevalent when the icache and dcache is disabled and both requests share the same bus.
|
||||||
|
wire [ISSUE_ISW-1:0] schedule_isw = wid_to_isw(schedule_if.data.wid);
|
||||||
|
|
||||||
wire [`ISSUE_WIDTH-1:0] pending_ibuf_full;
|
wire [`ISSUE_WIDTH-1:0] pending_ibuf_full;
|
||||||
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
|
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
|
||||||
VX_pending_size #(
|
VX_pending_size #(
|
||||||
@@ -85,13 +85,16 @@ module VX_fetch import VX_gpu_pkg::*; #(
|
|||||||
`UNUSED_PIN (empty)
|
`UNUSED_PIN (empty)
|
||||||
);
|
);
|
||||||
end
|
end
|
||||||
|
wire ibuf_ready = ~pending_ibuf_full[schedule_isw];
|
||||||
|
`else
|
||||||
|
wire ibuf_ready = 1'b1;
|
||||||
|
`endif
|
||||||
|
|
||||||
`RUNTIME_ASSERT((!schedule_if.valid || schedule_if.data.PC != 0),
|
`RUNTIME_ASSERT((!schedule_if.valid || schedule_if.data.PC != 0),
|
||||||
("%t: *** invalid PC=0x%0h, wid=%0d, tmask=%b (#%0d)", $time, schedule_if.data.PC, schedule_if.data.wid, schedule_if.data.tmask, schedule_if.data.uuid))
|
("%t: *** invalid PC=0x%0h, wid=%0d, tmask=%b (#%0d)", $time, schedule_if.data.PC, schedule_if.data.wid, schedule_if.data.tmask, schedule_if.data.uuid))
|
||||||
|
|
||||||
// Icache Request
|
// Icache Request
|
||||||
|
|
||||||
wire ibuf_ready = ~pending_ibuf_full[schedule_isw];
|
|
||||||
assign icache_req_valid = schedule_if.valid && ibuf_ready;
|
assign icache_req_valid = schedule_if.valid && ibuf_ready;
|
||||||
assign icache_req_addr = schedule_if.data.PC[`MEM_ADDR_WIDTH-1:2];
|
assign icache_req_addr = schedule_if.data.PC[`MEM_ADDR_WIDTH-1:2];
|
||||||
assign icache_req_tag = {schedule_if.data.uuid, req_tag};
|
assign icache_req_tag = {schedule_if.data.uuid, req_tag};
|
||||||
|
|||||||
@@ -30,7 +30,7 @@ module VX_fpu_unit import VX_fpu_pkg::*; #(
|
|||||||
localparam NUM_LANES = `NUM_FPU_LANES;
|
localparam NUM_LANES = `NUM_FPU_LANES;
|
||||||
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
|
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
|
||||||
localparam PID_WIDTH = `UP(PID_BITS);
|
localparam PID_WIDTH = `UP(PID_BITS);
|
||||||
localparam TAG_WIDTH = `LOG2UP(`FPU_REQ_QUEUE_SIZE);
|
localparam TAG_WIDTH = `LOG2UP(`FPUQ_SIZE);
|
||||||
localparam PARTIAL_BW = (BLOCK_SIZE != `ISSUE_WIDTH) || (NUM_LANES != `NUM_THREADS);
|
localparam PARTIAL_BW = (BLOCK_SIZE != `ISSUE_WIDTH) || (NUM_LANES != `NUM_THREADS);
|
||||||
|
|
||||||
VX_execute_if #(
|
VX_execute_if #(
|
||||||
@@ -87,7 +87,7 @@ module VX_fpu_unit import VX_fpu_pkg::*; #(
|
|||||||
|
|
||||||
VX_index_buffer #(
|
VX_index_buffer #(
|
||||||
.DATAW (`UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS + PID_WIDTH + 1 + 1),
|
.DATAW (`UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS + PID_WIDTH + 1 + 1),
|
||||||
.SIZE (`FPU_REQ_QUEUE_SIZE)
|
.SIZE (`FPUQ_SIZE)
|
||||||
) tag_store (
|
) tag_store (
|
||||||
.clk (clk),
|
.clk (clk),
|
||||||
.reset (reset),
|
.reset (reset),
|
||||||
@@ -37,7 +37,7 @@ module VX_gather_unit import VX_gpu_pkg::*; #(
|
|||||||
wire [BLOCK_SIZE-1:0] commit_in_valid;
|
wire [BLOCK_SIZE-1:0] commit_in_valid;
|
||||||
wire [BLOCK_SIZE-1:0][DATAW-1:0] commit_in_data;
|
wire [BLOCK_SIZE-1:0][DATAW-1:0] commit_in_data;
|
||||||
wire [BLOCK_SIZE-1:0] commit_in_ready;
|
wire [BLOCK_SIZE-1:0] commit_in_ready;
|
||||||
wire [BLOCK_SIZE-1:0][ISSUE_IDX_W-1:0] commit_in_wsi;
|
wire [BLOCK_SIZE-1:0][ISSUE_ISW_W-1:0] commit_in_isw;
|
||||||
|
|
||||||
for (genvar i = 0; i < BLOCK_SIZE; ++i) begin
|
for (genvar i = 0; i < BLOCK_SIZE; ++i) begin
|
||||||
assign commit_in_valid[i] = commit_in_if[i].valid;
|
assign commit_in_valid[i] = commit_in_if[i].valid;
|
||||||
@@ -45,12 +45,12 @@ module VX_gather_unit import VX_gpu_pkg::*; #(
|
|||||||
assign commit_in_if[i].ready = commit_in_ready[i];
|
assign commit_in_if[i].ready = commit_in_ready[i];
|
||||||
if (BLOCK_SIZE != `ISSUE_WIDTH) begin
|
if (BLOCK_SIZE != `ISSUE_WIDTH) begin
|
||||||
if (BLOCK_SIZE != 1) begin
|
if (BLOCK_SIZE != 1) begin
|
||||||
assign commit_in_wsi[i] = {commit_in_data[i][DATA_WIS_OFF+BLOCK_SIZE_W +: (ISSUE_IDX_W-BLOCK_SIZE_W)], BLOCK_SIZE_W'(i)};
|
assign commit_in_isw[i] = {commit_in_data[i][DATA_WIS_OFF+BLOCK_SIZE_W +: (ISSUE_ISW_W-BLOCK_SIZE_W)], BLOCK_SIZE_W'(i)};
|
||||||
end else begin
|
end else begin
|
||||||
assign commit_in_wsi[i] = commit_in_data[i][DATA_WIS_OFF +: ISSUE_IDX_W];
|
assign commit_in_isw[i] = commit_in_data[i][DATA_WIS_OFF +: ISSUE_ISW_W];
|
||||||
end
|
end
|
||||||
end else begin
|
end else begin
|
||||||
assign commit_in_wsi[i] = BLOCK_SIZE_W'(i);
|
assign commit_in_isw[i] = BLOCK_SIZE_W'(i);
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
@@ -64,12 +64,12 @@ module VX_gather_unit import VX_gpu_pkg::*; #(
|
|||||||
commit_out_data[i] = 'x;
|
commit_out_data[i] = 'x;
|
||||||
end
|
end
|
||||||
for (integer i = 0; i < BLOCK_SIZE; ++i) begin
|
for (integer i = 0; i < BLOCK_SIZE; ++i) begin
|
||||||
commit_out_valid[commit_in_wsi[i]] = commit_in_valid[i];
|
commit_out_valid[commit_in_isw[i]] = commit_in_valid[i];
|
||||||
commit_out_data[commit_in_wsi[i]] = commit_in_data[i];
|
commit_out_data[commit_in_isw[i]] = commit_in_data[i];
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
for (genvar i = 0; i < BLOCK_SIZE; ++i) begin
|
for (genvar i = 0; i < BLOCK_SIZE; ++i) begin
|
||||||
assign commit_in_ready[i] = commit_out_ready[commit_in_wsi[i]];
|
assign commit_in_ready[i] = commit_out_ready[commit_in_isw[i]];
|
||||||
end
|
end
|
||||||
|
|
||||||
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
|
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
|
||||||
|
|||||||
@@ -66,8 +66,9 @@ module VX_ibuffer import VX_gpu_pkg::*; #(
|
|||||||
.valid_out (ibuffer_if[i].valid),
|
.valid_out (ibuffer_if[i].valid),
|
||||||
.ready_out(ibuffer_if[i].ready)
|
.ready_out(ibuffer_if[i].ready)
|
||||||
);
|
);
|
||||||
|
`ifndef L1_ENABLE
|
||||||
assign decode_if.ibuf_pop[i] = ibuffer_if[i].valid && ibuffer_if[i].ready;
|
assign decode_if.ibuf_pop[i] = ibuffer_if[i].valid && ibuffer_if[i].ready;
|
||||||
|
`endif
|
||||||
end
|
end
|
||||||
|
|
||||||
endmodule
|
endmodule
|
||||||
|
|||||||
@@ -14,10 +14,10 @@
|
|||||||
`include "VX_platform.vh"
|
`include "VX_platform.vh"
|
||||||
|
|
||||||
module VX_ipdom_stack #(
|
module VX_ipdom_stack #(
|
||||||
parameter WIDTH = 1,
|
parameter WIDTH = 1,
|
||||||
parameter DEPTH = 1,
|
parameter DEPTH = 1,
|
||||||
parameter OUT_REG = 0,
|
parameter OUT_REG = 0,
|
||||||
parameter ADDRW = `LOG2UP(DEPTH)
|
parameter ADDRW = `LOG2UP(DEPTH)
|
||||||
) (
|
) (
|
||||||
input wire clk,
|
input wire clk,
|
||||||
input wire reset,
|
input wire reset,
|
||||||
|
|||||||
@@ -59,6 +59,11 @@ module VX_issue #(
|
|||||||
) scoreboard (
|
) scoreboard (
|
||||||
.clk (clk),
|
.clk (clk),
|
||||||
.reset (scoreboard_reset),
|
.reset (scoreboard_reset),
|
||||||
|
`ifdef PERF_ENABLE
|
||||||
|
.perf_scb_stalls(perf_issue_if.scb_stalls),
|
||||||
|
.perf_units_uses(perf_issue_if.units_uses),
|
||||||
|
.perf_sfu_uses (perf_issue_if.sfu_uses),
|
||||||
|
`endif
|
||||||
.writeback_if (writeback_if),
|
.writeback_if (writeback_if),
|
||||||
.ibuffer_if (ibuffer_if),
|
.ibuffer_if (ibuffer_if),
|
||||||
.scoreboard_if (scoreboard_if)
|
.scoreboard_if (scoreboard_if)
|
||||||
@@ -80,7 +85,7 @@ module VX_issue #(
|
|||||||
.clk (clk),
|
.clk (clk),
|
||||||
.reset (dispatch_reset),
|
.reset (dispatch_reset),
|
||||||
`ifdef PERF_ENABLE
|
`ifdef PERF_ENABLE
|
||||||
.perf_stalls (perf_issue_if.dsp_stalls),
|
`UNUSED_PIN (perf_stalls),
|
||||||
`endif
|
`endif
|
||||||
.operands_if (operands_if),
|
.operands_if (operands_if),
|
||||||
.alu_dispatch_if(alu_dispatch_if),
|
.alu_dispatch_if(alu_dispatch_if),
|
||||||
@@ -152,29 +157,18 @@ module VX_issue #(
|
|||||||
|
|
||||||
`ifdef PERF_ENABLE
|
`ifdef PERF_ENABLE
|
||||||
reg [`PERF_CTR_BITS-1:0] perf_ibf_stalls;
|
reg [`PERF_CTR_BITS-1:0] perf_ibf_stalls;
|
||||||
reg [`PERF_CTR_BITS-1:0] perf_scb_stalls;
|
|
||||||
|
wire decode_stall = decode_if.valid && ~decode_if.ready;
|
||||||
wire [`CLOG2(`ISSUE_WIDTH+1)-1:0] scoreboard_stalls_per_cycle;
|
|
||||||
reg [`ISSUE_WIDTH-1:0] scoreboard_stalls;
|
|
||||||
for (genvar i=0; i < `ISSUE_WIDTH; ++i) begin
|
|
||||||
assign scoreboard_stalls[i] = ibuffer_if[i].valid && ~ibuffer_if[i].ready;
|
|
||||||
end
|
|
||||||
`POP_COUNT(scoreboard_stalls_per_cycle, scoreboard_stalls);
|
|
||||||
|
|
||||||
always @(posedge clk) begin
|
always @(posedge clk) begin
|
||||||
if (reset) begin
|
if (reset) begin
|
||||||
perf_ibf_stalls <= '0;
|
perf_ibf_stalls <= '0;
|
||||||
perf_scb_stalls <= '0;
|
|
||||||
end else begin
|
end else begin
|
||||||
if (decode_if.valid && ~decode_if.ready) begin
|
perf_ibf_stalls <= perf_ibf_stalls + `PERF_CTR_BITS'(decode_stall);
|
||||||
perf_ibf_stalls <= perf_ibf_stalls + `PERF_CTR_BITS'(1);
|
|
||||||
end
|
|
||||||
perf_scb_stalls <= perf_scb_stalls + `PERF_CTR_BITS'(scoreboard_stalls_per_cycle);
|
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
assign perf_issue_if.ibf_stalls = perf_ibf_stalls;
|
assign perf_issue_if.ibf_stalls = perf_ibf_stalls;
|
||||||
assign perf_issue_if.scb_stalls = perf_scb_stalls;
|
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
endmodule
|
endmodule
|
||||||
|
|||||||
@@ -96,7 +96,7 @@ module VX_lsu_unit import VX_gpu_pkg::*; #(
|
|||||||
// detect duplicate addresses
|
// detect duplicate addresses
|
||||||
|
|
||||||
wire lsu_is_dup;
|
wire lsu_is_dup;
|
||||||
`ifdef LSU_DUP
|
`ifdef LSU_DUP_ENABLE
|
||||||
if (NUM_LANES > 1) begin
|
if (NUM_LANES > 1) begin
|
||||||
wire [NUM_LANES-2:0] addr_matches;
|
wire [NUM_LANES-2:0] addr_matches;
|
||||||
for (genvar i = 0; i < (NUM_LANES-1); ++i) begin
|
for (genvar i = 0; i < (NUM_LANES-1); ++i) begin
|
||||||
@@ -304,7 +304,7 @@ module VX_lsu_unit import VX_gpu_pkg::*; #(
|
|||||||
|
|
||||||
assign mem_req_tag = {
|
assign mem_req_tag = {
|
||||||
execute_if[0].data.uuid, lsu_addr_type, execute_if[0].data.wid, execute_if[0].data.tmask, execute_if[0].data.PC, execute_if[0].data.rd, execute_if[0].data.op_type, req_align, execute_if[0].data.pid, pkt_waddr
|
execute_if[0].data.uuid, lsu_addr_type, execute_if[0].data.wid, execute_if[0].data.tmask, execute_if[0].data.PC, execute_if[0].data.rd, execute_if[0].data.op_type, req_align, execute_if[0].data.pid, pkt_waddr
|
||||||
`ifdef LSU_DUP
|
`ifdef LSU_DUP_ENABLE
|
||||||
, lsu_is_dup
|
, lsu_is_dup
|
||||||
`endif
|
`endif
|
||||||
};
|
};
|
||||||
@@ -448,13 +448,13 @@ module VX_lsu_unit import VX_gpu_pkg::*; #(
|
|||||||
wire [PID_WIDTH-1:0] rsp_pid;
|
wire [PID_WIDTH-1:0] rsp_pid;
|
||||||
wire rsp_is_dup;
|
wire rsp_is_dup;
|
||||||
|
|
||||||
`ifndef LSU_DUP
|
`ifndef LSU_DUP_ENABLE
|
||||||
assign rsp_is_dup = 0;
|
assign rsp_is_dup = 0;
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
assign {
|
assign {
|
||||||
rsp_uuid, rsp_addr_type, rsp_wid, rsp_tmask_uq, rsp_pc, rsp_rd, rsp_op_type, rsp_align, rsp_pid, pkt_raddr
|
rsp_uuid, rsp_addr_type, rsp_wid, rsp_tmask_uq, rsp_pc, rsp_rd, rsp_op_type, rsp_align, rsp_pid, pkt_raddr
|
||||||
`ifdef LSU_DUP
|
`ifdef LSU_DUP_ENABLE
|
||||||
, rsp_is_dup
|
, rsp_is_dup
|
||||||
`endif
|
`endif
|
||||||
} = mem_rsp_tag;
|
} = mem_rsp_tag;
|
||||||
@@ -554,7 +554,7 @@ module VX_lsu_unit import VX_gpu_pkg::*; #(
|
|||||||
VX_stream_arb #(
|
VX_stream_arb #(
|
||||||
.NUM_INPUTS (2),
|
.NUM_INPUTS (2),
|
||||||
.DATAW (RSP_ARB_DATAW),
|
.DATAW (RSP_ARB_DATAW),
|
||||||
.OUT_REG (1)
|
.OUT_REG (3)
|
||||||
) rsp_arb (
|
) rsp_arb (
|
||||||
.clk (clk),
|
.clk (clk),
|
||||||
.reset (commit_reset),
|
.reset (commit_reset),
|
||||||
|
|||||||
@@ -220,8 +220,13 @@ module VX_muldiv_unit #(
|
|||||||
wire [NUM_LANES-1:0][`XLEN-1:0] div_in2;
|
wire [NUM_LANES-1:0][`XLEN-1:0] div_in2;
|
||||||
|
|
||||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||||
|
`ifdef XLEN_64
|
||||||
assign div_in1[i] = is_alu_w ? {{(`XLEN-32){is_signed_op && execute_if.data.rs1_data[i][31]}}, execute_if.data.rs1_data[i][31:0]}: execute_if.data.rs1_data[i];
|
assign div_in1[i] = is_alu_w ? {{(`XLEN-32){is_signed_op && execute_if.data.rs1_data[i][31]}}, execute_if.data.rs1_data[i][31:0]}: execute_if.data.rs1_data[i];
|
||||||
assign div_in2[i] = is_alu_w ? {{(`XLEN-32){is_signed_op && execute_if.data.rs2_data[i][31]}}, execute_if.data.rs2_data[i][31:0]}: execute_if.data.rs2_data[i];
|
assign div_in2[i] = is_alu_w ? {{(`XLEN-32){is_signed_op && execute_if.data.rs2_data[i][31]}}, execute_if.data.rs2_data[i][31:0]}: execute_if.data.rs2_data[i];
|
||||||
|
`else
|
||||||
|
assign div_in1[i] = execute_if.data.rs1_data[i];
|
||||||
|
assign div_in2[i] = execute_if.data.rs2_data[i];
|
||||||
|
`endif
|
||||||
end
|
end
|
||||||
|
|
||||||
`ifdef IDIV_DPI
|
`ifdef IDIV_DPI
|
||||||
|
|||||||
@@ -26,6 +26,7 @@ module VX_operands import VX_gpu_pkg::*; #(
|
|||||||
);
|
);
|
||||||
`UNUSED_PARAM (CORE_ID)
|
`UNUSED_PARAM (CORE_ID)
|
||||||
localparam DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `XLEN + 1 + `EX_BITS + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + `XLEN + `NR_BITS;
|
localparam DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `XLEN + 1 + `EX_BITS + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + `XLEN + `NR_BITS;
|
||||||
|
localparam RAM_ADDRW = `LOG2UP(`NUM_REGS * ISSUE_RATIO);
|
||||||
|
|
||||||
localparam STATE_IDLE = 2'd0;
|
localparam STATE_IDLE = 2'd0;
|
||||||
localparam STATE_FETCH1 = 2'd1;
|
localparam STATE_FETCH1 = 2'd1;
|
||||||
@@ -38,14 +39,17 @@ module VX_operands import VX_gpu_pkg::*; #(
|
|||||||
reg [`NR_BITS-1:0] gpr_rd_rid, gpr_rd_rid_n;
|
reg [`NR_BITS-1:0] gpr_rd_rid, gpr_rd_rid_n;
|
||||||
reg [ISSUE_WIS_W-1:0] gpr_rd_wis, gpr_rd_wis_n;
|
reg [ISSUE_WIS_W-1:0] gpr_rd_wis, gpr_rd_wis_n;
|
||||||
|
|
||||||
reg [ISSUE_RATIO-1:0][`NUM_THREADS-1:0][`XLEN-1:0] cache_data, cache_data_n;
|
reg [`NUM_THREADS-1:0][`XLEN-1:0] cache_data [ISSUE_RATIO-1:0];
|
||||||
reg [ISSUE_RATIO-1:0][`NR_BITS-1:0] cache_reg, cache_reg_n;
|
reg [`NUM_THREADS-1:0][`XLEN-1:0] cache_data_n [ISSUE_RATIO-1:0];
|
||||||
reg [ISSUE_RATIO-1:0][`NUM_THREADS-1:0] cache_tmask, cache_tmask_n;
|
reg [`NR_BITS-1:0] cache_reg [ISSUE_RATIO-1:0];
|
||||||
|
reg [`NR_BITS-1:0] cache_reg_n [ISSUE_RATIO-1:0];
|
||||||
|
reg [`NUM_THREADS-1:0] cache_tmask [ISSUE_RATIO-1:0];
|
||||||
|
reg [`NUM_THREADS-1:0] cache_tmask_n [ISSUE_RATIO-1:0];
|
||||||
reg [ISSUE_RATIO-1:0] cache_eop, cache_eop_n;
|
reg [ISSUE_RATIO-1:0] cache_eop, cache_eop_n;
|
||||||
|
|
||||||
reg [`NUM_THREADS-1:0][`XLEN-1:0] rs1_data, rs1_data_n;
|
reg [`NUM_THREADS-1:0][`XLEN-1:0] rs1_data, rs1_data_n;
|
||||||
reg [`NUM_THREADS-1:0][`XLEN-1:0] rs2_data, rs2_data_n;
|
reg [`NUM_THREADS-1:0][`XLEN-1:0] rs2_data, rs2_data_n;
|
||||||
reg [`NUM_THREADS-1:0][`XLEN-1:0] rs3_data, rs3_data_n;
|
reg [`NUM_THREADS-1:0][`XLEN-1:0] rs3_data, rs3_data_n;
|
||||||
|
|
||||||
reg [STATE_BITS-1:0] state, state_n;
|
reg [STATE_BITS-1:0] state, state_n;
|
||||||
reg [`NR_BITS-1:0] rs2, rs2_n;
|
reg [`NR_BITS-1:0] rs2, rs2_n;
|
||||||
@@ -54,11 +58,11 @@ module VX_operands import VX_gpu_pkg::*; #(
|
|||||||
reg rs3_ready, rs3_ready_n;
|
reg rs3_ready, rs3_ready_n;
|
||||||
reg data_ready, data_ready_n;
|
reg data_ready, data_ready_n;
|
||||||
|
|
||||||
|
wire stg_valid_in, stg_ready_in;
|
||||||
|
|
||||||
wire is_rs1_zero = (scoreboard_if[i].data.rs1 == 0);
|
wire is_rs1_zero = (scoreboard_if[i].data.rs1 == 0);
|
||||||
wire is_rs2_zero = (scoreboard_if[i].data.rs2 == 0);
|
wire is_rs2_zero = (scoreboard_if[i].data.rs2 == 0);
|
||||||
wire is_rs3_zero = (scoreboard_if[i].data.rs3 == 0);
|
wire is_rs3_zero = (scoreboard_if[i].data.rs3 == 0);
|
||||||
|
|
||||||
VX_operands_if staging_if();
|
|
||||||
|
|
||||||
always @(*) begin
|
always @(*) begin
|
||||||
state_n = state;
|
state_n = state;
|
||||||
@@ -79,7 +83,7 @@ module VX_operands import VX_gpu_pkg::*; #(
|
|||||||
|
|
||||||
case (state)
|
case (state)
|
||||||
STATE_IDLE: begin
|
STATE_IDLE: begin
|
||||||
if (staging_if.valid && staging_if.ready) begin
|
if (operands_if[i].valid && operands_if[i].ready) begin
|
||||||
data_ready_n = 0;
|
data_ready_n = 0;
|
||||||
end
|
end
|
||||||
if (scoreboard_if[i].valid && data_ready_n == 0) begin
|
if (scoreboard_if[i].valid && data_ready_n == 0) begin
|
||||||
@@ -160,44 +164,98 @@ module VX_operands import VX_gpu_pkg::*; #(
|
|||||||
end
|
end
|
||||||
cache_reg_n[writeback_if[i].data.wis] = writeback_if[i].data.rd;
|
cache_reg_n[writeback_if[i].data.wis] = writeback_if[i].data.rd;
|
||||||
cache_eop_n[writeback_if[i].data.wis] = writeback_if[i].data.eop;
|
cache_eop_n[writeback_if[i].data.wis] = writeback_if[i].data.eop;
|
||||||
if (writeback_if[i].data.sop) begin
|
cache_tmask_n[writeback_if[i].data.wis] = writeback_if[i].data.sop ? writeback_if[i].data.tmask :
|
||||||
cache_tmask_n[writeback_if[i].data.wis] = writeback_if[i].data.tmask;
|
(cache_tmask_n[writeback_if[i].data.wis] | writeback_if[i].data.tmask);
|
||||||
end else begin
|
|
||||||
cache_tmask_n[writeback_if[i].data.wis] |= writeback_if[i].data.tmask;
|
|
||||||
end
|
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
always @(posedge clk) begin
|
always @(posedge clk) begin
|
||||||
if (reset) begin
|
if (reset) begin
|
||||||
state <= STATE_IDLE;
|
state <= STATE_IDLE;
|
||||||
gpr_rd_rid <= '0;
|
|
||||||
gpr_rd_wis <= '0;
|
|
||||||
cache_eop <= {ISSUE_RATIO{1'b1}};
|
cache_eop <= {ISSUE_RATIO{1'b1}};
|
||||||
cache_reg <= '0;
|
|
||||||
data_ready <= 0;
|
data_ready <= 0;
|
||||||
end else begin
|
end else begin
|
||||||
state <= state_n;
|
state <= state_n;
|
||||||
rs2 <= rs2_n;
|
|
||||||
rs3 <= rs3_n;
|
|
||||||
rs2_ready <= rs2_ready_n;
|
|
||||||
rs3_ready <= rs3_ready_n;
|
|
||||||
rs1_data <= rs1_data_n;
|
|
||||||
rs2_data <= rs2_data_n;
|
|
||||||
rs3_data <= rs3_data_n;
|
|
||||||
gpr_rd_rid <= gpr_rd_rid_n;
|
|
||||||
gpr_rd_wis <= gpr_rd_wis_n;
|
|
||||||
cache_data <= cache_data_n;
|
|
||||||
cache_reg <= cache_reg_n;
|
|
||||||
cache_tmask <= cache_tmask_n;
|
|
||||||
cache_eop <= cache_eop_n;
|
cache_eop <= cache_eop_n;
|
||||||
data_ready <= data_ready_n;
|
data_ready <= data_ready_n;
|
||||||
end
|
end
|
||||||
|
gpr_rd_rid <= gpr_rd_rid_n;
|
||||||
|
gpr_rd_wis <= gpr_rd_wis_n;
|
||||||
|
rs2_ready <= rs2_ready_n;
|
||||||
|
rs3_ready <= rs3_ready_n;
|
||||||
|
rs2 <= rs2_n;
|
||||||
|
rs3 <= rs3_n;
|
||||||
|
rs1_data <= rs1_data_n;
|
||||||
|
rs2_data <= rs2_data_n;
|
||||||
|
rs3_data <= rs3_data_n;
|
||||||
|
cache_data <= cache_data_n;
|
||||||
|
cache_reg <= cache_reg_n;
|
||||||
|
cache_tmask <= cache_tmask_n;
|
||||||
end
|
end
|
||||||
|
|
||||||
|
assign stg_valid_in = scoreboard_if[i].valid && data_ready;
|
||||||
|
assign scoreboard_if[i].ready = stg_ready_in && data_ready;
|
||||||
|
|
||||||
|
VX_toggle_buffer #(
|
||||||
|
.DATAW (DATAW)
|
||||||
|
) staging_buffer (
|
||||||
|
.clk (clk),
|
||||||
|
.reset (reset),
|
||||||
|
.valid_in (stg_valid_in),
|
||||||
|
.data_in ({
|
||||||
|
scoreboard_if[i].data.uuid,
|
||||||
|
scoreboard_if[i].data.wis,
|
||||||
|
scoreboard_if[i].data.tmask,
|
||||||
|
scoreboard_if[i].data.PC,
|
||||||
|
scoreboard_if[i].data.wb,
|
||||||
|
scoreboard_if[i].data.ex_type,
|
||||||
|
scoreboard_if[i].data.op_type,
|
||||||
|
scoreboard_if[i].data.op_mod,
|
||||||
|
scoreboard_if[i].data.use_PC,
|
||||||
|
scoreboard_if[i].data.use_imm,
|
||||||
|
scoreboard_if[i].data.imm,
|
||||||
|
scoreboard_if[i].data.rd
|
||||||
|
}),
|
||||||
|
.ready_in (stg_ready_in),
|
||||||
|
.valid_out (operands_if[i].valid),
|
||||||
|
.data_out ({
|
||||||
|
operands_if[i].data.uuid,
|
||||||
|
operands_if[i].data.wis,
|
||||||
|
operands_if[i].data.tmask,
|
||||||
|
operands_if[i].data.PC,
|
||||||
|
operands_if[i].data.wb,
|
||||||
|
operands_if[i].data.ex_type,
|
||||||
|
operands_if[i].data.op_type,
|
||||||
|
operands_if[i].data.op_mod,
|
||||||
|
operands_if[i].data.use_PC,
|
||||||
|
operands_if[i].data.use_imm,
|
||||||
|
operands_if[i].data.imm,
|
||||||
|
operands_if[i].data.rd
|
||||||
|
}),
|
||||||
|
.ready_out (operands_if[i].ready)
|
||||||
|
);
|
||||||
|
|
||||||
|
assign operands_if[i].data.rs1_data = rs1_data;
|
||||||
|
assign operands_if[i].data.rs2_data = rs2_data;
|
||||||
|
assign operands_if[i].data.rs3_data = rs3_data;
|
||||||
|
|
||||||
// GPR banks
|
// GPR banks
|
||||||
|
|
||||||
|
reg [RAM_ADDRW-1:0] gpr_rd_addr;
|
||||||
|
wire [RAM_ADDRW-1:0] gpr_wr_addr;
|
||||||
|
if (ISSUE_WIS != 0) begin
|
||||||
|
assign gpr_wr_addr = {writeback_if[i].data.wis, writeback_if[i].data.rd};
|
||||||
|
always @(posedge clk) begin
|
||||||
|
gpr_rd_addr <= {gpr_rd_wis_n, gpr_rd_rid_n};
|
||||||
|
end
|
||||||
|
end else begin
|
||||||
|
assign gpr_wr_addr = writeback_if[i].data.rd;
|
||||||
|
always @(posedge clk) begin
|
||||||
|
gpr_rd_addr <= gpr_rd_rid_n;
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
`ifdef GPR_RESET
|
`ifdef GPR_RESET
|
||||||
reg wr_enabled = 0;
|
reg wr_enabled = 0;
|
||||||
always @(posedge clk) begin
|
always @(posedge clk) begin
|
||||||
@@ -205,10 +263,8 @@ module VX_operands import VX_gpu_pkg::*; #(
|
|||||||
wr_enabled <= 1;
|
wr_enabled <= 1;
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
`else
|
|
||||||
wire wr_enabled = 1;
|
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
for (genvar j = 0; j < `NUM_THREADS; ++j) begin
|
for (genvar j = 0; j < `NUM_THREADS; ++j) begin
|
||||||
VX_dp_ram #(
|
VX_dp_ram #(
|
||||||
.DATAW (`XLEN),
|
.DATAW (`XLEN),
|
||||||
@@ -222,81 +278,17 @@ module VX_operands import VX_gpu_pkg::*; #(
|
|||||||
.clk (clk),
|
.clk (clk),
|
||||||
.read (1'b1),
|
.read (1'b1),
|
||||||
`UNUSED_PIN (wren),
|
`UNUSED_PIN (wren),
|
||||||
.write (wr_enabled && writeback_if[i].valid && writeback_if[i].data.tmask[j]),
|
`ifdef GPR_RESET
|
||||||
.waddr (wis_to_addr(writeback_if[i].data.rd, writeback_if[i].data.wis)),
|
.write (wr_enabled && writeback_if[i].valid && writeback_if[i].data.tmask[j]),
|
||||||
|
`else
|
||||||
|
.write (writeback_if[i].valid && writeback_if[i].data.tmask[j]),
|
||||||
|
`endif
|
||||||
|
.waddr (gpr_wr_addr),
|
||||||
.wdata (writeback_if[i].data.data[j]),
|
.wdata (writeback_if[i].data.data[j]),
|
||||||
.raddr (wis_to_addr(gpr_rd_rid, gpr_rd_wis)),
|
.raddr (gpr_rd_addr),
|
||||||
.rdata (gpr_rd_data[j])
|
.rdata (gpr_rd_data[j])
|
||||||
);
|
);
|
||||||
end
|
end
|
||||||
|
end
|
||||||
// staging buffer
|
|
||||||
|
|
||||||
`RESET_RELAY (stg_buf_reset, reset);
|
|
||||||
|
|
||||||
VX_elastic_buffer #(
|
|
||||||
.DATAW (DATAW)
|
|
||||||
) stg_buf (
|
|
||||||
.clk (clk),
|
|
||||||
.reset (stg_buf_reset),
|
|
||||||
.valid_in (scoreboard_if[i].valid),
|
|
||||||
.ready_in (scoreboard_if[i].ready),
|
|
||||||
.data_in ({
|
|
||||||
scoreboard_if[i].data.uuid,
|
|
||||||
scoreboard_if[i].data.wis,
|
|
||||||
scoreboard_if[i].data.tmask,
|
|
||||||
scoreboard_if[i].data.PC,
|
|
||||||
scoreboard_if[i].data.wb,
|
|
||||||
scoreboard_if[i].data.ex_type,
|
|
||||||
scoreboard_if[i].data.op_type,
|
|
||||||
scoreboard_if[i].data.op_mod,
|
|
||||||
scoreboard_if[i].data.use_PC,
|
|
||||||
scoreboard_if[i].data.use_imm,
|
|
||||||
scoreboard_if[i].data.imm,
|
|
||||||
scoreboard_if[i].data.rd}),
|
|
||||||
.data_out ({
|
|
||||||
staging_if.data.uuid,
|
|
||||||
staging_if.data.wis,
|
|
||||||
staging_if.data.tmask,
|
|
||||||
staging_if.data.PC,
|
|
||||||
staging_if.data.wb,
|
|
||||||
staging_if.data.ex_type,
|
|
||||||
staging_if.data.op_type,
|
|
||||||
staging_if.data.op_mod,
|
|
||||||
staging_if.data.use_PC,
|
|
||||||
staging_if.data.use_imm,
|
|
||||||
staging_if.data.imm,
|
|
||||||
staging_if.data.rd}),
|
|
||||||
.valid_out (staging_if.valid),
|
|
||||||
.ready_out (staging_if.ready)
|
|
||||||
);
|
|
||||||
|
|
||||||
assign staging_if.data.rs1_data = rs1_data;
|
|
||||||
assign staging_if.data.rs2_data = rs2_data;
|
|
||||||
assign staging_if.data.rs3_data = rs3_data;
|
|
||||||
|
|
||||||
// output buffer
|
|
||||||
|
|
||||||
wire valid_stg, ready_stg;
|
|
||||||
assign valid_stg = staging_if.valid && data_ready;
|
|
||||||
assign staging_if.ready = ready_stg && data_ready;
|
|
||||||
|
|
||||||
`RESET_RELAY (out_buf_reset, reset);
|
|
||||||
|
|
||||||
VX_elastic_buffer #(
|
|
||||||
.DATAW (DATAW + (3 * `NUM_THREADS * `XLEN)),
|
|
||||||
.SIZE (2),
|
|
||||||
.OUT_REG (2)
|
|
||||||
) out_buf (
|
|
||||||
.clk (clk),
|
|
||||||
.reset (out_buf_reset),
|
|
||||||
.valid_in (valid_stg),
|
|
||||||
.ready_in (ready_stg),
|
|
||||||
.data_in (staging_if.data),
|
|
||||||
.data_out (operands_if[i].data),
|
|
||||||
.valid_out (operands_if[i].valid),
|
|
||||||
.ready_out (operands_if[i].ready)
|
|
||||||
);
|
|
||||||
end
|
|
||||||
|
|
||||||
endmodule
|
endmodule
|
||||||
|
|||||||
@@ -19,6 +19,10 @@ module VX_schedule import VX_gpu_pkg::*; #(
|
|||||||
input wire clk,
|
input wire clk,
|
||||||
input wire reset,
|
input wire reset,
|
||||||
|
|
||||||
|
`ifdef PERF_ENABLE
|
||||||
|
VX_pipeline_perf_if.schedule perf_schedule_if,
|
||||||
|
`endif
|
||||||
|
|
||||||
// configuration
|
// configuration
|
||||||
input base_dcrs_t base_dcrs,
|
input base_dcrs_t base_dcrs,
|
||||||
|
|
||||||
@@ -304,13 +308,20 @@ module VX_schedule import VX_gpu_pkg::*; #(
|
|||||||
localparam GNW_WIDTH = `LOG2UP(`NUM_CLUSTERS * `NUM_CORES * `NUM_WARPS);
|
localparam GNW_WIDTH = `LOG2UP(`NUM_CLUSTERS * `NUM_CORES * `NUM_WARPS);
|
||||||
reg [`UUID_WIDTH-1:0] instr_uuid;
|
reg [`UUID_WIDTH-1:0] instr_uuid;
|
||||||
wire [GNW_WIDTH-1:0] g_wid = (GNW_WIDTH'(CORE_ID) << `NW_BITS) + GNW_WIDTH'(schedule_wid);
|
wire [GNW_WIDTH-1:0] g_wid = (GNW_WIDTH'(CORE_ID) << `NW_BITS) + GNW_WIDTH'(schedule_wid);
|
||||||
|
`ifdef SV_DPI
|
||||||
always @(posedge clk) begin
|
always @(posedge clk) begin
|
||||||
if (reset) begin
|
if (reset) begin
|
||||||
instr_uuid <= `UUID_WIDTH'(dpi_uuid_gen(1, 0, 0));
|
instr_uuid <= `UUID_WIDTH'(dpi_uuid_gen(1, 0, 0));
|
||||||
end else if (schedule_fire) begin
|
end else if (schedule_fire) begin
|
||||||
instr_uuid <= `UUID_WIDTH'(dpi_uuid_gen(0, 32'(g_wid), 64'(schedule_pc)));
|
instr_uuid <= `UUID_WIDTH'(dpi_uuid_gen(0, 32'(g_wid), 64'(schedule_pc)));
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
`else
|
||||||
|
wire [GNW_WIDTH+16-1:0] w_uuid = {g_wid, 16'(schedule_pc)};
|
||||||
|
always @(*) begin
|
||||||
|
instr_uuid = `UUID_WIDTH'(w_uuid);
|
||||||
|
end
|
||||||
|
`endif
|
||||||
`else
|
`else
|
||||||
wire [`UUID_WIDTH-1:0] instr_uuid = '0;
|
wire [`UUID_WIDTH-1:0] instr_uuid = '0;
|
||||||
`endif
|
`endif
|
||||||
@@ -349,7 +360,7 @@ module VX_schedule import VX_gpu_pkg::*; #(
|
|||||||
.empty (no_pending_instr)
|
.empty (no_pending_instr)
|
||||||
);
|
);
|
||||||
|
|
||||||
`BUFFER_BUSY (busy, (active_warps != 0 || ~no_pending_instr), 1);
|
`BUFFER_EX(busy, (active_warps != 0 || ~no_pending_instr), 1'b1, 1);
|
||||||
|
|
||||||
// export CSRs
|
// export CSRs
|
||||||
assign sched_csr_if.cycles = cycles;
|
assign sched_csr_if.cycles = cycles;
|
||||||
@@ -376,4 +387,25 @@ module VX_schedule import VX_gpu_pkg::*; #(
|
|||||||
end
|
end
|
||||||
`RUNTIME_ASSERT(timeout_ctr < `STALL_TIMEOUT, ("%t: *** core%0d-scheduler-timeout: stalled_warps=%b", $time, CORE_ID, stalled_warps));
|
`RUNTIME_ASSERT(timeout_ctr < `STALL_TIMEOUT, ("%t: *** core%0d-scheduler-timeout: stalled_warps=%b", $time, CORE_ID, stalled_warps));
|
||||||
|
|
||||||
|
`ifdef PERF_ENABLE
|
||||||
|
reg [`PERF_CTR_BITS-1:0] perf_sched_idles;
|
||||||
|
reg [`PERF_CTR_BITS-1:0] perf_sched_stalls;
|
||||||
|
|
||||||
|
wire schedule_idle = ~schedule_valid;
|
||||||
|
wire schedule_stall = schedule_if.valid && ~schedule_if.ready;
|
||||||
|
|
||||||
|
always @(posedge clk) begin
|
||||||
|
if (reset) begin
|
||||||
|
perf_sched_idles <= '0;
|
||||||
|
perf_sched_stalls <= '0;
|
||||||
|
end else begin
|
||||||
|
perf_sched_idles <= perf_sched_idles + `PERF_CTR_BITS'(schedule_idle);
|
||||||
|
perf_sched_stalls <= perf_sched_stalls + `PERF_CTR_BITS'(schedule_stall);
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
assign perf_schedule_if.sched_idles = perf_sched_idles;
|
||||||
|
assign perf_schedule_if.sched_stalls = perf_sched_stalls;
|
||||||
|
`endif
|
||||||
|
|
||||||
endmodule
|
endmodule
|
||||||
|
|||||||
@@ -19,6 +19,12 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
|||||||
input wire clk,
|
input wire clk,
|
||||||
input wire reset,
|
input wire reset,
|
||||||
|
|
||||||
|
`ifdef PERF_ENABLE
|
||||||
|
output reg [`PERF_CTR_BITS-1:0] perf_scb_stalls,
|
||||||
|
output reg [`PERF_CTR_BITS-1:0] perf_units_uses [`NUM_EX_UNITS],
|
||||||
|
output reg [`PERF_CTR_BITS-1:0] perf_sfu_uses [`NUM_SFU_UNITS],
|
||||||
|
`endif
|
||||||
|
|
||||||
VX_writeback_if.slave writeback_if [`ISSUE_WIDTH],
|
VX_writeback_if.slave writeback_if [`ISSUE_WIDTH],
|
||||||
VX_ibuffer_if.slave ibuffer_if [`ISSUE_WIDTH],
|
VX_ibuffer_if.slave ibuffer_if [`ISSUE_WIDTH],
|
||||||
VX_ibuffer_if.master scoreboard_if [`ISSUE_WIDTH]
|
VX_ibuffer_if.master scoreboard_if [`ISSUE_WIDTH]
|
||||||
@@ -26,114 +32,197 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
|||||||
`UNUSED_PARAM (CORE_ID)
|
`UNUSED_PARAM (CORE_ID)
|
||||||
localparam DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `XLEN + `EX_BITS + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + `XLEN + (`NR_BITS * 4) + 1;
|
localparam DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `XLEN + `EX_BITS + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + `XLEN + (`NR_BITS * 4) + 1;
|
||||||
|
|
||||||
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
|
`ifdef PERF_ENABLE
|
||||||
reg [`UP(ISSUE_RATIO)-1:0][`NUM_REGS-1:0] inuse_regs, inuse_regs_n;
|
reg [`ISSUE_WIDTH-1:0][`NUM_EX_UNITS-1:0] perf_issue_units_per_cycle;
|
||||||
reg [3:0] ready_masks, ready_masks_n;
|
wire [`NUM_EX_UNITS-1:0] perf_units_per_cycle, perf_units_per_cycle_r;
|
||||||
VX_ibuffer_if staging_if();
|
|
||||||
|
|
||||||
wire writeback_fire = writeback_if[i].valid && writeback_if[i].data.eop;
|
|
||||||
|
|
||||||
always @(*) begin
|
reg [`ISSUE_WIDTH-1:0][`NUM_SFU_UNITS-1:0] perf_issue_sfu_per_cycle;
|
||||||
inuse_regs_n = inuse_regs;
|
wire [`NUM_SFU_UNITS-1:0] perf_sfu_per_cycle, perf_sfu_per_cycle_r;
|
||||||
ready_masks_n = ready_masks;
|
|
||||||
if (writeback_fire) begin
|
|
||||||
inuse_regs_n[writeback_if[i].data.wis][writeback_if[i].data.rd] = 0;
|
|
||||||
ready_masks_n |= {4{(ISSUE_RATIO == 0) || writeback_if[i].data.wis == staging_if.data.wis}}
|
|
||||||
& {(writeback_if[i].data.rd == staging_if.data.rd),
|
|
||||||
(writeback_if[i].data.rd == staging_if.data.rs1),
|
|
||||||
(writeback_if[i].data.rd == staging_if.data.rs2),
|
|
||||||
(writeback_if[i].data.rd == staging_if.data.rs3)};
|
|
||||||
end
|
|
||||||
if (staging_if.valid && staging_if.ready && staging_if.data.wb) begin
|
|
||||||
inuse_regs_n[staging_if.data.wis][staging_if.data.rd] = 1;
|
|
||||||
ready_masks_n = '0;
|
|
||||||
end
|
|
||||||
if (ibuffer_if[i].valid && ibuffer_if[i].ready) begin
|
|
||||||
ready_masks_n = ~{inuse_regs_n[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd],
|
|
||||||
inuse_regs_n[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs1],
|
|
||||||
inuse_regs_n[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs2],
|
|
||||||
inuse_regs_n[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3]};
|
|
||||||
end
|
|
||||||
end
|
|
||||||
|
|
||||||
|
wire [`ISSUE_WIDTH-1:0] perf_issue_stalls_per_cycle;
|
||||||
|
wire [`CLOG2(`ISSUE_WIDTH+1)-1:0] perf_stalls_per_cycle, perf_stalls_per_cycle_r;
|
||||||
|
|
||||||
|
`POP_COUNT(perf_stalls_per_cycle, perf_issue_stalls_per_cycle);
|
||||||
|
|
||||||
|
VX_reduce #(
|
||||||
|
.DATAW_IN (`NUM_EX_UNITS),
|
||||||
|
.N (`ISSUE_WIDTH),
|
||||||
|
.OP ("|")
|
||||||
|
) perf_units_reduce (
|
||||||
|
.data_in (perf_issue_units_per_cycle),
|
||||||
|
.data_out (perf_units_per_cycle)
|
||||||
|
);
|
||||||
|
|
||||||
|
VX_reduce #(
|
||||||
|
.DATAW_IN (`NUM_SFU_UNITS),
|
||||||
|
.N (`ISSUE_WIDTH),
|
||||||
|
.OP ("|")
|
||||||
|
) perf_sfu_reduce (
|
||||||
|
.data_in (perf_issue_sfu_per_cycle),
|
||||||
|
.data_out (perf_sfu_per_cycle)
|
||||||
|
);
|
||||||
|
|
||||||
|
`BUFFER(perf_stalls_per_cycle_r, perf_stalls_per_cycle);
|
||||||
|
`BUFFER(perf_units_per_cycle_r, perf_units_per_cycle);
|
||||||
|
`BUFFER(perf_sfu_per_cycle_r, perf_sfu_per_cycle);
|
||||||
|
|
||||||
|
always @(posedge clk) begin
|
||||||
|
if (reset) begin
|
||||||
|
perf_scb_stalls <= '0;
|
||||||
|
end else begin
|
||||||
|
perf_scb_stalls <= perf_scb_stalls + `PERF_CTR_BITS'(perf_stalls_per_cycle_r);
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
for (genvar i = 0; i < `NUM_EX_UNITS; ++i) begin
|
||||||
always @(posedge clk) begin
|
always @(posedge clk) begin
|
||||||
if (reset) begin
|
if (reset) begin
|
||||||
inuse_regs <= '0;
|
perf_units_uses[i] <= '0;
|
||||||
ready_masks <= '0;
|
end else begin
|
||||||
end else begin
|
perf_units_uses[i] <= perf_units_uses[i] + `PERF_CTR_BITS'(perf_units_per_cycle_r[i]);
|
||||||
inuse_regs <= inuse_regs_n;
|
|
||||||
ready_masks <= ready_masks_n;
|
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
for (genvar i = 0; i < `NUM_SFU_UNITS; ++i) begin
|
||||||
|
always @(posedge clk) begin
|
||||||
|
if (reset) begin
|
||||||
|
perf_sfu_uses[i] <= '0;
|
||||||
|
end else begin
|
||||||
|
perf_sfu_uses[i] <= perf_sfu_uses[i] + `PERF_CTR_BITS'(perf_sfu_per_cycle_r[i]);
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
`endif
|
||||||
|
|
||||||
// staging buffer
|
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
|
||||||
|
reg [`UP(ISSUE_RATIO)-1:0][`NUM_REGS-1:0] inuse_regs;
|
||||||
|
|
||||||
`RESET_RELAY (stg_buf_reset, reset);
|
wire writeback_fire = writeback_if[i].valid && writeback_if[i].data.eop;
|
||||||
|
|
||||||
|
wire inuse_rd = inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd];
|
||||||
|
wire inuse_rs1 = inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs1];
|
||||||
|
wire inuse_rs2 = inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs2];
|
||||||
|
wire inuse_rs3 = inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3];
|
||||||
|
|
||||||
|
`ifdef PERF_ENABLE
|
||||||
|
reg [`UP(ISSUE_RATIO)-1:0][`NUM_REGS-1:0][`EX_WIDTH-1:0] inuse_units;
|
||||||
|
reg [`UP(ISSUE_RATIO)-1:0][`NUM_REGS-1:0][`SFU_WIDTH-1:0] inuse_sfu;
|
||||||
|
|
||||||
|
reg [`SFU_WIDTH-1:0] sfu_type;
|
||||||
|
always @(*) begin
|
||||||
|
case (ibuffer_if[i].data.op_type)
|
||||||
|
`INST_SFU_CSRRW,
|
||||||
|
`INST_SFU_CSRRS,
|
||||||
|
`INST_SFU_CSRRC: sfu_type = `SFU_CSRS;
|
||||||
|
default: sfu_type = `SFU_WCTL;
|
||||||
|
endcase
|
||||||
|
end
|
||||||
|
|
||||||
|
always @(*) begin
|
||||||
|
perf_issue_units_per_cycle[i] = '0;
|
||||||
|
perf_issue_sfu_per_cycle[i] = '0;
|
||||||
|
if (ibuffer_if[i].valid) begin
|
||||||
|
if (inuse_rd) begin
|
||||||
|
perf_issue_units_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd]] = 1;
|
||||||
|
if (inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd] == `EX_SFU) begin
|
||||||
|
perf_issue_sfu_per_cycle[i][inuse_sfu[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd]] = 1;
|
||||||
|
end
|
||||||
|
end
|
||||||
|
if (inuse_rs1) begin
|
||||||
|
perf_issue_units_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs1]] = 1;
|
||||||
|
if (inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs1] == `EX_SFU) begin
|
||||||
|
perf_issue_sfu_per_cycle[i][inuse_sfu[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs1]] = 1;
|
||||||
|
end
|
||||||
|
end
|
||||||
|
if (inuse_rs2) begin
|
||||||
|
perf_issue_units_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs2]] = 1;
|
||||||
|
if (inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs2] == `EX_SFU) begin
|
||||||
|
perf_issue_sfu_per_cycle[i][inuse_sfu[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs2]] = 1;
|
||||||
|
end
|
||||||
|
end
|
||||||
|
if (inuse_rs3) begin
|
||||||
|
perf_issue_units_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3]] = 1;
|
||||||
|
if (inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3] == `EX_SFU) begin
|
||||||
|
perf_issue_sfu_per_cycle[i][inuse_sfu[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3]] = 1;
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
assign perf_issue_stalls_per_cycle[i] = ibuffer_if[i].valid && ~ibuffer_if[i].ready;
|
||||||
|
`endif
|
||||||
|
|
||||||
|
wire [3:0] operands_busy = {inuse_rd, inuse_rs1, inuse_rs2, inuse_rs3};
|
||||||
|
wire operands_ready = ~(| operands_busy);
|
||||||
|
|
||||||
VX_elastic_buffer #(
|
wire stg_valid_in, stg_ready_in;
|
||||||
|
assign stg_valid_in = ibuffer_if[i].valid && operands_ready;
|
||||||
|
assign ibuffer_if[i].ready = stg_ready_in && operands_ready;
|
||||||
|
|
||||||
|
VX_stream_buffer #(
|
||||||
.DATAW (DATAW)
|
.DATAW (DATAW)
|
||||||
) stg_buf (
|
) staging_buffer (
|
||||||
.clk (clk),
|
.clk (clk),
|
||||||
.reset (stg_buf_reset),
|
.reset (reset),
|
||||||
.valid_in (ibuffer_if[i].valid),
|
.valid_in (stg_valid_in),
|
||||||
.ready_in (ibuffer_if[i].ready),
|
|
||||||
.data_in (ibuffer_if[i].data),
|
.data_in (ibuffer_if[i].data),
|
||||||
.data_out (staging_if.data),
|
.ready_in (stg_ready_in),
|
||||||
.valid_out (staging_if.valid),
|
|
||||||
.ready_out (staging_if.ready)
|
|
||||||
);
|
|
||||||
|
|
||||||
// output buffer
|
|
||||||
|
|
||||||
wire valid_stg, ready_stg;
|
|
||||||
wire regs_ready = (& ready_masks);
|
|
||||||
assign valid_stg = staging_if.valid && regs_ready;
|
|
||||||
assign staging_if.ready = ready_stg && regs_ready;
|
|
||||||
|
|
||||||
`RESET_RELAY (out_buf_reset, reset);
|
|
||||||
|
|
||||||
VX_elastic_buffer #(
|
|
||||||
.DATAW (DATAW),
|
|
||||||
.SIZE (2),
|
|
||||||
.OUT_REG (2)
|
|
||||||
) out_buf (
|
|
||||||
.clk (clk),
|
|
||||||
.reset (out_buf_reset),
|
|
||||||
.valid_in (valid_stg),
|
|
||||||
.ready_in (ready_stg),
|
|
||||||
.data_in (staging_if.data),
|
|
||||||
.data_out (scoreboard_if[i].data),
|
|
||||||
.valid_out (scoreboard_if[i].valid),
|
.valid_out (scoreboard_if[i].valid),
|
||||||
|
.data_out (scoreboard_if[i].data),
|
||||||
.ready_out (scoreboard_if[i].ready)
|
.ready_out (scoreboard_if[i].ready)
|
||||||
);
|
);
|
||||||
|
|
||||||
reg [31:0] timeout_ctr;
|
always @(posedge clk) begin
|
||||||
|
if (reset) begin
|
||||||
|
inuse_regs <= '0;
|
||||||
|
end else begin
|
||||||
|
if (writeback_fire) begin
|
||||||
|
inuse_regs[writeback_if[i].data.wis][writeback_if[i].data.rd] <= 0;
|
||||||
|
end
|
||||||
|
if (ibuffer_if[i].valid && ibuffer_if[i].ready && ibuffer_if[i].data.wb) begin
|
||||||
|
inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd] <= 1;
|
||||||
|
end
|
||||||
|
end
|
||||||
|
`ifdef PERF_ENABLE
|
||||||
|
if (ibuffer_if[i].valid && ibuffer_if[i].ready && ibuffer_if[i].data.wb) begin
|
||||||
|
inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd] <= ibuffer_if[i].data.ex_type;
|
||||||
|
if (ibuffer_if[i].data.ex_type == `EX_SFU) begin
|
||||||
|
inuse_sfu[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd] <= sfu_type;
|
||||||
|
end
|
||||||
|
end
|
||||||
|
`endif
|
||||||
|
end
|
||||||
|
|
||||||
|
`ifdef SIMULATION
|
||||||
|
reg [31:0] timeout_ctr;
|
||||||
|
|
||||||
always @(posedge clk) begin
|
always @(posedge clk) begin
|
||||||
if (reset) begin
|
if (reset) begin
|
||||||
timeout_ctr <= '0;
|
timeout_ctr <= '0;
|
||||||
end else begin
|
end else begin
|
||||||
if (staging_if.valid && ~regs_ready) begin
|
if (ibuffer_if[i].valid && ~ibuffer_if[i].ready) begin
|
||||||
`ifdef DBG_TRACE_CORE_PIPELINE
|
`ifdef DBG_TRACE_CORE_PIPELINE
|
||||||
`TRACE(3, ("%d: *** core%0d-scoreboard-stall: wid=%0d, PC=0x%0h, tmask=%b, cycles=%0d, inuse=%b (#%0d)\n",
|
`TRACE(3, ("%d: *** core%0d-scoreboard-stall: wid=%0d, PC=0x%0h, tmask=%b, cycles=%0d, inuse=%b (#%0d)\n",
|
||||||
$time, CORE_ID, wis_to_wid(staging_if.data.wis, i), staging_if.data.PC, staging_if.data.tmask, timeout_ctr,
|
$time, CORE_ID, wis_to_wid(ibuffer_if[i].data.wis, i), ibuffer_if[i].data.PC, ibuffer_if[i].data.tmask, timeout_ctr,
|
||||||
~ready_masks, staging_if.data.uuid));
|
operands_busy, ibuffer_if[i].data.uuid));
|
||||||
`endif
|
`endif
|
||||||
timeout_ctr <= timeout_ctr + 1;
|
timeout_ctr <= timeout_ctr + 1;
|
||||||
end else if (staging_if.valid && staging_if.ready) begin
|
end else if (ibuffer_if[i].valid && ibuffer_if[i].ready) begin
|
||||||
timeout_ctr <= '0;
|
timeout_ctr <= '0;
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
`RUNTIME_ASSERT((timeout_ctr < `STALL_TIMEOUT),
|
`RUNTIME_ASSERT((timeout_ctr < `STALL_TIMEOUT),
|
||||||
("%t: *** core%0d-scoreboard-timeout: wid=%0d, PC=0x%0h, tmask=%b, cycles=%0d, inuse=%b (#%0d)",
|
("%t: *** core%0d-scoreboard-timeout: wid=%0d, PC=0x%0h, tmask=%b, cycles=%0d, inuse=%b (#%0d)",
|
||||||
$time, CORE_ID, wis_to_wid(staging_if.data.wis, i), staging_if.data.PC, staging_if.data.tmask, timeout_ctr,
|
$time, CORE_ID, wis_to_wid(ibuffer_if[i].data.wis, i), ibuffer_if[i].data.PC, ibuffer_if[i].data.tmask, timeout_ctr,
|
||||||
~ready_masks, staging_if.data.uuid));
|
operands_busy, ibuffer_if[i].data.uuid));
|
||||||
|
|
||||||
`RUNTIME_ASSERT(~writeback_fire || inuse_regs[writeback_if[i].data.wis][writeback_if[i].data.rd] != 0,
|
`RUNTIME_ASSERT(~writeback_fire || inuse_regs[writeback_if[i].data.wis][writeback_if[i].data.rd] != 0,
|
||||||
("%t: *** core%0d: invalid writeback register: wid=%0d, PC=0x%0h, tmask=%b, rd=%0d (#%0d)",
|
("%t: *** core%0d: invalid writeback register: wid=%0d, PC=0x%0h, tmask=%b, rd=%0d (#%0d)",
|
||||||
$time, CORE_ID, wis_to_wid(writeback_if[i].data.wis, i), writeback_if[i].data.PC, writeback_if[i].data.tmask, writeback_if[i].data.rd, writeback_if[i].data.uuid));
|
$time, CORE_ID, wis_to_wid(writeback_if[i].data.wis, i), writeback_if[i].data.PC, writeback_if[i].data.tmask, writeback_if[i].data.rd, writeback_if[i].data.uuid));
|
||||||
end
|
`endif
|
||||||
|
|
||||||
|
end
|
||||||
|
|
||||||
endmodule
|
endmodule
|
||||||
|
|||||||
@@ -48,7 +48,7 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
|
|||||||
localparam RSP_ARB_DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + (NUM_LANES * `XLEN) + `NR_BITS + 1 + `XLEN + PID_WIDTH + 1 + 1;
|
localparam RSP_ARB_DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + (NUM_LANES * `XLEN) + `NR_BITS + 1 + `XLEN + PID_WIDTH + 1 + 1;
|
||||||
localparam RSP_ARB_SIZE = 1 + 1;
|
localparam RSP_ARB_SIZE = 1 + 1;
|
||||||
localparam RSP_ARB_IDX_WCTL = 0;
|
localparam RSP_ARB_IDX_WCTL = 0;
|
||||||
localparam RSP_ARB_IDX_CSR = 1;
|
localparam RSP_ARB_IDX_CSRS = 1;
|
||||||
|
|
||||||
VX_execute_if #(
|
VX_execute_if #(
|
||||||
.NUM_LANES (NUM_LANES)
|
.NUM_LANES (NUM_LANES)
|
||||||
@@ -71,9 +71,6 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
|
|||||||
wire [RSP_ARB_SIZE-1:0] rsp_arb_ready_in;
|
wire [RSP_ARB_SIZE-1:0] rsp_arb_ready_in;
|
||||||
wire [RSP_ARB_SIZE-1:0][RSP_ARB_DATAW-1:0] rsp_arb_data_in;
|
wire [RSP_ARB_SIZE-1:0][RSP_ARB_DATAW-1:0] rsp_arb_data_in;
|
||||||
|
|
||||||
`ifdef PERF_ENABLE
|
|
||||||
VX_sfu_perf_if sfu_perf_if();
|
|
||||||
`endif
|
|
||||||
|
|
||||||
// Warp control block
|
// Warp control block
|
||||||
VX_execute_if #(
|
VX_execute_if #(
|
||||||
@@ -129,7 +126,6 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
|
|||||||
`ifdef PERF_ENABLE
|
`ifdef PERF_ENABLE
|
||||||
.mem_perf_if (mem_perf_if),
|
.mem_perf_if (mem_perf_if),
|
||||||
.pipeline_perf_if(pipeline_perf_if),
|
.pipeline_perf_if(pipeline_perf_if),
|
||||||
.sfu_perf_if (sfu_perf_if),
|
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
`ifdef EXT_F_ENABLE
|
`ifdef EXT_F_ENABLE
|
||||||
@@ -141,21 +137,21 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
|
|||||||
.commit_if (csr_commit_if)
|
.commit_if (csr_commit_if)
|
||||||
);
|
);
|
||||||
|
|
||||||
assign rsp_arb_valid_in[RSP_ARB_IDX_CSR] = csr_commit_if.valid;
|
assign rsp_arb_valid_in[RSP_ARB_IDX_CSRS] = csr_commit_if.valid;
|
||||||
assign rsp_arb_data_in[RSP_ARB_IDX_CSR] = csr_commit_if.data;
|
assign rsp_arb_data_in[RSP_ARB_IDX_CSRS] = csr_commit_if.data;
|
||||||
assign csr_commit_if.ready = rsp_arb_ready_in[RSP_ARB_IDX_CSR];
|
assign csr_commit_if.ready = rsp_arb_ready_in[RSP_ARB_IDX_CSRS];
|
||||||
|
|
||||||
// can accept new request?
|
// can accept new request?
|
||||||
|
|
||||||
reg sfu_req_ready;
|
reg sfu_req_ready;
|
||||||
always @(*) begin
|
always @(*) begin
|
||||||
case (execute_if[0].data.op_type)
|
case (execute_if[0].data.op_type)
|
||||||
`INST_SFU_CSRRW,
|
`INST_SFU_CSRRW,
|
||||||
`INST_SFU_CSRRS,
|
`INST_SFU_CSRRS,
|
||||||
`INST_SFU_CSRRC: sfu_req_ready = csr_execute_if.ready;
|
`INST_SFU_CSRRC: sfu_req_ready = csr_execute_if.ready;
|
||||||
default: sfu_req_ready = wctl_execute_if.ready;
|
default: sfu_req_ready = wctl_execute_if.ready;
|
||||||
endcase
|
endcase
|
||||||
end
|
end
|
||||||
assign execute_if[0].ready = sfu_req_ready;
|
assign execute_if[0].ready = sfu_req_ready;
|
||||||
|
|
||||||
// response arbitration
|
// response arbitration
|
||||||
@@ -170,7 +166,7 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
|
|||||||
.NUM_INPUTS (RSP_ARB_SIZE),
|
.NUM_INPUTS (RSP_ARB_SIZE),
|
||||||
.DATAW (RSP_ARB_DATAW),
|
.DATAW (RSP_ARB_DATAW),
|
||||||
.ARBITER ("R"),
|
.ARBITER ("R"),
|
||||||
.OUT_REG (1)
|
.OUT_REG (3)
|
||||||
) rsp_arb (
|
) rsp_arb (
|
||||||
.clk (clk),
|
.clk (clk),
|
||||||
.reset (commit_reset),
|
.reset (commit_reset),
|
||||||
@@ -186,7 +182,7 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
|
|||||||
VX_gather_unit #(
|
VX_gather_unit #(
|
||||||
.BLOCK_SIZE (BLOCK_SIZE),
|
.BLOCK_SIZE (BLOCK_SIZE),
|
||||||
.NUM_LANES (NUM_LANES),
|
.NUM_LANES (NUM_LANES),
|
||||||
.OUT_REG (3)
|
.OUT_REG (1)
|
||||||
) gather_unit (
|
) gather_unit (
|
||||||
.clk (clk),
|
.clk (clk),
|
||||||
.reset (commit_reset),
|
.reset (commit_reset),
|
||||||
@@ -194,16 +190,4 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
|
|||||||
.commit_out_if (commit_if)
|
.commit_out_if (commit_if)
|
||||||
);
|
);
|
||||||
|
|
||||||
`ifdef PERF_ENABLE
|
|
||||||
reg [`PERF_CTR_BITS-1:0] perf_wctl_stalls;
|
|
||||||
always @(posedge clk) begin
|
|
||||||
if (reset) begin
|
|
||||||
perf_wctl_stalls <= '0;
|
|
||||||
end else begin
|
|
||||||
perf_wctl_stalls <= perf_wctl_stalls + `PERF_CTR_BITS'(wctl_execute_if.valid && ~wctl_execute_if.ready);
|
|
||||||
end
|
|
||||||
end
|
|
||||||
assign sfu_perf_if.wctl_stalls = perf_wctl_stalls;
|
|
||||||
`endif
|
|
||||||
|
|
||||||
endmodule
|
endmodule
|
||||||
|
|||||||
@@ -14,9 +14,7 @@
|
|||||||
`ifndef VX_TRACE_VH
|
`ifndef VX_TRACE_VH
|
||||||
`define VX_TRACE_VH
|
`define VX_TRACE_VH
|
||||||
|
|
||||||
`ifndef SYNTHESIS
|
`ifdef SIMULATION
|
||||||
|
|
||||||
`include "VX_define.vh"
|
|
||||||
|
|
||||||
task trace_ex_type(input int level, input [`EX_BITS-1:0] ex_type);
|
task trace_ex_type(input int level, input [`EX_BITS-1:0] ex_type);
|
||||||
case (ex_type)
|
case (ex_type)
|
||||||
|
|||||||
@@ -29,7 +29,6 @@ module VX_wctl_unit import VX_gpu_pkg::*; #(
|
|||||||
);
|
);
|
||||||
`UNUSED_PARAM (CORE_ID)
|
`UNUSED_PARAM (CORE_ID)
|
||||||
localparam LANE_BITS = `CLOG2(NUM_LANES);
|
localparam LANE_BITS = `CLOG2(NUM_LANES);
|
||||||
localparam LANE_WIDTH = `UP(LANE_BITS);
|
|
||||||
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
|
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
|
||||||
localparam PID_WIDTH = `UP(PID_BITS);
|
localparam PID_WIDTH = `UP(PID_BITS);
|
||||||
localparam WCTL_WIDTH = $bits(tmc_t) + $bits(wspawn_t) + $bits(split_t) + $bits(join_t) + $bits(barrier_t);
|
localparam WCTL_WIDTH = $bits(tmc_t) + $bits(wspawn_t) + $bits(split_t) + $bits(join_t) + $bits(barrier_t);
|
||||||
@@ -50,7 +49,7 @@ module VX_wctl_unit import VX_gpu_pkg::*; #(
|
|||||||
wire is_join = (execute_if.data.op_type == `INST_SFU_JOIN);
|
wire is_join = (execute_if.data.op_type == `INST_SFU_JOIN);
|
||||||
wire is_bar = (execute_if.data.op_type == `INST_SFU_BAR);
|
wire is_bar = (execute_if.data.op_type == `INST_SFU_BAR);
|
||||||
|
|
||||||
wire [LANE_WIDTH-1:0] tid;
|
wire [`UP(LANE_BITS)-1:0] tid;
|
||||||
if (LANE_BITS != 0) begin
|
if (LANE_BITS != 0) begin
|
||||||
assign tid = execute_if.data.tid[0 +: LANE_BITS];
|
assign tid = execute_if.data.tid[0 +: LANE_BITS];
|
||||||
end else begin
|
end else begin
|
||||||
|
|||||||
@@ -52,30 +52,24 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
|
|||||||
|
|
||||||
localparam MAN_BITS = 23;
|
localparam MAN_BITS = 23;
|
||||||
localparam EXP_BITS = 8;
|
localparam EXP_BITS = 8;
|
||||||
localparam EXP_BIAS = 2**(EXP_BITS-1)-1;
|
localparam EXP_BIAS = 2**(EXP_BITS-1)-1;
|
||||||
|
|
||||||
localparam logic [EXP_BITS-1:0] QNAN_EXPONENT = 2**EXP_BITS-1;
|
|
||||||
localparam logic [MAN_BITS-1:0] QNAN_MANTISSA = 2**(MAN_BITS-1);
|
|
||||||
|
|
||||||
// Use 32-bit integer
|
// Use 32-bit integer
|
||||||
localparam MAX_INT_WIDTH = 32;
|
localparam INT_WIDTH = 32;
|
||||||
|
|
||||||
// The internal mantissa includes normal bit or an entire integer
|
// The internal mantissa includes normal bit or an entire integer
|
||||||
localparam INT_MAN_WIDTH = `MAX(MAN_BITS + 1, MAX_INT_WIDTH);
|
localparam INT_MAN_WIDTH = `MAX(MAN_BITS + 1, INT_WIDTH);
|
||||||
|
|
||||||
// The lower 2p+3 bits of the internal FMA result will be needed for leading-zero detection
|
// The lower 2p+3 bits of the internal FMA result will be needed for leading-zero detection
|
||||||
localparam LZC_RESULT_WIDTH = `CLOG2(INT_MAN_WIDTH);
|
localparam LZC_RESULT_WIDTH = `CLOG2(INT_MAN_WIDTH);
|
||||||
|
|
||||||
// The internal exponent must be able to represent the smallest denormal input value as signed
|
// The internal exponent must be able to represent the smallest denormal input value as signed
|
||||||
// or the number of bits in an integer
|
// or the number of bits in an integer
|
||||||
localparam INT_EXP_WIDTH = `MAX(`CLOG2(MAX_INT_WIDTH), `MAX(EXP_BITS, `CLOG2(EXP_BIAS + MAN_BITS))) + 1;
|
localparam INT_EXP_WIDTH = `MAX(`CLOG2(INT_WIDTH), `MAX(EXP_BITS, `CLOG2(EXP_BIAS + MAN_BITS))) + 1;
|
||||||
|
|
||||||
// shift amount for denormalization
|
|
||||||
localparam SHAMT_BITS = `CLOG2(INT_MAN_WIDTH+1);
|
|
||||||
|
|
||||||
localparam FMT_SHIFT_COMPENSATION = INT_MAN_WIDTH - 1 - MAN_BITS;
|
localparam FMT_SHIFT_COMPENSATION = INT_MAN_WIDTH - 1 - MAN_BITS;
|
||||||
localparam NUM_FP_STICKY = 2 * INT_MAN_WIDTH - MAN_BITS - 1; // removed mantissa, 1. and R
|
localparam NUM_FP_STICKY = 2 * INT_MAN_WIDTH - MAN_BITS - 1; // removed mantissa, 1. and R
|
||||||
localparam NUM_INT_STICKY = 2 * INT_MAN_WIDTH - MAX_INT_WIDTH; // removed int and R
|
localparam NUM_INT_STICKY = 2 * INT_MAN_WIDTH - INT_WIDTH; // removed int and R
|
||||||
|
|
||||||
// Input processing
|
// Input processing
|
||||||
|
|
||||||
@@ -86,8 +80,8 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
|
|||||||
.EXP_BITS (EXP_BITS),
|
.EXP_BITS (EXP_BITS),
|
||||||
.MAN_BITS (MAN_BITS)
|
.MAN_BITS (MAN_BITS)
|
||||||
) fp_class (
|
) fp_class (
|
||||||
.exp_i (dataa[i][30:23]),
|
.exp_i (dataa[i][INT_WIDTH-2:MAN_BITS]),
|
||||||
.man_i (dataa[i][22:0]),
|
.man_i (dataa[i][MAN_BITS-1:0]),
|
||||||
.clss_o (fclass[i])
|
.clss_o (fclass[i])
|
||||||
);
|
);
|
||||||
end
|
end
|
||||||
@@ -97,27 +91,25 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
|
|||||||
wire [NUM_LANES-1:0] input_sign;
|
wire [NUM_LANES-1:0] input_sign;
|
||||||
|
|
||||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||||
wire [INT_MAN_WIDTH-1:0] int_mantissa;
|
wire i2f_sign = dataa[i][INT_WIDTH-1];
|
||||||
wire [INT_MAN_WIDTH-1:0] fmt_mantissa;
|
wire f2i_sign = dataa[i][INT_WIDTH-1] && is_signed;
|
||||||
wire fmt_sign = dataa[i][31];
|
wire [INT_MAN_WIDTH-1:0] f2i_mantissa = f2i_sign ? (-dataa[i]) : dataa[i];
|
||||||
wire int_sign = dataa[i][31] && is_signed;
|
wire [INT_MAN_WIDTH-1:0] i2f_mantissa = INT_MAN_WIDTH'({fclass[i].is_normal, dataa[i][MAN_BITS-1:0]});
|
||||||
assign int_mantissa = int_sign ? (-dataa[i]) : dataa[i];
|
|
||||||
assign fmt_mantissa = INT_MAN_WIDTH'({fclass[i].is_normal, dataa[i][MAN_BITS-1:0]});
|
|
||||||
assign input_exp[i] = {1'b0, dataa[i][MAN_BITS +: EXP_BITS]} + INT_EXP_WIDTH'({1'b0, fclass[i].is_subnormal});
|
assign input_exp[i] = {1'b0, dataa[i][MAN_BITS +: EXP_BITS]} + INT_EXP_WIDTH'({1'b0, fclass[i].is_subnormal});
|
||||||
assign input_mant[i] = is_itof ? int_mantissa : fmt_mantissa;
|
assign input_mant[i] = is_itof ? f2i_mantissa : i2f_mantissa;
|
||||||
assign input_sign[i] = is_itof ? int_sign : fmt_sign;
|
assign input_sign[i] = is_itof ? f2i_sign : i2f_sign;
|
||||||
end
|
end
|
||||||
|
|
||||||
// Pipeline stage0
|
// Pipeline stage0
|
||||||
|
|
||||||
wire valid_in_s0;
|
wire valid_in_s0;
|
||||||
wire [NUM_LANES-1:0] lane_mask_s0;
|
wire [NUM_LANES-1:0] lane_mask_s0;
|
||||||
wire [TAGW-1:0] tag_in_s0;
|
wire [TAGW-1:0] tag_in_s0;
|
||||||
wire is_itof_s0;
|
wire is_itof_s0;
|
||||||
wire unsigned_s0;
|
wire is_signed_s0;
|
||||||
wire [2:0] rnd_mode_s0;
|
wire [2:0] rnd_mode_s0;
|
||||||
fclass_t [NUM_LANES-1:0] fclass_s0;
|
fclass_t [NUM_LANES-1:0] fclass_s0;
|
||||||
wire [NUM_LANES-1:0] input_sign_s0;
|
wire [NUM_LANES-1:0] input_sign_s0;
|
||||||
wire [NUM_LANES-1:0][INT_EXP_WIDTH-1:0] fmt_exponent_s0;
|
wire [NUM_LANES-1:0][INT_EXP_WIDTH-1:0] fmt_exponent_s0;
|
||||||
wire [NUM_LANES-1:0][INT_MAN_WIDTH-1:0] encoded_mant_s0;
|
wire [NUM_LANES-1:0][INT_MAN_WIDTH-1:0] encoded_mant_s0;
|
||||||
|
|
||||||
@@ -130,8 +122,8 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
|
|||||||
.clk (clk),
|
.clk (clk),
|
||||||
.reset (reset),
|
.reset (reset),
|
||||||
.enable (~stall),
|
.enable (~stall),
|
||||||
.data_in ({valid_in, lane_mask, tag_in, is_itof, !is_signed, frm, fclass, input_sign, input_exp, input_mant}),
|
.data_in ({valid_in, lane_mask, tag_in, is_itof, is_signed, frm, fclass, input_sign, input_exp, input_mant}),
|
||||||
.data_out ({valid_in_s0, lane_mask_s0, tag_in_s0, is_itof_s0, unsigned_s0, rnd_mode_s0, fclass_s0, input_sign_s0, fmt_exponent_s0, encoded_mant_s0})
|
.data_out ({valid_in_s0, lane_mask_s0, tag_in_s0, is_itof_s0, is_signed_s0, rnd_mode_s0, fclass_s0, input_sign_s0, fmt_exponent_s0, encoded_mant_s0})
|
||||||
);
|
);
|
||||||
|
|
||||||
// Normalization
|
// Normalization
|
||||||
@@ -159,22 +151,22 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
|
|||||||
assign input_mant_n_s0[i] = encoded_mant_s0[i] << renorm_shamt_s0[i];
|
assign input_mant_n_s0[i] = encoded_mant_s0[i] << renorm_shamt_s0[i];
|
||||||
|
|
||||||
// Unbias exponent and compensate for shift
|
// Unbias exponent and compensate for shift
|
||||||
wire [INT_EXP_WIDTH-1:0] fp_input_exp_s0 = fmt_exponent_s0[i] + INT_EXP_WIDTH'(FMT_SHIFT_COMPENSATION - EXP_BIAS) - INT_EXP_WIDTH'({1'b0, renorm_shamt_s0[i]});
|
wire [INT_EXP_WIDTH-1:0] i2f_input_exp_s0 = fmt_exponent_s0[i] + INT_EXP_WIDTH'(FMT_SHIFT_COMPENSATION - EXP_BIAS) - INT_EXP_WIDTH'({1'b0, renorm_shamt_s0[i]});
|
||||||
wire [INT_EXP_WIDTH-1:0] int_input_exp_s0 = INT_EXP_WIDTH'(INT_MAN_WIDTH-1) - INT_EXP_WIDTH'({1'b0, renorm_shamt_s0[i]});
|
wire [INT_EXP_WIDTH-1:0] f2i_input_exp_s0 = INT_EXP_WIDTH'(INT_MAN_WIDTH-1) - INT_EXP_WIDTH'({1'b0, renorm_shamt_s0[i]});
|
||||||
assign input_exp_n_s0[i] = is_itof_s0 ? int_input_exp_s0 : fp_input_exp_s0;
|
assign input_exp_n_s0[i] = is_itof_s0 ? f2i_input_exp_s0 : i2f_input_exp_s0;
|
||||||
end
|
end
|
||||||
|
|
||||||
// Pipeline stage1
|
// Pipeline stage1
|
||||||
|
|
||||||
wire valid_in_s1;
|
wire valid_in_s1;
|
||||||
wire [NUM_LANES-1:0] lane_mask_s1;
|
wire [NUM_LANES-1:0] lane_mask_s1;
|
||||||
wire [TAGW-1:0] tag_in_s1;
|
wire [TAGW-1:0] tag_in_s1;
|
||||||
wire is_itof_s1;
|
wire is_itof_s1;
|
||||||
wire unsigned_s1;
|
wire is_signed_s1;
|
||||||
wire [2:0] rnd_mode_s1;
|
wire [2:0] rnd_mode_s1;
|
||||||
fclass_t [NUM_LANES-1:0] fclass_s1;
|
fclass_t [NUM_LANES-1:0] fclass_s1;
|
||||||
wire [NUM_LANES-1:0] input_sign_s1;
|
wire [NUM_LANES-1:0] input_sign_s1;
|
||||||
wire [NUM_LANES-1:0] mant_is_zero_s1;
|
wire [NUM_LANES-1:0] mant_is_zero_s1;
|
||||||
wire [NUM_LANES-1:0][INT_MAN_WIDTH-1:0] input_mant_s1;
|
wire [NUM_LANES-1:0][INT_MAN_WIDTH-1:0] input_mant_s1;
|
||||||
wire [NUM_LANES-1:0][INT_EXP_WIDTH-1:0] input_exp_s1;
|
wire [NUM_LANES-1:0][INT_EXP_WIDTH-1:0] input_exp_s1;
|
||||||
|
|
||||||
@@ -185,76 +177,49 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
|
|||||||
.clk (clk),
|
.clk (clk),
|
||||||
.reset (reset),
|
.reset (reset),
|
||||||
.enable (~stall),
|
.enable (~stall),
|
||||||
.data_in ({valid_in_s0, lane_mask_s0, tag_in_s0, is_itof_s0, unsigned_s0, rnd_mode_s0, fclass_s0, input_sign_s0, mant_is_zero_s0, input_mant_n_s0, input_exp_n_s0}),
|
.data_in ({valid_in_s0, lane_mask_s0, tag_in_s0, is_itof_s0, is_signed_s0, rnd_mode_s0, fclass_s0, input_sign_s0, mant_is_zero_s0, input_mant_n_s0, input_exp_n_s0}),
|
||||||
.data_out ({valid_in_s1, lane_mask_s1, tag_in_s1, is_itof_s1, unsigned_s1, rnd_mode_s1, fclass_s1, input_sign_s1, mant_is_zero_s1, input_mant_s1, input_exp_s1})
|
.data_out ({valid_in_s1, lane_mask_s1, tag_in_s1, is_itof_s1, is_signed_s1, rnd_mode_s1, fclass_s1, input_sign_s1, mant_is_zero_s1, input_mant_s1, input_exp_s1})
|
||||||
);
|
);
|
||||||
|
|
||||||
// Perform adjustments to mantissa and exponent
|
// Perform adjustments to mantissa and exponent
|
||||||
|
|
||||||
wire [NUM_LANES-1:0][2*INT_MAN_WIDTH:0] destination_mant_s1;
|
wire [NUM_LANES-1:0][2*INT_MAN_WIDTH:0] destination_mant_s1;
|
||||||
wire [NUM_LANES-1:0][INT_EXP_WIDTH-1:0] final_exp_s1;
|
wire [NUM_LANES-1:0][INT_EXP_WIDTH-1:0] final_exp_s1;
|
||||||
wire [NUM_LANES-1:0] of_before_round_s1;
|
wire [NUM_LANES-1:0] of_before_round_s1;
|
||||||
|
|
||||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||||
reg [2*INT_MAN_WIDTH:0] preshift_mant_s1; // mantissa before final shift
|
wire [INT_EXP_WIDTH-1:0] denorm_shamt = INT_EXP_WIDTH'(INT_WIDTH-1) - input_exp_s1[i];
|
||||||
reg [SHAMT_BITS-1:0] denorm_shamt_s1; // shift amount for denormalization
|
wire overflow = ($signed(denorm_shamt) <= -$signed(INT_EXP_WIDTH'(!is_signed_s1)));
|
||||||
reg [INT_EXP_WIDTH-1:0] final_exp_tmp_s1; // after eventual adjustments
|
wire underflow = ($signed(input_exp_s1[i]) < INT_EXP_WIDTH'($signed(-1)));
|
||||||
reg of_before_round_tmp_s1;
|
reg [INT_EXP_WIDTH-1:0] denorm_shamt_q;
|
||||||
|
|
||||||
always @(*) begin
|
always @(*) begin
|
||||||
final_exp_tmp_s1 = input_exp_s1[i] + INT_EXP_WIDTH'(EXP_BIAS); // take exponent as is, only look at lower bits
|
if (overflow) begin
|
||||||
preshift_mant_s1 = {input_mant_s1[i], 33'b0};
|
denorm_shamt_q = '0;
|
||||||
denorm_shamt_s1 = '0;
|
end else if (underflow) begin
|
||||||
of_before_round_tmp_s1 = 1'b0;
|
denorm_shamt_q = INT_WIDTH+1;
|
||||||
|
|
||||||
if (is_itof_s1) begin
|
|
||||||
if ($signed(input_exp_s1[i]) >= INT_EXP_WIDTH'($signed(2**EXP_BITS-1-EXP_BIAS))) begin
|
|
||||||
// Overflow or infinities (for proper rounding)
|
|
||||||
final_exp_tmp_s1 = (2**EXP_BITS-2); // largest normal value
|
|
||||||
preshift_mant_s1 = ~0; // largest normal value and RS bits set
|
|
||||||
of_before_round_tmp_s1 = 1'b1;
|
|
||||||
end else if ($signed(input_exp_s1[i]) < INT_EXP_WIDTH'($signed(-MAN_BITS-EXP_BIAS))) begin
|
|
||||||
// Limit the shift to retain sticky bits
|
|
||||||
final_exp_tmp_s1 = '0; // denormal result
|
|
||||||
denorm_shamt_s1 = (2 + MAN_BITS); // to sticky
|
|
||||||
end else if ($signed(input_exp_s1[i]) < INT_EXP_WIDTH'($signed(1-EXP_BIAS))) begin
|
|
||||||
// Denormalize underflowing values
|
|
||||||
final_exp_tmp_s1 = '0; // denormal result
|
|
||||||
denorm_shamt_s1 = SHAMT_BITS'(1-EXP_BIAS) - SHAMT_BITS'(input_exp_s1[i]); // adjust right shifting
|
|
||||||
end
|
|
||||||
end else begin
|
end else begin
|
||||||
if ($signed(input_exp_s1[i]) >= $signed(INT_EXP_WIDTH'(MAX_INT_WIDTH-1) + INT_EXP_WIDTH'(unsigned_s1))) begin
|
denorm_shamt_q = denorm_shamt;
|
||||||
// overflow: when converting to unsigned the range is larger by one
|
|
||||||
of_before_round_tmp_s1 = 1'b1;
|
|
||||||
end else if ($signed(input_exp_s1[i]) < INT_EXP_WIDTH'($signed(-1))) begin
|
|
||||||
// underflow
|
|
||||||
denorm_shamt_s1 = MAX_INT_WIDTH+1; // all bits go to the sticky
|
|
||||||
end else begin
|
|
||||||
// By default right shift mantissa to be an integer
|
|
||||||
denorm_shamt_s1 = SHAMT_BITS'(MAX_INT_WIDTH-1) - SHAMT_BITS'(input_exp_s1[i]);
|
|
||||||
end
|
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
assign destination_mant_s1[i] = is_itof_s1 ? {input_mant_s1[i], 33'b0} : ({input_mant_s1[i], 33'b0} >> denorm_shamt_q);
|
||||||
assign destination_mant_s1[i] = preshift_mant_s1 >> denorm_shamt_s1;
|
assign final_exp_s1[i] = input_exp_s1[i] + INT_EXP_WIDTH'(EXP_BIAS);
|
||||||
assign final_exp_s1[i] = final_exp_tmp_s1;
|
assign of_before_round_s1[i] = overflow;
|
||||||
assign of_before_round_s1[i] = of_before_round_tmp_s1;
|
|
||||||
end
|
end
|
||||||
|
|
||||||
// Pipeline stage2
|
// Pipeline stage2
|
||||||
|
|
||||||
wire valid_in_s2;
|
wire valid_in_s2;
|
||||||
wire [NUM_LANES-1:0] lane_mask_s2;
|
wire [NUM_LANES-1:0] lane_mask_s2;
|
||||||
wire [TAGW-1:0] tag_in_s2;
|
wire [TAGW-1:0] tag_in_s2;
|
||||||
wire is_itof_s2;
|
wire is_itof_s2;
|
||||||
wire unsigned_s2;
|
wire is_signed_s2;
|
||||||
wire [2:0] rnd_mode_s2;
|
wire [2:0] rnd_mode_s2;
|
||||||
fclass_t [NUM_LANES-1:0] fclass_s2;
|
fclass_t [NUM_LANES-1:0] fclass_s2;
|
||||||
wire [NUM_LANES-1:0] mant_is_zero_s2;
|
wire [NUM_LANES-1:0] mant_is_zero_s2;
|
||||||
wire [NUM_LANES-1:0] input_sign_s2;
|
wire [NUM_LANES-1:0] input_sign_s2;
|
||||||
wire [NUM_LANES-1:0][2*INT_MAN_WIDTH:0] destination_mant_s2;
|
wire [NUM_LANES-1:0][2*INT_MAN_WIDTH:0] destination_mant_s2;
|
||||||
wire [NUM_LANES-1:0][INT_EXP_WIDTH-1:0] final_exp_s2;
|
wire [NUM_LANES-1:0][INT_EXP_WIDTH-1:0] final_exp_s2;
|
||||||
wire [NUM_LANES-1:0] of_before_round_s2;
|
wire [NUM_LANES-1:0] of_before_round_s2;
|
||||||
|
|
||||||
VX_pipe_register #(
|
VX_pipe_register #(
|
||||||
.DATAW (1 + NUM_LANES + TAGW + 1 + 1 + `INST_FRM_BITS + NUM_LANES * ($bits(fclass_t) + 1 + 1 + (2*INT_MAN_WIDTH+1) + INT_EXP_WIDTH + 1)),
|
.DATAW (1 + NUM_LANES + TAGW + 1 + 1 + `INST_FRM_BITS + NUM_LANES * ($bits(fclass_t) + 1 + 1 + (2*INT_MAN_WIDTH+1) + INT_EXP_WIDTH + 1)),
|
||||||
@@ -263,37 +228,37 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
|
|||||||
.clk (clk),
|
.clk (clk),
|
||||||
.reset (reset),
|
.reset (reset),
|
||||||
.enable (~stall),
|
.enable (~stall),
|
||||||
.data_in ({valid_in_s1, lane_mask_s1, tag_in_s1, is_itof_s1, unsigned_s1, rnd_mode_s1, fclass_s1, mant_is_zero_s1, input_sign_s1, destination_mant_s1, final_exp_s1, of_before_round_s1}),
|
.data_in ({valid_in_s1, lane_mask_s1, tag_in_s1, is_itof_s1, is_signed_s1, rnd_mode_s1, fclass_s1, mant_is_zero_s1, input_sign_s1, destination_mant_s1, final_exp_s1, of_before_round_s1}),
|
||||||
.data_out ({valid_in_s2, lane_mask_s2, tag_in_s2, is_itof_s2, unsigned_s2, rnd_mode_s2, fclass_s2, mant_is_zero_s2, input_sign_s2, destination_mant_s2, final_exp_s2, of_before_round_s2})
|
.data_out ({valid_in_s2, lane_mask_s2, tag_in_s2, is_itof_s2, is_signed_s2, rnd_mode_s2, fclass_s2, mant_is_zero_s2, input_sign_s2, destination_mant_s2, final_exp_s2, of_before_round_s2})
|
||||||
);
|
);
|
||||||
|
|
||||||
wire [NUM_LANES-1:0] rounded_sign_s2;
|
wire [NUM_LANES-1:0] rounded_sign_s2;
|
||||||
wire [NUM_LANES-1:0][31:0] rounded_abs_s2; // absolute value of result after rounding
|
wire [NUM_LANES-1:0][INT_WIDTH-1:0] rounded_abs_s2; // absolute value of result after rounding
|
||||||
wire [NUM_LANES-1:0] int_round_has_sticky_s2;
|
wire [NUM_LANES-1:0] f2i_round_has_sticky_s2;
|
||||||
wire [NUM_LANES-1:0] fp_round_has_sticky_s2;
|
wire [NUM_LANES-1:0] i2f_round_has_sticky_s2;
|
||||||
|
|
||||||
// Rouding and classification
|
// Rouding and classification
|
||||||
|
|
||||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||||
wire [MAN_BITS-1:0] final_mant_s2; // mantissa after adjustments
|
wire [MAN_BITS-1:0] final_mant_s2; // mantissa after adjustments
|
||||||
wire [MAX_INT_WIDTH-1:0] final_int_s2; // integer shifted in position
|
wire [INT_WIDTH-1:0] final_int_s2; // integer shifted in position
|
||||||
wire [1:0] round_sticky_bits_s2;
|
wire [1:0] round_sticky_bits_s2;
|
||||||
wire [31:0] fmt_pre_round_abs_s2;
|
wire [INT_WIDTH-1:0] fmt_pre_round_abs_s2;
|
||||||
wire [31:0] pre_round_abs_s2;
|
wire [INT_WIDTH-1:0] pre_round_abs_s2;
|
||||||
wire [1:0] int_round_sticky_bits_s2, fp_round_sticky_bits_s2;
|
wire [1:0] f2i_round_sticky_bits_s2, i2f_round_sticky_bits_s2;
|
||||||
|
|
||||||
// Extract final mantissa and round bit, discard the normal bit (for FP)
|
// Extract final mantissa and round bit, discard the normal bit (for FP)
|
||||||
assign {final_mant_s2, fp_round_sticky_bits_s2[1]} = destination_mant_s2[i][2*INT_MAN_WIDTH-1 : 2*INT_MAN_WIDTH-1 - (MAN_BITS+1) + 1];
|
assign {final_mant_s2, i2f_round_sticky_bits_s2[1]} = destination_mant_s2[i][2*INT_MAN_WIDTH-1 : 2*INT_MAN_WIDTH-1 - (MAN_BITS+1) + 1];
|
||||||
assign {final_int_s2, int_round_sticky_bits_s2[1]} = destination_mant_s2[i][2*INT_MAN_WIDTH : 2*INT_MAN_WIDTH - (MAX_INT_WIDTH+1) + 1];
|
assign {final_int_s2, f2i_round_sticky_bits_s2[1]} = destination_mant_s2[i][2*INT_MAN_WIDTH : 2*INT_MAN_WIDTH - (INT_WIDTH+1) + 1];
|
||||||
|
|
||||||
// Collapse sticky bits
|
// Collapse sticky bits
|
||||||
assign fp_round_sticky_bits_s2[0] = (| destination_mant_s2[i][NUM_FP_STICKY-1:0]);
|
assign i2f_round_sticky_bits_s2[0] = (| destination_mant_s2[i][NUM_FP_STICKY-1:0]);
|
||||||
assign int_round_sticky_bits_s2[0] = (| destination_mant_s2[i][NUM_INT_STICKY-1:0]);
|
assign f2i_round_sticky_bits_s2[0] = (| destination_mant_s2[i][NUM_INT_STICKY-1:0]);
|
||||||
assign fp_round_has_sticky_s2[i] = (| fp_round_sticky_bits_s2);
|
assign i2f_round_has_sticky_s2[i] = (| i2f_round_sticky_bits_s2);
|
||||||
assign int_round_has_sticky_s2[i] = (| int_round_sticky_bits_s2);
|
assign f2i_round_has_sticky_s2[i] = (| f2i_round_sticky_bits_s2);
|
||||||
|
|
||||||
// select RS bits for destination operation
|
// select RS bits for destination operation
|
||||||
assign round_sticky_bits_s2 = is_itof_s2 ? fp_round_sticky_bits_s2 : int_round_sticky_bits_s2;
|
assign round_sticky_bits_s2 = is_itof_s2 ? i2f_round_sticky_bits_s2 : f2i_round_sticky_bits_s2;
|
||||||
|
|
||||||
// Pack exponent and mantissa into proper rounding form
|
// Pack exponent and mantissa into proper rounding form
|
||||||
assign fmt_pre_round_abs_s2 = {1'b0, final_exp_s2[i][EXP_BITS-1:0], final_mant_s2[MAN_BITS-1:0]};
|
assign fmt_pre_round_abs_s2 = {1'b0, final_exp_s2[i][EXP_BITS-1:0], final_mant_s2[MAN_BITS-1:0]};
|
||||||
@@ -322,15 +287,15 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
|
|||||||
wire [NUM_LANES-1:0] lane_mask_s3;
|
wire [NUM_LANES-1:0] lane_mask_s3;
|
||||||
wire [TAGW-1:0] tag_in_s3;
|
wire [TAGW-1:0] tag_in_s3;
|
||||||
wire is_itof_s3;
|
wire is_itof_s3;
|
||||||
wire unsigned_s3;
|
wire is_signed_s3;
|
||||||
fclass_t [NUM_LANES-1:0] fclass_s3;
|
fclass_t [NUM_LANES-1:0] fclass_s3;
|
||||||
wire [NUM_LANES-1:0] mant_is_zero_s3;
|
wire [NUM_LANES-1:0] mant_is_zero_s3;
|
||||||
wire [NUM_LANES-1:0] input_sign_s3;
|
wire [NUM_LANES-1:0] input_sign_s3;
|
||||||
wire [NUM_LANES-1:0] rounded_sign_s3;
|
wire [NUM_LANES-1:0] rounded_sign_s3;
|
||||||
wire [NUM_LANES-1:0][31:0] rounded_abs_s3;
|
wire [NUM_LANES-1:0][INT_WIDTH-1:0] rounded_abs_s3;
|
||||||
wire [NUM_LANES-1:0] of_before_round_s3;
|
wire [NUM_LANES-1:0] of_before_round_s3;
|
||||||
wire [NUM_LANES-1:0] int_round_has_sticky_s3;
|
wire [NUM_LANES-1:0] f2i_round_has_sticky_s3;
|
||||||
wire [NUM_LANES-1:0] fp_round_has_sticky_s3;
|
wire [NUM_LANES-1:0] i2f_round_has_sticky_s3;
|
||||||
|
|
||||||
VX_pipe_register #(
|
VX_pipe_register #(
|
||||||
.DATAW (1 + NUM_LANES + TAGW + 1 + 1 + NUM_LANES * ($bits(fclass_t) + 1 + 1 + 32 + 1 + 1 + 1 + 1)),
|
.DATAW (1 + NUM_LANES + TAGW + 1 + 1 + NUM_LANES * ($bits(fclass_t) + 1 + 1 + 32 + 1 + 1 + 1 + 1)),
|
||||||
@@ -339,105 +304,71 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
|
|||||||
.clk (clk),
|
.clk (clk),
|
||||||
.reset (reset),
|
.reset (reset),
|
||||||
.enable (~stall),
|
.enable (~stall),
|
||||||
.data_in ({valid_in_s2, lane_mask_s2, tag_in_s2, is_itof_s2, unsigned_s2, fclass_s2, mant_is_zero_s2, input_sign_s2, rounded_abs_s2, rounded_sign_s2, of_before_round_s2, int_round_has_sticky_s2, fp_round_has_sticky_s2}),
|
.data_in ({valid_in_s2, lane_mask_s2, tag_in_s2, is_itof_s2, is_signed_s2, fclass_s2, mant_is_zero_s2, input_sign_s2, rounded_abs_s2, rounded_sign_s2, of_before_round_s2, f2i_round_has_sticky_s2, i2f_round_has_sticky_s2}),
|
||||||
.data_out ({valid_in_s3, lane_mask_s3, tag_in_s3, is_itof_s3, unsigned_s3, fclass_s3, mant_is_zero_s3, input_sign_s3, rounded_abs_s3, rounded_sign_s3, of_before_round_s3, int_round_has_sticky_s3, fp_round_has_sticky_s3})
|
.data_out ({valid_in_s3, lane_mask_s3, tag_in_s3, is_itof_s3, is_signed_s3, fclass_s3, mant_is_zero_s3, input_sign_s3, rounded_abs_s3, rounded_sign_s3, of_before_round_s3, f2i_round_has_sticky_s3, i2f_round_has_sticky_s3})
|
||||||
);
|
);
|
||||||
|
|
||||||
wire [NUM_LANES-1:0] of_after_round_s3;
|
wire [NUM_LANES-1:0][INT_WIDTH-1:0] fmt_result_s3;
|
||||||
wire [NUM_LANES-1:0] uf_after_round_s3;
|
wire [NUM_LANES-1:0][INT_WIDTH-1:0] rounded_int_res_s3; // after possible inversion
|
||||||
wire [NUM_LANES-1:0][31:0] fmt_result_s3;
|
|
||||||
wire [NUM_LANES-1:0][31:0] rounded_int_res_s3; // after possible inversion
|
|
||||||
wire [NUM_LANES-1:0] rounded_int_res_zero_s3; // after rounding
|
wire [NUM_LANES-1:0] rounded_int_res_zero_s3; // after rounding
|
||||||
|
|
||||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||||
// Assemble regular result, nan box short ones. Int zeroes need to be detected
|
// Assemble regular result, nan box short ones. Int zeroes need to be detected
|
||||||
assign fmt_result_s3[i] = (is_itof_s3 & mant_is_zero_s3[i]) ? 0 : {rounded_sign_s3[i], rounded_abs_s3[i][EXP_BITS+MAN_BITS-1:0]};
|
assign fmt_result_s3[i] = mant_is_zero_s3[i] ? 0 : {rounded_sign_s3[i], rounded_abs_s3[i][EXP_BITS+MAN_BITS-1:0]};
|
||||||
|
|
||||||
// Classification after rounding select by destination format
|
|
||||||
assign uf_after_round_s3[i] = (rounded_abs_s3[i][EXP_BITS+MAN_BITS-1:MAN_BITS] == 0); // denormal
|
|
||||||
assign of_after_round_s3[i] = (rounded_abs_s3[i][EXP_BITS+MAN_BITS-1:MAN_BITS] == ~0); // inf exp.
|
|
||||||
|
|
||||||
// Negative integer result needs to be brought into two's complement
|
// Negative integer result needs to be brought into two's complement
|
||||||
assign rounded_int_res_s3[i] = rounded_sign_s3[i] ? (-rounded_abs_s3[i]) : rounded_abs_s3[i];
|
assign rounded_int_res_s3[i] = rounded_sign_s3[i] ? (-rounded_abs_s3[i]) : rounded_abs_s3[i];
|
||||||
assign rounded_int_res_zero_s3[i] = (rounded_int_res_s3[i] == 0);
|
assign rounded_int_res_zero_s3[i] = (rounded_int_res_s3[i] == 0);
|
||||||
end
|
end
|
||||||
|
|
||||||
// FP Special case handling
|
// F2I Special case handling
|
||||||
|
|
||||||
wire [NUM_LANES-1:0][31:0] fp_special_result_s3;
|
reg [NUM_LANES-1:0][INT_WIDTH-1:0] f2i_special_result_s3;
|
||||||
fflags_t [NUM_LANES-1:0] fp_special_status_s3;
|
fflags_t [NUM_LANES-1:0] f2i_special_status_s3;
|
||||||
wire [NUM_LANES-1:0] fp_result_is_special_s3;
|
wire [NUM_LANES-1:0] f2i_result_is_special_s3;
|
||||||
|
|
||||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
|
||||||
// Detect special case from source format, I2F casts don't produce a special result
|
|
||||||
assign fp_result_is_special_s3[i] = ~is_itof_s3 & (fclass_s3[i].is_zero | fclass_s3[i].is_nan);
|
|
||||||
|
|
||||||
// Signalling input NaNs raise invalid flag, otherwise no flags set
|
|
||||||
assign fp_special_status_s3[i] = fclass_s3[i].is_signaling ? {1'b1, 4'h0} : 5'h0; // invalid operation
|
|
||||||
|
|
||||||
// Assemble result according to destination format
|
|
||||||
assign fp_special_result_s3[i] = fclass_s3[i].is_zero ? (32'(input_sign_s3) << 31) // signed zero
|
|
||||||
: {1'b0, QNAN_EXPONENT, QNAN_MANTISSA}; // qNaN
|
|
||||||
end
|
|
||||||
|
|
||||||
// INT Special case handling
|
|
||||||
|
|
||||||
reg [NUM_LANES-1:0][31:0] int_special_result_s3;
|
|
||||||
fflags_t [NUM_LANES-1:0] int_special_status_s3;
|
|
||||||
wire [NUM_LANES-1:0] int_result_is_special_s3;
|
|
||||||
|
|
||||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||||
// Assemble result according to destination format
|
// Assemble result according to destination format
|
||||||
always @(*) begin
|
always @(*) begin
|
||||||
if (input_sign_s3[i] && !fclass_s3[i].is_nan) begin
|
if (input_sign_s3[i] && !fclass_s3[i].is_nan) begin
|
||||||
int_special_result_s3[i][30:0] = '0; // alone yields 2**(31)-1
|
f2i_special_result_s3[i][INT_WIDTH-2:0] = '0; // alone yields 2**(31)-1
|
||||||
int_special_result_s3[i][31] = ~unsigned_s3; // for unsigned casts yields 2**31
|
f2i_special_result_s3[i][INT_WIDTH-1] = is_signed_s3; // for unsigned casts yields 2**31
|
||||||
end else begin
|
end else begin
|
||||||
int_special_result_s3[i][30:0] = 2**(31) - 1; // alone yields 2**(31)-1
|
f2i_special_result_s3[i][INT_WIDTH-2:0] = 2**(INT_WIDTH-1) - 1; // alone yields 2**(31)-1
|
||||||
int_special_result_s3[i][31] = unsigned_s3; // for unsigned casts yields 2**31
|
f2i_special_result_s3[i][INT_WIDTH-1] = ~is_signed_s3; // for unsigned casts yields 2**31
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
// Detect special case from source format (inf, nan, overflow, nan-boxing or negative unsigned)
|
// Detect special case from source format (inf, nan, overflow, nan-boxing or negative unsigned)
|
||||||
assign int_result_is_special_s3[i] = fclass_s3[i].is_nan
|
assign f2i_result_is_special_s3[i] = fclass_s3[i].is_nan
|
||||||
| fclass_s3[i].is_inf
|
| fclass_s3[i].is_inf
|
||||||
| of_before_round_s3[i]
|
| of_before_round_s3[i]
|
||||||
| (input_sign_s3[i] & unsigned_s3 & ~rounded_int_res_zero_s3[i]);
|
| (input_sign_s3[i] & ~is_signed_s3 & ~rounded_int_res_zero_s3[i]);
|
||||||
|
|
||||||
// All integer special cases are invalid
|
// All integer special cases are invalid
|
||||||
assign int_special_status_s3[i] = {1'b1, 4'h0};
|
assign f2i_special_status_s3[i] = {1'b1, 4'h0};
|
||||||
end
|
end
|
||||||
|
|
||||||
// Result selection and Output handshake
|
// Result selection and Output handshake
|
||||||
|
|
||||||
fflags_t [NUM_LANES-1:0] tmp_fflags_s3;
|
fflags_t [NUM_LANES-1:0] tmp_fflags_s3;
|
||||||
wire [NUM_LANES-1:0][31:0] tmp_result_s3;
|
wire [NUM_LANES-1:0][INT_WIDTH-1:0] tmp_result_s3;
|
||||||
|
|
||||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||||
fflags_t fp_regular_status_s3, int_regular_status_s3;
|
fflags_t i2f_regular_status_s3, f2i_regular_status_s3;
|
||||||
fflags_t fp_status_s3, int_status_s3;
|
fflags_t i2f_status_s3, f2i_status_s3;
|
||||||
wire [31:0] fp_result_s3, int_result_s3;
|
|
||||||
|
|
||||||
wire inexact_s3 = is_itof_s3 ? fp_round_has_sticky_s3[i] // overflow is invalid in i2f;
|
assign i2f_regular_status_s3 = {4'h0, i2f_round_has_sticky_s3[i]};
|
||||||
: (fp_round_has_sticky_s3[i] || (~fclass_s3[i].is_inf && (of_before_round_s3[i] || of_after_round_s3[i])));
|
assign f2i_regular_status_s3 = {4'h0, f2i_round_has_sticky_s3[i]};
|
||||||
|
|
||||||
assign fp_regular_status_s3.NV = is_itof_s3 & (of_before_round_s3[i] | of_after_round_s3[i]); // overflow is invalid for I2F casts
|
|
||||||
assign fp_regular_status_s3.DZ = 1'b0; // no divisions
|
|
||||||
assign fp_regular_status_s3.OF = ~is_itof_s3 & (~fclass_s3[i].is_inf & (of_before_round_s3[i] | of_after_round_s3[i])); // inf casts no OF
|
|
||||||
assign fp_regular_status_s3.UF = uf_after_round_s3[i] & inexact_s3;
|
|
||||||
assign fp_regular_status_s3.NX = inexact_s3;
|
|
||||||
|
|
||||||
assign int_regular_status_s3 = int_round_has_sticky_s3[i] ? {4'h0, 1'b1} : 5'h0;
|
assign i2f_status_s3 = i2f_regular_status_s3;
|
||||||
|
assign f2i_status_s3 = f2i_result_is_special_s3[i] ? f2i_special_status_s3[i] : f2i_regular_status_s3;
|
||||||
|
|
||||||
assign fp_result_s3 = fp_result_is_special_s3[i] ? fp_special_result_s3[i] : fmt_result_s3[i];
|
wire [INT_WIDTH-1:0] i2f_result_s3 = fmt_result_s3[i];
|
||||||
assign int_result_s3 = int_result_is_special_s3[i] ? int_special_result_s3[i] : rounded_int_res_s3[i];
|
wire [INT_WIDTH-1:0] f2i_result_s3 = f2i_result_is_special_s3[i] ? f2i_special_result_s3[i] : rounded_int_res_s3[i];
|
||||||
|
|
||||||
assign fp_status_s3 = fp_result_is_special_s3[i] ? fp_special_status_s3[i] : fp_regular_status_s3;
|
assign tmp_result_s3[i] = is_itof_s3 ? i2f_result_s3 : f2i_result_s3;
|
||||||
assign int_status_s3 = int_result_is_special_s3[i] ? int_special_status_s3[i] : int_regular_status_s3;
|
assign tmp_fflags_s3[i] = is_itof_s3 ? i2f_status_s3 : f2i_status_s3;
|
||||||
|
|
||||||
// Select output depending on special case detection
|
|
||||||
assign tmp_result_s3[i] = is_itof_s3 ? fp_result_s3 : int_result_s3;
|
|
||||||
assign tmp_fflags_s3[i] = is_itof_s3 ? fp_status_s3 : int_status_s3;
|
|
||||||
end
|
end
|
||||||
|
|
||||||
assign stall = ~ready_out && valid_out;
|
assign stall = ~ready_out && valid_out;
|
||||||
@@ -457,7 +388,6 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
|
|||||||
);
|
);
|
||||||
|
|
||||||
assign ready_in = ~stall;
|
assign ready_in = ~stall;
|
||||||
|
|
||||||
assign has_fflags = 1'b1;
|
assign has_fflags = 1'b1;
|
||||||
|
|
||||||
endmodule
|
endmodule
|
||||||
|
|||||||
@@ -16,7 +16,7 @@
|
|||||||
|
|
||||||
`include "VX_define.vh"
|
`include "VX_define.vh"
|
||||||
|
|
||||||
`ifndef SYNTHESIS
|
`ifdef SV_DPI
|
||||||
`include "float_dpi.vh"
|
`include "float_dpi.vh"
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
|
|||||||
@@ -54,7 +54,6 @@ module VX_fpu_rounding #(
|
|||||||
2'b01: round_up = 1'b0; // < ulp/2 away, round down
|
2'b01: round_up = 1'b0; // < ulp/2 away, round down
|
||||||
2'b10: round_up = abs_value_i[0]; // = ulp/2 away, round towards even result
|
2'b10: round_up = abs_value_i[0]; // = ulp/2 away, round towards even result
|
||||||
2'b11: round_up = 1'b1; // > ulp/2 away, round up
|
2'b11: round_up = 1'b1; // > ulp/2 away, round up
|
||||||
default: round_up = 1'bx;
|
|
||||||
endcase
|
endcase
|
||||||
`INST_FRM_RTZ: round_up = 1'b0; // always round down
|
`INST_FRM_RTZ: round_up = 1'b0; // always round down
|
||||||
`INST_FRM_RDN: round_up = (| round_sticky_bits_i) & sign_i; // to 0 if +, away if -
|
`INST_FRM_RDN: round_up = (| round_sticky_bits_i) & sign_i; // to 0 if +, away if -
|
||||||
|
|||||||
@@ -36,21 +36,26 @@ interface VX_decode_if ();
|
|||||||
logic valid;
|
logic valid;
|
||||||
data_t data;
|
data_t data;
|
||||||
logic ready;
|
logic ready;
|
||||||
|
`ifndef L1_ENABLE
|
||||||
wire [`ISSUE_WIDTH-1:0] ibuf_pop;
|
logic [`ISSUE_WIDTH-1:0] ibuf_pop;
|
||||||
|
`endif
|
||||||
|
|
||||||
modport master (
|
modport master (
|
||||||
output valid,
|
output valid,
|
||||||
output data,
|
output data,
|
||||||
input ibuf_pop,
|
|
||||||
input ready
|
input ready
|
||||||
|
`ifndef L1_ENABLE
|
||||||
|
, input ibuf_pop
|
||||||
|
`endif
|
||||||
);
|
);
|
||||||
|
|
||||||
modport slave (
|
modport slave (
|
||||||
input valid,
|
input valid,
|
||||||
input data,
|
input data,
|
||||||
output ibuf_pop,
|
|
||||||
output ready
|
output ready
|
||||||
|
`ifndef L1_ENABLE
|
||||||
|
, output ibuf_pop
|
||||||
|
`endif
|
||||||
);
|
);
|
||||||
|
|
||||||
endinterface
|
endinterface
|
||||||
|
|||||||
@@ -26,21 +26,26 @@ interface VX_fetch_if ();
|
|||||||
logic valid;
|
logic valid;
|
||||||
data_t data;
|
data_t data;
|
||||||
logic ready;
|
logic ready;
|
||||||
|
`ifndef L1_ENABLE
|
||||||
logic [`ISSUE_WIDTH-1:0] ibuf_pop;
|
logic [`ISSUE_WIDTH-1:0] ibuf_pop;
|
||||||
|
`endif
|
||||||
|
|
||||||
modport master (
|
modport master (
|
||||||
output valid,
|
output valid,
|
||||||
output data,
|
output data,
|
||||||
input ibuf_pop,
|
|
||||||
input ready
|
input ready
|
||||||
|
`ifndef L1_ENABLE
|
||||||
|
, input ibuf_pop
|
||||||
|
`endif
|
||||||
);
|
);
|
||||||
|
|
||||||
modport slave (
|
modport slave (
|
||||||
input valid,
|
input valid,
|
||||||
input data,
|
input data,
|
||||||
output ibuf_pop,
|
|
||||||
output ready
|
output ready
|
||||||
|
`ifndef L1_ENABLE
|
||||||
|
, output ibuf_pop
|
||||||
|
`endif
|
||||||
);
|
);
|
||||||
|
|
||||||
endinterface
|
endinterface
|
||||||
|
|||||||
@@ -14,26 +14,38 @@
|
|||||||
`include "VX_define.vh"
|
`include "VX_define.vh"
|
||||||
|
|
||||||
interface VX_pipeline_perf_if ();
|
interface VX_pipeline_perf_if ();
|
||||||
wire [`PERF_CTR_BITS-1:0] ibf_stalls;
|
wire [`PERF_CTR_BITS-1:0] sched_idles;
|
||||||
wire [`PERF_CTR_BITS-1:0] scb_stalls;
|
wire [`PERF_CTR_BITS-1:0] sched_stalls;
|
||||||
wire [`PERF_CTR_BITS-1:0] dsp_stalls [`NUM_EX_UNITS];
|
wire [`PERF_CTR_BITS-1:0] ibf_stalls;
|
||||||
|
wire [`PERF_CTR_BITS-1:0] scb_stalls;
|
||||||
|
wire [`PERF_CTR_BITS-1:0] units_uses [`NUM_EX_UNITS];
|
||||||
|
wire [`PERF_CTR_BITS-1:0] sfu_uses [`NUM_SFU_UNITS];
|
||||||
|
|
||||||
wire [`PERF_CTR_BITS-1:0] ifetches;
|
wire [`PERF_CTR_BITS-1:0] ifetches;
|
||||||
wire [`PERF_CTR_BITS-1:0] loads;
|
wire [`PERF_CTR_BITS-1:0] loads;
|
||||||
wire [`PERF_CTR_BITS-1:0] stores;
|
wire [`PERF_CTR_BITS-1:0] stores;
|
||||||
wire [`PERF_CTR_BITS-1:0] ifetch_latency;
|
wire [`PERF_CTR_BITS-1:0] ifetch_latency;
|
||||||
wire [`PERF_CTR_BITS-1:0] load_latency;
|
wire [`PERF_CTR_BITS-1:0] load_latency;
|
||||||
|
|
||||||
|
modport schedule (
|
||||||
|
output sched_idles,
|
||||||
|
output sched_stalls
|
||||||
|
);
|
||||||
|
|
||||||
modport issue (
|
modport issue (
|
||||||
output ibf_stalls,
|
output ibf_stalls,
|
||||||
output scb_stalls,
|
output scb_stalls,
|
||||||
output dsp_stalls
|
output units_uses,
|
||||||
);
|
output sfu_uses
|
||||||
|
);
|
||||||
|
|
||||||
modport slave (
|
modport slave (
|
||||||
|
input sched_idles,
|
||||||
|
input sched_stalls,
|
||||||
input ibf_stalls,
|
input ibf_stalls,
|
||||||
input scb_stalls,
|
input scb_stalls,
|
||||||
input dsp_stalls,
|
input units_uses,
|
||||||
|
input sfu_uses,
|
||||||
input ifetches,
|
input ifetches,
|
||||||
input loads,
|
input loads,
|
||||||
input stores,
|
input stores,
|
||||||
|
|||||||
@@ -21,8 +21,8 @@ module VX_avs_adapter #(
|
|||||||
parameter NUM_BANKS = 1,
|
parameter NUM_BANKS = 1,
|
||||||
parameter TAG_WIDTH = 1,
|
parameter TAG_WIDTH = 1,
|
||||||
parameter RD_QUEUE_SIZE = 1,
|
parameter RD_QUEUE_SIZE = 1,
|
||||||
parameter OUT_REG_REQ = 0,
|
parameter OUT_REG_REQ = 0,
|
||||||
parameter OUT_REG_RSP = 0
|
parameter OUT_REG_RSP = 0
|
||||||
) (
|
) (
|
||||||
input wire clk,
|
input wire clk,
|
||||||
input wire reset,
|
input wire reset,
|
||||||
|
|||||||
@@ -20,7 +20,7 @@ module VX_axi_adapter #(
|
|||||||
parameter TAG_WIDTH = 8,
|
parameter TAG_WIDTH = 8,
|
||||||
parameter NUM_BANKS = 1,
|
parameter NUM_BANKS = 1,
|
||||||
parameter AVS_ADDR_WIDTH = (ADDR_WIDTH - `CLOG2(DATA_WIDTH/8)),
|
parameter AVS_ADDR_WIDTH = (ADDR_WIDTH - `CLOG2(DATA_WIDTH/8)),
|
||||||
parameter OUT_REG_RSP = 0
|
parameter OUT_REG_RSP = 0
|
||||||
) (
|
) (
|
||||||
input wire clk,
|
input wire clk,
|
||||||
input wire reset,
|
input wire reset,
|
||||||
|
|||||||
@@ -11,6 +11,14 @@
|
|||||||
// See the License for the specific language governing permissions and
|
// See the License for the specific language governing permissions and
|
||||||
// limitations under the License.
|
// limitations under the License.
|
||||||
|
|
||||||
|
// A bypass elastic buffer operates at full bandwidth where pop can happen if the buffer is empty but is going full
|
||||||
|
// It has the following benefits:
|
||||||
|
// + Full-bandwidth throughput
|
||||||
|
// + use only one register for storage
|
||||||
|
// It has the following limitations:
|
||||||
|
// + data_out is not registered
|
||||||
|
// + ready_in and ready_out are coupled
|
||||||
|
|
||||||
`include "VX_platform.vh"
|
`include "VX_platform.vh"
|
||||||
|
|
||||||
`TRACING_OFF
|
`TRACING_OFF
|
||||||
@@ -35,30 +43,27 @@ module VX_bypass_buffer #(
|
|||||||
assign data_out = data_in;
|
assign data_out = data_in;
|
||||||
end else begin
|
end else begin
|
||||||
reg [DATAW-1:0] buffer;
|
reg [DATAW-1:0] buffer;
|
||||||
reg buffer_valid;
|
reg has_data;
|
||||||
|
|
||||||
always @(posedge clk) begin
|
always @(posedge clk) begin
|
||||||
if (reset) begin
|
if (reset) begin
|
||||||
buffer_valid <= 0;
|
has_data <= 0;
|
||||||
end else begin
|
end else begin
|
||||||
if (ready_out) begin
|
if (ready_out) begin
|
||||||
buffer_valid <= 0;
|
has_data <= 0;
|
||||||
end
|
end else if (~has_data) begin
|
||||||
if (valid_in && ~ready_out) begin
|
has_data <= valid_in;
|
||||||
`ASSERT(!buffer_valid, ("runtime error"));
|
|
||||||
buffer_valid <= 1;
|
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
if (~has_data) begin
|
||||||
if (valid_in && ~ready_out) begin
|
|
||||||
buffer <= data_in;
|
buffer <= data_in;
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
assign ready_in = ready_out || !buffer_valid;
|
assign ready_in = ready_out || ~has_data;
|
||||||
assign data_out = buffer_valid ? buffer : data_in;
|
assign data_out = has_data ? buffer : data_in;
|
||||||
assign valid_out = valid_in || buffer_valid;
|
assign valid_out = valid_in || has_data;
|
||||||
end
|
end
|
||||||
|
|
||||||
endmodule
|
endmodule
|
||||||
`TRACING_ON
|
`TRACING_ON
|
||||||
|
|||||||
@@ -21,15 +21,12 @@ module VX_cyclic_arbiter #(
|
|||||||
) (
|
) (
|
||||||
input wire clk,
|
input wire clk,
|
||||||
input wire reset,
|
input wire reset,
|
||||||
input wire [NUM_REQS-1:0] requests,
|
input wire [NUM_REQS-1:0] requests,
|
||||||
input wire unlock,
|
|
||||||
output wire [LOG_NUM_REQS-1:0] grant_index,
|
output wire [LOG_NUM_REQS-1:0] grant_index,
|
||||||
output wire [NUM_REQS-1:0] grant_onehot,
|
output wire [NUM_REQS-1:0] grant_onehot,
|
||||||
output wire grant_valid
|
output wire grant_valid,
|
||||||
|
input wire grant_unlock
|
||||||
);
|
);
|
||||||
`UNUSED_PARAM (LOCK_ENABLE)
|
|
||||||
`UNUSED_VAR (unlock)
|
|
||||||
|
|
||||||
if (NUM_REQS == 1) begin
|
if (NUM_REQS == 1) begin
|
||||||
|
|
||||||
`UNUSED_VAR (clk)
|
`UNUSED_VAR (clk)
|
||||||
@@ -51,7 +48,7 @@ module VX_cyclic_arbiter #(
|
|||||||
end else begin
|
end else begin
|
||||||
if (!IS_POW2 && grant_index_r == LOG_NUM_REQS'(NUM_REQS-1)) begin
|
if (!IS_POW2 && grant_index_r == LOG_NUM_REQS'(NUM_REQS-1)) begin
|
||||||
grant_index_r <= '0;
|
grant_index_r <= '0;
|
||||||
end else begin
|
end else if (!LOCK_ENABLE || ~grant_valid || grant_unlock) begin
|
||||||
grant_index_r <= grant_index_r + LOG_NUM_REQS'(1);
|
grant_index_r <= grant_index_r + LOG_NUM_REQS'(1);
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|||||||
@@ -42,34 +42,33 @@ module VX_elastic_buffer #(
|
|||||||
|
|
||||||
end else if (SIZE == 1) begin
|
end else if (SIZE == 1) begin
|
||||||
|
|
||||||
wire stall = valid_out && ~ready_out;
|
VX_pipe_buffer #(
|
||||||
|
.DATAW (DATAW)
|
||||||
VX_pipe_register #(
|
) pipe_buffer (
|
||||||
.DATAW (1 + DATAW),
|
.clk (clk),
|
||||||
.RESETW (1)
|
.reset (reset),
|
||||||
) pipe_register (
|
.valid_in (valid_in),
|
||||||
.clk (clk),
|
.data_in (data_in),
|
||||||
.reset (reset),
|
.ready_in (ready_in),
|
||||||
.enable (~stall),
|
.valid_out (valid_out),
|
||||||
.data_in ({valid_in, data_in}),
|
.data_out (data_out),
|
||||||
.data_out ({valid_out, data_out})
|
.ready_out (ready_out)
|
||||||
);
|
);
|
||||||
|
|
||||||
assign ready_in = ~stall;
|
|
||||||
|
|
||||||
end else if (SIZE == 2) begin
|
end else if (SIZE == 2) begin
|
||||||
|
|
||||||
VX_skid_buffer #(
|
VX_skid_buffer #(
|
||||||
.DATAW (DATAW),
|
.DATAW (DATAW),
|
||||||
|
.FULL_BW (OUT_REG != 2),
|
||||||
.OUT_REG (OUT_REG)
|
.OUT_REG (OUT_REG)
|
||||||
) skid_buffer (
|
) skid_buffer (
|
||||||
.clk (clk),
|
.clk (clk),
|
||||||
.reset (reset),
|
.reset (reset),
|
||||||
.valid_in (valid_in),
|
.valid_in (valid_in),
|
||||||
|
.data_in (data_in),
|
||||||
.ready_in (ready_in),
|
.ready_in (ready_in),
|
||||||
.data_in (data_in),
|
|
||||||
.data_out (data_out),
|
|
||||||
.valid_out (valid_out),
|
.valid_out (valid_out),
|
||||||
|
.data_out (data_out),
|
||||||
.ready_out (ready_out)
|
.ready_out (ready_out)
|
||||||
);
|
);
|
||||||
|
|
||||||
@@ -111,10 +110,10 @@ module VX_elastic_buffer #(
|
|||||||
.clk (clk),
|
.clk (clk),
|
||||||
.reset (reset),
|
.reset (reset),
|
||||||
.valid_in (~empty),
|
.valid_in (~empty),
|
||||||
.ready_in (ready_out_t),
|
|
||||||
.data_in (data_out_t),
|
.data_in (data_out_t),
|
||||||
.data_out (data_out),
|
.ready_in (ready_out_t),
|
||||||
.valid_out (valid_out),
|
.valid_out (valid_out),
|
||||||
|
.data_out (data_out),
|
||||||
.ready_out (ready_out)
|
.ready_out (ready_out)
|
||||||
);
|
);
|
||||||
|
|
||||||
|
|||||||
@@ -21,17 +21,17 @@ module VX_fair_arbiter #(
|
|||||||
) (
|
) (
|
||||||
input wire clk,
|
input wire clk,
|
||||||
input wire reset,
|
input wire reset,
|
||||||
input wire unlock,
|
|
||||||
input wire [NUM_REQS-1:0] requests,
|
input wire [NUM_REQS-1:0] requests,
|
||||||
output wire [LOG_NUM_REQS-1:0] grant_index,
|
output wire [LOG_NUM_REQS-1:0] grant_index,
|
||||||
output wire [NUM_REQS-1:0] grant_onehot,
|
output wire [NUM_REQS-1:0] grant_onehot,
|
||||||
output wire grant_valid
|
output wire grant_valid,
|
||||||
|
input wire grant_unlock
|
||||||
);
|
);
|
||||||
if (NUM_REQS == 1) begin
|
if (NUM_REQS == 1) begin
|
||||||
|
|
||||||
`UNUSED_VAR (clk)
|
`UNUSED_VAR (clk)
|
||||||
`UNUSED_VAR (reset)
|
`UNUSED_VAR (reset)
|
||||||
`UNUSED_VAR (unlock)
|
`UNUSED_VAR (grant_unlock)
|
||||||
|
|
||||||
assign grant_index = '0;
|
assign grant_index = '0;
|
||||||
assign grant_onehot = requests;
|
assign grant_onehot = requests;
|
||||||
@@ -48,18 +48,14 @@ module VX_fair_arbiter #(
|
|||||||
always @(posedge clk) begin
|
always @(posedge clk) begin
|
||||||
if (reset) begin
|
if (reset) begin
|
||||||
buffer <= '0;
|
buffer <= '0;
|
||||||
end else if (!LOCK_ENABLE || unlock) begin
|
end else if (!LOCK_ENABLE || grant_unlock) begin
|
||||||
buffer <= buffer_n;
|
buffer <= buffer_n;
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
VX_priority_arbiter #(
|
VX_priority_arbiter #(
|
||||||
.NUM_REQS (NUM_REQS),
|
.NUM_REQS (NUM_REQS)
|
||||||
.LOCK_ENABLE (LOCK_ENABLE)
|
|
||||||
) priority_arbiter (
|
) priority_arbiter (
|
||||||
.clk (clk),
|
|
||||||
.reset (reset),
|
|
||||||
.unlock (unlock),
|
|
||||||
.requests (requests_qual),
|
.requests (requests_qual),
|
||||||
.grant_index (grant_index),
|
.grant_index (grant_index),
|
||||||
.grant_onehot (grant_onehot),
|
.grant_onehot (grant_onehot),
|
||||||
|
|||||||
@@ -201,9 +201,7 @@ module VX_fifo_queue #(
|
|||||||
rd_ptr_r <= '0;
|
rd_ptr_r <= '0;
|
||||||
rd_ptr_n_r <= 1;
|
rd_ptr_n_r <= 1;
|
||||||
end else begin
|
end else begin
|
||||||
if (push) begin
|
wr_ptr_r <= wr_ptr_r + ADDRW'(push);
|
||||||
wr_ptr_r <= wr_ptr_r + ADDRW'(1);
|
|
||||||
end
|
|
||||||
if (pop) begin
|
if (pop) begin
|
||||||
rd_ptr_r <= rd_ptr_n_r;
|
rd_ptr_r <= rd_ptr_n_r;
|
||||||
if (DEPTH > 2) begin
|
if (DEPTH > 2) begin
|
||||||
|
|||||||
@@ -21,22 +21,23 @@ module VX_generic_arbiter #(
|
|||||||
parameter LOG_NUM_REQS = `LOG2UP(NUM_REQS)
|
parameter LOG_NUM_REQS = `LOG2UP(NUM_REQS)
|
||||||
) (
|
) (
|
||||||
input wire clk,
|
input wire clk,
|
||||||
input wire reset,
|
input wire reset,
|
||||||
input wire unlock,
|
|
||||||
input wire [NUM_REQS-1:0] requests,
|
input wire [NUM_REQS-1:0] requests,
|
||||||
output wire [LOG_NUM_REQS-1:0] grant_index,
|
output wire [LOG_NUM_REQS-1:0] grant_index,
|
||||||
output wire [NUM_REQS-1:0] grant_onehot,
|
output wire [NUM_REQS-1:0] grant_onehot,
|
||||||
output wire grant_valid
|
output wire grant_valid,
|
||||||
|
input wire grant_unlock
|
||||||
);
|
);
|
||||||
if (TYPE == "P") begin
|
if (TYPE == "P") begin
|
||||||
|
|
||||||
|
`UNUSED_PARAM (LOCK_ENABLE)
|
||||||
|
`UNUSED_VAR (clk)
|
||||||
|
`UNUSED_VAR (reset)
|
||||||
|
`UNUSED_VAR (grant_unlock)
|
||||||
|
|
||||||
VX_priority_arbiter #(
|
VX_priority_arbiter #(
|
||||||
.NUM_REQS (NUM_REQS),
|
.NUM_REQS (NUM_REQS)
|
||||||
.LOCK_ENABLE (LOCK_ENABLE)
|
|
||||||
) priority_arbiter (
|
) priority_arbiter (
|
||||||
.clk (clk),
|
|
||||||
.reset (reset),
|
|
||||||
.unlock (unlock),
|
|
||||||
.requests (requests),
|
.requests (requests),
|
||||||
.grant_valid (grant_valid),
|
.grant_valid (grant_valid),
|
||||||
.grant_index (grant_index),
|
.grant_index (grant_index),
|
||||||
@@ -50,12 +51,12 @@ module VX_generic_arbiter #(
|
|||||||
.LOCK_ENABLE (LOCK_ENABLE)
|
.LOCK_ENABLE (LOCK_ENABLE)
|
||||||
) rr_arbiter (
|
) rr_arbiter (
|
||||||
.clk (clk),
|
.clk (clk),
|
||||||
.reset (reset),
|
.reset (reset),
|
||||||
.unlock (unlock),
|
|
||||||
.requests (requests),
|
.requests (requests),
|
||||||
.grant_valid (grant_valid),
|
.grant_valid (grant_valid),
|
||||||
.grant_index (grant_index),
|
.grant_index (grant_index),
|
||||||
.grant_onehot (grant_onehot)
|
.grant_onehot (grant_onehot),
|
||||||
|
.grant_unlock (grant_unlock)
|
||||||
);
|
);
|
||||||
|
|
||||||
end else if (TYPE == "F") begin
|
end else if (TYPE == "F") begin
|
||||||
@@ -66,11 +67,11 @@ module VX_generic_arbiter #(
|
|||||||
) fair_arbiter (
|
) fair_arbiter (
|
||||||
.clk (clk),
|
.clk (clk),
|
||||||
.reset (reset),
|
.reset (reset),
|
||||||
.unlock (unlock),
|
|
||||||
.requests (requests),
|
.requests (requests),
|
||||||
.grant_valid (grant_valid),
|
.grant_valid (grant_valid),
|
||||||
.grant_index (grant_index),
|
.grant_index (grant_index),
|
||||||
.grant_onehot (grant_onehot)
|
.grant_onehot (grant_onehot),
|
||||||
|
.grant_unlock (grant_unlock)
|
||||||
);
|
);
|
||||||
|
|
||||||
end else if (TYPE == "M") begin
|
end else if (TYPE == "M") begin
|
||||||
@@ -81,11 +82,11 @@ module VX_generic_arbiter #(
|
|||||||
) matrix_arbiter (
|
) matrix_arbiter (
|
||||||
.clk (clk),
|
.clk (clk),
|
||||||
.reset (reset),
|
.reset (reset),
|
||||||
.unlock (unlock),
|
|
||||||
.requests (requests),
|
.requests (requests),
|
||||||
.grant_valid (grant_valid),
|
.grant_valid (grant_valid),
|
||||||
.grant_index (grant_index),
|
.grant_index (grant_index),
|
||||||
.grant_onehot (grant_onehot)
|
.grant_onehot (grant_onehot),
|
||||||
|
.grant_unlock (grant_unlock)
|
||||||
);
|
);
|
||||||
|
|
||||||
end else if (TYPE == "C") begin
|
end else if (TYPE == "C") begin
|
||||||
@@ -96,11 +97,11 @@ module VX_generic_arbiter #(
|
|||||||
) cyclic_arbiter (
|
) cyclic_arbiter (
|
||||||
.clk (clk),
|
.clk (clk),
|
||||||
.reset (reset),
|
.reset (reset),
|
||||||
.unlock (unlock),
|
|
||||||
.requests (requests),
|
.requests (requests),
|
||||||
.grant_valid (grant_valid),
|
.grant_valid (grant_valid),
|
||||||
.grant_index (grant_index),
|
.grant_index (grant_index),
|
||||||
.grant_onehot (grant_onehot)
|
.grant_onehot (grant_onehot),
|
||||||
|
.grant_unlock (grant_unlock)
|
||||||
);
|
);
|
||||||
|
|
||||||
end else begin
|
end else begin
|
||||||
|
|||||||
@@ -20,18 +20,18 @@ module VX_matrix_arbiter #(
|
|||||||
parameter LOG_NUM_REQS = `LOG2UP(NUM_REQS)
|
parameter LOG_NUM_REQS = `LOG2UP(NUM_REQS)
|
||||||
) (
|
) (
|
||||||
input wire clk,
|
input wire clk,
|
||||||
input wire reset,
|
input wire reset,
|
||||||
input wire unlock,
|
|
||||||
input wire [NUM_REQS-1:0] requests,
|
input wire [NUM_REQS-1:0] requests,
|
||||||
output wire [LOG_NUM_REQS-1:0] grant_index,
|
output wire [LOG_NUM_REQS-1:0] grant_index,
|
||||||
output wire [NUM_REQS-1:0] grant_onehot,
|
output wire [NUM_REQS-1:0] grant_onehot,
|
||||||
output wire grant_valid
|
output wire grant_valid,
|
||||||
|
input wire grant_unlock
|
||||||
);
|
);
|
||||||
if (NUM_REQS == 1) begin
|
if (NUM_REQS == 1) begin
|
||||||
|
|
||||||
`UNUSED_VAR (clk)
|
`UNUSED_VAR (clk)
|
||||||
`UNUSED_VAR (reset)
|
`UNUSED_VAR (reset)
|
||||||
`UNUSED_VAR (unlock)
|
`UNUSED_VAR (grant_unlock)
|
||||||
|
|
||||||
assign grant_index = '0;
|
assign grant_index = '0;
|
||||||
assign grant_onehot = requests;
|
assign grant_onehot = requests;
|
||||||
@@ -71,18 +71,18 @@ module VX_matrix_arbiter #(
|
|||||||
end
|
end
|
||||||
|
|
||||||
if (LOCK_ENABLE == 0) begin
|
if (LOCK_ENABLE == 0) begin
|
||||||
`UNUSED_VAR (unlock)
|
`UNUSED_VAR (grant_unlock)
|
||||||
assign grant_onehot = grant_unqual;
|
assign grant_onehot = grant_unqual;
|
||||||
end else begin
|
end else begin
|
||||||
reg [NUM_REQS-1:0] grant_unqual_prev;
|
reg [NUM_REQS-1:0] grant_unqual_prev;
|
||||||
always @(posedge clk) begin
|
always @(posedge clk) begin
|
||||||
if (reset) begin
|
if (reset) begin
|
||||||
grant_unqual_prev <= '0;
|
grant_unqual_prev <= '0;
|
||||||
end else if (unlock) begin
|
end else if (grant_unlock) begin
|
||||||
grant_unqual_prev <= grant_unqual;
|
grant_unqual_prev <= grant_unqual;
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
assign grant_onehot = unlock ? grant_unqual : grant_unqual_prev;
|
assign grant_onehot = grant_unlock ? grant_unqual : grant_unqual_prev;
|
||||||
end
|
end
|
||||||
|
|
||||||
VX_onehot_encoder #(
|
VX_onehot_encoder #(
|
||||||
|
|||||||
@@ -21,8 +21,8 @@ module VX_mem_adapter #(
|
|||||||
parameter DST_ADDR_WIDTH = 1,
|
parameter DST_ADDR_WIDTH = 1,
|
||||||
parameter SRC_TAG_WIDTH = 1,
|
parameter SRC_TAG_WIDTH = 1,
|
||||||
parameter DST_TAG_WIDTH = 1,
|
parameter DST_TAG_WIDTH = 1,
|
||||||
parameter OUT_REG_REQ = 0,
|
parameter OUT_REG_REQ = 0,
|
||||||
parameter OUT_REG_RSP = 0
|
parameter OUT_REG_RSP = 0
|
||||||
) (
|
) (
|
||||||
input wire clk,
|
input wire clk,
|
||||||
input wire reset,
|
input wire reset,
|
||||||
|
|||||||
@@ -21,7 +21,7 @@ module VX_mem_rsp_sel #(
|
|||||||
parameter TAG_SEL_BITS = 0,
|
parameter TAG_SEL_BITS = 0,
|
||||||
parameter OUT_REG = 0
|
parameter OUT_REG = 0
|
||||||
) (
|
) (
|
||||||
input wire clk,
|
input wire clk,
|
||||||
input wire reset,
|
input wire reset,
|
||||||
|
|
||||||
// input response
|
// input response
|
||||||
@@ -46,18 +46,20 @@ input wire clk,
|
|||||||
|
|
||||||
wire [LOG_NUM_REQS-1:0] grant_index;
|
wire [LOG_NUM_REQS-1:0] grant_index;
|
||||||
wire grant_valid;
|
wire grant_valid;
|
||||||
wire rsp_fire;
|
wire grant_ready;
|
||||||
|
|
||||||
VX_priority_arbiter #(
|
VX_generic_arbiter #(
|
||||||
.NUM_REQS (NUM_REQS)
|
.NUM_REQS (NUM_REQS),
|
||||||
|
.LOCK_ENABLE (1),
|
||||||
|
.TYPE ("P")
|
||||||
) arbiter (
|
) arbiter (
|
||||||
.clk (clk),
|
.clk (clk),
|
||||||
.reset (reset),
|
.reset (reset),
|
||||||
.unlock (rsp_fire),
|
|
||||||
.requests (rsp_valid_in),
|
.requests (rsp_valid_in),
|
||||||
.grant_valid (grant_valid),
|
.grant_valid (grant_valid),
|
||||||
.grant_index (grant_index),
|
.grant_index (grant_index),
|
||||||
`UNUSED_PIN (grant_onehot)
|
`UNUSED_PIN (grant_onehot),
|
||||||
|
.grant_unlock(grant_ready)
|
||||||
);
|
);
|
||||||
|
|
||||||
reg [NUM_REQS-1:0] rsp_valid_sel;
|
reg [NUM_REQS-1:0] rsp_valid_sel;
|
||||||
@@ -78,7 +80,7 @@ input wire clk,
|
|||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
assign rsp_fire = grant_valid && rsp_ready_unqual;
|
assign grant_ready = rsp_ready_unqual;
|
||||||
|
|
||||||
VX_elastic_buffer #(
|
VX_elastic_buffer #(
|
||||||
.DATAW (NUM_REQS + TAG_WIDTH + (NUM_REQS * DATA_WIDTH)),
|
.DATAW (NUM_REQS + TAG_WIDTH + (NUM_REQS * DATA_WIDTH)),
|
||||||
|
|||||||
63
hw/rtl/libs/VX_pipe_buffer.sv
Normal file
63
hw/rtl/libs/VX_pipe_buffer.sv
Normal file
@@ -0,0 +1,63 @@
|
|||||||
|
// Copyright 2024 blaise
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
// See the License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
||||||
|
|
||||||
|
// A pipelined elastic buffer operates at full bandwidth where push can happen if the buffer is not empty but is going empty
|
||||||
|
// It has the following benefits:
|
||||||
|
// + Full-bandwidth throughput
|
||||||
|
// + use only one register for storage
|
||||||
|
// + data_out is fully registered
|
||||||
|
// It has the following limitations:
|
||||||
|
// + ready_in and ready_out are coupled
|
||||||
|
|
||||||
|
`include "VX_platform.vh"
|
||||||
|
|
||||||
|
`TRACING_OFF
|
||||||
|
module VX_pipe_buffer #(
|
||||||
|
parameter DATAW = 1,
|
||||||
|
parameter PASSTHRU = 0
|
||||||
|
) (
|
||||||
|
input wire clk,
|
||||||
|
input wire reset,
|
||||||
|
input wire valid_in,
|
||||||
|
output wire ready_in,
|
||||||
|
input wire [DATAW-1:0] data_in,
|
||||||
|
output wire [DATAW-1:0] data_out,
|
||||||
|
input wire ready_out,
|
||||||
|
output wire valid_out
|
||||||
|
);
|
||||||
|
if (PASSTHRU != 0) begin
|
||||||
|
`UNUSED_VAR (clk)
|
||||||
|
`UNUSED_VAR (reset)
|
||||||
|
assign ready_in = ready_out;
|
||||||
|
assign valid_out = valid_in;
|
||||||
|
assign data_out = data_in;
|
||||||
|
end else begin
|
||||||
|
wire stall = valid_out && ~ready_out;
|
||||||
|
|
||||||
|
VX_pipe_register #(
|
||||||
|
.DATAW (1 + DATAW),
|
||||||
|
.RESETW (1)
|
||||||
|
) pipe_register (
|
||||||
|
.clk (clk),
|
||||||
|
.reset (reset),
|
||||||
|
.enable (~stall),
|
||||||
|
.data_in ({valid_in, data_in}),
|
||||||
|
.data_out ({valid_out, data_out})
|
||||||
|
);
|
||||||
|
|
||||||
|
assign ready_in = ~stall;
|
||||||
|
end
|
||||||
|
|
||||||
|
endmodule
|
||||||
|
`TRACING_ON
|
||||||
@@ -16,22 +16,13 @@
|
|||||||
`TRACING_OFF
|
`TRACING_OFF
|
||||||
module VX_priority_arbiter #(
|
module VX_priority_arbiter #(
|
||||||
parameter NUM_REQS = 1,
|
parameter NUM_REQS = 1,
|
||||||
parameter LOCK_ENABLE = 0,
|
|
||||||
parameter LOG_NUM_REQS = `LOG2UP(NUM_REQS)
|
parameter LOG_NUM_REQS = `LOG2UP(NUM_REQS)
|
||||||
) (
|
) (
|
||||||
input wire clk,
|
input wire [NUM_REQS-1:0] requests,
|
||||||
input wire reset,
|
|
||||||
input wire [NUM_REQS-1:0] requests,
|
|
||||||
input wire unlock,
|
|
||||||
output wire [LOG_NUM_REQS-1:0] grant_index,
|
output wire [LOG_NUM_REQS-1:0] grant_index,
|
||||||
output wire [NUM_REQS-1:0] grant_onehot,
|
output wire [NUM_REQS-1:0] grant_onehot,
|
||||||
output wire grant_valid
|
output wire grant_valid
|
||||||
);
|
);
|
||||||
`UNUSED_PARAM (LOCK_ENABLE)
|
|
||||||
`UNUSED_VAR (clk)
|
|
||||||
`UNUSED_VAR (reset)
|
|
||||||
`UNUSED_VAR (unlock)
|
|
||||||
|
|
||||||
if (NUM_REQS == 1) begin
|
if (NUM_REQS == 1) begin
|
||||||
|
|
||||||
assign grant_index = '0;
|
assign grant_index = '0;
|
||||||
|
|||||||
@@ -21,18 +21,18 @@ module VX_rr_arbiter #(
|
|||||||
parameter LOG_NUM_REQS = `LOG2UP(NUM_REQS)
|
parameter LOG_NUM_REQS = `LOG2UP(NUM_REQS)
|
||||||
) (
|
) (
|
||||||
input wire clk,
|
input wire clk,
|
||||||
input wire reset,
|
input wire reset,
|
||||||
input wire unlock,
|
|
||||||
input wire [NUM_REQS-1:0] requests,
|
input wire [NUM_REQS-1:0] requests,
|
||||||
output wire [LOG_NUM_REQS-1:0] grant_index,
|
output wire [LOG_NUM_REQS-1:0] grant_index,
|
||||||
output wire [NUM_REQS-1:0] grant_onehot,
|
output wire [NUM_REQS-1:0] grant_onehot,
|
||||||
output wire grant_valid
|
output wire grant_valid,
|
||||||
|
input wire grant_unlock
|
||||||
);
|
);
|
||||||
if (NUM_REQS == 1) begin
|
if (NUM_REQS == 1) begin
|
||||||
|
|
||||||
`UNUSED_VAR (clk)
|
`UNUSED_VAR (clk)
|
||||||
`UNUSED_VAR (reset)
|
`UNUSED_VAR (reset)
|
||||||
`UNUSED_VAR (unlock)
|
`UNUSED_VAR (grant_unlock)
|
||||||
|
|
||||||
assign grant_index = '0;
|
assign grant_index = '0;
|
||||||
assign grant_onehot = requests;
|
assign grant_onehot = requests;
|
||||||
@@ -55,7 +55,7 @@ module VX_rr_arbiter #(
|
|||||||
always @(posedge clk) begin
|
always @(posedge clk) begin
|
||||||
if (reset) begin
|
if (reset) begin
|
||||||
state <= '0;
|
state <= '0;
|
||||||
end else if (!LOCK_ENABLE || unlock) begin
|
end else if (!LOCK_ENABLE || grant_unlock) begin
|
||||||
state <= grant_index_r;
|
state <= grant_index_r;
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
@@ -85,7 +85,7 @@ module VX_rr_arbiter #(
|
|||||||
always @(posedge clk) begin
|
always @(posedge clk) begin
|
||||||
if (reset) begin
|
if (reset) begin
|
||||||
state <= '0;
|
state <= '0;
|
||||||
end else if (!LOCK_ENABLE || unlock) begin
|
end else if (!LOCK_ENABLE || grant_unlock) begin
|
||||||
state <= grant_index_r;
|
state <= grant_index_r;
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
@@ -121,7 +121,7 @@ module VX_rr_arbiter #(
|
|||||||
always @(posedge clk) begin
|
always @(posedge clk) begin
|
||||||
if (reset) begin
|
if (reset) begin
|
||||||
state <= '0;
|
state <= '0;
|
||||||
end else if (!LOCK_ENABLE || unlock) begin
|
end else if (!LOCK_ENABLE || grant_unlock) begin
|
||||||
state <= grant_index_r;
|
state <= grant_index_r;
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
@@ -165,7 +165,7 @@ module VX_rr_arbiter #(
|
|||||||
always @(posedge clk) begin
|
always @(posedge clk) begin
|
||||||
if (reset) begin
|
if (reset) begin
|
||||||
state <= '0;
|
state <= '0;
|
||||||
end else if (!LOCK_ENABLE || unlock) begin
|
end else if (!LOCK_ENABLE || grant_unlock) begin
|
||||||
state <= grant_index_r;
|
state <= grant_index_r;
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
@@ -219,7 +219,7 @@ module VX_rr_arbiter #(
|
|||||||
always @(posedge clk) begin
|
always @(posedge clk) begin
|
||||||
if (reset) begin
|
if (reset) begin
|
||||||
state <= '0;
|
state <= '0;
|
||||||
end else if (!LOCK_ENABLE || unlock) begin
|
end else if (!LOCK_ENABLE || grant_unlock) begin
|
||||||
state <= grant_index_r;
|
state <= grant_index_r;
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
@@ -285,7 +285,7 @@ module VX_rr_arbiter #(
|
|||||||
always @(posedge clk) begin
|
always @(posedge clk) begin
|
||||||
if (reset) begin
|
if (reset) begin
|
||||||
state <= '0;
|
state <= '0;
|
||||||
end else if (!LOCK_ENABLE || unlock) begin
|
end else if (!LOCK_ENABLE || grant_unlock) begin
|
||||||
state <= grant_index_r;
|
state <= grant_index_r;
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
@@ -365,7 +365,7 @@ module VX_rr_arbiter #(
|
|||||||
always @(posedge clk) begin
|
always @(posedge clk) begin
|
||||||
if (reset) begin
|
if (reset) begin
|
||||||
state <= '0;
|
state <= '0;
|
||||||
end else if (!LOCK_ENABLE || unlock) begin
|
end else if (!LOCK_ENABLE || grant_unlock) begin
|
||||||
state <= grant_index_r;
|
state <= grant_index_r;
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
@@ -399,7 +399,7 @@ module VX_rr_arbiter #(
|
|||||||
always @(posedge clk) begin
|
always @(posedge clk) begin
|
||||||
if (reset) begin
|
if (reset) begin
|
||||||
pointer_reg <= {NUM_REQS{1'b1}};
|
pointer_reg <= {NUM_REQS{1'b1}};
|
||||||
end else if (!LOCK_ENABLE || unlock) begin
|
end else if (!LOCK_ENABLE || grant_unlock) begin
|
||||||
if (|req_masked) begin
|
if (|req_masked) begin
|
||||||
pointer_reg <= mask_higher_pri_regs;
|
pointer_reg <= mask_higher_pri_regs;
|
||||||
end else if (|requests) begin
|
end else if (|requests) begin
|
||||||
@@ -443,7 +443,7 @@ module VX_rr_arbiter #(
|
|||||||
always @(posedge clk) begin
|
always @(posedge clk) begin
|
||||||
if (reset) begin
|
if (reset) begin
|
||||||
state <= '0;
|
state <= '0;
|
||||||
end else if (!LOCK_ENABLE || unlock) begin
|
end else if (!LOCK_ENABLE || grant_unlock) begin
|
||||||
state <= grant_index_r;
|
state <= grant_index_r;
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|||||||
@@ -17,6 +17,7 @@
|
|||||||
module VX_skid_buffer #(
|
module VX_skid_buffer #(
|
||||||
parameter DATAW = 32,
|
parameter DATAW = 32,
|
||||||
parameter PASSTHRU = 0,
|
parameter PASSTHRU = 0,
|
||||||
|
parameter FULL_BW = 0,
|
||||||
parameter OUT_REG = 0
|
parameter OUT_REG = 0
|
||||||
) (
|
) (
|
||||||
input wire clk,
|
input wire clk,
|
||||||
@@ -30,8 +31,6 @@ module VX_skid_buffer #(
|
|||||||
input wire ready_out,
|
input wire ready_out,
|
||||||
output wire valid_out
|
output wire valid_out
|
||||||
);
|
);
|
||||||
`STATIC_ASSERT ((OUT_REG <= 2), ("invalid parameter"))
|
|
||||||
|
|
||||||
if (PASSTHRU != 0) begin
|
if (PASSTHRU != 0) begin
|
||||||
|
|
||||||
`UNUSED_VAR (clk)
|
`UNUSED_VAR (clk)
|
||||||
@@ -41,112 +40,36 @@ module VX_skid_buffer #(
|
|||||||
assign data_out = data_in;
|
assign data_out = data_in;
|
||||||
assign ready_in = ready_out;
|
assign ready_in = ready_out;
|
||||||
|
|
||||||
end else if (OUT_REG == 0) begin
|
end else if (FULL_BW != 0) begin
|
||||||
|
|
||||||
reg [1:0][DATAW-1:0] shift_reg;
|
VX_stream_buffer #(
|
||||||
reg valid_out_r, ready_in_r, rd_ptr_r;
|
.DATAW (DATAW),
|
||||||
|
.OUT_REG (OUT_REG)
|
||||||
wire push = valid_in && ready_in;
|
) stream_buffer (
|
||||||
wire pop = valid_out_r && ready_out;
|
.clk (clk),
|
||||||
|
.reset (reset),
|
||||||
always @(posedge clk) begin
|
.valid_in (valid_in),
|
||||||
if (reset) begin
|
.data_in (data_in),
|
||||||
valid_out_r <= 0;
|
.ready_in (ready_in),
|
||||||
ready_in_r <= 1;
|
.valid_out (valid_out),
|
||||||
rd_ptr_r <= 1;
|
.data_out (data_out),
|
||||||
end else begin
|
.ready_out (ready_out)
|
||||||
if (push) begin
|
);
|
||||||
if (!pop) begin
|
|
||||||
ready_in_r <= rd_ptr_r;
|
|
||||||
valid_out_r <= 1;
|
|
||||||
end
|
|
||||||
end else if (pop) begin
|
|
||||||
ready_in_r <= 1;
|
|
||||||
valid_out_r <= rd_ptr_r;
|
|
||||||
end
|
|
||||||
rd_ptr_r <= rd_ptr_r ^ (push ^ pop);
|
|
||||||
end
|
|
||||||
end
|
|
||||||
|
|
||||||
always @(posedge clk) begin
|
|
||||||
if (push) begin
|
|
||||||
shift_reg[1] <= shift_reg[0];
|
|
||||||
shift_reg[0] <= data_in;
|
|
||||||
end
|
|
||||||
end
|
|
||||||
|
|
||||||
assign ready_in = ready_in_r;
|
|
||||||
assign valid_out = valid_out_r;
|
|
||||||
assign data_out = shift_reg[rd_ptr_r];
|
|
||||||
|
|
||||||
end else if (OUT_REG == 1) begin
|
|
||||||
|
|
||||||
// Full-bandwidth operation: input is consummed every cycle.
|
|
||||||
// However, data_out register has an additional multiplexer.
|
|
||||||
|
|
||||||
reg [DATAW-1:0] data_out_r;
|
|
||||||
reg [DATAW-1:0] buffer;
|
|
||||||
reg valid_out_r;
|
|
||||||
reg use_buffer;
|
|
||||||
|
|
||||||
wire push = valid_in && ready_in;
|
|
||||||
wire stall_out = valid_out_r && ~ready_out;
|
|
||||||
|
|
||||||
always @(posedge clk) begin
|
|
||||||
if (reset) begin
|
|
||||||
valid_out_r <= 0;
|
|
||||||
use_buffer <= 0;
|
|
||||||
end else begin
|
|
||||||
if (ready_out) begin
|
|
||||||
use_buffer <= 0;
|
|
||||||
end else if (valid_in && valid_out) begin
|
|
||||||
use_buffer <= 1;
|
|
||||||
end
|
|
||||||
if (~stall_out) begin
|
|
||||||
valid_out_r <= valid_in || use_buffer;
|
|
||||||
end
|
|
||||||
end
|
|
||||||
end
|
|
||||||
|
|
||||||
always @(posedge clk) begin
|
|
||||||
if (push) begin
|
|
||||||
buffer <= data_in;
|
|
||||||
end
|
|
||||||
if (~stall_out) begin
|
|
||||||
data_out_r <= use_buffer ? buffer : data_in;
|
|
||||||
end
|
|
||||||
end
|
|
||||||
|
|
||||||
assign ready_in = ~use_buffer;
|
|
||||||
assign valid_out = valid_out_r;
|
|
||||||
assign data_out = data_out_r;
|
|
||||||
|
|
||||||
end else begin
|
end else begin
|
||||||
|
|
||||||
// Half-bandwidth operation: input is consummed every other cycle.
|
VX_toggle_buffer #(
|
||||||
// However, data_out register has no additional multiplexer.
|
.DATAW (DATAW)
|
||||||
|
) toggle_buffer (
|
||||||
reg [DATAW-1:0] data_out_r;
|
.clk (clk),
|
||||||
reg has_data;
|
.reset (reset),
|
||||||
|
.valid_in (valid_in),
|
||||||
always @(posedge clk) begin
|
.data_in (data_in),
|
||||||
if (reset) begin
|
.ready_in (ready_in),
|
||||||
has_data <= 0;
|
.valid_out (valid_out),
|
||||||
end else begin
|
.data_out (data_out),
|
||||||
if (~has_data) begin
|
.ready_out (ready_out)
|
||||||
has_data <= valid_in;
|
);
|
||||||
end else if (ready_out) begin
|
|
||||||
has_data <= 0;
|
|
||||||
end
|
|
||||||
end
|
|
||||||
if (~has_data) begin
|
|
||||||
data_out_r <= data_in;
|
|
||||||
end
|
|
||||||
end
|
|
||||||
|
|
||||||
assign ready_in = ~has_data;
|
|
||||||
assign valid_out = has_data;
|
|
||||||
assign data_out = data_out_r;
|
|
||||||
|
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|||||||
@@ -19,9 +19,8 @@ module VX_stream_arb #(
|
|||||||
parameter NUM_OUTPUTS = 1,
|
parameter NUM_OUTPUTS = 1,
|
||||||
parameter DATAW = 1,
|
parameter DATAW = 1,
|
||||||
parameter `STRING ARBITER = "P",
|
parameter `STRING ARBITER = "P",
|
||||||
parameter LOCK_ENABLE = 1,
|
|
||||||
parameter MAX_FANOUT = `MAX_FANOUT,
|
parameter MAX_FANOUT = `MAX_FANOUT,
|
||||||
parameter OUT_REG = 0 ,
|
parameter OUT_REG = 0 ,
|
||||||
parameter NUM_REQS = (NUM_INPUTS + NUM_OUTPUTS - 1) / NUM_OUTPUTS,
|
parameter NUM_REQS = (NUM_INPUTS + NUM_OUTPUTS - 1) / NUM_OUTPUTS,
|
||||||
parameter LOG_NUM_REQS = `CLOG2(NUM_REQS),
|
parameter LOG_NUM_REQS = `CLOG2(NUM_REQS),
|
||||||
parameter NUM_REQS_W = `UP(LOG_NUM_REQS)
|
parameter NUM_REQS_W = `UP(LOG_NUM_REQS)
|
||||||
@@ -57,7 +56,6 @@ module VX_stream_arb #(
|
|||||||
.NUM_OUTPUTS (1),
|
.NUM_OUTPUTS (1),
|
||||||
.DATAW (DATAW),
|
.DATAW (DATAW),
|
||||||
.ARBITER (ARBITER),
|
.ARBITER (ARBITER),
|
||||||
.LOCK_ENABLE (LOCK_ENABLE),
|
|
||||||
.MAX_FANOUT (MAX_FANOUT),
|
.MAX_FANOUT (MAX_FANOUT),
|
||||||
.OUT_REG (OUT_REG)
|
.OUT_REG (OUT_REG)
|
||||||
) arb_slice (
|
) arb_slice (
|
||||||
@@ -102,7 +100,6 @@ module VX_stream_arb #(
|
|||||||
.NUM_OUTPUTS (1),
|
.NUM_OUTPUTS (1),
|
||||||
.DATAW (DATAW),
|
.DATAW (DATAW),
|
||||||
.ARBITER (ARBITER),
|
.ARBITER (ARBITER),
|
||||||
.LOCK_ENABLE (LOCK_ENABLE),
|
|
||||||
.MAX_FANOUT (MAX_FANOUT),
|
.MAX_FANOUT (MAX_FANOUT),
|
||||||
.OUT_REG (OUT_REG)
|
.OUT_REG (OUT_REG)
|
||||||
) fanout_slice_arb (
|
) fanout_slice_arb (
|
||||||
@@ -129,7 +126,6 @@ module VX_stream_arb #(
|
|||||||
.NUM_OUTPUTS (1),
|
.NUM_OUTPUTS (1),
|
||||||
.DATAW (DATAW + LOG_NUM_REQS2),
|
.DATAW (DATAW + LOG_NUM_REQS2),
|
||||||
.ARBITER (ARBITER),
|
.ARBITER (ARBITER),
|
||||||
.LOCK_ENABLE (LOCK_ENABLE),
|
|
||||||
.MAX_FANOUT (MAX_FANOUT),
|
.MAX_FANOUT (MAX_FANOUT),
|
||||||
.OUT_REG (OUT_REG)
|
.OUT_REG (OUT_REG)
|
||||||
) fanout_join_arb (
|
) fanout_join_arb (
|
||||||
@@ -158,25 +154,25 @@ module VX_stream_arb #(
|
|||||||
wire arb_valid;
|
wire arb_valid;
|
||||||
wire [NUM_REQS_W-1:0] arb_index;
|
wire [NUM_REQS_W-1:0] arb_index;
|
||||||
wire [NUM_REQS-1:0] arb_onehot;
|
wire [NUM_REQS-1:0] arb_onehot;
|
||||||
wire arb_unlock;
|
wire arb_ready;
|
||||||
|
|
||||||
VX_generic_arbiter #(
|
VX_generic_arbiter #(
|
||||||
.NUM_REQS (NUM_REQS),
|
.NUM_REQS (NUM_REQS),
|
||||||
.LOCK_ENABLE (LOCK_ENABLE),
|
.LOCK_ENABLE (1),
|
||||||
.TYPE (ARBITER)
|
.TYPE (ARBITER)
|
||||||
) arbiter (
|
) arbiter (
|
||||||
.clk (clk),
|
.clk (clk),
|
||||||
.reset (reset),
|
.reset (reset),
|
||||||
.requests (valid_in),
|
.requests (valid_in),
|
||||||
.unlock (arb_unlock),
|
|
||||||
.grant_valid (arb_valid),
|
.grant_valid (arb_valid),
|
||||||
.grant_index (arb_index),
|
.grant_index (arb_index),
|
||||||
.grant_onehot (arb_onehot)
|
.grant_onehot (arb_onehot),
|
||||||
|
.grant_unlock (arb_ready)
|
||||||
);
|
);
|
||||||
|
|
||||||
assign valid_in_r = arb_valid;
|
assign valid_in_r = arb_valid;
|
||||||
assign data_in_r = data_in[arb_index];
|
assign data_in_r = data_in[arb_index];
|
||||||
assign arb_unlock = | (valid_in_r & ready_in_r);
|
assign arb_ready = ready_in_r;
|
||||||
|
|
||||||
for (genvar i = 0; i < NUM_REQS; ++i) begin
|
for (genvar i = 0; i < NUM_REQS; ++i) begin
|
||||||
assign ready_in[i] = ready_in_r & arb_onehot[i];
|
assign ready_in[i] = ready_in_r & arb_onehot[i];
|
||||||
@@ -217,7 +213,6 @@ module VX_stream_arb #(
|
|||||||
.NUM_OUTPUTS (BATCH_SIZE),
|
.NUM_OUTPUTS (BATCH_SIZE),
|
||||||
.DATAW (DATAW),
|
.DATAW (DATAW),
|
||||||
.ARBITER (ARBITER),
|
.ARBITER (ARBITER),
|
||||||
.LOCK_ENABLE (LOCK_ENABLE),
|
|
||||||
.MAX_FANOUT (MAX_FANOUT),
|
.MAX_FANOUT (MAX_FANOUT),
|
||||||
.OUT_REG (OUT_REG)
|
.OUT_REG (OUT_REG)
|
||||||
) arb_slice (
|
) arb_slice (
|
||||||
@@ -252,7 +247,6 @@ module VX_stream_arb #(
|
|||||||
.NUM_OUTPUTS (NUM_BATCHES),
|
.NUM_OUTPUTS (NUM_BATCHES),
|
||||||
.DATAW (DATAW),
|
.DATAW (DATAW),
|
||||||
.ARBITER (ARBITER),
|
.ARBITER (ARBITER),
|
||||||
.LOCK_ENABLE (LOCK_ENABLE),
|
|
||||||
.MAX_FANOUT (MAX_FANOUT),
|
.MAX_FANOUT (MAX_FANOUT),
|
||||||
.OUT_REG (OUT_REG)
|
.OUT_REG (OUT_REG)
|
||||||
) fanout_fork_arb (
|
) fanout_fork_arb (
|
||||||
@@ -280,7 +274,6 @@ module VX_stream_arb #(
|
|||||||
.NUM_OUTPUTS (BATCH_SIZE),
|
.NUM_OUTPUTS (BATCH_SIZE),
|
||||||
.DATAW (DATAW),
|
.DATAW (DATAW),
|
||||||
.ARBITER (ARBITER),
|
.ARBITER (ARBITER),
|
||||||
.LOCK_ENABLE (LOCK_ENABLE),
|
|
||||||
.MAX_FANOUT (MAX_FANOUT),
|
.MAX_FANOUT (MAX_FANOUT),
|
||||||
.OUT_REG (OUT_REG)
|
.OUT_REG (OUT_REG)
|
||||||
) fanout_slice_arb (
|
) fanout_slice_arb (
|
||||||
@@ -305,24 +298,24 @@ module VX_stream_arb #(
|
|||||||
wire [NUM_OUTPUTS-1:0] arb_requests;
|
wire [NUM_OUTPUTS-1:0] arb_requests;
|
||||||
wire arb_valid;
|
wire arb_valid;
|
||||||
wire [NUM_OUTPUTS-1:0] arb_onehot;
|
wire [NUM_OUTPUTS-1:0] arb_onehot;
|
||||||
wire arb_unlock;
|
wire arb_ready;
|
||||||
|
|
||||||
VX_generic_arbiter #(
|
VX_generic_arbiter #(
|
||||||
.NUM_REQS (NUM_OUTPUTS),
|
.NUM_REQS (NUM_OUTPUTS),
|
||||||
.LOCK_ENABLE (LOCK_ENABLE),
|
.LOCK_ENABLE (1),
|
||||||
.TYPE (ARBITER)
|
.TYPE (ARBITER)
|
||||||
) arbiter (
|
) arbiter (
|
||||||
.clk (clk),
|
.clk (clk),
|
||||||
.reset (reset),
|
.reset (reset),
|
||||||
.requests (arb_requests),
|
.requests (arb_requests),
|
||||||
.unlock (arb_unlock),
|
|
||||||
.grant_valid (arb_valid),
|
.grant_valid (arb_valid),
|
||||||
`UNUSED_PIN (grant_index),
|
`UNUSED_PIN (grant_index),
|
||||||
.grant_onehot (arb_onehot)
|
.grant_onehot (arb_onehot),
|
||||||
|
.grant_unlock (arb_ready)
|
||||||
);
|
);
|
||||||
|
|
||||||
assign arb_requests = ready_in_r;
|
assign arb_requests = ready_in_r;
|
||||||
assign arb_unlock = | (valid_in & ready_in);
|
assign arb_ready = valid_in[0];
|
||||||
assign ready_in = arb_valid;
|
assign ready_in = arb_valid;
|
||||||
|
|
||||||
for (genvar i = 0; i < NUM_OUTPUTS; ++i) begin
|
for (genvar i = 0; i < NUM_OUTPUTS; ++i) begin
|
||||||
|
|||||||
128
hw/rtl/libs/VX_stream_buffer.sv
Normal file
128
hw/rtl/libs/VX_stream_buffer.sv
Normal file
@@ -0,0 +1,128 @@
|
|||||||
|
// Copyright 2024 blaise
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
// See the License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
||||||
|
|
||||||
|
// A stream elastic buffer operates at full-bandwidth where push and pop can happen simultaneously
|
||||||
|
// It has the following benefits:
|
||||||
|
// + full-bandwidth throughput
|
||||||
|
// + ready_in and ready_out are decoupled
|
||||||
|
// + data_out can be fully registered
|
||||||
|
// It has the following limitations:
|
||||||
|
// - requires two registers for storage
|
||||||
|
|
||||||
|
`include "VX_platform.vh"
|
||||||
|
|
||||||
|
`TRACING_OFF
|
||||||
|
module VX_stream_buffer #(
|
||||||
|
parameter DATAW = 1,
|
||||||
|
parameter OUT_REG = 0,
|
||||||
|
parameter PASSTHRU = 0
|
||||||
|
) (
|
||||||
|
input wire clk,
|
||||||
|
input wire reset,
|
||||||
|
input wire valid_in,
|
||||||
|
output wire ready_in,
|
||||||
|
input wire [DATAW-1:0] data_in,
|
||||||
|
output wire [DATAW-1:0] data_out,
|
||||||
|
input wire ready_out,
|
||||||
|
output wire valid_out
|
||||||
|
);
|
||||||
|
if (PASSTHRU != 0) begin
|
||||||
|
`UNUSED_VAR (clk)
|
||||||
|
`UNUSED_VAR (reset)
|
||||||
|
assign ready_in = ready_out;
|
||||||
|
assign valid_out = valid_in;
|
||||||
|
assign data_out = data_in;
|
||||||
|
end else begin
|
||||||
|
if (OUT_REG != 0) begin
|
||||||
|
|
||||||
|
reg [DATAW-1:0] data_out_r;
|
||||||
|
reg [DATAW-1:0] buffer;
|
||||||
|
reg valid_out_r;
|
||||||
|
reg use_buffer;
|
||||||
|
|
||||||
|
wire push = valid_in && ready_in;
|
||||||
|
wire stall_out = valid_out_r && ~ready_out;
|
||||||
|
|
||||||
|
always @(posedge clk) begin
|
||||||
|
if (reset) begin
|
||||||
|
valid_out_r <= 0;
|
||||||
|
use_buffer <= 0;
|
||||||
|
end else begin
|
||||||
|
if (ready_out) begin
|
||||||
|
use_buffer <= 0;
|
||||||
|
end else if (valid_in && valid_out) begin
|
||||||
|
use_buffer <= 1;
|
||||||
|
end
|
||||||
|
if (~stall_out) begin
|
||||||
|
valid_out_r <= valid_in || use_buffer;
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
always @(posedge clk) begin
|
||||||
|
if (push) begin
|
||||||
|
buffer <= data_in;
|
||||||
|
end
|
||||||
|
if (~stall_out) begin
|
||||||
|
data_out_r <= use_buffer ? buffer : data_in;
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
assign ready_in = ~use_buffer;
|
||||||
|
assign valid_out = valid_out_r;
|
||||||
|
assign data_out = data_out_r;
|
||||||
|
|
||||||
|
end else begin
|
||||||
|
|
||||||
|
reg [1:0][DATAW-1:0] shift_reg;
|
||||||
|
reg valid_out_r, ready_in_r, rd_ptr_r;
|
||||||
|
|
||||||
|
wire push = valid_in && ready_in;
|
||||||
|
wire pop = valid_out_r && ready_out;
|
||||||
|
|
||||||
|
always @(posedge clk) begin
|
||||||
|
if (reset) begin
|
||||||
|
valid_out_r <= 0;
|
||||||
|
ready_in_r <= 1;
|
||||||
|
rd_ptr_r <= 1;
|
||||||
|
end else begin
|
||||||
|
if (push) begin
|
||||||
|
if (!pop) begin
|
||||||
|
ready_in_r <= rd_ptr_r;
|
||||||
|
valid_out_r <= 1;
|
||||||
|
end
|
||||||
|
end else if (pop) begin
|
||||||
|
ready_in_r <= 1;
|
||||||
|
valid_out_r <= rd_ptr_r;
|
||||||
|
end
|
||||||
|
rd_ptr_r <= rd_ptr_r ^ (push ^ pop);
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
always @(posedge clk) begin
|
||||||
|
if (push) begin
|
||||||
|
shift_reg[1] <= shift_reg[0];
|
||||||
|
shift_reg[0] <= data_in;
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
assign ready_in = ready_in_r;
|
||||||
|
assign valid_out = valid_out_r;
|
||||||
|
assign data_out = shift_reg[rd_ptr_r];
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
endmodule
|
||||||
|
`TRACING_ON
|
||||||
|
|
||||||
@@ -21,8 +21,7 @@ module VX_stream_xbar #(
|
|||||||
parameter IN_WIDTH = `LOG2UP(NUM_INPUTS),
|
parameter IN_WIDTH = `LOG2UP(NUM_INPUTS),
|
||||||
parameter OUT_WIDTH = `LOG2UP(NUM_OUTPUTS),
|
parameter OUT_WIDTH = `LOG2UP(NUM_OUTPUTS),
|
||||||
parameter ARBITER = "P",
|
parameter ARBITER = "P",
|
||||||
parameter LOCK_ENABLE = 0,
|
parameter OUT_REG = 0,
|
||||||
parameter OUT_REG = 0,
|
|
||||||
parameter MAX_FANOUT = `MAX_FANOUT,
|
parameter MAX_FANOUT = `MAX_FANOUT,
|
||||||
parameter PERF_CTR_BITS = `CLOG2(NUM_INPUTS+1)
|
parameter PERF_CTR_BITS = `CLOG2(NUM_INPUTS+1)
|
||||||
) (
|
) (
|
||||||
@@ -66,7 +65,6 @@ module VX_stream_xbar #(
|
|||||||
.NUM_OUTPUTS (1),
|
.NUM_OUTPUTS (1),
|
||||||
.DATAW (DATAW),
|
.DATAW (DATAW),
|
||||||
.ARBITER (ARBITER),
|
.ARBITER (ARBITER),
|
||||||
.LOCK_ENABLE (LOCK_ENABLE),
|
|
||||||
.MAX_FANOUT (MAX_FANOUT),
|
.MAX_FANOUT (MAX_FANOUT),
|
||||||
.OUT_REG (OUT_REG)
|
.OUT_REG (OUT_REG)
|
||||||
) xbar_arb (
|
) xbar_arb (
|
||||||
@@ -95,7 +93,6 @@ module VX_stream_xbar #(
|
|||||||
.NUM_OUTPUTS (1),
|
.NUM_OUTPUTS (1),
|
||||||
.DATAW (DATAW),
|
.DATAW (DATAW),
|
||||||
.ARBITER (ARBITER),
|
.ARBITER (ARBITER),
|
||||||
.LOCK_ENABLE (LOCK_ENABLE),
|
|
||||||
.MAX_FANOUT (MAX_FANOUT),
|
.MAX_FANOUT (MAX_FANOUT),
|
||||||
.OUT_REG (OUT_REG)
|
.OUT_REG (OUT_REG)
|
||||||
) xbar_arb (
|
) xbar_arb (
|
||||||
@@ -173,25 +170,27 @@ module VX_stream_xbar #(
|
|||||||
end
|
end
|
||||||
|
|
||||||
// compute inputs collision
|
// compute inputs collision
|
||||||
// we have a collision when there exists a valid transfer with mutiple input candicates
|
// we have a collision when there exists a valid transfer with multiple input candidates
|
||||||
// we caount the unique duplicates each cycle.
|
// we count the unique duplicates each cycle.
|
||||||
|
|
||||||
|
reg [NUM_INPUTS-1:0] per_cycle_collision, per_cycle_collision_r;
|
||||||
|
wire [`CLOG2(NUM_INPUTS+1)-1:0] collision_count;
|
||||||
reg [PERF_CTR_BITS-1:0] collisions_r;
|
reg [PERF_CTR_BITS-1:0] collisions_r;
|
||||||
reg [NUM_INPUTS-1:0] per_cycle_collision;
|
|
||||||
|
|
||||||
always @(*) begin
|
always @(*) begin
|
||||||
per_cycle_collision = 0;
|
per_cycle_collision = 0;
|
||||||
for (integer i = 0; i < NUM_INPUTS; ++i) begin
|
for (integer i = 0; i < NUM_INPUTS; ++i) begin
|
||||||
for (integer j = 1; j < (NUM_INPUTS-i); ++j) begin
|
for (integer j = 1; j < (NUM_INPUTS-i); ++j) begin
|
||||||
if (valid_in[i] && valid_in[j+i] && sel_in[i] == sel_in[j+i]) begin
|
per_cycle_collision[i] |= valid_in[i]
|
||||||
per_cycle_collision[i] |= ready_in[i] | ready_in[j+i];
|
&& valid_in[j+i]
|
||||||
end
|
&& (sel_in[i] == sel_in[j+i])
|
||||||
|
&& (ready_in[i] | ready_in[j+i]);
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
wire [`CLOG2(NUM_INPUTS+1)-1:0] collision_count;
|
`BUFFER(per_cycle_collision_r, per_cycle_collision);
|
||||||
`POP_COUNT(collision_count, per_cycle_collision);
|
`POP_COUNT(collision_count, per_cycle_collision_r);
|
||||||
|
|
||||||
always @(posedge clk) begin
|
always @(posedge clk) begin
|
||||||
if (reset) begin
|
if (reset) begin
|
||||||
|
|||||||
70
hw/rtl/libs/VX_toggle_buffer.sv
Normal file
70
hw/rtl/libs/VX_toggle_buffer.sv
Normal file
@@ -0,0 +1,70 @@
|
|||||||
|
// Copyright 2024 blaise
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
// See the License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
||||||
|
|
||||||
|
// A toggle elastic buffer operates at half-bandwidth where push can only trigger after pop
|
||||||
|
// It has the following benefits:
|
||||||
|
// + use only one register for storage
|
||||||
|
// + ready_in and ready_out are decoupled
|
||||||
|
// + data_out is fully registered
|
||||||
|
// It has the following limitations:
|
||||||
|
// - Half-bandwidth throughput
|
||||||
|
|
||||||
|
`include "VX_platform.vh"
|
||||||
|
|
||||||
|
`TRACING_OFF
|
||||||
|
module VX_toggle_buffer #(
|
||||||
|
parameter DATAW = 1,
|
||||||
|
parameter PASSTHRU = 0
|
||||||
|
) (
|
||||||
|
input wire clk,
|
||||||
|
input wire reset,
|
||||||
|
input wire valid_in,
|
||||||
|
output wire ready_in,
|
||||||
|
input wire [DATAW-1:0] data_in,
|
||||||
|
output wire [DATAW-1:0] data_out,
|
||||||
|
input wire ready_out,
|
||||||
|
output wire valid_out
|
||||||
|
);
|
||||||
|
if (PASSTHRU != 0) begin
|
||||||
|
`UNUSED_VAR (clk)
|
||||||
|
`UNUSED_VAR (reset)
|
||||||
|
assign ready_in = ready_out;
|
||||||
|
assign valid_out = valid_in;
|
||||||
|
assign data_out = data_in;
|
||||||
|
end else begin
|
||||||
|
reg [DATAW-1:0] buffer;
|
||||||
|
reg has_data;
|
||||||
|
|
||||||
|
always @(posedge clk) begin
|
||||||
|
if (reset) begin
|
||||||
|
has_data <= 0;
|
||||||
|
end else begin
|
||||||
|
if (~has_data) begin
|
||||||
|
has_data <= valid_in;
|
||||||
|
end else if (ready_out) begin
|
||||||
|
has_data <= 0;
|
||||||
|
end
|
||||||
|
end
|
||||||
|
if (~has_data) begin
|
||||||
|
buffer <= data_in;
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
assign ready_in = ~has_data;
|
||||||
|
assign valid_out = has_data;
|
||||||
|
assign data_out = buffer;
|
||||||
|
end
|
||||||
|
|
||||||
|
endmodule
|
||||||
|
`TRACING_ON
|
||||||
@@ -15,7 +15,7 @@
|
|||||||
|
|
||||||
module VX_gbar_arb #(
|
module VX_gbar_arb #(
|
||||||
parameter NUM_REQS = 1,
|
parameter NUM_REQS = 1,
|
||||||
parameter OUT_REG = 0,
|
parameter OUT_REG = 0,
|
||||||
parameter `STRING ARBITER = "R"
|
parameter `STRING ARBITER = "R"
|
||||||
) (
|
) (
|
||||||
input wire clk,
|
input wire clk,
|
||||||
|
|||||||
@@ -21,8 +21,8 @@ module VX_mem_arb #(
|
|||||||
parameter ADDR_WIDTH = (MEM_ADDR_WIDTH-`CLOG2(DATA_SIZE)),
|
parameter ADDR_WIDTH = (MEM_ADDR_WIDTH-`CLOG2(DATA_SIZE)),
|
||||||
parameter TAG_WIDTH = 1,
|
parameter TAG_WIDTH = 1,
|
||||||
parameter TAG_SEL_IDX = 0,
|
parameter TAG_SEL_IDX = 0,
|
||||||
parameter OUT_REG_REQ = 0,
|
parameter OUT_REG_REQ = 0,
|
||||||
parameter OUT_REG_RSP = 0,
|
parameter OUT_REG_RSP = 0,
|
||||||
parameter `STRING ARBITER = "R"
|
parameter `STRING ARBITER = "R"
|
||||||
) (
|
) (
|
||||||
input wire clk,
|
input wire clk,
|
||||||
|
|||||||
@@ -233,10 +233,12 @@ module VX_shared_mem import VX_gpu_pkg::*; #(
|
|||||||
wire [`CLOG2(NUM_REQS+1)-1:0] perf_writes_per_cycle;
|
wire [`CLOG2(NUM_REQS+1)-1:0] perf_writes_per_cycle;
|
||||||
wire [`CLOG2(NUM_REQS+1)-1:0] perf_crsp_stall_per_cycle;
|
wire [`CLOG2(NUM_REQS+1)-1:0] perf_crsp_stall_per_cycle;
|
||||||
|
|
||||||
wire [NUM_REQS-1:0] perf_reads_per_req = req_valid & req_ready & ~req_rw;
|
wire [NUM_REQS-1:0] perf_reads_per_req, perf_writes_per_req;
|
||||||
wire [NUM_REQS-1:0] perf_writes_per_req = req_valid & req_ready & req_rw;
|
|
||||||
wire [NUM_REQS-1:0] perf_crsp_stall_per_req = rsp_valid & ~rsp_ready;
|
wire [NUM_REQS-1:0] perf_crsp_stall_per_req = rsp_valid & ~rsp_ready;
|
||||||
|
|
||||||
|
`BUFFER(perf_reads_per_req, req_valid & req_ready & ~req_rw);
|
||||||
|
`BUFFER(perf_writes_per_req, req_valid & req_ready & req_rw);
|
||||||
|
|
||||||
`POP_COUNT(perf_reads_per_cycle, perf_reads_per_req);
|
`POP_COUNT(perf_reads_per_cycle, perf_reads_per_req);
|
||||||
`POP_COUNT(perf_writes_per_cycle, perf_writes_per_req);
|
`POP_COUNT(perf_writes_per_cycle, perf_writes_per_req);
|
||||||
`POP_COUNT(perf_crsp_stall_per_cycle, perf_crsp_stall_per_req);
|
`POP_COUNT(perf_crsp_stall_per_cycle, perf_crsp_stall_per_req);
|
||||||
|
|||||||
@@ -19,8 +19,8 @@ module VX_smem_switch #(
|
|||||||
parameter TAG_WIDTH = 1,
|
parameter TAG_WIDTH = 1,
|
||||||
parameter MEM_ADDR_WIDTH = `MEM_ADDR_WIDTH,
|
parameter MEM_ADDR_WIDTH = `MEM_ADDR_WIDTH,
|
||||||
parameter TAG_SEL_IDX = 0,
|
parameter TAG_SEL_IDX = 0,
|
||||||
parameter OUT_REG_REQ = 0,
|
parameter OUT_REG_REQ = 0,
|
||||||
parameter OUT_REG_RSP = 0,
|
parameter OUT_REG_RSP = 0,
|
||||||
parameter `STRING ARBITER = "R"
|
parameter `STRING ARBITER = "R"
|
||||||
) (
|
) (
|
||||||
input wire clk,
|
input wire clk,
|
||||||
|
|||||||
@@ -56,17 +56,17 @@ TARGET=asesim make -C runtime/opae
|
|||||||
PREFIX=build_base CONFIGS="-DEXT_F_DISABLE -DL1_DISABLE -DSM_DISABLE -DNUM_WARPS=2 -DNUM_THREADS=2" TARGET=asesim make
|
PREFIX=build_base CONFIGS="-DEXT_F_DISABLE -DL1_DISABLE -DSM_DISABLE -DNUM_WARPS=2 -DNUM_THREADS=2" TARGET=asesim make
|
||||||
|
|
||||||
# ASE test runs
|
# ASE test runs
|
||||||
./run_ase.sh build_base_arria10_asesim_1c/synth ../../../../tests/regression/basic/basic -n1 -t0
|
./run_ase.sh build_base_arria10_asesim_1c ../../../../tests/regression/basic/basic -n1 -t0
|
||||||
./run_ase.sh build_base_arria10_asesim_1c/synth ../../../../tests/regression/basic/basic -n1 -t1
|
./run_ase.sh build_base_arria10_asesim_1c ../../../../tests/regression/basic/basic -n1 -t1
|
||||||
./run_ase.sh build_base_arria10_asesim_1c/synth ../../../../tests/regression/basic/basic -n16
|
./run_ase.sh build_base_arria10_asesim_1c ../../../../tests/regression/basic/basic -n16
|
||||||
./run_ase.sh build_base_arria10_asesim_1c/synth ../../../../tests/regression/demo/demo -n16
|
./run_ase.sh build_base_arria10_asesim_1c ../../../../tests/regression/demo/demo -n16
|
||||||
./run_ase.sh build_base_arria10_asesim_1c/synth ../../../../tests/regression/dogfood/dogfood -n16
|
./run_ase.sh build_base_arria10_asesim_1c ../../../../tests/regression/dogfood/dogfood -n16
|
||||||
./run_ase.sh build_base_arria10_asesim_1c/synth ../../../../tests/opencl/vecadd/vecadd
|
./run_ase.sh build_base_arria10_asesim_1c ../../../../tests/opencl/vecadd/vecadd
|
||||||
./run_ase.sh build_base_arria10_asesim_1c/synth ../../../../tests/opencl/sgemm/sgemm -n4
|
./run_ase.sh build_base_arria10_asesim_1c ../../../../tests/opencl/sgemm/sgemm -n4
|
||||||
|
|
||||||
# modify "vsim_run.tcl" to dump VCD trace
|
# modify "vsim_run.tcl" to dump VCD trace
|
||||||
vcd file trace.vcd
|
vcd file trace.vcd
|
||||||
vcd add -r /*/Vortex/hw/rtl/*
|
vcd add -r /*/afu/*
|
||||||
run -all
|
run -all
|
||||||
|
|
||||||
# compress FPGA output files
|
# compress FPGA output files
|
||||||
|
|||||||
@@ -15,27 +15,27 @@
|
|||||||
|
|
||||||
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
|
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
|
||||||
|
|
||||||
BUILD_DIR=$1
|
BUILD_DIR=$(realpath $1)
|
||||||
|
|
||||||
PROGRAM=$(basename "$2")
|
PROGRAM=$(basename "$2")
|
||||||
PROGRAM_DIR=`dirname $2`
|
PROGRAM_DIR=`dirname $2`
|
||||||
|
|
||||||
|
POCL_RT_PATH=$TOOLDIR/pocl/runtime
|
||||||
VORTEX_RT_PATH=$SCRIPT_DIR/../../../../runtime
|
VORTEX_RT_PATH=$SCRIPT_DIR/../../../../runtime
|
||||||
|
|
||||||
# Export ASE_WORKDIR variable
|
# Export ASE_WORKDIR variable
|
||||||
export ASE_WORKDIR=$SCRIPT_DIR/$BUILD_DIR/work
|
export ASE_WORKDIR=$BUILD_DIR/synth/work
|
||||||
|
|
||||||
shift 2
|
|
||||||
|
|
||||||
# cleanup incomplete runs
|
# cleanup incomplete runs
|
||||||
rm -f $ASE_WORKDIR/.app_lock.pid
|
rm -f $ASE_WORKDIR/.app_lock.pid
|
||||||
rm -f $ASE_WORKDIR/.ase_ready.pid
|
rm -f $ASE_WORKDIR/.ase_ready.pid
|
||||||
rm -f $SCRIPT_DIR/$BUILD_DIR/nohup.out
|
rm -f $BUILD_DIR/synth/nohup.out
|
||||||
|
|
||||||
# Start Simulator in background
|
# Start Simulator in background (capture process group pid)
|
||||||
pushd $SCRIPT_DIR/$BUILD_DIR
|
pushd $BUILD_DIR/synth
|
||||||
echo " [DBG] starting ASE simnulator (stdout saved to '$SCRIPT_DIR/$BUILD_DIR/nohup.out')"
|
echo " [DBG] starting ASE simnulator (stdout saved to '$BUILD_DIR/synth/nohup.out')"
|
||||||
nohup make sim &
|
setsid make sim &> /dev/null &
|
||||||
|
SIM_PID=$!
|
||||||
popd
|
popd
|
||||||
|
|
||||||
# Wait for simulator readiness
|
# Wait for simulator readiness
|
||||||
@@ -47,6 +47,11 @@ done
|
|||||||
|
|
||||||
# run application
|
# run application
|
||||||
pushd $PROGRAM_DIR
|
pushd $PROGRAM_DIR
|
||||||
|
shift 2
|
||||||
echo " [DBG] running ./$PROGRAM $*"
|
echo " [DBG] running ./$PROGRAM $*"
|
||||||
ASE_LOG=0 LD_LIBRARY_PATH=$POCL_RT_PATH/lib:$VORTEX_RT_PATH/opae:$LD_LIBRARY_PATH ./$PROGRAM $*
|
ASE_LOG=0 LD_LIBRARY_PATH=$POCL_RT_PATH/lib:$VORTEX_RT_PATH/opae:$LD_LIBRARY_PATH ./$PROGRAM $*
|
||||||
popd
|
popd
|
||||||
|
|
||||||
|
# stop the simulator (kill process group)
|
||||||
|
kill -- -$(ps -o pgid= $SIM_PID | grep -o '[0-9]*')
|
||||||
|
wait $SIM_PID 2> /dev/null
|
||||||
4
hw/syn/altera/quartus/cache/Makefile
vendored
4
hw/syn/altera/quartus/cache/Makefile
vendored
@@ -1,6 +1,6 @@
|
|||||||
PROJECT = VX_cache_cluster_top
|
PROJECT = VX_cache_top
|
||||||
TOP_LEVEL_ENTITY = $(PROJECT)
|
TOP_LEVEL_ENTITY = $(PROJECT)
|
||||||
SRC_FILE = VX_cache_cluster.sv
|
SRC_FILE = $(PROJECT).sv
|
||||||
|
|
||||||
include ../../common.mk
|
include ../../common.mk
|
||||||
|
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
PROJECT = VX_core_top
|
PROJECT = VX_core_top
|
||||||
TOP_LEVEL_ENTITY = $(PROJECT)
|
TOP_LEVEL_ENTITY = $(PROJECT)
|
||||||
SRC_FILE = VX_core.sv
|
SRC_FILE = $(PROJECT).sv
|
||||||
|
|
||||||
include ../../common.mk
|
include ../../common.mk
|
||||||
|
|
||||||
|
|||||||
@@ -1,10 +1,11 @@
|
|||||||
XLEN ?= 32
|
XLEN ?= 32
|
||||||
|
TOOLDIR ?= /opt
|
||||||
|
|
||||||
ifeq ($(XLEN),64)
|
ifeq ($(XLEN),64)
|
||||||
RISCV_TOOLCHAIN_PATH ?= /opt/riscv64-gnu-toolchain
|
RISCV_TOOLCHAIN_PATH ?= $(TOOLDIR)/riscv64-gnu-toolchain
|
||||||
CFLAGS += -march=rv64imafd -mabi=lp64d
|
CFLAGS += -march=rv64imafd -mabi=lp64d
|
||||||
else
|
else
|
||||||
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
|
RISCV_TOOLCHAIN_PATH ?= $(TOOLDIR)/riscv-gnu-toolchain
|
||||||
CFLAGS += -march=rv32imaf -mabi=ilp32f
|
CFLAGS += -march=rv32imaf -mabi=ilp32f
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
|||||||
2
hw/unittest/cache/Makefile
vendored
2
hw/unittest/cache/Makefile
vendored
@@ -33,7 +33,7 @@ VL_FLAGS = --exe
|
|||||||
VL_FLAGS += --language 1800-2009 --assert -Wall -Wpedantic
|
VL_FLAGS += --language 1800-2009 --assert -Wall -Wpedantic
|
||||||
VL_FLAGS += -Wno-DECLFILENAME -Wno-REDEFMACRO
|
VL_FLAGS += -Wno-DECLFILENAME -Wno-REDEFMACRO
|
||||||
VL_FLAGS += --x-initial unique --x-assign unique
|
VL_FLAGS += --x-initial unique --x-assign unique
|
||||||
VL_FLAGS += -DSIMULATION
|
VL_FLAGS += -DSIMULATION -DSV_DPI
|
||||||
VL_FLAGS += $(CONFIGS)
|
VL_FLAGS += $(CONFIGS)
|
||||||
VL_FLAGS += $(PARAMS)
|
VL_FLAGS += $(PARAMS)
|
||||||
VL_FLAGS += $(RTL_INCLUDE)
|
VL_FLAGS += $(RTL_INCLUDE)
|
||||||
|
|||||||
@@ -27,7 +27,7 @@ VL_FLAGS = --exe
|
|||||||
VL_FLAGS += --language 1800-2009 --assert -Wall -Wpedantic
|
VL_FLAGS += --language 1800-2009 --assert -Wall -Wpedantic
|
||||||
VL_FLAGS += -Wno-DECLFILENAME -Wno-REDEFMACRO
|
VL_FLAGS += -Wno-DECLFILENAME -Wno-REDEFMACRO
|
||||||
VL_FLAGS += --x-initial unique --x-assign unique
|
VL_FLAGS += --x-initial unique --x-assign unique
|
||||||
VL_FLAGS += -DSIMULATION
|
VL_FLAGS += -DSIMULATION -DSV_DPI
|
||||||
VL_FLAGS += $(CONFIGS)
|
VL_FLAGS += $(CONFIGS)
|
||||||
VL_FLAGS += $(PARAMS)
|
VL_FLAGS += $(PARAMS)
|
||||||
VL_FLAGS += $(RTL_INCLUDE)
|
VL_FLAGS += $(RTL_INCLUDE)
|
||||||
|
|||||||
@@ -27,7 +27,7 @@ VL_FLAGS = --exe
|
|||||||
VL_FLAGS += --language 1800-2009 --assert -Wall -Wpedantic
|
VL_FLAGS += --language 1800-2009 --assert -Wall -Wpedantic
|
||||||
VL_FLAGS += -Wno-DECLFILENAME -Wno-REDEFMACRO
|
VL_FLAGS += -Wno-DECLFILENAME -Wno-REDEFMACRO
|
||||||
VL_FLAGS += --x-initial unique --x-assign unique
|
VL_FLAGS += --x-initial unique --x-assign unique
|
||||||
VL_FLAGS += -DSIMULATION
|
VL_FLAGS += -DSIMULATION -DSV_DPI
|
||||||
VL_FLAGS += $(CONFIGS)
|
VL_FLAGS += $(CONFIGS)
|
||||||
VL_FLAGS += $(PARAMS)
|
VL_FLAGS += $(PARAMS)
|
||||||
VL_FLAGS += $(RTL_INCLUDE)
|
VL_FLAGS += $(RTL_INCLUDE)
|
||||||
|
|||||||
@@ -25,7 +25,7 @@ VL_FLAGS = --exe
|
|||||||
VL_FLAGS += --language 1800-2009 --assert -Wall -Wpedantic
|
VL_FLAGS += --language 1800-2009 --assert -Wall -Wpedantic
|
||||||
VL_FLAGS += -Wno-DECLFILENAME -Wno-REDEFMACRO
|
VL_FLAGS += -Wno-DECLFILENAME -Wno-REDEFMACRO
|
||||||
VL_FLAGS += --x-initial unique --x-assign unique
|
VL_FLAGS += --x-initial unique --x-assign unique
|
||||||
VL_FLAGS += -DSIMULATION
|
VL_FLAGS += -DSIMULATION -DSV_DPI
|
||||||
VL_FLAGS += $(CONFIGS)
|
VL_FLAGS += $(CONFIGS)
|
||||||
VL_FLAGS += $(PARAMS)
|
VL_FLAGS += $(PARAMS)
|
||||||
VL_FLAGS += $(RTL_INCLUDE)
|
VL_FLAGS += $(RTL_INCLUDE)
|
||||||
@@ -56,7 +56,6 @@ PROJECT = top_modules
|
|||||||
all: build
|
all: build
|
||||||
|
|
||||||
build: $(SRCS)
|
build: $(SRCS)
|
||||||
verilator --build $(VL_FLAGS) --cc VX_cache_cluster_top --top-module VX_cache_cluster_top $^ -CFLAGS '$(CXXFLAGS)'
|
|
||||||
verilator --build $(VL_FLAGS) --cc VX_cache_top --top-module VX_cache_top $^ -CFLAGS '$(CXXFLAGS)'
|
verilator --build $(VL_FLAGS) --cc VX_cache_top --top-module VX_cache_top $^ -CFLAGS '$(CXXFLAGS)'
|
||||||
verilator --build $(VL_FLAGS) --cc VX_core_top --top-module VX_core_top $^ -CFLAGS '$(CXXFLAGS)'
|
verilator --build $(VL_FLAGS) --cc VX_core_top --top-module VX_core_top $^ -CFLAGS '$(CXXFLAGS)'
|
||||||
|
|
||||||
|
|||||||
@@ -1,17 +1,18 @@
|
|||||||
XLEN ?= 32
|
XLEN ?= 32
|
||||||
|
TOOLDIR ?= /opt
|
||||||
|
|
||||||
ifeq ($(XLEN),64)
|
ifeq ($(XLEN),64)
|
||||||
RISCV_TOOLCHAIN_PATH ?= /opt/riscv64-gnu-toolchain
|
RISCV_TOOLCHAIN_PATH ?= $(TOOLDIR)/riscv64-gnu-toolchain
|
||||||
CFLAGS += -march=rv64imafd -mabi=lp64d
|
CFLAGS += -march=rv64imafd -mabi=lp64d
|
||||||
else
|
else
|
||||||
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
|
RISCV_TOOLCHAIN_PATH ?= $(TOOLDIR)/riscv-gnu-toolchain
|
||||||
CFLAGS += -march=rv32imaf -mabi=ilp32f
|
CFLAGS += -march=rv32imaf -mabi=ilp32f
|
||||||
endif
|
endif
|
||||||
|
|
||||||
RISCV_PREFIX ?= riscv$(XLEN)-unknown-elf
|
RISCV_PREFIX ?= riscv$(XLEN)-unknown-elf
|
||||||
RISCV_SYSROOT ?= $(RISCV_TOOLCHAIN_PATH)/$(RISCV_PREFIX)
|
RISCV_SYSROOT ?= $(RISCV_TOOLCHAIN_PATH)/$(RISCV_PREFIX)
|
||||||
|
|
||||||
LLVM_VORTEX ?= /opt/llvm-vortex
|
LLVM_VORTEX ?= $(TOOLDIR)/llvm-vortex
|
||||||
|
|
||||||
LLVM_CFLAGS += --sysroot=$(RISCV_SYSROOT)
|
LLVM_CFLAGS += --sysroot=$(RISCV_SYSROOT)
|
||||||
LLVM_CFLAGS += --gcc-toolchain=$(RISCV_TOOLCHAIN_PATH)
|
LLVM_CFLAGS += --gcc-toolchain=$(RISCV_TOOLCHAIN_PATH)
|
||||||
|
|||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user