From 62cdd8e99385ce5470c96b7ce1808080d11e740f Mon Sep 17 00:00:00 2001
From: Blaise Tine <tinebp@yahoo.com>
Date: Sat, 11 Nov 2023 15:49:39 -0800
Subject: [PATCH 01/11] minor update

---
 .travis.yml                        |  6 ++--
 ci/toolchain_env.sh                |  7 ++---
 docs/fpga_setup.md                 |  3 --
 hw/syn/xilinx/test/kernel/Makefile |  5 +--
 kernel/Makefile                    |  7 +++--
 tests/kernel/common.mk             | 49 ++++++++++++++++++++++++++++++
 tests/kernel/conform/Makefile      | 49 +-----------------------------
 tests/kernel/fibonacci/Makefile    | 49 +-----------------------------
 tests/kernel/hello/Makefile        | 49 +-----------------------------
 tests/opencl/common.mk             | 13 ++++----
 tests/regression/common.mk         |  8 +++--
 tests/unittest/common.mk           | 30 ++++++++++++++++++
 tests/unittest/vx_malloc/Makefile  | 31 +------------------
 13 files changed, 107 insertions(+), 199 deletions(-)
 create mode 100644 tests/kernel/common.mk
 create mode 100644 tests/unittest/common.mk

diff --git a/.travis.yml b/.travis.yml
index f719ce86..236ed3b7 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -38,7 +38,7 @@ jobs:
         - rm -rf $HOME/build32 && cp -r $PWD $HOME/build32
         - rm -rf $HOME/build64 && cp -r $PWD $HOME/build64
         - make -C $HOME/build32
-        - XLEN=64 RISCV_TOOLCHAIN_PATH=$TOOLDIR/riscv64-gnu-toolchain make -C $HOME/build64
+        - XLEN=64 make -C $HOME/build64
     - stage: test 
       name: unittest
       script: cp -r $HOME/build32 build && cd build && ./ci/travis_run.py ./ci/regression.sh --unittest
@@ -47,13 +47,13 @@ jobs:
       script: cp -r $HOME/build32 build && cd build && ./ci/travis_run.py ./ci/regression.sh --isa
     - stage: test 
       name: isa64
-      script: cp -r $HOME/build64 build && cd build && XLEN=64 RISCV_TOOLCHAIN_PATH=$TOOLDIR/riscv64-gnu-toolchain ./ci/travis_run.py ./ci/regression.sh --isa
+      script: cp -r $HOME/build64 build && cd build && XLEN=64 ./ci/travis_run.py ./ci/regression.sh --isa
     - stage: test 
       name: regression
       script: cp -r $HOME/build32 build && cd build && ./ci/travis_run.py ./ci/regression.sh --regression
     - stage: test 
       name: regression64
-      script: cp -r $HOME/build64 build && cd build && XLEN=64 RISCV_TOOLCHAIN_PATH=$TOOLDIR/riscv64-gnu-toolchain ./ci/travis_run.py ./ci/regression.sh --regression
+      script: cp -r $HOME/build64 build && cd build && XLEN=64 ./ci/travis_run.py ./ci/regression.sh --regression
     - stage: test 
       name: opencl
       script: cp -r $HOME/build32 build && cd build && ./ci/travis_run.py ./ci/regression.sh --opencl    
diff --git a/ci/toolchain_env.sh b/ci/toolchain_env.sh
index 4046a6a7..440a899e 100644
--- a/ci/toolchain_env.sh
+++ b/ci/toolchain_env.sh
@@ -16,14 +16,11 @@
 
 TOOLDIR=${TOOLDIR:=/opt}
 
-export RISCV_TOOLCHAIN_PATH=$TOOLDIR/riscv-gnu-toolchain
-export LLVM_POCL=$TOOLDIR/llvm-pocl
-export LLVM_VORTEX=$TOOLDIR/llvm-vortex
 export VERILATOR_ROOT=$TOOLDIR/verilator
 export PATH=$VERILATOR_ROOT/bin:$PATH
+
 export SV2V_PATH=$TOOLDIR/sv2v
 export PATH=$SV2V_PATH/bin:$PATH
+
 export YOSYS_PATH=$TOOLDIR/yosys
 export PATH=$YOSYS_PATH/bin:$PATH
-export POCL_CC_PATH=$TOOLDIR/pocl/compiler
-export POCL_RT_PATH=$TOOLDIR/pocl/runtime
diff --git a/docs/fpga_setup.md b/docs/fpga_setup.md
index 61ff481f..88e0c3c0 100644
--- a/docs/fpga_setup.md
+++ b/docs/fpga_setup.md
@@ -9,9 +9,6 @@ OPAE Environment Setup
     $ export C_INCLUDE_PATH=$OPAE_HOME/include:$C_INCLUDE_PATH
     $ export LIBRARY_PATH=$OPAE_HOME/lib:$LIBRARY_PATH
     $ export LD_LIBRARY_PATH=$OPAE_HOME/lib:$LD_LIBRARY_PATH
-    $ export RISCV_TOOLCHAIN_PATH=/opt/riscv-gnu-toolchain
-    $ export PATH=:/opt/verilator/bin:$PATH
-    $ export VERILATOR_ROOT=/opt/verilator
 
 OPAE Build
 ------------------
diff --git a/hw/syn/xilinx/test/kernel/Makefile b/hw/syn/xilinx/test/kernel/Makefile
index 55c21aa2..11457ab4 100644
--- a/hw/syn/xilinx/test/kernel/Makefile
+++ b/hw/syn/xilinx/test/kernel/Makefile
@@ -1,10 +1,11 @@
 XLEN ?= 32
+TOOLDIR ?= /opt
 
 ifeq ($(XLEN),64)
-RISCV_TOOLCHAIN_PATH ?= /opt/riscv64-gnu-toolchain
+RISCV_TOOLCHAIN_PATH ?= $(TOOLDIR)/riscv64-gnu-toolchain
 CFLAGS += -march=rv64imafd -mabi=lp64d
 else
-RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
+RISCV_TOOLCHAIN_PATH ?= $(TOOLDIR)/riscv-gnu-toolchain
 CFLAGS += -march=rv32imaf -mabi=ilp32f
 endif
 
diff --git a/kernel/Makefile b/kernel/Makefile
index e4c975dc..07b8c97b 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -1,17 +1,18 @@
 XLEN ?= 32
+TOOLDIR ?= /opt
 
 ifeq ($(XLEN),64)
-RISCV_TOOLCHAIN_PATH ?= /opt/riscv64-gnu-toolchain
+RISCV_TOOLCHAIN_PATH ?= $(TOOLDIR)/riscv64-gnu-toolchain
 CFLAGS += -march=rv64imafd -mabi=lp64d
 else
-RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
+RISCV_TOOLCHAIN_PATH ?= $(TOOLDIR)/riscv-gnu-toolchain
 CFLAGS += -march=rv32imaf -mabi=ilp32f
 endif
 
 RISCV_PREFIX ?= riscv$(XLEN)-unknown-elf
 RISCV_SYSROOT ?= $(RISCV_TOOLCHAIN_PATH)/$(RISCV_PREFIX)
 
-LLVM_VORTEX ?= /opt/llvm-vortex
+LLVM_VORTEX ?= $(TOOLDIR)/llvm-vortex
 
 LLVM_CFLAGS += --sysroot=$(RISCV_SYSROOT)
 LLVM_CFLAGS += --gcc-toolchain=$(RISCV_TOOLCHAIN_PATH)
diff --git a/tests/kernel/common.mk b/tests/kernel/common.mk
new file mode 100644
index 00000000..7bf4b520
--- /dev/null
+++ b/tests/kernel/common.mk
@@ -0,0 +1,49 @@
+XLEN ?= 32
+TOOLDIR ?= /opt
+
+ifeq ($(XLEN),64)
+RISCV_TOOLCHAIN_PATH ?= $(TOOLDIR)/riscv64-gnu-toolchain
+CFLAGS += -march=rv64imafd -mabi=lp64d
+else
+RISCV_TOOLCHAIN_PATH ?= $(TOOLDIR)/riscv-gnu-toolchain
+CFLAGS += -march=rv32imaf -mabi=ilp32f
+endif
+
+RISCV_PREFIX ?= riscv$(XLEN)-unknown-elf
+
+VORTEX_KN_PATH ?= $(realpath ../../../kernel)
+
+CC = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-gcc
+AR = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-gcc-ar
+DP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objdump
+CP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objcopy
+
+SIM_DIR = ../../../sim
+
+CFLAGS += -O3 -mcmodel=medany -fno-exceptions -nostartfiles -fdata-sections -ffunction-sections
+CFLAGS += -I$(VORTEX_KN_PATH)/include -I$(VORTEX_KN_PATH)/../hw
+
+LDFLAGS += -lm -Wl,-Bstatic,--gc-sections,-T,$(VORTEX_KN_PATH)/linker/vx_link$(XLEN).ld,--defsym=STARTUP_ADDR=0x80000000 $(VORTEX_KN_PATH)/libvortexrt.a
+
+all: $(PROJECT).elf $(PROJECT).bin $(PROJECT).dump
+
+$(PROJECT).dump: $(PROJECT).elf
+	$(DP) -D $(PROJECT).elf > $(PROJECT).dump
+
+$(PROJECT).bin: $(PROJECT).elf
+	$(CP) -O binary $(PROJECT).elf $(PROJECT).bin
+
+$(PROJECT).elf: $(SRCS)
+	$(CC) $(CFLAGS) $(SRCS) $(LDFLAGS) -o $(PROJECT).elf
+
+run-rtlsim: $(PROJECT).bin
+	$(SIM_DIR)/rtlsim/rtlsim $(PROJECT).bin
+
+run-simx: $(PROJECT).bin
+	$(SIM_DIR)/simx/simx $(PROJECT).bin
+
+.depend: $(SRCS)
+	$(CC) $(CFLAGS) -MM $^ > .depend;
+
+clean:
+	rm -rf *.elf *.bin *.dump .depend 
diff --git a/tests/kernel/conform/Makefile b/tests/kernel/conform/Makefile
index e9897125..ee96978f 100644
--- a/tests/kernel/conform/Makefile
+++ b/tests/kernel/conform/Makefile
@@ -1,52 +1,5 @@
-XLEN ?= 32
-
-ifeq ($(XLEN),64)
-RISCV_TOOLCHAIN_PATH ?= /opt/riscv64-gnu-toolchain
-CFLAGS += -march=rv64imafd -mabi=lp64d
-else
-RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
-CFLAGS += -march=rv32imaf -mabi=ilp32f
-endif
-
-RISCV_PREFIX ?= riscv$(XLEN)-unknown-elf
-
-VORTEX_KN_PATH ?= $(realpath ../../../kernel)
-
-CC = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-gcc
-AR = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-gcc-ar
-DP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objdump
-CP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objcopy
-
-SIM_DIR = ../../../sim
-
-CFLAGS += -O3 -mcmodel=medany -fno-exceptions -nostartfiles -fdata-sections -ffunction-sections
-CFLAGS += -I$(VORTEX_KN_PATH)/include -I$(VORTEX_KN_PATH)/../hw
-
-LDFLAGS += -lm -Wl,-Bstatic,--gc-sections,-T,$(VORTEX_KN_PATH)/linker/vx_link$(XLEN).ld,--defsym=STARTUP_ADDR=0x80000000 $(VORTEX_KN_PATH)/libvortexrt.a
-
 PROJECT = conform
 
 SRCS = main.cpp tests.cpp
 
-all: $(PROJECT).elf $(PROJECT).bin $(PROJECT).dump
-
-$(PROJECT).dump: $(PROJECT).elf
-	$(DP) -D $(PROJECT).elf > $(PROJECT).dump
-
-$(PROJECT).bin: $(PROJECT).elf
-	$(CP) -O binary $(PROJECT).elf $(PROJECT).bin
-
-$(PROJECT).elf: $(SRCS)
-	$(CC) $(CFLAGS) $(SRCS) $(LDFLAGS) -o $(PROJECT).elf
-
-run-rtlsim: $(PROJECT).bin
-	$(SIM_DIR)/rtlsim/rtlsim $(PROJECT).bin
-
-run-simx: $(PROJECT).bin
-	$(SIM_DIR)/simx/simx $(PROJECT).bin
-
-.depend: $(SRCS)
-	$(CC) $(CFLAGS) -MM $^ > .depend;
-
-clean:
-	rm -rf *.elf *.bin *.dump .depend 
+include ../common.mk
diff --git a/tests/kernel/fibonacci/Makefile b/tests/kernel/fibonacci/Makefile
index 1338b4ab..d4486c74 100644
--- a/tests/kernel/fibonacci/Makefile
+++ b/tests/kernel/fibonacci/Makefile
@@ -1,52 +1,5 @@
-XLEN ?= 32
-
-ifeq ($(XLEN),64)
-RISCV_TOOLCHAIN_PATH ?= /opt/riscv64-gnu-toolchain
-CFLAGS += -march=rv64imafd -mabi=lp64d
-else
-RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
-CFLAGS += -march=rv32imaf -mabi=ilp32f
-endif
-
-RISCV_PREFIX ?= riscv$(XLEN)-unknown-elf
-
-VORTEX_KN_PATH ?= $(realpath ../../../kernel)
-
-CC = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-gcc
-AR = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-gcc-ar
-DP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objdump
-CP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objcopy
-
-SIM_DIR = ../../../sim
-
-CFLAGS += -O3 -mcmodel=medany -fno-exceptions -nostartfiles -fdata-sections -ffunction-sections
-CFLAGS += -I$(VORTEX_KN_PATH)/include -I$(VORTEX_KN_PATH)/../hw
-
-LDFLAGS += -lm -Wl,-Bstatic,--gc-sections,-T,$(VORTEX_KN_PATH)/linker/vx_link$(XLEN).ld,--defsym=STARTUP_ADDR=0x80000000 $(VORTEX_KN_PATH)/libvortexrt.a
-
 PROJECT = fibonacci
 
 SRCS = main.cpp
 
-all: $(PROJECT).elf $(PROJECT).bin $(PROJECT).dump
-
-$(PROJECT).dump: $(PROJECT).elf
-	$(DP) -D $(PROJECT).elf > $(PROJECT).dump
-
-$(PROJECT).bin: $(PROJECT).elf
-	$(CP) -O binary $(PROJECT).elf $(PROJECT).bin
-
-$(PROJECT).elf: $(SRCS)
-	$(CC) $(CFLAGS) $(SRCS) $(LDFLAGS) -o $(PROJECT).elf
-
-run-rtlsim: $(PROJECT).bin
-	$(SIM_DIR)/rtlsim/rtlsim $(PROJECT).bin
-
-run-simx: $(PROJECT).bin
-	$(SIM_DIR)/simx/simx $(PROJECT).bin
-
-.depend: $(SRCS)
-	$(CC) $(CFLAGS) -MM $^ > .depend;
-
-clean:
-	rm -rf *.elf *.bin *.dump .depend 
+include ../common.mk
diff --git a/tests/kernel/hello/Makefile b/tests/kernel/hello/Makefile
index 42d95256..4cff6cbd 100644
--- a/tests/kernel/hello/Makefile
+++ b/tests/kernel/hello/Makefile
@@ -1,52 +1,5 @@
-XLEN ?= 32
-
-ifeq ($(XLEN),64)
-RISCV_TOOLCHAIN_PATH ?= /opt/riscv64-gnu-toolchain
-CFLAGS += -march=rv64imafd -mabi=lp64d
-else
-RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
-CFLAGS += -march=rv32imaf -mabi=ilp32f
-endif
-
-RISCV_PREFIX ?= riscv$(XLEN)-unknown-elf
-
-VORTEX_KN_PATH ?= $(realpath ../../../kernel)
-
-CC = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-gcc
-AR = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-gcc-ar
-DP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objdump
-CP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objcopy
-
-SIM_DIR = ../../../sim
-
-CFLAGS += -O3 -v -mcmodel=medany -fno-exceptions -nostartfiles -fdata-sections -ffunction-sections
-CFLAGS += -I$(VORTEX_KN_PATH)/include -I$(VORTEX_KN_PATH)/../hw
-
-LDFLAGS += -lm -Wl,-Bstatic,--gc-sections,-T,$(VORTEX_KN_PATH)/linker/vx_link$(XLEN).ld,--defsym=STARTUP_ADDR=0x80000000 $(VORTEX_KN_PATH)/libvortexrt.a
-
 PROJECT = hello
 
 SRCS = main.cpp
 
-all: $(PROJECT).elf $(PROJECT).bin $(PROJECT).dump
-
-$(PROJECT).dump: $(PROJECT).elf
-	$(DP) -D $(PROJECT).elf > $(PROJECT).dump
-
-$(PROJECT).bin: $(PROJECT).elf
-	$(CP) -O binary $(PROJECT).elf $(PROJECT).bin
-
-$(PROJECT).elf: $(SRCS)
-	$(CC) $(CFLAGS) $(SRCS) $(LDFLAGS) -o $(PROJECT).elf
-
-run-rtlsim: $(PROJECT).bin
-	$(SIM_DIR)/rtlsim/rtlsim $(PROJECT).bin
-
-run-simx: $(PROJECT).bin
-	$(SIM_DIR)/simx/simx $(PROJECT).bin
-
-.depend: $(SRCS)
-	$(CC) $(CFLAGS) -MM $^ > .depend;
-
-clean:
-	rm -rf *.elf *.bin *.dump .depend 
+include ../common.mk
diff --git a/tests/opencl/common.mk b/tests/opencl/common.mk
index fc55ee0a..ce82dac3 100644
--- a/tests/opencl/common.mk
+++ b/tests/opencl/common.mk
@@ -1,4 +1,5 @@
 XLEN ?= 32
+TOOLDIR ?= /opt
 
 TARGET ?= opaesim
 
@@ -6,12 +7,12 @@ XRT_SYN_DIR  ?= ../../../hw/syn/xilinx/xrt
 XRT_DEVICE_INDEX ?= 0
 
 ifeq ($(XLEN),64)
-RISCV_TOOLCHAIN_PATH ?= /opt/riscv64-gnu-toolchain
+RISCV_TOOLCHAIN_PATH ?= $(TOOLDIR)/riscv64-gnu-toolchain
 VX_CFLAGS += -march=rv64imafd -mabi=lp64d
 K_CFLAGS += -march=rv64imafd -mabi=ilp64d
 STARTUP_ADDR ?= 0x180000000
 else
-RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
+RISCV_TOOLCHAIN_PATH ?= $(TOOLDIR)/riscv-gnu-toolchain
 VX_CFLAGS += -march=rv32imaf -mabi=ilp32f
 K_CFLAGS += -march=rv32imaf -mabi=ilp32f
 STARTUP_ADDR ?= 0x80000000
@@ -20,16 +21,16 @@ endif
 RISCV_PREFIX ?= riscv$(XLEN)-unknown-elf
 RISCV_SYSROOT ?= $(RISCV_TOOLCHAIN_PATH)/$(RISCV_PREFIX)
 
-POCL_CC_PATH ?= /opt/pocl/compiler
-POCL_RT_PATH ?= /opt/pocl/runtime
+POCL_CC_PATH ?= $(TOOLDIR)/pocl/compiler
+POCL_RT_PATH ?= $(TOOLDIR)/pocl/runtime
 
 VORTEX_RT_PATH ?= $(realpath ../../../runtime)
 VORTEX_KN_PATH ?= $(realpath ../../../kernel)
 
 FPGA_BIN_DIR ?= $(VORTEX_RT_PATH)/opae
 
-LLVM_VORTEX ?= /opt/llvm-vortex
-LLVM_POCL ?= /opt/llvm-vortex
+LLVM_VORTEX ?= $(TOOLDIR)/llvm-vortex
+LLVM_POCL ?= $(TOOLDIR)/llvm-vortex
 
 K_CFLAGS   += -v -O3 --sysroot=$(RISCV_SYSROOT) --gcc-toolchain=$(RISCV_TOOLCHAIN_PATH) -Xclang -target-feature -Xclang +vortex
 K_CFLAGS   += -fno-rtti -fno-exceptions -nostartfiles -fdata-sections -ffunction-sections
diff --git a/tests/regression/common.mk b/tests/regression/common.mk
index 006d6668..6a858edc 100644
--- a/tests/regression/common.mk
+++ b/tests/regression/common.mk
@@ -1,16 +1,18 @@
 XLEN ?= 32
 
+TOOLDIR ?= /opt
+
 TARGET ?= opaesim
 
 XRT_SYN_DIR ?= ../../../hw/syn/xilinx/xrt
 XRT_DEVICE_INDEX ?= 0
 
 ifeq ($(XLEN),64)
-RISCV_TOOLCHAIN_PATH ?= /opt/riscv64-gnu-toolchain
+RISCV_TOOLCHAIN_PATH ?= $(TOOLDIR)/riscv64-gnu-toolchain
 VX_CFLAGS += -march=rv64imafd -mabi=lp64d
 STARTUP_ADDR ?= 0x180000000
 else
-RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
+RISCV_TOOLCHAIN_PATH ?= $(TOOLDIR)/riscv-gnu-toolchain
 VX_CFLAGS += -march=rv32imaf -mabi=ilp32f
 STARTUP_ADDR ?= 0x80000000
 endif
@@ -23,7 +25,7 @@ VORTEX_KN_PATH ?= $(realpath ../../../kernel)
 
 FPGA_BIN_DIR ?= $(VORTEX_RT_PATH)/opae
 
-LLVM_VORTEX ?= /opt/llvm-vortex
+LLVM_VORTEX ?= $(TOOLDIR)/llvm-vortex
 
 LLVM_CFLAGS += --sysroot=$(RISCV_SYSROOT)
 LLVM_CFLAGS += --gcc-toolchain=$(RISCV_TOOLCHAIN_PATH)
diff --git a/tests/unittest/common.mk b/tests/unittest/common.mk
new file mode 100644
index 00000000..11add105
--- /dev/null
+++ b/tests/unittest/common.mk
@@ -0,0 +1,30 @@
+VORTEX_RT_PATH ?= $(realpath ../../../runtime)
+
+CXXFLAGS += -std=c++11 -Wall -Wextra -pedantic -Wfatal-errors
+
+CXXFLAGS += -I$(VORTEX_RT_PATH)/common
+
+# Debugigng
+ifdef DEBUG
+	CXXFLAGS += -g -O0
+else    
+	CXXFLAGS += -O2 -DNDEBUG
+endif
+
+all: $(PROJECT)
+
+$(PROJECT): $(SRCS)
+	$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
+
+run:
+	./$(PROJECT)
+
+clean:
+	rm -rf $(PROJECT) *.o .depend
+
+clean-all: clean
+	rm -rf *.elf *.bin *.dump
+
+ifneq ($(MAKECMDGOALS),clean)
+    -include .depend
+endif
diff --git a/tests/unittest/vx_malloc/Makefile b/tests/unittest/vx_malloc/Makefile
index 2d604620..2036fcde 100644
--- a/tests/unittest/vx_malloc/Makefile
+++ b/tests/unittest/vx_malloc/Makefile
@@ -1,34 +1,5 @@
-VORTEX_RT_PATH ?= $(realpath ../../../runtime)
-
-CXXFLAGS += -std=c++11 -Wall -Wextra -pedantic -Wfatal-errors
-
-CXXFLAGS += -I$(VORTEX_RT_PATH)/common
-
-# Debugigng
-ifdef DEBUG
-	CXXFLAGS += -g -O0
-else    
-	CXXFLAGS += -O2 -DNDEBUG
-endif
-
 PROJECT = vx_malloc
 
 SRCS = main.cpp
 
-all: $(PROJECT)
-
-$(PROJECT): $(SRCS)
-	$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
-
-run:
-	./$(PROJECT)
-
-clean:
-	rm -rf $(PROJECT) *.o .depend
-
-clean-all: clean
-	rm -rf *.elf *.bin *.dump
-
-ifneq ($(MAKECMDGOALS),clean)
-    -include .depend
-endif
\ No newline at end of file
+include ../common.mk

From a08d3ebd4250a7d52ee7741712dfb612c163be2f Mon Sep 17 00:00:00 2001
From: Blaise Tine <tinebp@yahoo.com>
Date: Sun, 12 Nov 2023 23:40:59 -0800
Subject: [PATCH 02/11] minor update

---
 hw/rtl/afu/xrt/VX_afu_wrap.sv       |   2 +-
 hw/rtl/{fpu => core}/VX_fpu_unit.sv |   0
 hw/rtl/core/VX_muldiv_unit.sv       |   5 +
 hw/rtl/core/VX_scoreboard.sv        |   3 +
 hw/rtl/fpu/VX_fpu_cvt.sv            | 206 ++++++++++------------------
 hw/rtl/fpu/VX_fpu_rounding.sv       |   1 -
 6 files changed, 81 insertions(+), 136 deletions(-)
 rename hw/rtl/{fpu => core}/VX_fpu_unit.sv (100%)

diff --git a/hw/rtl/afu/xrt/VX_afu_wrap.sv b/hw/rtl/afu/xrt/VX_afu_wrap.sv
index 3c4b3947..2abbbe43 100644
--- a/hw/rtl/afu/xrt/VX_afu_wrap.sv
+++ b/hw/rtl/afu/xrt/VX_afu_wrap.sv
@@ -262,7 +262,7 @@ module VX_afu_wrap #(
 		.m_axi_awready	(m_axi_mem_awready_a),
 		.m_axi_awaddr	(m_axi_mem_awaddr_w),
 		.m_axi_awid		(m_axi_mem_awid_a),
-		`UNUSED_PIN (m_axi_awlen),
+		.m_axi_awlen    (m_axi_mem_awlen_a),
 		`UNUSED_PIN (m_axi_awsize),
 		`UNUSED_PIN (m_axi_awburst),
 		`UNUSED_PIN (m_axi_awlock),
diff --git a/hw/rtl/fpu/VX_fpu_unit.sv b/hw/rtl/core/VX_fpu_unit.sv
similarity index 100%
rename from hw/rtl/fpu/VX_fpu_unit.sv
rename to hw/rtl/core/VX_fpu_unit.sv
diff --git a/hw/rtl/core/VX_muldiv_unit.sv b/hw/rtl/core/VX_muldiv_unit.sv
index f3c730d4..141cdb55 100644
--- a/hw/rtl/core/VX_muldiv_unit.sv
+++ b/hw/rtl/core/VX_muldiv_unit.sv
@@ -220,8 +220,13 @@ module VX_muldiv_unit #(
     wire [NUM_LANES-1:0][`XLEN-1:0] div_in2;
 
     for (genvar i = 0; i < NUM_LANES; ++i) begin
+    `ifdef XLEN_64
         assign div_in1[i] = is_alu_w ? {{(`XLEN-32){is_signed_op && execute_if.data.rs1_data[i][31]}}, execute_if.data.rs1_data[i][31:0]}: execute_if.data.rs1_data[i];
         assign div_in2[i] = is_alu_w ? {{(`XLEN-32){is_signed_op && execute_if.data.rs2_data[i][31]}}, execute_if.data.rs2_data[i][31:0]}: execute_if.data.rs2_data[i];
+    `else
+        assign div_in1[i] = execute_if.data.rs1_data[i];
+        assign div_in2[i] = execute_if.data.rs2_data[i];
+    `endif        
     end
 
 `ifdef IDIV_DPI    
diff --git a/hw/rtl/core/VX_scoreboard.sv b/hw/rtl/core/VX_scoreboard.sv
index ee5ae2ec..90a58134 100644
--- a/hw/rtl/core/VX_scoreboard.sv
+++ b/hw/rtl/core/VX_scoreboard.sv
@@ -107,6 +107,7 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
             .ready_out (scoreboard_if[i].ready)
         );
 
+    `ifdef SIMULATION
         reg [31:0] timeout_ctr;
     
         always @(posedge clk) begin
@@ -134,6 +135,8 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
         `RUNTIME_ASSERT(~writeback_fire || inuse_regs[writeback_if[i].data.wis][writeback_if[i].data.rd] != 0,
             ("%t: *** core%0d: invalid writeback register: wid=%0d, PC=0x%0h, tmask=%b, rd=%0d (#%0d)",
                 $time, CORE_ID, wis_to_wid(writeback_if[i].data.wis, i), writeback_if[i].data.PC, writeback_if[i].data.tmask, writeback_if[i].data.rd, writeback_if[i].data.uuid));
+    `endif
+    
     end    
 
 endmodule
diff --git a/hw/rtl/fpu/VX_fpu_cvt.sv b/hw/rtl/fpu/VX_fpu_cvt.sv
index 34b2ed28..d668539b 100644
--- a/hw/rtl/fpu/VX_fpu_cvt.sv
+++ b/hw/rtl/fpu/VX_fpu_cvt.sv
@@ -52,30 +52,27 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
  
     localparam MAN_BITS = 23;
     localparam EXP_BITS = 8;
-    localparam EXP_BIAS = 2**(EXP_BITS-1)-1;    
-
-    localparam logic [EXP_BITS-1:0] QNAN_EXPONENT = 2**EXP_BITS-1;
-    localparam logic [MAN_BITS-1:0] QNAN_MANTISSA = 2**(MAN_BITS-1);
+    localparam EXP_BIAS = 2**(EXP_BITS-1)-1;
     
     // Use 32-bit integer
-    localparam MAX_INT_WIDTH = 32;
+    localparam INT_WIDTH = 32;
 
     // The internal mantissa includes normal bit or an entire integer
-    localparam INT_MAN_WIDTH = `MAX(MAN_BITS + 1, MAX_INT_WIDTH);
+    localparam INT_MAN_WIDTH = `MAX(MAN_BITS + 1, INT_WIDTH);
 
     // The lower 2p+3 bits of the internal FMA result will be needed for leading-zero detection
     localparam LZC_RESULT_WIDTH = `CLOG2(INT_MAN_WIDTH);
 
     // The internal exponent must be able to represent the smallest denormal input value as signed
     // or the number of bits in an integer
-    localparam INT_EXP_WIDTH = `MAX(`CLOG2(MAX_INT_WIDTH), `MAX(EXP_BITS, `CLOG2(EXP_BIAS + MAN_BITS))) + 1;
+    localparam INT_EXP_WIDTH = `MAX(`CLOG2(INT_WIDTH), `MAX(EXP_BITS, `CLOG2(EXP_BIAS + MAN_BITS))) + 1;
 
     // shift amount for denormalization
     localparam SHAMT_BITS = `CLOG2(INT_MAN_WIDTH+1);
 
     localparam FMT_SHIFT_COMPENSATION = INT_MAN_WIDTH - 1 - MAN_BITS;
     localparam NUM_FP_STICKY  = 2 * INT_MAN_WIDTH - MAN_BITS - 1;   // removed mantissa, 1. and R
-    localparam NUM_INT_STICKY = 2 * INT_MAN_WIDTH - MAX_INT_WIDTH;  // removed int and R
+    localparam NUM_INT_STICKY = 2 * INT_MAN_WIDTH - INT_WIDTH;  // removed int and R
     
     // Input processing
     
@@ -86,8 +83,8 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
             .EXP_BITS (EXP_BITS),
             .MAN_BITS (MAN_BITS)
         ) fp_class (
-            .exp_i  (dataa[i][30:23]),
-            .man_i  (dataa[i][22:0]),
+            .exp_i  (dataa[i][INT_WIDTH-2:MAN_BITS]),
+            .man_i  (dataa[i][MAN_BITS-1:0]),
             .clss_o (fclass[i])
         );
     end
@@ -97,15 +94,13 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
     wire [NUM_LANES-1:0]                    input_sign;
     
     for (genvar i = 0; i < NUM_LANES; ++i) begin
-        wire [INT_MAN_WIDTH-1:0] int_mantissa;
-        wire [INT_MAN_WIDTH-1:0] fmt_mantissa;
-        wire fmt_sign        = dataa[i][31];
-        wire int_sign        = dataa[i][31] && is_signed;
-        assign int_mantissa  = int_sign ? (-dataa[i]) : dataa[i];
-        assign fmt_mantissa  = INT_MAN_WIDTH'({fclass[i].is_normal, dataa[i][MAN_BITS-1:0]});
+        wire i2f_sign = dataa[i][INT_WIDTH-1];
+        wire f2i_sign = dataa[i][INT_WIDTH-1] && is_signed;
+        wire [INT_MAN_WIDTH-1:0] f2i_mantissa = f2i_sign ? (-dataa[i]) : dataa[i];
+        wire [INT_MAN_WIDTH-1:0] i2f_mantissa = INT_MAN_WIDTH'({fclass[i].is_normal, dataa[i][MAN_BITS-1:0]});
         assign input_exp[i]  = {1'b0, dataa[i][MAN_BITS +: EXP_BITS]} + INT_EXP_WIDTH'({1'b0, fclass[i].is_subnormal});
-        assign input_mant[i] = is_itof ? int_mantissa : fmt_mantissa;
-        assign input_sign[i] = is_itof ? int_sign : fmt_sign;
+        assign input_mant[i] = is_itof ? f2i_mantissa : i2f_mantissa;
+        assign input_sign[i] = is_itof ? f2i_sign : i2f_sign;
     end
 
     // Pipeline stage0
@@ -159,9 +154,9 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
         assign input_mant_n_s0[i] = encoded_mant_s0[i] << renorm_shamt_s0[i];
 
         // Unbias exponent and compensate for shift
-        wire [INT_EXP_WIDTH-1:0] fp_input_exp_s0 = fmt_exponent_s0[i] + INT_EXP_WIDTH'(FMT_SHIFT_COMPENSATION - EXP_BIAS) - INT_EXP_WIDTH'({1'b0, renorm_shamt_s0[i]});
-        wire [INT_EXP_WIDTH-1:0] int_input_exp_s0 = INT_EXP_WIDTH'(INT_MAN_WIDTH-1) - INT_EXP_WIDTH'({1'b0, renorm_shamt_s0[i]});
-        assign input_exp_n_s0[i] = is_itof_s0 ? int_input_exp_s0 : fp_input_exp_s0;
+        wire [INT_EXP_WIDTH-1:0] i2f_input_exp_s0 = fmt_exponent_s0[i] + INT_EXP_WIDTH'(FMT_SHIFT_COMPENSATION - EXP_BIAS) - INT_EXP_WIDTH'({1'b0, renorm_shamt_s0[i]});
+        wire [INT_EXP_WIDTH-1:0] f2i_input_exp_s0 = INT_EXP_WIDTH'(INT_MAN_WIDTH-1) - INT_EXP_WIDTH'({1'b0, renorm_shamt_s0[i]});
+        assign input_exp_n_s0[i] = is_itof_s0 ? f2i_input_exp_s0 : i2f_input_exp_s0;
     end
 
     // Pipeline stage1
@@ -193,51 +188,32 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
 
     wire [NUM_LANES-1:0][2*INT_MAN_WIDTH:0] destination_mant_s1;
     wire [NUM_LANES-1:0][INT_EXP_WIDTH-1:0] final_exp_s1;
-    wire [NUM_LANES-1:0]                    of_before_round_s1;
+    wire [NUM_LANES-1:0] of_before_round_s1;
 
-    for (genvar i = 0; i < NUM_LANES; ++i) begin
-        reg [2*INT_MAN_WIDTH:0] preshift_mant_s1;   // mantissa before final shift                
-        reg [SHAMT_BITS-1:0]    denorm_shamt_s1;    // shift amount for denormalization
-        reg [INT_EXP_WIDTH-1:0] final_exp_tmp_s1;   // after eventual adjustments
-        reg                     of_before_round_tmp_s1;
+    for (genvar i = 0; i < NUM_LANES; ++i) begin           
+        reg [SHAMT_BITS-1:0] denorm_shamt_s1;    // shift amount for denormalization
+        reg of_before_round_tmp_s1;
 
         always @(*) begin
-            final_exp_tmp_s1 = input_exp_s1[i] + INT_EXP_WIDTH'(EXP_BIAS); // take exponent as is, only look at lower bits
-            preshift_mant_s1 = {input_mant_s1[i], 33'b0};
             denorm_shamt_s1  = '0;
             of_before_round_tmp_s1 = 1'b0;
 
-            if (is_itof_s1) begin                   
-                if ($signed(input_exp_s1[i]) >= INT_EXP_WIDTH'($signed(2**EXP_BITS-1-EXP_BIAS))) begin
-                    // Overflow or infinities (for proper rounding)
-                    final_exp_tmp_s1 = (2**EXP_BITS-2); // largest normal value
-                    preshift_mant_s1 = ~0;  // largest normal value and RS bits set
-                    of_before_round_tmp_s1 = 1'b1;
-                end else if ($signed(input_exp_s1[i]) < INT_EXP_WIDTH'($signed(-MAN_BITS-EXP_BIAS))) begin
-                    // Limit the shift to retain sticky bits
-                    final_exp_tmp_s1 = '0; // denormal result
-                    denorm_shamt_s1  = (2 + MAN_BITS); // to sticky                
-                end else if ($signed(input_exp_s1[i]) < INT_EXP_WIDTH'($signed(1-EXP_BIAS))) begin
-                    // Denormalize underflowing values
-                    final_exp_tmp_s1 = '0; // denormal result
-                    denorm_shamt_s1  = SHAMT_BITS'(1-EXP_BIAS) - SHAMT_BITS'(input_exp_s1[i]); // adjust right shifting               
-                end
-            end else begin
-                if ($signed(input_exp_s1[i]) >= $signed(INT_EXP_WIDTH'(MAX_INT_WIDTH-1) + INT_EXP_WIDTH'(unsigned_s1))) begin
-                    // overflow: when converting to unsigned the range is larger by one
+            if (!is_itof_s1) begin
+                if ($signed(input_exp_s1[i]) >= $signed(INT_EXP_WIDTH'(INT_WIDTH-1) + INT_EXP_WIDTH'(unsigned_s1))) begin
+                    // overflow
                     of_before_round_tmp_s1 = 1'b1;                
                 end else if ($signed(input_exp_s1[i]) < INT_EXP_WIDTH'($signed(-1))) begin
                     // underflow
-                    denorm_shamt_s1 = MAX_INT_WIDTH+1; // all bits go to the sticky
+                    denorm_shamt_s1 = INT_WIDTH+1; // all bits go to the sticky
                 end else begin
                     // By default right shift mantissa to be an integer
-                    denorm_shamt_s1 = SHAMT_BITS'(MAX_INT_WIDTH-1) - SHAMT_BITS'(input_exp_s1[i]);
+                    denorm_shamt_s1 = SHAMT_BITS'(INT_WIDTH-1) - SHAMT_BITS'(input_exp_s1[i]);
                 end              
             end
         end
 
-        assign destination_mant_s1[i] = preshift_mant_s1 >> denorm_shamt_s1;
-        assign final_exp_s1[i]        = final_exp_tmp_s1;
+        assign destination_mant_s1[i] = {input_mant_s1[i], 33'b0} >> denorm_shamt_s1;
+        assign final_exp_s1[i]        = input_exp_s1[i] + INT_EXP_WIDTH'(EXP_BIAS);
         assign of_before_round_s1[i]  = of_before_round_tmp_s1;
     end
 
@@ -267,33 +243,33 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
         .data_out ({valid_in_s2, lane_mask_s2, tag_in_s2, is_itof_s2, unsigned_s2, rnd_mode_s2, fclass_s2, mant_is_zero_s2, input_sign_s2, destination_mant_s2, final_exp_s2, of_before_round_s2})
     );
 
-    wire [NUM_LANES-1:0]       rounded_sign_s2;
-    wire [NUM_LANES-1:0][31:0] rounded_abs_s2;      // absolute value of result after rounding
-    wire [NUM_LANES-1:0]       int_round_has_sticky_s2;
-    wire [NUM_LANES-1:0]       fp_round_has_sticky_s2;
+    wire [NUM_LANES-1:0]    rounded_sign_s2;
+    wire [NUM_LANES-1:0][INT_WIDTH-1:0] rounded_abs_s2; // absolute value of result after rounding
+    wire [NUM_LANES-1:0]    f2i_round_has_sticky_s2;
+    wire [NUM_LANES-1:0]    i2f_round_has_sticky_s2;
     
     // Rouding and classification
    
     for (genvar i = 0; i < NUM_LANES; ++i) begin
-        wire [MAN_BITS-1:0]      final_mant_s2;        // mantissa after adjustments
-        wire [MAX_INT_WIDTH-1:0] final_int_s2;         // integer shifted in position
-        wire [1:0]               round_sticky_bits_s2;
-        wire [31:0]              fmt_pre_round_abs_s2;
-        wire [31:0]              pre_round_abs_s2;
-        wire [1:0]               int_round_sticky_bits_s2, fp_round_sticky_bits_s2;
+        wire [MAN_BITS-1:0]     final_mant_s2;  // mantissa after adjustments
+        wire [INT_WIDTH-1:0]    final_int_s2;   // integer shifted in position
+        wire [1:0]              round_sticky_bits_s2;
+        wire [INT_WIDTH-1:0]    fmt_pre_round_abs_s2;
+        wire [INT_WIDTH-1:0]    pre_round_abs_s2;
+        wire [1:0]              f2i_round_sticky_bits_s2, i2f_round_sticky_bits_s2;
 
         // Extract final mantissa and round bit, discard the normal bit (for FP)
-        assign {final_mant_s2, fp_round_sticky_bits_s2[1]} = destination_mant_s2[i][2*INT_MAN_WIDTH-1 : 2*INT_MAN_WIDTH-1 - (MAN_BITS+1) + 1];
-        assign {final_int_s2, int_round_sticky_bits_s2[1]} = destination_mant_s2[i][2*INT_MAN_WIDTH   : 2*INT_MAN_WIDTH   - (MAX_INT_WIDTH+1) + 1];
+        assign {final_mant_s2, i2f_round_sticky_bits_s2[1]} = destination_mant_s2[i][2*INT_MAN_WIDTH-1 : 2*INT_MAN_WIDTH-1 - (MAN_BITS+1) + 1];
+        assign {final_int_s2, f2i_round_sticky_bits_s2[1]} = destination_mant_s2[i][2*INT_MAN_WIDTH   : 2*INT_MAN_WIDTH   - (INT_WIDTH+1) + 1];
 
         // Collapse sticky bits
-        assign fp_round_sticky_bits_s2[0]  = (| destination_mant_s2[i][NUM_FP_STICKY-1:0]);
-        assign int_round_sticky_bits_s2[0] = (| destination_mant_s2[i][NUM_INT_STICKY-1:0]);
-        assign fp_round_has_sticky_s2[i]   = (| fp_round_sticky_bits_s2);
-        assign int_round_has_sticky_s2[i]  = (| int_round_sticky_bits_s2);
+        assign i2f_round_sticky_bits_s2[0] = (| destination_mant_s2[i][NUM_FP_STICKY-1:0]);
+        assign f2i_round_sticky_bits_s2[0] = (| destination_mant_s2[i][NUM_INT_STICKY-1:0]);
+        assign i2f_round_has_sticky_s2[i]  = (| i2f_round_sticky_bits_s2);
+        assign f2i_round_has_sticky_s2[i]  = (| f2i_round_sticky_bits_s2);
 
         // select RS bits for destination operation
-        assign round_sticky_bits_s2 = is_itof_s2 ? fp_round_sticky_bits_s2 : int_round_sticky_bits_s2;
+        assign round_sticky_bits_s2 = is_itof_s2 ? i2f_round_sticky_bits_s2 : f2i_round_sticky_bits_s2;
 
         // Pack exponent and mantissa into proper rounding form
         assign fmt_pre_round_abs_s2 = {1'b0, final_exp_s2[i][EXP_BITS-1:0], final_mant_s2[MAN_BITS-1:0]};
@@ -327,10 +303,10 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
     wire [NUM_LANES-1:0] mant_is_zero_s3;
     wire [NUM_LANES-1:0] input_sign_s3;
     wire [NUM_LANES-1:0] rounded_sign_s3;
-    wire [NUM_LANES-1:0][31:0] rounded_abs_s3;
+    wire [NUM_LANES-1:0][INT_WIDTH-1:0] rounded_abs_s3;
     wire [NUM_LANES-1:0] of_before_round_s3;   
-    wire [NUM_LANES-1:0] int_round_has_sticky_s3;
-    wire [NUM_LANES-1:0] fp_round_has_sticky_s3; 
+    wire [NUM_LANES-1:0] f2i_round_has_sticky_s3;
+    wire [NUM_LANES-1:0] i2f_round_has_sticky_s3; 
 
     VX_pipe_register #(
         .DATAW  (1 + NUM_LANES + TAGW + 1 + 1 + NUM_LANES * ($bits(fclass_t) + 1 + 1 + 32 + 1 + 1 + 1 + 1)),
@@ -339,105 +315,68 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
         .clk      (clk),
         .reset    (reset),
         .enable   (~stall),
-        .data_in  ({valid_in_s2, lane_mask_s2, tag_in_s2, is_itof_s2, unsigned_s2, fclass_s2, mant_is_zero_s2, input_sign_s2, rounded_abs_s2, rounded_sign_s2, of_before_round_s2, int_round_has_sticky_s2, fp_round_has_sticky_s2}),
-        .data_out ({valid_in_s3, lane_mask_s3, tag_in_s3, is_itof_s3, unsigned_s3, fclass_s3, mant_is_zero_s3, input_sign_s3, rounded_abs_s3, rounded_sign_s3, of_before_round_s3, int_round_has_sticky_s3, fp_round_has_sticky_s3})
+        .data_in  ({valid_in_s2, lane_mask_s2, tag_in_s2, is_itof_s2, unsigned_s2, fclass_s2, mant_is_zero_s2, input_sign_s2, rounded_abs_s2, rounded_sign_s2, of_before_round_s2, f2i_round_has_sticky_s2, i2f_round_has_sticky_s2}),
+        .data_out ({valid_in_s3, lane_mask_s3, tag_in_s3, is_itof_s3, unsigned_s3, fclass_s3, mant_is_zero_s3, input_sign_s3, rounded_abs_s3, rounded_sign_s3, of_before_round_s3, f2i_round_has_sticky_s3, i2f_round_has_sticky_s3})
     );
      
-    wire [NUM_LANES-1:0] of_after_round_s3;
-    wire [NUM_LANES-1:0] uf_after_round_s3;
-    wire [NUM_LANES-1:0][31:0] fmt_result_s3;
-    wire [NUM_LANES-1:0][31:0] rounded_int_res_s3; // after possible inversion
+    wire [NUM_LANES-1:0][INT_WIDTH-1:0] fmt_result_s3;
+    wire [NUM_LANES-1:0][INT_WIDTH-1:0] rounded_int_res_s3; // after possible inversion
     wire [NUM_LANES-1:0] rounded_int_res_zero_s3;  // after rounding
 
     for (genvar i = 0; i < NUM_LANES; ++i) begin
         // Assemble regular result, nan box short ones. Int zeroes need to be detected
-        assign fmt_result_s3[i] = (is_itof_s3 & mant_is_zero_s3[i]) ? 0 : {rounded_sign_s3[i], rounded_abs_s3[i][EXP_BITS+MAN_BITS-1:0]};
-
-        // Classification after rounding select by destination format
-        assign uf_after_round_s3[i] = (rounded_abs_s3[i][EXP_BITS+MAN_BITS-1:MAN_BITS] == 0);  // denormal
-        assign of_after_round_s3[i] = (rounded_abs_s3[i][EXP_BITS+MAN_BITS-1:MAN_BITS] == ~0); // inf exp.
+        assign fmt_result_s3[i] = mant_is_zero_s3[i] ? 0 : {rounded_sign_s3[i], rounded_abs_s3[i][EXP_BITS+MAN_BITS-1:0]};
 
         // Negative integer result needs to be brought into two's complement
         assign rounded_int_res_s3[i] = rounded_sign_s3[i] ? (-rounded_abs_s3[i]) : rounded_abs_s3[i];
         assign rounded_int_res_zero_s3[i] = (rounded_int_res_s3[i] == 0);
     end
 
-    // FP Special case handling
+    // F2I Special case handling
 
-    wire [NUM_LANES-1:0][31:0] fp_special_result_s3;
-    fflags_t [NUM_LANES-1:0]   fp_special_status_s3;
-    wire [NUM_LANES-1:0]       fp_result_is_special_s3;
-
-    for (genvar i = 0; i < NUM_LANES; ++i) begin
-        // Detect special case from source format, I2F casts don't produce a special result
-        assign fp_result_is_special_s3[i] = ~is_itof_s3 & (fclass_s3[i].is_zero | fclass_s3[i].is_nan);
-
-        // Signalling input NaNs raise invalid flag, otherwise no flags set
-        assign fp_special_status_s3[i] = fclass_s3[i].is_signaling ? {1'b1, 4'h0} : 5'h0; // invalid operation
-
-        // Assemble result according to destination format
-        assign fp_special_result_s3[i] = fclass_s3[i].is_zero ? (32'(input_sign_s3) << 31) // signed zero
-                                                              : {1'b0, QNAN_EXPONENT, QNAN_MANTISSA}; // qNaN
-    end
-
-    // INT Special case handling
-
-    reg [NUM_LANES-1:0][31:0] int_special_result_s3;
-    fflags_t [NUM_LANES-1:0]  int_special_status_s3;
-    wire [NUM_LANES-1:0]      int_result_is_special_s3;
+    reg [NUM_LANES-1:0][INT_WIDTH-1:0] f2i_special_result_s3;
+    fflags_t [NUM_LANES-1:0]  f2i_special_status_s3;
+    wire [NUM_LANES-1:0]      f2i_result_is_special_s3;
 
     for (genvar i = 0; i < NUM_LANES; ++i) begin
          // Assemble result according to destination format
         always @(*) begin
             if (input_sign_s3[i] && !fclass_s3[i].is_nan) begin
-                int_special_result_s3[i][30:0] = '0;            // alone yields 2**(31)-1
-                int_special_result_s3[i][31]   = ~unsigned_s3;  // for unsigned casts yields 2**31
+                f2i_special_result_s3[i][INT_WIDTH-2:0] = '0;            // alone yields 2**(31)-1
+                f2i_special_result_s3[i][INT_WIDTH-1]   = ~unsigned_s3;  // for unsigned casts yields 2**31
             end else begin
-                int_special_result_s3[i][30:0] = 2**(31) - 1;   // alone yields 2**(31)-1
-                int_special_result_s3[i][31]   = unsigned_s3;   // for unsigned casts yields 2**31
+                f2i_special_result_s3[i][INT_WIDTH-2:0] = 2**(INT_WIDTH-1) - 1;   // alone yields 2**(31)-1
+                f2i_special_result_s3[i][INT_WIDTH-1]   = unsigned_s3;   // for unsigned casts yields 2**31
             end
         end            
 
         // Detect special case from source format (inf, nan, overflow, nan-boxing or negative unsigned)
-        assign int_result_is_special_s3[i] = fclass_s3[i].is_nan 
+        assign f2i_result_is_special_s3[i] = fclass_s3[i].is_nan 
                                            | fclass_s3[i].is_inf
                                            | of_before_round_s3[i]
                                            | (input_sign_s3[i] & unsigned_s3 & ~rounded_int_res_zero_s3[i]);
                                         
         // All integer special cases are invalid
-        assign int_special_status_s3[i] = {1'b1, 4'h0};
+        assign f2i_special_status_s3[i] = {1'b1, 4'h0};
     end
 
     // Result selection and Output handshake
 
     fflags_t [NUM_LANES-1:0] tmp_fflags_s3;    
-    wire [NUM_LANES-1:0][31:0] tmp_result_s3;
+    wire [NUM_LANES-1:0][INT_WIDTH-1:0] tmp_result_s3;
 
-    for (genvar i = 0; i < NUM_LANES; ++i) begin
-        fflags_t fp_regular_status_s3, int_regular_status_s3;
-        fflags_t fp_status_s3, int_status_s3;    
-        wire [31:0] fp_result_s3, int_result_s3;
+    for (genvar i = 0; i < NUM_LANES; ++i) begin                                  
+        fflags_t i2f_regular_status_s3 = i2f_round_has_sticky_s3[i] ? 5'h1 : 5'h0;
+        fflags_t f2i_regular_status_s3 = f2i_round_has_sticky_s3[i] ? 5'h1 : 5'h0;
 
-        wire inexact_s3 = is_itof_s3 ? fp_round_has_sticky_s3[i] // overflow is invalid in i2f;        
-                                     : (fp_round_has_sticky_s3[i] || (~fclass_s3[i].is_inf && (of_before_round_s3[i] || of_after_round_s3[i])));
-                                  
-        assign fp_regular_status_s3.NV = is_itof_s3 & (of_before_round_s3[i] | of_after_round_s3[i]); // overflow is invalid for I2F casts
-        assign fp_regular_status_s3.DZ = 1'b0; // no divisions
-        assign fp_regular_status_s3.OF = ~is_itof_s3 & (~fclass_s3[i].is_inf & (of_before_round_s3[i] | of_after_round_s3[i])); // inf casts no OF
-        assign fp_regular_status_s3.UF = uf_after_round_s3[i] & inexact_s3;
-        assign fp_regular_status_s3.NX = inexact_s3;
+        fflags_t i2f_status_s3 = i2f_regular_status_s3;
+        fflags_t f2i_status_s3 = f2i_result_is_special_s3[i] ? f2i_special_status_s3[i] : f2i_regular_status_s3;
 
-        assign int_regular_status_s3 = int_round_has_sticky_s3[i] ? {4'h0, 1'b1} : 5'h0;
+        wire [INT_WIDTH-1:0] i2f_result_s3 = fmt_result_s3[i];
+        wire [INT_WIDTH-1:0] f2i_result_s3 = f2i_result_is_special_s3[i] ? f2i_special_result_s3[i] : rounded_int_res_s3[i];
 
-        assign fp_result_s3  = fp_result_is_special_s3[i]  ? fp_special_result_s3[i]  : fmt_result_s3[i];        
-        assign int_result_s3 = int_result_is_special_s3[i] ? int_special_result_s3[i] : rounded_int_res_s3[i];
-
-        assign fp_status_s3  = fp_result_is_special_s3[i]  ? fp_special_status_s3[i]  : fp_regular_status_s3;
-        assign int_status_s3 = int_result_is_special_s3[i] ? int_special_status_s3[i] : int_regular_status_s3;
-
-        // Select output depending on special case detection
-        assign tmp_result_s3[i] = is_itof_s3 ? fp_result_s3 : int_result_s3;
-        assign tmp_fflags_s3[i] = is_itof_s3 ? fp_status_s3 : int_status_s3;
+        assign tmp_result_s3[i] = is_itof_s3 ? i2f_result_s3 : f2i_result_s3;
+        assign tmp_fflags_s3[i] = is_itof_s3 ? i2f_status_s3 : f2i_status_s3;
     end
 
     assign stall = ~ready_out && valid_out;
@@ -457,7 +396,6 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
     );
 
     assign ready_in = ~stall;
-
     assign has_fflags = 1'b1;
 
 endmodule
diff --git a/hw/rtl/fpu/VX_fpu_rounding.sv b/hw/rtl/fpu/VX_fpu_rounding.sv
index 5168fada..877b4eb6 100644
--- a/hw/rtl/fpu/VX_fpu_rounding.sv
+++ b/hw/rtl/fpu/VX_fpu_rounding.sv
@@ -54,7 +54,6 @@ module VX_fpu_rounding #(
                       2'b01: round_up = 1'b0;            // < ulp/2 away, round down
                       2'b10: round_up = abs_value_i[0];  // = ulp/2 away, round towards even result
                       2'b11: round_up = 1'b1;            // > ulp/2 away, round up
-                    default: round_up = 1'bx;
                 endcase
             `INST_FRM_RTZ: round_up = 1'b0; // always round down
             `INST_FRM_RDN: round_up = (| round_sticky_bits_i) & sign_i;  // to 0 if +, away if -

From b274b8cc217683ec9834b5393bd75406da395f6b Mon Sep 17 00:00:00 2001
From: Blaise Tine <tinebp@yahoo.com>
Date: Mon, 13 Nov 2023 00:23:15 -0800
Subject: [PATCH 03/11] minor updates

---
 hw/rtl/core/VX_operands.sv | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/hw/rtl/core/VX_operands.sv b/hw/rtl/core/VX_operands.sv
index 3d2c570c..3ff5df46 100644
--- a/hw/rtl/core/VX_operands.sv
+++ b/hw/rtl/core/VX_operands.sv
@@ -38,9 +38,12 @@ module VX_operands import VX_gpu_pkg::*; #(
         reg [`NR_BITS-1:0] gpr_rd_rid, gpr_rd_rid_n;
         reg [ISSUE_WIS_W-1:0] gpr_rd_wis, gpr_rd_wis_n;
 
-        reg [ISSUE_RATIO-1:0][`NUM_THREADS-1:0][`XLEN-1:0] cache_data, cache_data_n;
-        reg [ISSUE_RATIO-1:0][`NR_BITS-1:0] cache_reg, cache_reg_n;
-        reg [ISSUE_RATIO-1:0][`NUM_THREADS-1:0] cache_tmask, cache_tmask_n;
+        reg [`NUM_THREADS-1:0][`XLEN-1:0] cache_data [ISSUE_RATIO-1:0];
+        reg [`NUM_THREADS-1:0][`XLEN-1:0] cache_data_n [ISSUE_RATIO-1:0];
+        reg [`NR_BITS-1:0] cache_reg [ISSUE_RATIO-1:0];
+        reg [`NR_BITS-1:0] cache_reg_n [ISSUE_RATIO-1:0];
+        reg [`NUM_THREADS-1:0] cache_tmask [ISSUE_RATIO-1:0];
+        reg [`NUM_THREADS-1:0] cache_tmask_n [ISSUE_RATIO-1:0];
         reg [ISSUE_RATIO-1:0] cache_eop, cache_eop_n;
 
         reg [`NUM_THREADS-1:0][`XLEN-1:0] rs1_data, rs1_data_n;
@@ -160,11 +163,8 @@ module VX_operands import VX_gpu_pkg::*; #(
                     end
                     cache_reg_n[writeback_if[i].data.wis] = writeback_if[i].data.rd;
                     cache_eop_n[writeback_if[i].data.wis] = writeback_if[i].data.eop;
-                    if (writeback_if[i].data.sop) begin
-                        cache_tmask_n[writeback_if[i].data.wis] = writeback_if[i].data.tmask;
-                    end else begin
-                        cache_tmask_n[writeback_if[i].data.wis] |= writeback_if[i].data.tmask;
-                    end                
+                    cache_tmask_n[writeback_if[i].data.wis] = writeback_if[i].data.sop ? writeback_if[i].data.tmask : 
+                                                                    (cache_tmask_n[writeback_if[i].data.wis] | writeback_if[i].data.tmask);
                 end
             end
         end
@@ -175,7 +175,6 @@ module VX_operands import VX_gpu_pkg::*; #(
                 gpr_rd_rid  <= '0;
                 gpr_rd_wis  <= '0;
                 cache_eop   <= {ISSUE_RATIO{1'b1}};
-                cache_reg   <= '0;
                 data_ready  <= 0;
             end else begin
                 state       <= state_n;

From ecf546bc4ab08e1f6f982275bc8fd15c52ce7b64 Mon Sep 17 00:00:00 2001
From: Blaise Tine <tinebp@yahoo.com>
Date: Mon, 13 Nov 2023 20:00:39 -0800
Subject: [PATCH 04/11] minor update

---
 hw/rtl/fpu/VX_fpu_cvt.sv | 131 ++++++++++++++++++---------------------
 1 file changed, 60 insertions(+), 71 deletions(-)

diff --git a/hw/rtl/fpu/VX_fpu_cvt.sv b/hw/rtl/fpu/VX_fpu_cvt.sv
index d668539b..e12e51ad 100644
--- a/hw/rtl/fpu/VX_fpu_cvt.sv
+++ b/hw/rtl/fpu/VX_fpu_cvt.sv
@@ -67,9 +67,6 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
     // or the number of bits in an integer
     localparam INT_EXP_WIDTH = `MAX(`CLOG2(INT_WIDTH), `MAX(EXP_BITS, `CLOG2(EXP_BIAS + MAN_BITS))) + 1;
 
-    // shift amount for denormalization
-    localparam SHAMT_BITS = `CLOG2(INT_MAN_WIDTH+1);
-
     localparam FMT_SHIFT_COMPENSATION = INT_MAN_WIDTH - 1 - MAN_BITS;
     localparam NUM_FP_STICKY  = 2 * INT_MAN_WIDTH - MAN_BITS - 1;   // removed mantissa, 1. and R
     localparam NUM_INT_STICKY = 2 * INT_MAN_WIDTH - INT_WIDTH;  // removed int and R
@@ -105,14 +102,14 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
 
     // Pipeline stage0
     
-    wire                    valid_in_s0;
-    wire [NUM_LANES-1:0]    lane_mask_s0;
-    wire [TAGW-1:0]         tag_in_s0;
-    wire                    is_itof_s0;
-    wire                    unsigned_s0;
-    wire [2:0]              rnd_mode_s0;
+    wire                 valid_in_s0;
+    wire [NUM_LANES-1:0] lane_mask_s0;
+    wire [TAGW-1:0]      tag_in_s0;
+    wire                 is_itof_s0;
+    wire                 is_signed_s0;
+    wire [2:0]           rnd_mode_s0;
     fclass_t [NUM_LANES-1:0] fclass_s0;
-    wire [NUM_LANES-1:0]    input_sign_s0;
+    wire [NUM_LANES-1:0] input_sign_s0;
     wire [NUM_LANES-1:0][INT_EXP_WIDTH-1:0] fmt_exponent_s0;
     wire [NUM_LANES-1:0][INT_MAN_WIDTH-1:0] encoded_mant_s0;
 
@@ -125,8 +122,8 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
         .clk      (clk),
         .reset    (reset),
         .enable   (~stall),
-        .data_in  ({valid_in, lane_mask, tag_in, is_itof, !is_signed, frm, fclass, input_sign, input_exp, input_mant}),
-        .data_out ({valid_in_s0, lane_mask_s0, tag_in_s0, is_itof_s0, unsigned_s0, rnd_mode_s0, fclass_s0, input_sign_s0, fmt_exponent_s0, encoded_mant_s0})
+        .data_in  ({valid_in, lane_mask, tag_in, is_itof, is_signed, frm, fclass, input_sign, input_exp, input_mant}),
+        .data_out ({valid_in_s0, lane_mask_s0, tag_in_s0, is_itof_s0, is_signed_s0, rnd_mode_s0, fclass_s0, input_sign_s0, fmt_exponent_s0, encoded_mant_s0})
     );
     
     // Normalization
@@ -161,15 +158,15 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
 
     // Pipeline stage1
 
-    wire                    valid_in_s1;
-    wire [NUM_LANES-1:0]    lane_mask_s1;
-    wire [TAGW-1:0]         tag_in_s1;
-    wire                    is_itof_s1;
-    wire                    unsigned_s1;
-    wire [2:0]              rnd_mode_s1;
+    wire                 valid_in_s1;
+    wire [NUM_LANES-1:0] lane_mask_s1;
+    wire [TAGW-1:0]      tag_in_s1;
+    wire                 is_itof_s1;
+    wire                 is_signed_s1;
+    wire [2:0]           rnd_mode_s1;
     fclass_t [NUM_LANES-1:0] fclass_s1;
-    wire [NUM_LANES-1:0]    input_sign_s1;
-    wire [NUM_LANES-1:0]    mant_is_zero_s1;
+    wire [NUM_LANES-1:0] input_sign_s1;
+    wire [NUM_LANES-1:0] mant_is_zero_s1;
     wire [NUM_LANES-1:0][INT_MAN_WIDTH-1:0] input_mant_s1;
     wire [NUM_LANES-1:0][INT_EXP_WIDTH-1:0] input_exp_s1;
 
@@ -180,8 +177,8 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
         .clk      (clk),
         .reset    (reset),
         .enable   (~stall),
-        .data_in  ({valid_in_s0, lane_mask_s0, tag_in_s0, is_itof_s0, unsigned_s0, rnd_mode_s0, fclass_s0, input_sign_s0, mant_is_zero_s0, input_mant_n_s0, input_exp_n_s0}),
-        .data_out ({valid_in_s1, lane_mask_s1, tag_in_s1, is_itof_s1, unsigned_s1, rnd_mode_s1, fclass_s1, input_sign_s1, mant_is_zero_s1, input_mant_s1, input_exp_s1})
+        .data_in  ({valid_in_s0, lane_mask_s0, tag_in_s0, is_itof_s0, is_signed_s0, rnd_mode_s0, fclass_s0, input_sign_s0, mant_is_zero_s0, input_mant_n_s0, input_exp_n_s0}),
+        .data_out ({valid_in_s1, lane_mask_s1, tag_in_s1, is_itof_s1, is_signed_s1, rnd_mode_s1, fclass_s1, input_sign_s1, mant_is_zero_s1, input_mant_s1, input_exp_s1})
     );
 
     // Perform adjustments to mantissa and exponent
@@ -190,47 +187,39 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
     wire [NUM_LANES-1:0][INT_EXP_WIDTH-1:0] final_exp_s1;
     wire [NUM_LANES-1:0] of_before_round_s1;
 
-    for (genvar i = 0; i < NUM_LANES; ++i) begin           
-        reg [SHAMT_BITS-1:0] denorm_shamt_s1;    // shift amount for denormalization
-        reg of_before_round_tmp_s1;
-
+    for (genvar i = 0; i < NUM_LANES; ++i) begin
+        wire [INT_EXP_WIDTH-1:0] denorm_shamt = INT_EXP_WIDTH'(INT_WIDTH-1) - input_exp_s1[i];
+        wire overflow = ($signed(denorm_shamt) <= -$signed(INT_EXP_WIDTH'(!is_signed_s1)));
+        wire underflow = ($signed(input_exp_s1[i]) < INT_EXP_WIDTH'($signed(-1)));
+        reg [INT_EXP_WIDTH-1:0] denorm_shamt_q;
         always @(*) begin
-            denorm_shamt_s1  = '0;
-            of_before_round_tmp_s1 = 1'b0;
-
-            if (!is_itof_s1) begin
-                if ($signed(input_exp_s1[i]) >= $signed(INT_EXP_WIDTH'(INT_WIDTH-1) + INT_EXP_WIDTH'(unsigned_s1))) begin
-                    // overflow
-                    of_before_round_tmp_s1 = 1'b1;                
-                end else if ($signed(input_exp_s1[i]) < INT_EXP_WIDTH'($signed(-1))) begin
-                    // underflow
-                    denorm_shamt_s1 = INT_WIDTH+1; // all bits go to the sticky
-                end else begin
-                    // By default right shift mantissa to be an integer
-                    denorm_shamt_s1 = SHAMT_BITS'(INT_WIDTH-1) - SHAMT_BITS'(input_exp_s1[i]);
-                end              
+            if (overflow) begin
+                denorm_shamt_q = '0;
+            end else if (underflow) begin
+                denorm_shamt_q = INT_WIDTH+1;
+            end else begin
+                denorm_shamt_q = denorm_shamt;
             end
         end
-
-        assign destination_mant_s1[i] = {input_mant_s1[i], 33'b0} >> denorm_shamt_s1;
+        assign destination_mant_s1[i] = is_itof_s1 ? {input_mant_s1[i], 33'b0} : ({input_mant_s1[i], 33'b0} >> denorm_shamt_q);
         assign final_exp_s1[i]        = input_exp_s1[i] + INT_EXP_WIDTH'(EXP_BIAS);
-        assign of_before_round_s1[i]  = of_before_round_tmp_s1;
+        assign of_before_round_s1[i]  = overflow;
     end
 
     // Pipeline stage2
     
-    wire                    valid_in_s2;
-    wire [NUM_LANES-1:0]    lane_mask_s2;
-    wire [TAGW-1:0]         tag_in_s2;
-    wire                    is_itof_s2;
-    wire                    unsigned_s2;
-    wire [2:0]              rnd_mode_s2;
+    wire                 valid_in_s2;
+    wire [NUM_LANES-1:0] lane_mask_s2;
+    wire [TAGW-1:0]      tag_in_s2;
+    wire                 is_itof_s2;
+    wire                 is_signed_s2;
+    wire [2:0]           rnd_mode_s2;
     fclass_t [NUM_LANES-1:0] fclass_s2;   
-    wire [NUM_LANES-1:0]    mant_is_zero_s2;
-    wire [NUM_LANES-1:0]    input_sign_s2;
+    wire [NUM_LANES-1:0] mant_is_zero_s2;
+    wire [NUM_LANES-1:0] input_sign_s2;
     wire [NUM_LANES-1:0][2*INT_MAN_WIDTH:0] destination_mant_s2;
     wire [NUM_LANES-1:0][INT_EXP_WIDTH-1:0] final_exp_s2;
-    wire [NUM_LANES-1:0]    of_before_round_s2;
+    wire [NUM_LANES-1:0] of_before_round_s2;
     
     VX_pipe_register #(
         .DATAW  (1 + NUM_LANES + TAGW + 1 + 1 + `INST_FRM_BITS + NUM_LANES * ($bits(fclass_t) + 1 + 1 + (2*INT_MAN_WIDTH+1) + INT_EXP_WIDTH + 1)),
@@ -239,24 +228,24 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
         .clk      (clk),
         .reset    (reset),
         .enable   (~stall),
-        .data_in  ({valid_in_s1, lane_mask_s1, tag_in_s1, is_itof_s1, unsigned_s1, rnd_mode_s1, fclass_s1, mant_is_zero_s1, input_sign_s1, destination_mant_s1, final_exp_s1, of_before_round_s1}),
-        .data_out ({valid_in_s2, lane_mask_s2, tag_in_s2, is_itof_s2, unsigned_s2, rnd_mode_s2, fclass_s2, mant_is_zero_s2, input_sign_s2, destination_mant_s2, final_exp_s2, of_before_round_s2})
+        .data_in  ({valid_in_s1, lane_mask_s1, tag_in_s1, is_itof_s1, is_signed_s1, rnd_mode_s1, fclass_s1, mant_is_zero_s1, input_sign_s1, destination_mant_s1, final_exp_s1, of_before_round_s1}),
+        .data_out ({valid_in_s2, lane_mask_s2, tag_in_s2, is_itof_s2, is_signed_s2, rnd_mode_s2, fclass_s2, mant_is_zero_s2, input_sign_s2, destination_mant_s2, final_exp_s2, of_before_round_s2})
     );
 
-    wire [NUM_LANES-1:0]    rounded_sign_s2;
+    wire [NUM_LANES-1:0] rounded_sign_s2;
     wire [NUM_LANES-1:0][INT_WIDTH-1:0] rounded_abs_s2; // absolute value of result after rounding
-    wire [NUM_LANES-1:0]    f2i_round_has_sticky_s2;
-    wire [NUM_LANES-1:0]    i2f_round_has_sticky_s2;
+    wire [NUM_LANES-1:0] f2i_round_has_sticky_s2;
+    wire [NUM_LANES-1:0] i2f_round_has_sticky_s2;
     
     // Rouding and classification
    
     for (genvar i = 0; i < NUM_LANES; ++i) begin
-        wire [MAN_BITS-1:0]     final_mant_s2;  // mantissa after adjustments
-        wire [INT_WIDTH-1:0]    final_int_s2;   // integer shifted in position
-        wire [1:0]              round_sticky_bits_s2;
-        wire [INT_WIDTH-1:0]    fmt_pre_round_abs_s2;
-        wire [INT_WIDTH-1:0]    pre_round_abs_s2;
-        wire [1:0]              f2i_round_sticky_bits_s2, i2f_round_sticky_bits_s2;
+        wire [MAN_BITS-1:0]  final_mant_s2;  // mantissa after adjustments
+        wire [INT_WIDTH-1:0] final_int_s2;   // integer shifted in position
+        wire [1:0]           round_sticky_bits_s2;
+        wire [INT_WIDTH-1:0] fmt_pre_round_abs_s2;
+        wire [INT_WIDTH-1:0] pre_round_abs_s2;
+        wire [1:0]           f2i_round_sticky_bits_s2, i2f_round_sticky_bits_s2;
 
         // Extract final mantissa and round bit, discard the normal bit (for FP)
         assign {final_mant_s2, i2f_round_sticky_bits_s2[1]} = destination_mant_s2[i][2*INT_MAN_WIDTH-1 : 2*INT_MAN_WIDTH-1 - (MAN_BITS+1) + 1];
@@ -298,7 +287,7 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
     wire [NUM_LANES-1:0] lane_mask_s3;
     wire [TAGW-1:0]      tag_in_s3;
     wire                 is_itof_s3;
-    wire                 unsigned_s3;
+    wire                 is_signed_s3;
     fclass_t [NUM_LANES-1:0] fclass_s3;   
     wire [NUM_LANES-1:0] mant_is_zero_s3;
     wire [NUM_LANES-1:0] input_sign_s3;
@@ -315,8 +304,8 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
         .clk      (clk),
         .reset    (reset),
         .enable   (~stall),
-        .data_in  ({valid_in_s2, lane_mask_s2, tag_in_s2, is_itof_s2, unsigned_s2, fclass_s2, mant_is_zero_s2, input_sign_s2, rounded_abs_s2, rounded_sign_s2, of_before_round_s2, f2i_round_has_sticky_s2, i2f_round_has_sticky_s2}),
-        .data_out ({valid_in_s3, lane_mask_s3, tag_in_s3, is_itof_s3, unsigned_s3, fclass_s3, mant_is_zero_s3, input_sign_s3, rounded_abs_s3, rounded_sign_s3, of_before_round_s3, f2i_round_has_sticky_s3, i2f_round_has_sticky_s3})
+        .data_in  ({valid_in_s2, lane_mask_s2, tag_in_s2, is_itof_s2, is_signed_s2, fclass_s2, mant_is_zero_s2, input_sign_s2, rounded_abs_s2, rounded_sign_s2, of_before_round_s2, f2i_round_has_sticky_s2, i2f_round_has_sticky_s2}),
+        .data_out ({valid_in_s3, lane_mask_s3, tag_in_s3, is_itof_s3, is_signed_s3, fclass_s3, mant_is_zero_s3, input_sign_s3, rounded_abs_s3, rounded_sign_s3, of_before_round_s3, f2i_round_has_sticky_s3, i2f_round_has_sticky_s3})
     );
      
     wire [NUM_LANES-1:0][INT_WIDTH-1:0] fmt_result_s3;
@@ -335,18 +324,18 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
     // F2I Special case handling
 
     reg [NUM_LANES-1:0][INT_WIDTH-1:0] f2i_special_result_s3;
-    fflags_t [NUM_LANES-1:0]  f2i_special_status_s3;
-    wire [NUM_LANES-1:0]      f2i_result_is_special_s3;
+    fflags_t [NUM_LANES-1:0] f2i_special_status_s3;
+    wire [NUM_LANES-1:0] f2i_result_is_special_s3;
 
     for (genvar i = 0; i < NUM_LANES; ++i) begin
          // Assemble result according to destination format
         always @(*) begin
             if (input_sign_s3[i] && !fclass_s3[i].is_nan) begin
                 f2i_special_result_s3[i][INT_WIDTH-2:0] = '0;            // alone yields 2**(31)-1
-                f2i_special_result_s3[i][INT_WIDTH-1]   = ~unsigned_s3;  // for unsigned casts yields 2**31
+                f2i_special_result_s3[i][INT_WIDTH-1]   = is_signed_s3;  // for unsigned casts yields 2**31
             end else begin
                 f2i_special_result_s3[i][INT_WIDTH-2:0] = 2**(INT_WIDTH-1) - 1;   // alone yields 2**(31)-1
-                f2i_special_result_s3[i][INT_WIDTH-1]   = unsigned_s3;   // for unsigned casts yields 2**31
+                f2i_special_result_s3[i][INT_WIDTH-1]   = ~is_signed_s3;   // for unsigned casts yields 2**31
             end
         end            
 
@@ -354,7 +343,7 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
         assign f2i_result_is_special_s3[i] = fclass_s3[i].is_nan 
                                            | fclass_s3[i].is_inf
                                            | of_before_round_s3[i]
-                                           | (input_sign_s3[i] & unsigned_s3 & ~rounded_int_res_zero_s3[i]);
+                                           | (input_sign_s3[i] & ~is_signed_s3 & ~rounded_int_res_zero_s3[i]);
                                         
         // All integer special cases are invalid
         assign f2i_special_status_s3[i] = {1'b1, 4'h0};

From 4e7a536918638e09e9f4707685a91fa5e2bb451d Mon Sep 17 00:00:00 2001
From: Blaise Tine <tinebp@yahoo.com>
Date: Tue, 14 Nov 2023 05:37:46 -0800
Subject: [PATCH 05/11] adding tensor regression test.

---
 kernel/src/vx_spawn.c               |   9 +-
 tests/opencl/matmul/kernel.cl       |  54 +++---
 tests/opencl/matmul/main.cc         |   4 +-
 tests/regression/Makefile           |   6 +
 tests/regression/basic/main.cpp     |   7 +-
 tests/regression/demo/common.h      |   6 +-
 tests/regression/demo/kernel.cpp    |   8 +-
 tests/regression/demo/main.cpp      |  94 ++++++++---
 tests/regression/diverge/main.cpp   |   7 +-
 tests/regression/fence/main.cpp     |   7 +-
 tests/regression/io_addr/main.cpp   |  11 +-
 tests/regression/mstress/main.cpp   |  11 +-
 tests/regression/no_mf_ext/main.cpp |  11 +-
 tests/regression/no_smem/main.cpp   |   9 +-
 tests/regression/printf/main.cpp    |  11 +-
 tests/regression/sort/common.h      |   8 +-
 tests/regression/sort/kernel.cpp    |   8 +-
 tests/regression/sort/main.cpp      |  19 +--
 tests/regression/tensor/Makefile    |   9 +
 tests/regression/tensor/common.h    |  18 ++
 tests/regression/tensor/kernel.cpp  |  41 +++++
 tests/regression/tensor/main.cpp    | 249 ++++++++++++++++++++++++++++
 22 files changed, 474 insertions(+), 133 deletions(-)
 create mode 100644 tests/regression/tensor/Makefile
 create mode 100644 tests/regression/tensor/common.h
 create mode 100644 tests/regression/tensor/kernel.cpp
 create mode 100644 tests/regression/tensor/main.cpp

diff --git a/kernel/src/vx_spawn.c b/kernel/src/vx_spawn.c
index 14773707..fd8258e1 100644
--- a/kernel/src/vx_spawn.c
+++ b/kernel/src/vx_spawn.c
@@ -51,9 +51,8 @@ inline char is_log2(int x) {
   return ((x & (x-1)) == 0);
 }
 
-inline int fast_log2(int x) {
-  float f = x;
-  return (*(int*)(&f)>>23) - 127;
+inline int log2_fast(int x) {
+  return 31 - __builtin_clz (x);
 }
 
 static void __attribute__ ((noinline)) spawn_tasks_all_stub() {
@@ -286,8 +285,8 @@ void vx_spawn_kernel(context_t * ctx, vx_spawn_kernel_cb callback, void * arg) {
 
   // fast path handling
   char isXYpow2 = is_log2(XY);
-  char log2XY   = fast_log2(XY);
-  char log2X    = fast_log2(X);
+  char log2XY   = log2_fast(XY);
+  char log2X    = log2_fast(X);
 
   wspawn_kernel_args_t wspawn_args = { 
     ctx, callback, arg, core_id * tasks_per_core, fW, rW, isXYpow2, log2XY, log2X
diff --git a/tests/opencl/matmul/kernel.cl b/tests/opencl/matmul/kernel.cl
index ea9b2156..a0ef2d81 100644
--- a/tests/opencl/matmul/kernel.cl
+++ b/tests/opencl/matmul/kernel.cl
@@ -5,35 +5,37 @@ __kernel void matmul(__global float *A,
                      __local float *localA, 
                      __local float *localB)
 {
-    int row = get_global_id(1);
-    int col = get_global_id(0);
+    int globalRow = get_global_id(1);
+    int globalCol = get_global_id(0);
     int localRow = get_local_id(1);
     int localCol = get_local_id(0);
     int localSize = get_local_size(0);  // assuming square local size
 
     float sum = 0.0f;
 
-    // Loop over all blocks of both matrices
-    for (int k = 0; k < N; k += localSize) {
-        // Load block of matrix A to local memory
-        localA[localRow * localSize + localCol] = A[row * N + k + localCol];
+    // Load initial blocks of A and B into local memory
+    int k = 0;
+    localA[localRow * localSize + localCol] = A[globalRow * N + k + localCol];
+    localB[localRow * localSize + localCol] = B[(k + localRow) * N + globalCol];
 
-        // Load block of matrix B to local memory, adjusting for column-major access
-        localB[localRow * localSize + localCol] = B[(k + localRow) * N + col];
-
-        // Synchronize to make sure the tiles are loaded
+    // Iterate over blocks
+    for (k = 0; k < N; k += 16) {
+        // Ensure the initial block is loaded
         barrier(CLK_LOCAL_MEM_FENCE);
 
-        // Multiply the two matrix blocks and accumulate result
-        for (int j = 0; j < localSize; j++) {
+        // Compute multiplication for this block
+        for (int j = 0; j < 16; j++) {
             sum += localA[localRow * localSize + j] * localB[j * localSize + localCol];
         }
 
-        // Synchronize before loading the next block
-        barrier(CLK_LOCAL_MEM_FENCE);
+        // Load the next block of matrix A into local memory
+        if (k + 16 < N) {
+            localA[localRow * localSize + localCol] = A[globalRow * N + k + 16 + localCol];
+            localB[localRow * localSize + localCol] = B[(k + 16 + localRow) * N + globalCol];
+        }
     }
 
-    C[row * N + col] = sum;
+    C[globalRow * N + globalCol] = sum;
 }
 
 /*__kernel void matmul(__global float *A, __global float *B, __global float *C, const unsigned int N)
@@ -49,15 +51,14 @@ __kernel void matmul(__global float *A,
 
     float sum = 0.0f;
 
+    // Load initial blocks of A and B into local memory
+    int k = 0;
+    localA[localRow][localCol] = A[globalRow * N + k + localCol];
+    localB[localRow][localCol] = B[(k + localRow) * N + globalCol];
+
     // Iterate over blocks
-    for (int k = 0; k < N; k += 16) {
-        // Load a block of matrix A into local memory
-        localA[localRow][localCol] = A[globalRow * N + k + localCol];
-
-        // Load a block of matrix B into local memory
-        localB[localRow][localCol] = B[(k + localRow) * N + globalCol];
-
-        // Ensure the entire block is loaded
+    for (k = 0; k < N; k += 16) {
+        // Ensure the initial block is loaded
         barrier(CLK_LOCAL_MEM_FENCE);
 
         // Compute multiplication for this block
@@ -65,8 +66,11 @@ __kernel void matmul(__global float *A,
             sum += localA[localRow][j] * localB[j][localCol];
         }
 
-        // Wait until all threads have computed before loading the next block
-        barrier(CLK_LOCAL_MEM_FENCE);
+        // Load the next block of matrix A into local memory
+        if (k + 16 < N) {
+            localA[localRow][localCol] = A[globalRow * N + k + 16 + localCol];
+            localB[localRow][localCol] = B[(k + 16 + localRow) * N + globalCol];
+        }
     }
 
     C[globalRow * N + globalCol] = sum;
diff --git a/tests/opencl/matmul/main.cc b/tests/opencl/matmul/main.cc
index 8e20a3ef..f7714dd7 100644
--- a/tests/opencl/matmul/main.cc
+++ b/tests/opencl/matmul/main.cc
@@ -184,8 +184,8 @@ int main (int argc, char **argv) {
   CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&b_memobj));
   CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&c_memobj));
   CL_CHECK(clSetKernelArg(kernel, 3, sizeof(uint32_t), &size));
-  CL_CHECK(clSetKernelArg(kernel, 4, local_size[0]*local_size[1]*sizeof(float), NULL));
-  CL_CHECK(clSetKernelArg(kernel, 5, local_size[0]*local_size[1]*sizeof(float), NULL));
+  //CL_CHECK(clSetKernelArg(kernel, 4, local_size[0]*local_size[1]*sizeof(float), NULL));
+  //CL_CHECK(clSetKernelArg(kernel, 5, local_size[0]*local_size[1]*sizeof(float), NULL));
 
  // Allocate memories for input arrays and output arrays.
  std::vector<float> h_a(size * size);
diff --git a/tests/regression/Makefile b/tests/regression/Makefile
index 5ba29d57..89fa25af 100644
--- a/tests/regression/Makefile
+++ b/tests/regression/Makefile
@@ -10,6 +10,7 @@ all:
 	$(MAKE) -C fence
 	$(MAKE) -C no_mf_ext
 	$(MAKE) -C no_smem
+	$(MAKE) -C tensor
 
 run-simx:
 	$(MAKE) -C basic run-simx
@@ -23,6 +24,7 @@ run-simx:
 	$(MAKE) -C fence run-simx
 	$(MAKE) -C no_mf_ext run-simx
 	$(MAKE) -C no_smem run-simx
+	$(MAKE) -C tensor run-simx
 
 run-rtlsim:
 	$(MAKE) -C basic run-rtlsim
@@ -36,6 +38,7 @@ run-rtlsim:
 	$(MAKE) -C fence run-rtlsim
 	$(MAKE) -C no_mf_ext run-rtlsim
 	$(MAKE) -C no_smem run-rtlsim
+	$(MAKE) -C tensor run-rtlsim
 
 run-opae:
 	$(MAKE) -C basic run-opae
@@ -49,6 +52,7 @@ run-opae:
 	$(MAKE) -C fence run-opae
 	$(MAKE) -C no_mf_ext run-opae
 	$(MAKE) -C no_smem run-opae
+	$(MAKE) -C tensor run-opae
 
 clean:
 	$(MAKE) -C basic clean
@@ -62,6 +66,7 @@ clean:
 	$(MAKE) -C fence clean
 	$(MAKE) -C no_mf_ext clean
 	$(MAKE) -C no_smem clean
+	$(MAKE) -C tensor clean
 
 clean-all:
 	$(MAKE) -C basic clean-all
@@ -75,3 +80,4 @@ clean-all:
 	$(MAKE) -C fence clean-all
 	$(MAKE) -C no_mf_ext clean-all
 	$(MAKE) -C no_smem clean-all
+	$(MAKE) -C tensor clean-all
diff --git a/tests/regression/basic/main.cpp b/tests/regression/basic/main.cpp
index e79387b5..0f6f3bde 100755
--- a/tests/regression/basic/main.cpp
+++ b/tests/regression/basic/main.cpp
@@ -262,11 +262,8 @@ int main(int argc, char *argv[]) {
 
     // upload kernel argument
     std::cout << "upload kernel argument" << std::endl;
-    {
-      auto buf_ptr = (void*)staging_buf.data();
-      memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
-      RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
-    }
+    memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t));
+    RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
 
     std::cout << "run kernel test" << std::endl;
     RT_CHECK(run_kernel_test(kernel_arg, buf_size, num_points));
diff --git a/tests/regression/demo/common.h b/tests/regression/demo/common.h
index e18b65a0..941983ac 100644
--- a/tests/regression/demo/common.h
+++ b/tests/regression/demo/common.h
@@ -3,6 +3,10 @@
 
 #define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000
 
+#ifndef TYPE
+#define TYPE float
+#endif
+
 typedef struct {
   uint32_t num_tasks;
   uint32_t task_size;
@@ -11,4 +15,4 @@ typedef struct {
   uint64_t dst_addr;  
 } kernel_arg_t;
 
-#endif
\ No newline at end of file
+#endif
diff --git a/tests/regression/demo/kernel.cpp b/tests/regression/demo/kernel.cpp
index deb56169..49945440 100644
--- a/tests/regression/demo/kernel.cpp
+++ b/tests/regression/demo/kernel.cpp
@@ -4,11 +4,11 @@
 #include "common.h"
 
 void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) {
-	uint32_t count    = arg->task_size;
-	int32_t* src0_ptr = (int32_t*)arg->src0_addr;
-	int32_t* src1_ptr = (int32_t*)arg->src1_addr;
-	int32_t* dst_ptr  = (int32_t*)arg->dst_addr;
+	auto src0_ptr = reinterpret_cast<TYPE*>(arg->src0_addr);
+	auto src1_ptr = reinterpret_cast<TYPE*>(arg->src1_addr);
+	auto dst_ptr = reinterpret_cast<TYPE*>(arg->dst_addr);
 	
+	uint32_t count = arg->task_size;
 	uint32_t offset = task_id * count;
 
 	for (uint32_t i = 0; i < count; ++i) {
diff --git a/tests/regression/demo/main.cpp b/tests/regression/demo/main.cpp
index dfe33377..63556a5f 100644
--- a/tests/regression/demo/main.cpp
+++ b/tests/regression/demo/main.cpp
@@ -5,6 +5,8 @@
 #include <vortex.h>
 #include "common.h"
 
+#define FLOAT_ULP 6
+
 #define RT_CHECK(_expr)                                         \
    do {                                                         \
      int _ret = _expr;                                          \
@@ -17,10 +19,52 @@
 
 ///////////////////////////////////////////////////////////////////////////////
 
+union Float_t {    
+    float f;
+    int   i;
+    struct {
+        uint32_t man  : 23;
+        uint32_t exp  : 8;
+        uint32_t sign : 1;
+    } parts;
+};
+
+template <typename Type>
+class Comparator {};
+
+template <>
+class Comparator<int> {
+public:
+  static const char* type_str() {
+    return "integer";
+  }
+  static bool compare(int a, int b) { 
+    return a == b; 
+  }  
+};
+
+template <>
+class Comparator<float> {
+public:
+  static const char* type_str() {
+    return "float";
+  }
+  static bool compare(float a, float b) { 
+    Float_t fa{a}, fb{b};
+    auto d = std::abs(fa.i - fb.i);
+    if (d > FLOAT_ULP) {
+      std::cout << "*** almost_equal_ulp: a=" << a << ", b=" << b << ", ulp=" << d << ", ia=" << std::hex << fa.i << ", ib=" << fb.i << std::endl;
+      return false;
+    }
+    return true;
+  }  
+};
+
 const char* kernel_file = "kernel.bin";
-uint32_t count = 0;
+uint32_t count = 16;
 
 vx_device_h device = nullptr;
+std::vector<TYPE> source_data;
 std::vector<uint8_t> staging_buf;
 kernel_arg_t kernel_arg = {};
 
@@ -79,11 +123,11 @@ int run_test(const kernel_arg_t& kernel_arg,
   std::cout << "verify result" << std::endl;  
   {
     int errors = 0;
-    auto buf_ptr = (int32_t*)staging_buf.data();
+    auto buf_ptr = (TYPE*)staging_buf.data();
     for (uint32_t i = 0; i < num_points; ++i) {
-      int ref = i + i; 
-      int cur = buf_ptr[i];
-      if (cur != ref) {
+      auto ref = source_data[2 * i + 0] + source_data[2 * i + 1];
+      auto cur = buf_ptr[i];
+      if (!Comparator<TYPE>::compare(cur, ref)) {
         std::cout << "error at result #" << std::dec << i
                   << std::hex << ": actual 0x" << cur << ", expected 0x" << ref << std::endl;
         ++errors;
@@ -103,9 +147,7 @@ int main(int argc, char *argv[]) {
   // parse command arguments
   parse_args(argc, argv);
 
-  if (count == 0) {
-    count = 1;
-  }
+  std::srand(50);
 
   // open device connection
   std::cout << "open device connection" << std::endl;  
@@ -118,8 +160,9 @@ int main(int argc, char *argv[]) {
 
   uint32_t num_tasks  = num_cores * num_warps * num_threads;
   uint32_t num_points = count * num_tasks;
-  uint32_t buf_size   = num_points * sizeof(int32_t);
+  uint32_t buf_size   = num_points * sizeof(TYPE);
 
+  std::cout << "data type: " << Comparator<TYPE>::type_str() << std::endl;
   std::cout << "number of points: " << num_points << std::endl;
   std::cout << "buffer size: " << buf_size << " bytes" << std::endl;
 
@@ -147,18 +190,22 @@ int main(int argc, char *argv[]) {
   
   // upload kernel argument
   std::cout << "upload kernel argument" << std::endl;
-  {
-    auto buf_ptr = (int*)staging_buf.data();
-    memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
-    RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
+  memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t));
+  RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
+
+  // generate source data
+  source_data.resize(2 * num_points);
+  for (uint32_t i = 0; i < source_data.size(); ++i) {
+    auto r = static_cast<float>(std::rand()) / RAND_MAX;
+    source_data[i] = static_cast<TYPE>(r * 2 * num_points);
   }
 
   // upload source buffer0
   {
     std::cout << "upload source buffer0" << std::endl;
-    auto buf_ptr = (int32_t*)staging_buf.data();
+    auto buf_ptr = (TYPE*)staging_buf.data();
     for (uint32_t i = 0; i < num_points; ++i) {
-      buf_ptr[i] = i-1;
+      buf_ptr[i] = source_data[2 * i + 0];
     }
     RT_CHECK(vx_copy_to_dev(device, kernel_arg.src0_addr, staging_buf.data(), buf_size));
   }
@@ -166,23 +213,18 @@ int main(int argc, char *argv[]) {
   // upload source buffer1
   {
     std::cout << "upload source buffer1" << std::endl;
-    auto buf_ptr = (int32_t*)staging_buf.data();
+    auto buf_ptr = (TYPE*)staging_buf.data();
     for (uint32_t i = 0; i < num_points; ++i) {
-      buf_ptr[i] = i+1;
+      buf_ptr[i] = source_data[2 * i + 1];
     }   
     RT_CHECK(vx_copy_to_dev(device, kernel_arg.src1_addr, staging_buf.data(), buf_size));
   }
 
   // clear destination buffer
-  {
-    std::cout << "clear destination buffer" << std::endl;      
-    auto buf_ptr = (int32_t*)staging_buf.data();
-    for (uint32_t i = 0; i < num_points; ++i) {
-      buf_ptr[i] = 0xdeadbeef;
-    }  
-    RT_CHECK(vx_copy_to_dev(device, kernel_arg.dst_addr, staging_buf.data(), buf_size));  
-  }
-
+  std::cout << "clear destination buffer" << std::endl;
+  memset(staging_buf.data(), 0, num_points * sizeof(TYPE));
+  RT_CHECK(vx_copy_to_dev(device, kernel_arg.dst_addr, staging_buf.data(), buf_size));  
+  
   // run tests
   std::cout << "run tests" << std::endl;
   RT_CHECK(run_test(kernel_arg, buf_size, num_points));
diff --git a/tests/regression/diverge/main.cpp b/tests/regression/diverge/main.cpp
index 742f2419..d5de1bc1 100644
--- a/tests/regression/diverge/main.cpp
+++ b/tests/regression/diverge/main.cpp
@@ -233,11 +233,8 @@ int main(int argc, char *argv[]) {
   
   // upload kernel argument
   std::cout << "upload kernel argument" << std::endl;
-  {
-    auto buf_ptr = (int*)staging_buf.data();
-    memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
-    RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
-  }
+  memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t));
+  RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
 
   // upload source buffer
   {
diff --git a/tests/regression/fence/main.cpp b/tests/regression/fence/main.cpp
index d9f2920f..c9225edc 100644
--- a/tests/regression/fence/main.cpp
+++ b/tests/regression/fence/main.cpp
@@ -147,11 +147,8 @@ int main(int argc, char *argv[]) {
   
   // upload kernel argument
   std::cout << "upload kernel argument" << std::endl;
-  {
-    auto buf_ptr = (int*)staging_buf.data();
-    memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
-    RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
-  }
+  memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t));
+  RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
 
   // upload source buffer0
   {
diff --git a/tests/regression/io_addr/main.cpp b/tests/regression/io_addr/main.cpp
index d4c74aad..0272bfbc 100644
--- a/tests/regression/io_addr/main.cpp
+++ b/tests/regression/io_addr/main.cpp
@@ -190,13 +190,10 @@ int main(int argc, char *argv[]) {
   staging_buf.resize(staging_buf_size);
   
   // upload kernel argument  
-  {
-    std::cout << "upload kernel argument" << std::endl;
-    auto buf_ptr = (int*)staging_buf.data();
-    memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
-    RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
-  }
-
+  std::cout << "upload kernel argument" << std::endl;
+  memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t));
+  RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
+  
   // upload test address data
   {
     std::cout << "upload test address data" << std::endl;
diff --git a/tests/regression/mstress/main.cpp b/tests/regression/mstress/main.cpp
index ecc867bc..9b527126 100644
--- a/tests/regression/mstress/main.cpp
+++ b/tests/regression/mstress/main.cpp
@@ -236,13 +236,10 @@ int main(int argc, char *argv[]) {
   staging_buf.resize(staging_buf_size);
   
   // upload kernel argument  
-  {
-    std::cout << "upload kernel argument" << std::endl;
-    auto buf_ptr = (int*)staging_buf.data();
-    memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
-    RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
-  }
-
+  std::cout << "upload kernel argument" << std::endl;
+  memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t));
+  RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
+  
   // upload source buffer0
   {
     std::cout << "upload address buffer" << std::endl;
diff --git a/tests/regression/no_mf_ext/main.cpp b/tests/regression/no_mf_ext/main.cpp
index 7632dad1..e711b99a 100644
--- a/tests/regression/no_mf_ext/main.cpp
+++ b/tests/regression/no_mf_ext/main.cpp
@@ -136,13 +136,10 @@ int main(int argc, char *argv[]) {
   staging_buf.resize(alloc_size);
   
   // upload kernel argument  
-  {
-    std::cout << "upload kernel argument" << std::endl;
-    auto buf_ptr = (int*)staging_buf.data();
-    memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
-    RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
-  }
-
+  std::cout << "upload kernel argument" << std::endl;
+  memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t));
+  RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
+  
   // upload source buffer0
   {
     std::cout << "upload source buffer" << std::endl;
diff --git a/tests/regression/no_smem/main.cpp b/tests/regression/no_smem/main.cpp
index 8bb00389..53db0465 100644
--- a/tests/regression/no_smem/main.cpp
+++ b/tests/regression/no_smem/main.cpp
@@ -135,13 +135,10 @@ int main(int argc, char *argv[]) {
   uint32_t alloc_size = std::max<uint32_t>(buf_size, sizeof(kernel_arg_t));
   staging_buf.resize(alloc_size);
   
-  // upload kernel argument
+  // upload kernel argument  
   std::cout << "upload kernel argument" << std::endl;
-  {
-    auto buf_ptr = (int*)staging_buf.data();
-    memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
-    RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
-  }
+  memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t));
+  RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
 
   // upload source buffer0
   {
diff --git a/tests/regression/printf/main.cpp b/tests/regression/printf/main.cpp
index 4b13faad..3a920294 100644
--- a/tests/regression/printf/main.cpp
+++ b/tests/regression/printf/main.cpp
@@ -110,13 +110,10 @@ int main(int argc, char *argv[]) {
   staging_buf.resize(alloc_size);
   
   // upload kernel argument  
-  {
-    std::cout << "upload kernel argument" << std::endl;
-    auto buf_ptr = (void*)staging_buf.data();
-    memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
-    RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
-  }
-
+  std::cout << "upload kernel argument" << std::endl;
+  memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t));
+  RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
+  
   // upload source buffer0
   {
     std::cout << "upload source buffer" << std::endl;
diff --git a/tests/regression/sort/common.h b/tests/regression/sort/common.h
index 492e03c6..92ceeb91 100644
--- a/tests/regression/sort/common.h
+++ b/tests/regression/sort/common.h
@@ -3,11 +3,7 @@
 
 #define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000
 
-#define FP_ENABLE 
-
-#ifdef FP_ENABLE
-#define TYPE float
-#else
+#ifndef TYPE
 #define TYPE int
 #endif
 
@@ -17,4 +13,4 @@ typedef struct {
   uint64_t dst_addr;  
 } kernel_arg_t;
 
-#endif
\ No newline at end of file
+#endif
diff --git a/tests/regression/sort/kernel.cpp b/tests/regression/sort/kernel.cpp
index 0cd7074e..2e9d3453 100644
--- a/tests/regression/sort/kernel.cpp
+++ b/tests/regression/sort/kernel.cpp
@@ -5,14 +5,14 @@
 
 void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) {
 	uint32_t num_points = arg->num_points;
-	TYPE* src_ptr = (TYPE*)arg->src_addr;
-	TYPE* dst_ptr = (TYPE*)arg->dst_addr;
+	auto src_ptr = (TYPE*)arg->src_addr;
+	auto dst_ptr = (TYPE*)arg->dst_addr;
 
-	TYPE ref_value = src_ptr[task_id];
+	auto ref_value = src_ptr[task_id];
 
 	uint32_t pos = 0;
 	for (uint32_t i = 0; i < num_points; ++i) {
-		TYPE cur_value = src_ptr[i];		
+		auto cur_value = src_ptr[i];		
 		pos += (cur_value < ref_value) || ((cur_value == ref_value) && (i < task_id));
 	}
 	dst_ptr[pos] = ref_value;
diff --git a/tests/regression/sort/main.cpp b/tests/regression/sort/main.cpp
index 59796f73..38d5d4d4 100644
--- a/tests/regression/sort/main.cpp
+++ b/tests/regression/sort/main.cpp
@@ -66,8 +66,8 @@ void gen_input_data(uint32_t num_points) {
   src_data.resize(num_points);
 
   for (uint32_t i = 0; i < num_points; ++i) {
-    float r = static_cast<float>(std::rand()) / RAND_MAX;
-    TYPE value = r * num_points;
+    auto r = static_cast<float>(std::rand()) / RAND_MAX;
+    auto value = static_cast<TYPE>(r * num_points);
     src_data[i] = value;
     std::cout << std::dec << i << ": value=" << value << std::endl;
   }  
@@ -172,19 +172,16 @@ int main(int argc, char *argv[]) {
   {
     std::cout << "allocate staging buffer" << std::endl;    
     uint32_t staging_buf_size = std::max<uint32_t>(src_buf_size,
-                                    std::max<uint32_t>(dst_buf_size, 
-                                      sizeof(kernel_arg_t)));
+                                  std::max<uint32_t>(dst_buf_size, 
+                                    sizeof(kernel_arg_t)));
     staging_buf.resize(staging_buf_size);
   }
   
   // upload kernel argument  
-  {
-    std::cout << "upload kernel argument" << std::endl;
-    auto buf_ptr = staging_buf.data();
-    memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
-    RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
-  }
-
+  std::cout << "upload kernel argument" << std::endl;
+  memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t));
+  RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
+  
   // upload source buffer
   {
     std::cout << "upload source buffer" << std::endl;
diff --git a/tests/regression/tensor/Makefile b/tests/regression/tensor/Makefile
new file mode 100644
index 00000000..790664dc
--- /dev/null
+++ b/tests/regression/tensor/Makefile
@@ -0,0 +1,9 @@
+PROJECT = tensor
+
+SRCS = main.cpp
+
+VX_SRCS = kernel.cpp
+
+OPTS ?= -s16
+
+include ../common.mk
\ No newline at end of file
diff --git a/tests/regression/tensor/common.h b/tests/regression/tensor/common.h
new file mode 100644
index 00000000..75cfc340
--- /dev/null
+++ b/tests/regression/tensor/common.h
@@ -0,0 +1,18 @@
+#ifndef _COMMON_H_
+#define _COMMON_H_
+
+#define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000
+
+#ifndef TYPE
+#define TYPE float
+#endif
+
+typedef struct {
+  uint32_t num_tasks;
+  uint32_t size;
+  uint64_t A_addr;
+  uint64_t B_addr;
+  uint64_t C_addr;  
+} kernel_arg_t;
+
+#endif
diff --git a/tests/regression/tensor/kernel.cpp b/tests/regression/tensor/kernel.cpp
new file mode 100644
index 00000000..5cf0851c
--- /dev/null
+++ b/tests/regression/tensor/kernel.cpp
@@ -0,0 +1,41 @@
+#include <stdint.h>
+#include <vx_intrinsics.h>
+#include <vx_spawn.h>
+#include "common.h"
+
+inline char is_log2(uint32_t x) {
+    return ((x & (x-1)) == 0);
+}
+
+inline uint32_t log2_fast(uint32_t x) { 
+    return 31 - __builtin_clz (x);
+}
+
+void kernel_body(uint32_t task_id, kernel_arg_t* __UNIFORM__ arg) {
+	auto size  = arg->size;
+    auto A = reinterpret_cast<TYPE*>(arg->A_addr);
+	auto B = reinterpret_cast<TYPE*>(arg->B_addr);
+	auto C = reinterpret_cast<TYPE*>(arg->C_addr);
+
+    uint32_t row, col;
+    if (is_log2(size)) {
+        uint32_t log_size = log2_fast(size);
+        row = task_id >> log_size;
+        col = task_id & (size-1);
+    } else {
+        row = task_id / size;
+        col = task_id % size;
+    }
+
+    TYPE sum (0);
+    for (int e = 0; e < size; ++e) {
+        sum += A[row * size + e] * B[e * size + col];
+    }
+    C[row * size + col] = sum;
+}
+
+int main() {
+	kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
+	vx_spawn_tasks(arg->num_tasks, (vx_spawn_tasks_cb)kernel_body, arg);
+	return 0;
+}
diff --git a/tests/regression/tensor/main.cpp b/tests/regression/tensor/main.cpp
new file mode 100644
index 00000000..d93f3177
--- /dev/null
+++ b/tests/regression/tensor/main.cpp
@@ -0,0 +1,249 @@
+#include <iostream>
+#include <unistd.h>
+#include <string.h>
+#include <vector>
+#include <vortex.h>
+#include "common.h"
+
+#define FLOAT_ULP 6
+
+#define RT_CHECK(_expr)                                         \
+   do {                                                         \
+     int _ret = _expr;                                          \
+     if (0 == _ret)                                             \
+       break;                                                   \
+     printf("Error: '%s' returned %d!\n", #_expr, (int)_ret);   \
+	 cleanup();			                                              \
+     exit(-1);                                                  \
+   } while (false)
+
+///////////////////////////////////////////////////////////////////////////////
+
+union Float_t {    
+    float f;
+    int   i;
+    struct {
+        uint32_t man  : 23;
+        uint32_t exp  : 8;
+        uint32_t sign : 1;
+    } parts;
+};
+
+template <typename Type>
+class Comparator {};
+
+template <>
+class Comparator<int> {
+public:
+  static const char* type_str() {
+    return "integer";
+  }
+  static bool compare(int a, int b) { 
+    return a == b; 
+  }  
+};
+
+template <>
+class Comparator<float> {
+public:
+  static const char* type_str() {
+    return "float";
+  }
+  static bool compare(float a, float b) { 
+    Float_t fa{a}, fb{b};
+    auto d = std::abs(fa.i - fb.i);
+    if (d > FLOAT_ULP) {
+      std::cout << "*** almost_equal_ulp: a=" << a << ", b=" << b << ", ulp=" << d << ", ia=" << std::hex << fa.i << ", ib=" << fb.i << std::endl;
+      return false;
+    }
+    return true;
+  }  
+};
+
+static void cpuMatrixMultiply(TYPE* out, const TYPE* A, const TYPE* B, uint32_t width, uint32_t height) {
+  for (uint32_t row = 0; row < height; ++row) {
+    for (uint32_t col = 0; col < width; ++col) {
+      TYPE sum(0);
+      for (uint32_t e = 0; e < width; ++e) {
+          sum += A[row * width + e] * B[e * width + col];
+      }
+      out[row * width + col] = sum;
+    }
+  }
+}
+
+const char* kernel_file = "kernel.bin";
+uint32_t size = 16;
+
+vx_device_h device = nullptr;
+std::vector<uint8_t> staging_buf;
+kernel_arg_t kernel_arg = {};
+
+static void show_usage() {
+   std::cout << "Vortex Test." << std::endl;
+   std::cout << "Usage: [-k: kernel] [-s size] [-h: help]" << std::endl;
+}
+
+static void parse_args(int argc, char **argv) {
+  int c;
+  while ((c = getopt(argc, argv, "s:k:h?")) != -1) {
+    switch (c) {
+    case 's':
+      size = atoi(optarg);
+      break;
+    case 'k':
+      kernel_file = optarg;
+      break;
+    case 'h':
+    case '?': {
+      show_usage();
+      exit(0);
+    } break;
+    default:
+      show_usage();
+      exit(-1);
+    }
+  }
+}
+
+void cleanup() {
+  if (device) {    
+    vx_mem_free(device, kernel_arg.A_addr);
+    vx_mem_free(device, kernel_arg.B_addr);
+    vx_mem_free(device, kernel_arg.C_addr);
+    vx_dev_close(device);
+  }
+}
+
+int run_test(const kernel_arg_t& kernel_arg,
+             uint32_t buf_size, 
+             const std::vector<TYPE>& refs) {              
+  // start device
+  std::cout << "start device" << std::endl;
+  RT_CHECK(vx_start(device));
+
+  // wait for completion
+  std::cout << "wait for completion" << std::endl;
+  RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
+
+  // download destination buffer
+  std::cout << "download destination buffer" << std::endl;
+  RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.C_addr, buf_size));
+
+  // verify result
+  std::cout << "verify result" << std::endl;  
+  {
+    int errors = 0;
+    auto buf_ptr = (TYPE*)staging_buf.data();
+    for (uint32_t i = 0; i < refs.size(); ++i) {
+      auto ref = refs[i];
+      auto cur = buf_ptr[i];
+      if (!Comparator<TYPE>::compare(cur, ref)) {
+        std::cout << "error at result #" << std::dec << i
+                  << std::hex << ": actual 0x" << cur << ", expected 0x" << ref << std::endl;
+        ++errors;
+      }
+    }
+    if (errors != 0) {
+      std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
+      std::cout << "FAILED!" << std::endl;
+      return 1;  
+    }
+  }
+
+  return 0;
+}
+
+int main(int argc, char *argv[]) {  
+  // parse command arguments
+  parse_args(argc, argv);
+
+  std::srand(50);
+
+  // open device connection
+  std::cout << "open device connection" << std::endl;  
+  RT_CHECK(vx_dev_open(&device));
+
+  uint32_t num_points = size * size;
+  uint32_t buf_size = num_points * sizeof(TYPE);
+
+  std::cout << "data type: " << Comparator<TYPE>::type_str() << std::endl;
+  std::cout << "matrix size: " << size << "x" << size << std::endl;
+  std::cout << "buffer size: " << buf_size << " bytes" << std::endl;
+
+  // upload program
+  std::cout << "upload program" << std::endl;  
+  RT_CHECK(vx_upload_kernel_file(device, kernel_file));
+
+  // allocate device memory
+  std::cout << "allocate device memory" << std::endl;
+  RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.A_addr));
+  RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.B_addr));
+  RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.C_addr));
+
+  kernel_arg.num_tasks = num_points;
+  kernel_arg.size = size;
+
+  std::cout << "dev_src0=0x" << std::hex << kernel_arg.A_addr << std::endl;
+  std::cout << "dev_src1=0x" << std::hex << kernel_arg.B_addr << std::endl;
+  std::cout << "dev_dst=0x" << std::hex << kernel_arg.C_addr << std::endl;
+  
+  // allocate staging buffer  
+  std::cout << "allocate staging buffer" << std::endl;    
+  uint32_t alloc_size = std::max<uint32_t>(buf_size, sizeof(kernel_arg_t));
+  staging_buf.resize(alloc_size);
+  
+  // upload kernel argument
+  std::cout << "upload kernel argument" << std::endl;
+  memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t));
+  RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
+
+  // generate source data
+  std::vector<TYPE> src_A(num_points);
+  std::vector<TYPE> src_B(num_points);
+  std::vector<TYPE> refs(num_points);
+  for (uint32_t i = 0; i < num_points; ++i) {
+    auto a = static_cast<float>(std::rand()) / RAND_MAX;
+    auto b = static_cast<float>(std::rand()) / RAND_MAX;
+    src_A[i] = static_cast<TYPE>(a * size);
+    src_B[i] = static_cast<TYPE>(b * size);
+  }
+  cpuMatrixMultiply(refs.data(), src_A.data(), src_B.data(), size, size);
+
+  // upload source buffer0
+  {
+    std::cout << "upload source buffer0" << std::endl;
+    auto buf_ptr = (TYPE*)staging_buf.data();
+    for (uint32_t i = 0; i < num_points; ++i) {
+      buf_ptr[i] = src_A[i];
+    }
+    RT_CHECK(vx_copy_to_dev(device, kernel_arg.A_addr, staging_buf.data(), buf_size));
+  }
+
+  // upload source buffer1
+  {
+    std::cout << "upload source buffer1" << std::endl;
+    auto buf_ptr = (TYPE*)staging_buf.data();
+    for (uint32_t i = 0; i < num_points; ++i) {
+      buf_ptr[i] = src_B[i];
+    }   
+    RT_CHECK(vx_copy_to_dev(device, kernel_arg.B_addr, staging_buf.data(), buf_size));
+  }
+
+  // clear destination buffer
+  std::cout << "clear destination buffer" << std::endl;
+  memset(staging_buf.data(), 0, num_points * sizeof(TYPE));
+  RT_CHECK(vx_copy_to_dev(device, kernel_arg.C_addr, staging_buf.data(), buf_size));  
+  
+  // run tests
+  std::cout << "run tests" << std::endl;
+  RT_CHECK(run_test(kernel_arg, buf_size, refs));
+
+  // cleanup
+  std::cout << "cleanup" << std::endl;  
+  cleanup();
+
+  std::cout << "PASSED!" << std::endl;
+
+  return 0;
+}
\ No newline at end of file

From 61e3442ef80b5c7db66e436e08855529d9869af0 Mon Sep 17 00:00:00 2001
From: Blaise Tine <tinebp@yahoo.com>
Date: Tue, 14 Nov 2023 22:31:30 -0800
Subject: [PATCH 06/11] adding opencl convolution benchmark

---
 tests/opencl/Makefile              |   6 +
 tests/opencl/convolution/Makefile  |   7 +
 tests/opencl/convolution/kernel.cl |  32 ++++
 tests/opencl/convolution/main.cc   | 258 +++++++++++++++++++++++++++++
 tests/opencl/matmul/Makefile       |   2 +-
 tests/opencl/matmul/kernel.cl      |  59 +++----
 tests/opencl/matmul/main.cc        |  51 +++---
 tests/opencl/oclprintf/main.cc     |   2 +-
 tests/opencl/psort/main.cc         |   3 +-
 tests/opencl/sgemm/common.h        |   6 +-
 tests/opencl/sgemm/main.cc         | 125 ++++++++------
 tests/opencl/vecadd/main.cc        |   4 +-
 tests/regression/demo/main.cpp     |  46 ++---
 tests/regression/tensor/Makefile   |   2 +-
 tests/regression/tensor/kernel.cpp |   4 +-
 tests/regression/tensor/main.cpp   |  53 +++---
 16 files changed, 490 insertions(+), 170 deletions(-)
 create mode 100644 tests/opencl/convolution/Makefile
 create mode 100644 tests/opencl/convolution/kernel.cl
 create mode 100644 tests/opencl/convolution/main.cc

diff --git a/tests/opencl/Makefile b/tests/opencl/Makefile
index 2cee5c5d..c838c3de 100644
--- a/tests/opencl/Makefile
+++ b/tests/opencl/Makefile
@@ -18,6 +18,7 @@ all:
 	$(MAKE) -C oclprintf
 	$(MAKE) -C blackscholes
 	$(MAKE) -C matmul
+	$(MAKE) -C convolution
 
 run-simx:
 	$(MAKE) -C vecadd run-simx
@@ -37,6 +38,7 @@ run-simx:
 	$(MAKE) -C blackscholes run-simx
 	$(MAKE) -C matmul run-simx
 	$(MAKE) -C transpose run-simx
+	$(MAKE) -C convolution run-simx
 #	$(MAKE) -C vectorhypot run-simx	
 #	$(MAKE) -C mri-q run-simx
 
@@ -58,6 +60,7 @@ run-rtlsim:
 	$(MAKE) -C oclprintf run-rtlsim
 	$(MAKE) -C blackscholes run-rtlsim
 	$(MAKE) -C matmul run-rtlsim
+	$(MAKE) -C convolution run-rtlsim
 #	$(MAKE) -C vectorhypot run-rtlsim
 #	$(MAKE) -C mri-q run-rtlsim
 
@@ -79,6 +82,7 @@ run-opae:
 	$(MAKE) -C oclprintf run-opae
 	$(MAKE) -C blackscholes run-opae
 	$(MAKE) -C matmul run-opae
+	$(MAKE) -C convolution run-opae
 #	$(MAKE) -C vectorhypot run-opae
 #	$(MAKE) -C mri-q run-opae
 
@@ -102,6 +106,7 @@ clean:
 	$(MAKE) -C oclprintf clean
 	$(MAKE) -C blackscholes clean
 	$(MAKE) -C matmul clean
+	$(MAKE) -C convolution clean
 
 clean-all:
 	$(MAKE) -C vecadd clean-all
@@ -124,3 +129,4 @@ clean-all:
 	$(MAKE) -C oclprintf clean-all
 	$(MAKE) -C blackscholes clean-all
 	$(MAKE) -C matmul clean-all
+	$(MAKE) -C convolution clean-all
diff --git a/tests/opencl/convolution/Makefile b/tests/opencl/convolution/Makefile
new file mode 100644
index 00000000..42a577d2
--- /dev/null
+++ b/tests/opencl/convolution/Makefile
@@ -0,0 +1,7 @@
+PROJECT = convolution
+
+SRCS = main.cc
+
+OPTS ?= -n32
+
+include ../common.mk
diff --git a/tests/opencl/convolution/kernel.cl b/tests/opencl/convolution/kernel.cl
new file mode 100644
index 00000000..2ef31040
--- /dev/null
+++ b/tests/opencl/convolution/kernel.cl
@@ -0,0 +1,32 @@
+__kernel void conv3x3(__global float* output, 
+                      __global float* input,
+                      __global float* weights, 
+                      const int width, 
+                      const int height) 
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+
+    // Adjust for padded borders
+    int paddedWidth = width + 2;
+    int paddedX = x + 1;
+    int paddedY = y + 1;
+
+    // Compute the convolution sum
+    float sum = 0.0f;
+
+    sum += input[(paddedY - 1) * paddedWidth + (paddedX - 1)] * weights[0]; // Top-left
+    sum += input[(paddedY - 1) * paddedWidth + paddedX] * weights[1];       // Top-center
+    sum += input[(paddedY - 1) * paddedWidth + (paddedX + 1)] * weights[2]; // Top-right
+
+    sum += input[paddedY * paddedWidth + (paddedX - 1)] * weights[3];       // Middle-left
+    sum += input[paddedY * paddedWidth + paddedX] * weights[4];             // Center
+    sum += input[paddedY * paddedWidth + (paddedX + 1)] * weights[5];       // Middle-right
+
+    sum += input[(paddedY + 1) * paddedWidth + (paddedX - 1)] * weights[6]; // Bottom-left
+    sum += input[(paddedY + 1) * paddedWidth + paddedX] * weights[7];       // Bottom-center
+    sum += input[(paddedY + 1) * paddedWidth + (paddedX + 1)] * weights[8]; // Bottom-right
+
+    // Store the result in the output array
+    output[y * width + x] = sum;
+}
\ No newline at end of file
diff --git a/tests/opencl/convolution/main.cc b/tests/opencl/convolution/main.cc
new file mode 100644
index 00000000..d7487c2f
--- /dev/null
+++ b/tests/opencl/convolution/main.cc
@@ -0,0 +1,258 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <CL/opencl.h>
+#include <string.h>
+#include <time.h>
+#include <unistd.h> 
+#include <chrono>
+#include <vector>
+
+#define FLOAT_ULP 6
+
+#define KERNEL_NAME "conv3x3"
+
+#define CL_CHECK(_expr)                                                \
+   do {                                                                \
+     cl_int _err = _expr;                                              \
+     if (_err == CL_SUCCESS)                                           \
+       break;                                                          \
+     printf("OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err);   \
+	 cleanup();			                                                     \
+     exit(-1);                                                         \
+   } while (0)
+
+#define CL_CHECK2(_expr)                                               \
+   ({                                                                  \
+     cl_int _err = CL_INVALID_VALUE;                                   \
+     decltype(_expr) _ret = _expr;                                     \
+     if (_err != CL_SUCCESS) {                                         \
+       printf("OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err); \
+	   cleanup();			                                                   \
+       exit(-1);                                                       \
+     }                                                                 \
+     _ret;                                                             \
+   })
+
+static int read_kernel_file(const char* filename, uint8_t** data, size_t* size) {
+  if (nullptr == filename || nullptr == data || 0 == size)
+    return -1;
+
+  FILE* fp = fopen(filename, "r");
+  if (NULL == fp) {
+    fprintf(stderr, "Failed to load kernel.");
+    return -1;
+  }
+  
+  fseek(fp , 0 , SEEK_END);
+  long fsize = ftell(fp);
+  rewind(fp);
+
+  *data = (uint8_t*)malloc(fsize);
+  *size = fread(*data, 1, fsize, fp);
+  
+  fclose(fp);
+  
+  return 0;
+}
+
+static bool compare_equal(float a, float b) {
+  union fi_t { float f; int32_t i; };
+  fi_t fa, fb;
+  fa.f = a;
+  fb.f = b;
+  auto d = std::abs(fa.i - fb.i);
+  return d <= FLOAT_ULP;
+}
+
+static void convolution_cpu(float *O, float *I, float *W, int32_t width, int32_t height) {
+  int paddedWidth = width + 2;
+  for (int32_t y = 0; y < height; ++y) {
+    for (int32_t x = 0; x < width; ++x) {
+      int paddedY = y + 1;
+      int paddedX = x + 1;
+      float sum = 0.0f;
+      for (int32_t ky = -1; ky <= 1; ++ky) {
+        for (int32_t kx = -1; kx <= 1; ++kx) {
+          int32_t iy = paddedY + ky;
+          int32_t ix = paddedX + kx;
+          float value = I[iy * paddedWidth + ix];
+          float weight = W[(ky + 1) * 3 + (kx + 1)];
+          sum += value * weight;
+        }
+      }
+      O[y * width + x] = sum;
+    }
+  }
+}
+
+cl_device_id device_id = NULL;
+cl_context context = NULL;
+cl_command_queue commandQueue = NULL;
+cl_program program = NULL;
+cl_kernel kernel = NULL;
+cl_mem i_memobj = NULL;
+cl_mem w_memobj = NULL;
+cl_mem o_memobj = NULL;
+uint8_t* kernel_bin = NULL;
+
+static void cleanup() {
+  if (commandQueue) clReleaseCommandQueue(commandQueue);
+  if (kernel) clReleaseKernel(kernel);
+  if (program) clReleaseProgram(program);
+  if (i_memobj) clReleaseMemObject(i_memobj);
+  if (w_memobj) clReleaseMemObject(w_memobj);
+  if (o_memobj) clReleaseMemObject(o_memobj);
+  if (context) clReleaseContext(context);
+  if (device_id) clReleaseDevice(device_id);  
+  if (kernel_bin) free(kernel_bin);
+}
+
+int size = 32;
+
+static void show_usage() {
+  printf("Usage: [-n size] [-h: help]\n");
+}
+
+static void parse_args(int argc, char **argv) {
+  int c;
+  while ((c = getopt(argc, argv, "n:h?")) != -1) {
+    switch (c) {
+    case 'n':
+      size = atoi(optarg);
+      break;
+    case 'h':
+    case '?': {
+      show_usage();
+      exit(0);
+    } break;
+    default:
+      show_usage();
+      exit(-1);
+    }
+  }
+}
+
+int main (int argc, char **argv) {
+  // parse command arguments
+  parse_args(argc, argv);
+
+  printf("Matrix size=%d\n", size);
+
+  uint32_t o_points = size * size;
+  uint32_t i_points = (size+2) * (size+2);
+  uint32_t w_points = 3 * 3;
+  
+  cl_platform_id platform_id;
+  size_t kernel_size;
+  
+  // Getting platform and device information
+  CL_CHECK(clGetPlatformIDs(1, &platform_id, NULL));
+  CL_CHECK(clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, NULL));
+
+  printf("Create context\n");
+  context = CL_CHECK2(clCreateContext(NULL, 1, &device_id, NULL, NULL,  &_err));
+
+  char device_string[1024];
+  clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(device_string), &device_string, NULL);
+  printf("Using device: %s\n", device_string);
+
+  printf("Allocate device buffers\n");  
+  size_t i_nbytes = i_points * sizeof(float);
+  size_t w_nbytes = w_points * sizeof(float);
+  size_t o_nbytes = o_points * sizeof(float);
+  i_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY, i_nbytes, NULL, &_err));
+  w_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY, w_nbytes, NULL, &_err));
+  o_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_WRITE_ONLY, o_nbytes, NULL, &_err));
+
+  printf("Create program from kernel source\n");
+#ifdef HOSTGPU
+  if (0 != read_kernel_file("kernel.cl", &kernel_bin, &kernel_size))
+    return -1;
+  program = CL_CHECK2(clCreateProgramWithSource(
+    context, 1, (const char**)&kernel_bin, &kernel_size, &_err));  
+#else
+  if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
+    return -1;
+  program = CL_CHECK2(clCreateProgramWithBinary(
+    context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, NULL, &_err));
+#endif  
+  if (program == NULL) {
+    cleanup();
+    return -1;
+  }
+
+  // Build program
+  CL_CHECK(clBuildProgram(program, 1, &device_id, NULL, NULL, NULL));
+  
+  // Create kernel
+  kernel = CL_CHECK2(clCreateKernel(program, KERNEL_NAME, &_err));
+
+  size_t global_size[2] = {size, size};
+
+  // Set kernel arguments
+  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&o_memobj));	
+  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&i_memobj));
+  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&w_memobj));
+  CL_CHECK(clSetKernelArg(kernel, 3, sizeof(uint32_t), &size));
+  CL_CHECK(clSetKernelArg(kernel, 4, sizeof(uint32_t), &size));
+
+  // Allocate memories for input arrays and output arrays.
+  std::vector<float> h_i(i_points);
+  std::vector<float> h_w(w_points);
+  std::vector<float> h_o(o_points, 0.0f);
+	
+  // Generate input values
+  for (int32_t y = -1; y < size+1; ++y) {
+    for (int32_t x = -1; x < size+1; ++x) {
+      if (x >= 0 && x < size && y >= 0 && y < size) {
+        h_i[(y+1) * (size+2) + (x+1)] = static_cast<float>(rand()) / RAND_MAX;
+      } else {
+        h_i[(y+1) * (size+2) + (x+1)] = 0;
+      }
+    }
+  }
+  for (uint32_t i = 0; i < w_points; ++i) {
+    h_w[i] = static_cast<float>(rand()) / RAND_MAX;
+  }
+
+  // Creating command queue
+  commandQueue = CL_CHECK2(clCreateCommandQueue(context, device_id, 0, &_err));  
+
+	printf("Upload source buffers\n");
+  CL_CHECK(clEnqueueWriteBuffer(commandQueue, i_memobj, CL_TRUE, 0, i_nbytes, h_i.data(), 0, NULL, NULL));
+  CL_CHECK(clEnqueueWriteBuffer(commandQueue, w_memobj, CL_TRUE, 0, w_nbytes, h_w.data(), 0, NULL, NULL));
+
+  printf("Execute the kernel\n");
+  auto time_start = std::chrono::high_resolution_clock::now();
+  CL_CHECK(clEnqueueNDRangeKernel(commandQueue, kernel, 2, NULL, global_size, NULL, 0, NULL, NULL));
+  CL_CHECK(clFinish(commandQueue));
+  auto time_end = std::chrono::high_resolution_clock::now();
+  double elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(time_end - time_start).count();
+  printf("Elapsed time: %lg ms\n", elapsed);
+
+  printf("Download destination buffer\n");
+  CL_CHECK(clEnqueueReadBuffer(commandQueue, o_memobj, CL_TRUE, 0, o_nbytes, h_o.data(), 0, NULL, NULL));
+
+  printf("Verify result\n");
+  std::vector<float> ref_vec(o_points);
+  convolution_cpu(ref_vec.data(), h_i.data(), h_w.data(), size, size);
+  int errors = 0;
+  for (uint32_t i = 0; i < o_points; ++i) {
+    if (!compare_equal(h_o[i], ref_vec[i])) {
+      if (errors < 100) 
+        printf("*** error: [%d] expected=%f, actual=%f\n", i, ref_vec[i], h_o[i]);
+      ++errors;
+    }
+  }
+  if (errors != 0) {
+    printf("FAILED! - %d errors\n", errors);    
+  } else {
+    printf("PASSED!\n");
+  }
+
+  // Clean up		
+  cleanup();  
+
+  return errors;
+}
diff --git a/tests/opencl/matmul/Makefile b/tests/opencl/matmul/Makefile
index 0d1d136a..39b92b36 100644
--- a/tests/opencl/matmul/Makefile
+++ b/tests/opencl/matmul/Makefile
@@ -2,6 +2,6 @@ PROJECT = matmul
 
 SRCS = main.cc
 
-OPTS ?= -n16
+OPTS ?= -n32
 
 include ../common.mk
diff --git a/tests/opencl/matmul/kernel.cl b/tests/opencl/matmul/kernel.cl
index a0ef2d81..02aa074c 100644
--- a/tests/opencl/matmul/kernel.cl
+++ b/tests/opencl/matmul/kernel.cl
@@ -7,43 +7,41 @@ __kernel void matmul(__global float *A,
 {
     int globalRow = get_global_id(1);
     int globalCol = get_global_id(0);
-    int localRow = get_local_id(1);
-    int localCol = get_local_id(0);
+    int localRow  = get_local_id(1);
+    int localCol  = get_local_id(0);
     int localSize = get_local_size(0);  // assuming square local size
 
     float sum = 0.0f;
 
-    // Load initial blocks of A and B into local memory
-    int k = 0;
-    localA[localRow * localSize + localCol] = A[globalRow * N + k + localCol];
-    localB[localRow * localSize + localCol] = B[(k + localRow) * N + globalCol];
+    // Loop over all blocks of both matrices
+    for (int k = 0; k < N; k += localSize) {
+        // Load block of matrix A to local memory
+        localA[localRow * localSize + localCol] = A[globalRow * N + k + localCol];
 
-    // Iterate over blocks
-    for (k = 0; k < N; k += 16) {
-        // Ensure the initial block is loaded
+        // Load block of matrix B to local memory, adjusting for column-major access
+        localB[localRow * localSize + localCol] = B[(k + localRow) * N + globalCol];
+
+        // Synchronize to make sure the tiles are loaded
         barrier(CLK_LOCAL_MEM_FENCE);
 
-        // Compute multiplication for this block
-        for (int j = 0; j < 16; j++) {
+        // Multiply the two matrix blocks and accumulate result
+        for (int j = 0; j < localSize; j++) {
             sum += localA[localRow * localSize + j] * localB[j * localSize + localCol];
         }
-
-        // Load the next block of matrix A into local memory
-        if (k + 16 < N) {
-            localA[localRow * localSize + localCol] = A[globalRow * N + k + 16 + localCol];
-            localB[localRow * localSize + localCol] = B[(k + 16 + localRow) * N + globalCol];
-        }
     }
 
     C[globalRow * N + globalCol] = sum;
 }
 
-/*__kernel void matmul(__global float *A, __global float *B, __global float *C, const unsigned int N)
+/*__kernel void matmul(__global float *A, 
+                       __global float *B, 
+                       __global float *C, 
+                       const unsigned int N)
 {
     int globalRow = get_global_id(1);
     int globalCol = get_global_id(0);
-    int localRow = get_local_id(1);
-    int localCol = get_local_id(0);
+    int localRow  = get_local_id(1);
+    int localCol  = get_local_id(0);
 
     // Static local memory declaration
     __local float localA[16][16];
@@ -51,26 +49,21 @@ __kernel void matmul(__global float *A,
 
     float sum = 0.0f;
 
-    // Load initial blocks of A and B into local memory
-    int k = 0;
-    localA[localRow][localCol] = A[globalRow * N + k + localCol];
-    localB[localRow][localCol] = B[(k + localRow) * N + globalCol];
-
     // Iterate over blocks
-    for (k = 0; k < N; k += 16) {
-        // Ensure the initial block is loaded
+    for (int k = 0; k < N; k += 16) {
+        // Load a block of matrix A into local memory
+        localA[localRow][localCol] = A[globalRow * N + k + localCol];
+
+        // Load a block of matrix B into local memory
+        localB[localRow][localCol] = B[(k + localRow) * N + globalCol];
+
+        // Ensure the entire block is loaded
         barrier(CLK_LOCAL_MEM_FENCE);
 
         // Compute multiplication for this block
         for (int j = 0; j < 16; j++) {
             sum += localA[localRow][j] * localB[j][localCol];
         }
-
-        // Load the next block of matrix A into local memory
-        if (k + 16 < N) {
-            localA[localRow][localCol] = A[globalRow * N + k + 16 + localCol];
-            localB[localRow][localCol] = B[(k + 16 + localRow) * N + globalCol];
-        }
     }
 
     C[globalRow * N + globalCol] = sum;
diff --git a/tests/opencl/matmul/main.cc b/tests/opencl/matmul/main.cc
index f7714dd7..3d26ff0c 100644
--- a/tests/opencl/matmul/main.cc
+++ b/tests/opencl/matmul/main.cc
@@ -10,6 +10,8 @@
 
 #define LOCAL_SIZE 16
 
+#define FLOAT_ULP 6
+
 #define KERNEL_NAME "matmul"
 
 #define CL_CHECK(_expr)                                                \
@@ -56,15 +58,16 @@ static int read_kernel_file(const char* filename, uint8_t** data, size_t* size)
   return 0;
 }
 
-static bool compare_equal(float a, float b, int ulp = 21) {
-  union fi_t { int i; float f; };
+static bool compare_equal(float a, float b) {
+  union fi_t { float f; int32_t i; };
   fi_t fa, fb;
   fa.f = a;
   fb.f = b;
-  return std::abs(fa.i - fb.i) <= ulp;
+  auto d = std::abs(fa.i - fb.i);
+  return d <= FLOAT_ULP;
 }
 
-static void matrix_multiply_cpu(float *A, float *B, float *C, int N) {
+static void matmul_cpu(float *C, float *A, float *B, int N) {
     for (int i = 0; i < N; i++) {
         for (int j = 0; j < N; j++) {
             float sum = 0.0f;
@@ -98,7 +101,7 @@ static void cleanup() {
   if (kernel_bin) free(kernel_bin);
 }
 
-int size = 64;
+int size = 32;
 
 static void show_usage() {
   printf("Usage: [-n size] [-h: help]\n");
@@ -106,7 +109,7 @@ static void show_usage() {
 
 static void parse_args(int argc, char **argv) {
   int c;
-  while ((c = getopt(argc, argv, "fn:h?")) != -1) {
+  while ((c = getopt(argc, argv, "n:h?")) != -1) {
     switch (c) {
     case 'n':
       size = atoi(optarg);
@@ -127,6 +130,8 @@ int main (int argc, char **argv) {
   // parse command arguments
   parse_args(argc, argv);
 
+  uint32_t num_points = size * size;
+
   printf("Matrix size=%d\n", size);
   if ((size / LOCAL_SIZE) * LOCAL_SIZE != size) {
     printf("Error: matrix size must be a multiple of %d\n", LOCAL_SIZE);
@@ -148,7 +153,7 @@ int main (int argc, char **argv) {
   printf("Using device: %s\n", device_string);
 
   printf("Allocate device buffers\n");
-  size_t nbytes = size * size * sizeof(float);
+  size_t nbytes = num_points * sizeof(float);
   a_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY, nbytes, NULL, &_err));
   b_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY, nbytes, NULL, &_err));
   c_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_WRITE_ONLY, nbytes, NULL, &_err));
@@ -176,32 +181,26 @@ int main (int argc, char **argv) {
   // Create kernel
   kernel = CL_CHECK2(clCreateKernel(program, KERNEL_NAME, &_err));
 
-  size_t local_size[2] = {LOCAL_SIZE, LOCAL_SIZE};
   size_t global_size[2] = {size, size};
+  size_t local_size[2] = {LOCAL_SIZE, LOCAL_SIZE};  
 
   // Set kernel arguments
   CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_memobj));	
   CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&b_memobj));
   CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&c_memobj));
   CL_CHECK(clSetKernelArg(kernel, 3, sizeof(uint32_t), &size));
-  //CL_CHECK(clSetKernelArg(kernel, 4, local_size[0]*local_size[1]*sizeof(float), NULL));
-  //CL_CHECK(clSetKernelArg(kernel, 5, local_size[0]*local_size[1]*sizeof(float), NULL));
+  CL_CHECK(clSetKernelArg(kernel, 4, local_size[0]*local_size[1]*sizeof(float), NULL));
+  CL_CHECK(clSetKernelArg(kernel, 5, local_size[0]*local_size[1]*sizeof(float), NULL));
 
  // Allocate memories for input arrays and output arrays.
- std::vector<float> h_a(size * size);
- std::vector<float> h_b(size * size);
- std::vector<float> h_c(size * size);
+ std::vector<float> h_a(num_points);
+ std::vector<float> h_b(num_points);
+ std::vector<float> h_c(num_points);
 	
-  // Initialize values for array members.  
-  for (int i = 0; i < (size * size); ++i) {
-  #ifdef USE_FLOAT
-    h_a[i] = (float)rand() / (float)RAND_MAX;
-    h_b[i] = (float)rand() / (float)RAND_MAX;
-  #else
-    h_a[i] = rand();
-    h_b[i] = rand();
-  #endif
-    h_c[i] = 0xdeadbeef;
+  // Generate input values
+  for (uint32_t i = 0; i < num_points; ++i) {
+    h_a[i] = static_cast<float>(rand()) / RAND_MAX;
+    h_b[i] = static_cast<float>(rand()) / RAND_MAX;
   }
 
   // Creating command queue
@@ -223,10 +222,10 @@ int main (int argc, char **argv) {
   CL_CHECK(clEnqueueReadBuffer(commandQueue, c_memobj, CL_TRUE, 0, nbytes, h_c.data(), 0, NULL, NULL));
 
   printf("Verify result\n");
-  std::vector<float> ref_vec(size * size);
-  matrix_multiply_cpu(h_a.data(), h_b.data(), ref_vec.data(), size);
+  std::vector<float> ref_vec(num_points);
+  matmul_cpu(ref_vec.data(), h_a.data(), h_b.data(), size);
   int errors = 0;
-  for (int i = 0; i < (size * size); i++) {
+  for (uint32_t i = 0; i < num_points; ++i) {
     if (!compare_equal(h_c[i], ref_vec[i])) {
       if (errors < 100) 
         printf("*** error: [%d] expected=%f, actual=%f\n", i, ref_vec[i], h_c[i]);
diff --git a/tests/opencl/oclprintf/main.cc b/tests/opencl/oclprintf/main.cc
index 7c0463cf..184eec96 100644
--- a/tests/opencl/oclprintf/main.cc
+++ b/tests/opencl/oclprintf/main.cc
@@ -143,7 +143,7 @@ int main (int argc, char **argv) {
   // Allocate memories for input arrays and output arrays.    
   h_a = (int*)malloc(nbytes);
 	
-  // Initialize values for array members.  
+  // Generate input values
   for (int i = 0; i < size; ++i) {
     h_a[i] = -1 + i;
   }
diff --git a/tests/opencl/psort/main.cc b/tests/opencl/psort/main.cc
index 26a42807..b627ceee 100644
--- a/tests/opencl/psort/main.cc
+++ b/tests/opencl/psort/main.cc
@@ -155,9 +155,8 @@ int main (int argc, char **argv) {
   h_a = (int*)malloc(nbytes);
   h_c = (int*)malloc(nbytes);	
 	
-  // Initialize values for array members.  
+  // Generate input values
   for (int i = 0; i < size; ++i) {
-    h_c[i] = 0xdeadbeef;
     if (float_enable) {
       float value = sinf(i)*sinf(i);
       h_a[i] = *(int*)&value;
diff --git a/tests/opencl/sgemm/common.h b/tests/opencl/sgemm/common.h
index 01f68d48..fdb40bce 100644
--- a/tests/opencl/sgemm/common.h
+++ b/tests/opencl/sgemm/common.h
@@ -1,12 +1,8 @@
 #ifndef COMMON_H
 #define COMMON_H
 
-#define USE_FLOAT
-
-#ifdef USE_FLOAT
+#ifndef TYPE
 #define TYPE float
-#else
-#define TYPE int
 #endif
 
 #endif // COMMON_H
\ No newline at end of file
diff --git a/tests/opencl/sgemm/main.cc b/tests/opencl/sgemm/main.cc
index 3ca14792..7a02929f 100644
--- a/tests/opencl/sgemm/main.cc
+++ b/tests/opencl/sgemm/main.cc
@@ -11,6 +11,8 @@
 
 #define KERNEL_NAME "sgemm"
 
+#define FLOAT_ULP 6
+
 #define CL_CHECK(_expr)                                                \
    do {                                                                \
      cl_int _err = _expr;                                              \
@@ -33,6 +35,66 @@
      _ret;                                                             \
    })
 
+template <typename Type>
+class Comparator {};
+
+template <>
+class Comparator<int> {
+public:
+  static const char* type_str() {
+    return "integer";
+  }
+  static int generate() { 
+    return rand(); 
+  }
+  static bool compare(int a, int b, int index, int errors) { 
+    if (a != b) {
+      if (errors < 100) {
+        printf("*** error: [%d] expected=%d, actual=%d\n", index, a, b);
+      }
+      return false;
+    }
+    return true;
+  }  
+};
+
+template <>
+class Comparator<float> {
+public:
+  static const char* type_str() {
+    return "float";
+  }
+  static int generate() { 
+    return static_cast<float>(rand()) / RAND_MAX;
+  }
+  static bool compare(float a, float b, int index, int errors) { 
+    union fi_t { float f; int32_t i; };
+    fi_t fa, fb;
+    fa.f = a;
+    fb.f = b;
+    auto d = std::abs(fa.i - fb.i);
+    if (d > FLOAT_ULP) {
+      if (errors < 100) {
+        printf("*** error: [%d] expected=%f, actual=%f\n", index, a, b);
+      }
+      return false;
+    }
+    return true;
+  }  
+};
+
+/*static void sgemm_cpu(TYPE *C, const TYPE* A, const TYPE *B, int M, int N, int K) {
+  for (int m = 0; m < M; ++m) {
+    for (int n = 0; n < N; ++n) {
+      TYPE acc = 0;
+      for (int k = 0; k < K; ++k) {
+          acc += A[k * M + m] * B[n * K + k];
+      }
+      C[n * M + m] = acc;
+    }
+  }
+}*/
+
 static int read_kernel_file(const char* filename, uint8_t** data, size_t* size) {
   if (nullptr == filename || nullptr == data || 0 == size)
     return -1;
@@ -54,32 +116,6 @@ static int read_kernel_file(const char* filename, uint8_t** data, size_t* size)
   return 0;
 }
 
-/*static void matmul(TYPE *C, const TYPE* A, const TYPE *B, int M, int N, int K) {
-  for (int m = 0; m < M; ++m) {
-    for (int n = 0; n < N; ++n) {
-      TYPE acc = 0;
-      for (int k = 0; k < K; ++k) {
-          acc += A[k * M + m] * B[n * K + k];
-      }
-      C[n * M + m] = acc;
-    }
-  }
-}*/
-
-#ifdef USE_FLOAT
-static bool compare_equal(float a, float b, int ulp = 21) {
-  union fi_t { int i; float f; };
-  fi_t fa, fb;
-  fa.f = a;
-  fb.f = b;
-  return std::abs(fa.i - fb.i) <= ulp;
-}
-#else
-static bool compare_equal(int a, int b, int ulp = 21) {
-  return (a == b);
-}
-#endif
-
 cl_device_id device_id = NULL;
 cl_context context = NULL;
 cl_command_queue commandQueue = NULL;
@@ -145,6 +181,8 @@ int main (int argc, char **argv) {
   // parse command arguments
   parse_args(argc, argv);
 
+  uint32_t num_points = size * size;
+
   cl_platform_id platform_id;
   size_t kernel_size;
   cl_int binary_status;
@@ -163,7 +201,7 @@ int main (int argc, char **argv) {
   context = CL_CHECK2(clCreateContext(NULL, 1, &device_id, NULL, NULL,  &_err));
 
   // Allocate device buffers
-  size_t nbytes = size * size * sizeof(TYPE);
+  size_t nbytes = num_points * sizeof(TYPE);
   a_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY, nbytes, NULL, &_err));
   b_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY, nbytes, NULL, &_err));
   c_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_WRITE_ONLY, nbytes, NULL, &_err));
@@ -194,23 +232,17 @@ int main (int argc, char **argv) {
   h_b = (TYPE*)malloc(nbytes);
   h_c = (TYPE*)malloc(nbytes);	
 	
-  // Initialize values for array members.  
-  for (int i = 0; i < (size * size); ++i) {
-  #ifdef USE_FLOAT
-    h_a[i] = (float)rand() / (float)RAND_MAX;
-    h_b[i] = (float)rand() / (float)RAND_MAX;
-  #else
-    h_a[i] = rand();
-    h_b[i] = rand();
-  #endif
-    h_c[i] = 0xdeadbeef;
+  // Generate input values 
+  for (uint32_t i = 0; i < num_points; ++i) {
+    h_a[i] = Comparator<TYPE>::generate();
+    h_b[i] = Comparator<TYPE>::generate();
   }
 
   size_t global_offset[2] = {0, 0};
   size_t global_work_size[2] = {size, size};
   size_t local_work_size[2] = {1, 1};
 
-  std::vector<float> ref_vec(size * size);
+  std::vector<float> ref_vec(num_points);
 
   // reference generation
   size_t num_groups_y = global_work_size[1] / local_work_size[1];
@@ -228,12 +260,7 @@ int main (int argc, char **argv) {
           TYPE acc = 0;
           for (int k = 0; k < width; k++) {
             acc += h_a[k * width + r] * h_b[c * width + k];
-          }
-        /*#ifdef USE_FLOAT
-          printf("*** r=%d, c=%d, v=%f\n", r, c, acc);
-        #else
-          printf("*** r=%d, c=%d, v=%d\n", r, c, acc);
-        #endif*/                   
+          }                  
           ref_vec[c * width + r] = acc;         
         }
       }
@@ -260,14 +287,8 @@ int main (int argc, char **argv) {
 
   printf("Verify result\n");
   int errors = 0;
-  for (int i = 0; i < (size * size); i++) {
-    if (!compare_equal(h_c[i], ref_vec[i])) {
-      if (errors < 100) 
-      #ifdef USE_FLOAT
-        printf("*** error: [%d] expected=%f, actual=%f\n", i, ref_vec[i], h_c[i]);
-      #else
-        printf("*** error: [%d] expected=%d, actual=%d\n", i, ref_vec[i], h_c[i]);
-      #endif
+  for (uint32_t i = 0; i < num_points; ++i) {
+    if (!Comparator<TYPE>::compare(h_c[i], ref_vec[i], i, errors)) {
       ++errors;
     }
   }
diff --git a/tests/opencl/vecadd/main.cc b/tests/opencl/vecadd/main.cc
index 23aa49b4..992e88be 100644
--- a/tests/opencl/vecadd/main.cc
+++ b/tests/opencl/vecadd/main.cc
@@ -166,12 +166,10 @@ int main (int argc, char **argv) {
   h_b = (float*)malloc(nbytes);
   h_c = (float*)malloc(nbytes);	
 	
-  // Initialize values for array members.  
+  // Generate input values
   for (int i = 0; i < size; ++i) {
     h_a[i] = sinf(i)*sinf(i);
     h_b[i] = cosf(i)*cosf(i);
-    h_c[i] = 0xdeadbeef;
-    //printf("*** [%d]: h_a=%f, h_b=%f\n", i, h_a[i], h_b[i]);
   }
 
   // Creating command queue
diff --git a/tests/regression/demo/main.cpp b/tests/regression/demo/main.cpp
index 63556a5f..f14f66c3 100644
--- a/tests/regression/demo/main.cpp
+++ b/tests/regression/demo/main.cpp
@@ -19,16 +19,6 @@
 
 ///////////////////////////////////////////////////////////////////////////////
 
-union Float_t {    
-    float f;
-    int   i;
-    struct {
-        uint32_t man  : 23;
-        uint32_t exp  : 8;
-        uint32_t sign : 1;
-    } parts;
-};
-
 template <typename Type>
 class Comparator {};
 
@@ -38,22 +28,41 @@ public:
   static const char* type_str() {
     return "integer";
   }
-  static bool compare(int a, int b) { 
-    return a == b; 
+  static int generate() { 
+    return rand(); 
+  }
+  static bool compare(int a, int b, int index, int errors) { 
+    if (a != b) {
+      if (errors < 100) {
+        printf("*** error: [%d] expected=%d, actual=%d\n", index, a, b);
+      }
+      return false;
+    }
+    return true;
   }  
 };
 
 template <>
 class Comparator<float> {
+private:
+  union Float_t { float f; int i; };
 public:
   static const char* type_str() {
     return "float";
   }
-  static bool compare(float a, float b) { 
-    Float_t fa{a}, fb{b};
+  static int generate() { 
+    return static_cast<float>(rand()) / RAND_MAX;
+  }
+  static bool compare(float a, float b, int index, int errors) {     
+    union fi_t { float f; int32_t i; };
+    fi_t fa, fb;
+    fa.f = a;
+    fb.f = b;
     auto d = std::abs(fa.i - fb.i);
     if (d > FLOAT_ULP) {
-      std::cout << "*** almost_equal_ulp: a=" << a << ", b=" << b << ", ulp=" << d << ", ia=" << std::hex << fa.i << ", ib=" << fb.i << std::endl;
+      if (errors < 100) {
+        printf("*** error: [%d] expected=%f, actual=%f\n", index, a, b);
+      }
       return false;
     }
     return true;
@@ -127,9 +136,7 @@ int run_test(const kernel_arg_t& kernel_arg,
     for (uint32_t i = 0; i < num_points; ++i) {
       auto ref = source_data[2 * i + 0] + source_data[2 * i + 1];
       auto cur = buf_ptr[i];
-      if (!Comparator<TYPE>::compare(cur, ref)) {
-        std::cout << "error at result #" << std::dec << i
-                  << std::hex << ": actual 0x" << cur << ", expected 0x" << ref << std::endl;
+      if (!Comparator<TYPE>::compare(cur, ref, i, errors)) {
         ++errors;
       }
     }
@@ -196,8 +203,7 @@ int main(int argc, char *argv[]) {
   // generate source data
   source_data.resize(2 * num_points);
   for (uint32_t i = 0; i < source_data.size(); ++i) {
-    auto r = static_cast<float>(std::rand()) / RAND_MAX;
-    source_data[i] = static_cast<TYPE>(r * 2 * num_points);
+    source_data[i] = Comparator<TYPE>::generate();
   }
 
   // upload source buffer0
diff --git a/tests/regression/tensor/Makefile b/tests/regression/tensor/Makefile
index 790664dc..dbb70c3b 100644
--- a/tests/regression/tensor/Makefile
+++ b/tests/regression/tensor/Makefile
@@ -4,6 +4,6 @@ SRCS = main.cpp
 
 VX_SRCS = kernel.cpp
 
-OPTS ?= -s16
+OPTS ?= -n32
 
 include ../common.mk
\ No newline at end of file
diff --git a/tests/regression/tensor/kernel.cpp b/tests/regression/tensor/kernel.cpp
index 5cf0851c..b0e8f69e 100644
--- a/tests/regression/tensor/kernel.cpp
+++ b/tests/regression/tensor/kernel.cpp
@@ -12,10 +12,10 @@ inline uint32_t log2_fast(uint32_t x) {
 }
 
 void kernel_body(uint32_t task_id, kernel_arg_t* __UNIFORM__ arg) {
-	auto size  = arg->size;
-    auto A = reinterpret_cast<TYPE*>(arg->A_addr);
+	auto A = reinterpret_cast<TYPE*>(arg->A_addr);
 	auto B = reinterpret_cast<TYPE*>(arg->B_addr);
 	auto C = reinterpret_cast<TYPE*>(arg->C_addr);
+    auto size  = arg->size;
 
     uint32_t row, col;
     if (is_log2(size)) {
diff --git a/tests/regression/tensor/main.cpp b/tests/regression/tensor/main.cpp
index d93f3177..81103c10 100644
--- a/tests/regression/tensor/main.cpp
+++ b/tests/regression/tensor/main.cpp
@@ -19,16 +19,6 @@
 
 ///////////////////////////////////////////////////////////////////////////////
 
-union Float_t {    
-    float f;
-    int   i;
-    struct {
-        uint32_t man  : 23;
-        uint32_t exp  : 8;
-        uint32_t sign : 1;
-    } parts;
-};
-
 template <typename Type>
 class Comparator {};
 
@@ -38,8 +28,17 @@ public:
   static const char* type_str() {
     return "integer";
   }
-  static bool compare(int a, int b) { 
-    return a == b; 
+  static int generate() { 
+    return rand(); 
+  }
+  static bool compare(int a, int b, int index, int errors) { 
+    if (a != b) {
+      if (errors < 100) {
+        printf("*** error: [%d] expected=%d, actual=%d\n", index, a, b);
+      }
+      return false;
+    }
+    return true;
   }  
 };
 
@@ -49,18 +48,26 @@ public:
   static const char* type_str() {
     return "float";
   }
-  static bool compare(float a, float b) { 
-    Float_t fa{a}, fb{b};
+  static int generate() { 
+    return static_cast<float>(rand()) / RAND_MAX;
+  }
+  static bool compare(float a, float b, int index, int errors) { 
+    union fi_t { float f; int32_t i; };
+    fi_t fa, fb;
+    fa.f = a;
+    fb.f = b;
     auto d = std::abs(fa.i - fb.i);
     if (d > FLOAT_ULP) {
-      std::cout << "*** almost_equal_ulp: a=" << a << ", b=" << b << ", ulp=" << d << ", ia=" << std::hex << fa.i << ", ib=" << fb.i << std::endl;
+      if (errors < 100) {
+        printf("*** error: [%d] expected=%f, actual=%f\n", index, a, b);
+      }
       return false;
     }
     return true;
   }  
 };
 
-static void cpuMatrixMultiply(TYPE* out, const TYPE* A, const TYPE* B, uint32_t width, uint32_t height) {
+static void matmul_cpu(TYPE* out, const TYPE* A, const TYPE* B, uint32_t width, uint32_t height) {
   for (uint32_t row = 0; row < height; ++row) {
     for (uint32_t col = 0; col < width; ++col) {
       TYPE sum(0);
@@ -73,7 +80,7 @@ static void cpuMatrixMultiply(TYPE* out, const TYPE* A, const TYPE* B, uint32_t
 }
 
 const char* kernel_file = "kernel.bin";
-uint32_t size = 16;
+uint32_t size = 32;
 
 vx_device_h device = nullptr;
 std::vector<uint8_t> staging_buf;
@@ -81,14 +88,14 @@ kernel_arg_t kernel_arg = {};
 
 static void show_usage() {
    std::cout << "Vortex Test." << std::endl;
-   std::cout << "Usage: [-k: kernel] [-s size] [-h: help]" << std::endl;
+   std::cout << "Usage: [-k: kernel] [-n size] [-h: help]" << std::endl;
 }
 
 static void parse_args(int argc, char **argv) {
   int c;
-  while ((c = getopt(argc, argv, "s:k:h?")) != -1) {
+  while ((c = getopt(argc, argv, "n:k:h?")) != -1) {
     switch (c) {
-    case 's':
+    case 'n':
       size = atoi(optarg);
       break;
     case 'k':
@@ -138,9 +145,7 @@ int run_test(const kernel_arg_t& kernel_arg,
     for (uint32_t i = 0; i < refs.size(); ++i) {
       auto ref = refs[i];
       auto cur = buf_ptr[i];
-      if (!Comparator<TYPE>::compare(cur, ref)) {
-        std::cout << "error at result #" << std::dec << i
-                  << std::hex << ": actual 0x" << cur << ", expected 0x" << ref << std::endl;
+      if (!Comparator<TYPE>::compare(cur, ref, i, errors)) {
         ++errors;
       }
     }
@@ -208,7 +213,7 @@ int main(int argc, char *argv[]) {
     src_A[i] = static_cast<TYPE>(a * size);
     src_B[i] = static_cast<TYPE>(b * size);
   }
-  cpuMatrixMultiply(refs.data(), src_A.data(), src_B.data(), size, size);
+  matmul_cpu(refs.data(), src_A.data(), src_B.data(), size, size);
 
   // upload source buffer0
   {

From ede5e1c311f0424b53ac7c33a943c14411ff54ac Mon Sep 17 00:00:00 2001
From: Blaise Tine <tinebp@yahoo.com>
Date: Wed, 15 Nov 2023 00:28:26 -0800
Subject: [PATCH 07/11] minor update

---
 tests/opencl/Makefile          |  6 +--
 tests/opencl/common.mk         |  3 +-
 tests/opencl/oclprintf/main.cc | 20 ++++-----
 tests/opencl/psort/main.cc     | 74 ++++++++++++++++++----------------
 tests/opencl/saxpy/main.cc     | 31 ++++++--------
 tests/opencl/sfilter/main.cc   | 31 ++++++--------
 tests/opencl/sgemm/main.cc     | 20 ++++-----
 tests/opencl/vecadd/main.cc    | 21 +++++-----
 8 files changed, 100 insertions(+), 106 deletions(-)

diff --git a/tests/opencl/Makefile b/tests/opencl/Makefile
index c838c3de..acb96ba9 100644
--- a/tests/opencl/Makefile
+++ b/tests/opencl/Makefile
@@ -36,9 +36,9 @@ run-simx:
 	$(MAKE) -C lbm run-simx
 	$(MAKE) -C oclprintf run-simx
 	$(MAKE) -C blackscholes run-simx
-	$(MAKE) -C matmul run-simx
 	$(MAKE) -C transpose run-simx
 	$(MAKE) -C convolution run-simx
+#	$(MAKE) -C matmul run-simx
 #	$(MAKE) -C vectorhypot run-simx	
 #	$(MAKE) -C mri-q run-simx
 
@@ -59,8 +59,8 @@ run-rtlsim:
 	$(MAKE) -C lbm run-rtlsim
 	$(MAKE) -C oclprintf run-rtlsim
 	$(MAKE) -C blackscholes run-rtlsim
-	$(MAKE) -C matmul run-rtlsim
 	$(MAKE) -C convolution run-rtlsim
+#	$(MAKE) -C matmul run-rtlsim
 #	$(MAKE) -C vectorhypot run-rtlsim
 #	$(MAKE) -C mri-q run-rtlsim
 
@@ -81,8 +81,8 @@ run-opae:
 	$(MAKE) -C lbm run-opae
 	$(MAKE) -C oclprintf run-opae
 	$(MAKE) -C blackscholes run-opae
-	$(MAKE) -C matmul run-opae
 	$(MAKE) -C convolution run-opae
+#	$(MAKE) -C matmul run-opae
 #	$(MAKE) -C vectorhypot run-opae
 #	$(MAKE) -C mri-q run-opae
 
diff --git a/tests/opencl/common.mk b/tests/opencl/common.mk
index ce82dac3..762712b4 100644
--- a/tests/opencl/common.mk
+++ b/tests/opencl/common.mk
@@ -41,13 +41,12 @@ CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors
 CXXFLAGS += -Wno-deprecated-declarations -Wno-unused-parameter -Wno-narrowing
 CXXFLAGS += -pthread
 CXXFLAGS += -I$(POCL_RT_PATH)/include
-LDFLAGS  += -L$(POCL_RT_PATH)/lib -L$(VORTEX_RT_PATH)/stub -lvortex
 
 ifdef HOSTGPU
 	CXXFLAGS += -DHOSTGPU
 	LDFLAGS += -lOpenCL
 else
-	LDFLAGS += $(POCL_RT_PATH)/lib/libOpenCL.so	
+	LDFLAGS += -L$(VORTEX_RT_PATH)/stub -lvortex $(POCL_RT_PATH)/lib/libOpenCL.so
 endif
 
 # Debugigng
diff --git a/tests/opencl/oclprintf/main.cc b/tests/opencl/oclprintf/main.cc
index 184eec96..4af39802 100644
--- a/tests/opencl/oclprintf/main.cc
+++ b/tests/opencl/oclprintf/main.cc
@@ -106,11 +106,6 @@ int main (int argc, char **argv) {
   
   cl_platform_id platform_id;
   size_t kernel_size;
-  cl_int binary_status;
-
-  // read kernel binary from file  
-  if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
-    return -1;
   
   // Getting platform and device information
   CL_CHECK(clGetPlatformIDs(1, &platform_id, NULL));
@@ -124,12 +119,17 @@ int main (int argc, char **argv) {
   a_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY, nbytes, NULL, &_err));
 
   printf("Create program from kernel source\n");
-  program = CL_CHECK2(clCreateProgramWithBinary(
-    context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &_err));
-  if (program == NULL) {
-    cleanup();
+#ifdef HOSTGPU
+  if (0 != read_kernel_file("kernel.cl", &kernel_bin, &kernel_size))
     return -1;
-  }
+  program = CL_CHECK2(clCreateProgramWithSource(
+    context, 1, (const char**)&kernel_bin, &kernel_size, &_err));  
+#else
+  if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
+    return -1;
+  program = CL_CHECK2(clCreateProgramWithBinary(
+    context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, NULL, &_err));
+#endif
 
   // Build program
   CL_CHECK(clBuildProgram(program, 1, &device_id, NULL, NULL, NULL));
diff --git a/tests/opencl/psort/main.cc b/tests/opencl/psort/main.cc
index b627ceee..8bc834dc 100644
--- a/tests/opencl/psort/main.cc
+++ b/tests/opencl/psort/main.cc
@@ -115,11 +115,6 @@ int main (int argc, char **argv) {
   
   cl_platform_id platform_id;
   size_t kernel_size;
-  cl_int binary_status;
-
-  // read kernel binary from file  
-  if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
-    return -1;
   
   // Getting platform and device information
   CL_CHECK(clGetPlatformIDs(1, &platform_id, NULL));
@@ -134,12 +129,17 @@ int main (int argc, char **argv) {
   c_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_WRITE_ONLY, nbytes, NULL, &_err));
 
   printf("Create program from kernel source\n");
-  program = CL_CHECK2(clCreateProgramWithBinary(
-    context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &_err));
-  if (program == NULL) {
-    cleanup();
+#ifdef HOSTGPU
+  if (0 != read_kernel_file("kernel.cl", &kernel_bin, &kernel_size))
     return -1;
-  }
+  program = CL_CHECK2(clCreateProgramWithSource(
+    context, 1, (const char**)&kernel_bin, &kernel_size, &_err));  
+#else
+  if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
+    return -1;
+  program = CL_CHECK2(clCreateProgramWithBinary(
+    context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, NULL, &_err));
+#endif
 
   // Build program
   CL_CHECK(clBuildProgram(program, 1, &device_id, NULL, NULL, NULL));
@@ -159,12 +159,12 @@ int main (int argc, char **argv) {
   for (int i = 0; i < size; ++i) {
     if (float_enable) {
       float value = sinf(i)*sinf(i);
-      h_a[i] = *(int*)&value;
-      printf("*** [%d]: h_a=%f\n", i, value);
+      ((float*)h_a)[i] = value;
+      printf("*** [%d]: %f\n", i, value);
     } else {      
       int value = size*sinf(i);
       h_a[i] = value;
-      printf("*** [%d]: h_a=%d\n", i, value);
+      printf("*** [%d]: %d\n", i, value);
     }
   }
 
@@ -188,38 +188,44 @@ int main (int argc, char **argv) {
   CL_CHECK(clEnqueueReadBuffer(commandQueue, c_memobj, CL_TRUE, 0, nbytes, h_c, 0, NULL, NULL));
 
   printf("Verify result\n");
-  for (int i = 0; i < size; ++i) {
-    int value = h_c[i];
+  for (int i = 0; i < size; ++i) {    
     if (float_enable) {
-      printf("*** [%d]: h_a=%f\n", i, *(float*)&value);
+      float value = ((float*)h_c)[i];
+      printf("*** [%d]: %f\n", i, value);
     } else {
-      printf("*** [%d]: h_a=%d\n", i, value);
+      int value = h_c[i];
+      printf("*** [%d]: %d\n", i, value);
     }
   }
   int errors = 0;
-  for (int i = 0; i < size; ++i) {
-    int ref = h_a[i];
-    float ref_f = *(float*)&ref;
+  for (int i = 0; i < size; ++i) {    
     int pos = 0;
-    for (int j = 0; j < size; ++j) {
-      int cur = h_a[j];
-      if (float_enable) {
-        float cur_f = *(float*)&cur;
-        pos += (cur_f < ref_f) || (cur_f == ref_f && j < i);
-      } else {
+    if (float_enable) {    
+      float ref = ((float*)h_a)[i];
+      for (int j = 0; j < size; ++j) {        
+        float cur = ((float*)h_a)[j];
+        pos += (cur < ref) || (cur == ref && j < i);
+      }        
+      float value = ((float*)h_c)[pos];
+      if (value != ref) {
+        if (errors < 100) {        
+          printf("*** error: [%d] expected=%f, actual=%f\n", pos, ref, value);
+        }        
+        ++errors;
+      }
+    } else {      
+      int ref = h_a[i];
+      for (int j = 0; j < size; ++j) {        
+        int cur = h_a[j];
         pos += (cur < ref) || (cur == ref && j < i);
       }
-    }
-    int value = h_c[pos];
-    if (value != ref) {
-      if (errors < 100) {
-        if (float_enable) {
-          printf("*** error: [%d] expected=%f, actual=%f\n", pos, ref_f, *(float*)&value);
-        } else {
+      int value = h_c[pos];
+      if (value != ref) {
+        if (errors < 100) {        
           printf("*** error: [%d] expected=%d, actual=%d\n", pos, ref, value);
         }
+        ++errors;
       }
-      ++errors;
     }
   }
   if (0 == errors) {
diff --git a/tests/opencl/saxpy/main.cc b/tests/opencl/saxpy/main.cc
index 4ea15759..9355c945 100644
--- a/tests/opencl/saxpy/main.cc
+++ b/tests/opencl/saxpy/main.cc
@@ -151,16 +151,12 @@ int main(int argc, char **argv) {
   
   cl_platform_id platform_id;
   cl_device_id device_id;
+  cl_program program;
   cl_mem input_buffer;
   cl_mem output_buffer;
   size_t kernel_size;
   cl_context context;
   cl_command_queue queue;
-  cl_int binary_status = 0;
-
-  // read kernel binary from file  
-  if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
-    return -1;
 
   // Getting platform and device information
   CL_CHECK(clGetPlatformIDs(1, &platform_id, NULL));
@@ -172,19 +168,18 @@ int main(int argc, char **argv) {
   cl_kernel kernel = 0;
   cl_mem memObjects[2] = {0, 0};
 
-  // Create OpenCL program - first attempt to load cached binary.
-  //  If that is not available, then create the program from source
-  //  and store the binary for future use.
-  printf("create program from binary...\n");
-  cl_program program = CL_CHECK_ERR(clCreateProgramWithBinary(
-    context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &_err));
-  if (program == NULL) {
-    std::cerr << "Failed to write program binary" << std::endl;
-    Cleanup(device_id, context, queue, program, kernel, memObjects);
-    return 1;
-  } else {
-    printf("Read program from binary.\n");
-  }
+  printf("Create program from kernel source\n");
+#ifdef HOSTGPU
+  if (0 != read_kernel_file("kernel.cl", &kernel_bin, &kernel_size))
+    return -1;
+  program = CL_CHECK_ERR(clCreateProgramWithSource(
+    context, 1, (const char**)&kernel_bin, &kernel_size, &_err));  
+#else
+  if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
+    return -1;
+  program = CL_CHECK_ERR(clCreateProgramWithBinary(
+    context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, NULL, &_err));
+#endif
 
   // Build program
   CL_CHECK(clBuildProgram(program, 1, &device_id, NULL, NULL, NULL));
diff --git a/tests/opencl/sfilter/main.cc b/tests/opencl/sfilter/main.cc
index 3a7a5979..0ae264a5 100644
--- a/tests/opencl/sfilter/main.cc
+++ b/tests/opencl/sfilter/main.cc
@@ -149,14 +149,10 @@ int main(int argc, char **argv) {
 
   cl_platform_id platform_id;
   cl_device_id device_id;
+  cl_program program;
   size_t kernel_size;
-  cl_int binary_status = 0;
   uint8_t *kernel_bin = NULL;
 
-  // read kernel binary from file  
-  if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
-    return -1;
-
   // Getting platform and device information
   CL_CHECK(clGetPlatformIDs(1, &platform_id, NULL));
   CL_CHECK(clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, NULL));
@@ -170,19 +166,18 @@ int main(int argc, char **argv) {
   cl_kernel kernel = 0;
   cl_mem memObjects[2] = {0, 0};
 
-  // Create OpenCL program - first attempt to load cached binary.
-  // If that is not available, then create the program from source
-  // and store the binary for future use.
-  printf("create program from binary...\n");
-  cl_program program = CL_CHECK_ERR(clCreateProgramWithBinary(
-    context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &_err));
-  if (program == NULL) {
-    std::cerr << "Failed to write program binary" << std::endl;
-    Cleanup(kernel_bin, device_id, context, queue, program, kernel, memObjects);
-    return 1;
-  } else {
-    printf("Read program from binary.");
-  }
+  printf("Create program from kernel source\n");
+#ifdef HOSTGPU
+  if (0 != read_kernel_file("kernel.cl", &kernel_bin, &kernel_size))
+    return -1;
+  program = CL_CHECK_ERR(clCreateProgramWithSource(
+    context, 1, (const char**)&kernel_bin, &kernel_size, &_err));  
+#else
+  if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
+    return -1;
+  program = CL_CHECK_ERR(clCreateProgramWithBinary(
+    context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, NULL, &_err));
+#endif
 
   // Build program
   CL_CHECK(clBuildProgram(program, 1, &device_id, NULL, NULL, NULL));
diff --git a/tests/opencl/sgemm/main.cc b/tests/opencl/sgemm/main.cc
index 7a02929f..bc48dff0 100644
--- a/tests/opencl/sgemm/main.cc
+++ b/tests/opencl/sgemm/main.cc
@@ -185,13 +185,8 @@ int main (int argc, char **argv) {
 
   cl_platform_id platform_id;
   size_t kernel_size;
-  cl_int binary_status;
 
   srand(50);
-
-  // read kernel binary from file  
-  if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
-    return -1;
   
   // Getting platform and device information
   CL_CHECK(clGetPlatformIDs(1, &platform_id, NULL));
@@ -207,12 +202,17 @@ int main (int argc, char **argv) {
   c_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_WRITE_ONLY, nbytes, NULL, &_err));
 
   printf("Create program from kernel source\n");
-  program = CL_CHECK2(clCreateProgramWithBinary(
-    context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &_err));
-  if (program == NULL) {
-    cleanup();
+#ifdef HOSTGPU
+  if (0 != read_kernel_file("kernel.cl", &kernel_bin, &kernel_size))
     return -1;
-  }
+  program = CL_CHECK2(clCreateProgramWithSource(
+    context, 1, (const char**)&kernel_bin, &kernel_size, &_err));  
+#else
+  if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
+    return -1;
+  program = CL_CHECK2(clCreateProgramWithBinary(
+    context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, NULL, &_err));
+#endif
 
   // Build program
   CL_CHECK(clBuildProgram(program, 1, &device_id, NULL, NULL, NULL));
diff --git a/tests/opencl/vecadd/main.cc b/tests/opencl/vecadd/main.cc
index 992e88be..e443f7c5 100644
--- a/tests/opencl/vecadd/main.cc
+++ b/tests/opencl/vecadd/main.cc
@@ -122,11 +122,6 @@ int main (int argc, char **argv) {
   
   cl_platform_id platform_id;
   size_t kernel_size;
-  cl_int binary_status;
-
-  // read kernel binary from file  
-  if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
-    return -1;
   
   // Getting platform and device information
   CL_CHECK(clGetPlatformIDs(1, &platform_id, NULL));
@@ -142,13 +137,17 @@ int main (int argc, char **argv) {
   c_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_WRITE_ONLY, nbytes, NULL, &_err));
 
   printf("Create program from kernel source\n");
-  cl_int _err;
-  program = clCreateProgramWithBinary(
-    context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &_err);
-  if (program == NULL) {
-    cleanup();
+#ifdef HOSTGPU
+  if (0 != read_kernel_file("kernel.cl", &kernel_bin, &kernel_size))
     return -1;
-  }
+  program = CL_CHECK2(clCreateProgramWithSource(
+    context, 1, (const char**)&kernel_bin, &kernel_size, &_err));  
+#else
+  if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
+    return -1;
+  program = CL_CHECK2(clCreateProgramWithBinary(
+    context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, NULL, &_err));
+#endif
 
   // Build program
   CL_CHECK(clBuildProgram(program, 1, &device_id, NULL, NULL, NULL));

From 2c94e358b8d6cfacb360a18ea1810260507a685a Mon Sep 17 00:00:00 2001
From: Blaise Tine <tinebp@yahoo.com>
Date: Wed, 15 Nov 2023 00:52:39 -0800
Subject: [PATCH 08/11] perf counter bug fix

---
 hw/rtl/core/VX_core.sv | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/rtl/core/VX_core.sv b/hw/rtl/core/VX_core.sv
index d50a3d32..8aaea911 100644
--- a/hw/rtl/core/VX_core.sv
+++ b/hw/rtl/core/VX_core.sv
@@ -267,7 +267,7 @@ module VX_core import VX_gpu_pkg::*; #(
 
     wire [`CLOG2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_rsp_per_cycle;    
 
-    wire perf_icache_pending_read_cycle;
+    wire [1:0] perf_icache_pending_read_cycle;
     wire [`CLOG2(DCACHE_NUM_REQS+1)+1-1:0] perf_dcache_pending_read_cycle;
 
     reg  [`PERF_CTR_BITS-1:0] perf_icache_pending_reads;

From 547d916ae25a207e31827b801635fb60e1e30162 Mon Sep 17 00:00:00 2001
From: Blaise Tine <tinebp@yahoo.com>
Date: Wed, 15 Nov 2023 13:00:06 -0800
Subject: [PATCH 09/11] minor update

---
 hw/rtl/VX_config.vh         | 8 ++++----
 hw/syn/altera/opae/Makefile | 6 +++---
 hw/syn/xilinx/xrt/Makefile  | 6 +++---
 3 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/hw/rtl/VX_config.vh b/hw/rtl/VX_config.vh
index 6ecb3cf4..9d4ea56d 100644
--- a/hw/rtl/VX_config.vh
+++ b/hw/rtl/VX_config.vh
@@ -223,18 +223,18 @@
 
 // Number of ALU units
 `ifndef NUM_ALU_LANES
-`define NUM_ALU_LANES   `UP(`NUM_THREADS / 2)
+`define NUM_ALU_LANES   `NUM_THREADS
 `endif
 `ifndef NUM_ALU_BLOCKS
-`define NUM_ALU_BLOCKS  `UP(`ISSUE_WIDTH / 1)
+`define NUM_ALU_BLOCKS  `ISSUE_WIDTH
 `endif
 
 // Number of FPU units
 `ifndef NUM_FPU_LANES
-`define NUM_FPU_LANES   `UP(`NUM_THREADS / 2)
+`define NUM_FPU_LANES   `NUM_THREADS
 `endif
 `ifndef NUM_FPU_BLOCKS
-`define NUM_FPU_BLOCKS  `UP(`ISSUE_WIDTH / 1)
+`define NUM_FPU_BLOCKS  `ISSUE_WIDTH
 `endif
 
 // Number of LSU units
diff --git a/hw/syn/altera/opae/Makefile b/hw/syn/altera/opae/Makefile
index 0db2015d..56008d42 100644
--- a/hw/syn/altera/opae/Makefile
+++ b/hw/syn/altera/opae/Makefile
@@ -50,9 +50,9 @@ CONFIGS_1c  := -DNUM_CLUSTERS=1 -DNUM_CORES=1
 CONFIGS_2c  := -DNUM_CLUSTERS=1 -DNUM_CORES=2
 CONFIGS_4c  := -DNUM_CLUSTERS=1 -DNUM_CORES=4
 CONFIGS_8c	:= -DNUM_CLUSTERS=1 -DNUM_CORES=8
-CONFIGS_16c	:= -DNUM_CLUSTERS=1 -DNUM_CORES=16 -DL2_ENABLE
-CONFIGS_32c := -DNUM_CLUSTERS=2 -DNUM_CORES=16 -DL2_ENABLE
-CONFIGS_64c := -DNUM_CLUSTERS=4 -DNUM_CORES=16 -DL2_ENABLE
+CONFIGS_16c	:= -DNUM_CLUSTERS=1 -DNUM_CORES=16
+CONFIGS_32c := -DNUM_CLUSTERS=2 -DNUM_CORES=16
+CONFIGS_64c := -DNUM_CLUSTERS=4 -DNUM_CORES=16
 CONFIGS += $(CONFIGS_$(NUM_CORES)c)
 
 # include paths
diff --git a/hw/syn/xilinx/xrt/Makefile b/hw/syn/xilinx/xrt/Makefile
index c8714779..ad8e77a7 100644
--- a/hw/syn/xilinx/xrt/Makefile
+++ b/hw/syn/xilinx/xrt/Makefile
@@ -67,9 +67,9 @@ CONFIGS_1c  := -DNUM_CLUSTERS=1 -DNUM_CORES=1
 CONFIGS_2c  := -DNUM_CLUSTERS=1 -DNUM_CORES=2
 CONFIGS_4c  := -DNUM_CLUSTERS=1 -DNUM_CORES=4
 CONFIGS_8c	:= -DNUM_CLUSTERS=1 -DNUM_CORES=8
-CONFIGS_16c	:= -DNUM_CLUSTERS=1 -DNUM_CORES=16 -DL2_ENABLE
-CONFIGS_32c := -DNUM_CLUSTERS=2 -DNUM_CORES=16 -DL2_ENABLE
-CONFIGS_64c := -DNUM_CLUSTERS=4 -DNUM_CORES=16 -DL2_ENABLE
+CONFIGS_16c	:= -DNUM_CLUSTERS=1 -DNUM_CORES=16
+CONFIGS_32c := -DNUM_CLUSTERS=2 -DNUM_CORES=16
+CONFIGS_64c := -DNUM_CLUSTERS=4 -DNUM_CORES=16
 CONFIGS += $(CONFIGS_$(NUM_CORES)c)
 
 # include paths

From d65cc61df57b99a8a9226474e53ba5ba34819e9a Mon Sep 17 00:00:00 2001
From: Blaise Tine <tinebp@yahoo.com>
Date: Thu, 16 Nov 2023 12:00:37 -0800
Subject: [PATCH 10/11] minor update

---
 hw/rtl/VX_config.vh                  |   6 +-
 hw/rtl/cache/VX_cache_cluster_top.sv | 190 ---------------------------
 hw/rtl/cache/VX_cache_data.sv        |   2 +-
 hw/rtl/cache/VX_cache_top.sv         |   2 +-
 hw/syn/altera/quartus/cache/Makefile |   4 +-
 hw/syn/altera/quartus/core/Makefile  |   2 +-
 6 files changed, 8 insertions(+), 198 deletions(-)
 delete mode 100644 hw/rtl/cache/VX_cache_cluster_top.sv

diff --git a/hw/rtl/VX_config.vh b/hw/rtl/VX_config.vh
index 9d4ea56d..c7b2a2c3 100644
--- a/hw/rtl/VX_config.vh
+++ b/hw/rtl/VX_config.vh
@@ -407,7 +407,7 @@
 
 // Number of Associative Ways
 `ifndef ICACHE_NUM_WAYS
-`define ICACHE_NUM_WAYS 2
+`define ICACHE_NUM_WAYS 1
 `endif
 
 // Dcache Configurable Knobs //////////////////////////////////////////////////
@@ -461,7 +461,7 @@
 
 // Number of Associative Ways
 `ifndef DCACHE_NUM_WAYS
-`define DCACHE_NUM_WAYS 2
+`define DCACHE_NUM_WAYS 1
 `endif
 
 // SM Configurable Knobs //////////////////////////////////////////////////////
@@ -520,7 +520,7 @@
 
 // Number of Associative Ways
 `ifndef L2_NUM_WAYS
-`define L2_NUM_WAYS 4
+`define L2_NUM_WAYS 2
 `endif
 
 // L3cache Configurable Knobs /////////////////////////////////////////////////
diff --git a/hw/rtl/cache/VX_cache_cluster_top.sv b/hw/rtl/cache/VX_cache_cluster_top.sv
deleted file mode 100644
index 500f2c87..00000000
--- a/hw/rtl/cache/VX_cache_cluster_top.sv
+++ /dev/null
@@ -1,190 +0,0 @@
-// Copyright © 2019-2023
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-// http://www.apache.org/licenses/LICENSE-2.0
-// 
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-`include "VX_cache_define.vh"
-
-module VX_cache_cluster_top import VX_gpu_pkg::*; #(
-    parameter `STRING INSTANCE_ID    = "",
-
-    parameter NUM_UNITS             = 2,
-    parameter NUM_INPUTS            = 4,
-    parameter TAG_SEL_IDX           = 0,
-
-    // Number of Word requests per cycle
-    parameter NUM_REQS              = 4,
-
-    // Size of cache in bytes
-    parameter CACHE_SIZE            = 16384, 
-    // Size of line inside a bank in bytes
-    parameter LINE_SIZE             = 16, 
-    // Number of banks
-    parameter NUM_BANKS             = 4,
-    // Number of associative ways
-    parameter NUM_WAYS              = 4,
-    // Size of a word in bytes
-    parameter WORD_SIZE             = 4, 
-
-    // Core Response Queue Size
-    parameter CRSQ_SIZE             = 2,
-    // Miss Reserv Queue Knob
-    parameter MSHR_SIZE             = 16, 
-    // Memory Response Queue Size
-    parameter MRSQ_SIZE             = 0,
-    // Memory Request Queue Size
-    parameter MREQ_SIZE             = 4,
-
-    // Enable cache writeable
-    parameter WRITE_ENABLE          = 1,
-
-    // Request debug identifier
-    parameter UUID_WIDTH            = 0,
-
-    // core request tag size
-    parameter TAG_WIDTH             = UUID_WIDTH + 16,
-
-    // enable bypass for non-cacheable addresses
-    parameter NC_ENABLE             = 1,
-
-    // Core response output register
-    parameter CORE_OUT_REG          = 2,
-
-    // Memory request output register
-    parameter MEM_OUT_REG           = 2,
-
-    parameter NUM_CACHES = `UP(NUM_UNITS),
-    parameter PASSTHRU   = (NUM_UNITS == 0),
-    parameter ARB_TAG_WIDTH = TAG_WIDTH + `ARB_SEL_BITS(NUM_INPUTS, NUM_CACHES),
-    parameter MEM_TAG_WIDTH = PASSTHRU ? (NC_ENABLE ? `CACHE_NC_BYPASS_TAG_WIDTH(NUM_REQS, LINE_SIZE, WORD_SIZE, ARB_TAG_WIDTH) : 
-                                                       `CACHE_BYPASS_TAG_WIDTH(NUM_REQS, LINE_SIZE, WORD_SIZE, ARB_TAG_WIDTH)) : 
-                                          (NC_ENABLE ? `CACHE_NC_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS, NUM_REQS, LINE_SIZE, WORD_SIZE, ARB_TAG_WIDTH) :
-                                                       `CACHE_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS)),
-    parameter MEM_TAG_X_WIDTH = MEM_TAG_WIDTH + `ARB_SEL_BITS(NUM_CACHES, 1)
- ) (    
-    input wire clk,
-    input wire reset,
-
-// PERF
-`ifdef PERF_ENABLE
-    output cache_perf_t cache_perf,
-`endif
-
-    // Core request
-    input  wire [NUM_INPUTS-1:0][NUM_REQS-1:0]                 core_req_valid,
-    input  wire [NUM_INPUTS-1:0][NUM_REQS-1:0]                 core_req_rw,
-    input  wire [NUM_INPUTS-1:0][NUM_REQS-1:0][WORD_SIZE-1:0]  core_req_byteen,
-    input  wire [NUM_INPUTS-1:0][NUM_REQS-1:0][`CS_WORD_ADDR_WIDTH-1:0] core_req_addr,
-    input  wire [NUM_INPUTS-1:0][NUM_REQS-1:0][`CS_WORD_WIDTH-1:0] core_req_data,
-    input  wire [NUM_INPUTS-1:0][NUM_REQS-1:0][TAG_WIDTH-1:0]  core_req_tag,
-    output wire [NUM_INPUTS-1:0][NUM_REQS-1:0]                 core_req_ready,
-
-    // Core response
-    output wire [NUM_INPUTS-1:0][NUM_REQS-1:0]                 core_rsp_valid,
-    output wire [NUM_INPUTS-1:0][NUM_REQS-1:0][`CS_WORD_WIDTH-1:0] core_rsp_data,
-    output wire [NUM_INPUTS-1:0][NUM_REQS-1:0][TAG_WIDTH-1:0]  core_rsp_tag,
-    input  wire [NUM_INPUTS-1:0][NUM_REQS-1:0]                 core_rsp_ready,
-
-    // Memory request
-    output wire                    mem_req_valid,
-    output wire                    mem_req_rw, 
-    output wire [LINE_SIZE-1:0]    mem_req_byteen,
-    output wire [`CS_MEM_ADDR_WIDTH-1:0] mem_req_addr,
-    output wire [`CS_LINE_WIDTH-1:0] mem_req_data, 
-    output wire [MEM_TAG_X_WIDTH-1:0] mem_req_tag, 
-    input  wire                    mem_req_ready,
-    
-    // Memory response
-    input  wire                    mem_rsp_valid, 
-    input  wire [`CS_LINE_WIDTH-1:0] mem_rsp_data,
-    input  wire [MEM_TAG_X_WIDTH-1:0] mem_rsp_tag, 
-    output wire                    mem_rsp_ready
-);
-    VX_mem_bus_if #(
-        .DATA_SIZE (WORD_SIZE),
-        .TAG_WIDTH (TAG_WIDTH)
-    ) core_bus_if[NUM_INPUTS * NUM_REQS]();
-
-    VX_mem_bus_if #(
-        .DATA_SIZE (LINE_SIZE),
-        .TAG_WIDTH (MEM_TAG_X_WIDTH)
-    ) mem_bus_if();
-
-    // Core request
-    for (genvar i = 0; i < NUM_INPUTS; ++i) begin
-        for (genvar r = 0; r < NUM_REQS; ++r) begin
-            assign core_bus_if[i * NUM_REQS + r].req_valid = core_req_valid[i][r];
-            assign core_bus_if[i * NUM_REQS + r].req_data.rw = core_req_rw[i][r];
-            assign core_bus_if[i * NUM_REQS + r].req_data.byteen = core_req_byteen[i][r];
-            assign core_bus_if[i * NUM_REQS + r].req_data.addr = core_req_addr[i][r];
-            assign core_bus_if[i * NUM_REQS + r].req_data.data = core_req_data[i][r];
-            assign core_bus_if[i * NUM_REQS + r].req_data.tag = core_req_tag[i][r];
-            assign core_req_ready[i][r] = core_bus_if[i * NUM_REQS + r].req_ready;
-        end
-    end
-
-    // Core response
-    for (genvar i = 0; i < NUM_INPUTS; ++i) begin
-        for (genvar r = 0; r < NUM_REQS; ++r) begin
-            assign core_rsp_valid[i][r] = core_bus_if[i * NUM_REQS + r].rsp_valid;
-            assign core_rsp_data[i][r] = core_bus_if[i * NUM_REQS + r].rsp_data.data;
-            assign core_rsp_tag[i][r] = core_bus_if[i * NUM_REQS + r].rsp_data.tag;
-            assign core_bus_if[i * NUM_REQS + r].rsp_ready = core_rsp_ready[i][r];
-        end
-    end
-
-    // Memory request
-    assign mem_req_valid = mem_bus_if.req_valid;
-    assign mem_req_rw = mem_bus_if.req_data.rw; 
-    assign mem_req_byteen = mem_bus_if.req_data.byteen;
-    assign mem_req_addr = mem_bus_if.req_data.addr;
-    assign mem_req_data = mem_bus_if.req_data.data;  
-    assign mem_req_tag = mem_bus_if.req_data.tag; 
-    assign mem_bus_if.req_ready = mem_req_ready;
-    
-    // Memory response
-    assign mem_bus_if.rsp_valid = mem_rsp_valid;    
-    assign mem_bus_if.rsp_data.data = mem_rsp_data;
-    assign mem_bus_if.rsp_data.tag = mem_rsp_tag; 
-    assign mem_rsp_ready = mem_bus_if.rsp_ready;
-
-    VX_cache_cluster #(
-        .INSTANCE_ID    (INSTANCE_ID),
-        .NUM_UNITS      (NUM_UNITS),
-        .NUM_INPUTS     (NUM_INPUTS),
-        .TAG_SEL_IDX    (TAG_SEL_IDX),
-        .NUM_REQS       (NUM_REQS),
-        .CACHE_SIZE     (CACHE_SIZE),        
-        .LINE_SIZE      (LINE_SIZE),
-        .NUM_BANKS      (NUM_BANKS),
-        .NUM_WAYS       (NUM_WAYS),
-        .WORD_SIZE      (WORD_SIZE),
-        .CRSQ_SIZE      (CRSQ_SIZE),
-        .MSHR_SIZE      (MSHR_SIZE),
-        .MRSQ_SIZE      (MRSQ_SIZE),
-        .MREQ_SIZE      (MREQ_SIZE),
-        .WRITE_ENABLE   (WRITE_ENABLE),
-        .UUID_WIDTH     (UUID_WIDTH),
-        .TAG_WIDTH      (TAG_WIDTH),
-        .NC_ENABLE      (NC_ENABLE),
-        .CORE_OUT_REG   (CORE_OUT_REG),
-        .MEM_OUT_REG    (MEM_OUT_REG)
-    ) cache (
-    `ifdef PERF_ENABLE
-        .cache_perf     (cache_perf),
-    `endif
-        .clk            (clk),
-        .reset          (reset),
-        .core_bus_if    (core_bus_if),
-        .mem_bus_if     (mem_bus_if)
-    );
-
-endmodule
diff --git a/hw/rtl/cache/VX_cache_data.sv b/hw/rtl/cache/VX_cache_data.sv
index 493e4884..5106d7d5 100644
--- a/hw/rtl/cache/VX_cache_data.sv
+++ b/hw/rtl/cache/VX_cache_data.sv
@@ -93,7 +93,7 @@ module VX_cache_data #(
         assign wren  = fill;
     end
     
-    wire [`CLOG2(NUM_WAYS)-1:0] way_idx;
+    wire [`LOG2UP(NUM_WAYS)-1:0] way_idx;
 
     VX_onehot_encoder #(
         .N (NUM_WAYS)
diff --git a/hw/rtl/cache/VX_cache_top.sv b/hw/rtl/cache/VX_cache_top.sv
index 9e36d9af..9be08dde 100644
--- a/hw/rtl/cache/VX_cache_top.sv
+++ b/hw/rtl/cache/VX_cache_top.sv
@@ -22,7 +22,7 @@ module VX_cache_top #(
     // Size of cache in bytes
     parameter CACHE_SIZE            = 16384, 
     // Size of line inside a bank in bytes
-    parameter LINE_SIZE             = 16, 
+    parameter LINE_SIZE             = 64, 
     // Number of banks
     parameter NUM_BANKS             = 4,
     // Number of associative ways
diff --git a/hw/syn/altera/quartus/cache/Makefile b/hw/syn/altera/quartus/cache/Makefile
index 258dc91a..f96a7614 100755
--- a/hw/syn/altera/quartus/cache/Makefile
+++ b/hw/syn/altera/quartus/cache/Makefile
@@ -1,6 +1,6 @@
-PROJECT = VX_cache_cluster_top
+PROJECT = VX_cache_top
 TOP_LEVEL_ENTITY = $(PROJECT)
-SRC_FILE = VX_cache_cluster.sv
+SRC_FILE = $(PROJECT).sv
 
 include ../../common.mk
 
diff --git a/hw/syn/altera/quartus/core/Makefile b/hw/syn/altera/quartus/core/Makefile
index f1dc07f3..eeeaa523 100644
--- a/hw/syn/altera/quartus/core/Makefile
+++ b/hw/syn/altera/quartus/core/Makefile
@@ -1,6 +1,6 @@
 PROJECT = VX_core_top
 TOP_LEVEL_ENTITY = $(PROJECT)
-SRC_FILE = VX_core.sv
+SRC_FILE = $(PROJECT).sv
 
 include ../../common.mk
 

From 43154cf738b9dfe91afc53c926e3734e02bd9ab6 Mon Sep 17 00:00:00 2001
From: Blaise Tine <tinebp@yahoo.com>
Date: Thu, 16 Nov 2023 23:41:59 -0800
Subject: [PATCH 11/11] minor updates

---
 hw/rtl/core/VX_sfu_unit.sv          |  4 +--
 hw/syn/altera/opae/Makefile         |  6 ++---
 hw/syn/xilinx/xrt/Makefile          |  6 ++---
 hw/unittest/top_modules/Makefile    |  1 -
 tests/opencl/Makefile               | 12 ++++-----
 tests/opencl/lbm/main.cc            |  4 ---
 tests/opencl/spmv/convert_dataset.c |  4 ---
 tests/opencl/spmv/main.cc           |  3 ---
 tests/opencl/stencil/main.cc        | 38 +++++++++--------------------
 9 files changed, 25 insertions(+), 53 deletions(-)

diff --git a/hw/rtl/core/VX_sfu_unit.sv b/hw/rtl/core/VX_sfu_unit.sv
index e94f86fd..fd5dd59f 100644
--- a/hw/rtl/core/VX_sfu_unit.sv
+++ b/hw/rtl/core/VX_sfu_unit.sv
@@ -170,7 +170,7 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
         .NUM_INPUTS (RSP_ARB_SIZE),
         .DATAW      (RSP_ARB_DATAW),
         .ARBITER    ("R"),
-        .OUT_REG    (1)
+        .OUT_REG    (3)
     ) rsp_arb (
         .clk       (clk),
         .reset     (commit_reset), 
@@ -186,7 +186,7 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
     VX_gather_unit #(
         .BLOCK_SIZE (BLOCK_SIZE),
         .NUM_LANES  (NUM_LANES),
-        .OUT_REG    (3)
+        .OUT_REG    (1)
     ) gather_unit (
         .clk           (clk),
         .reset         (commit_reset),
diff --git a/hw/syn/altera/opae/Makefile b/hw/syn/altera/opae/Makefile
index 56008d42..0db2015d 100644
--- a/hw/syn/altera/opae/Makefile
+++ b/hw/syn/altera/opae/Makefile
@@ -50,9 +50,9 @@ CONFIGS_1c  := -DNUM_CLUSTERS=1 -DNUM_CORES=1
 CONFIGS_2c  := -DNUM_CLUSTERS=1 -DNUM_CORES=2
 CONFIGS_4c  := -DNUM_CLUSTERS=1 -DNUM_CORES=4
 CONFIGS_8c	:= -DNUM_CLUSTERS=1 -DNUM_CORES=8
-CONFIGS_16c	:= -DNUM_CLUSTERS=1 -DNUM_CORES=16
-CONFIGS_32c := -DNUM_CLUSTERS=2 -DNUM_CORES=16
-CONFIGS_64c := -DNUM_CLUSTERS=4 -DNUM_CORES=16
+CONFIGS_16c	:= -DNUM_CLUSTERS=1 -DNUM_CORES=16 -DL2_ENABLE
+CONFIGS_32c := -DNUM_CLUSTERS=2 -DNUM_CORES=16 -DL2_ENABLE
+CONFIGS_64c := -DNUM_CLUSTERS=4 -DNUM_CORES=16 -DL2_ENABLE
 CONFIGS += $(CONFIGS_$(NUM_CORES)c)
 
 # include paths
diff --git a/hw/syn/xilinx/xrt/Makefile b/hw/syn/xilinx/xrt/Makefile
index ad8e77a7..c8714779 100644
--- a/hw/syn/xilinx/xrt/Makefile
+++ b/hw/syn/xilinx/xrt/Makefile
@@ -67,9 +67,9 @@ CONFIGS_1c  := -DNUM_CLUSTERS=1 -DNUM_CORES=1
 CONFIGS_2c  := -DNUM_CLUSTERS=1 -DNUM_CORES=2
 CONFIGS_4c  := -DNUM_CLUSTERS=1 -DNUM_CORES=4
 CONFIGS_8c	:= -DNUM_CLUSTERS=1 -DNUM_CORES=8
-CONFIGS_16c	:= -DNUM_CLUSTERS=1 -DNUM_CORES=16
-CONFIGS_32c := -DNUM_CLUSTERS=2 -DNUM_CORES=16
-CONFIGS_64c := -DNUM_CLUSTERS=4 -DNUM_CORES=16
+CONFIGS_16c	:= -DNUM_CLUSTERS=1 -DNUM_CORES=16 -DL2_ENABLE
+CONFIGS_32c := -DNUM_CLUSTERS=2 -DNUM_CORES=16 -DL2_ENABLE
+CONFIGS_64c := -DNUM_CLUSTERS=4 -DNUM_CORES=16 -DL2_ENABLE
 CONFIGS += $(CONFIGS_$(NUM_CORES)c)
 
 # include paths
diff --git a/hw/unittest/top_modules/Makefile b/hw/unittest/top_modules/Makefile
index 2d0319e7..7445381e 100644
--- a/hw/unittest/top_modules/Makefile
+++ b/hw/unittest/top_modules/Makefile
@@ -56,7 +56,6 @@ PROJECT = top_modules
 all: build
 	
 build: $(SRCS)
-	verilator --build $(VL_FLAGS) --cc VX_cache_cluster_top --top-module VX_cache_cluster_top $^ -CFLAGS '$(CXXFLAGS)'
 	verilator --build $(VL_FLAGS) --cc VX_cache_top --top-module VX_cache_top $^ -CFLAGS '$(CXXFLAGS)'
 	verilator --build $(VL_FLAGS) --cc VX_core_top --top-module VX_core_top $^ -CFLAGS '$(CXXFLAGS)'
 
diff --git a/tests/opencl/Makefile b/tests/opencl/Makefile
index acb96ba9..88236559 100644
--- a/tests/opencl/Makefile
+++ b/tests/opencl/Makefile
@@ -31,16 +31,16 @@ run-simx:
 	$(MAKE) -C dotproduct run-simx
 	$(MAKE) -C kmeans run-simx
 	$(MAKE) -C spmv run-simx
-	$(MAKE) -C cutcp run-simx	
 	$(MAKE) -C stencil run-simx
 	$(MAKE) -C lbm run-simx
 	$(MAKE) -C oclprintf run-simx
 	$(MAKE) -C blackscholes run-simx
 	$(MAKE) -C transpose run-simx
 	$(MAKE) -C convolution run-simx
-#	$(MAKE) -C matmul run-simx
-#	$(MAKE) -C vectorhypot run-simx	
-#	$(MAKE) -C mri-q run-simx
+	$(MAKE) -C cutcp run-simx
+	$(MAKE) -C matmul run-simx
+	$(MAKE) -C vectorhypot run-simx	
+	$(MAKE) -C mri-q run-simx
 
 run-rtlsim:
 	$(MAKE) -C vecadd run-rtlsim
@@ -54,12 +54,12 @@ run-rtlsim:
 	$(MAKE) -C kmeans run-rtlsim
 	$(MAKE) -C spmv run-rtlsim
 	$(MAKE) -C transpose run-rtlsim
-	$(MAKE) -C cutcp run-rtlsim
 	$(MAKE) -C stencil run-rtlsim
 	$(MAKE) -C lbm run-rtlsim
 	$(MAKE) -C oclprintf run-rtlsim
 	$(MAKE) -C blackscholes run-rtlsim
 	$(MAKE) -C convolution run-rtlsim
+#	$(MAKE) -C cutcp run-rtlsim
 #	$(MAKE) -C matmul run-rtlsim
 #	$(MAKE) -C vectorhypot run-rtlsim
 #	$(MAKE) -C mri-q run-rtlsim
@@ -76,12 +76,12 @@ run-opae:
 	$(MAKE) -C kmeans run-opae
 	$(MAKE) -C spmv run-opae
 	$(MAKE) -C transpose run-opae
-	$(MAKE) -C cutcp run-opae	
 	$(MAKE) -C stencil run-opae
 	$(MAKE) -C lbm run-opae
 	$(MAKE) -C oclprintf run-opae
 	$(MAKE) -C blackscholes run-opae
 	$(MAKE) -C convolution run-opae
+#	$(MAKE) -C cutcp run-opae
 #	$(MAKE) -C matmul run-opae
 #	$(MAKE) -C vectorhypot run-opae
 #	$(MAKE) -C mri-q run-opae
diff --git a/tests/opencl/lbm/main.cc b/tests/opencl/lbm/main.cc
index 1d825239..58a930e9 100644
--- a/tests/opencl/lbm/main.cc
+++ b/tests/opencl/lbm/main.cc
@@ -173,14 +173,10 @@ void MAIN_initialize(const MAIN_Param *param, const OpenCL_Param *prm) {
 
 	pb_SwitchToTimer(&timers, pb_TimerID_COPY);
 
-	printf("OK+\n");
-
   // Setup DEVICE datastructures
   OpenCL_LBM_allocateGrid(prm, &OpenCL_srcGrid);
   OpenCL_LBM_allocateGrid(prm, &OpenCL_dstGrid);
 
-	printf("OK-\n");
-
   // Initialize DEVICE datastructures
   OpenCL_LBM_initializeGrid(prm, OpenCL_srcGrid, TEMP_srcGrid);
   OpenCL_LBM_initializeGrid(prm, OpenCL_dstGrid, TEMP_dstGrid);
diff --git a/tests/opencl/spmv/convert_dataset.c b/tests/opencl/spmv/convert_dataset.c
index 122d8819..aba9c3b3 100644
--- a/tests/opencl/spmv/convert_dataset.c
+++ b/tests/opencl/spmv/convert_dataset.c
@@ -91,15 +91,11 @@ int coo_to_jds(char *mtx_filename, int pad_rows, int warp_size, int pack_size,
   if ((f = fopen(mtx_filename, "r")) == NULL)
     exit(1);
 
-  printf("OK**\n");
-
   if (mm_read_banner(f, &matcode) != 0) {
     printf("Could not process Matrix Market banner.\n");
     exit(1);
   }
 
-	printf("OK**\n");
-
   /*  This is how one can screen matrix types if their application */
   /*  only supports a subset of the Matrix Market data types.      */
 
diff --git a/tests/opencl/spmv/main.cc b/tests/opencl/spmv/main.cc
index 85182322..01aa43cd 100644
--- a/tests/opencl/spmv/main.cc
+++ b/tests/opencl/spmv/main.cc
@@ -148,7 +148,6 @@ int main(int argc, char **argv) {
   //    &h_data, &h_indices, &h_ptr,
   //    &h_perm, &h_nzcnt);
   int col_count;
-	printf("OK--\n");
   coo_to_jds(parameters->inpFiles[0], // bcsstk32.mtx, fidapm05.mtx, jgl009.mtx
              1,                       // row padding
              pad,                     // warp size
@@ -159,8 +158,6 @@ int main(int argc, char **argv) {
              &h_data, &h_ptr, &h_nzcnt, &h_indices, &h_perm, &col_count, &dim,
              &len, &nzcnt_len, &depth);
 
-	printf("OK++\n");
-
   //	pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
   h_Ax_vector = (float *)malloc(sizeof(float) * dim);
   h_x_vector = (float *)malloc(sizeof(float) * dim);
diff --git a/tests/opencl/stencil/main.cc b/tests/opencl/stencil/main.cc
index a68bd5a3..cbbed6bc 100644
--- a/tests/opencl/stencil/main.cc
+++ b/tests/opencl/stencil/main.cc
@@ -157,9 +157,7 @@ int main(int argc, char** argv) {
 	CHECK_ERROR("clBuildProgram")
 
 	cl_kernel clKernel = clCreateKernel(clProgram,"naive_kernel",&clStatus);
-	CHECK_ERROR("clCreateKernel") 
-
-  printf("OK+\n");
+	CHECK_ERROR("clCreateKernel")
 
 	//host data
 	float *h_A0;
@@ -177,15 +175,11 @@ int main(int argc, char** argv) {
 	h_Anext=(float*)malloc(sizeof(float)*size);
 	pb_SwitchToTimer(&timers, pb_TimerID_IO);
 	//FILE *fp = fopen(parameters->inpFiles[0], "rb");
-	printf("OK+\n");
 	read_data(h_A0, nx,ny,nz,NULL);
-	printf("OK+\n");
-  //fclose(fp);
-  memcpy (h_Anext,h_A0,sizeof(float)*size);
+  	//fclose(fp);
+ 	memcpy (h_Anext,h_A0,sizeof(float)*size);
 
 	pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-
-	printf("OK+\n");
 	
 	//memory allocation
 	d_A0 = clCreateBuffer(clContext,CL_MEM_READ_WRITE,size*sizeof(float),NULL,&clStatus);
@@ -201,18 +195,16 @@ int main(int argc, char** argv) {
 
 	pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
 
-	printf("OK+\n");
-
 	//only use 1D thread block
-  int tx = 128;
+  	int tx = 128;
 	size_t block[3] = {tx,1,1};
 	size_t grid[3] = {(nx-2+tx-1)/tx*tx,ny-2,nz-2};
-  //size_t grid[3] = {nx-2,ny-2,nz-2};
-  size_t offset[3] = {1,1,1};
-  printf("grid size in x/y/z = %d %d %d\n",grid[0],grid[1],grid[2]);
+  	//size_t grid[3] = {nx-2,ny-2,nz-2};
+  	size_t offset[3] = {1,1,1};
+  	printf("grid size in x/y/z = %d %d %d\n",grid[0],grid[1],grid[2]);
 	printf("block size in x/y/z = %d %d %d\n",block[0],block[1],block[2]);
   
-  printf ("blocks = %d\n", (grid[0]/block[0])*(grid[1]/block[1])*(grid[2]*block[2]));
+  	printf ("blocks = %d\n", (grid[0]/block[0])*(grid[1]/block[1])*(grid[2]*block[2]));
 
 	clStatus = clSetKernelArg(clKernel,0,sizeof(float),(void*)&c0);
 	clStatus = clSetKernelArg(clKernel,1,sizeof(float),(void*)&c1);
@@ -226,14 +218,10 @@ int main(int argc, char** argv) {
 	//main execution
 	pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);
 
-	printf("OK+0\n");
-
 	int t;
 	for(t=0;t<iteration;t++)
 	{
 		clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,3,NULL,grid,block,0,NULL,NULL);
-		printf("OK+0\n");
-
     //printf("iteration %d\n",t)
 		CHECK_ERROR("clEnqueueNDRangeKernel")
     
@@ -244,11 +232,9 @@ int main(int argc, char** argv) {
     clStatus = clSetKernelArg(clKernel,3,sizeof(cl_mem),(void*)&d_Anext);
 	}
 
-  printf("OK+1\n");
-
-  cl_mem d_temp = d_A0;
-  d_A0 = d_Anext;
-  d_Anext = d_temp;
+	cl_mem d_temp = d_A0;
+	d_A0 = d_Anext;
+	d_Anext = d_temp;
 
 	pb_SwitchToTimer(&timers, pb_TimerID_COPY);
 	clStatus = clEnqueueReadBuffer(clCommandQueue,d_Anext,CL_TRUE,0,size*sizeof(float),h_Anext,0,NULL,NULL);
@@ -260,8 +246,6 @@ int main(int argc, char** argv) {
 	clStatus = clReleaseCommandQueue(clCommandQueue);
 	clStatus = clReleaseContext(clContext);
 	CHECK_ERROR("clReleaseContext")
-
-	printf("OK+2\n");
  
 	if (parameters->outFile) {
 		pb_SwitchToTimer(&timers, pb_TimerID_IO);