From 62cdd8e99385ce5470c96b7ce1808080d11e740f Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Sat, 11 Nov 2023 15:49:39 -0800 Subject: [PATCH 01/11] minor update --- .travis.yml | 6 ++-- ci/toolchain_env.sh | 7 ++--- docs/fpga_setup.md | 3 -- hw/syn/xilinx/test/kernel/Makefile | 5 +-- kernel/Makefile | 7 +++-- tests/kernel/common.mk | 49 ++++++++++++++++++++++++++++++ tests/kernel/conform/Makefile | 49 +----------------------------- tests/kernel/fibonacci/Makefile | 49 +----------------------------- tests/kernel/hello/Makefile | 49 +----------------------------- tests/opencl/common.mk | 13 ++++---- tests/regression/common.mk | 8 +++-- tests/unittest/common.mk | 30 ++++++++++++++++++ tests/unittest/vx_malloc/Makefile | 31 +------------------ 13 files changed, 107 insertions(+), 199 deletions(-) create mode 100644 tests/kernel/common.mk create mode 100644 tests/unittest/common.mk diff --git a/.travis.yml b/.travis.yml index f719ce86..236ed3b7 100644 --- a/.travis.yml +++ b/.travis.yml @@ -38,7 +38,7 @@ jobs: - rm -rf $HOME/build32 && cp -r $PWD $HOME/build32 - rm -rf $HOME/build64 && cp -r $PWD $HOME/build64 - make -C $HOME/build32 - - XLEN=64 RISCV_TOOLCHAIN_PATH=$TOOLDIR/riscv64-gnu-toolchain make -C $HOME/build64 + - XLEN=64 make -C $HOME/build64 - stage: test name: unittest script: cp -r $HOME/build32 build && cd build && ./ci/travis_run.py ./ci/regression.sh --unittest @@ -47,13 +47,13 @@ jobs: script: cp -r $HOME/build32 build && cd build && ./ci/travis_run.py ./ci/regression.sh --isa - stage: test name: isa64 - script: cp -r $HOME/build64 build && cd build && XLEN=64 RISCV_TOOLCHAIN_PATH=$TOOLDIR/riscv64-gnu-toolchain ./ci/travis_run.py ./ci/regression.sh --isa + script: cp -r $HOME/build64 build && cd build && XLEN=64 ./ci/travis_run.py ./ci/regression.sh --isa - stage: test name: regression script: cp -r $HOME/build32 build && cd build && ./ci/travis_run.py ./ci/regression.sh --regression - stage: test name: regression64 - script: cp -r $HOME/build64 build && cd build && XLEN=64 RISCV_TOOLCHAIN_PATH=$TOOLDIR/riscv64-gnu-toolchain ./ci/travis_run.py ./ci/regression.sh --regression + script: cp -r $HOME/build64 build && cd build && XLEN=64 ./ci/travis_run.py ./ci/regression.sh --regression - stage: test name: opencl script: cp -r $HOME/build32 build && cd build && ./ci/travis_run.py ./ci/regression.sh --opencl diff --git a/ci/toolchain_env.sh b/ci/toolchain_env.sh index 4046a6a7..440a899e 100644 --- a/ci/toolchain_env.sh +++ b/ci/toolchain_env.sh @@ -16,14 +16,11 @@ TOOLDIR=${TOOLDIR:=/opt} -export RISCV_TOOLCHAIN_PATH=$TOOLDIR/riscv-gnu-toolchain -export LLVM_POCL=$TOOLDIR/llvm-pocl -export LLVM_VORTEX=$TOOLDIR/llvm-vortex export VERILATOR_ROOT=$TOOLDIR/verilator export PATH=$VERILATOR_ROOT/bin:$PATH + export SV2V_PATH=$TOOLDIR/sv2v export PATH=$SV2V_PATH/bin:$PATH + export YOSYS_PATH=$TOOLDIR/yosys export PATH=$YOSYS_PATH/bin:$PATH -export POCL_CC_PATH=$TOOLDIR/pocl/compiler -export POCL_RT_PATH=$TOOLDIR/pocl/runtime diff --git a/docs/fpga_setup.md b/docs/fpga_setup.md index 61ff481f..88e0c3c0 100644 --- a/docs/fpga_setup.md +++ b/docs/fpga_setup.md @@ -9,9 +9,6 @@ OPAE Environment Setup $ export C_INCLUDE_PATH=$OPAE_HOME/include:$C_INCLUDE_PATH $ export LIBRARY_PATH=$OPAE_HOME/lib:$LIBRARY_PATH $ export LD_LIBRARY_PATH=$OPAE_HOME/lib:$LD_LIBRARY_PATH - $ export RISCV_TOOLCHAIN_PATH=/opt/riscv-gnu-toolchain - $ export PATH=:/opt/verilator/bin:$PATH - $ export VERILATOR_ROOT=/opt/verilator OPAE Build ------------------ diff --git a/hw/syn/xilinx/test/kernel/Makefile b/hw/syn/xilinx/test/kernel/Makefile index 55c21aa2..11457ab4 100644 --- a/hw/syn/xilinx/test/kernel/Makefile +++ b/hw/syn/xilinx/test/kernel/Makefile @@ -1,10 +1,11 @@ XLEN ?= 32 +TOOLDIR ?= /opt ifeq ($(XLEN),64) -RISCV_TOOLCHAIN_PATH ?= /opt/riscv64-gnu-toolchain +RISCV_TOOLCHAIN_PATH ?= $(TOOLDIR)/riscv64-gnu-toolchain CFLAGS += -march=rv64imafd -mabi=lp64d else -RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain +RISCV_TOOLCHAIN_PATH ?= $(TOOLDIR)/riscv-gnu-toolchain CFLAGS += -march=rv32imaf -mabi=ilp32f endif diff --git a/kernel/Makefile b/kernel/Makefile index e4c975dc..07b8c97b 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -1,17 +1,18 @@ XLEN ?= 32 +TOOLDIR ?= /opt ifeq ($(XLEN),64) -RISCV_TOOLCHAIN_PATH ?= /opt/riscv64-gnu-toolchain +RISCV_TOOLCHAIN_PATH ?= $(TOOLDIR)/riscv64-gnu-toolchain CFLAGS += -march=rv64imafd -mabi=lp64d else -RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain +RISCV_TOOLCHAIN_PATH ?= $(TOOLDIR)/riscv-gnu-toolchain CFLAGS += -march=rv32imaf -mabi=ilp32f endif RISCV_PREFIX ?= riscv$(XLEN)-unknown-elf RISCV_SYSROOT ?= $(RISCV_TOOLCHAIN_PATH)/$(RISCV_PREFIX) -LLVM_VORTEX ?= /opt/llvm-vortex +LLVM_VORTEX ?= $(TOOLDIR)/llvm-vortex LLVM_CFLAGS += --sysroot=$(RISCV_SYSROOT) LLVM_CFLAGS += --gcc-toolchain=$(RISCV_TOOLCHAIN_PATH) diff --git a/tests/kernel/common.mk b/tests/kernel/common.mk new file mode 100644 index 00000000..7bf4b520 --- /dev/null +++ b/tests/kernel/common.mk @@ -0,0 +1,49 @@ +XLEN ?= 32 +TOOLDIR ?= /opt + +ifeq ($(XLEN),64) +RISCV_TOOLCHAIN_PATH ?= $(TOOLDIR)/riscv64-gnu-toolchain +CFLAGS += -march=rv64imafd -mabi=lp64d +else +RISCV_TOOLCHAIN_PATH ?= $(TOOLDIR)/riscv-gnu-toolchain +CFLAGS += -march=rv32imaf -mabi=ilp32f +endif + +RISCV_PREFIX ?= riscv$(XLEN)-unknown-elf + +VORTEX_KN_PATH ?= $(realpath ../../../kernel) + +CC = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-gcc +AR = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-gcc-ar +DP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objdump +CP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objcopy + +SIM_DIR = ../../../sim + +CFLAGS += -O3 -mcmodel=medany -fno-exceptions -nostartfiles -fdata-sections -ffunction-sections +CFLAGS += -I$(VORTEX_KN_PATH)/include -I$(VORTEX_KN_PATH)/../hw + +LDFLAGS += -lm -Wl,-Bstatic,--gc-sections,-T,$(VORTEX_KN_PATH)/linker/vx_link$(XLEN).ld,--defsym=STARTUP_ADDR=0x80000000 $(VORTEX_KN_PATH)/libvortexrt.a + +all: $(PROJECT).elf $(PROJECT).bin $(PROJECT).dump + +$(PROJECT).dump: $(PROJECT).elf + $(DP) -D $(PROJECT).elf > $(PROJECT).dump + +$(PROJECT).bin: $(PROJECT).elf + $(CP) -O binary $(PROJECT).elf $(PROJECT).bin + +$(PROJECT).elf: $(SRCS) + $(CC) $(CFLAGS) $(SRCS) $(LDFLAGS) -o $(PROJECT).elf + +run-rtlsim: $(PROJECT).bin + $(SIM_DIR)/rtlsim/rtlsim $(PROJECT).bin + +run-simx: $(PROJECT).bin + $(SIM_DIR)/simx/simx $(PROJECT).bin + +.depend: $(SRCS) + $(CC) $(CFLAGS) -MM $^ > .depend; + +clean: + rm -rf *.elf *.bin *.dump .depend diff --git a/tests/kernel/conform/Makefile b/tests/kernel/conform/Makefile index e9897125..ee96978f 100644 --- a/tests/kernel/conform/Makefile +++ b/tests/kernel/conform/Makefile @@ -1,52 +1,5 @@ -XLEN ?= 32 - -ifeq ($(XLEN),64) -RISCV_TOOLCHAIN_PATH ?= /opt/riscv64-gnu-toolchain -CFLAGS += -march=rv64imafd -mabi=lp64d -else -RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain -CFLAGS += -march=rv32imaf -mabi=ilp32f -endif - -RISCV_PREFIX ?= riscv$(XLEN)-unknown-elf - -VORTEX_KN_PATH ?= $(realpath ../../../kernel) - -CC = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-gcc -AR = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-gcc-ar -DP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objdump -CP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objcopy - -SIM_DIR = ../../../sim - -CFLAGS += -O3 -mcmodel=medany -fno-exceptions -nostartfiles -fdata-sections -ffunction-sections -CFLAGS += -I$(VORTEX_KN_PATH)/include -I$(VORTEX_KN_PATH)/../hw - -LDFLAGS += -lm -Wl,-Bstatic,--gc-sections,-T,$(VORTEX_KN_PATH)/linker/vx_link$(XLEN).ld,--defsym=STARTUP_ADDR=0x80000000 $(VORTEX_KN_PATH)/libvortexrt.a - PROJECT = conform SRCS = main.cpp tests.cpp -all: $(PROJECT).elf $(PROJECT).bin $(PROJECT).dump - -$(PROJECT).dump: $(PROJECT).elf - $(DP) -D $(PROJECT).elf > $(PROJECT).dump - -$(PROJECT).bin: $(PROJECT).elf - $(CP) -O binary $(PROJECT).elf $(PROJECT).bin - -$(PROJECT).elf: $(SRCS) - $(CC) $(CFLAGS) $(SRCS) $(LDFLAGS) -o $(PROJECT).elf - -run-rtlsim: $(PROJECT).bin - $(SIM_DIR)/rtlsim/rtlsim $(PROJECT).bin - -run-simx: $(PROJECT).bin - $(SIM_DIR)/simx/simx $(PROJECT).bin - -.depend: $(SRCS) - $(CC) $(CFLAGS) -MM $^ > .depend; - -clean: - rm -rf *.elf *.bin *.dump .depend +include ../common.mk diff --git a/tests/kernel/fibonacci/Makefile b/tests/kernel/fibonacci/Makefile index 1338b4ab..d4486c74 100644 --- a/tests/kernel/fibonacci/Makefile +++ b/tests/kernel/fibonacci/Makefile @@ -1,52 +1,5 @@ -XLEN ?= 32 - -ifeq ($(XLEN),64) -RISCV_TOOLCHAIN_PATH ?= /opt/riscv64-gnu-toolchain -CFLAGS += -march=rv64imafd -mabi=lp64d -else -RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain -CFLAGS += -march=rv32imaf -mabi=ilp32f -endif - -RISCV_PREFIX ?= riscv$(XLEN)-unknown-elf - -VORTEX_KN_PATH ?= $(realpath ../../../kernel) - -CC = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-gcc -AR = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-gcc-ar -DP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objdump -CP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objcopy - -SIM_DIR = ../../../sim - -CFLAGS += -O3 -mcmodel=medany -fno-exceptions -nostartfiles -fdata-sections -ffunction-sections -CFLAGS += -I$(VORTEX_KN_PATH)/include -I$(VORTEX_KN_PATH)/../hw - -LDFLAGS += -lm -Wl,-Bstatic,--gc-sections,-T,$(VORTEX_KN_PATH)/linker/vx_link$(XLEN).ld,--defsym=STARTUP_ADDR=0x80000000 $(VORTEX_KN_PATH)/libvortexrt.a - PROJECT = fibonacci SRCS = main.cpp -all: $(PROJECT).elf $(PROJECT).bin $(PROJECT).dump - -$(PROJECT).dump: $(PROJECT).elf - $(DP) -D $(PROJECT).elf > $(PROJECT).dump - -$(PROJECT).bin: $(PROJECT).elf - $(CP) -O binary $(PROJECT).elf $(PROJECT).bin - -$(PROJECT).elf: $(SRCS) - $(CC) $(CFLAGS) $(SRCS) $(LDFLAGS) -o $(PROJECT).elf - -run-rtlsim: $(PROJECT).bin - $(SIM_DIR)/rtlsim/rtlsim $(PROJECT).bin - -run-simx: $(PROJECT).bin - $(SIM_DIR)/simx/simx $(PROJECT).bin - -.depend: $(SRCS) - $(CC) $(CFLAGS) -MM $^ > .depend; - -clean: - rm -rf *.elf *.bin *.dump .depend +include ../common.mk diff --git a/tests/kernel/hello/Makefile b/tests/kernel/hello/Makefile index 42d95256..4cff6cbd 100644 --- a/tests/kernel/hello/Makefile +++ b/tests/kernel/hello/Makefile @@ -1,52 +1,5 @@ -XLEN ?= 32 - -ifeq ($(XLEN),64) -RISCV_TOOLCHAIN_PATH ?= /opt/riscv64-gnu-toolchain -CFLAGS += -march=rv64imafd -mabi=lp64d -else -RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain -CFLAGS += -march=rv32imaf -mabi=ilp32f -endif - -RISCV_PREFIX ?= riscv$(XLEN)-unknown-elf - -VORTEX_KN_PATH ?= $(realpath ../../../kernel) - -CC = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-gcc -AR = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-gcc-ar -DP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objdump -CP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objcopy - -SIM_DIR = ../../../sim - -CFLAGS += -O3 -v -mcmodel=medany -fno-exceptions -nostartfiles -fdata-sections -ffunction-sections -CFLAGS += -I$(VORTEX_KN_PATH)/include -I$(VORTEX_KN_PATH)/../hw - -LDFLAGS += -lm -Wl,-Bstatic,--gc-sections,-T,$(VORTEX_KN_PATH)/linker/vx_link$(XLEN).ld,--defsym=STARTUP_ADDR=0x80000000 $(VORTEX_KN_PATH)/libvortexrt.a - PROJECT = hello SRCS = main.cpp -all: $(PROJECT).elf $(PROJECT).bin $(PROJECT).dump - -$(PROJECT).dump: $(PROJECT).elf - $(DP) -D $(PROJECT).elf > $(PROJECT).dump - -$(PROJECT).bin: $(PROJECT).elf - $(CP) -O binary $(PROJECT).elf $(PROJECT).bin - -$(PROJECT).elf: $(SRCS) - $(CC) $(CFLAGS) $(SRCS) $(LDFLAGS) -o $(PROJECT).elf - -run-rtlsim: $(PROJECT).bin - $(SIM_DIR)/rtlsim/rtlsim $(PROJECT).bin - -run-simx: $(PROJECT).bin - $(SIM_DIR)/simx/simx $(PROJECT).bin - -.depend: $(SRCS) - $(CC) $(CFLAGS) -MM $^ > .depend; - -clean: - rm -rf *.elf *.bin *.dump .depend +include ../common.mk diff --git a/tests/opencl/common.mk b/tests/opencl/common.mk index fc55ee0a..ce82dac3 100644 --- a/tests/opencl/common.mk +++ b/tests/opencl/common.mk @@ -1,4 +1,5 @@ XLEN ?= 32 +TOOLDIR ?= /opt TARGET ?= opaesim @@ -6,12 +7,12 @@ XRT_SYN_DIR ?= ../../../hw/syn/xilinx/xrt XRT_DEVICE_INDEX ?= 0 ifeq ($(XLEN),64) -RISCV_TOOLCHAIN_PATH ?= /opt/riscv64-gnu-toolchain +RISCV_TOOLCHAIN_PATH ?= $(TOOLDIR)/riscv64-gnu-toolchain VX_CFLAGS += -march=rv64imafd -mabi=lp64d K_CFLAGS += -march=rv64imafd -mabi=ilp64d STARTUP_ADDR ?= 0x180000000 else -RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain +RISCV_TOOLCHAIN_PATH ?= $(TOOLDIR)/riscv-gnu-toolchain VX_CFLAGS += -march=rv32imaf -mabi=ilp32f K_CFLAGS += -march=rv32imaf -mabi=ilp32f STARTUP_ADDR ?= 0x80000000 @@ -20,16 +21,16 @@ endif RISCV_PREFIX ?= riscv$(XLEN)-unknown-elf RISCV_SYSROOT ?= $(RISCV_TOOLCHAIN_PATH)/$(RISCV_PREFIX) -POCL_CC_PATH ?= /opt/pocl/compiler -POCL_RT_PATH ?= /opt/pocl/runtime +POCL_CC_PATH ?= $(TOOLDIR)/pocl/compiler +POCL_RT_PATH ?= $(TOOLDIR)/pocl/runtime VORTEX_RT_PATH ?= $(realpath ../../../runtime) VORTEX_KN_PATH ?= $(realpath ../../../kernel) FPGA_BIN_DIR ?= $(VORTEX_RT_PATH)/opae -LLVM_VORTEX ?= /opt/llvm-vortex -LLVM_POCL ?= /opt/llvm-vortex +LLVM_VORTEX ?= $(TOOLDIR)/llvm-vortex +LLVM_POCL ?= $(TOOLDIR)/llvm-vortex K_CFLAGS += -v -O3 --sysroot=$(RISCV_SYSROOT) --gcc-toolchain=$(RISCV_TOOLCHAIN_PATH) -Xclang -target-feature -Xclang +vortex K_CFLAGS += -fno-rtti -fno-exceptions -nostartfiles -fdata-sections -ffunction-sections diff --git a/tests/regression/common.mk b/tests/regression/common.mk index 006d6668..6a858edc 100644 --- a/tests/regression/common.mk +++ b/tests/regression/common.mk @@ -1,16 +1,18 @@ XLEN ?= 32 +TOOLDIR ?= /opt + TARGET ?= opaesim XRT_SYN_DIR ?= ../../../hw/syn/xilinx/xrt XRT_DEVICE_INDEX ?= 0 ifeq ($(XLEN),64) -RISCV_TOOLCHAIN_PATH ?= /opt/riscv64-gnu-toolchain +RISCV_TOOLCHAIN_PATH ?= $(TOOLDIR)/riscv64-gnu-toolchain VX_CFLAGS += -march=rv64imafd -mabi=lp64d STARTUP_ADDR ?= 0x180000000 else -RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain +RISCV_TOOLCHAIN_PATH ?= $(TOOLDIR)/riscv-gnu-toolchain VX_CFLAGS += -march=rv32imaf -mabi=ilp32f STARTUP_ADDR ?= 0x80000000 endif @@ -23,7 +25,7 @@ VORTEX_KN_PATH ?= $(realpath ../../../kernel) FPGA_BIN_DIR ?= $(VORTEX_RT_PATH)/opae -LLVM_VORTEX ?= /opt/llvm-vortex +LLVM_VORTEX ?= $(TOOLDIR)/llvm-vortex LLVM_CFLAGS += --sysroot=$(RISCV_SYSROOT) LLVM_CFLAGS += --gcc-toolchain=$(RISCV_TOOLCHAIN_PATH) diff --git a/tests/unittest/common.mk b/tests/unittest/common.mk new file mode 100644 index 00000000..11add105 --- /dev/null +++ b/tests/unittest/common.mk @@ -0,0 +1,30 @@ +VORTEX_RT_PATH ?= $(realpath ../../../runtime) + +CXXFLAGS += -std=c++11 -Wall -Wextra -pedantic -Wfatal-errors + +CXXFLAGS += -I$(VORTEX_RT_PATH)/common + +# Debugigng +ifdef DEBUG + CXXFLAGS += -g -O0 +else + CXXFLAGS += -O2 -DNDEBUG +endif + +all: $(PROJECT) + +$(PROJECT): $(SRCS) + $(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@ + +run: + ./$(PROJECT) + +clean: + rm -rf $(PROJECT) *.o .depend + +clean-all: clean + rm -rf *.elf *.bin *.dump + +ifneq ($(MAKECMDGOALS),clean) + -include .depend +endif diff --git a/tests/unittest/vx_malloc/Makefile b/tests/unittest/vx_malloc/Makefile index 2d604620..2036fcde 100644 --- a/tests/unittest/vx_malloc/Makefile +++ b/tests/unittest/vx_malloc/Makefile @@ -1,34 +1,5 @@ -VORTEX_RT_PATH ?= $(realpath ../../../runtime) - -CXXFLAGS += -std=c++11 -Wall -Wextra -pedantic -Wfatal-errors - -CXXFLAGS += -I$(VORTEX_RT_PATH)/common - -# Debugigng -ifdef DEBUG - CXXFLAGS += -g -O0 -else - CXXFLAGS += -O2 -DNDEBUG -endif - PROJECT = vx_malloc SRCS = main.cpp -all: $(PROJECT) - -$(PROJECT): $(SRCS) - $(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@ - -run: - ./$(PROJECT) - -clean: - rm -rf $(PROJECT) *.o .depend - -clean-all: clean - rm -rf *.elf *.bin *.dump - -ifneq ($(MAKECMDGOALS),clean) - -include .depend -endif \ No newline at end of file +include ../common.mk From a08d3ebd4250a7d52ee7741712dfb612c163be2f Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Sun, 12 Nov 2023 23:40:59 -0800 Subject: [PATCH 02/11] minor update --- hw/rtl/afu/xrt/VX_afu_wrap.sv | 2 +- hw/rtl/{fpu => core}/VX_fpu_unit.sv | 0 hw/rtl/core/VX_muldiv_unit.sv | 5 + hw/rtl/core/VX_scoreboard.sv | 3 + hw/rtl/fpu/VX_fpu_cvt.sv | 206 ++++++++++------------------ hw/rtl/fpu/VX_fpu_rounding.sv | 1 - 6 files changed, 81 insertions(+), 136 deletions(-) rename hw/rtl/{fpu => core}/VX_fpu_unit.sv (100%) diff --git a/hw/rtl/afu/xrt/VX_afu_wrap.sv b/hw/rtl/afu/xrt/VX_afu_wrap.sv index 3c4b3947..2abbbe43 100644 --- a/hw/rtl/afu/xrt/VX_afu_wrap.sv +++ b/hw/rtl/afu/xrt/VX_afu_wrap.sv @@ -262,7 +262,7 @@ module VX_afu_wrap #( .m_axi_awready (m_axi_mem_awready_a), .m_axi_awaddr (m_axi_mem_awaddr_w), .m_axi_awid (m_axi_mem_awid_a), - `UNUSED_PIN (m_axi_awlen), + .m_axi_awlen (m_axi_mem_awlen_a), `UNUSED_PIN (m_axi_awsize), `UNUSED_PIN (m_axi_awburst), `UNUSED_PIN (m_axi_awlock), diff --git a/hw/rtl/fpu/VX_fpu_unit.sv b/hw/rtl/core/VX_fpu_unit.sv similarity index 100% rename from hw/rtl/fpu/VX_fpu_unit.sv rename to hw/rtl/core/VX_fpu_unit.sv diff --git a/hw/rtl/core/VX_muldiv_unit.sv b/hw/rtl/core/VX_muldiv_unit.sv index f3c730d4..141cdb55 100644 --- a/hw/rtl/core/VX_muldiv_unit.sv +++ b/hw/rtl/core/VX_muldiv_unit.sv @@ -220,8 +220,13 @@ module VX_muldiv_unit #( wire [NUM_LANES-1:0][`XLEN-1:0] div_in2; for (genvar i = 0; i < NUM_LANES; ++i) begin + `ifdef XLEN_64 assign div_in1[i] = is_alu_w ? {{(`XLEN-32){is_signed_op && execute_if.data.rs1_data[i][31]}}, execute_if.data.rs1_data[i][31:0]}: execute_if.data.rs1_data[i]; assign div_in2[i] = is_alu_w ? {{(`XLEN-32){is_signed_op && execute_if.data.rs2_data[i][31]}}, execute_if.data.rs2_data[i][31:0]}: execute_if.data.rs2_data[i]; + `else + assign div_in1[i] = execute_if.data.rs1_data[i]; + assign div_in2[i] = execute_if.data.rs2_data[i]; + `endif end `ifdef IDIV_DPI diff --git a/hw/rtl/core/VX_scoreboard.sv b/hw/rtl/core/VX_scoreboard.sv index ee5ae2ec..90a58134 100644 --- a/hw/rtl/core/VX_scoreboard.sv +++ b/hw/rtl/core/VX_scoreboard.sv @@ -107,6 +107,7 @@ module VX_scoreboard import VX_gpu_pkg::*; #( .ready_out (scoreboard_if[i].ready) ); + `ifdef SIMULATION reg [31:0] timeout_ctr; always @(posedge clk) begin @@ -134,6 +135,8 @@ module VX_scoreboard import VX_gpu_pkg::*; #( `RUNTIME_ASSERT(~writeback_fire || inuse_regs[writeback_if[i].data.wis][writeback_if[i].data.rd] != 0, ("%t: *** core%0d: invalid writeback register: wid=%0d, PC=0x%0h, tmask=%b, rd=%0d (#%0d)", $time, CORE_ID, wis_to_wid(writeback_if[i].data.wis, i), writeback_if[i].data.PC, writeback_if[i].data.tmask, writeback_if[i].data.rd, writeback_if[i].data.uuid)); + `endif + end endmodule diff --git a/hw/rtl/fpu/VX_fpu_cvt.sv b/hw/rtl/fpu/VX_fpu_cvt.sv index 34b2ed28..d668539b 100644 --- a/hw/rtl/fpu/VX_fpu_cvt.sv +++ b/hw/rtl/fpu/VX_fpu_cvt.sv @@ -52,30 +52,27 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #( localparam MAN_BITS = 23; localparam EXP_BITS = 8; - localparam EXP_BIAS = 2**(EXP_BITS-1)-1; - - localparam logic [EXP_BITS-1:0] QNAN_EXPONENT = 2**EXP_BITS-1; - localparam logic [MAN_BITS-1:0] QNAN_MANTISSA = 2**(MAN_BITS-1); + localparam EXP_BIAS = 2**(EXP_BITS-1)-1; // Use 32-bit integer - localparam MAX_INT_WIDTH = 32; + localparam INT_WIDTH = 32; // The internal mantissa includes normal bit or an entire integer - localparam INT_MAN_WIDTH = `MAX(MAN_BITS + 1, MAX_INT_WIDTH); + localparam INT_MAN_WIDTH = `MAX(MAN_BITS + 1, INT_WIDTH); // The lower 2p+3 bits of the internal FMA result will be needed for leading-zero detection localparam LZC_RESULT_WIDTH = `CLOG2(INT_MAN_WIDTH); // The internal exponent must be able to represent the smallest denormal input value as signed // or the number of bits in an integer - localparam INT_EXP_WIDTH = `MAX(`CLOG2(MAX_INT_WIDTH), `MAX(EXP_BITS, `CLOG2(EXP_BIAS + MAN_BITS))) + 1; + localparam INT_EXP_WIDTH = `MAX(`CLOG2(INT_WIDTH), `MAX(EXP_BITS, `CLOG2(EXP_BIAS + MAN_BITS))) + 1; // shift amount for denormalization localparam SHAMT_BITS = `CLOG2(INT_MAN_WIDTH+1); localparam FMT_SHIFT_COMPENSATION = INT_MAN_WIDTH - 1 - MAN_BITS; localparam NUM_FP_STICKY = 2 * INT_MAN_WIDTH - MAN_BITS - 1; // removed mantissa, 1. and R - localparam NUM_INT_STICKY = 2 * INT_MAN_WIDTH - MAX_INT_WIDTH; // removed int and R + localparam NUM_INT_STICKY = 2 * INT_MAN_WIDTH - INT_WIDTH; // removed int and R // Input processing @@ -86,8 +83,8 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #( .EXP_BITS (EXP_BITS), .MAN_BITS (MAN_BITS) ) fp_class ( - .exp_i (dataa[i][30:23]), - .man_i (dataa[i][22:0]), + .exp_i (dataa[i][INT_WIDTH-2:MAN_BITS]), + .man_i (dataa[i][MAN_BITS-1:0]), .clss_o (fclass[i]) ); end @@ -97,15 +94,13 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #( wire [NUM_LANES-1:0] input_sign; for (genvar i = 0; i < NUM_LANES; ++i) begin - wire [INT_MAN_WIDTH-1:0] int_mantissa; - wire [INT_MAN_WIDTH-1:0] fmt_mantissa; - wire fmt_sign = dataa[i][31]; - wire int_sign = dataa[i][31] && is_signed; - assign int_mantissa = int_sign ? (-dataa[i]) : dataa[i]; - assign fmt_mantissa = INT_MAN_WIDTH'({fclass[i].is_normal, dataa[i][MAN_BITS-1:0]}); + wire i2f_sign = dataa[i][INT_WIDTH-1]; + wire f2i_sign = dataa[i][INT_WIDTH-1] && is_signed; + wire [INT_MAN_WIDTH-1:0] f2i_mantissa = f2i_sign ? (-dataa[i]) : dataa[i]; + wire [INT_MAN_WIDTH-1:0] i2f_mantissa = INT_MAN_WIDTH'({fclass[i].is_normal, dataa[i][MAN_BITS-1:0]}); assign input_exp[i] = {1'b0, dataa[i][MAN_BITS +: EXP_BITS]} + INT_EXP_WIDTH'({1'b0, fclass[i].is_subnormal}); - assign input_mant[i] = is_itof ? int_mantissa : fmt_mantissa; - assign input_sign[i] = is_itof ? int_sign : fmt_sign; + assign input_mant[i] = is_itof ? f2i_mantissa : i2f_mantissa; + assign input_sign[i] = is_itof ? f2i_sign : i2f_sign; end // Pipeline stage0 @@ -159,9 +154,9 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #( assign input_mant_n_s0[i] = encoded_mant_s0[i] << renorm_shamt_s0[i]; // Unbias exponent and compensate for shift - wire [INT_EXP_WIDTH-1:0] fp_input_exp_s0 = fmt_exponent_s0[i] + INT_EXP_WIDTH'(FMT_SHIFT_COMPENSATION - EXP_BIAS) - INT_EXP_WIDTH'({1'b0, renorm_shamt_s0[i]}); - wire [INT_EXP_WIDTH-1:0] int_input_exp_s0 = INT_EXP_WIDTH'(INT_MAN_WIDTH-1) - INT_EXP_WIDTH'({1'b0, renorm_shamt_s0[i]}); - assign input_exp_n_s0[i] = is_itof_s0 ? int_input_exp_s0 : fp_input_exp_s0; + wire [INT_EXP_WIDTH-1:0] i2f_input_exp_s0 = fmt_exponent_s0[i] + INT_EXP_WIDTH'(FMT_SHIFT_COMPENSATION - EXP_BIAS) - INT_EXP_WIDTH'({1'b0, renorm_shamt_s0[i]}); + wire [INT_EXP_WIDTH-1:0] f2i_input_exp_s0 = INT_EXP_WIDTH'(INT_MAN_WIDTH-1) - INT_EXP_WIDTH'({1'b0, renorm_shamt_s0[i]}); + assign input_exp_n_s0[i] = is_itof_s0 ? f2i_input_exp_s0 : i2f_input_exp_s0; end // Pipeline stage1 @@ -193,51 +188,32 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #( wire [NUM_LANES-1:0][2*INT_MAN_WIDTH:0] destination_mant_s1; wire [NUM_LANES-1:0][INT_EXP_WIDTH-1:0] final_exp_s1; - wire [NUM_LANES-1:0] of_before_round_s1; + wire [NUM_LANES-1:0] of_before_round_s1; - for (genvar i = 0; i < NUM_LANES; ++i) begin - reg [2*INT_MAN_WIDTH:0] preshift_mant_s1; // mantissa before final shift - reg [SHAMT_BITS-1:0] denorm_shamt_s1; // shift amount for denormalization - reg [INT_EXP_WIDTH-1:0] final_exp_tmp_s1; // after eventual adjustments - reg of_before_round_tmp_s1; + for (genvar i = 0; i < NUM_LANES; ++i) begin + reg [SHAMT_BITS-1:0] denorm_shamt_s1; // shift amount for denormalization + reg of_before_round_tmp_s1; always @(*) begin - final_exp_tmp_s1 = input_exp_s1[i] + INT_EXP_WIDTH'(EXP_BIAS); // take exponent as is, only look at lower bits - preshift_mant_s1 = {input_mant_s1[i], 33'b0}; denorm_shamt_s1 = '0; of_before_round_tmp_s1 = 1'b0; - if (is_itof_s1) begin - if ($signed(input_exp_s1[i]) >= INT_EXP_WIDTH'($signed(2**EXP_BITS-1-EXP_BIAS))) begin - // Overflow or infinities (for proper rounding) - final_exp_tmp_s1 = (2**EXP_BITS-2); // largest normal value - preshift_mant_s1 = ~0; // largest normal value and RS bits set - of_before_round_tmp_s1 = 1'b1; - end else if ($signed(input_exp_s1[i]) < INT_EXP_WIDTH'($signed(-MAN_BITS-EXP_BIAS))) begin - // Limit the shift to retain sticky bits - final_exp_tmp_s1 = '0; // denormal result - denorm_shamt_s1 = (2 + MAN_BITS); // to sticky - end else if ($signed(input_exp_s1[i]) < INT_EXP_WIDTH'($signed(1-EXP_BIAS))) begin - // Denormalize underflowing values - final_exp_tmp_s1 = '0; // denormal result - denorm_shamt_s1 = SHAMT_BITS'(1-EXP_BIAS) - SHAMT_BITS'(input_exp_s1[i]); // adjust right shifting - end - end else begin - if ($signed(input_exp_s1[i]) >= $signed(INT_EXP_WIDTH'(MAX_INT_WIDTH-1) + INT_EXP_WIDTH'(unsigned_s1))) begin - // overflow: when converting to unsigned the range is larger by one + if (!is_itof_s1) begin + if ($signed(input_exp_s1[i]) >= $signed(INT_EXP_WIDTH'(INT_WIDTH-1) + INT_EXP_WIDTH'(unsigned_s1))) begin + // overflow of_before_round_tmp_s1 = 1'b1; end else if ($signed(input_exp_s1[i]) < INT_EXP_WIDTH'($signed(-1))) begin // underflow - denorm_shamt_s1 = MAX_INT_WIDTH+1; // all bits go to the sticky + denorm_shamt_s1 = INT_WIDTH+1; // all bits go to the sticky end else begin // By default right shift mantissa to be an integer - denorm_shamt_s1 = SHAMT_BITS'(MAX_INT_WIDTH-1) - SHAMT_BITS'(input_exp_s1[i]); + denorm_shamt_s1 = SHAMT_BITS'(INT_WIDTH-1) - SHAMT_BITS'(input_exp_s1[i]); end end end - assign destination_mant_s1[i] = preshift_mant_s1 >> denorm_shamt_s1; - assign final_exp_s1[i] = final_exp_tmp_s1; + assign destination_mant_s1[i] = {input_mant_s1[i], 33'b0} >> denorm_shamt_s1; + assign final_exp_s1[i] = input_exp_s1[i] + INT_EXP_WIDTH'(EXP_BIAS); assign of_before_round_s1[i] = of_before_round_tmp_s1; end @@ -267,33 +243,33 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #( .data_out ({valid_in_s2, lane_mask_s2, tag_in_s2, is_itof_s2, unsigned_s2, rnd_mode_s2, fclass_s2, mant_is_zero_s2, input_sign_s2, destination_mant_s2, final_exp_s2, of_before_round_s2}) ); - wire [NUM_LANES-1:0] rounded_sign_s2; - wire [NUM_LANES-1:0][31:0] rounded_abs_s2; // absolute value of result after rounding - wire [NUM_LANES-1:0] int_round_has_sticky_s2; - wire [NUM_LANES-1:0] fp_round_has_sticky_s2; + wire [NUM_LANES-1:0] rounded_sign_s2; + wire [NUM_LANES-1:0][INT_WIDTH-1:0] rounded_abs_s2; // absolute value of result after rounding + wire [NUM_LANES-1:0] f2i_round_has_sticky_s2; + wire [NUM_LANES-1:0] i2f_round_has_sticky_s2; // Rouding and classification for (genvar i = 0; i < NUM_LANES; ++i) begin - wire [MAN_BITS-1:0] final_mant_s2; // mantissa after adjustments - wire [MAX_INT_WIDTH-1:0] final_int_s2; // integer shifted in position - wire [1:0] round_sticky_bits_s2; - wire [31:0] fmt_pre_round_abs_s2; - wire [31:0] pre_round_abs_s2; - wire [1:0] int_round_sticky_bits_s2, fp_round_sticky_bits_s2; + wire [MAN_BITS-1:0] final_mant_s2; // mantissa after adjustments + wire [INT_WIDTH-1:0] final_int_s2; // integer shifted in position + wire [1:0] round_sticky_bits_s2; + wire [INT_WIDTH-1:0] fmt_pre_round_abs_s2; + wire [INT_WIDTH-1:0] pre_round_abs_s2; + wire [1:0] f2i_round_sticky_bits_s2, i2f_round_sticky_bits_s2; // Extract final mantissa and round bit, discard the normal bit (for FP) - assign {final_mant_s2, fp_round_sticky_bits_s2[1]} = destination_mant_s2[i][2*INT_MAN_WIDTH-1 : 2*INT_MAN_WIDTH-1 - (MAN_BITS+1) + 1]; - assign {final_int_s2, int_round_sticky_bits_s2[1]} = destination_mant_s2[i][2*INT_MAN_WIDTH : 2*INT_MAN_WIDTH - (MAX_INT_WIDTH+1) + 1]; + assign {final_mant_s2, i2f_round_sticky_bits_s2[1]} = destination_mant_s2[i][2*INT_MAN_WIDTH-1 : 2*INT_MAN_WIDTH-1 - (MAN_BITS+1) + 1]; + assign {final_int_s2, f2i_round_sticky_bits_s2[1]} = destination_mant_s2[i][2*INT_MAN_WIDTH : 2*INT_MAN_WIDTH - (INT_WIDTH+1) + 1]; // Collapse sticky bits - assign fp_round_sticky_bits_s2[0] = (| destination_mant_s2[i][NUM_FP_STICKY-1:0]); - assign int_round_sticky_bits_s2[0] = (| destination_mant_s2[i][NUM_INT_STICKY-1:0]); - assign fp_round_has_sticky_s2[i] = (| fp_round_sticky_bits_s2); - assign int_round_has_sticky_s2[i] = (| int_round_sticky_bits_s2); + assign i2f_round_sticky_bits_s2[0] = (| destination_mant_s2[i][NUM_FP_STICKY-1:0]); + assign f2i_round_sticky_bits_s2[0] = (| destination_mant_s2[i][NUM_INT_STICKY-1:0]); + assign i2f_round_has_sticky_s2[i] = (| i2f_round_sticky_bits_s2); + assign f2i_round_has_sticky_s2[i] = (| f2i_round_sticky_bits_s2); // select RS bits for destination operation - assign round_sticky_bits_s2 = is_itof_s2 ? fp_round_sticky_bits_s2 : int_round_sticky_bits_s2; + assign round_sticky_bits_s2 = is_itof_s2 ? i2f_round_sticky_bits_s2 : f2i_round_sticky_bits_s2; // Pack exponent and mantissa into proper rounding form assign fmt_pre_round_abs_s2 = {1'b0, final_exp_s2[i][EXP_BITS-1:0], final_mant_s2[MAN_BITS-1:0]}; @@ -327,10 +303,10 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #( wire [NUM_LANES-1:0] mant_is_zero_s3; wire [NUM_LANES-1:0] input_sign_s3; wire [NUM_LANES-1:0] rounded_sign_s3; - wire [NUM_LANES-1:0][31:0] rounded_abs_s3; + wire [NUM_LANES-1:0][INT_WIDTH-1:0] rounded_abs_s3; wire [NUM_LANES-1:0] of_before_round_s3; - wire [NUM_LANES-1:0] int_round_has_sticky_s3; - wire [NUM_LANES-1:0] fp_round_has_sticky_s3; + wire [NUM_LANES-1:0] f2i_round_has_sticky_s3; + wire [NUM_LANES-1:0] i2f_round_has_sticky_s3; VX_pipe_register #( .DATAW (1 + NUM_LANES + TAGW + 1 + 1 + NUM_LANES * ($bits(fclass_t) + 1 + 1 + 32 + 1 + 1 + 1 + 1)), @@ -339,105 +315,68 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #( .clk (clk), .reset (reset), .enable (~stall), - .data_in ({valid_in_s2, lane_mask_s2, tag_in_s2, is_itof_s2, unsigned_s2, fclass_s2, mant_is_zero_s2, input_sign_s2, rounded_abs_s2, rounded_sign_s2, of_before_round_s2, int_round_has_sticky_s2, fp_round_has_sticky_s2}), - .data_out ({valid_in_s3, lane_mask_s3, tag_in_s3, is_itof_s3, unsigned_s3, fclass_s3, mant_is_zero_s3, input_sign_s3, rounded_abs_s3, rounded_sign_s3, of_before_round_s3, int_round_has_sticky_s3, fp_round_has_sticky_s3}) + .data_in ({valid_in_s2, lane_mask_s2, tag_in_s2, is_itof_s2, unsigned_s2, fclass_s2, mant_is_zero_s2, input_sign_s2, rounded_abs_s2, rounded_sign_s2, of_before_round_s2, f2i_round_has_sticky_s2, i2f_round_has_sticky_s2}), + .data_out ({valid_in_s3, lane_mask_s3, tag_in_s3, is_itof_s3, unsigned_s3, fclass_s3, mant_is_zero_s3, input_sign_s3, rounded_abs_s3, rounded_sign_s3, of_before_round_s3, f2i_round_has_sticky_s3, i2f_round_has_sticky_s3}) ); - wire [NUM_LANES-1:0] of_after_round_s3; - wire [NUM_LANES-1:0] uf_after_round_s3; - wire [NUM_LANES-1:0][31:0] fmt_result_s3; - wire [NUM_LANES-1:0][31:0] rounded_int_res_s3; // after possible inversion + wire [NUM_LANES-1:0][INT_WIDTH-1:0] fmt_result_s3; + wire [NUM_LANES-1:0][INT_WIDTH-1:0] rounded_int_res_s3; // after possible inversion wire [NUM_LANES-1:0] rounded_int_res_zero_s3; // after rounding for (genvar i = 0; i < NUM_LANES; ++i) begin // Assemble regular result, nan box short ones. Int zeroes need to be detected - assign fmt_result_s3[i] = (is_itof_s3 & mant_is_zero_s3[i]) ? 0 : {rounded_sign_s3[i], rounded_abs_s3[i][EXP_BITS+MAN_BITS-1:0]}; - - // Classification after rounding select by destination format - assign uf_after_round_s3[i] = (rounded_abs_s3[i][EXP_BITS+MAN_BITS-1:MAN_BITS] == 0); // denormal - assign of_after_round_s3[i] = (rounded_abs_s3[i][EXP_BITS+MAN_BITS-1:MAN_BITS] == ~0); // inf exp. + assign fmt_result_s3[i] = mant_is_zero_s3[i] ? 0 : {rounded_sign_s3[i], rounded_abs_s3[i][EXP_BITS+MAN_BITS-1:0]}; // Negative integer result needs to be brought into two's complement assign rounded_int_res_s3[i] = rounded_sign_s3[i] ? (-rounded_abs_s3[i]) : rounded_abs_s3[i]; assign rounded_int_res_zero_s3[i] = (rounded_int_res_s3[i] == 0); end - // FP Special case handling + // F2I Special case handling - wire [NUM_LANES-1:0][31:0] fp_special_result_s3; - fflags_t [NUM_LANES-1:0] fp_special_status_s3; - wire [NUM_LANES-1:0] fp_result_is_special_s3; - - for (genvar i = 0; i < NUM_LANES; ++i) begin - // Detect special case from source format, I2F casts don't produce a special result - assign fp_result_is_special_s3[i] = ~is_itof_s3 & (fclass_s3[i].is_zero | fclass_s3[i].is_nan); - - // Signalling input NaNs raise invalid flag, otherwise no flags set - assign fp_special_status_s3[i] = fclass_s3[i].is_signaling ? {1'b1, 4'h0} : 5'h0; // invalid operation - - // Assemble result according to destination format - assign fp_special_result_s3[i] = fclass_s3[i].is_zero ? (32'(input_sign_s3) << 31) // signed zero - : {1'b0, QNAN_EXPONENT, QNAN_MANTISSA}; // qNaN - end - - // INT Special case handling - - reg [NUM_LANES-1:0][31:0] int_special_result_s3; - fflags_t [NUM_LANES-1:0] int_special_status_s3; - wire [NUM_LANES-1:0] int_result_is_special_s3; + reg [NUM_LANES-1:0][INT_WIDTH-1:0] f2i_special_result_s3; + fflags_t [NUM_LANES-1:0] f2i_special_status_s3; + wire [NUM_LANES-1:0] f2i_result_is_special_s3; for (genvar i = 0; i < NUM_LANES; ++i) begin // Assemble result according to destination format always @(*) begin if (input_sign_s3[i] && !fclass_s3[i].is_nan) begin - int_special_result_s3[i][30:0] = '0; // alone yields 2**(31)-1 - int_special_result_s3[i][31] = ~unsigned_s3; // for unsigned casts yields 2**31 + f2i_special_result_s3[i][INT_WIDTH-2:0] = '0; // alone yields 2**(31)-1 + f2i_special_result_s3[i][INT_WIDTH-1] = ~unsigned_s3; // for unsigned casts yields 2**31 end else begin - int_special_result_s3[i][30:0] = 2**(31) - 1; // alone yields 2**(31)-1 - int_special_result_s3[i][31] = unsigned_s3; // for unsigned casts yields 2**31 + f2i_special_result_s3[i][INT_WIDTH-2:0] = 2**(INT_WIDTH-1) - 1; // alone yields 2**(31)-1 + f2i_special_result_s3[i][INT_WIDTH-1] = unsigned_s3; // for unsigned casts yields 2**31 end end // Detect special case from source format (inf, nan, overflow, nan-boxing or negative unsigned) - assign int_result_is_special_s3[i] = fclass_s3[i].is_nan + assign f2i_result_is_special_s3[i] = fclass_s3[i].is_nan | fclass_s3[i].is_inf | of_before_round_s3[i] | (input_sign_s3[i] & unsigned_s3 & ~rounded_int_res_zero_s3[i]); // All integer special cases are invalid - assign int_special_status_s3[i] = {1'b1, 4'h0}; + assign f2i_special_status_s3[i] = {1'b1, 4'h0}; end // Result selection and Output handshake fflags_t [NUM_LANES-1:0] tmp_fflags_s3; - wire [NUM_LANES-1:0][31:0] tmp_result_s3; + wire [NUM_LANES-1:0][INT_WIDTH-1:0] tmp_result_s3; - for (genvar i = 0; i < NUM_LANES; ++i) begin - fflags_t fp_regular_status_s3, int_regular_status_s3; - fflags_t fp_status_s3, int_status_s3; - wire [31:0] fp_result_s3, int_result_s3; + for (genvar i = 0; i < NUM_LANES; ++i) begin + fflags_t i2f_regular_status_s3 = i2f_round_has_sticky_s3[i] ? 5'h1 : 5'h0; + fflags_t f2i_regular_status_s3 = f2i_round_has_sticky_s3[i] ? 5'h1 : 5'h0; - wire inexact_s3 = is_itof_s3 ? fp_round_has_sticky_s3[i] // overflow is invalid in i2f; - : (fp_round_has_sticky_s3[i] || (~fclass_s3[i].is_inf && (of_before_round_s3[i] || of_after_round_s3[i]))); - - assign fp_regular_status_s3.NV = is_itof_s3 & (of_before_round_s3[i] | of_after_round_s3[i]); // overflow is invalid for I2F casts - assign fp_regular_status_s3.DZ = 1'b0; // no divisions - assign fp_regular_status_s3.OF = ~is_itof_s3 & (~fclass_s3[i].is_inf & (of_before_round_s3[i] | of_after_round_s3[i])); // inf casts no OF - assign fp_regular_status_s3.UF = uf_after_round_s3[i] & inexact_s3; - assign fp_regular_status_s3.NX = inexact_s3; + fflags_t i2f_status_s3 = i2f_regular_status_s3; + fflags_t f2i_status_s3 = f2i_result_is_special_s3[i] ? f2i_special_status_s3[i] : f2i_regular_status_s3; - assign int_regular_status_s3 = int_round_has_sticky_s3[i] ? {4'h0, 1'b1} : 5'h0; + wire [INT_WIDTH-1:0] i2f_result_s3 = fmt_result_s3[i]; + wire [INT_WIDTH-1:0] f2i_result_s3 = f2i_result_is_special_s3[i] ? f2i_special_result_s3[i] : rounded_int_res_s3[i]; - assign fp_result_s3 = fp_result_is_special_s3[i] ? fp_special_result_s3[i] : fmt_result_s3[i]; - assign int_result_s3 = int_result_is_special_s3[i] ? int_special_result_s3[i] : rounded_int_res_s3[i]; - - assign fp_status_s3 = fp_result_is_special_s3[i] ? fp_special_status_s3[i] : fp_regular_status_s3; - assign int_status_s3 = int_result_is_special_s3[i] ? int_special_status_s3[i] : int_regular_status_s3; - - // Select output depending on special case detection - assign tmp_result_s3[i] = is_itof_s3 ? fp_result_s3 : int_result_s3; - assign tmp_fflags_s3[i] = is_itof_s3 ? fp_status_s3 : int_status_s3; + assign tmp_result_s3[i] = is_itof_s3 ? i2f_result_s3 : f2i_result_s3; + assign tmp_fflags_s3[i] = is_itof_s3 ? i2f_status_s3 : f2i_status_s3; end assign stall = ~ready_out && valid_out; @@ -457,7 +396,6 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #( ); assign ready_in = ~stall; - assign has_fflags = 1'b1; endmodule diff --git a/hw/rtl/fpu/VX_fpu_rounding.sv b/hw/rtl/fpu/VX_fpu_rounding.sv index 5168fada..877b4eb6 100644 --- a/hw/rtl/fpu/VX_fpu_rounding.sv +++ b/hw/rtl/fpu/VX_fpu_rounding.sv @@ -54,7 +54,6 @@ module VX_fpu_rounding #( 2'b01: round_up = 1'b0; // < ulp/2 away, round down 2'b10: round_up = abs_value_i[0]; // = ulp/2 away, round towards even result 2'b11: round_up = 1'b1; // > ulp/2 away, round up - default: round_up = 1'bx; endcase `INST_FRM_RTZ: round_up = 1'b0; // always round down `INST_FRM_RDN: round_up = (| round_sticky_bits_i) & sign_i; // to 0 if +, away if - From b274b8cc217683ec9834b5393bd75406da395f6b Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Mon, 13 Nov 2023 00:23:15 -0800 Subject: [PATCH 03/11] minor updates --- hw/rtl/core/VX_operands.sv | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/hw/rtl/core/VX_operands.sv b/hw/rtl/core/VX_operands.sv index 3d2c570c..3ff5df46 100644 --- a/hw/rtl/core/VX_operands.sv +++ b/hw/rtl/core/VX_operands.sv @@ -38,9 +38,12 @@ module VX_operands import VX_gpu_pkg::*; #( reg [`NR_BITS-1:0] gpr_rd_rid, gpr_rd_rid_n; reg [ISSUE_WIS_W-1:0] gpr_rd_wis, gpr_rd_wis_n; - reg [ISSUE_RATIO-1:0][`NUM_THREADS-1:0][`XLEN-1:0] cache_data, cache_data_n; - reg [ISSUE_RATIO-1:0][`NR_BITS-1:0] cache_reg, cache_reg_n; - reg [ISSUE_RATIO-1:0][`NUM_THREADS-1:0] cache_tmask, cache_tmask_n; + reg [`NUM_THREADS-1:0][`XLEN-1:0] cache_data [ISSUE_RATIO-1:0]; + reg [`NUM_THREADS-1:0][`XLEN-1:0] cache_data_n [ISSUE_RATIO-1:0]; + reg [`NR_BITS-1:0] cache_reg [ISSUE_RATIO-1:0]; + reg [`NR_BITS-1:0] cache_reg_n [ISSUE_RATIO-1:0]; + reg [`NUM_THREADS-1:0] cache_tmask [ISSUE_RATIO-1:0]; + reg [`NUM_THREADS-1:0] cache_tmask_n [ISSUE_RATIO-1:0]; reg [ISSUE_RATIO-1:0] cache_eop, cache_eop_n; reg [`NUM_THREADS-1:0][`XLEN-1:0] rs1_data, rs1_data_n; @@ -160,11 +163,8 @@ module VX_operands import VX_gpu_pkg::*; #( end cache_reg_n[writeback_if[i].data.wis] = writeback_if[i].data.rd; cache_eop_n[writeback_if[i].data.wis] = writeback_if[i].data.eop; - if (writeback_if[i].data.sop) begin - cache_tmask_n[writeback_if[i].data.wis] = writeback_if[i].data.tmask; - end else begin - cache_tmask_n[writeback_if[i].data.wis] |= writeback_if[i].data.tmask; - end + cache_tmask_n[writeback_if[i].data.wis] = writeback_if[i].data.sop ? writeback_if[i].data.tmask : + (cache_tmask_n[writeback_if[i].data.wis] | writeback_if[i].data.tmask); end end end @@ -175,7 +175,6 @@ module VX_operands import VX_gpu_pkg::*; #( gpr_rd_rid <= '0; gpr_rd_wis <= '0; cache_eop <= {ISSUE_RATIO{1'b1}}; - cache_reg <= '0; data_ready <= 0; end else begin state <= state_n; From ecf546bc4ab08e1f6f982275bc8fd15c52ce7b64 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Mon, 13 Nov 2023 20:00:39 -0800 Subject: [PATCH 04/11] minor update --- hw/rtl/fpu/VX_fpu_cvt.sv | 131 ++++++++++++++++++--------------------- 1 file changed, 60 insertions(+), 71 deletions(-) diff --git a/hw/rtl/fpu/VX_fpu_cvt.sv b/hw/rtl/fpu/VX_fpu_cvt.sv index d668539b..e12e51ad 100644 --- a/hw/rtl/fpu/VX_fpu_cvt.sv +++ b/hw/rtl/fpu/VX_fpu_cvt.sv @@ -67,9 +67,6 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #( // or the number of bits in an integer localparam INT_EXP_WIDTH = `MAX(`CLOG2(INT_WIDTH), `MAX(EXP_BITS, `CLOG2(EXP_BIAS + MAN_BITS))) + 1; - // shift amount for denormalization - localparam SHAMT_BITS = `CLOG2(INT_MAN_WIDTH+1); - localparam FMT_SHIFT_COMPENSATION = INT_MAN_WIDTH - 1 - MAN_BITS; localparam NUM_FP_STICKY = 2 * INT_MAN_WIDTH - MAN_BITS - 1; // removed mantissa, 1. and R localparam NUM_INT_STICKY = 2 * INT_MAN_WIDTH - INT_WIDTH; // removed int and R @@ -105,14 +102,14 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #( // Pipeline stage0 - wire valid_in_s0; - wire [NUM_LANES-1:0] lane_mask_s0; - wire [TAGW-1:0] tag_in_s0; - wire is_itof_s0; - wire unsigned_s0; - wire [2:0] rnd_mode_s0; + wire valid_in_s0; + wire [NUM_LANES-1:0] lane_mask_s0; + wire [TAGW-1:0] tag_in_s0; + wire is_itof_s0; + wire is_signed_s0; + wire [2:0] rnd_mode_s0; fclass_t [NUM_LANES-1:0] fclass_s0; - wire [NUM_LANES-1:0] input_sign_s0; + wire [NUM_LANES-1:0] input_sign_s0; wire [NUM_LANES-1:0][INT_EXP_WIDTH-1:0] fmt_exponent_s0; wire [NUM_LANES-1:0][INT_MAN_WIDTH-1:0] encoded_mant_s0; @@ -125,8 +122,8 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #( .clk (clk), .reset (reset), .enable (~stall), - .data_in ({valid_in, lane_mask, tag_in, is_itof, !is_signed, frm, fclass, input_sign, input_exp, input_mant}), - .data_out ({valid_in_s0, lane_mask_s0, tag_in_s0, is_itof_s0, unsigned_s0, rnd_mode_s0, fclass_s0, input_sign_s0, fmt_exponent_s0, encoded_mant_s0}) + .data_in ({valid_in, lane_mask, tag_in, is_itof, is_signed, frm, fclass, input_sign, input_exp, input_mant}), + .data_out ({valid_in_s0, lane_mask_s0, tag_in_s0, is_itof_s0, is_signed_s0, rnd_mode_s0, fclass_s0, input_sign_s0, fmt_exponent_s0, encoded_mant_s0}) ); // Normalization @@ -161,15 +158,15 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #( // Pipeline stage1 - wire valid_in_s1; - wire [NUM_LANES-1:0] lane_mask_s1; - wire [TAGW-1:0] tag_in_s1; - wire is_itof_s1; - wire unsigned_s1; - wire [2:0] rnd_mode_s1; + wire valid_in_s1; + wire [NUM_LANES-1:0] lane_mask_s1; + wire [TAGW-1:0] tag_in_s1; + wire is_itof_s1; + wire is_signed_s1; + wire [2:0] rnd_mode_s1; fclass_t [NUM_LANES-1:0] fclass_s1; - wire [NUM_LANES-1:0] input_sign_s1; - wire [NUM_LANES-1:0] mant_is_zero_s1; + wire [NUM_LANES-1:0] input_sign_s1; + wire [NUM_LANES-1:0] mant_is_zero_s1; wire [NUM_LANES-1:0][INT_MAN_WIDTH-1:0] input_mant_s1; wire [NUM_LANES-1:0][INT_EXP_WIDTH-1:0] input_exp_s1; @@ -180,8 +177,8 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #( .clk (clk), .reset (reset), .enable (~stall), - .data_in ({valid_in_s0, lane_mask_s0, tag_in_s0, is_itof_s0, unsigned_s0, rnd_mode_s0, fclass_s0, input_sign_s0, mant_is_zero_s0, input_mant_n_s0, input_exp_n_s0}), - .data_out ({valid_in_s1, lane_mask_s1, tag_in_s1, is_itof_s1, unsigned_s1, rnd_mode_s1, fclass_s1, input_sign_s1, mant_is_zero_s1, input_mant_s1, input_exp_s1}) + .data_in ({valid_in_s0, lane_mask_s0, tag_in_s0, is_itof_s0, is_signed_s0, rnd_mode_s0, fclass_s0, input_sign_s0, mant_is_zero_s0, input_mant_n_s0, input_exp_n_s0}), + .data_out ({valid_in_s1, lane_mask_s1, tag_in_s1, is_itof_s1, is_signed_s1, rnd_mode_s1, fclass_s1, input_sign_s1, mant_is_zero_s1, input_mant_s1, input_exp_s1}) ); // Perform adjustments to mantissa and exponent @@ -190,47 +187,39 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #( wire [NUM_LANES-1:0][INT_EXP_WIDTH-1:0] final_exp_s1; wire [NUM_LANES-1:0] of_before_round_s1; - for (genvar i = 0; i < NUM_LANES; ++i) begin - reg [SHAMT_BITS-1:0] denorm_shamt_s1; // shift amount for denormalization - reg of_before_round_tmp_s1; - + for (genvar i = 0; i < NUM_LANES; ++i) begin + wire [INT_EXP_WIDTH-1:0] denorm_shamt = INT_EXP_WIDTH'(INT_WIDTH-1) - input_exp_s1[i]; + wire overflow = ($signed(denorm_shamt) <= -$signed(INT_EXP_WIDTH'(!is_signed_s1))); + wire underflow = ($signed(input_exp_s1[i]) < INT_EXP_WIDTH'($signed(-1))); + reg [INT_EXP_WIDTH-1:0] denorm_shamt_q; always @(*) begin - denorm_shamt_s1 = '0; - of_before_round_tmp_s1 = 1'b0; - - if (!is_itof_s1) begin - if ($signed(input_exp_s1[i]) >= $signed(INT_EXP_WIDTH'(INT_WIDTH-1) + INT_EXP_WIDTH'(unsigned_s1))) begin - // overflow - of_before_round_tmp_s1 = 1'b1; - end else if ($signed(input_exp_s1[i]) < INT_EXP_WIDTH'($signed(-1))) begin - // underflow - denorm_shamt_s1 = INT_WIDTH+1; // all bits go to the sticky - end else begin - // By default right shift mantissa to be an integer - denorm_shamt_s1 = SHAMT_BITS'(INT_WIDTH-1) - SHAMT_BITS'(input_exp_s1[i]); - end + if (overflow) begin + denorm_shamt_q = '0; + end else if (underflow) begin + denorm_shamt_q = INT_WIDTH+1; + end else begin + denorm_shamt_q = denorm_shamt; end end - - assign destination_mant_s1[i] = {input_mant_s1[i], 33'b0} >> denorm_shamt_s1; + assign destination_mant_s1[i] = is_itof_s1 ? {input_mant_s1[i], 33'b0} : ({input_mant_s1[i], 33'b0} >> denorm_shamt_q); assign final_exp_s1[i] = input_exp_s1[i] + INT_EXP_WIDTH'(EXP_BIAS); - assign of_before_round_s1[i] = of_before_round_tmp_s1; + assign of_before_round_s1[i] = overflow; end // Pipeline stage2 - wire valid_in_s2; - wire [NUM_LANES-1:0] lane_mask_s2; - wire [TAGW-1:0] tag_in_s2; - wire is_itof_s2; - wire unsigned_s2; - wire [2:0] rnd_mode_s2; + wire valid_in_s2; + wire [NUM_LANES-1:0] lane_mask_s2; + wire [TAGW-1:0] tag_in_s2; + wire is_itof_s2; + wire is_signed_s2; + wire [2:0] rnd_mode_s2; fclass_t [NUM_LANES-1:0] fclass_s2; - wire [NUM_LANES-1:0] mant_is_zero_s2; - wire [NUM_LANES-1:0] input_sign_s2; + wire [NUM_LANES-1:0] mant_is_zero_s2; + wire [NUM_LANES-1:0] input_sign_s2; wire [NUM_LANES-1:0][2*INT_MAN_WIDTH:0] destination_mant_s2; wire [NUM_LANES-1:0][INT_EXP_WIDTH-1:0] final_exp_s2; - wire [NUM_LANES-1:0] of_before_round_s2; + wire [NUM_LANES-1:0] of_before_round_s2; VX_pipe_register #( .DATAW (1 + NUM_LANES + TAGW + 1 + 1 + `INST_FRM_BITS + NUM_LANES * ($bits(fclass_t) + 1 + 1 + (2*INT_MAN_WIDTH+1) + INT_EXP_WIDTH + 1)), @@ -239,24 +228,24 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #( .clk (clk), .reset (reset), .enable (~stall), - .data_in ({valid_in_s1, lane_mask_s1, tag_in_s1, is_itof_s1, unsigned_s1, rnd_mode_s1, fclass_s1, mant_is_zero_s1, input_sign_s1, destination_mant_s1, final_exp_s1, of_before_round_s1}), - .data_out ({valid_in_s2, lane_mask_s2, tag_in_s2, is_itof_s2, unsigned_s2, rnd_mode_s2, fclass_s2, mant_is_zero_s2, input_sign_s2, destination_mant_s2, final_exp_s2, of_before_round_s2}) + .data_in ({valid_in_s1, lane_mask_s1, tag_in_s1, is_itof_s1, is_signed_s1, rnd_mode_s1, fclass_s1, mant_is_zero_s1, input_sign_s1, destination_mant_s1, final_exp_s1, of_before_round_s1}), + .data_out ({valid_in_s2, lane_mask_s2, tag_in_s2, is_itof_s2, is_signed_s2, rnd_mode_s2, fclass_s2, mant_is_zero_s2, input_sign_s2, destination_mant_s2, final_exp_s2, of_before_round_s2}) ); - wire [NUM_LANES-1:0] rounded_sign_s2; + wire [NUM_LANES-1:0] rounded_sign_s2; wire [NUM_LANES-1:0][INT_WIDTH-1:0] rounded_abs_s2; // absolute value of result after rounding - wire [NUM_LANES-1:0] f2i_round_has_sticky_s2; - wire [NUM_LANES-1:0] i2f_round_has_sticky_s2; + wire [NUM_LANES-1:0] f2i_round_has_sticky_s2; + wire [NUM_LANES-1:0] i2f_round_has_sticky_s2; // Rouding and classification for (genvar i = 0; i < NUM_LANES; ++i) begin - wire [MAN_BITS-1:0] final_mant_s2; // mantissa after adjustments - wire [INT_WIDTH-1:0] final_int_s2; // integer shifted in position - wire [1:0] round_sticky_bits_s2; - wire [INT_WIDTH-1:0] fmt_pre_round_abs_s2; - wire [INT_WIDTH-1:0] pre_round_abs_s2; - wire [1:0] f2i_round_sticky_bits_s2, i2f_round_sticky_bits_s2; + wire [MAN_BITS-1:0] final_mant_s2; // mantissa after adjustments + wire [INT_WIDTH-1:0] final_int_s2; // integer shifted in position + wire [1:0] round_sticky_bits_s2; + wire [INT_WIDTH-1:0] fmt_pre_round_abs_s2; + wire [INT_WIDTH-1:0] pre_round_abs_s2; + wire [1:0] f2i_round_sticky_bits_s2, i2f_round_sticky_bits_s2; // Extract final mantissa and round bit, discard the normal bit (for FP) assign {final_mant_s2, i2f_round_sticky_bits_s2[1]} = destination_mant_s2[i][2*INT_MAN_WIDTH-1 : 2*INT_MAN_WIDTH-1 - (MAN_BITS+1) + 1]; @@ -298,7 +287,7 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #( wire [NUM_LANES-1:0] lane_mask_s3; wire [TAGW-1:0] tag_in_s3; wire is_itof_s3; - wire unsigned_s3; + wire is_signed_s3; fclass_t [NUM_LANES-1:0] fclass_s3; wire [NUM_LANES-1:0] mant_is_zero_s3; wire [NUM_LANES-1:0] input_sign_s3; @@ -315,8 +304,8 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #( .clk (clk), .reset (reset), .enable (~stall), - .data_in ({valid_in_s2, lane_mask_s2, tag_in_s2, is_itof_s2, unsigned_s2, fclass_s2, mant_is_zero_s2, input_sign_s2, rounded_abs_s2, rounded_sign_s2, of_before_round_s2, f2i_round_has_sticky_s2, i2f_round_has_sticky_s2}), - .data_out ({valid_in_s3, lane_mask_s3, tag_in_s3, is_itof_s3, unsigned_s3, fclass_s3, mant_is_zero_s3, input_sign_s3, rounded_abs_s3, rounded_sign_s3, of_before_round_s3, f2i_round_has_sticky_s3, i2f_round_has_sticky_s3}) + .data_in ({valid_in_s2, lane_mask_s2, tag_in_s2, is_itof_s2, is_signed_s2, fclass_s2, mant_is_zero_s2, input_sign_s2, rounded_abs_s2, rounded_sign_s2, of_before_round_s2, f2i_round_has_sticky_s2, i2f_round_has_sticky_s2}), + .data_out ({valid_in_s3, lane_mask_s3, tag_in_s3, is_itof_s3, is_signed_s3, fclass_s3, mant_is_zero_s3, input_sign_s3, rounded_abs_s3, rounded_sign_s3, of_before_round_s3, f2i_round_has_sticky_s3, i2f_round_has_sticky_s3}) ); wire [NUM_LANES-1:0][INT_WIDTH-1:0] fmt_result_s3; @@ -335,18 +324,18 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #( // F2I Special case handling reg [NUM_LANES-1:0][INT_WIDTH-1:0] f2i_special_result_s3; - fflags_t [NUM_LANES-1:0] f2i_special_status_s3; - wire [NUM_LANES-1:0] f2i_result_is_special_s3; + fflags_t [NUM_LANES-1:0] f2i_special_status_s3; + wire [NUM_LANES-1:0] f2i_result_is_special_s3; for (genvar i = 0; i < NUM_LANES; ++i) begin // Assemble result according to destination format always @(*) begin if (input_sign_s3[i] && !fclass_s3[i].is_nan) begin f2i_special_result_s3[i][INT_WIDTH-2:0] = '0; // alone yields 2**(31)-1 - f2i_special_result_s3[i][INT_WIDTH-1] = ~unsigned_s3; // for unsigned casts yields 2**31 + f2i_special_result_s3[i][INT_WIDTH-1] = is_signed_s3; // for unsigned casts yields 2**31 end else begin f2i_special_result_s3[i][INT_WIDTH-2:0] = 2**(INT_WIDTH-1) - 1; // alone yields 2**(31)-1 - f2i_special_result_s3[i][INT_WIDTH-1] = unsigned_s3; // for unsigned casts yields 2**31 + f2i_special_result_s3[i][INT_WIDTH-1] = ~is_signed_s3; // for unsigned casts yields 2**31 end end @@ -354,7 +343,7 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #( assign f2i_result_is_special_s3[i] = fclass_s3[i].is_nan | fclass_s3[i].is_inf | of_before_round_s3[i] - | (input_sign_s3[i] & unsigned_s3 & ~rounded_int_res_zero_s3[i]); + | (input_sign_s3[i] & ~is_signed_s3 & ~rounded_int_res_zero_s3[i]); // All integer special cases are invalid assign f2i_special_status_s3[i] = {1'b1, 4'h0}; From 4e7a536918638e09e9f4707685a91fa5e2bb451d Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Tue, 14 Nov 2023 05:37:46 -0800 Subject: [PATCH 05/11] adding tensor regression test. --- kernel/src/vx_spawn.c | 9 +- tests/opencl/matmul/kernel.cl | 54 +++--- tests/opencl/matmul/main.cc | 4 +- tests/regression/Makefile | 6 + tests/regression/basic/main.cpp | 7 +- tests/regression/demo/common.h | 6 +- tests/regression/demo/kernel.cpp | 8 +- tests/regression/demo/main.cpp | 94 ++++++++--- tests/regression/diverge/main.cpp | 7 +- tests/regression/fence/main.cpp | 7 +- tests/regression/io_addr/main.cpp | 11 +- tests/regression/mstress/main.cpp | 11 +- tests/regression/no_mf_ext/main.cpp | 11 +- tests/regression/no_smem/main.cpp | 9 +- tests/regression/printf/main.cpp | 11 +- tests/regression/sort/common.h | 8 +- tests/regression/sort/kernel.cpp | 8 +- tests/regression/sort/main.cpp | 19 +-- tests/regression/tensor/Makefile | 9 + tests/regression/tensor/common.h | 18 ++ tests/regression/tensor/kernel.cpp | 41 +++++ tests/regression/tensor/main.cpp | 249 ++++++++++++++++++++++++++++ 22 files changed, 474 insertions(+), 133 deletions(-) create mode 100644 tests/regression/tensor/Makefile create mode 100644 tests/regression/tensor/common.h create mode 100644 tests/regression/tensor/kernel.cpp create mode 100644 tests/regression/tensor/main.cpp diff --git a/kernel/src/vx_spawn.c b/kernel/src/vx_spawn.c index 14773707..fd8258e1 100644 --- a/kernel/src/vx_spawn.c +++ b/kernel/src/vx_spawn.c @@ -51,9 +51,8 @@ inline char is_log2(int x) { return ((x & (x-1)) == 0); } -inline int fast_log2(int x) { - float f = x; - return (*(int*)(&f)>>23) - 127; +inline int log2_fast(int x) { + return 31 - __builtin_clz (x); } static void __attribute__ ((noinline)) spawn_tasks_all_stub() { @@ -286,8 +285,8 @@ void vx_spawn_kernel(context_t * ctx, vx_spawn_kernel_cb callback, void * arg) { // fast path handling char isXYpow2 = is_log2(XY); - char log2XY = fast_log2(XY); - char log2X = fast_log2(X); + char log2XY = log2_fast(XY); + char log2X = log2_fast(X); wspawn_kernel_args_t wspawn_args = { ctx, callback, arg, core_id * tasks_per_core, fW, rW, isXYpow2, log2XY, log2X diff --git a/tests/opencl/matmul/kernel.cl b/tests/opencl/matmul/kernel.cl index ea9b2156..a0ef2d81 100644 --- a/tests/opencl/matmul/kernel.cl +++ b/tests/opencl/matmul/kernel.cl @@ -5,35 +5,37 @@ __kernel void matmul(__global float *A, __local float *localA, __local float *localB) { - int row = get_global_id(1); - int col = get_global_id(0); + int globalRow = get_global_id(1); + int globalCol = get_global_id(0); int localRow = get_local_id(1); int localCol = get_local_id(0); int localSize = get_local_size(0); // assuming square local size float sum = 0.0f; - // Loop over all blocks of both matrices - for (int k = 0; k < N; k += localSize) { - // Load block of matrix A to local memory - localA[localRow * localSize + localCol] = A[row * N + k + localCol]; + // Load initial blocks of A and B into local memory + int k = 0; + localA[localRow * localSize + localCol] = A[globalRow * N + k + localCol]; + localB[localRow * localSize + localCol] = B[(k + localRow) * N + globalCol]; - // Load block of matrix B to local memory, adjusting for column-major access - localB[localRow * localSize + localCol] = B[(k + localRow) * N + col]; - - // Synchronize to make sure the tiles are loaded + // Iterate over blocks + for (k = 0; k < N; k += 16) { + // Ensure the initial block is loaded barrier(CLK_LOCAL_MEM_FENCE); - // Multiply the two matrix blocks and accumulate result - for (int j = 0; j < localSize; j++) { + // Compute multiplication for this block + for (int j = 0; j < 16; j++) { sum += localA[localRow * localSize + j] * localB[j * localSize + localCol]; } - // Synchronize before loading the next block - barrier(CLK_LOCAL_MEM_FENCE); + // Load the next block of matrix A into local memory + if (k + 16 < N) { + localA[localRow * localSize + localCol] = A[globalRow * N + k + 16 + localCol]; + localB[localRow * localSize + localCol] = B[(k + 16 + localRow) * N + globalCol]; + } } - C[row * N + col] = sum; + C[globalRow * N + globalCol] = sum; } /*__kernel void matmul(__global float *A, __global float *B, __global float *C, const unsigned int N) @@ -49,15 +51,14 @@ __kernel void matmul(__global float *A, float sum = 0.0f; + // Load initial blocks of A and B into local memory + int k = 0; + localA[localRow][localCol] = A[globalRow * N + k + localCol]; + localB[localRow][localCol] = B[(k + localRow) * N + globalCol]; + // Iterate over blocks - for (int k = 0; k < N; k += 16) { - // Load a block of matrix A into local memory - localA[localRow][localCol] = A[globalRow * N + k + localCol]; - - // Load a block of matrix B into local memory - localB[localRow][localCol] = B[(k + localRow) * N + globalCol]; - - // Ensure the entire block is loaded + for (k = 0; k < N; k += 16) { + // Ensure the initial block is loaded barrier(CLK_LOCAL_MEM_FENCE); // Compute multiplication for this block @@ -65,8 +66,11 @@ __kernel void matmul(__global float *A, sum += localA[localRow][j] * localB[j][localCol]; } - // Wait until all threads have computed before loading the next block - barrier(CLK_LOCAL_MEM_FENCE); + // Load the next block of matrix A into local memory + if (k + 16 < N) { + localA[localRow][localCol] = A[globalRow * N + k + 16 + localCol]; + localB[localRow][localCol] = B[(k + 16 + localRow) * N + globalCol]; + } } C[globalRow * N + globalCol] = sum; diff --git a/tests/opencl/matmul/main.cc b/tests/opencl/matmul/main.cc index 8e20a3ef..f7714dd7 100644 --- a/tests/opencl/matmul/main.cc +++ b/tests/opencl/matmul/main.cc @@ -184,8 +184,8 @@ int main (int argc, char **argv) { CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&b_memobj)); CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&c_memobj)); CL_CHECK(clSetKernelArg(kernel, 3, sizeof(uint32_t), &size)); - CL_CHECK(clSetKernelArg(kernel, 4, local_size[0]*local_size[1]*sizeof(float), NULL)); - CL_CHECK(clSetKernelArg(kernel, 5, local_size[0]*local_size[1]*sizeof(float), NULL)); + //CL_CHECK(clSetKernelArg(kernel, 4, local_size[0]*local_size[1]*sizeof(float), NULL)); + //CL_CHECK(clSetKernelArg(kernel, 5, local_size[0]*local_size[1]*sizeof(float), NULL)); // Allocate memories for input arrays and output arrays. std::vector h_a(size * size); diff --git a/tests/regression/Makefile b/tests/regression/Makefile index 5ba29d57..89fa25af 100644 --- a/tests/regression/Makefile +++ b/tests/regression/Makefile @@ -10,6 +10,7 @@ all: $(MAKE) -C fence $(MAKE) -C no_mf_ext $(MAKE) -C no_smem + $(MAKE) -C tensor run-simx: $(MAKE) -C basic run-simx @@ -23,6 +24,7 @@ run-simx: $(MAKE) -C fence run-simx $(MAKE) -C no_mf_ext run-simx $(MAKE) -C no_smem run-simx + $(MAKE) -C tensor run-simx run-rtlsim: $(MAKE) -C basic run-rtlsim @@ -36,6 +38,7 @@ run-rtlsim: $(MAKE) -C fence run-rtlsim $(MAKE) -C no_mf_ext run-rtlsim $(MAKE) -C no_smem run-rtlsim + $(MAKE) -C tensor run-rtlsim run-opae: $(MAKE) -C basic run-opae @@ -49,6 +52,7 @@ run-opae: $(MAKE) -C fence run-opae $(MAKE) -C no_mf_ext run-opae $(MAKE) -C no_smem run-opae + $(MAKE) -C tensor run-opae clean: $(MAKE) -C basic clean @@ -62,6 +66,7 @@ clean: $(MAKE) -C fence clean $(MAKE) -C no_mf_ext clean $(MAKE) -C no_smem clean + $(MAKE) -C tensor clean clean-all: $(MAKE) -C basic clean-all @@ -75,3 +80,4 @@ clean-all: $(MAKE) -C fence clean-all $(MAKE) -C no_mf_ext clean-all $(MAKE) -C no_smem clean-all + $(MAKE) -C tensor clean-all diff --git a/tests/regression/basic/main.cpp b/tests/regression/basic/main.cpp index e79387b5..0f6f3bde 100755 --- a/tests/regression/basic/main.cpp +++ b/tests/regression/basic/main.cpp @@ -262,11 +262,8 @@ int main(int argc, char *argv[]) { // upload kernel argument std::cout << "upload kernel argument" << std::endl; - { - auto buf_ptr = (void*)staging_buf.data(); - memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t)); - RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t))); - } + memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t)); + RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t))); std::cout << "run kernel test" << std::endl; RT_CHECK(run_kernel_test(kernel_arg, buf_size, num_points)); diff --git a/tests/regression/demo/common.h b/tests/regression/demo/common.h index e18b65a0..941983ac 100644 --- a/tests/regression/demo/common.h +++ b/tests/regression/demo/common.h @@ -3,6 +3,10 @@ #define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000 +#ifndef TYPE +#define TYPE float +#endif + typedef struct { uint32_t num_tasks; uint32_t task_size; @@ -11,4 +15,4 @@ typedef struct { uint64_t dst_addr; } kernel_arg_t; -#endif \ No newline at end of file +#endif diff --git a/tests/regression/demo/kernel.cpp b/tests/regression/demo/kernel.cpp index deb56169..49945440 100644 --- a/tests/regression/demo/kernel.cpp +++ b/tests/regression/demo/kernel.cpp @@ -4,11 +4,11 @@ #include "common.h" void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) { - uint32_t count = arg->task_size; - int32_t* src0_ptr = (int32_t*)arg->src0_addr; - int32_t* src1_ptr = (int32_t*)arg->src1_addr; - int32_t* dst_ptr = (int32_t*)arg->dst_addr; + auto src0_ptr = reinterpret_cast(arg->src0_addr); + auto src1_ptr = reinterpret_cast(arg->src1_addr); + auto dst_ptr = reinterpret_cast(arg->dst_addr); + uint32_t count = arg->task_size; uint32_t offset = task_id * count; for (uint32_t i = 0; i < count; ++i) { diff --git a/tests/regression/demo/main.cpp b/tests/regression/demo/main.cpp index dfe33377..63556a5f 100644 --- a/tests/regression/demo/main.cpp +++ b/tests/regression/demo/main.cpp @@ -5,6 +5,8 @@ #include #include "common.h" +#define FLOAT_ULP 6 + #define RT_CHECK(_expr) \ do { \ int _ret = _expr; \ @@ -17,10 +19,52 @@ /////////////////////////////////////////////////////////////////////////////// +union Float_t { + float f; + int i; + struct { + uint32_t man : 23; + uint32_t exp : 8; + uint32_t sign : 1; + } parts; +}; + +template +class Comparator {}; + +template <> +class Comparator { +public: + static const char* type_str() { + return "integer"; + } + static bool compare(int a, int b) { + return a == b; + } +}; + +template <> +class Comparator { +public: + static const char* type_str() { + return "float"; + } + static bool compare(float a, float b) { + Float_t fa{a}, fb{b}; + auto d = std::abs(fa.i - fb.i); + if (d > FLOAT_ULP) { + std::cout << "*** almost_equal_ulp: a=" << a << ", b=" << b << ", ulp=" << d << ", ia=" << std::hex << fa.i << ", ib=" << fb.i << std::endl; + return false; + } + return true; + } +}; + const char* kernel_file = "kernel.bin"; -uint32_t count = 0; +uint32_t count = 16; vx_device_h device = nullptr; +std::vector source_data; std::vector staging_buf; kernel_arg_t kernel_arg = {}; @@ -79,11 +123,11 @@ int run_test(const kernel_arg_t& kernel_arg, std::cout << "verify result" << std::endl; { int errors = 0; - auto buf_ptr = (int32_t*)staging_buf.data(); + auto buf_ptr = (TYPE*)staging_buf.data(); for (uint32_t i = 0; i < num_points; ++i) { - int ref = i + i; - int cur = buf_ptr[i]; - if (cur != ref) { + auto ref = source_data[2 * i + 0] + source_data[2 * i + 1]; + auto cur = buf_ptr[i]; + if (!Comparator::compare(cur, ref)) { std::cout << "error at result #" << std::dec << i << std::hex << ": actual 0x" << cur << ", expected 0x" << ref << std::endl; ++errors; @@ -103,9 +147,7 @@ int main(int argc, char *argv[]) { // parse command arguments parse_args(argc, argv); - if (count == 0) { - count = 1; - } + std::srand(50); // open device connection std::cout << "open device connection" << std::endl; @@ -118,8 +160,9 @@ int main(int argc, char *argv[]) { uint32_t num_tasks = num_cores * num_warps * num_threads; uint32_t num_points = count * num_tasks; - uint32_t buf_size = num_points * sizeof(int32_t); + uint32_t buf_size = num_points * sizeof(TYPE); + std::cout << "data type: " << Comparator::type_str() << std::endl; std::cout << "number of points: " << num_points << std::endl; std::cout << "buffer size: " << buf_size << " bytes" << std::endl; @@ -147,18 +190,22 @@ int main(int argc, char *argv[]) { // upload kernel argument std::cout << "upload kernel argument" << std::endl; - { - auto buf_ptr = (int*)staging_buf.data(); - memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t)); - RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t))); + memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t)); + RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t))); + + // generate source data + source_data.resize(2 * num_points); + for (uint32_t i = 0; i < source_data.size(); ++i) { + auto r = static_cast(std::rand()) / RAND_MAX; + source_data[i] = static_cast(r * 2 * num_points); } // upload source buffer0 { std::cout << "upload source buffer0" << std::endl; - auto buf_ptr = (int32_t*)staging_buf.data(); + auto buf_ptr = (TYPE*)staging_buf.data(); for (uint32_t i = 0; i < num_points; ++i) { - buf_ptr[i] = i-1; + buf_ptr[i] = source_data[2 * i + 0]; } RT_CHECK(vx_copy_to_dev(device, kernel_arg.src0_addr, staging_buf.data(), buf_size)); } @@ -166,23 +213,18 @@ int main(int argc, char *argv[]) { // upload source buffer1 { std::cout << "upload source buffer1" << std::endl; - auto buf_ptr = (int32_t*)staging_buf.data(); + auto buf_ptr = (TYPE*)staging_buf.data(); for (uint32_t i = 0; i < num_points; ++i) { - buf_ptr[i] = i+1; + buf_ptr[i] = source_data[2 * i + 1]; } RT_CHECK(vx_copy_to_dev(device, kernel_arg.src1_addr, staging_buf.data(), buf_size)); } // clear destination buffer - { - std::cout << "clear destination buffer" << std::endl; - auto buf_ptr = (int32_t*)staging_buf.data(); - for (uint32_t i = 0; i < num_points; ++i) { - buf_ptr[i] = 0xdeadbeef; - } - RT_CHECK(vx_copy_to_dev(device, kernel_arg.dst_addr, staging_buf.data(), buf_size)); - } - + std::cout << "clear destination buffer" << std::endl; + memset(staging_buf.data(), 0, num_points * sizeof(TYPE)); + RT_CHECK(vx_copy_to_dev(device, kernel_arg.dst_addr, staging_buf.data(), buf_size)); + // run tests std::cout << "run tests" << std::endl; RT_CHECK(run_test(kernel_arg, buf_size, num_points)); diff --git a/tests/regression/diverge/main.cpp b/tests/regression/diverge/main.cpp index 742f2419..d5de1bc1 100644 --- a/tests/regression/diverge/main.cpp +++ b/tests/regression/diverge/main.cpp @@ -233,11 +233,8 @@ int main(int argc, char *argv[]) { // upload kernel argument std::cout << "upload kernel argument" << std::endl; - { - auto buf_ptr = (int*)staging_buf.data(); - memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t)); - RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t))); - } + memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t)); + RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t))); // upload source buffer { diff --git a/tests/regression/fence/main.cpp b/tests/regression/fence/main.cpp index d9f2920f..c9225edc 100644 --- a/tests/regression/fence/main.cpp +++ b/tests/regression/fence/main.cpp @@ -147,11 +147,8 @@ int main(int argc, char *argv[]) { // upload kernel argument std::cout << "upload kernel argument" << std::endl; - { - auto buf_ptr = (int*)staging_buf.data(); - memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t)); - RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t))); - } + memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t)); + RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t))); // upload source buffer0 { diff --git a/tests/regression/io_addr/main.cpp b/tests/regression/io_addr/main.cpp index d4c74aad..0272bfbc 100644 --- a/tests/regression/io_addr/main.cpp +++ b/tests/regression/io_addr/main.cpp @@ -190,13 +190,10 @@ int main(int argc, char *argv[]) { staging_buf.resize(staging_buf_size); // upload kernel argument - { - std::cout << "upload kernel argument" << std::endl; - auto buf_ptr = (int*)staging_buf.data(); - memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t)); - RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t))); - } - + std::cout << "upload kernel argument" << std::endl; + memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t)); + RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t))); + // upload test address data { std::cout << "upload test address data" << std::endl; diff --git a/tests/regression/mstress/main.cpp b/tests/regression/mstress/main.cpp index ecc867bc..9b527126 100644 --- a/tests/regression/mstress/main.cpp +++ b/tests/regression/mstress/main.cpp @@ -236,13 +236,10 @@ int main(int argc, char *argv[]) { staging_buf.resize(staging_buf_size); // upload kernel argument - { - std::cout << "upload kernel argument" << std::endl; - auto buf_ptr = (int*)staging_buf.data(); - memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t)); - RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t))); - } - + std::cout << "upload kernel argument" << std::endl; + memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t)); + RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t))); + // upload source buffer0 { std::cout << "upload address buffer" << std::endl; diff --git a/tests/regression/no_mf_ext/main.cpp b/tests/regression/no_mf_ext/main.cpp index 7632dad1..e711b99a 100644 --- a/tests/regression/no_mf_ext/main.cpp +++ b/tests/regression/no_mf_ext/main.cpp @@ -136,13 +136,10 @@ int main(int argc, char *argv[]) { staging_buf.resize(alloc_size); // upload kernel argument - { - std::cout << "upload kernel argument" << std::endl; - auto buf_ptr = (int*)staging_buf.data(); - memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t)); - RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t))); - } - + std::cout << "upload kernel argument" << std::endl; + memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t)); + RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t))); + // upload source buffer0 { std::cout << "upload source buffer" << std::endl; diff --git a/tests/regression/no_smem/main.cpp b/tests/regression/no_smem/main.cpp index 8bb00389..53db0465 100644 --- a/tests/regression/no_smem/main.cpp +++ b/tests/regression/no_smem/main.cpp @@ -135,13 +135,10 @@ int main(int argc, char *argv[]) { uint32_t alloc_size = std::max(buf_size, sizeof(kernel_arg_t)); staging_buf.resize(alloc_size); - // upload kernel argument + // upload kernel argument std::cout << "upload kernel argument" << std::endl; - { - auto buf_ptr = (int*)staging_buf.data(); - memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t)); - RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t))); - } + memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t)); + RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t))); // upload source buffer0 { diff --git a/tests/regression/printf/main.cpp b/tests/regression/printf/main.cpp index 4b13faad..3a920294 100644 --- a/tests/regression/printf/main.cpp +++ b/tests/regression/printf/main.cpp @@ -110,13 +110,10 @@ int main(int argc, char *argv[]) { staging_buf.resize(alloc_size); // upload kernel argument - { - std::cout << "upload kernel argument" << std::endl; - auto buf_ptr = (void*)staging_buf.data(); - memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t)); - RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t))); - } - + std::cout << "upload kernel argument" << std::endl; + memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t)); + RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t))); + // upload source buffer0 { std::cout << "upload source buffer" << std::endl; diff --git a/tests/regression/sort/common.h b/tests/regression/sort/common.h index 492e03c6..92ceeb91 100644 --- a/tests/regression/sort/common.h +++ b/tests/regression/sort/common.h @@ -3,11 +3,7 @@ #define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000 -#define FP_ENABLE - -#ifdef FP_ENABLE -#define TYPE float -#else +#ifndef TYPE #define TYPE int #endif @@ -17,4 +13,4 @@ typedef struct { uint64_t dst_addr; } kernel_arg_t; -#endif \ No newline at end of file +#endif diff --git a/tests/regression/sort/kernel.cpp b/tests/regression/sort/kernel.cpp index 0cd7074e..2e9d3453 100644 --- a/tests/regression/sort/kernel.cpp +++ b/tests/regression/sort/kernel.cpp @@ -5,14 +5,14 @@ void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) { uint32_t num_points = arg->num_points; - TYPE* src_ptr = (TYPE*)arg->src_addr; - TYPE* dst_ptr = (TYPE*)arg->dst_addr; + auto src_ptr = (TYPE*)arg->src_addr; + auto dst_ptr = (TYPE*)arg->dst_addr; - TYPE ref_value = src_ptr[task_id]; + auto ref_value = src_ptr[task_id]; uint32_t pos = 0; for (uint32_t i = 0; i < num_points; ++i) { - TYPE cur_value = src_ptr[i]; + auto cur_value = src_ptr[i]; pos += (cur_value < ref_value) || ((cur_value == ref_value) && (i < task_id)); } dst_ptr[pos] = ref_value; diff --git a/tests/regression/sort/main.cpp b/tests/regression/sort/main.cpp index 59796f73..38d5d4d4 100644 --- a/tests/regression/sort/main.cpp +++ b/tests/regression/sort/main.cpp @@ -66,8 +66,8 @@ void gen_input_data(uint32_t num_points) { src_data.resize(num_points); for (uint32_t i = 0; i < num_points; ++i) { - float r = static_cast(std::rand()) / RAND_MAX; - TYPE value = r * num_points; + auto r = static_cast(std::rand()) / RAND_MAX; + auto value = static_cast(r * num_points); src_data[i] = value; std::cout << std::dec << i << ": value=" << value << std::endl; } @@ -172,19 +172,16 @@ int main(int argc, char *argv[]) { { std::cout << "allocate staging buffer" << std::endl; uint32_t staging_buf_size = std::max(src_buf_size, - std::max(dst_buf_size, - sizeof(kernel_arg_t))); + std::max(dst_buf_size, + sizeof(kernel_arg_t))); staging_buf.resize(staging_buf_size); } // upload kernel argument - { - std::cout << "upload kernel argument" << std::endl; - auto buf_ptr = staging_buf.data(); - memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t)); - RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t))); - } - + std::cout << "upload kernel argument" << std::endl; + memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t)); + RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t))); + // upload source buffer { std::cout << "upload source buffer" << std::endl; diff --git a/tests/regression/tensor/Makefile b/tests/regression/tensor/Makefile new file mode 100644 index 00000000..790664dc --- /dev/null +++ b/tests/regression/tensor/Makefile @@ -0,0 +1,9 @@ +PROJECT = tensor + +SRCS = main.cpp + +VX_SRCS = kernel.cpp + +OPTS ?= -s16 + +include ../common.mk \ No newline at end of file diff --git a/tests/regression/tensor/common.h b/tests/regression/tensor/common.h new file mode 100644 index 00000000..75cfc340 --- /dev/null +++ b/tests/regression/tensor/common.h @@ -0,0 +1,18 @@ +#ifndef _COMMON_H_ +#define _COMMON_H_ + +#define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000 + +#ifndef TYPE +#define TYPE float +#endif + +typedef struct { + uint32_t num_tasks; + uint32_t size; + uint64_t A_addr; + uint64_t B_addr; + uint64_t C_addr; +} kernel_arg_t; + +#endif diff --git a/tests/regression/tensor/kernel.cpp b/tests/regression/tensor/kernel.cpp new file mode 100644 index 00000000..5cf0851c --- /dev/null +++ b/tests/regression/tensor/kernel.cpp @@ -0,0 +1,41 @@ +#include +#include +#include +#include "common.h" + +inline char is_log2(uint32_t x) { + return ((x & (x-1)) == 0); +} + +inline uint32_t log2_fast(uint32_t x) { + return 31 - __builtin_clz (x); +} + +void kernel_body(uint32_t task_id, kernel_arg_t* __UNIFORM__ arg) { + auto size = arg->size; + auto A = reinterpret_cast(arg->A_addr); + auto B = reinterpret_cast(arg->B_addr); + auto C = reinterpret_cast(arg->C_addr); + + uint32_t row, col; + if (is_log2(size)) { + uint32_t log_size = log2_fast(size); + row = task_id >> log_size; + col = task_id & (size-1); + } else { + row = task_id / size; + col = task_id % size; + } + + TYPE sum (0); + for (int e = 0; e < size; ++e) { + sum += A[row * size + e] * B[e * size + col]; + } + C[row * size + col] = sum; +} + +int main() { + kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR; + vx_spawn_tasks(arg->num_tasks, (vx_spawn_tasks_cb)kernel_body, arg); + return 0; +} diff --git a/tests/regression/tensor/main.cpp b/tests/regression/tensor/main.cpp new file mode 100644 index 00000000..d93f3177 --- /dev/null +++ b/tests/regression/tensor/main.cpp @@ -0,0 +1,249 @@ +#include +#include +#include +#include +#include +#include "common.h" + +#define FLOAT_ULP 6 + +#define RT_CHECK(_expr) \ + do { \ + int _ret = _expr; \ + if (0 == _ret) \ + break; \ + printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \ + cleanup(); \ + exit(-1); \ + } while (false) + +/////////////////////////////////////////////////////////////////////////////// + +union Float_t { + float f; + int i; + struct { + uint32_t man : 23; + uint32_t exp : 8; + uint32_t sign : 1; + } parts; +}; + +template +class Comparator {}; + +template <> +class Comparator { +public: + static const char* type_str() { + return "integer"; + } + static bool compare(int a, int b) { + return a == b; + } +}; + +template <> +class Comparator { +public: + static const char* type_str() { + return "float"; + } + static bool compare(float a, float b) { + Float_t fa{a}, fb{b}; + auto d = std::abs(fa.i - fb.i); + if (d > FLOAT_ULP) { + std::cout << "*** almost_equal_ulp: a=" << a << ", b=" << b << ", ulp=" << d << ", ia=" << std::hex << fa.i << ", ib=" << fb.i << std::endl; + return false; + } + return true; + } +}; + +static void cpuMatrixMultiply(TYPE* out, const TYPE* A, const TYPE* B, uint32_t width, uint32_t height) { + for (uint32_t row = 0; row < height; ++row) { + for (uint32_t col = 0; col < width; ++col) { + TYPE sum(0); + for (uint32_t e = 0; e < width; ++e) { + sum += A[row * width + e] * B[e * width + col]; + } + out[row * width + col] = sum; + } + } +} + +const char* kernel_file = "kernel.bin"; +uint32_t size = 16; + +vx_device_h device = nullptr; +std::vector staging_buf; +kernel_arg_t kernel_arg = {}; + +static void show_usage() { + std::cout << "Vortex Test." << std::endl; + std::cout << "Usage: [-k: kernel] [-s size] [-h: help]" << std::endl; +} + +static void parse_args(int argc, char **argv) { + int c; + while ((c = getopt(argc, argv, "s:k:h?")) != -1) { + switch (c) { + case 's': + size = atoi(optarg); + break; + case 'k': + kernel_file = optarg; + break; + case 'h': + case '?': { + show_usage(); + exit(0); + } break; + default: + show_usage(); + exit(-1); + } + } +} + +void cleanup() { + if (device) { + vx_mem_free(device, kernel_arg.A_addr); + vx_mem_free(device, kernel_arg.B_addr); + vx_mem_free(device, kernel_arg.C_addr); + vx_dev_close(device); + } +} + +int run_test(const kernel_arg_t& kernel_arg, + uint32_t buf_size, + const std::vector& refs) { + // start device + std::cout << "start device" << std::endl; + RT_CHECK(vx_start(device)); + + // wait for completion + std::cout << "wait for completion" << std::endl; + RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT)); + + // download destination buffer + std::cout << "download destination buffer" << std::endl; + RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.C_addr, buf_size)); + + // verify result + std::cout << "verify result" << std::endl; + { + int errors = 0; + auto buf_ptr = (TYPE*)staging_buf.data(); + for (uint32_t i = 0; i < refs.size(); ++i) { + auto ref = refs[i]; + auto cur = buf_ptr[i]; + if (!Comparator::compare(cur, ref)) { + std::cout << "error at result #" << std::dec << i + << std::hex << ": actual 0x" << cur << ", expected 0x" << ref << std::endl; + ++errors; + } + } + if (errors != 0) { + std::cout << "Found " << std::dec << errors << " errors!" << std::endl; + std::cout << "FAILED!" << std::endl; + return 1; + } + } + + return 0; +} + +int main(int argc, char *argv[]) { + // parse command arguments + parse_args(argc, argv); + + std::srand(50); + + // open device connection + std::cout << "open device connection" << std::endl; + RT_CHECK(vx_dev_open(&device)); + + uint32_t num_points = size * size; + uint32_t buf_size = num_points * sizeof(TYPE); + + std::cout << "data type: " << Comparator::type_str() << std::endl; + std::cout << "matrix size: " << size << "x" << size << std::endl; + std::cout << "buffer size: " << buf_size << " bytes" << std::endl; + + // upload program + std::cout << "upload program" << std::endl; + RT_CHECK(vx_upload_kernel_file(device, kernel_file)); + + // allocate device memory + std::cout << "allocate device memory" << std::endl; + RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.A_addr)); + RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.B_addr)); + RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.C_addr)); + + kernel_arg.num_tasks = num_points; + kernel_arg.size = size; + + std::cout << "dev_src0=0x" << std::hex << kernel_arg.A_addr << std::endl; + std::cout << "dev_src1=0x" << std::hex << kernel_arg.B_addr << std::endl; + std::cout << "dev_dst=0x" << std::hex << kernel_arg.C_addr << std::endl; + + // allocate staging buffer + std::cout << "allocate staging buffer" << std::endl; + uint32_t alloc_size = std::max(buf_size, sizeof(kernel_arg_t)); + staging_buf.resize(alloc_size); + + // upload kernel argument + std::cout << "upload kernel argument" << std::endl; + memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t)); + RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t))); + + // generate source data + std::vector src_A(num_points); + std::vector src_B(num_points); + std::vector refs(num_points); + for (uint32_t i = 0; i < num_points; ++i) { + auto a = static_cast(std::rand()) / RAND_MAX; + auto b = static_cast(std::rand()) / RAND_MAX; + src_A[i] = static_cast(a * size); + src_B[i] = static_cast(b * size); + } + cpuMatrixMultiply(refs.data(), src_A.data(), src_B.data(), size, size); + + // upload source buffer0 + { + std::cout << "upload source buffer0" << std::endl; + auto buf_ptr = (TYPE*)staging_buf.data(); + for (uint32_t i = 0; i < num_points; ++i) { + buf_ptr[i] = src_A[i]; + } + RT_CHECK(vx_copy_to_dev(device, kernel_arg.A_addr, staging_buf.data(), buf_size)); + } + + // upload source buffer1 + { + std::cout << "upload source buffer1" << std::endl; + auto buf_ptr = (TYPE*)staging_buf.data(); + for (uint32_t i = 0; i < num_points; ++i) { + buf_ptr[i] = src_B[i]; + } + RT_CHECK(vx_copy_to_dev(device, kernel_arg.B_addr, staging_buf.data(), buf_size)); + } + + // clear destination buffer + std::cout << "clear destination buffer" << std::endl; + memset(staging_buf.data(), 0, num_points * sizeof(TYPE)); + RT_CHECK(vx_copy_to_dev(device, kernel_arg.C_addr, staging_buf.data(), buf_size)); + + // run tests + std::cout << "run tests" << std::endl; + RT_CHECK(run_test(kernel_arg, buf_size, refs)); + + // cleanup + std::cout << "cleanup" << std::endl; + cleanup(); + + std::cout << "PASSED!" << std::endl; + + return 0; +} \ No newline at end of file From 61e3442ef80b5c7db66e436e08855529d9869af0 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Tue, 14 Nov 2023 22:31:30 -0800 Subject: [PATCH 06/11] adding opencl convolution benchmark --- tests/opencl/Makefile | 6 + tests/opencl/convolution/Makefile | 7 + tests/opencl/convolution/kernel.cl | 32 ++++ tests/opencl/convolution/main.cc | 258 +++++++++++++++++++++++++++++ tests/opencl/matmul/Makefile | 2 +- tests/opencl/matmul/kernel.cl | 59 +++---- tests/opencl/matmul/main.cc | 51 +++--- tests/opencl/oclprintf/main.cc | 2 +- tests/opencl/psort/main.cc | 3 +- tests/opencl/sgemm/common.h | 6 +- tests/opencl/sgemm/main.cc | 125 ++++++++------ tests/opencl/vecadd/main.cc | 4 +- tests/regression/demo/main.cpp | 46 ++--- tests/regression/tensor/Makefile | 2 +- tests/regression/tensor/kernel.cpp | 4 +- tests/regression/tensor/main.cpp | 53 +++--- 16 files changed, 490 insertions(+), 170 deletions(-) create mode 100644 tests/opencl/convolution/Makefile create mode 100644 tests/opencl/convolution/kernel.cl create mode 100644 tests/opencl/convolution/main.cc diff --git a/tests/opencl/Makefile b/tests/opencl/Makefile index 2cee5c5d..c838c3de 100644 --- a/tests/opencl/Makefile +++ b/tests/opencl/Makefile @@ -18,6 +18,7 @@ all: $(MAKE) -C oclprintf $(MAKE) -C blackscholes $(MAKE) -C matmul + $(MAKE) -C convolution run-simx: $(MAKE) -C vecadd run-simx @@ -37,6 +38,7 @@ run-simx: $(MAKE) -C blackscholes run-simx $(MAKE) -C matmul run-simx $(MAKE) -C transpose run-simx + $(MAKE) -C convolution run-simx # $(MAKE) -C vectorhypot run-simx # $(MAKE) -C mri-q run-simx @@ -58,6 +60,7 @@ run-rtlsim: $(MAKE) -C oclprintf run-rtlsim $(MAKE) -C blackscholes run-rtlsim $(MAKE) -C matmul run-rtlsim + $(MAKE) -C convolution run-rtlsim # $(MAKE) -C vectorhypot run-rtlsim # $(MAKE) -C mri-q run-rtlsim @@ -79,6 +82,7 @@ run-opae: $(MAKE) -C oclprintf run-opae $(MAKE) -C blackscholes run-opae $(MAKE) -C matmul run-opae + $(MAKE) -C convolution run-opae # $(MAKE) -C vectorhypot run-opae # $(MAKE) -C mri-q run-opae @@ -102,6 +106,7 @@ clean: $(MAKE) -C oclprintf clean $(MAKE) -C blackscholes clean $(MAKE) -C matmul clean + $(MAKE) -C convolution clean clean-all: $(MAKE) -C vecadd clean-all @@ -124,3 +129,4 @@ clean-all: $(MAKE) -C oclprintf clean-all $(MAKE) -C blackscholes clean-all $(MAKE) -C matmul clean-all + $(MAKE) -C convolution clean-all diff --git a/tests/opencl/convolution/Makefile b/tests/opencl/convolution/Makefile new file mode 100644 index 00000000..42a577d2 --- /dev/null +++ b/tests/opencl/convolution/Makefile @@ -0,0 +1,7 @@ +PROJECT = convolution + +SRCS = main.cc + +OPTS ?= -n32 + +include ../common.mk diff --git a/tests/opencl/convolution/kernel.cl b/tests/opencl/convolution/kernel.cl new file mode 100644 index 00000000..2ef31040 --- /dev/null +++ b/tests/opencl/convolution/kernel.cl @@ -0,0 +1,32 @@ +__kernel void conv3x3(__global float* output, + __global float* input, + __global float* weights, + const int width, + const int height) +{ + int x = get_global_id(0); + int y = get_global_id(1); + + // Adjust for padded borders + int paddedWidth = width + 2; + int paddedX = x + 1; + int paddedY = y + 1; + + // Compute the convolution sum + float sum = 0.0f; + + sum += input[(paddedY - 1) * paddedWidth + (paddedX - 1)] * weights[0]; // Top-left + sum += input[(paddedY - 1) * paddedWidth + paddedX] * weights[1]; // Top-center + sum += input[(paddedY - 1) * paddedWidth + (paddedX + 1)] * weights[2]; // Top-right + + sum += input[paddedY * paddedWidth + (paddedX - 1)] * weights[3]; // Middle-left + sum += input[paddedY * paddedWidth + paddedX] * weights[4]; // Center + sum += input[paddedY * paddedWidth + (paddedX + 1)] * weights[5]; // Middle-right + + sum += input[(paddedY + 1) * paddedWidth + (paddedX - 1)] * weights[6]; // Bottom-left + sum += input[(paddedY + 1) * paddedWidth + paddedX] * weights[7]; // Bottom-center + sum += input[(paddedY + 1) * paddedWidth + (paddedX + 1)] * weights[8]; // Bottom-right + + // Store the result in the output array + output[y * width + x] = sum; +} \ No newline at end of file diff --git a/tests/opencl/convolution/main.cc b/tests/opencl/convolution/main.cc new file mode 100644 index 00000000..d7487c2f --- /dev/null +++ b/tests/opencl/convolution/main.cc @@ -0,0 +1,258 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define FLOAT_ULP 6 + +#define KERNEL_NAME "conv3x3" + +#define CL_CHECK(_expr) \ + do { \ + cl_int _err = _expr; \ + if (_err == CL_SUCCESS) \ + break; \ + printf("OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err); \ + cleanup(); \ + exit(-1); \ + } while (0) + +#define CL_CHECK2(_expr) \ + ({ \ + cl_int _err = CL_INVALID_VALUE; \ + decltype(_expr) _ret = _expr; \ + if (_err != CL_SUCCESS) { \ + printf("OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err); \ + cleanup(); \ + exit(-1); \ + } \ + _ret; \ + }) + +static int read_kernel_file(const char* filename, uint8_t** data, size_t* size) { + if (nullptr == filename || nullptr == data || 0 == size) + return -1; + + FILE* fp = fopen(filename, "r"); + if (NULL == fp) { + fprintf(stderr, "Failed to load kernel."); + return -1; + } + + fseek(fp , 0 , SEEK_END); + long fsize = ftell(fp); + rewind(fp); + + *data = (uint8_t*)malloc(fsize); + *size = fread(*data, 1, fsize, fp); + + fclose(fp); + + return 0; +} + +static bool compare_equal(float a, float b) { + union fi_t { float f; int32_t i; }; + fi_t fa, fb; + fa.f = a; + fb.f = b; + auto d = std::abs(fa.i - fb.i); + return d <= FLOAT_ULP; +} + +static void convolution_cpu(float *O, float *I, float *W, int32_t width, int32_t height) { + int paddedWidth = width + 2; + for (int32_t y = 0; y < height; ++y) { + for (int32_t x = 0; x < width; ++x) { + int paddedY = y + 1; + int paddedX = x + 1; + float sum = 0.0f; + for (int32_t ky = -1; ky <= 1; ++ky) { + for (int32_t kx = -1; kx <= 1; ++kx) { + int32_t iy = paddedY + ky; + int32_t ix = paddedX + kx; + float value = I[iy * paddedWidth + ix]; + float weight = W[(ky + 1) * 3 + (kx + 1)]; + sum += value * weight; + } + } + O[y * width + x] = sum; + } + } +} + +cl_device_id device_id = NULL; +cl_context context = NULL; +cl_command_queue commandQueue = NULL; +cl_program program = NULL; +cl_kernel kernel = NULL; +cl_mem i_memobj = NULL; +cl_mem w_memobj = NULL; +cl_mem o_memobj = NULL; +uint8_t* kernel_bin = NULL; + +static void cleanup() { + if (commandQueue) clReleaseCommandQueue(commandQueue); + if (kernel) clReleaseKernel(kernel); + if (program) clReleaseProgram(program); + if (i_memobj) clReleaseMemObject(i_memobj); + if (w_memobj) clReleaseMemObject(w_memobj); + if (o_memobj) clReleaseMemObject(o_memobj); + if (context) clReleaseContext(context); + if (device_id) clReleaseDevice(device_id); + if (kernel_bin) free(kernel_bin); +} + +int size = 32; + +static void show_usage() { + printf("Usage: [-n size] [-h: help]\n"); +} + +static void parse_args(int argc, char **argv) { + int c; + while ((c = getopt(argc, argv, "n:h?")) != -1) { + switch (c) { + case 'n': + size = atoi(optarg); + break; + case 'h': + case '?': { + show_usage(); + exit(0); + } break; + default: + show_usage(); + exit(-1); + } + } +} + +int main (int argc, char **argv) { + // parse command arguments + parse_args(argc, argv); + + printf("Matrix size=%d\n", size); + + uint32_t o_points = size * size; + uint32_t i_points = (size+2) * (size+2); + uint32_t w_points = 3 * 3; + + cl_platform_id platform_id; + size_t kernel_size; + + // Getting platform and device information + CL_CHECK(clGetPlatformIDs(1, &platform_id, NULL)); + CL_CHECK(clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, NULL)); + + printf("Create context\n"); + context = CL_CHECK2(clCreateContext(NULL, 1, &device_id, NULL, NULL, &_err)); + + char device_string[1024]; + clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(device_string), &device_string, NULL); + printf("Using device: %s\n", device_string); + + printf("Allocate device buffers\n"); + size_t i_nbytes = i_points * sizeof(float); + size_t w_nbytes = w_points * sizeof(float); + size_t o_nbytes = o_points * sizeof(float); + i_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY, i_nbytes, NULL, &_err)); + w_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY, w_nbytes, NULL, &_err)); + o_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_WRITE_ONLY, o_nbytes, NULL, &_err)); + + printf("Create program from kernel source\n"); +#ifdef HOSTGPU + if (0 != read_kernel_file("kernel.cl", &kernel_bin, &kernel_size)) + return -1; + program = CL_CHECK2(clCreateProgramWithSource( + context, 1, (const char**)&kernel_bin, &kernel_size, &_err)); +#else + if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size)) + return -1; + program = CL_CHECK2(clCreateProgramWithBinary( + context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, NULL, &_err)); +#endif + if (program == NULL) { + cleanup(); + return -1; + } + + // Build program + CL_CHECK(clBuildProgram(program, 1, &device_id, NULL, NULL, NULL)); + + // Create kernel + kernel = CL_CHECK2(clCreateKernel(program, KERNEL_NAME, &_err)); + + size_t global_size[2] = {size, size}; + + // Set kernel arguments + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&o_memobj)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&i_memobj)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&w_memobj)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(uint32_t), &size)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(uint32_t), &size)); + + // Allocate memories for input arrays and output arrays. + std::vector h_i(i_points); + std::vector h_w(w_points); + std::vector h_o(o_points, 0.0f); + + // Generate input values + for (int32_t y = -1; y < size+1; ++y) { + for (int32_t x = -1; x < size+1; ++x) { + if (x >= 0 && x < size && y >= 0 && y < size) { + h_i[(y+1) * (size+2) + (x+1)] = static_cast(rand()) / RAND_MAX; + } else { + h_i[(y+1) * (size+2) + (x+1)] = 0; + } + } + } + for (uint32_t i = 0; i < w_points; ++i) { + h_w[i] = static_cast(rand()) / RAND_MAX; + } + + // Creating command queue + commandQueue = CL_CHECK2(clCreateCommandQueue(context, device_id, 0, &_err)); + + printf("Upload source buffers\n"); + CL_CHECK(clEnqueueWriteBuffer(commandQueue, i_memobj, CL_TRUE, 0, i_nbytes, h_i.data(), 0, NULL, NULL)); + CL_CHECK(clEnqueueWriteBuffer(commandQueue, w_memobj, CL_TRUE, 0, w_nbytes, h_w.data(), 0, NULL, NULL)); + + printf("Execute the kernel\n"); + auto time_start = std::chrono::high_resolution_clock::now(); + CL_CHECK(clEnqueueNDRangeKernel(commandQueue, kernel, 2, NULL, global_size, NULL, 0, NULL, NULL)); + CL_CHECK(clFinish(commandQueue)); + auto time_end = std::chrono::high_resolution_clock::now(); + double elapsed = std::chrono::duration_cast(time_end - time_start).count(); + printf("Elapsed time: %lg ms\n", elapsed); + + printf("Download destination buffer\n"); + CL_CHECK(clEnqueueReadBuffer(commandQueue, o_memobj, CL_TRUE, 0, o_nbytes, h_o.data(), 0, NULL, NULL)); + + printf("Verify result\n"); + std::vector ref_vec(o_points); + convolution_cpu(ref_vec.data(), h_i.data(), h_w.data(), size, size); + int errors = 0; + for (uint32_t i = 0; i < o_points; ++i) { + if (!compare_equal(h_o[i], ref_vec[i])) { + if (errors < 100) + printf("*** error: [%d] expected=%f, actual=%f\n", i, ref_vec[i], h_o[i]); + ++errors; + } + } + if (errors != 0) { + printf("FAILED! - %d errors\n", errors); + } else { + printf("PASSED!\n"); + } + + // Clean up + cleanup(); + + return errors; +} diff --git a/tests/opencl/matmul/Makefile b/tests/opencl/matmul/Makefile index 0d1d136a..39b92b36 100644 --- a/tests/opencl/matmul/Makefile +++ b/tests/opencl/matmul/Makefile @@ -2,6 +2,6 @@ PROJECT = matmul SRCS = main.cc -OPTS ?= -n16 +OPTS ?= -n32 include ../common.mk diff --git a/tests/opencl/matmul/kernel.cl b/tests/opencl/matmul/kernel.cl index a0ef2d81..02aa074c 100644 --- a/tests/opencl/matmul/kernel.cl +++ b/tests/opencl/matmul/kernel.cl @@ -7,43 +7,41 @@ __kernel void matmul(__global float *A, { int globalRow = get_global_id(1); int globalCol = get_global_id(0); - int localRow = get_local_id(1); - int localCol = get_local_id(0); + int localRow = get_local_id(1); + int localCol = get_local_id(0); int localSize = get_local_size(0); // assuming square local size float sum = 0.0f; - // Load initial blocks of A and B into local memory - int k = 0; - localA[localRow * localSize + localCol] = A[globalRow * N + k + localCol]; - localB[localRow * localSize + localCol] = B[(k + localRow) * N + globalCol]; + // Loop over all blocks of both matrices + for (int k = 0; k < N; k += localSize) { + // Load block of matrix A to local memory + localA[localRow * localSize + localCol] = A[globalRow * N + k + localCol]; - // Iterate over blocks - for (k = 0; k < N; k += 16) { - // Ensure the initial block is loaded + // Load block of matrix B to local memory, adjusting for column-major access + localB[localRow * localSize + localCol] = B[(k + localRow) * N + globalCol]; + + // Synchronize to make sure the tiles are loaded barrier(CLK_LOCAL_MEM_FENCE); - // Compute multiplication for this block - for (int j = 0; j < 16; j++) { + // Multiply the two matrix blocks and accumulate result + for (int j = 0; j < localSize; j++) { sum += localA[localRow * localSize + j] * localB[j * localSize + localCol]; } - - // Load the next block of matrix A into local memory - if (k + 16 < N) { - localA[localRow * localSize + localCol] = A[globalRow * N + k + 16 + localCol]; - localB[localRow * localSize + localCol] = B[(k + 16 + localRow) * N + globalCol]; - } } C[globalRow * N + globalCol] = sum; } -/*__kernel void matmul(__global float *A, __global float *B, __global float *C, const unsigned int N) +/*__kernel void matmul(__global float *A, + __global float *B, + __global float *C, + const unsigned int N) { int globalRow = get_global_id(1); int globalCol = get_global_id(0); - int localRow = get_local_id(1); - int localCol = get_local_id(0); + int localRow = get_local_id(1); + int localCol = get_local_id(0); // Static local memory declaration __local float localA[16][16]; @@ -51,26 +49,21 @@ __kernel void matmul(__global float *A, float sum = 0.0f; - // Load initial blocks of A and B into local memory - int k = 0; - localA[localRow][localCol] = A[globalRow * N + k + localCol]; - localB[localRow][localCol] = B[(k + localRow) * N + globalCol]; - // Iterate over blocks - for (k = 0; k < N; k += 16) { - // Ensure the initial block is loaded + for (int k = 0; k < N; k += 16) { + // Load a block of matrix A into local memory + localA[localRow][localCol] = A[globalRow * N + k + localCol]; + + // Load a block of matrix B into local memory + localB[localRow][localCol] = B[(k + localRow) * N + globalCol]; + + // Ensure the entire block is loaded barrier(CLK_LOCAL_MEM_FENCE); // Compute multiplication for this block for (int j = 0; j < 16; j++) { sum += localA[localRow][j] * localB[j][localCol]; } - - // Load the next block of matrix A into local memory - if (k + 16 < N) { - localA[localRow][localCol] = A[globalRow * N + k + 16 + localCol]; - localB[localRow][localCol] = B[(k + 16 + localRow) * N + globalCol]; - } } C[globalRow * N + globalCol] = sum; diff --git a/tests/opencl/matmul/main.cc b/tests/opencl/matmul/main.cc index f7714dd7..3d26ff0c 100644 --- a/tests/opencl/matmul/main.cc +++ b/tests/opencl/matmul/main.cc @@ -10,6 +10,8 @@ #define LOCAL_SIZE 16 +#define FLOAT_ULP 6 + #define KERNEL_NAME "matmul" #define CL_CHECK(_expr) \ @@ -56,15 +58,16 @@ static int read_kernel_file(const char* filename, uint8_t** data, size_t* size) return 0; } -static bool compare_equal(float a, float b, int ulp = 21) { - union fi_t { int i; float f; }; +static bool compare_equal(float a, float b) { + union fi_t { float f; int32_t i; }; fi_t fa, fb; fa.f = a; fb.f = b; - return std::abs(fa.i - fb.i) <= ulp; + auto d = std::abs(fa.i - fb.i); + return d <= FLOAT_ULP; } -static void matrix_multiply_cpu(float *A, float *B, float *C, int N) { +static void matmul_cpu(float *C, float *A, float *B, int N) { for (int i = 0; i < N; i++) { for (int j = 0; j < N; j++) { float sum = 0.0f; @@ -98,7 +101,7 @@ static void cleanup() { if (kernel_bin) free(kernel_bin); } -int size = 64; +int size = 32; static void show_usage() { printf("Usage: [-n size] [-h: help]\n"); @@ -106,7 +109,7 @@ static void show_usage() { static void parse_args(int argc, char **argv) { int c; - while ((c = getopt(argc, argv, "fn:h?")) != -1) { + while ((c = getopt(argc, argv, "n:h?")) != -1) { switch (c) { case 'n': size = atoi(optarg); @@ -127,6 +130,8 @@ int main (int argc, char **argv) { // parse command arguments parse_args(argc, argv); + uint32_t num_points = size * size; + printf("Matrix size=%d\n", size); if ((size / LOCAL_SIZE) * LOCAL_SIZE != size) { printf("Error: matrix size must be a multiple of %d\n", LOCAL_SIZE); @@ -148,7 +153,7 @@ int main (int argc, char **argv) { printf("Using device: %s\n", device_string); printf("Allocate device buffers\n"); - size_t nbytes = size * size * sizeof(float); + size_t nbytes = num_points * sizeof(float); a_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY, nbytes, NULL, &_err)); b_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY, nbytes, NULL, &_err)); c_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_WRITE_ONLY, nbytes, NULL, &_err)); @@ -176,32 +181,26 @@ int main (int argc, char **argv) { // Create kernel kernel = CL_CHECK2(clCreateKernel(program, KERNEL_NAME, &_err)); - size_t local_size[2] = {LOCAL_SIZE, LOCAL_SIZE}; size_t global_size[2] = {size, size}; + size_t local_size[2] = {LOCAL_SIZE, LOCAL_SIZE}; // Set kernel arguments CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_memobj)); CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&b_memobj)); CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&c_memobj)); CL_CHECK(clSetKernelArg(kernel, 3, sizeof(uint32_t), &size)); - //CL_CHECK(clSetKernelArg(kernel, 4, local_size[0]*local_size[1]*sizeof(float), NULL)); - //CL_CHECK(clSetKernelArg(kernel, 5, local_size[0]*local_size[1]*sizeof(float), NULL)); + CL_CHECK(clSetKernelArg(kernel, 4, local_size[0]*local_size[1]*sizeof(float), NULL)); + CL_CHECK(clSetKernelArg(kernel, 5, local_size[0]*local_size[1]*sizeof(float), NULL)); // Allocate memories for input arrays and output arrays. - std::vector h_a(size * size); - std::vector h_b(size * size); - std::vector h_c(size * size); + std::vector h_a(num_points); + std::vector h_b(num_points); + std::vector h_c(num_points); - // Initialize values for array members. - for (int i = 0; i < (size * size); ++i) { - #ifdef USE_FLOAT - h_a[i] = (float)rand() / (float)RAND_MAX; - h_b[i] = (float)rand() / (float)RAND_MAX; - #else - h_a[i] = rand(); - h_b[i] = rand(); - #endif - h_c[i] = 0xdeadbeef; + // Generate input values + for (uint32_t i = 0; i < num_points; ++i) { + h_a[i] = static_cast(rand()) / RAND_MAX; + h_b[i] = static_cast(rand()) / RAND_MAX; } // Creating command queue @@ -223,10 +222,10 @@ int main (int argc, char **argv) { CL_CHECK(clEnqueueReadBuffer(commandQueue, c_memobj, CL_TRUE, 0, nbytes, h_c.data(), 0, NULL, NULL)); printf("Verify result\n"); - std::vector ref_vec(size * size); - matrix_multiply_cpu(h_a.data(), h_b.data(), ref_vec.data(), size); + std::vector ref_vec(num_points); + matmul_cpu(ref_vec.data(), h_a.data(), h_b.data(), size); int errors = 0; - for (int i = 0; i < (size * size); i++) { + for (uint32_t i = 0; i < num_points; ++i) { if (!compare_equal(h_c[i], ref_vec[i])) { if (errors < 100) printf("*** error: [%d] expected=%f, actual=%f\n", i, ref_vec[i], h_c[i]); diff --git a/tests/opencl/oclprintf/main.cc b/tests/opencl/oclprintf/main.cc index 7c0463cf..184eec96 100644 --- a/tests/opencl/oclprintf/main.cc +++ b/tests/opencl/oclprintf/main.cc @@ -143,7 +143,7 @@ int main (int argc, char **argv) { // Allocate memories for input arrays and output arrays. h_a = (int*)malloc(nbytes); - // Initialize values for array members. + // Generate input values for (int i = 0; i < size; ++i) { h_a[i] = -1 + i; } diff --git a/tests/opencl/psort/main.cc b/tests/opencl/psort/main.cc index 26a42807..b627ceee 100644 --- a/tests/opencl/psort/main.cc +++ b/tests/opencl/psort/main.cc @@ -155,9 +155,8 @@ int main (int argc, char **argv) { h_a = (int*)malloc(nbytes); h_c = (int*)malloc(nbytes); - // Initialize values for array members. + // Generate input values for (int i = 0; i < size; ++i) { - h_c[i] = 0xdeadbeef; if (float_enable) { float value = sinf(i)*sinf(i); h_a[i] = *(int*)&value; diff --git a/tests/opencl/sgemm/common.h b/tests/opencl/sgemm/common.h index 01f68d48..fdb40bce 100644 --- a/tests/opencl/sgemm/common.h +++ b/tests/opencl/sgemm/common.h @@ -1,12 +1,8 @@ #ifndef COMMON_H #define COMMON_H -#define USE_FLOAT - -#ifdef USE_FLOAT +#ifndef TYPE #define TYPE float -#else -#define TYPE int #endif #endif // COMMON_H \ No newline at end of file diff --git a/tests/opencl/sgemm/main.cc b/tests/opencl/sgemm/main.cc index 3ca14792..7a02929f 100644 --- a/tests/opencl/sgemm/main.cc +++ b/tests/opencl/sgemm/main.cc @@ -11,6 +11,8 @@ #define KERNEL_NAME "sgemm" +#define FLOAT_ULP 6 + #define CL_CHECK(_expr) \ do { \ cl_int _err = _expr; \ @@ -33,6 +35,66 @@ _ret; \ }) +template +class Comparator {}; + +template <> +class Comparator { +public: + static const char* type_str() { + return "integer"; + } + static int generate() { + return rand(); + } + static bool compare(int a, int b, int index, int errors) { + if (a != b) { + if (errors < 100) { + printf("*** error: [%d] expected=%d, actual=%d\n", index, a, b); + } + return false; + } + return true; + } +}; + +template <> +class Comparator { +public: + static const char* type_str() { + return "float"; + } + static int generate() { + return static_cast(rand()) / RAND_MAX; + } + static bool compare(float a, float b, int index, int errors) { + union fi_t { float f; int32_t i; }; + fi_t fa, fb; + fa.f = a; + fb.f = b; + auto d = std::abs(fa.i - fb.i); + if (d > FLOAT_ULP) { + if (errors < 100) { + printf("*** error: [%d] expected=%f, actual=%f\n", index, a, b); + } + return false; + } + return true; + } +}; + +/*static void sgemm_cpu(TYPE *C, const TYPE* A, const TYPE *B, int M, int N, int K) { + for (int m = 0; m < M; ++m) { + for (int n = 0; n < N; ++n) { + TYPE acc = 0; + for (int k = 0; k < K; ++k) { + acc += A[k * M + m] * B[n * K + k]; + } + C[n * M + m] = acc; + } + } +}*/ + static int read_kernel_file(const char* filename, uint8_t** data, size_t* size) { if (nullptr == filename || nullptr == data || 0 == size) return -1; @@ -54,32 +116,6 @@ static int read_kernel_file(const char* filename, uint8_t** data, size_t* size) return 0; } -/*static void matmul(TYPE *C, const TYPE* A, const TYPE *B, int M, int N, int K) { - for (int m = 0; m < M; ++m) { - for (int n = 0; n < N; ++n) { - TYPE acc = 0; - for (int k = 0; k < K; ++k) { - acc += A[k * M + m] * B[n * K + k]; - } - C[n * M + m] = acc; - } - } -}*/ - -#ifdef USE_FLOAT -static bool compare_equal(float a, float b, int ulp = 21) { - union fi_t { int i; float f; }; - fi_t fa, fb; - fa.f = a; - fb.f = b; - return std::abs(fa.i - fb.i) <= ulp; -} -#else -static bool compare_equal(int a, int b, int ulp = 21) { - return (a == b); -} -#endif - cl_device_id device_id = NULL; cl_context context = NULL; cl_command_queue commandQueue = NULL; @@ -145,6 +181,8 @@ int main (int argc, char **argv) { // parse command arguments parse_args(argc, argv); + uint32_t num_points = size * size; + cl_platform_id platform_id; size_t kernel_size; cl_int binary_status; @@ -163,7 +201,7 @@ int main (int argc, char **argv) { context = CL_CHECK2(clCreateContext(NULL, 1, &device_id, NULL, NULL, &_err)); // Allocate device buffers - size_t nbytes = size * size * sizeof(TYPE); + size_t nbytes = num_points * sizeof(TYPE); a_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY, nbytes, NULL, &_err)); b_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY, nbytes, NULL, &_err)); c_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_WRITE_ONLY, nbytes, NULL, &_err)); @@ -194,23 +232,17 @@ int main (int argc, char **argv) { h_b = (TYPE*)malloc(nbytes); h_c = (TYPE*)malloc(nbytes); - // Initialize values for array members. - for (int i = 0; i < (size * size); ++i) { - #ifdef USE_FLOAT - h_a[i] = (float)rand() / (float)RAND_MAX; - h_b[i] = (float)rand() / (float)RAND_MAX; - #else - h_a[i] = rand(); - h_b[i] = rand(); - #endif - h_c[i] = 0xdeadbeef; + // Generate input values + for (uint32_t i = 0; i < num_points; ++i) { + h_a[i] = Comparator::generate(); + h_b[i] = Comparator::generate(); } size_t global_offset[2] = {0, 0}; size_t global_work_size[2] = {size, size}; size_t local_work_size[2] = {1, 1}; - std::vector ref_vec(size * size); + std::vector ref_vec(num_points); // reference generation size_t num_groups_y = global_work_size[1] / local_work_size[1]; @@ -228,12 +260,7 @@ int main (int argc, char **argv) { TYPE acc = 0; for (int k = 0; k < width; k++) { acc += h_a[k * width + r] * h_b[c * width + k]; - } - /*#ifdef USE_FLOAT - printf("*** r=%d, c=%d, v=%f\n", r, c, acc); - #else - printf("*** r=%d, c=%d, v=%d\n", r, c, acc); - #endif*/ + } ref_vec[c * width + r] = acc; } } @@ -260,14 +287,8 @@ int main (int argc, char **argv) { printf("Verify result\n"); int errors = 0; - for (int i = 0; i < (size * size); i++) { - if (!compare_equal(h_c[i], ref_vec[i])) { - if (errors < 100) - #ifdef USE_FLOAT - printf("*** error: [%d] expected=%f, actual=%f\n", i, ref_vec[i], h_c[i]); - #else - printf("*** error: [%d] expected=%d, actual=%d\n", i, ref_vec[i], h_c[i]); - #endif + for (uint32_t i = 0; i < num_points; ++i) { + if (!Comparator::compare(h_c[i], ref_vec[i], i, errors)) { ++errors; } } diff --git a/tests/opencl/vecadd/main.cc b/tests/opencl/vecadd/main.cc index 23aa49b4..992e88be 100644 --- a/tests/opencl/vecadd/main.cc +++ b/tests/opencl/vecadd/main.cc @@ -166,12 +166,10 @@ int main (int argc, char **argv) { h_b = (float*)malloc(nbytes); h_c = (float*)malloc(nbytes); - // Initialize values for array members. + // Generate input values for (int i = 0; i < size; ++i) { h_a[i] = sinf(i)*sinf(i); h_b[i] = cosf(i)*cosf(i); - h_c[i] = 0xdeadbeef; - //printf("*** [%d]: h_a=%f, h_b=%f\n", i, h_a[i], h_b[i]); } // Creating command queue diff --git a/tests/regression/demo/main.cpp b/tests/regression/demo/main.cpp index 63556a5f..f14f66c3 100644 --- a/tests/regression/demo/main.cpp +++ b/tests/regression/demo/main.cpp @@ -19,16 +19,6 @@ /////////////////////////////////////////////////////////////////////////////// -union Float_t { - float f; - int i; - struct { - uint32_t man : 23; - uint32_t exp : 8; - uint32_t sign : 1; - } parts; -}; - template class Comparator {}; @@ -38,22 +28,41 @@ public: static const char* type_str() { return "integer"; } - static bool compare(int a, int b) { - return a == b; + static int generate() { + return rand(); + } + static bool compare(int a, int b, int index, int errors) { + if (a != b) { + if (errors < 100) { + printf("*** error: [%d] expected=%d, actual=%d\n", index, a, b); + } + return false; + } + return true; } }; template <> class Comparator { +private: + union Float_t { float f; int i; }; public: static const char* type_str() { return "float"; } - static bool compare(float a, float b) { - Float_t fa{a}, fb{b}; + static int generate() { + return static_cast(rand()) / RAND_MAX; + } + static bool compare(float a, float b, int index, int errors) { + union fi_t { float f; int32_t i; }; + fi_t fa, fb; + fa.f = a; + fb.f = b; auto d = std::abs(fa.i - fb.i); if (d > FLOAT_ULP) { - std::cout << "*** almost_equal_ulp: a=" << a << ", b=" << b << ", ulp=" << d << ", ia=" << std::hex << fa.i << ", ib=" << fb.i << std::endl; + if (errors < 100) { + printf("*** error: [%d] expected=%f, actual=%f\n", index, a, b); + } return false; } return true; @@ -127,9 +136,7 @@ int run_test(const kernel_arg_t& kernel_arg, for (uint32_t i = 0; i < num_points; ++i) { auto ref = source_data[2 * i + 0] + source_data[2 * i + 1]; auto cur = buf_ptr[i]; - if (!Comparator::compare(cur, ref)) { - std::cout << "error at result #" << std::dec << i - << std::hex << ": actual 0x" << cur << ", expected 0x" << ref << std::endl; + if (!Comparator::compare(cur, ref, i, errors)) { ++errors; } } @@ -196,8 +203,7 @@ int main(int argc, char *argv[]) { // generate source data source_data.resize(2 * num_points); for (uint32_t i = 0; i < source_data.size(); ++i) { - auto r = static_cast(std::rand()) / RAND_MAX; - source_data[i] = static_cast(r * 2 * num_points); + source_data[i] = Comparator::generate(); } // upload source buffer0 diff --git a/tests/regression/tensor/Makefile b/tests/regression/tensor/Makefile index 790664dc..dbb70c3b 100644 --- a/tests/regression/tensor/Makefile +++ b/tests/regression/tensor/Makefile @@ -4,6 +4,6 @@ SRCS = main.cpp VX_SRCS = kernel.cpp -OPTS ?= -s16 +OPTS ?= -n32 include ../common.mk \ No newline at end of file diff --git a/tests/regression/tensor/kernel.cpp b/tests/regression/tensor/kernel.cpp index 5cf0851c..b0e8f69e 100644 --- a/tests/regression/tensor/kernel.cpp +++ b/tests/regression/tensor/kernel.cpp @@ -12,10 +12,10 @@ inline uint32_t log2_fast(uint32_t x) { } void kernel_body(uint32_t task_id, kernel_arg_t* __UNIFORM__ arg) { - auto size = arg->size; - auto A = reinterpret_cast(arg->A_addr); + auto A = reinterpret_cast(arg->A_addr); auto B = reinterpret_cast(arg->B_addr); auto C = reinterpret_cast(arg->C_addr); + auto size = arg->size; uint32_t row, col; if (is_log2(size)) { diff --git a/tests/regression/tensor/main.cpp b/tests/regression/tensor/main.cpp index d93f3177..81103c10 100644 --- a/tests/regression/tensor/main.cpp +++ b/tests/regression/tensor/main.cpp @@ -19,16 +19,6 @@ /////////////////////////////////////////////////////////////////////////////// -union Float_t { - float f; - int i; - struct { - uint32_t man : 23; - uint32_t exp : 8; - uint32_t sign : 1; - } parts; -}; - template class Comparator {}; @@ -38,8 +28,17 @@ public: static const char* type_str() { return "integer"; } - static bool compare(int a, int b) { - return a == b; + static int generate() { + return rand(); + } + static bool compare(int a, int b, int index, int errors) { + if (a != b) { + if (errors < 100) { + printf("*** error: [%d] expected=%d, actual=%d\n", index, a, b); + } + return false; + } + return true; } }; @@ -49,18 +48,26 @@ public: static const char* type_str() { return "float"; } - static bool compare(float a, float b) { - Float_t fa{a}, fb{b}; + static int generate() { + return static_cast(rand()) / RAND_MAX; + } + static bool compare(float a, float b, int index, int errors) { + union fi_t { float f; int32_t i; }; + fi_t fa, fb; + fa.f = a; + fb.f = b; auto d = std::abs(fa.i - fb.i); if (d > FLOAT_ULP) { - std::cout << "*** almost_equal_ulp: a=" << a << ", b=" << b << ", ulp=" << d << ", ia=" << std::hex << fa.i << ", ib=" << fb.i << std::endl; + if (errors < 100) { + printf("*** error: [%d] expected=%f, actual=%f\n", index, a, b); + } return false; } return true; } }; -static void cpuMatrixMultiply(TYPE* out, const TYPE* A, const TYPE* B, uint32_t width, uint32_t height) { +static void matmul_cpu(TYPE* out, const TYPE* A, const TYPE* B, uint32_t width, uint32_t height) { for (uint32_t row = 0; row < height; ++row) { for (uint32_t col = 0; col < width; ++col) { TYPE sum(0); @@ -73,7 +80,7 @@ static void cpuMatrixMultiply(TYPE* out, const TYPE* A, const TYPE* B, uint32_t } const char* kernel_file = "kernel.bin"; -uint32_t size = 16; +uint32_t size = 32; vx_device_h device = nullptr; std::vector staging_buf; @@ -81,14 +88,14 @@ kernel_arg_t kernel_arg = {}; static void show_usage() { std::cout << "Vortex Test." << std::endl; - std::cout << "Usage: [-k: kernel] [-s size] [-h: help]" << std::endl; + std::cout << "Usage: [-k: kernel] [-n size] [-h: help]" << std::endl; } static void parse_args(int argc, char **argv) { int c; - while ((c = getopt(argc, argv, "s:k:h?")) != -1) { + while ((c = getopt(argc, argv, "n:k:h?")) != -1) { switch (c) { - case 's': + case 'n': size = atoi(optarg); break; case 'k': @@ -138,9 +145,7 @@ int run_test(const kernel_arg_t& kernel_arg, for (uint32_t i = 0; i < refs.size(); ++i) { auto ref = refs[i]; auto cur = buf_ptr[i]; - if (!Comparator::compare(cur, ref)) { - std::cout << "error at result #" << std::dec << i - << std::hex << ": actual 0x" << cur << ", expected 0x" << ref << std::endl; + if (!Comparator::compare(cur, ref, i, errors)) { ++errors; } } @@ -208,7 +213,7 @@ int main(int argc, char *argv[]) { src_A[i] = static_cast(a * size); src_B[i] = static_cast(b * size); } - cpuMatrixMultiply(refs.data(), src_A.data(), src_B.data(), size, size); + matmul_cpu(refs.data(), src_A.data(), src_B.data(), size, size); // upload source buffer0 { From ede5e1c311f0424b53ac7c33a943c14411ff54ac Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Wed, 15 Nov 2023 00:28:26 -0800 Subject: [PATCH 07/11] minor update --- tests/opencl/Makefile | 6 +-- tests/opencl/common.mk | 3 +- tests/opencl/oclprintf/main.cc | 20 ++++----- tests/opencl/psort/main.cc | 74 ++++++++++++++++++---------------- tests/opencl/saxpy/main.cc | 31 ++++++-------- tests/opencl/sfilter/main.cc | 31 ++++++-------- tests/opencl/sgemm/main.cc | 20 ++++----- tests/opencl/vecadd/main.cc | 21 +++++----- 8 files changed, 100 insertions(+), 106 deletions(-) diff --git a/tests/opencl/Makefile b/tests/opencl/Makefile index c838c3de..acb96ba9 100644 --- a/tests/opencl/Makefile +++ b/tests/opencl/Makefile @@ -36,9 +36,9 @@ run-simx: $(MAKE) -C lbm run-simx $(MAKE) -C oclprintf run-simx $(MAKE) -C blackscholes run-simx - $(MAKE) -C matmul run-simx $(MAKE) -C transpose run-simx $(MAKE) -C convolution run-simx +# $(MAKE) -C matmul run-simx # $(MAKE) -C vectorhypot run-simx # $(MAKE) -C mri-q run-simx @@ -59,8 +59,8 @@ run-rtlsim: $(MAKE) -C lbm run-rtlsim $(MAKE) -C oclprintf run-rtlsim $(MAKE) -C blackscholes run-rtlsim - $(MAKE) -C matmul run-rtlsim $(MAKE) -C convolution run-rtlsim +# $(MAKE) -C matmul run-rtlsim # $(MAKE) -C vectorhypot run-rtlsim # $(MAKE) -C mri-q run-rtlsim @@ -81,8 +81,8 @@ run-opae: $(MAKE) -C lbm run-opae $(MAKE) -C oclprintf run-opae $(MAKE) -C blackscholes run-opae - $(MAKE) -C matmul run-opae $(MAKE) -C convolution run-opae +# $(MAKE) -C matmul run-opae # $(MAKE) -C vectorhypot run-opae # $(MAKE) -C mri-q run-opae diff --git a/tests/opencl/common.mk b/tests/opencl/common.mk index ce82dac3..762712b4 100644 --- a/tests/opencl/common.mk +++ b/tests/opencl/common.mk @@ -41,13 +41,12 @@ CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors CXXFLAGS += -Wno-deprecated-declarations -Wno-unused-parameter -Wno-narrowing CXXFLAGS += -pthread CXXFLAGS += -I$(POCL_RT_PATH)/include -LDFLAGS += -L$(POCL_RT_PATH)/lib -L$(VORTEX_RT_PATH)/stub -lvortex ifdef HOSTGPU CXXFLAGS += -DHOSTGPU LDFLAGS += -lOpenCL else - LDFLAGS += $(POCL_RT_PATH)/lib/libOpenCL.so + LDFLAGS += -L$(VORTEX_RT_PATH)/stub -lvortex $(POCL_RT_PATH)/lib/libOpenCL.so endif # Debugigng diff --git a/tests/opencl/oclprintf/main.cc b/tests/opencl/oclprintf/main.cc index 184eec96..4af39802 100644 --- a/tests/opencl/oclprintf/main.cc +++ b/tests/opencl/oclprintf/main.cc @@ -106,11 +106,6 @@ int main (int argc, char **argv) { cl_platform_id platform_id; size_t kernel_size; - cl_int binary_status; - - // read kernel binary from file - if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size)) - return -1; // Getting platform and device information CL_CHECK(clGetPlatformIDs(1, &platform_id, NULL)); @@ -124,12 +119,17 @@ int main (int argc, char **argv) { a_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY, nbytes, NULL, &_err)); printf("Create program from kernel source\n"); - program = CL_CHECK2(clCreateProgramWithBinary( - context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &_err)); - if (program == NULL) { - cleanup(); +#ifdef HOSTGPU + if (0 != read_kernel_file("kernel.cl", &kernel_bin, &kernel_size)) return -1; - } + program = CL_CHECK2(clCreateProgramWithSource( + context, 1, (const char**)&kernel_bin, &kernel_size, &_err)); +#else + if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size)) + return -1; + program = CL_CHECK2(clCreateProgramWithBinary( + context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, NULL, &_err)); +#endif // Build program CL_CHECK(clBuildProgram(program, 1, &device_id, NULL, NULL, NULL)); diff --git a/tests/opencl/psort/main.cc b/tests/opencl/psort/main.cc index b627ceee..8bc834dc 100644 --- a/tests/opencl/psort/main.cc +++ b/tests/opencl/psort/main.cc @@ -115,11 +115,6 @@ int main (int argc, char **argv) { cl_platform_id platform_id; size_t kernel_size; - cl_int binary_status; - - // read kernel binary from file - if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size)) - return -1; // Getting platform and device information CL_CHECK(clGetPlatformIDs(1, &platform_id, NULL)); @@ -134,12 +129,17 @@ int main (int argc, char **argv) { c_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_WRITE_ONLY, nbytes, NULL, &_err)); printf("Create program from kernel source\n"); - program = CL_CHECK2(clCreateProgramWithBinary( - context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &_err)); - if (program == NULL) { - cleanup(); +#ifdef HOSTGPU + if (0 != read_kernel_file("kernel.cl", &kernel_bin, &kernel_size)) return -1; - } + program = CL_CHECK2(clCreateProgramWithSource( + context, 1, (const char**)&kernel_bin, &kernel_size, &_err)); +#else + if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size)) + return -1; + program = CL_CHECK2(clCreateProgramWithBinary( + context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, NULL, &_err)); +#endif // Build program CL_CHECK(clBuildProgram(program, 1, &device_id, NULL, NULL, NULL)); @@ -159,12 +159,12 @@ int main (int argc, char **argv) { for (int i = 0; i < size; ++i) { if (float_enable) { float value = sinf(i)*sinf(i); - h_a[i] = *(int*)&value; - printf("*** [%d]: h_a=%f\n", i, value); + ((float*)h_a)[i] = value; + printf("*** [%d]: %f\n", i, value); } else { int value = size*sinf(i); h_a[i] = value; - printf("*** [%d]: h_a=%d\n", i, value); + printf("*** [%d]: %d\n", i, value); } } @@ -188,38 +188,44 @@ int main (int argc, char **argv) { CL_CHECK(clEnqueueReadBuffer(commandQueue, c_memobj, CL_TRUE, 0, nbytes, h_c, 0, NULL, NULL)); printf("Verify result\n"); - for (int i = 0; i < size; ++i) { - int value = h_c[i]; + for (int i = 0; i < size; ++i) { if (float_enable) { - printf("*** [%d]: h_a=%f\n", i, *(float*)&value); + float value = ((float*)h_c)[i]; + printf("*** [%d]: %f\n", i, value); } else { - printf("*** [%d]: h_a=%d\n", i, value); + int value = h_c[i]; + printf("*** [%d]: %d\n", i, value); } } int errors = 0; - for (int i = 0; i < size; ++i) { - int ref = h_a[i]; - float ref_f = *(float*)&ref; + for (int i = 0; i < size; ++i) { int pos = 0; - for (int j = 0; j < size; ++j) { - int cur = h_a[j]; - if (float_enable) { - float cur_f = *(float*)&cur; - pos += (cur_f < ref_f) || (cur_f == ref_f && j < i); - } else { + if (float_enable) { + float ref = ((float*)h_a)[i]; + for (int j = 0; j < size; ++j) { + float cur = ((float*)h_a)[j]; + pos += (cur < ref) || (cur == ref && j < i); + } + float value = ((float*)h_c)[pos]; + if (value != ref) { + if (errors < 100) { + printf("*** error: [%d] expected=%f, actual=%f\n", pos, ref, value); + } + ++errors; + } + } else { + int ref = h_a[i]; + for (int j = 0; j < size; ++j) { + int cur = h_a[j]; pos += (cur < ref) || (cur == ref && j < i); } - } - int value = h_c[pos]; - if (value != ref) { - if (errors < 100) { - if (float_enable) { - printf("*** error: [%d] expected=%f, actual=%f\n", pos, ref_f, *(float*)&value); - } else { + int value = h_c[pos]; + if (value != ref) { + if (errors < 100) { printf("*** error: [%d] expected=%d, actual=%d\n", pos, ref, value); } + ++errors; } - ++errors; } } if (0 == errors) { diff --git a/tests/opencl/saxpy/main.cc b/tests/opencl/saxpy/main.cc index 4ea15759..9355c945 100644 --- a/tests/opencl/saxpy/main.cc +++ b/tests/opencl/saxpy/main.cc @@ -151,16 +151,12 @@ int main(int argc, char **argv) { cl_platform_id platform_id; cl_device_id device_id; + cl_program program; cl_mem input_buffer; cl_mem output_buffer; size_t kernel_size; cl_context context; cl_command_queue queue; - cl_int binary_status = 0; - - // read kernel binary from file - if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size)) - return -1; // Getting platform and device information CL_CHECK(clGetPlatformIDs(1, &platform_id, NULL)); @@ -172,19 +168,18 @@ int main(int argc, char **argv) { cl_kernel kernel = 0; cl_mem memObjects[2] = {0, 0}; - // Create OpenCL program - first attempt to load cached binary. - // If that is not available, then create the program from source - // and store the binary for future use. - printf("create program from binary...\n"); - cl_program program = CL_CHECK_ERR(clCreateProgramWithBinary( - context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &_err)); - if (program == NULL) { - std::cerr << "Failed to write program binary" << std::endl; - Cleanup(device_id, context, queue, program, kernel, memObjects); - return 1; - } else { - printf("Read program from binary.\n"); - } + printf("Create program from kernel source\n"); +#ifdef HOSTGPU + if (0 != read_kernel_file("kernel.cl", &kernel_bin, &kernel_size)) + return -1; + program = CL_CHECK_ERR(clCreateProgramWithSource( + context, 1, (const char**)&kernel_bin, &kernel_size, &_err)); +#else + if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size)) + return -1; + program = CL_CHECK_ERR(clCreateProgramWithBinary( + context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, NULL, &_err)); +#endif // Build program CL_CHECK(clBuildProgram(program, 1, &device_id, NULL, NULL, NULL)); diff --git a/tests/opencl/sfilter/main.cc b/tests/opencl/sfilter/main.cc index 3a7a5979..0ae264a5 100644 --- a/tests/opencl/sfilter/main.cc +++ b/tests/opencl/sfilter/main.cc @@ -149,14 +149,10 @@ int main(int argc, char **argv) { cl_platform_id platform_id; cl_device_id device_id; + cl_program program; size_t kernel_size; - cl_int binary_status = 0; uint8_t *kernel_bin = NULL; - // read kernel binary from file - if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size)) - return -1; - // Getting platform and device information CL_CHECK(clGetPlatformIDs(1, &platform_id, NULL)); CL_CHECK(clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, NULL)); @@ -170,19 +166,18 @@ int main(int argc, char **argv) { cl_kernel kernel = 0; cl_mem memObjects[2] = {0, 0}; - // Create OpenCL program - first attempt to load cached binary. - // If that is not available, then create the program from source - // and store the binary for future use. - printf("create program from binary...\n"); - cl_program program = CL_CHECK_ERR(clCreateProgramWithBinary( - context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &_err)); - if (program == NULL) { - std::cerr << "Failed to write program binary" << std::endl; - Cleanup(kernel_bin, device_id, context, queue, program, kernel, memObjects); - return 1; - } else { - printf("Read program from binary."); - } + printf("Create program from kernel source\n"); +#ifdef HOSTGPU + if (0 != read_kernel_file("kernel.cl", &kernel_bin, &kernel_size)) + return -1; + program = CL_CHECK_ERR(clCreateProgramWithSource( + context, 1, (const char**)&kernel_bin, &kernel_size, &_err)); +#else + if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size)) + return -1; + program = CL_CHECK_ERR(clCreateProgramWithBinary( + context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, NULL, &_err)); +#endif // Build program CL_CHECK(clBuildProgram(program, 1, &device_id, NULL, NULL, NULL)); diff --git a/tests/opencl/sgemm/main.cc b/tests/opencl/sgemm/main.cc index 7a02929f..bc48dff0 100644 --- a/tests/opencl/sgemm/main.cc +++ b/tests/opencl/sgemm/main.cc @@ -185,13 +185,8 @@ int main (int argc, char **argv) { cl_platform_id platform_id; size_t kernel_size; - cl_int binary_status; srand(50); - - // read kernel binary from file - if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size)) - return -1; // Getting platform and device information CL_CHECK(clGetPlatformIDs(1, &platform_id, NULL)); @@ -207,12 +202,17 @@ int main (int argc, char **argv) { c_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_WRITE_ONLY, nbytes, NULL, &_err)); printf("Create program from kernel source\n"); - program = CL_CHECK2(clCreateProgramWithBinary( - context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &_err)); - if (program == NULL) { - cleanup(); +#ifdef HOSTGPU + if (0 != read_kernel_file("kernel.cl", &kernel_bin, &kernel_size)) return -1; - } + program = CL_CHECK2(clCreateProgramWithSource( + context, 1, (const char**)&kernel_bin, &kernel_size, &_err)); +#else + if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size)) + return -1; + program = CL_CHECK2(clCreateProgramWithBinary( + context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, NULL, &_err)); +#endif // Build program CL_CHECK(clBuildProgram(program, 1, &device_id, NULL, NULL, NULL)); diff --git a/tests/opencl/vecadd/main.cc b/tests/opencl/vecadd/main.cc index 992e88be..e443f7c5 100644 --- a/tests/opencl/vecadd/main.cc +++ b/tests/opencl/vecadd/main.cc @@ -122,11 +122,6 @@ int main (int argc, char **argv) { cl_platform_id platform_id; size_t kernel_size; - cl_int binary_status; - - // read kernel binary from file - if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size)) - return -1; // Getting platform and device information CL_CHECK(clGetPlatformIDs(1, &platform_id, NULL)); @@ -142,13 +137,17 @@ int main (int argc, char **argv) { c_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_WRITE_ONLY, nbytes, NULL, &_err)); printf("Create program from kernel source\n"); - cl_int _err; - program = clCreateProgramWithBinary( - context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &_err); - if (program == NULL) { - cleanup(); +#ifdef HOSTGPU + if (0 != read_kernel_file("kernel.cl", &kernel_bin, &kernel_size)) return -1; - } + program = CL_CHECK2(clCreateProgramWithSource( + context, 1, (const char**)&kernel_bin, &kernel_size, &_err)); +#else + if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size)) + return -1; + program = CL_CHECK2(clCreateProgramWithBinary( + context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, NULL, &_err)); +#endif // Build program CL_CHECK(clBuildProgram(program, 1, &device_id, NULL, NULL, NULL)); From 2c94e358b8d6cfacb360a18ea1810260507a685a Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Wed, 15 Nov 2023 00:52:39 -0800 Subject: [PATCH 08/11] perf counter bug fix --- hw/rtl/core/VX_core.sv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/rtl/core/VX_core.sv b/hw/rtl/core/VX_core.sv index d50a3d32..8aaea911 100644 --- a/hw/rtl/core/VX_core.sv +++ b/hw/rtl/core/VX_core.sv @@ -267,7 +267,7 @@ module VX_core import VX_gpu_pkg::*; #( wire [`CLOG2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_rsp_per_cycle; - wire perf_icache_pending_read_cycle; + wire [1:0] perf_icache_pending_read_cycle; wire [`CLOG2(DCACHE_NUM_REQS+1)+1-1:0] perf_dcache_pending_read_cycle; reg [`PERF_CTR_BITS-1:0] perf_icache_pending_reads; From 547d916ae25a207e31827b801635fb60e1e30162 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Wed, 15 Nov 2023 13:00:06 -0800 Subject: [PATCH 09/11] minor update --- hw/rtl/VX_config.vh | 8 ++++---- hw/syn/altera/opae/Makefile | 6 +++--- hw/syn/xilinx/xrt/Makefile | 6 +++--- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/hw/rtl/VX_config.vh b/hw/rtl/VX_config.vh index 6ecb3cf4..9d4ea56d 100644 --- a/hw/rtl/VX_config.vh +++ b/hw/rtl/VX_config.vh @@ -223,18 +223,18 @@ // Number of ALU units `ifndef NUM_ALU_LANES -`define NUM_ALU_LANES `UP(`NUM_THREADS / 2) +`define NUM_ALU_LANES `NUM_THREADS `endif `ifndef NUM_ALU_BLOCKS -`define NUM_ALU_BLOCKS `UP(`ISSUE_WIDTH / 1) +`define NUM_ALU_BLOCKS `ISSUE_WIDTH `endif // Number of FPU units `ifndef NUM_FPU_LANES -`define NUM_FPU_LANES `UP(`NUM_THREADS / 2) +`define NUM_FPU_LANES `NUM_THREADS `endif `ifndef NUM_FPU_BLOCKS -`define NUM_FPU_BLOCKS `UP(`ISSUE_WIDTH / 1) +`define NUM_FPU_BLOCKS `ISSUE_WIDTH `endif // Number of LSU units diff --git a/hw/syn/altera/opae/Makefile b/hw/syn/altera/opae/Makefile index 0db2015d..56008d42 100644 --- a/hw/syn/altera/opae/Makefile +++ b/hw/syn/altera/opae/Makefile @@ -50,9 +50,9 @@ CONFIGS_1c := -DNUM_CLUSTERS=1 -DNUM_CORES=1 CONFIGS_2c := -DNUM_CLUSTERS=1 -DNUM_CORES=2 CONFIGS_4c := -DNUM_CLUSTERS=1 -DNUM_CORES=4 CONFIGS_8c := -DNUM_CLUSTERS=1 -DNUM_CORES=8 -CONFIGS_16c := -DNUM_CLUSTERS=1 -DNUM_CORES=16 -DL2_ENABLE -CONFIGS_32c := -DNUM_CLUSTERS=2 -DNUM_CORES=16 -DL2_ENABLE -CONFIGS_64c := -DNUM_CLUSTERS=4 -DNUM_CORES=16 -DL2_ENABLE +CONFIGS_16c := -DNUM_CLUSTERS=1 -DNUM_CORES=16 +CONFIGS_32c := -DNUM_CLUSTERS=2 -DNUM_CORES=16 +CONFIGS_64c := -DNUM_CLUSTERS=4 -DNUM_CORES=16 CONFIGS += $(CONFIGS_$(NUM_CORES)c) # include paths diff --git a/hw/syn/xilinx/xrt/Makefile b/hw/syn/xilinx/xrt/Makefile index c8714779..ad8e77a7 100644 --- a/hw/syn/xilinx/xrt/Makefile +++ b/hw/syn/xilinx/xrt/Makefile @@ -67,9 +67,9 @@ CONFIGS_1c := -DNUM_CLUSTERS=1 -DNUM_CORES=1 CONFIGS_2c := -DNUM_CLUSTERS=1 -DNUM_CORES=2 CONFIGS_4c := -DNUM_CLUSTERS=1 -DNUM_CORES=4 CONFIGS_8c := -DNUM_CLUSTERS=1 -DNUM_CORES=8 -CONFIGS_16c := -DNUM_CLUSTERS=1 -DNUM_CORES=16 -DL2_ENABLE -CONFIGS_32c := -DNUM_CLUSTERS=2 -DNUM_CORES=16 -DL2_ENABLE -CONFIGS_64c := -DNUM_CLUSTERS=4 -DNUM_CORES=16 -DL2_ENABLE +CONFIGS_16c := -DNUM_CLUSTERS=1 -DNUM_CORES=16 +CONFIGS_32c := -DNUM_CLUSTERS=2 -DNUM_CORES=16 +CONFIGS_64c := -DNUM_CLUSTERS=4 -DNUM_CORES=16 CONFIGS += $(CONFIGS_$(NUM_CORES)c) # include paths From d65cc61df57b99a8a9226474e53ba5ba34819e9a Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Thu, 16 Nov 2023 12:00:37 -0800 Subject: [PATCH 10/11] minor update --- hw/rtl/VX_config.vh | 6 +- hw/rtl/cache/VX_cache_cluster_top.sv | 190 --------------------------- hw/rtl/cache/VX_cache_data.sv | 2 +- hw/rtl/cache/VX_cache_top.sv | 2 +- hw/syn/altera/quartus/cache/Makefile | 4 +- hw/syn/altera/quartus/core/Makefile | 2 +- 6 files changed, 8 insertions(+), 198 deletions(-) delete mode 100644 hw/rtl/cache/VX_cache_cluster_top.sv diff --git a/hw/rtl/VX_config.vh b/hw/rtl/VX_config.vh index 9d4ea56d..c7b2a2c3 100644 --- a/hw/rtl/VX_config.vh +++ b/hw/rtl/VX_config.vh @@ -407,7 +407,7 @@ // Number of Associative Ways `ifndef ICACHE_NUM_WAYS -`define ICACHE_NUM_WAYS 2 +`define ICACHE_NUM_WAYS 1 `endif // Dcache Configurable Knobs ////////////////////////////////////////////////// @@ -461,7 +461,7 @@ // Number of Associative Ways `ifndef DCACHE_NUM_WAYS -`define DCACHE_NUM_WAYS 2 +`define DCACHE_NUM_WAYS 1 `endif // SM Configurable Knobs ////////////////////////////////////////////////////// @@ -520,7 +520,7 @@ // Number of Associative Ways `ifndef L2_NUM_WAYS -`define L2_NUM_WAYS 4 +`define L2_NUM_WAYS 2 `endif // L3cache Configurable Knobs ///////////////////////////////////////////////// diff --git a/hw/rtl/cache/VX_cache_cluster_top.sv b/hw/rtl/cache/VX_cache_cluster_top.sv deleted file mode 100644 index 500f2c87..00000000 --- a/hw/rtl/cache/VX_cache_cluster_top.sv +++ /dev/null @@ -1,190 +0,0 @@ -// Copyright © 2019-2023 -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -`include "VX_cache_define.vh" - -module VX_cache_cluster_top import VX_gpu_pkg::*; #( - parameter `STRING INSTANCE_ID = "", - - parameter NUM_UNITS = 2, - parameter NUM_INPUTS = 4, - parameter TAG_SEL_IDX = 0, - - // Number of Word requests per cycle - parameter NUM_REQS = 4, - - // Size of cache in bytes - parameter CACHE_SIZE = 16384, - // Size of line inside a bank in bytes - parameter LINE_SIZE = 16, - // Number of banks - parameter NUM_BANKS = 4, - // Number of associative ways - parameter NUM_WAYS = 4, - // Size of a word in bytes - parameter WORD_SIZE = 4, - - // Core Response Queue Size - parameter CRSQ_SIZE = 2, - // Miss Reserv Queue Knob - parameter MSHR_SIZE = 16, - // Memory Response Queue Size - parameter MRSQ_SIZE = 0, - // Memory Request Queue Size - parameter MREQ_SIZE = 4, - - // Enable cache writeable - parameter WRITE_ENABLE = 1, - - // Request debug identifier - parameter UUID_WIDTH = 0, - - // core request tag size - parameter TAG_WIDTH = UUID_WIDTH + 16, - - // enable bypass for non-cacheable addresses - parameter NC_ENABLE = 1, - - // Core response output register - parameter CORE_OUT_REG = 2, - - // Memory request output register - parameter MEM_OUT_REG = 2, - - parameter NUM_CACHES = `UP(NUM_UNITS), - parameter PASSTHRU = (NUM_UNITS == 0), - parameter ARB_TAG_WIDTH = TAG_WIDTH + `ARB_SEL_BITS(NUM_INPUTS, NUM_CACHES), - parameter MEM_TAG_WIDTH = PASSTHRU ? (NC_ENABLE ? `CACHE_NC_BYPASS_TAG_WIDTH(NUM_REQS, LINE_SIZE, WORD_SIZE, ARB_TAG_WIDTH) : - `CACHE_BYPASS_TAG_WIDTH(NUM_REQS, LINE_SIZE, WORD_SIZE, ARB_TAG_WIDTH)) : - (NC_ENABLE ? `CACHE_NC_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS, NUM_REQS, LINE_SIZE, WORD_SIZE, ARB_TAG_WIDTH) : - `CACHE_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS)), - parameter MEM_TAG_X_WIDTH = MEM_TAG_WIDTH + `ARB_SEL_BITS(NUM_CACHES, 1) - ) ( - input wire clk, - input wire reset, - -// PERF -`ifdef PERF_ENABLE - output cache_perf_t cache_perf, -`endif - - // Core request - input wire [NUM_INPUTS-1:0][NUM_REQS-1:0] core_req_valid, - input wire [NUM_INPUTS-1:0][NUM_REQS-1:0] core_req_rw, - input wire [NUM_INPUTS-1:0][NUM_REQS-1:0][WORD_SIZE-1:0] core_req_byteen, - input wire [NUM_INPUTS-1:0][NUM_REQS-1:0][`CS_WORD_ADDR_WIDTH-1:0] core_req_addr, - input wire [NUM_INPUTS-1:0][NUM_REQS-1:0][`CS_WORD_WIDTH-1:0] core_req_data, - input wire [NUM_INPUTS-1:0][NUM_REQS-1:0][TAG_WIDTH-1:0] core_req_tag, - output wire [NUM_INPUTS-1:0][NUM_REQS-1:0] core_req_ready, - - // Core response - output wire [NUM_INPUTS-1:0][NUM_REQS-1:0] core_rsp_valid, - output wire [NUM_INPUTS-1:0][NUM_REQS-1:0][`CS_WORD_WIDTH-1:0] core_rsp_data, - output wire [NUM_INPUTS-1:0][NUM_REQS-1:0][TAG_WIDTH-1:0] core_rsp_tag, - input wire [NUM_INPUTS-1:0][NUM_REQS-1:0] core_rsp_ready, - - // Memory request - output wire mem_req_valid, - output wire mem_req_rw, - output wire [LINE_SIZE-1:0] mem_req_byteen, - output wire [`CS_MEM_ADDR_WIDTH-1:0] mem_req_addr, - output wire [`CS_LINE_WIDTH-1:0] mem_req_data, - output wire [MEM_TAG_X_WIDTH-1:0] mem_req_tag, - input wire mem_req_ready, - - // Memory response - input wire mem_rsp_valid, - input wire [`CS_LINE_WIDTH-1:0] mem_rsp_data, - input wire [MEM_TAG_X_WIDTH-1:0] mem_rsp_tag, - output wire mem_rsp_ready -); - VX_mem_bus_if #( - .DATA_SIZE (WORD_SIZE), - .TAG_WIDTH (TAG_WIDTH) - ) core_bus_if[NUM_INPUTS * NUM_REQS](); - - VX_mem_bus_if #( - .DATA_SIZE (LINE_SIZE), - .TAG_WIDTH (MEM_TAG_X_WIDTH) - ) mem_bus_if(); - - // Core request - for (genvar i = 0; i < NUM_INPUTS; ++i) begin - for (genvar r = 0; r < NUM_REQS; ++r) begin - assign core_bus_if[i * NUM_REQS + r].req_valid = core_req_valid[i][r]; - assign core_bus_if[i * NUM_REQS + r].req_data.rw = core_req_rw[i][r]; - assign core_bus_if[i * NUM_REQS + r].req_data.byteen = core_req_byteen[i][r]; - assign core_bus_if[i * NUM_REQS + r].req_data.addr = core_req_addr[i][r]; - assign core_bus_if[i * NUM_REQS + r].req_data.data = core_req_data[i][r]; - assign core_bus_if[i * NUM_REQS + r].req_data.tag = core_req_tag[i][r]; - assign core_req_ready[i][r] = core_bus_if[i * NUM_REQS + r].req_ready; - end - end - - // Core response - for (genvar i = 0; i < NUM_INPUTS; ++i) begin - for (genvar r = 0; r < NUM_REQS; ++r) begin - assign core_rsp_valid[i][r] = core_bus_if[i * NUM_REQS + r].rsp_valid; - assign core_rsp_data[i][r] = core_bus_if[i * NUM_REQS + r].rsp_data.data; - assign core_rsp_tag[i][r] = core_bus_if[i * NUM_REQS + r].rsp_data.tag; - assign core_bus_if[i * NUM_REQS + r].rsp_ready = core_rsp_ready[i][r]; - end - end - - // Memory request - assign mem_req_valid = mem_bus_if.req_valid; - assign mem_req_rw = mem_bus_if.req_data.rw; - assign mem_req_byteen = mem_bus_if.req_data.byteen; - assign mem_req_addr = mem_bus_if.req_data.addr; - assign mem_req_data = mem_bus_if.req_data.data; - assign mem_req_tag = mem_bus_if.req_data.tag; - assign mem_bus_if.req_ready = mem_req_ready; - - // Memory response - assign mem_bus_if.rsp_valid = mem_rsp_valid; - assign mem_bus_if.rsp_data.data = mem_rsp_data; - assign mem_bus_if.rsp_data.tag = mem_rsp_tag; - assign mem_rsp_ready = mem_bus_if.rsp_ready; - - VX_cache_cluster #( - .INSTANCE_ID (INSTANCE_ID), - .NUM_UNITS (NUM_UNITS), - .NUM_INPUTS (NUM_INPUTS), - .TAG_SEL_IDX (TAG_SEL_IDX), - .NUM_REQS (NUM_REQS), - .CACHE_SIZE (CACHE_SIZE), - .LINE_SIZE (LINE_SIZE), - .NUM_BANKS (NUM_BANKS), - .NUM_WAYS (NUM_WAYS), - .WORD_SIZE (WORD_SIZE), - .CRSQ_SIZE (CRSQ_SIZE), - .MSHR_SIZE (MSHR_SIZE), - .MRSQ_SIZE (MRSQ_SIZE), - .MREQ_SIZE (MREQ_SIZE), - .WRITE_ENABLE (WRITE_ENABLE), - .UUID_WIDTH (UUID_WIDTH), - .TAG_WIDTH (TAG_WIDTH), - .NC_ENABLE (NC_ENABLE), - .CORE_OUT_REG (CORE_OUT_REG), - .MEM_OUT_REG (MEM_OUT_REG) - ) cache ( - `ifdef PERF_ENABLE - .cache_perf (cache_perf), - `endif - .clk (clk), - .reset (reset), - .core_bus_if (core_bus_if), - .mem_bus_if (mem_bus_if) - ); - -endmodule diff --git a/hw/rtl/cache/VX_cache_data.sv b/hw/rtl/cache/VX_cache_data.sv index 493e4884..5106d7d5 100644 --- a/hw/rtl/cache/VX_cache_data.sv +++ b/hw/rtl/cache/VX_cache_data.sv @@ -93,7 +93,7 @@ module VX_cache_data #( assign wren = fill; end - wire [`CLOG2(NUM_WAYS)-1:0] way_idx; + wire [`LOG2UP(NUM_WAYS)-1:0] way_idx; VX_onehot_encoder #( .N (NUM_WAYS) diff --git a/hw/rtl/cache/VX_cache_top.sv b/hw/rtl/cache/VX_cache_top.sv index 9e36d9af..9be08dde 100644 --- a/hw/rtl/cache/VX_cache_top.sv +++ b/hw/rtl/cache/VX_cache_top.sv @@ -22,7 +22,7 @@ module VX_cache_top #( // Size of cache in bytes parameter CACHE_SIZE = 16384, // Size of line inside a bank in bytes - parameter LINE_SIZE = 16, + parameter LINE_SIZE = 64, // Number of banks parameter NUM_BANKS = 4, // Number of associative ways diff --git a/hw/syn/altera/quartus/cache/Makefile b/hw/syn/altera/quartus/cache/Makefile index 258dc91a..f96a7614 100755 --- a/hw/syn/altera/quartus/cache/Makefile +++ b/hw/syn/altera/quartus/cache/Makefile @@ -1,6 +1,6 @@ -PROJECT = VX_cache_cluster_top +PROJECT = VX_cache_top TOP_LEVEL_ENTITY = $(PROJECT) -SRC_FILE = VX_cache_cluster.sv +SRC_FILE = $(PROJECT).sv include ../../common.mk diff --git a/hw/syn/altera/quartus/core/Makefile b/hw/syn/altera/quartus/core/Makefile index f1dc07f3..eeeaa523 100644 --- a/hw/syn/altera/quartus/core/Makefile +++ b/hw/syn/altera/quartus/core/Makefile @@ -1,6 +1,6 @@ PROJECT = VX_core_top TOP_LEVEL_ENTITY = $(PROJECT) -SRC_FILE = VX_core.sv +SRC_FILE = $(PROJECT).sv include ../../common.mk From 43154cf738b9dfe91afc53c926e3734e02bd9ab6 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Thu, 16 Nov 2023 23:41:59 -0800 Subject: [PATCH 11/11] minor updates --- hw/rtl/core/VX_sfu_unit.sv | 4 +-- hw/syn/altera/opae/Makefile | 6 ++--- hw/syn/xilinx/xrt/Makefile | 6 ++--- hw/unittest/top_modules/Makefile | 1 - tests/opencl/Makefile | 12 ++++----- tests/opencl/lbm/main.cc | 4 --- tests/opencl/spmv/convert_dataset.c | 4 --- tests/opencl/spmv/main.cc | 3 --- tests/opencl/stencil/main.cc | 38 +++++++++-------------------- 9 files changed, 25 insertions(+), 53 deletions(-) diff --git a/hw/rtl/core/VX_sfu_unit.sv b/hw/rtl/core/VX_sfu_unit.sv index e94f86fd..fd5dd59f 100644 --- a/hw/rtl/core/VX_sfu_unit.sv +++ b/hw/rtl/core/VX_sfu_unit.sv @@ -170,7 +170,7 @@ module VX_sfu_unit import VX_gpu_pkg::*; #( .NUM_INPUTS (RSP_ARB_SIZE), .DATAW (RSP_ARB_DATAW), .ARBITER ("R"), - .OUT_REG (1) + .OUT_REG (3) ) rsp_arb ( .clk (clk), .reset (commit_reset), @@ -186,7 +186,7 @@ module VX_sfu_unit import VX_gpu_pkg::*; #( VX_gather_unit #( .BLOCK_SIZE (BLOCK_SIZE), .NUM_LANES (NUM_LANES), - .OUT_REG (3) + .OUT_REG (1) ) gather_unit ( .clk (clk), .reset (commit_reset), diff --git a/hw/syn/altera/opae/Makefile b/hw/syn/altera/opae/Makefile index 56008d42..0db2015d 100644 --- a/hw/syn/altera/opae/Makefile +++ b/hw/syn/altera/opae/Makefile @@ -50,9 +50,9 @@ CONFIGS_1c := -DNUM_CLUSTERS=1 -DNUM_CORES=1 CONFIGS_2c := -DNUM_CLUSTERS=1 -DNUM_CORES=2 CONFIGS_4c := -DNUM_CLUSTERS=1 -DNUM_CORES=4 CONFIGS_8c := -DNUM_CLUSTERS=1 -DNUM_CORES=8 -CONFIGS_16c := -DNUM_CLUSTERS=1 -DNUM_CORES=16 -CONFIGS_32c := -DNUM_CLUSTERS=2 -DNUM_CORES=16 -CONFIGS_64c := -DNUM_CLUSTERS=4 -DNUM_CORES=16 +CONFIGS_16c := -DNUM_CLUSTERS=1 -DNUM_CORES=16 -DL2_ENABLE +CONFIGS_32c := -DNUM_CLUSTERS=2 -DNUM_CORES=16 -DL2_ENABLE +CONFIGS_64c := -DNUM_CLUSTERS=4 -DNUM_CORES=16 -DL2_ENABLE CONFIGS += $(CONFIGS_$(NUM_CORES)c) # include paths diff --git a/hw/syn/xilinx/xrt/Makefile b/hw/syn/xilinx/xrt/Makefile index ad8e77a7..c8714779 100644 --- a/hw/syn/xilinx/xrt/Makefile +++ b/hw/syn/xilinx/xrt/Makefile @@ -67,9 +67,9 @@ CONFIGS_1c := -DNUM_CLUSTERS=1 -DNUM_CORES=1 CONFIGS_2c := -DNUM_CLUSTERS=1 -DNUM_CORES=2 CONFIGS_4c := -DNUM_CLUSTERS=1 -DNUM_CORES=4 CONFIGS_8c := -DNUM_CLUSTERS=1 -DNUM_CORES=8 -CONFIGS_16c := -DNUM_CLUSTERS=1 -DNUM_CORES=16 -CONFIGS_32c := -DNUM_CLUSTERS=2 -DNUM_CORES=16 -CONFIGS_64c := -DNUM_CLUSTERS=4 -DNUM_CORES=16 +CONFIGS_16c := -DNUM_CLUSTERS=1 -DNUM_CORES=16 -DL2_ENABLE +CONFIGS_32c := -DNUM_CLUSTERS=2 -DNUM_CORES=16 -DL2_ENABLE +CONFIGS_64c := -DNUM_CLUSTERS=4 -DNUM_CORES=16 -DL2_ENABLE CONFIGS += $(CONFIGS_$(NUM_CORES)c) # include paths diff --git a/hw/unittest/top_modules/Makefile b/hw/unittest/top_modules/Makefile index 2d0319e7..7445381e 100644 --- a/hw/unittest/top_modules/Makefile +++ b/hw/unittest/top_modules/Makefile @@ -56,7 +56,6 @@ PROJECT = top_modules all: build build: $(SRCS) - verilator --build $(VL_FLAGS) --cc VX_cache_cluster_top --top-module VX_cache_cluster_top $^ -CFLAGS '$(CXXFLAGS)' verilator --build $(VL_FLAGS) --cc VX_cache_top --top-module VX_cache_top $^ -CFLAGS '$(CXXFLAGS)' verilator --build $(VL_FLAGS) --cc VX_core_top --top-module VX_core_top $^ -CFLAGS '$(CXXFLAGS)' diff --git a/tests/opencl/Makefile b/tests/opencl/Makefile index acb96ba9..88236559 100644 --- a/tests/opencl/Makefile +++ b/tests/opencl/Makefile @@ -31,16 +31,16 @@ run-simx: $(MAKE) -C dotproduct run-simx $(MAKE) -C kmeans run-simx $(MAKE) -C spmv run-simx - $(MAKE) -C cutcp run-simx $(MAKE) -C stencil run-simx $(MAKE) -C lbm run-simx $(MAKE) -C oclprintf run-simx $(MAKE) -C blackscholes run-simx $(MAKE) -C transpose run-simx $(MAKE) -C convolution run-simx -# $(MAKE) -C matmul run-simx -# $(MAKE) -C vectorhypot run-simx -# $(MAKE) -C mri-q run-simx + $(MAKE) -C cutcp run-simx + $(MAKE) -C matmul run-simx + $(MAKE) -C vectorhypot run-simx + $(MAKE) -C mri-q run-simx run-rtlsim: $(MAKE) -C vecadd run-rtlsim @@ -54,12 +54,12 @@ run-rtlsim: $(MAKE) -C kmeans run-rtlsim $(MAKE) -C spmv run-rtlsim $(MAKE) -C transpose run-rtlsim - $(MAKE) -C cutcp run-rtlsim $(MAKE) -C stencil run-rtlsim $(MAKE) -C lbm run-rtlsim $(MAKE) -C oclprintf run-rtlsim $(MAKE) -C blackscholes run-rtlsim $(MAKE) -C convolution run-rtlsim +# $(MAKE) -C cutcp run-rtlsim # $(MAKE) -C matmul run-rtlsim # $(MAKE) -C vectorhypot run-rtlsim # $(MAKE) -C mri-q run-rtlsim @@ -76,12 +76,12 @@ run-opae: $(MAKE) -C kmeans run-opae $(MAKE) -C spmv run-opae $(MAKE) -C transpose run-opae - $(MAKE) -C cutcp run-opae $(MAKE) -C stencil run-opae $(MAKE) -C lbm run-opae $(MAKE) -C oclprintf run-opae $(MAKE) -C blackscholes run-opae $(MAKE) -C convolution run-opae +# $(MAKE) -C cutcp run-opae # $(MAKE) -C matmul run-opae # $(MAKE) -C vectorhypot run-opae # $(MAKE) -C mri-q run-opae diff --git a/tests/opencl/lbm/main.cc b/tests/opencl/lbm/main.cc index 1d825239..58a930e9 100644 --- a/tests/opencl/lbm/main.cc +++ b/tests/opencl/lbm/main.cc @@ -173,14 +173,10 @@ void MAIN_initialize(const MAIN_Param *param, const OpenCL_Param *prm) { pb_SwitchToTimer(&timers, pb_TimerID_COPY); - printf("OK+\n"); - // Setup DEVICE datastructures OpenCL_LBM_allocateGrid(prm, &OpenCL_srcGrid); OpenCL_LBM_allocateGrid(prm, &OpenCL_dstGrid); - printf("OK-\n"); - // Initialize DEVICE datastructures OpenCL_LBM_initializeGrid(prm, OpenCL_srcGrid, TEMP_srcGrid); OpenCL_LBM_initializeGrid(prm, OpenCL_dstGrid, TEMP_dstGrid); diff --git a/tests/opencl/spmv/convert_dataset.c b/tests/opencl/spmv/convert_dataset.c index 122d8819..aba9c3b3 100644 --- a/tests/opencl/spmv/convert_dataset.c +++ b/tests/opencl/spmv/convert_dataset.c @@ -91,15 +91,11 @@ int coo_to_jds(char *mtx_filename, int pad_rows, int warp_size, int pack_size, if ((f = fopen(mtx_filename, "r")) == NULL) exit(1); - printf("OK**\n"); - if (mm_read_banner(f, &matcode) != 0) { printf("Could not process Matrix Market banner.\n"); exit(1); } - printf("OK**\n"); - /* This is how one can screen matrix types if their application */ /* only supports a subset of the Matrix Market data types. */ diff --git a/tests/opencl/spmv/main.cc b/tests/opencl/spmv/main.cc index 85182322..01aa43cd 100644 --- a/tests/opencl/spmv/main.cc +++ b/tests/opencl/spmv/main.cc @@ -148,7 +148,6 @@ int main(int argc, char **argv) { // &h_data, &h_indices, &h_ptr, // &h_perm, &h_nzcnt); int col_count; - printf("OK--\n"); coo_to_jds(parameters->inpFiles[0], // bcsstk32.mtx, fidapm05.mtx, jgl009.mtx 1, // row padding pad, // warp size @@ -159,8 +158,6 @@ int main(int argc, char **argv) { &h_data, &h_ptr, &h_nzcnt, &h_indices, &h_perm, &col_count, &dim, &len, &nzcnt_len, &depth); - printf("OK++\n"); - // pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); h_Ax_vector = (float *)malloc(sizeof(float) * dim); h_x_vector = (float *)malloc(sizeof(float) * dim); diff --git a/tests/opencl/stencil/main.cc b/tests/opencl/stencil/main.cc index a68bd5a3..cbbed6bc 100644 --- a/tests/opencl/stencil/main.cc +++ b/tests/opencl/stencil/main.cc @@ -157,9 +157,7 @@ int main(int argc, char** argv) { CHECK_ERROR("clBuildProgram") cl_kernel clKernel = clCreateKernel(clProgram,"naive_kernel",&clStatus); - CHECK_ERROR("clCreateKernel") - - printf("OK+\n"); + CHECK_ERROR("clCreateKernel") //host data float *h_A0; @@ -177,15 +175,11 @@ int main(int argc, char** argv) { h_Anext=(float*)malloc(sizeof(float)*size); pb_SwitchToTimer(&timers, pb_TimerID_IO); //FILE *fp = fopen(parameters->inpFiles[0], "rb"); - printf("OK+\n"); read_data(h_A0, nx,ny,nz,NULL); - printf("OK+\n"); - //fclose(fp); - memcpy (h_Anext,h_A0,sizeof(float)*size); + //fclose(fp); + memcpy (h_Anext,h_A0,sizeof(float)*size); pb_SwitchToTimer(&timers, pb_TimerID_COPY); - - printf("OK+\n"); //memory allocation d_A0 = clCreateBuffer(clContext,CL_MEM_READ_WRITE,size*sizeof(float),NULL,&clStatus); @@ -201,18 +195,16 @@ int main(int argc, char** argv) { pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); - printf("OK+\n"); - //only use 1D thread block - int tx = 128; + int tx = 128; size_t block[3] = {tx,1,1}; size_t grid[3] = {(nx-2+tx-1)/tx*tx,ny-2,nz-2}; - //size_t grid[3] = {nx-2,ny-2,nz-2}; - size_t offset[3] = {1,1,1}; - printf("grid size in x/y/z = %d %d %d\n",grid[0],grid[1],grid[2]); + //size_t grid[3] = {nx-2,ny-2,nz-2}; + size_t offset[3] = {1,1,1}; + printf("grid size in x/y/z = %d %d %d\n",grid[0],grid[1],grid[2]); printf("block size in x/y/z = %d %d %d\n",block[0],block[1],block[2]); - printf ("blocks = %d\n", (grid[0]/block[0])*(grid[1]/block[1])*(grid[2]*block[2])); + printf ("blocks = %d\n", (grid[0]/block[0])*(grid[1]/block[1])*(grid[2]*block[2])); clStatus = clSetKernelArg(clKernel,0,sizeof(float),(void*)&c0); clStatus = clSetKernelArg(clKernel,1,sizeof(float),(void*)&c1); @@ -226,14 +218,10 @@ int main(int argc, char** argv) { //main execution pb_SwitchToTimer(&timers, pb_TimerID_KERNEL); - printf("OK+0\n"); - int t; for(t=0;toutFile) { pb_SwitchToTimer(&timers, pb_TimerID_IO);