Vortex 2.0 changes:
+ Microarchitecture optimizations + 64-bit support + Xilinx FPGA support + LLVM-16 support + Refactoring and quality control fixes minor update minor update minor update minor update minor update minor update cleanup cleanup cache bindings and memory perf refactoring minor update minor update hw unit tests fixes minor update minor update minor update minor update minor update minor update minor update minor update minor update minor update minor update minor update minor update minor updates minor updates minor update minor update minor update minor update minor update minor update minor updates minor updates minor updates minor updates minor update minor update
This commit is contained in:
@@ -1,67 +0,0 @@
|
||||
# Legacy bare-metal build of the BlackScholes OpenCL test.
# The kernel is compiled ahead of time with poclcc into lib$(PROJECT).a and
# linked with the host sources and the Vortex runtime sources into one ELF
# image, from which the .hex and .dump files are derived.

PROJECT = BlackScholes
XLEN ?= 32

# Tool and library locations; each may be overridden by the caller.
RISCV_TOOLCHAIN_PATH ?= $(wildcard ../../../../riscv-gnu-toolchain/drops)
POCL_CC_PATH ?= $(wildcard ../../../../pocl/drops_riscv_cc)
POCL_INC_PATH ?= $(wildcard ../include)
POCL_LIB_PATH ?= $(wildcard ../lib)
VORTEX_RT_PATH ?= $(wildcard ../../../runtime)
VX_SIMX_PATH ?= $(wildcard ../../../simx/obj_dir)

# Cross toolchain
CC = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gcc
CXX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-g++
DMP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objdump
HEX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objcopy
GDB = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gdb

# Vortex runtime sources compiled into the image
VX_SRCS = $(VORTEX_RT_PATH)/newlib/newlib.c
VX_SRCS += $(VORTEX_RT_PATH)/startup/vx_start.S
VX_SRCS += $(VORTEX_RT_PATH)/intrinsics/vx_intrinsics.S
VX_SRCS += $(VORTEX_RT_PATH)/io/vx_io.S $(VORTEX_RT_PATH)/io/vx_io.c
VX_SRCS += $(VORTEX_RT_PATH)/fileio/fileio.S
VX_SRCS += $(VORTEX_RT_PATH)/tests/tests.c
VX_SRCS += $(VORTEX_RT_PATH)/vx_api/vx_api.c

VX_CFLAGS = -nostartfiles -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/startup/vx_link$(XLEN).ld

CXXFLAGS = -g -O0 -march=rv32im -mabi=ilp32
# program may not begin at main()
CXXFLAGS += -ffreestanding
# enable garbage collection of unused input sections
CXXFLAGS += -Wl,--gc-sections
# disable RTTI and exceptions
CXXFLAGS += -fno-rtti -fno-non-call-exceptions
CXXFLAGS += -I$(POCL_INC_PATH) -I.

VX_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
QEMU_LIBS = $(VORTEX_RT_PATH)/qemu/vx_api.c -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a

# Host sources that are compiled and linked into the ELF image.
HOST_SRCS = main.cc oclBlackScholes_launcher.cpp oclBlackScholes_gold.cpp
# Headers only trigger rebuilds; they are not passed to the compiler as inputs.
HOST_HDRS = oclBlackScholes_common.h

# None of these names is a file produced by the build.
.PHONY: all run qemu gdb-s gdb-c clean

all: $(PROJECT).dump $(PROJECT).hex

# The recipe compiles kernel.cl, so that is the file the archive depends on.
lib$(PROJECT).a: kernel.cl
	POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOLCHAIN_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o $@ $<

$(PROJECT).elf: $(HOST_SRCS) $(HOST_HDRS) lib$(PROJECT).a
	$(CXX) $(CXXFLAGS) $(VX_CFLAGS) $(VX_SRCS) $(HOST_SRCS) $(VX_LIBS) -o $@

$(PROJECT).qemu: main.cc lib$(PROJECT).a
	$(CXX) $(CXXFLAGS) main.cc $(QEMU_LIBS) -o $@

$(PROJECT).hex: $(PROJECT).elf
	$(HEX) -O ihex $< $@

$(PROJECT).dump: $(PROJECT).elf
	$(DMP) -D $< > $@

# Runs the image on the simulator binary; stdout is captured in emulator.debug.
run: $(PROJECT).hex
	POCL_DEBUG=all $(VX_SIMX_PATH)/Vcache_simX -E --core $< -s -b 1> emulator.debug

qemu: $(PROJECT).qemu
	POCL_DEBUG=all $(RISCV_TOOLCHAIN_PATH)/bin/qemu-riscv32 -d in_asm -D debug.log $<

# Same as 'qemu' but with the gdb stub listening on port 1234.
gdb-s: $(PROJECT).qemu
	POCL_DEBUG=all $(RISCV_TOOLCHAIN_PATH)/bin/qemu-riscv32 -g 1234 -d in_asm -D debug.log $<

gdb-c: $(PROJECT).qemu
	$(GDB) $<

clean:
	rm -rf *.elf *.dump *.hex
|
||||
@@ -1,67 +0,0 @@
|
||||
# Legacy bare-metal build of the DotProduct OpenCL test.
# The kernel is compiled ahead of time with poclcc into lib$(PROJECT).a and
# linked with main.cc and the Vortex runtime sources into one ELF image,
# from which the .hex and .dump files are derived.

PROJECT = DotProduct
XLEN ?= 32

# Tool and library locations; each may be overridden by the caller.
RISCV_TOOLCHAIN_PATH ?= $(wildcard ../../../../riscv-gnu-toolchain/drops)
POCL_CC_PATH ?= $(wildcard ../../../../pocl/drops_riscv_cc)
POCL_INC_PATH ?= $(wildcard ../include)
POCL_LIB_PATH ?= $(wildcard ../lib)
VORTEX_RT_PATH ?= $(wildcard ../../../runtime)
VX_SIMX_PATH ?= $(wildcard ../../../simx/obj_dir)

# Cross toolchain
CC = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gcc
CXX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-g++
DMP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objdump
HEX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objcopy
GDB = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gdb

# Vortex runtime sources compiled into the image
VX_SRCS = $(VORTEX_RT_PATH)/newlib/newlib.c
VX_SRCS += $(VORTEX_RT_PATH)/startup/vx_start.S
VX_SRCS += $(VORTEX_RT_PATH)/intrinsics/vx_intrinsics.S
VX_SRCS += $(VORTEX_RT_PATH)/io/vx_io.S $(VORTEX_RT_PATH)/io/vx_io.c
VX_SRCS += $(VORTEX_RT_PATH)/fileio/fileio.S
VX_SRCS += $(VORTEX_RT_PATH)/tests/tests.c
VX_SRCS += $(VORTEX_RT_PATH)/vx_api/vx_api.c

VX_CFLAGS = -nostartfiles -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/startup/vx_link$(XLEN).ld

CXXFLAGS = -g -O0 -march=rv32im -mabi=ilp32
# program may not begin at main()
CXXFLAGS += -ffreestanding
# enable garbage collection of unused input sections
CXXFLAGS += -Wl,--gc-sections
# disable RTTI and exceptions
CXXFLAGS += -fno-rtti -fno-non-call-exceptions
CXXFLAGS += -I$(POCL_INC_PATH) -I.

VX_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
QEMU_LIBS = $(VORTEX_RT_PATH)/qemu/vx_api.c -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a

# None of these names is a file produced by the build.
.PHONY: all run qemu gdb-s gdb-c clean

all: $(PROJECT).dump $(PROJECT).hex

# The recipe compiles kernel.cl, so that is the file the archive depends on.
lib$(PROJECT).a: kernel.cl
	POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOLCHAIN_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o $@ $<

$(PROJECT).elf: main.cc lib$(PROJECT).a
	$(CXX) $(CXXFLAGS) $(VX_CFLAGS) $(VX_SRCS) main.cc $(VX_LIBS) -o $@

$(PROJECT).qemu: main.cc lib$(PROJECT).a
	$(CXX) $(CXXFLAGS) main.cc $(QEMU_LIBS) -o $@

$(PROJECT).hex: $(PROJECT).elf
	$(HEX) -O ihex $< $@

$(PROJECT).dump: $(PROJECT).elf
	$(DMP) -D $< > $@

# Runs the image on the simulator binary; stdout is captured in emulator.debug.
run: $(PROJECT).hex
	POCL_DEBUG=all $(VX_SIMX_PATH)/Vcache_simX -E $< -s -b 1> emulator.debug

qemu: $(PROJECT).qemu
	POCL_DEBUG=all $(RISCV_TOOLCHAIN_PATH)/bin/qemu-riscv32 -d in_asm -D debug.log $<

# Same as 'qemu' but with the gdb stub listening on port 1234.
gdb-s: $(PROJECT).qemu
	POCL_DEBUG=all $(RISCV_TOOLCHAIN_PATH)/bin/qemu-riscv32 -g 1234 -d in_asm -D debug.log $<

gdb-c: $(PROJECT).qemu
	$(GDB) $<

clean:
	rm -rf *.elf *.dump *.hex
|
||||
@@ -1,59 +1,126 @@
|
||||
# Aggregate driver for the OpenCL tests: every goal below is forwarded to the
# Makefile of each listed test directory, stopping at the first failure.

# Every test directory; used by 'all', 'clean' and 'clean-all'.
ALL_TESTS := vecadd sgemm psort saxpy sfilter nearn guassian dotproduct kmeans \
             spmv transpose cutcp vectorhypot stencil mri-q lbm oclprintf \
             blackscholes matmul

# vectorhypot and mri-q are built and cleaned but kept out of the
# run-simx / run-rtlsim / run-opae lists (their run lines were commented out).
RUN_TESTS := $(filter-out vectorhypot mri-q,$(ALL_TESTS))

# Subset exercised by run-vlsim.
VLSIM_TESTS := vecadd sgemm saxpy sfilter nearn guassian oclprintf psort

# These goals are commands, not files.
.PHONY: all run-simx run-rtlsim run-vlsim run-opae clean clean-all

# Each recipe expands to "$(MAKE) -C a <goal> && $(MAKE) -C b <goal> && ... && true".
# $(MAKE) is written literally in the recipe so make treats the line as a
# recursive invocation (jobserver and -n are propagated).
all:
	$(foreach t,$(ALL_TESTS),$(MAKE) -C $(t) &&) true

run-simx:
	$(foreach t,$(RUN_TESTS),$(MAKE) -C $(t) run-simx &&) true

run-rtlsim:
	$(foreach t,$(RUN_TESTS),$(MAKE) -C $(t) run-rtlsim &&) true

run-vlsim:
	$(foreach t,$(VLSIM_TESTS),$(MAKE) -C $(t) run-vlsim &&) true

run-opae:
	$(foreach t,$(RUN_TESTS),$(MAKE) -C $(t) run-opae &&) true

clean:
	$(foreach t,$(ALL_TESTS),$(MAKE) -C $(t) clean &&) true

clean-all:
	$(foreach t,$(ALL_TESTS),$(MAKE) -C $(t) clean-all &&) true
|
||||
|
||||
@@ -1,67 +0,0 @@
|
||||
# Legacy bare-metal build of the VectorHypot OpenCL test.
# The kernel is compiled ahead of time with poclcc into lib$(PROJECT).a and
# linked with main.cc and the Vortex runtime sources into one ELF image,
# from which the .hex and .dump files are derived.

PROJECT = VectorHypot
XLEN ?= 32

# Tool and library locations; each may be overridden by the caller.
RISCV_TOOLCHAIN_PATH ?= $(wildcard ../../../../riscv-gnu-toolchain/drops)
POCL_CC_PATH ?= $(wildcard ../../../../pocl/drops_riscv_cc)
POCL_INC_PATH ?= $(wildcard ../include)
POCL_LIB_PATH ?= $(wildcard ../lib)
VORTEX_RT_PATH ?= $(wildcard ../../../runtime)
VX_SIMX_PATH ?= $(wildcard ../../../simx/obj_dir)

# Cross toolchain
CC = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gcc
CXX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-g++
DMP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objdump
HEX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objcopy
GDB = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gdb

# Vortex runtime sources compiled into the image
VX_SRCS = $(VORTEX_RT_PATH)/newlib/newlib.c
VX_SRCS += $(VORTEX_RT_PATH)/startup/vx_start.S
VX_SRCS += $(VORTEX_RT_PATH)/intrinsics/vx_intrinsics.S
VX_SRCS += $(VORTEX_RT_PATH)/io/vx_io.S $(VORTEX_RT_PATH)/io/vx_io.c
VX_SRCS += $(VORTEX_RT_PATH)/fileio/fileio.S
VX_SRCS += $(VORTEX_RT_PATH)/tests/tests.c
VX_SRCS += $(VORTEX_RT_PATH)/vx_api/vx_api.c

VX_CFLAGS = -nostartfiles -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/startup/vx_link$(XLEN).ld

CXXFLAGS = -g -O0 -march=rv32im -mabi=ilp32
# program may not begin at main()
CXXFLAGS += -ffreestanding
# enable garbage collection of unused input sections
CXXFLAGS += -Wl,--gc-sections
# disable RTTI and exceptions
CXXFLAGS += -fno-rtti -fno-non-call-exceptions
CXXFLAGS += -I$(POCL_INC_PATH) -I.

VX_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
QEMU_LIBS = $(VORTEX_RT_PATH)/qemu/vx_api.c -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a

# None of these names is a file produced by the build.
.PHONY: all run qemu gdb-s gdb-c clean

all: $(PROJECT).dump $(PROJECT).hex

# The recipe compiles kernel.cl, so that is the file the archive depends on.
lib$(PROJECT).a: kernel.cl
	POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOLCHAIN_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o $@ $<

$(PROJECT).elf: main.cc lib$(PROJECT).a
	$(CXX) $(CXXFLAGS) $(VX_CFLAGS) $(VX_SRCS) main.cc $(VX_LIBS) -o $@

$(PROJECT).qemu: main.cc lib$(PROJECT).a
	$(CXX) $(CXXFLAGS) main.cc $(QEMU_LIBS) -o $@

$(PROJECT).hex: $(PROJECT).elf
	$(HEX) -O ihex $< $@

$(PROJECT).dump: $(PROJECT).elf
	$(DMP) -D $< > $@

# Runs the image on the simulator binary; stdout is captured in emulator.debug.
run: $(PROJECT).hex
	POCL_DEBUG=all $(VX_SIMX_PATH)/Vcache_simX -E $< -s -b 1> emulator.debug

qemu: $(PROJECT).qemu
	POCL_DEBUG=all $(RISCV_TOOLCHAIN_PATH)/bin/qemu-riscv32 -d in_asm -D debug.log $<

# Same as 'qemu' but with the gdb stub listening on port 1234.
gdb-s: $(PROJECT).qemu
	POCL_DEBUG=all $(RISCV_TOOLCHAIN_PATH)/bin/qemu-riscv32 -g 1234 -d in_asm -D debug.log $<

gdb-c: $(PROJECT).qemu
	$(GDB) $<

clean:
	rm -rf *.elf *.dump *.hex
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -1,67 +1,7 @@
|
||||
# Build of the bfs OpenCL test: a native host program linked against the POCL
# runtime and the Vortex driver stub, plus kernel.pocl compiled by poclcc.
# NOTE(review): this text comes from a merged diff view; the rules below and
# the trailing 'include ../common.mk' are both kept exactly as they appeared.

XLEN ?= 32

LLVM_PREFIX ?= /opt/llvm-riscv
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
SYSROOT ?= $(RISCV_TOOLCHAIN_PATH)/riscv32-unknown-elf
POCL_CC_PATH ?= /opt/pocl/compiler
POCL_RT_PATH ?= /opt/pocl/runtime

VORTEX_DRV_PATH ?= $(realpath ../../../driver)
VORTEX_RT_PATH ?= $(realpath ../../../runtime)

# Kernel-side flags. The double quotes are part of each value so that the
# shell hands every flag group to poclcc as a single argument.
K_LLCFLAGS += "-O3 -march=riscv32 -target-abi=ilp32f -mcpu=generic-rv32 -mattr=+m,+f -mattr=+vortex -float-abi=hard -code-model=small"
K_CFLAGS += "-v -O3 -Wstack-usage=1024 --sysroot=$(SYSROOT) --gcc-toolchain=$(RISCV_TOOLCHAIN_PATH) -march=rv32imf -mabi=ilp32f -Xclang -target-feature -Xclang +vortex -I$(VORTEX_RT_PATH)/include -fno-rtti -fno-exceptions -ffreestanding -nostartfiles -fdata-sections -ffunction-sections"
K_LDFLAGS += "-Wl,-Bstatic,-T$(VORTEX_RT_PATH)/linker/vx_link$(XLEN).ld -Wl,--gc-sections $(VORTEX_RT_PATH)/libvortexrt.a -lm"

# Host-side flags
CXXFLAGS += -std=c++11 -Wall -Wextra -pedantic -Wfatal-errors

CXXFLAGS += -I$(POCL_RT_PATH)/include

LDFLAGS += -L$(POCL_RT_PATH)/lib -L$(VORTEX_DRV_PATH)/stub -lOpenCL -lvortex

# Debugging
ifdef DEBUG
CXXFLAGS += -g -O0
else
CXXFLAGS += -O2 -DNDEBUG
endif

PROJECT = bfs

SRCS = main.cc

OPTS ?=

# These goals are commands, not files.
.PHONY: all run-fpga run-asesim run-vlsim run-simx run-rtlsim clean clean-all

all: $(PROJECT) kernel.pocl

kernel.pocl: kernel.cl
	LLVM_PREFIX=$(LLVM_PREFIX) POCL_DEBUG=all LD_LIBRARY_PATH=$(LLVM_PREFIX)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -LLCFLAGS $(K_LLCFLAGS) -CFLAGS $(K_CFLAGS) -LDFLAGS $(K_LDFLAGS) -o $@ $<

$(PROJECT): $(SRCS)
	$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@

# The run-* goals differ only in the driver directory placed on LD_LIBRARY_PATH.
run-fpga: $(PROJECT) kernel.pocl
	LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/fpga:$(LD_LIBRARY_PATH) ./$(PROJECT)

run-asesim: $(PROJECT) kernel.pocl
	LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/asesim:$(LD_LIBRARY_PATH) ./$(PROJECT)

run-vlsim: $(PROJECT) kernel.pocl
	LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/vlsim:$(LD_LIBRARY_PATH) ./$(PROJECT)

run-simx: $(PROJECT) kernel.pocl
	LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT)

run-rtlsim: $(PROJECT) kernel.pocl
	LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT)

.depend: $(SRCS)
	$(CXX) $(CXXFLAGS) -MM $^ > $@

clean:
	rm -rf $(PROJECT) *.o .depend

clean-all: clean
	rm -rf *.pocl *.dump

# Skip the dependency file when only cleaning goals were requested; otherwise
# make would first generate .depend just so that the clean recipe can delete it.
# An empty goal list (the default goal) still includes it.
ifneq ($(if $(MAKECMDGOALS),$(filter-out clean clean-all,$(MAKECMDGOALS)),all),)
-include .depend
endif

include ../common.mk
|
||||
|
||||
@@ -285,7 +285,7 @@ int main(int argc, char *argv[]) {
|
||||
free(h_graph_visited);
|
||||
|
||||
} catch (std::string msg) {
|
||||
std::cout << "--cambine: exception in main ->" << msg << std::endl;
|
||||
printf("--cambine: exception in main ->%s\n", msg);
|
||||
// release host memory
|
||||
free(h_graph_nodes);
|
||||
free(h_graph_mask);
|
||||
|
||||
9
tests/opencl/blackscholes/Makefile
Normal file
9
tests/opencl/blackscholes/Makefile
Normal file
@@ -0,0 +1,9 @@
|
||||
# blackscholes OpenCL test: only the per-test settings live here, the rules
# come from the shared ../common.mk included at the bottom.

PROJECT = blackscholes

# Host sources, one per line.
SRCS = main.cpp \
       oclUtils.cpp \
       shrUtils.cpp \
       cmd_arg_reader.cpp \
       oclBlackScholes_launcher.cpp \
       oclBlackScholes_gold.cpp

# Local headers are included relative to this directory.
CXXFLAGS += -I.

# Left empty unless the caller supplies a value.
# NOTE(review): presumably consumed by common.mk - confirm there.
OPTS ?=

include ../common.mk
|
||||
152
tests/opencl/blackscholes/cmd_arg_reader.cpp
Normal file
152
tests/opencl/blackscholes/cmd_arg_reader.cpp
Normal file
@@ -0,0 +1,152 @@
|
||||
/*
|
||||
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||
* with this source code for terms and conditions that govern your use of
|
||||
* this software. Any use, reproduction, disclosure, or distribution of
|
||||
* this software and related documentation outside the terms of the EULA
|
||||
* is strictly prohibited.
|
||||
*
|
||||
*/
|
||||
|
||||
/* CUda UTility Library */
|
||||
|
||||
// includes, file
|
||||
#include "cmd_arg_reader.h"
|
||||
|
||||
// includes, system
|
||||
#include <vector>
|
||||
|
||||
// internal unnamed namespace
|
||||
|
||||
namespace
|
||||
{
|
||||
// types, internal (class, enum, struct, union, typedef)
|
||||
|
||||
// variables, internal
|
||||
|
||||
} // namespace {
|
||||
|
||||
// variables, exported
|
||||
|
||||
/*static*/ CmdArgReader* CmdArgReader::self;
|
||||
/*static*/ char** CmdArgReader::rargv;
|
||||
/*static*/ int CmdArgReader::rargc;
|
||||
|
||||
// functions, exported
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Public construction interface
|
||||
//! @return a handle to the class instance
|
||||
//! @param argc number of command line arguments (as given to main())
|
||||
//! @param argv command line argument string (as given to main())
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
/*static*/ void
|
||||
CmdArgReader::init( const int argc, const char** argv)
|
||||
{
|
||||
if ( NULL != self)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
// command line arguments
|
||||
if (( 0 == argc) || ( 0 == argv))
|
||||
{
|
||||
LOGIC_EXCEPTION( "No command line arguments given.");
|
||||
}
|
||||
|
||||
self = new CmdArgReader();
|
||||
|
||||
self->createArgsMaps( argc, argv);
|
||||
|
||||
rargc = argc;
|
||||
rargv = const_cast<char**>( argv);
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Constructor, default
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
CmdArgReader::CmdArgReader()
    : args()
    , unprocessed()
    , iter()
    , iter_unprocessed()
{
    // Nothing else to set up: both maps start empty and are filled later
    // by createArgsMaps().
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Destructor
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
CmdArgReader::~CmdArgReader()
{
    // Every entry of 'args' owns a heap object that is stored as void* next
    // to the type_info it was created with, so each one has to be cast back
    // to its real type before it can be deleted.
    //
    // All entries are visited. The previous version left the loop with
    // 'break' after the first deletion and leaked every remaining value.
    for( ArgsMapIter entry = args.begin(); entry != args.end(); ++entry)
    {
        const std::type_info& stored = *(entry->second.first);
        void* const value = entry->second.second;

        if( stored == typeid( int))
        {
            delete static_cast<int*>( value);
        }
        else if( stored == typeid( bool))
        {
            delete static_cast<bool*>( value);
        }
        else if( stored == typeid( float))
        {
            // NOTE(review): float/double conversions exist in the header, so
            // such values can presumably end up in the map - confirm in getArgHelper.
            delete static_cast<float*>( value);
        }
        else if( stored == typeid( double))
        {
            delete static_cast<double*>( value);
        }
        else if( stored == typeid( std::string))
        {
            delete static_cast<std::string*>( value);
        }
        else if( stored == typeid( std::vector< std::string>) )
        {
            delete static_cast< std::vector< std::string>* >( value);
        }
        else if( stored == typeid( std::vector<int>) )
        {
            delete static_cast< std::vector<int>* >( value);
        }
        // Any other type cannot be deleted safely through void* and is left alone.
    }
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Read args as token value pair into map for better processing (Even the
|
||||
//! values remain strings until the parameter values is requested by the
|
||||
//! program.)
|
||||
//! @param argc the argument count (as given to 'main')
|
||||
//! @param argv the char* array containing the command line arguments
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
void
|
||||
CmdArgReader::createArgsMaps( const int argc, const char** argv) {
|
||||
|
||||
std::string token;
|
||||
std::string val_str;
|
||||
|
||||
std::map< std::string, std::string> args;
|
||||
|
||||
std::string::size_type pos;
|
||||
std::string arg;
|
||||
for( int i=1; i<argc; ++i)
|
||||
{
|
||||
arg = argv[i];
|
||||
|
||||
// check if valid command line argument: all arguments begin with - or --
|
||||
if (arg[0] != '-')
|
||||
{
|
||||
RUNTIME_EXCEPTION("Invalid command line argument.");
|
||||
}
|
||||
|
||||
int numDashes = (arg[1] == '-' ? 2 : 1);
|
||||
|
||||
// check if only flag or if a value is given
|
||||
if ( (pos = arg.find( '=')) == std::string::npos)
|
||||
{
|
||||
unprocessed[ std::string( arg, numDashes, arg.length()-numDashes)] = "FLAG";
|
||||
}
|
||||
else
|
||||
{
|
||||
unprocessed[ std::string( arg, numDashes, pos-numDashes)] =
|
||||
std::string( arg, pos+1, arg.length()-1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
488
tests/opencl/blackscholes/cmd_arg_reader.h
Normal file
488
tests/opencl/blackscholes/cmd_arg_reader.h
Normal file
@@ -0,0 +1,488 @@
|
||||
/*
|
||||
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||
* with this source code for terms and conditions that govern your use of
|
||||
* this software. Any use, reproduction, disclosure, or distribution of
|
||||
* this software and related documentation outside the terms of the EULA
|
||||
* is strictly prohibited.
|
||||
*
|
||||
*/
|
||||
|
||||
/* CUda UTility Library */
|
||||
|
||||
#ifndef _CMDARGREADER_H_
|
||||
#define _CMDARGREADER_H_
|
||||
|
||||
// includes, system
|
||||
#include <map>
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include <algorithm>
|
||||
#include <typeinfo>
|
||||
|
||||
// includes, project
|
||||
#include "exception.h"
|
||||
|
||||
//! Preprocessed command line arguments
|
||||
//! @note Lazy evaluation: The arguments are converted from strings to
|
||||
//! the correct data type upon request. Converted values are stored
|
||||
//! in an additional map so that no additional conversion is
|
||||
//! necessary. Arrays of command line arguments are stored in
|
||||
//! std::vectors
|
||||
//! @note Usage:
|
||||
//! const std::string* file =
|
||||
//! CmdArgReader::getArg< std::string>( "model")
|
||||
//! const std::vector< std::string>* files =
|
||||
//! CmdArgReader::getArg< std::vector< std::string> >( "model")
|
||||
//! @note All command line arguments begin with '--' followed by the token;
|
||||
//! token and value are separated by '='; example --samples=50
|
||||
//! @note Arrays have the form --model=[one.obj,two.obj,three.obj]
|
||||
//! (without whitespaces)
|
||||
|
||||
//! Command line argument parser
|
||||
class CmdArgReader
{
    // The test fixture may look at the non-public parts.
    template<class> friend class TestCmdArgReader;

public:

    // ---- types ----------------------------------------------------------

    //! A converted argument: the type it was converted to plus the value.
    //! The type_info allows checking that a repeated request for the same
    //! token asks for the type of the first conversion.
    typedef std::pair< const std::type_info*, void*>  ValType;

    //! map of already converted values, and its iterators
    typedef std::map< std::string, ValType >          ArgsMap;
    typedef ArgsMap::iterator                         ArgsMapIter;
    typedef ArgsMap::const_iterator                   ConstArgsMapIter;

    //! map of unprocessed (i.e. unconverted) token-value pairs, and its iterator
    typedef std::map< std::string, std::string>       UnpMap;
    typedef UnpMap::iterator                          UnpMapIter;

    // ---- static interface -------------------------------------------------

    //! Public construction interface
    //! @param argc  number of command line arguments (as given to main())
    //! @param argv  command line argument strings (as given to main())
    static void init( const int argc, const char** argv);

    //! Get the value of the command line argument with the given name
    //! @return A const handle to the requested argument; NULL if the
    //!         argument does not exist or is not of type T
    //! @param name  the name of the requested argument
    //! @note T  the type of the argument requested
    template<class T>
    static inline const T* getArg( const std::string& name);

    //! Check if a command line argument with the given name exists
    //! @return true if an argument with name \a name exists, otherwise false
    //! @param name  name of the command line argument in question
    static inline bool existArg( const std::string& name);

    //! Get the original / raw argc program argument
    static inline int& getRArgc();

    //! Get the original / raw argv program argument
    static inline char**& getRArgv();

    //! Destructor
    ~CmdArgReader();

protected:

    //! handle to the only instance of this class
    static CmdArgReader* self;

    //! Constructor, default
    CmdArgReader();

private:

    // ---- helpers ----------------------------------------------------------

    //! Member-level worker behind getArg()
    //! @return A const handle to the requested argument; a NULL pointer if
    //!         the argument does not exist or is not of type T
    //! @param name  the name of the requested argument
    //! @note T  the type of the argument requested
    template<class T>
    inline const T* getArgHelper( const std::string& name);

    //! Member-level worker behind existArg()
    //! @return true if an argument of name \a name exists, otherwise false
    //! @param name  the name of the requested argument
    inline bool existArgHelper( const std::string& name) const;

    //! Read the arguments as token-value pairs into a map; the values stay
    //! strings until the program requests them.
    //! @param argc  the argument count (as given to 'main')
    //! @param argv  the char* array containing the command line arguments
    void createArgsMaps( const int argc, const char** argv);

    //! Convert the string form of a value to the requested data type
    //! @return true if the conversion succeeded, otherwise false
    //! @param element  the value as string
    //! @param val      the value as type T
    template<class T>
    static inline bool convertToT( const std::string& element, T& val);

    // ---- not copyable -----------------------------------------------------

    //! Constructor, copy (not implemented)
    CmdArgReader( const CmdArgReader&);

    //! Assignment operator (not implemented)
    CmdArgReader& operator=( const CmdArgReader&);

    // ---- data -------------------------------------------------------------

#ifdef _WIN32
#  pragma warning( disable: 4251)
#endif

    //! original value of argc
    static int rargc;

    //! command line arguments in raw format
    static char** rargv;

    //! already converted token-value pairs
    ArgsMap args;

    //! unprocessed / unconverted token-value pairs
    UnpMap unprocessed;

    //! iterator into 'args', kept as a member to avoid frequent reallocation
    ArgsMapIter iter;

    //! iterator into 'unprocessed', kept as a member to avoid frequent reallocation
    UnpMapIter iter_unprocessed;

#ifdef _WIN32
#  pragma warning( default: 4251)
#endif
};
|
||||
|
||||
// variables, exported (extern)
|
||||
|
||||
// functions, inlined (inline)
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Conversion function for command line argument arrays
|
||||
//! @note This function is used each type for which no template specialization
|
||||
//! exist (which will cause errors if the type does not fulfill the std::vector
|
||||
//! interface).
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
template<class T>
|
||||
/*static*/ inline bool
|
||||
CmdArgReader::convertToT( const std::string& element, T& val)
|
||||
{
|
||||
// preallocate storage
|
||||
val.resize( std::count( element.begin(), element.end(), ',') + 1);
|
||||
|
||||
unsigned int i = 0;
|
||||
std::string::size_type pos_start = 1; // leave array prefix '['
|
||||
std::string::size_type pos_end = 0;
|
||||
|
||||
// do for all elements of the comma seperated list
|
||||
while( std::string::npos != ( pos_end = element.find(',', pos_end+1)) )
|
||||
{
|
||||
// convert each element by the appropriate function
|
||||
if ( ! convertToT< typename T::value_type >(
|
||||
std::string( element, pos_start, pos_end - pos_start), val[i]))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
pos_start = pos_end + 1;
|
||||
++i;
|
||||
}
|
||||
|
||||
std::string tmp1( element, pos_start, element.length() - pos_start - 1);
|
||||
|
||||
// process last element (leave array postfix ']')
|
||||
if ( ! convertToT< typename T::value_type >( std::string( element,
|
||||
pos_start,
|
||||
element.length() - pos_start - 1),
|
||||
val[i]))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
// possible to process all elements?
|
||||
return true;
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Conversion function for command line arguments of type int
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
template<>
|
||||
inline bool
|
||||
CmdArgReader::convertToT<int>( const std::string& element, int& val)
|
||||
{
|
||||
std::istringstream ios( element);
|
||||
ios >> val;
|
||||
|
||||
bool ret_val = false;
|
||||
if ( ios.eof())
|
||||
{
|
||||
ret_val = true;
|
||||
}
|
||||
|
||||
return ret_val;
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Conversion function for command line arguments of type float
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
template<>
|
||||
inline bool
|
||||
CmdArgReader::convertToT<float>( const std::string& element, float& val)
|
||||
{
|
||||
std::istringstream ios( element);
|
||||
ios >> val;
|
||||
|
||||
bool ret_val = false;
|
||||
if ( ios.eof())
|
||||
{
|
||||
ret_val = true;
|
||||
}
|
||||
|
||||
return ret_val;
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Conversion function for command line arguments of type double
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
template<>
|
||||
inline bool
|
||||
CmdArgReader::convertToT<double>( const std::string& element, double& val)
|
||||
{
|
||||
std::istringstream ios( element);
|
||||
ios >> val;
|
||||
|
||||
bool ret_val = false;
|
||||
if ( ios.eof())
|
||||
{
|
||||
ret_val = true;
|
||||
}
|
||||
|
||||
return ret_val;
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Conversion function for command line arguments of type string
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
template<>
|
||||
inline bool
|
||||
CmdArgReader::convertToT<std::string>( const std::string& element,
|
||||
std::string& val)
|
||||
{
|
||||
val = element;
|
||||
return true;
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Conversion function for command line arguments of type bool
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
template<>
|
||||
inline bool
|
||||
CmdArgReader::convertToT<bool>( const std::string& element, bool& val)
|
||||
{
|
||||
// check if value is given as string-type { true | false }
|
||||
if ( "true" == element)
|
||||
{
|
||||
val = true;
|
||||
return true;
|
||||
}
|
||||
else if ( "false" == element)
|
||||
{
|
||||
val = false;
|
||||
return true;
|
||||
}
|
||||
// check if argument is given as integer { 0 | 1 }
|
||||
else
|
||||
{
|
||||
int tmp;
|
||||
if ( convertToT<int>( element, tmp))
|
||||
{
|
||||
if ( 1 == tmp)
|
||||
{
|
||||
val = true;
|
||||
return true;
|
||||
}
|
||||
else if ( 0 == tmp)
|
||||
{
|
||||
val = false;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Get the value of the command line argument with given name
|
||||
//! @return A const handle to the requested argument. If the argument does
|
||||
//! not exist or if it is not from type T NULL is returned
|
||||
//! @param T the type of the argument requested
|
||||
//! @param name the name of the requested argument
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
template<class T>
|
||||
/*static*/ const T*
|
||||
CmdArgReader::getArg( const std::string& name)
|
||||
{
|
||||
if( ! self)
|
||||
{
|
||||
RUNTIME_EXCEPTION("CmdArgReader::getArg(): CmdArgReader not initialized.");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
return self->getArgHelper<T>( name);
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Check if a command line argument with the given name exists
|
||||
//! @return true if a command line argument with name \a name exists,
|
||||
//! otherwise false
|
||||
//! @param name name of the command line argument in question
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
/*static*/ inline bool
|
||||
CmdArgReader::existArg( const std::string& name)
|
||||
{
|
||||
if( ! self)
|
||||
{
|
||||
RUNTIME_EXCEPTION("CmdArgReader::getArg(): CmdArgReader not initialized.");
|
||||
return false;
|
||||
}
|
||||
|
||||
return self->existArgHelper( name);
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! @brief Get the value of the command line argument with given name
|
||||
//! @return A const handle to the requested argument. If the argument does
|
||||
//! not exist or if it is not from type T NULL is returned
|
||||
//! @param T the type of the argument requested
|
||||
//! @param name the name of the requested argument
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
template<class T>
|
||||
const T*
|
||||
CmdArgReader::getArgHelper( const std::string& name)
|
||||
{
|
||||
// check if argument already processed and stored in correct type
|
||||
if ( args.end() != (iter = args.find( name)))
|
||||
{
|
||||
if ( (*(iter->second.first)) == typeid( T) )
|
||||
{
|
||||
return (T*) iter->second.second;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
T* tmp = new T;
|
||||
|
||||
// check the array with unprocessed values
|
||||
if ( unprocessed.end() != (iter_unprocessed = unprocessed.find( name)))
|
||||
{
|
||||
// try to "cast" the string to the type requested
|
||||
if ( convertToT< T >( iter_unprocessed->second, *tmp))
|
||||
{
|
||||
// add the token element pair to map of already converted values
|
||||
args[name] = std::make_pair( &(typeid( T)), (void*) tmp);
|
||||
|
||||
return tmp;
|
||||
}
|
||||
}
|
||||
|
||||
// not used while not inserted into the map -> cleanup
|
||||
delete tmp;
|
||||
}
|
||||
|
||||
// failed, argument not available
|
||||
return NULL;
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Check if a command line argument with name \a name exists
|
||||
//! @return true if a command line argument of name \a name exists,
|
||||
//! otherwise false
|
||||
//! @param name the name of the requested argument
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
inline bool
|
||||
CmdArgReader::existArgHelper( const std::string& name) const
|
||||
{
|
||||
bool ret_val = false;
|
||||
|
||||
// check if argument already processed and stored in correct type
|
||||
if( args.end() != args.find( name))
|
||||
{
|
||||
ret_val = true;
|
||||
}
|
||||
else
|
||||
{
|
||||
|
||||
// check the array with unprocessed values
|
||||
if ( unprocessed.end() != unprocessed.find( name))
|
||||
{
|
||||
ret_val = true;
|
||||
}
|
||||
}
|
||||
|
||||
return ret_val;
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Get the original / raw argc program argument
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
/*static*/ inline int&
|
||||
CmdArgReader::getRArgc()
|
||||
{
|
||||
if( ! self)
|
||||
{
|
||||
RUNTIME_EXCEPTION("CmdArgReader::getRArgc(): CmdArgReader not initialized.");
|
||||
}
|
||||
|
||||
return rargc;
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Get the original / raw argv program argument
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
/*static*/ inline char**&
|
||||
CmdArgReader::getRArgv()
|
||||
{
|
||||
if( ! self)
|
||||
{
|
||||
RUNTIME_EXCEPTION("CmdArgReader::getRArgc(): CmdArgReader not initialized.");
|
||||
}
|
||||
|
||||
return rargv;
|
||||
}
|
||||
|
||||
// functions, exported (extern)
|
||||
|
||||
#endif // #ifndef _CMDARGREADER_H_
|
||||
151
tests/opencl/blackscholes/exception.h
Normal file
151
tests/opencl/blackscholes/exception.h
Normal file
@@ -0,0 +1,151 @@
|
||||
/*
|
||||
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||
* with this source code for terms and conditions that govern your use of
|
||||
* this software. Any use, reproduction, disclosure, or distribution of
|
||||
* this software and related documentation outside the terms of the EULA
|
||||
* is strictly prohibited.
|
||||
*
|
||||
*/
|
||||
|
||||
/* CUda UTility Library */
|
||||
#ifndef _EXCEPTION_H_
|
||||
#define _EXCEPTION_H_
|
||||
|
||||
// includes, system
|
||||
#include <exception>
|
||||
#include <stdexcept>
|
||||
#include <iostream>
|
||||
#include <stdlib.h>
|
||||
|
||||
//! Exception wrapper.
//! Objects cannot be created directly (all constructors are private); the
//! static throw_it() functions are the only way to raise one.
//! @param Std_Exception  Exception out of namespace std for easy typing.
template<class Std_Exception>
class Exception : public Std_Exception
{
public:

    //! @brief Static construction interface
    //! @return Never returns, always throws an Exception<Std_Exception>
    //! @param file  file in which the Exception occurs
    //! @param line  line in which the Exception occurs
    //! @param detailed  details on the code fragment causing the Exception
    static void throw_it( const char* file,
                          const int line,
                          const char* detailed = "-" );

    //! @brief Static construction interface, std::string flavour
    //! @return Never returns, always throws an Exception<Std_Exception>
    //! @param file  file in which the Exception occurs
    //! @param line  line in which the Exception occurs
    //! @param detailed  details on the code fragment causing the Exception
    static void throw_it( const char* file,
                          const int line,
                          const std::string& detailed);

    //! Destructor
    virtual ~Exception() throw();

private:

    //! Constructor, default (private)
    Exception();

    //! Constructor, standard (private)
    //! @param str  string returned by what()
    Exception( const std::string& str);

};
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
//! Exception handler function for arbitrary exceptions:
//! prints what() of the exception to stderr and terminates the program with
//! EXIT_FAILURE. This function does not return.
//! @param ex  exception to handle, has to provide what()
////////////////////////////////////////////////////////////////////////////////
template<class Exception_Typ>
inline void
handleException( const Exception_Typ& ex)
{
    // report the reason ...
    std::cerr << ex.what();
    std::cerr << std::endl;

    // ... and give up
    exit( EXIT_FAILURE);
}
|
||||
|
||||
//! Convenience macros
|
||||
|
||||
//! Exception caused by dynamic program behavior, e.g. file does not exist
|
||||
#define RUNTIME_EXCEPTION( msg) \
|
||||
Exception<std::runtime_error>::throw_it( __FILE__, __LINE__, msg)
|
||||
|
||||
//! Logic exception in program, e.g. an assert failed
|
||||
#define LOGIC_EXCEPTION( msg) \
|
||||
Exception<std::logic_error>::throw_it( __FILE__, __LINE__, msg)
|
||||
|
||||
//! Out of range exception
|
||||
#define RANGE_EXCEPTION( msg) \
|
||||
Exception<std::range_error>::throw_it( __FILE__, __LINE__, msg)
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Implementation
|
||||
|
||||
// includes, system
|
||||
#include <sstream>
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Static construction interface.
|
||||
//! @param Exception causing code fragment (file and line) and detailed infos.
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
/*static*/ template<class Std_Exception>
|
||||
void
|
||||
Exception<Std_Exception>::
|
||||
throw_it( const char* file, const int line, const char* detailed)
|
||||
{
|
||||
std::stringstream s;
|
||||
|
||||
// Quiet heavy-weight but exceptions are not for
|
||||
// performance / release versions
|
||||
s << "Exception in file '" << file << "' in line " << line << "\n"
|
||||
<< "Detailed description: " << detailed << "\n";
|
||||
|
||||
throw Exception( s.str());
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Static construction interface.
|
||||
//! @param Exception causing code fragment (file and line) and detailed infos.
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
/*static*/ template<class Std_Exception>
|
||||
void
|
||||
Exception<Std_Exception>::
|
||||
throw_it( const char* file, const int line, const std::string& msg)
|
||||
{
|
||||
throw_it( file, line, msg.c_str());
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Constructor, default (private).
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
template<class Std_Exception>
|
||||
Exception<Std_Exception>::Exception() :
|
||||
Exception("Unknown Exception.\n")
|
||||
{ }
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Constructor, standard (private).
|
||||
//! String returned by what().
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
template<class Std_Exception>
|
||||
Exception<Std_Exception>::Exception( const std::string& s) :
|
||||
Std_Exception( s)
|
||||
{ }
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Destructor
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
template<class Std_Exception>
|
||||
Exception<Std_Exception>::~Exception() throw() { }
|
||||
|
||||
// functions, exported
|
||||
|
||||
#endif // #ifndef _EXCEPTION_H_
|
||||
|
||||
@@ -61,7 +61,7 @@ int main(int argc, char **argv)
|
||||
*h_X,
|
||||
*h_T;
|
||||
|
||||
const unsigned int optionCount = 4000000;
|
||||
const unsigned int optionCount = 64;
|
||||
const float R = 0.02f;
|
||||
const float V = 0.30f;
|
||||
|
||||
@@ -69,7 +69,7 @@ int main(int argc, char **argv)
|
||||
|
||||
// Get the NVIDIA platform
|
||||
ciErrNum = oclGetPlatformID(&cpPlatform);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
|
||||
oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
|
||||
shrLog("clGetPlatformID...\n");
|
||||
|
||||
//Get all the devices
|
||||
@@ -78,10 +78,10 @@ int main(int argc, char **argv)
|
||||
cl_uint uiNumComputeUnits; // Number of compute units (SM's on NV GPU)
|
||||
shrLog("Get the Device info and select Device...\n");
|
||||
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, 0, NULL, &uiNumDevices);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
|
||||
oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
|
||||
cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) );
|
||||
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, uiNumDevices, cdDevices, NULL);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
|
||||
oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
|
||||
|
||||
// Get command line device options and config accordingly
|
||||
shrLog(" # of Devices Available = %u\n", uiNumDevices);
|
||||
@@ -92,7 +92,7 @@ int main(int argc, char **argv)
|
||||
shrLog(" Using Device %u: ", uiTargetDevice);
|
||||
oclPrintDevName(LOGBOTH, cdDevices[uiTargetDevice]);
|
||||
ciErrNum = clGetDeviceInfo(cdDevices[uiTargetDevice], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(uiNumComputeUnits), &uiNumComputeUnits, NULL);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
|
||||
oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
|
||||
shrLog("\n # of Compute Units = %u\n", uiNumComputeUnits);
|
||||
|
||||
// set logfile name and start logs
|
||||
@@ -120,31 +120,31 @@ int main(int argc, char **argv)
|
||||
shrLog("Initializing OpenCL...\n");
|
||||
// Get the NVIDIA platform
|
||||
ciErrNum = oclGetPlatformID(&cpPlatform);
|
||||
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||
oclCheckError(ciErrNum, CL_SUCCESS);
|
||||
|
||||
// Get a GPU device
|
||||
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, 1, &cdDevices[uiTargetDevice], NULL);
|
||||
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||
oclCheckError(ciErrNum, CL_SUCCESS);
|
||||
|
||||
// Create the context
|
||||
cxGPUContext = clCreateContext(0, 1, &cdDevices[uiTargetDevice], NULL, NULL, &ciErrNum);
|
||||
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||
oclCheckError(ciErrNum, CL_SUCCESS);
|
||||
|
||||
//Create a command-queue
|
||||
cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevices[uiTargetDevice], CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
|
||||
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||
oclCheckError(ciErrNum, CL_SUCCESS);
|
||||
|
||||
shrLog("Creating OpenCL memory objects...\n");
|
||||
d_Call = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, optionCount * sizeof(float), NULL, &ciErrNum);
|
||||
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||
oclCheckError(ciErrNum, CL_SUCCESS);
|
||||
d_Put = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, optionCount * sizeof(float), NULL, &ciErrNum);
|
||||
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||
oclCheckError(ciErrNum, CL_SUCCESS);
|
||||
d_S = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, optionCount * sizeof(float), h_S, &ciErrNum);
|
||||
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||
oclCheckError(ciErrNum, CL_SUCCESS);
|
||||
d_X = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, optionCount * sizeof(float), h_X, &ciErrNum);
|
||||
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||
oclCheckError(ciErrNum, CL_SUCCESS);
|
||||
d_T = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, optionCount * sizeof(float), h_T, &ciErrNum);
|
||||
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||
oclCheckError(ciErrNum, CL_SUCCESS);
|
||||
|
||||
shrLog("Starting up BlackScholes...\n");
|
||||
initBlackScholes(cxGPUContext, cqCommandQueue, (const char **)argv);
|
||||
@@ -204,9 +204,9 @@ int main(int argc, char **argv)
|
||||
|
||||
shrLog("\nReading back OpenCL BlackScholes results...\n");
|
||||
ciErrNum = clEnqueueReadBuffer(cqCommandQueue, d_Call, CL_TRUE, 0, optionCount * sizeof(float), h_CallGPU, 0, NULL, NULL);
|
||||
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||
oclCheckError(ciErrNum, CL_SUCCESS);
|
||||
ciErrNum = clEnqueueReadBuffer(cqCommandQueue, d_Put, CL_TRUE, 0, optionCount * sizeof(float), h_PutGPU, 0, NULL, NULL);
|
||||
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||
oclCheckError(ciErrNum, CL_SUCCESS);
|
||||
|
||||
shrLog("Comparing against Host/C++ computation...\n");
|
||||
BlackScholesCPU(h_CallCPU, h_PutCPU, h_S, h_X, h_T, R, V, optionCount);
|
||||
@@ -232,7 +232,7 @@ int main(int argc, char **argv)
|
||||
ciErrNum |= clReleaseMemObject(d_Call);
|
||||
ciErrNum |= clReleaseCommandQueue(cqCommandQueue);
|
||||
ciErrNum |= clReleaseContext(cxGPUContext);
|
||||
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||
oclCheckError(ciErrNum, CL_SUCCESS);
|
||||
|
||||
free(h_T);
|
||||
free(h_X);
|
||||
@@ -9,8 +9,6 @@
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
|
||||
#include <oclUtils.h>
|
||||
#include "oclBlackScholes_common.h"
|
||||
|
||||
@@ -18,19 +16,47 @@ static cl_program cpBlackScholes; //OpenCL program
|
||||
static cl_kernel ckBlackScholes; //OpenCL kernel
|
||||
static cl_command_queue cqDefaultCommandQueue;
|
||||
|
||||
static int read_kernel_file(const char* filename, uint8_t** data, size_t* size) {
|
||||
if (NULL == filename || NULL == data || 0 == size)
|
||||
return CL_INVALID_VALUE;
|
||||
|
||||
FILE* fp = fopen(filename, "r");
|
||||
if (NULL == fp) {
|
||||
fprintf(stderr, "Failed to load kernel.");
|
||||
return CL_INVALID_VALUE;
|
||||
}
|
||||
fseek(fp , 0 , SEEK_END);
|
||||
long fsize = ftell(fp);
|
||||
rewind(fp);
|
||||
|
||||
*data = (uint8_t*)malloc(fsize);
|
||||
*size = fread(*data, 1, fsize, fp);
|
||||
|
||||
fclose(fp);
|
||||
|
||||
return CL_SUCCESS;
|
||||
}
|
||||
|
||||
extern "C" void initBlackScholes(cl_context cxGPUContext, cl_command_queue cqParamCommandQueue, const char **argv){
|
||||
cl_int ciErrNum;
|
||||
size_t kernelLength;
|
||||
|
||||
shrLog("...loading BlackScholes.cl\n");
|
||||
/*shrLog("...loading BlackScholes.cl\n");
|
||||
char *cPathAndName = shrFindFilePath("BlackScholes.cl", argv[0]);
|
||||
shrCheckError(cPathAndName != NULL, shrTRUE);
|
||||
char *cBlackScholes = oclLoadProgSource(cPathAndName, "// My comment\n", &kernelLength);
|
||||
shrCheckError(cBlackScholes != NULL, shrTRUE);
|
||||
shrCheckError(cBlackScholes != NULL, shrTRUE);*/
|
||||
|
||||
shrLog("...creating BlackScholes program\n");
|
||||
//cpBlackScholes = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cBlackScholes, &kernelLength, &ciErrNum);
|
||||
cpBlackScholes = clCreateProgramWithBuiltInKernels(context, 1, &device_id, "BlackScholes", NULL);
|
||||
//cpBlackScholes = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cBlackScholes, &kernelLength, &ciErrNum);
|
||||
uint8_t *kernel_bin = NULL;
|
||||
size_t kernel_size;
|
||||
cl_int binary_status = 0;
|
||||
ciErrNum = read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size);
|
||||
shrCheckError(ciErrNum, CL_SUCCESS);
|
||||
cl_device_id device_id = oclGetFirstDev(cxGPUContext);
|
||||
cpBlackScholes = clCreateProgramWithBinary(
|
||||
cxGPUContext, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &ciErrNum);
|
||||
shrCheckError(ciErrNum, CL_SUCCESS);
|
||||
|
||||
shrLog("...building BlackScholes program\n");
|
||||
@@ -66,7 +92,7 @@ extern "C" void initBlackScholes(cl_context cxGPUContext, cl_command_queue cqPar
|
||||
shrLog("*** Exiting ***\n");
|
||||
free(logTxt);
|
||||
free(cdDevices);
|
||||
exit(666);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
//Save ptx code to separate file
|
||||
@@ -77,8 +103,8 @@ extern "C" void initBlackScholes(cl_context cxGPUContext, cl_command_queue cqPar
|
||||
shrCheckError(ciErrNum, CL_SUCCESS);
|
||||
|
||||
cqDefaultCommandQueue = cqParamCommandQueue;
|
||||
free(cBlackScholes);
|
||||
free(cPathAndName);
|
||||
//free(cBlackScholes);
|
||||
//free(cPathAndName);
|
||||
}
|
||||
|
||||
extern "C" void closeBlackScholes(void){
|
||||
@@ -118,8 +144,8 @@ extern "C" void BlackScholes(
|
||||
shrCheckError(ciErrNum, CL_SUCCESS);
|
||||
|
||||
//Run the kernel
|
||||
size_t globalWorkSize = 60 * 1024;
|
||||
size_t localWorkSize = 128;
|
||||
size_t globalWorkSize = 16;//60 * 1024;
|
||||
size_t localWorkSize = 16;//128;
|
||||
ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, ckBlackScholes, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL);
|
||||
shrCheckError(ciErrNum, CL_SUCCESS);
|
||||
}
|
||||
806
tests/opencl/blackscholes/oclUtils.cpp
Normal file
806
tests/opencl/blackscholes/oclUtils.cpp
Normal file
@@ -0,0 +1,806 @@
|
||||
/*
|
||||
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||
* with this source code for terms and conditions that govern your use of
|
||||
* this software. Any use, reproduction, disclosure, or distribution of
|
||||
* this software and related documentation outside the terms of the EULA
|
||||
* is strictly prohibited.
|
||||
*
|
||||
*/
|
||||
|
||||
// *********************************************************************
|
||||
// Utilities specific to OpenCL samples in NVIDIA GPU Computing SDK
|
||||
// *********************************************************************
|
||||
|
||||
#include <fstream>
|
||||
#include <vector>
|
||||
#include <iostream>
|
||||
#include <algorithm>
|
||||
#include <stdarg.h>
|
||||
#include "oclUtils.h"
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Gets the platform ID for NVIDIA if available, otherwise default
|
||||
//!
|
||||
//! @return the id
|
||||
//! @param clSelectedPlatformID OpenCL platoform ID
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
cl_int oclGetPlatformID(cl_platform_id* clSelectedPlatformID)
|
||||
{
|
||||
char chBuffer[1024];
|
||||
cl_uint num_platforms;
|
||||
cl_platform_id* clPlatformIDs;
|
||||
cl_int ciErrNum;
|
||||
*clSelectedPlatformID = NULL;
|
||||
|
||||
// Get OpenCL platform count
|
||||
ciErrNum = clGetPlatformIDs (0, NULL, &num_platforms);
|
||||
if (ciErrNum != CL_SUCCESS)
|
||||
{
|
||||
shrLog(" Error %i in clGetPlatformIDs Call !!!\n\n", ciErrNum);
|
||||
return -1000;
|
||||
}
|
||||
else
|
||||
{
|
||||
if(num_platforms == 0)
|
||||
{
|
||||
shrLog("No OpenCL platform found!\n\n");
|
||||
return -2000;
|
||||
}
|
||||
else
|
||||
{
|
||||
// if there's a platform or more, make space for ID's
|
||||
if ((clPlatformIDs = (cl_platform_id*)malloc(num_platforms * sizeof(cl_platform_id))) == NULL)
|
||||
{
|
||||
shrLog("Failed to allocate memory for cl_platform ID's!\n\n");
|
||||
return -3000;
|
||||
}
|
||||
|
||||
// get platform info for each platform and trap the NVIDIA platform if found
|
||||
ciErrNum = clGetPlatformIDs (num_platforms, clPlatformIDs, NULL);
|
||||
for(cl_uint i = 0; i < num_platforms; ++i)
|
||||
{
|
||||
ciErrNum = clGetPlatformInfo (clPlatformIDs[i], CL_PLATFORM_NAME, 1024, &chBuffer, NULL);
|
||||
if(ciErrNum == CL_SUCCESS)
|
||||
{
|
||||
if(strstr(chBuffer, "NVIDIA") != NULL)
|
||||
{
|
||||
*clSelectedPlatformID = clPlatformIDs[i];
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// default to zeroeth platform if NVIDIA not found
|
||||
if(*clSelectedPlatformID == NULL)
|
||||
{
|
||||
shrLog("WARNING: NVIDIA OpenCL platform not found - defaulting to first platform!\n\n");
|
||||
*clSelectedPlatformID = clPlatformIDs[0];
|
||||
}
|
||||
|
||||
free(clPlatformIDs);
|
||||
}
|
||||
}
|
||||
|
||||
return CL_SUCCESS;
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Print the device name
|
||||
//!
|
||||
//! @param iLogMode enum LOGBOTH, LOGCONSOLE, LOGFILE
|
||||
//! @param device OpenCL id of the device
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
void oclPrintDevName(int iLogMode, cl_device_id device)
|
||||
{
|
||||
char device_string[1024];
|
||||
clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_string), &device_string, NULL);
|
||||
shrLogEx(iLogMode, 0, "%s", device_string);
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Print info about the device
|
||||
//!
|
||||
//! @param iLogMode enum LOGBOTH, LOGCONSOLE, LOGFILE
|
||||
//! @param device OpenCL id of the device
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
void oclPrintDevInfo(int iLogMode, cl_device_id device)
|
||||
{
|
||||
char device_string[1024];
|
||||
bool nv_device_attibute_query = false;
|
||||
|
||||
// CL_DEVICE_NAME
|
||||
clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_string), &device_string, NULL);
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_NAME: \t\t\t%s\n", device_string);
|
||||
|
||||
// CL_DEVICE_VENDOR
|
||||
clGetDeviceInfo(device, CL_DEVICE_VENDOR, sizeof(device_string), &device_string, NULL);
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_VENDOR: \t\t\t%s\n", device_string);
|
||||
|
||||
// CL_DRIVER_VERSION
|
||||
clGetDeviceInfo(device, CL_DRIVER_VERSION, sizeof(device_string), &device_string, NULL);
|
||||
shrLogEx(iLogMode, 0, " CL_DRIVER_VERSION: \t\t\t%s\n", device_string);
|
||||
|
||||
// CL_DEVICE_VERSION
|
||||
clGetDeviceInfo(device, CL_DEVICE_VERSION, sizeof(device_string), &device_string, NULL);
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_VERSION: \t\t\t%s\n", device_string);
|
||||
|
||||
// CL_DEVICE_OPENCL_C_VERSION (if CL_DEVICE_VERSION version > 1.0)
|
||||
if(strncmp("OpenCL 1.0", device_string, 10) != 0)
|
||||
{
|
||||
// This code is unused for devices reporting OpenCL 1.0, but a def is needed anyway to allow compilation using v 1.0 headers
|
||||
// This constant isn't #defined in 1.0
|
||||
#ifndef CL_DEVICE_OPENCL_C_VERSION
|
||||
#define CL_DEVICE_OPENCL_C_VERSION 0x103D
|
||||
#endif
|
||||
|
||||
clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_VERSION, sizeof(device_string), &device_string, NULL);
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_OPENCL_C_VERSION: \t\t%s\n", device_string);
|
||||
}
|
||||
|
||||
// CL_DEVICE_TYPE
|
||||
cl_device_type type;
|
||||
clGetDeviceInfo(device, CL_DEVICE_TYPE, sizeof(type), &type, NULL);
|
||||
if( type & CL_DEVICE_TYPE_CPU )
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_TYPE:\t\t\t%s\n", "CL_DEVICE_TYPE_CPU");
|
||||
if( type & CL_DEVICE_TYPE_GPU )
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_TYPE:\t\t\t%s\n", "CL_DEVICE_TYPE_GPU");
|
||||
if( type & CL_DEVICE_TYPE_ACCELERATOR )
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_TYPE:\t\t\t%s\n", "CL_DEVICE_TYPE_ACCELERATOR");
|
||||
if( type & CL_DEVICE_TYPE_DEFAULT )
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_TYPE:\t\t\t%s\n", "CL_DEVICE_TYPE_DEFAULT");
|
||||
|
||||
// CL_DEVICE_MAX_COMPUTE_UNITS
|
||||
cl_uint compute_units;
|
||||
clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(compute_units), &compute_units, NULL);
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_MAX_COMPUTE_UNITS:\t\t%u\n", compute_units);
|
||||
|
||||
// CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS
|
||||
size_t workitem_dims;
|
||||
clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(workitem_dims), &workitem_dims, NULL);
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS:\t%u\n", workitem_dims);
|
||||
|
||||
// CL_DEVICE_MAX_WORK_ITEM_SIZES
|
||||
size_t workitem_size[3];
|
||||
clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(workitem_size), &workitem_size, NULL);
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_MAX_WORK_ITEM_SIZES:\t%u / %u / %u \n", workitem_size[0], workitem_size[1], workitem_size[2]);
|
||||
|
||||
// CL_DEVICE_MAX_WORK_GROUP_SIZE
|
||||
size_t workgroup_size;
|
||||
clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(workgroup_size), &workgroup_size, NULL);
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_MAX_WORK_GROUP_SIZE:\t%u\n", workgroup_size);
|
||||
|
||||
// CL_DEVICE_MAX_CLOCK_FREQUENCY
|
||||
cl_uint clock_frequency;
|
||||
clGetDeviceInfo(device, CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(clock_frequency), &clock_frequency, NULL);
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_MAX_CLOCK_FREQUENCY:\t%u MHz\n", clock_frequency);
|
||||
|
||||
// CL_DEVICE_ADDRESS_BITS
|
||||
cl_uint addr_bits;
|
||||
clGetDeviceInfo(device, CL_DEVICE_ADDRESS_BITS, sizeof(addr_bits), &addr_bits, NULL);
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_ADDRESS_BITS:\t\t%u\n", addr_bits);
|
||||
|
||||
// CL_DEVICE_MAX_MEM_ALLOC_SIZE
|
||||
cl_ulong max_mem_alloc_size;
|
||||
clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(max_mem_alloc_size), &max_mem_alloc_size, NULL);
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_MAX_MEM_ALLOC_SIZE:\t\t%u MByte\n", (unsigned int)(max_mem_alloc_size / (1024 * 1024)));
|
||||
|
||||
// CL_DEVICE_GLOBAL_MEM_SIZE
|
||||
cl_ulong mem_size;
|
||||
clGetDeviceInfo(device, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(mem_size), &mem_size, NULL);
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_GLOBAL_MEM_SIZE:\t\t%u MByte\n", (unsigned int)(mem_size / (1024 * 1024)));
|
||||
|
||||
// CL_DEVICE_ERROR_CORRECTION_SUPPORT
|
||||
cl_bool error_correction_support;
|
||||
clGetDeviceInfo(device, CL_DEVICE_ERROR_CORRECTION_SUPPORT, sizeof(error_correction_support), &error_correction_support, NULL);
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_ERROR_CORRECTION_SUPPORT:\t%s\n", error_correction_support == CL_TRUE ? "yes" : "no");
|
||||
|
||||
// CL_DEVICE_LOCAL_MEM_TYPE
|
||||
cl_device_local_mem_type local_mem_type;
|
||||
clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_TYPE, sizeof(local_mem_type), &local_mem_type, NULL);
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_LOCAL_MEM_TYPE:\t\t%s\n", local_mem_type == 1 ? "local" : "global");
|
||||
|
||||
// CL_DEVICE_LOCAL_MEM_SIZE
|
||||
clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(mem_size), &mem_size, NULL);
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_LOCAL_MEM_SIZE:\t\t%u KByte\n", (unsigned int)(mem_size / 1024));
|
||||
|
||||
// CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE
|
||||
clGetDeviceInfo(device, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof(mem_size), &mem_size, NULL);
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE:\t%u KByte\n", (unsigned int)(mem_size / 1024));
|
||||
|
||||
// CL_DEVICE_QUEUE_PROPERTIES
|
||||
cl_command_queue_properties queue_properties;
|
||||
clGetDeviceInfo(device, CL_DEVICE_QUEUE_PROPERTIES, sizeof(queue_properties), &queue_properties, NULL);
|
||||
if( queue_properties & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE )
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_QUEUE_PROPERTIES:\t\t%s\n", "CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE");
|
||||
if( queue_properties & CL_QUEUE_PROFILING_ENABLE )
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_QUEUE_PROPERTIES:\t\t%s\n", "CL_QUEUE_PROFILING_ENABLE");
|
||||
|
||||
// CL_DEVICE_IMAGE_SUPPORT
|
||||
cl_bool image_support;
|
||||
clGetDeviceInfo(device, CL_DEVICE_IMAGE_SUPPORT, sizeof(image_support), &image_support, NULL);
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_IMAGE_SUPPORT:\t\t%u\n", image_support);
|
||||
|
||||
// CL_DEVICE_MAX_READ_IMAGE_ARGS
|
||||
cl_uint max_read_image_args;
|
||||
clGetDeviceInfo(device, CL_DEVICE_MAX_READ_IMAGE_ARGS, sizeof(max_read_image_args), &max_read_image_args, NULL);
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_MAX_READ_IMAGE_ARGS:\t%u\n", max_read_image_args);
|
||||
|
||||
// CL_DEVICE_MAX_WRITE_IMAGE_ARGS
|
||||
cl_uint max_write_image_args;
|
||||
clGetDeviceInfo(device, CL_DEVICE_MAX_WRITE_IMAGE_ARGS, sizeof(max_write_image_args), &max_write_image_args, NULL);
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_MAX_WRITE_IMAGE_ARGS:\t%u\n", max_write_image_args);
|
||||
|
||||
// CL_DEVICE_SINGLE_FP_CONFIG
|
||||
cl_device_fp_config fp_config;
|
||||
clGetDeviceInfo(device, CL_DEVICE_SINGLE_FP_CONFIG, sizeof(cl_device_fp_config), &fp_config, NULL);
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_SINGLE_FP_CONFIG:\t\t%s%s%s%s%s%s\n",
|
||||
fp_config & CL_FP_DENORM ? "denorms " : "",
|
||||
fp_config & CL_FP_INF_NAN ? "INF-quietNaNs " : "",
|
||||
fp_config & CL_FP_ROUND_TO_NEAREST ? "round-to-nearest " : "",
|
||||
fp_config & CL_FP_ROUND_TO_ZERO ? "round-to-zero " : "",
|
||||
fp_config & CL_FP_ROUND_TO_INF ? "round-to-inf " : "",
|
||||
fp_config & CL_FP_FMA ? "fma " : "");
|
||||
|
||||
// CL_DEVICE_IMAGE2D_MAX_WIDTH, CL_DEVICE_IMAGE2D_MAX_HEIGHT, CL_DEVICE_IMAGE3D_MAX_WIDTH, CL_DEVICE_IMAGE3D_MAX_HEIGHT, CL_DEVICE_IMAGE3D_MAX_DEPTH
|
||||
size_t szMaxDims[5];
|
||||
shrLogEx(iLogMode, 0, "\n CL_DEVICE_IMAGE <dim>");
|
||||
clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_WIDTH, sizeof(size_t), &szMaxDims[0], NULL);
|
||||
shrLogEx(iLogMode, 0, "\t\t\t2D_MAX_WIDTH\t %u\n", szMaxDims[0]);
|
||||
clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_HEIGHT, sizeof(size_t), &szMaxDims[1], NULL);
|
||||
shrLogEx(iLogMode, 0, "\t\t\t\t\t2D_MAX_HEIGHT\t %u\n", szMaxDims[1]);
|
||||
clGetDeviceInfo(device, CL_DEVICE_IMAGE3D_MAX_WIDTH, sizeof(size_t), &szMaxDims[2], NULL);
|
||||
shrLogEx(iLogMode, 0, "\t\t\t\t\t3D_MAX_WIDTH\t %u\n", szMaxDims[2]);
|
||||
clGetDeviceInfo(device, CL_DEVICE_IMAGE3D_MAX_HEIGHT, sizeof(size_t), &szMaxDims[3], NULL);
|
||||
shrLogEx(iLogMode, 0, "\t\t\t\t\t3D_MAX_HEIGHT\t %u\n", szMaxDims[3]);
|
||||
clGetDeviceInfo(device, CL_DEVICE_IMAGE3D_MAX_DEPTH, sizeof(size_t), &szMaxDims[4], NULL);
|
||||
shrLogEx(iLogMode, 0, "\t\t\t\t\t3D_MAX_DEPTH\t %u\n", szMaxDims[4]);
|
||||
|
||||
// CL_DEVICE_EXTENSIONS: get device extensions, and if any then parse & log the string onto separate lines
|
||||
clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, sizeof(device_string), &device_string, NULL);
|
||||
if (device_string != 0)
|
||||
{
|
||||
shrLogEx(iLogMode, 0, "\n CL_DEVICE_EXTENSIONS:");
|
||||
std::string stdDevString;
|
||||
stdDevString = std::string(device_string);
|
||||
size_t szOldPos = 0;
|
||||
size_t szSpacePos = stdDevString.find(' ', szOldPos); // extensions string is space delimited
|
||||
while (szSpacePos != stdDevString.npos)
|
||||
{
|
||||
if( strcmp("cl_nv_device_attribute_query", stdDevString.substr(szOldPos, szSpacePos - szOldPos).c_str()) == 0 )
|
||||
nv_device_attibute_query = true;
|
||||
|
||||
if (szOldPos > 0)
|
||||
{
|
||||
shrLogEx(iLogMode, 0, "\t\t");
|
||||
}
|
||||
shrLogEx(iLogMode, 0, "\t\t\t%s\n", stdDevString.substr(szOldPos, szSpacePos - szOldPos).c_str());
|
||||
|
||||
do {
|
||||
szOldPos = szSpacePos + 1;
|
||||
szSpacePos = stdDevString.find(' ', szOldPos);
|
||||
} while (szSpacePos == szOldPos);
|
||||
}
|
||||
shrLogEx(iLogMode, 0, "\n");
|
||||
}
|
||||
else
|
||||
{
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_EXTENSIONS: None\n");
|
||||
}
|
||||
|
||||
if(nv_device_attibute_query)
|
||||
{
|
||||
cl_uint compute_capability_major, compute_capability_minor;
|
||||
clGetDeviceInfo(device, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, sizeof(cl_uint), &compute_capability_major, NULL);
|
||||
clGetDeviceInfo(device, CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV, sizeof(cl_uint), &compute_capability_minor, NULL);
|
||||
shrLogEx(iLogMode, 0, "\n CL_DEVICE_COMPUTE_CAPABILITY_NV:\t%u.%u\n", compute_capability_major, compute_capability_minor);
|
||||
|
||||
shrLogEx(iLogMode, 0, " NUMBER OF MULTIPROCESSORS:\t\t%u\n", compute_units); // this is the same value reported by CL_DEVICE_MAX_COMPUTE_UNITS
|
||||
shrLogEx(iLogMode, 0, " NUMBER OF CUDA CORES:\t\t\t%u\n", ConvertSMVer2Cores(compute_capability_major, compute_capability_minor) * compute_units);
|
||||
|
||||
cl_uint regs_per_block;
|
||||
clGetDeviceInfo(device, CL_DEVICE_REGISTERS_PER_BLOCK_NV, sizeof(cl_uint), ®s_per_block, NULL);
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_REGISTERS_PER_BLOCK_NV:\t%u\n", regs_per_block);
|
||||
|
||||
cl_uint warp_size;
|
||||
clGetDeviceInfo(device, CL_DEVICE_WARP_SIZE_NV, sizeof(cl_uint), &warp_size, NULL);
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_WARP_SIZE_NV:\t\t%u\n", warp_size);
|
||||
|
||||
cl_bool gpu_overlap;
|
||||
clGetDeviceInfo(device, CL_DEVICE_GPU_OVERLAP_NV, sizeof(cl_bool), &gpu_overlap, NULL);
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_GPU_OVERLAP_NV:\t\t%s\n", gpu_overlap == CL_TRUE ? "CL_TRUE" : "CL_FALSE");
|
||||
|
||||
cl_bool exec_timeout;
|
||||
clGetDeviceInfo(device, CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV, sizeof(cl_bool), &exec_timeout, NULL);
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV:\t%s\n", exec_timeout == CL_TRUE ? "CL_TRUE" : "CL_FALSE");
|
||||
|
||||
cl_bool integrated_memory;
|
||||
clGetDeviceInfo(device, CL_DEVICE_INTEGRATED_MEMORY_NV, sizeof(cl_bool), &integrated_memory, NULL);
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_INTEGRATED_MEMORY_NV:\t%s\n", integrated_memory == CL_TRUE ? "CL_TRUE" : "CL_FALSE");
|
||||
}
|
||||
|
||||
// CL_DEVICE_PREFERRED_VECTOR_WIDTH_<type>
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_PREFERRED_VECTOR_WIDTH_<t>\t");
|
||||
cl_uint vec_width [6];
|
||||
clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR, sizeof(cl_uint), &vec_width[0], NULL);
|
||||
clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT, sizeof(cl_uint), &vec_width[1], NULL);
|
||||
clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, sizeof(cl_uint), &vec_width[2], NULL);
|
||||
clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG, sizeof(cl_uint), &vec_width[3], NULL);
|
||||
clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT, sizeof(cl_uint), &vec_width[4], NULL);
|
||||
clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE, sizeof(cl_uint), &vec_width[5], NULL);
|
||||
shrLogEx(iLogMode, 0, "CHAR %u, SHORT %u, INT %u, LONG %u, FLOAT %u, DOUBLE %u\n\n\n",
|
||||
vec_width[0], vec_width[1], vec_width[2], vec_width[3], vec_width[4], vec_width[5]);
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Get and return device capability
|
||||
//!
|
||||
//! @return the 2 digit integer representation of device Cap (major minor). return -1 if NA
|
||||
//! @param device OpenCL id of the device
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
int oclGetDevCap(cl_device_id device)
|
||||
{
|
||||
char cDevString[1024];
|
||||
bool bDevAttributeQuery = false;
|
||||
int iDevArch = -1;
|
||||
|
||||
// Get device extensions, and if any then search for cl_nv_device_attribute_query
|
||||
clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, sizeof(cDevString), &cDevString, NULL);
|
||||
if (cDevString != 0)
|
||||
{
|
||||
std::string stdDevString;
|
||||
stdDevString = std::string(cDevString);
|
||||
size_t szOldPos = 0;
|
||||
size_t szSpacePos = stdDevString.find(' ', szOldPos); // extensions string is space delimited
|
||||
while (szSpacePos != stdDevString.npos)
|
||||
{
|
||||
if( strcmp("cl_nv_device_attribute_query", stdDevString.substr(szOldPos, szSpacePos - szOldPos).c_str()) == 0 )
|
||||
{
|
||||
bDevAttributeQuery = true;
|
||||
}
|
||||
|
||||
do {
|
||||
szOldPos = szSpacePos + 1;
|
||||
szSpacePos = stdDevString.find(' ', szOldPos);
|
||||
} while (szSpacePos == szOldPos);
|
||||
}
|
||||
}
|
||||
|
||||
// if search succeeded, get device caps
|
||||
if(bDevAttributeQuery)
|
||||
{
|
||||
cl_int iComputeCapMajor, iComputeCapMinor;
|
||||
clGetDeviceInfo(device, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, sizeof(cl_uint), (void*)&iComputeCapMajor, NULL);
|
||||
clGetDeviceInfo(device, CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV, sizeof(cl_uint), (void*)&iComputeCapMinor, NULL);
|
||||
iDevArch = (10 * iComputeCapMajor) + iComputeCapMinor;
|
||||
}
|
||||
|
||||
return iDevArch;
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Gets the id of the first device from the context
|
||||
//!
|
||||
//! @return the id
|
||||
//! @param cxGPUContext OpenCL context
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
cl_device_id oclGetFirstDev(cl_context cxGPUContext)
|
||||
{
|
||||
size_t szParmDataBytes;
|
||||
cl_device_id* cdDevices;
|
||||
|
||||
// get the list of GPU devices associated with context
|
||||
clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &szParmDataBytes);
|
||||
cdDevices = (cl_device_id*) malloc(szParmDataBytes);
|
||||
|
||||
clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, szParmDataBytes, cdDevices, NULL);
|
||||
|
||||
cl_device_id first = cdDevices[0];
|
||||
free(cdDevices);
|
||||
|
||||
return first;
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Gets the id of device with maximal FLOPS from the context
|
||||
//!
|
||||
//! @return the id
|
||||
//! @param cxGPUContext OpenCL context
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
cl_device_id oclGetMaxFlopsDev(cl_context cxGPUContext)
|
||||
{
|
||||
size_t szParmDataBytes;
|
||||
cl_device_id* cdDevices;
|
||||
|
||||
// get the list of GPU devices associated with context
|
||||
clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &szParmDataBytes);
|
||||
cdDevices = (cl_device_id*) malloc(szParmDataBytes);
|
||||
size_t device_count = szParmDataBytes / sizeof(cl_device_id);
|
||||
|
||||
clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, szParmDataBytes, cdDevices, NULL);
|
||||
|
||||
cl_device_id max_flops_device = cdDevices[0];
|
||||
int max_flops = 0;
|
||||
|
||||
size_t current_device = 0;
|
||||
|
||||
// CL_DEVICE_MAX_COMPUTE_UNITS
|
||||
cl_uint compute_units;
|
||||
clGetDeviceInfo(cdDevices[current_device], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(compute_units), &compute_units, NULL);
|
||||
|
||||
// CL_DEVICE_MAX_CLOCK_FREQUENCY
|
||||
cl_uint clock_frequency;
|
||||
clGetDeviceInfo(cdDevices[current_device], CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(clock_frequency), &clock_frequency, NULL);
|
||||
|
||||
max_flops = compute_units * clock_frequency;
|
||||
++current_device;
|
||||
|
||||
while( current_device < device_count )
|
||||
{
|
||||
// CL_DEVICE_MAX_COMPUTE_UNITS
|
||||
cl_uint compute_units;
|
||||
clGetDeviceInfo(cdDevices[current_device], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(compute_units), &compute_units, NULL);
|
||||
|
||||
// CL_DEVICE_MAX_CLOCK_FREQUENCY
|
||||
cl_uint clock_frequency;
|
||||
clGetDeviceInfo(cdDevices[current_device], CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(clock_frequency), &clock_frequency, NULL);
|
||||
|
||||
int flops = compute_units * clock_frequency;
|
||||
if( flops > max_flops )
|
||||
{
|
||||
max_flops = flops;
|
||||
max_flops_device = cdDevices[current_device];
|
||||
}
|
||||
++current_device;
|
||||
}
|
||||
|
||||
free(cdDevices);
|
||||
|
||||
return max_flops_device;
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
//! Loads a Program file and prepends the cPreamble to the code.
//!
//! The returned buffer is allocated with malloc() and must be released by the
//! caller with free(). It is always NUL terminated.
//!
//! @return the source string if succeeded, 0 otherwise
//! @param cFilename        program filename
//! @param cPreamble        code that is prepended to the loaded file, typically a set of #defines or a header
//!                         (NULL is treated as an empty preamble)
//! @param szFinalLength    returned length of the code string, terminator excluded (may be NULL)
//////////////////////////////////////////////////////////////////////////////
char* oclLoadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength)
{
    // locals
    FILE* pFileStream = NULL;

    if (cFilename == NULL)
    {
        return NULL;
    }

    // open the OpenCL source code file
    #ifdef _WIN32   // Windows version
        if (fopen_s(&pFileStream, cFilename, "rb") != 0 || pFileStream == NULL)
        {
            return NULL;
        }
    #else           // Linux version
        pFileStream = fopen(cFilename, "rb");
        if (pFileStream == NULL)
        {
            return NULL;
        }
    #endif

    const size_t szPreambleLength = (cPreamble != NULL) ? strlen(cPreamble) : 0;

    // get the length of the source code; ftell() reports failure as -1, which
    // must not be converted to a (huge) size_t
    long lSourceLength = -1;
    if (fseek(pFileStream, 0, SEEK_END) == 0)
    {
        lSourceLength = ftell(pFileStream);
    }
    if (lSourceLength < 0 || fseek(pFileStream, 0, SEEK_SET) != 0)
    {
        fclose(pFileStream);
        return NULL;
    }
    const size_t szSourceLength = (size_t)lSourceLength;

    // allocate a buffer for the source code string and read it in
    char* cSourceString = (char*)malloc(szSourceLength + szPreambleLength + 1);
    if (cSourceString == NULL)
    {
        fclose(pFileStream);
        return NULL;
    }
    if (szPreambleLength > 0)
    {
        memcpy(cSourceString, cPreamble, szPreambleLength);
    }
    // an empty file is valid: skip the read instead of treating "0 items read" as an error
    if (szSourceLength > 0 &&
        fread(cSourceString + szPreambleLength, 1, szSourceLength, pFileStream) != szSourceLength)
    {
        fclose(pFileStream);
        free(cSourceString);
        return NULL;
    }

    // close the file and return the total length of the combined (preamble + source) string
    fclose(pFileStream);
    if (szFinalLength != NULL)
    {
        *szFinalLength = szSourceLength + szPreambleLength;
    }
    cSourceString[szSourceLength + szPreambleLength] = '\0';

    return cSourceString;
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Gets the id of the nth device from the context
|
||||
//!
|
||||
//! @return the id or -1 when out of range
|
||||
//! @param cxGPUContext OpenCL context
|
||||
//! @param device_idx index of the device of interest
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
cl_device_id oclGetDev(cl_context cxGPUContext, unsigned int nr)
|
||||
{
|
||||
size_t szParmDataBytes;
|
||||
cl_device_id* cdDevices;
|
||||
|
||||
// get the list of GPU devices associated with context
|
||||
clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &szParmDataBytes);
|
||||
|
||||
if( szParmDataBytes / sizeof(cl_device_id) <= nr ) {
|
||||
return (cl_device_id)-1;
|
||||
}
|
||||
|
||||
cdDevices = (cl_device_id*) malloc(szParmDataBytes);
|
||||
|
||||
clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, szParmDataBytes, cdDevices, NULL);
|
||||
|
||||
cl_device_id device = cdDevices[nr];
|
||||
free(cdDevices);
|
||||
|
||||
return device;
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Get the binary (PTX) of the program associated with the device
|
||||
//!
|
||||
//! @param cpProgram OpenCL program
|
||||
//! @param cdDevice device of interest
|
||||
//! @param binary returned code
|
||||
//! @param length length of returned code
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
void oclGetProgBinary( cl_program cpProgram, cl_device_id cdDevice, char** binary, size_t* length)
|
||||
{
|
||||
// Grab the number of devices associated witht the program
|
||||
cl_uint num_devices;
|
||||
clGetProgramInfo(cpProgram, CL_PROGRAM_NUM_DEVICES, sizeof(cl_uint), &num_devices, NULL);
|
||||
|
||||
// Grab the device ids
|
||||
cl_device_id* devices = (cl_device_id*) malloc(num_devices * sizeof(cl_device_id));
|
||||
clGetProgramInfo(cpProgram, CL_PROGRAM_DEVICES, num_devices * sizeof(cl_device_id), devices, 0);
|
||||
|
||||
// Grab the sizes of the binaries
|
||||
size_t* binary_sizes = (size_t*)malloc(num_devices * sizeof(size_t));
|
||||
clGetProgramInfo(cpProgram, CL_PROGRAM_BINARY_SIZES, num_devices * sizeof(size_t), binary_sizes, NULL);
|
||||
|
||||
// Now get the binaries
|
||||
char** ptx_code = (char**) malloc(num_devices * sizeof(char*));
|
||||
for( unsigned int i=0; i<num_devices; ++i) {
|
||||
ptx_code[i]= (char*)malloc(binary_sizes[i]);
|
||||
}
|
||||
clGetProgramInfo(cpProgram, CL_PROGRAM_BINARIES, 0, ptx_code, NULL);
|
||||
|
||||
// Find the index of the device of interest
|
||||
unsigned int idx = 0;
|
||||
while( idx<num_devices && devices[idx] != cdDevice ) ++idx;
|
||||
|
||||
// If it is associated prepare the result
|
||||
if( idx < num_devices )
|
||||
{
|
||||
*binary = ptx_code[idx];
|
||||
*length = binary_sizes[idx];
|
||||
}
|
||||
|
||||
// Cleanup
|
||||
free( devices );
|
||||
free( binary_sizes );
|
||||
for( unsigned int i=0; i<num_devices; ++i) {
|
||||
if( i != idx ) free(ptx_code[i]);
|
||||
}
|
||||
free( ptx_code );
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Get and log the binary (PTX) from the OpenCL compiler for the requested program & device
|
||||
//!
|
||||
//! @param cpProgram OpenCL program
|
||||
//! @param cdDevice device of interest
|
||||
//! @param const char* cPtxFileName optional PTX file name
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
void oclLogPtx(cl_program cpProgram, cl_device_id cdDevice, const char* cPtxFileName)
|
||||
{
|
||||
// Grab the number of devices associated with the program
|
||||
cl_uint num_devices;
|
||||
clGetProgramInfo(cpProgram, CL_PROGRAM_NUM_DEVICES, sizeof(cl_uint), &num_devices, NULL);
|
||||
|
||||
// Grab the device ids
|
||||
cl_device_id* devices = (cl_device_id*) malloc(num_devices * sizeof(cl_device_id));
|
||||
clGetProgramInfo(cpProgram, CL_PROGRAM_DEVICES, num_devices * sizeof(cl_device_id), devices, 0);
|
||||
|
||||
// Grab the sizes of the binaries
|
||||
size_t* binary_sizes = (size_t*)malloc(num_devices * sizeof(size_t));
|
||||
clGetProgramInfo(cpProgram, CL_PROGRAM_BINARY_SIZES, num_devices * sizeof(size_t), binary_sizes, NULL);
|
||||
|
||||
// Now get the binaries
|
||||
char** ptx_code = (char**)malloc(num_devices * sizeof(char*));
|
||||
for( unsigned int i=0; i<num_devices; ++i)
|
||||
{
|
||||
ptx_code[i] = (char*)malloc(binary_sizes[i]);
|
||||
}
|
||||
clGetProgramInfo(cpProgram, CL_PROGRAM_BINARIES, 0, ptx_code, NULL);
|
||||
|
||||
// Find the index of the device of interest
|
||||
unsigned int idx = 0;
|
||||
while((idx < num_devices) && (devices[idx] != cdDevice))
|
||||
{
|
||||
++idx;
|
||||
}
|
||||
|
||||
// If the index is associated, log the result
|
||||
if(idx < num_devices)
|
||||
{
|
||||
|
||||
// if a separate filename is supplied, dump ptx there
|
||||
if (NULL != cPtxFileName)
|
||||
{
|
||||
shrLog("\nWriting ptx to separate file: %s ...\n\n", cPtxFileName);
|
||||
FILE* pFileStream = NULL;
|
||||
#ifdef _WIN32
|
||||
fopen_s(&pFileStream, cPtxFileName, "wb");
|
||||
#else
|
||||
pFileStream = fopen(cPtxFileName, "wb");
|
||||
#endif
|
||||
|
||||
fwrite(ptx_code[idx], binary_sizes[idx], 1, pFileStream);
|
||||
fclose(pFileStream);
|
||||
}
|
||||
else // log to logfile and console if no ptx file specified
|
||||
{
|
||||
shrLog("\n%s\nProgram Binary:\n%s\n%s\n", HDASHLINE, ptx_code[idx], HDASHLINE);
|
||||
}
|
||||
}
|
||||
|
||||
// Cleanup
|
||||
free(devices);
|
||||
free(binary_sizes);
|
||||
for(unsigned int i = 0; i < num_devices; ++i)
|
||||
{
|
||||
free(ptx_code[i]);
|
||||
}
|
||||
free( ptx_code );
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Get and log the binary (PTX) from the OpenCL compiler for the requested program & device
|
||||
//!
|
||||
//! @param cpProgram OpenCL program
|
||||
//! @param cdDevice device of interest
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
void oclLogBuildInfo(cl_program cpProgram, cl_device_id cdDevice)
|
||||
{
|
||||
// write out the build log and ptx, then exit
|
||||
char cBuildLog[10240];
|
||||
clGetProgramBuildInfo(cpProgram, cdDevice, CL_PROGRAM_BUILD_LOG,
|
||||
sizeof(cBuildLog), cBuildLog, NULL );
|
||||
shrLog("\n%s\nBuild Log:\n%s\n%s\n", HDASHLINE, cBuildLog, HDASHLINE);
|
||||
}
|
||||
|
||||
// Helper function for De-allocating cl objects
|
||||
// *********************************************************************
|
||||
void oclDeleteMemObjs(cl_mem* cmMemObjs, int iNumObjs)
|
||||
{
|
||||
int i;
|
||||
for (i = 0; i < iNumObjs; i++)
|
||||
{
|
||||
if (cmMemObjs[i])clReleaseMemObject(cmMemObjs[i]);
|
||||
}
|
||||
}
|
||||
|
||||
// Helper function to get OpenCL error string from constant
|
||||
// *********************************************************************
|
||||
const char* oclErrorString(cl_int error)
|
||||
{
|
||||
static const char* errorString[] = {
|
||||
"CL_SUCCESS",
|
||||
"CL_DEVICE_NOT_FOUND",
|
||||
"CL_DEVICE_NOT_AVAILABLE",
|
||||
"CL_COMPILER_NOT_AVAILABLE",
|
||||
"CL_MEM_OBJECT_ALLOCATION_FAILURE",
|
||||
"CL_OUT_OF_RESOURCES",
|
||||
"CL_OUT_OF_HOST_MEMORY",
|
||||
"CL_PROFILING_INFO_NOT_AVAILABLE",
|
||||
"CL_MEM_COPY_OVERLAP",
|
||||
"CL_IMAGE_FORMAT_MISMATCH",
|
||||
"CL_IMAGE_FORMAT_NOT_SUPPORTED",
|
||||
"CL_BUILD_PROGRAM_FAILURE",
|
||||
"CL_MAP_FAILURE",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"CL_INVALID_VALUE",
|
||||
"CL_INVALID_DEVICE_TYPE",
|
||||
"CL_INVALID_PLATFORM",
|
||||
"CL_INVALID_DEVICE",
|
||||
"CL_INVALID_CONTEXT",
|
||||
"CL_INVALID_QUEUE_PROPERTIES",
|
||||
"CL_INVALID_COMMAND_QUEUE",
|
||||
"CL_INVALID_HOST_PTR",
|
||||
"CL_INVALID_MEM_OBJECT",
|
||||
"CL_INVALID_IMAGE_FORMAT_DESCRIPTOR",
|
||||
"CL_INVALID_IMAGE_SIZE",
|
||||
"CL_INVALID_SAMPLER",
|
||||
"CL_INVALID_BINARY",
|
||||
"CL_INVALID_BUILD_OPTIONS",
|
||||
"CL_INVALID_PROGRAM",
|
||||
"CL_INVALID_PROGRAM_EXECUTABLE",
|
||||
"CL_INVALID_KERNEL_NAME",
|
||||
"CL_INVALID_KERNEL_DEFINITION",
|
||||
"CL_INVALID_KERNEL",
|
||||
"CL_INVALID_ARG_INDEX",
|
||||
"CL_INVALID_ARG_VALUE",
|
||||
"CL_INVALID_ARG_SIZE",
|
||||
"CL_INVALID_KERNEL_ARGS",
|
||||
"CL_INVALID_WORK_DIMENSION",
|
||||
"CL_INVALID_WORK_GROUP_SIZE",
|
||||
"CL_INVALID_WORK_ITEM_SIZE",
|
||||
"CL_INVALID_GLOBAL_OFFSET",
|
||||
"CL_INVALID_EVENT_WAIT_LIST",
|
||||
"CL_INVALID_EVENT",
|
||||
"CL_INVALID_OPERATION",
|
||||
"CL_INVALID_GL_OBJECT",
|
||||
"CL_INVALID_BUFFER_SIZE",
|
||||
"CL_INVALID_MIP_LEVEL",
|
||||
"CL_INVALID_GLOBAL_WORK_SIZE",
|
||||
};
|
||||
|
||||
const int errorCount = sizeof(errorString) / sizeof(errorString[0]);
|
||||
|
||||
const int index = -error;
|
||||
|
||||
return (index >= 0 && index < errorCount) ? errorString[index] : "Unspecified Error";
|
||||
}
|
||||
|
||||
// Helper function to get OpenCL image format string (channel order and type) from constant
|
||||
// *********************************************************************
|
||||
const char* oclImageFormatString(cl_uint uiImageFormat)
|
||||
{
|
||||
// cl_channel_order
|
||||
if (uiImageFormat == CL_R)return "CL_R";
|
||||
if (uiImageFormat == CL_A)return "CL_A";
|
||||
if (uiImageFormat == CL_RG)return "CL_RG";
|
||||
if (uiImageFormat == CL_RA)return "CL_RA";
|
||||
if (uiImageFormat == CL_RGB)return "CL_RGB";
|
||||
if (uiImageFormat == CL_RGBA)return "CL_RGBA";
|
||||
if (uiImageFormat == CL_BGRA)return "CL_BGRA";
|
||||
if (uiImageFormat == CL_ARGB)return "CL_ARGB";
|
||||
if (uiImageFormat == CL_INTENSITY)return "CL_INTENSITY";
|
||||
if (uiImageFormat == CL_LUMINANCE)return "CL_LUMINANCE";
|
||||
|
||||
// cl_channel_type
|
||||
if (uiImageFormat == CL_SNORM_INT8)return "CL_SNORM_INT8";
|
||||
if (uiImageFormat == CL_SNORM_INT16)return "CL_SNORM_INT16";
|
||||
if (uiImageFormat == CL_UNORM_INT8)return "CL_UNORM_INT8";
|
||||
if (uiImageFormat == CL_UNORM_INT16)return "CL_UNORM_INT16";
|
||||
if (uiImageFormat == CL_UNORM_SHORT_565)return "CL_UNORM_SHORT_565";
|
||||
if (uiImageFormat == CL_UNORM_SHORT_555)return "CL_UNORM_SHORT_555";
|
||||
if (uiImageFormat == CL_UNORM_INT_101010)return "CL_UNORM_INT_101010";
|
||||
if (uiImageFormat == CL_SIGNED_INT8)return "CL_SIGNED_INT8";
|
||||
if (uiImageFormat == CL_SIGNED_INT16)return "CL_SIGNED_INT16";
|
||||
if (uiImageFormat == CL_SIGNED_INT32)return "CL_SIGNED_INT32";
|
||||
if (uiImageFormat == CL_UNSIGNED_INT8)return "CL_UNSIGNED_INT8";
|
||||
if (uiImageFormat == CL_UNSIGNED_INT16)return "CL_UNSIGNED_INT16";
|
||||
if (uiImageFormat == CL_UNSIGNED_INT32)return "CL_UNSIGNED_INT32";
|
||||
if (uiImageFormat == CL_HALF_FLOAT)return "CL_HALF_FLOAT";
|
||||
if (uiImageFormat == CL_FLOAT)return "CL_FLOAT";
|
||||
|
||||
// unknown constant
|
||||
return "Unknown";
|
||||
}
|
||||
1954
tests/opencl/blackscholes/shrUtils.cpp
Normal file
1954
tests/opencl/blackscholes/shrUtils.cpp
Normal file
File diff suppressed because it is too large
Load Diff
117
tests/opencl/common.mk
Normal file
117
tests/opencl/common.mk
Normal file
@@ -0,0 +1,117 @@
|
||||
# Shared build rules for the OpenCL tests.
# The including Makefile is expected to define PROJECT and SRCS (and may set
# OPTS) before including this file.

# Target register width: 32 or 64
XLEN ?= 32

# Backend selector used for the OPAE driver choice and the XRT run mode
TARGET ?= opaesim

XRT_SYN_DIR ?= ../../../hw/syn/xilinx/xrt
XRT_DEVICE_INDEX ?= 0

ifeq ($(XLEN),64)
RISCV_TOOLCHAIN_PATH ?= /opt/riscv64-gnu-toolchain
VX_CFLAGS += -march=rv64imafd -mabi=lp64d
# lp64d is the RV64 double-float ABI; "ilp64d" is not a valid RISC-V ABI name
K_CFLAGS += -march=rv64imafd -mabi=lp64d
STARTUP_ADDR ?= 0x180000000
else
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
VX_CFLAGS += -march=rv32imaf -mabi=ilp32f
K_CFLAGS += -march=rv32imaf -mabi=ilp32f
STARTUP_ADDR ?= 0x80000000
endif

RISCV_PREFIX ?= riscv$(XLEN)-unknown-elf
RISCV_SYSROOT ?= $(RISCV_TOOLCHAIN_PATH)/$(RISCV_PREFIX)

POCL_CC_PATH ?= /opt/pocl/compiler
POCL_RT_PATH ?= /opt/pocl/runtime

VORTEX_RT_PATH ?= $(realpath ../../../runtime)
VORTEX_KN_PATH ?= $(realpath ../../../kernel)

FPGA_BIN_DIR ?= $(VORTEX_RT_PATH)/opae

LLVM_VORTEX ?= /opt/llvm-vortex
LLVM_POCL ?= /opt/llvm-vortex

# Kernel (device side) flags, handed to poclcc through the environment
K_CFLAGS += -v -O3 --sysroot=$(RISCV_SYSROOT) --gcc-toolchain=$(RISCV_TOOLCHAIN_PATH) -Xclang -target-feature -Xclang +vortex
K_CFLAGS += -fno-rtti -fno-exceptions -nostartfiles -fdata-sections -ffunction-sections
# NOTE(review): LLVM_VOTEX looks like a typo for LLVM_VORTEX - confirm against the
# kernel headers before renaming, since the macro name is part of the build contract
K_CFLAGS += -I$(VORTEX_KN_PATH)/include -DNDEBUG -DLLVM_VOTEX
K_LDFLAGS += -Wl,-Bstatic,--gc-sections,-T$(VORTEX_KN_PATH)/linker/vx_link$(XLEN).ld,--defsym=STARTUP_ADDR=$(STARTUP_ADDR) $(VORTEX_KN_PATH)/libvortexrt.a -lm

# Host side flags
CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors
CXXFLAGS += -Wno-deprecated-declarations -Wno-unused-parameter -Wno-narrowing
CXXFLAGS += -pthread
CXXFLAGS += -I$(POCL_RT_PATH)/include

LDFLAGS += -L$(POCL_RT_PATH)/lib -L$(VORTEX_RT_PATH)/stub -lvortex

ifdef HOSTGPU
CXXFLAGS += -DHOSTGPU
LDFLAGS += -lOpenCL
else
LDFLAGS += $(POCL_RT_PATH)/lib/libOpenCL.so
endif

# Debugging
ifdef DEBUG
CXXFLAGS += -g -O0
else
CXXFLAGS += -O2 -DNDEBUG
endif

ifeq ($(TARGET), fpga)
OPAE_DRV_PATHS ?= libopae-c.so
else
ifeq ($(TARGET), asesim)
OPAE_DRV_PATHS ?= libopae-c-ase.so
else
ifeq ($(TARGET), opaesim)
OPAE_DRV_PATHS ?= libopae-c-sim.so
endif
endif
endif

# Objects are built in the current directory and keep the source extension (foo.cpp -> foo.cpp.o)
OBJS := $(addsuffix .o, $(notdir $(SRCS)))

# These targets are commands, not files
.PHONY: all run-simx run-rtlsim run-opae run-xrt clean clean-all

all: $(PROJECT) kernel.pocl

kernel.pocl: kernel.cl
	LLVM_PREFIX=$(LLVM_VORTEX) POCL_DEBUG=all LD_LIBRARY_PATH=$(LLVM_POCL)/lib:$(POCL_CC_PATH)/lib:$(LLVM_VORTEX)/lib POCL_VORTEX_CFLAGS="$(K_CFLAGS)" POCL_VORTEX_LDFLAGS="$(K_LDFLAGS)" $(POCL_CC_PATH)/bin/poclcc -o $@ $<

%.cc.o: %.cc
	$(CXX) $(CXXFLAGS) -c $< -o $@

%.cpp.o: %.cpp
	$(CXX) $(CXXFLAGS) -c $< -o $@

%.c.o: %.c
	$(CC) $(CXXFLAGS) -c $< -o $@

$(PROJECT): $(OBJS)
	$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@

run-simx: $(PROJECT) kernel.pocl
	LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_RT_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)

run-rtlsim: $(PROJECT) kernel.pocl
	LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_RT_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)

run-opae: $(PROJECT) kernel.pocl
	SCOPE_JSON_PATH=$(FPGA_BIN_DIR)/scope.json OPAE_DRV_PATHS=$(OPAE_DRV_PATHS) LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_RT_PATH)/opae:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)

# TARGET=hw runs on the board; any other value is passed to XRT as the emulation mode
run-xrt: $(PROJECT) kernel.pocl
ifeq ($(TARGET), hw)
	SCOPE_JSON_PATH=$(FPGA_BIN_DIR)/scope.json XRT_INI_PATH=$(XRT_SYN_DIR)/xrt.ini EMCONFIG_PATH=$(FPGA_BIN_DIR) XRT_DEVICE_INDEX=$(XRT_DEVICE_INDEX) XRT_XCLBIN_PATH=$(FPGA_BIN_DIR)/vortex_afu.xclbin LD_LIBRARY_PATH=$(XILINX_XRT)/lib:$(POCL_RT_PATH)/lib:$(VORTEX_RT_PATH)/xrt:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
else
	XCL_EMULATION_MODE=$(TARGET) XRT_INI_PATH=$(XRT_SYN_DIR)/xrt.ini EMCONFIG_PATH=$(FPGA_BIN_DIR) XRT_DEVICE_INDEX=$(XRT_DEVICE_INDEX) XRT_XCLBIN_PATH=$(FPGA_BIN_DIR)/vortex_afu.xclbin LD_LIBRARY_PATH=$(XILINX_XRT)/lib:$(POCL_RT_PATH)/lib:$(VORTEX_RT_PATH)/xrt:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
endif

.depend: $(SRCS)
	$(CXX) $(CXXFLAGS) -MM $^ > $@

clean:
	rm -rf $(PROJECT) *.o .depend

clean-all: clean
	rm -rf *.dump *.pocl

# Skip dependency generation when the only goal is "clean"
ifneq ($(MAKECMDGOALS),clean)
-include .depend
endif
|
||||
1
tests/opencl/convolution/.gitignore
vendored
1
tests/opencl/convolution/.gitignore
vendored
@@ -1 +0,0 @@
|
||||
convolution
|
||||
@@ -1,67 +0,0 @@
|
||||
# Build and run rules for the convolution OpenCL test.

# Selects the linker script vx_link$(XLEN).ld used for the kernel
XLEN ?= 32

# Tool and library locations; every one can be overridden from the command line
LLVM_PREFIX ?= /opt/llvm-riscv
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
SYSROOT ?= $(RISCV_TOOLCHAIN_PATH)/riscv32-unknown-elf
POCL_CC_PATH ?= /opt/pocl/compiler
POCL_RT_PATH ?= /opt/pocl/runtime

VORTEX_DRV_PATH ?= $(realpath ../../../driver)
VORTEX_RT_PATH ?= $(realpath ../../../runtime)

# Flags handed to poclcc for the kernel. The double quotes are part of each
# value so that the shell passes every flag set to poclcc as one argument.
K_LLCFLAGS += "-O3 -march=riscv32 -target-abi=ilp32f -mcpu=generic-rv32 -mattr=+m,+f -mattr=+vortex -float-abi=hard -code-model=small"
K_CFLAGS   += "-v -O3 --sysroot=$(SYSROOT) --gcc-toolchain=$(RISCV_TOOLCHAIN_PATH) -march=rv32imf -mabi=ilp32f -Xclang -target-feature -Xclang +vortex -I$(VORTEX_RT_PATH)/include -fno-rtti -fno-exceptions -ffreestanding -nostartfiles -fdata-sections -ffunction-sections"
K_LDFLAGS  += "-Wl,-Bstatic,-T$(VORTEX_RT_PATH)/linker/vx_link$(XLEN).ld -Wl,--gc-sections $(VORTEX_RT_PATH)/libvortexrt.a -lm"

# Host compiler and linker flags
CXXFLAGS += -std=c++11 -Wall -Wextra -pedantic -Wfatal-errors

CXXFLAGS += -I$(POCL_RT_PATH)/include

LDFLAGS += -L$(POCL_RT_PATH)/lib -L$(VORTEX_DRV_PATH)/simx -lOpenCL -lvortex

# Debugging
ifdef DEBUG
CXXFLAGS += -g -O0
else
CXXFLAGS += -O2 -DNDEBUG
endif

PROJECT = convolution

SRCS = main.cpp utils.cpp

# None of these names is a file produced by the build
.PHONY: all run-fpga run-asesim run-vlsim run-simx run-rtlsim clean clean-all

all: $(PROJECT) kernel.pocl

# Precompile the OpenCL kernel with poclcc
kernel.pocl: kernel.cl
	LLVM_PREFIX=$(LLVM_PREFIX) POCL_DEBUG=all LD_LIBRARY_PATH=$(LLVM_PREFIX)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -LLCFLAGS $(K_LLCFLAGS) -CFLAGS $(K_CFLAGS) -LDFLAGS $(K_LDFLAGS) -o kernel.pocl kernel.cl

# Compile and link the host program in one step
$(PROJECT): $(SRCS)
	$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@

# Each run-* target puts a different driver directory on the library path
run-fpga: $(PROJECT) kernel.pocl
	LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/fpga:$(LD_LIBRARY_PATH) ./$(PROJECT)

run-asesim: $(PROJECT) kernel.pocl
	LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/asesim:$(LD_LIBRARY_PATH) ./$(PROJECT)

run-vlsim: $(PROJECT) kernel.pocl
	LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/vlsim:$(LD_LIBRARY_PATH) ./$(PROJECT)

run-simx: $(PROJECT) kernel.pocl
	LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT)

run-rtlsim: $(PROJECT) kernel.pocl
	LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT)

# Header dependencies of the host sources
.depend: $(SRCS)
	$(CXX) $(CXXFLAGS) -MM $^ > $@

clean:
	rm -rf $(PROJECT) *.o .depend

clean-all: clean
	rm -rf *.pocl *.dump

# Including .depend makes make (re)generate it first; skip that when every
# requested goal is a cleaning goal (clean or clean-all).
ifeq ($(MAKECMDGOALS),)
-include .depend
else ifneq ($(filter-out clean clean-all,$(MAKECMDGOALS)),)
-include .depend
endif
|
||||
File diff suppressed because it is too large
Load Diff
Binary file not shown.
|
Before Width: | Height: | Size: 44 KiB |
@@ -1,54 +0,0 @@
|
||||
__kernel
|
||||
void convolution(
|
||||
__read_only image2d_t sourceImage,
|
||||
__write_only image2d_t outputImage,
|
||||
int rows,
|
||||
int cols,
|
||||
__constant float* filter,
|
||||
int filterWidth,
|
||||
sampler_t sampler)
|
||||
{
|
||||
// Store each work-item’s unique row and column
|
||||
int column = get_global_id(0);
|
||||
int row = get_global_id(1);
|
||||
|
||||
// Half the width of the filter is needed for indexing
|
||||
// memory later
|
||||
int halfWidth = (int)(filterWidth/2);
|
||||
|
||||
// All accesses to images return data as four-element vector
|
||||
// (i.e., float4), although only the 'x' component will contain
|
||||
// meaningful data in this code
|
||||
float4 sum = {0.0f, 0.0f, 0.0f, 0.0f};
|
||||
|
||||
// Iterator for the filter
|
||||
int filterIdx = 0;
|
||||
|
||||
// Each work-item iterates around its local area based on the
|
||||
// size of the filter
|
||||
int2 coords; // Coordinates for accessing the image
|
||||
// Iterate the filter rows
|
||||
for(int i = -halfWidth; i <= halfWidth; i++) {
|
||||
coords.y = row + i;
|
||||
|
||||
// Iterate over the filter columns
|
||||
for(int j = -halfWidth; j <= halfWidth; j++) {
|
||||
coords.x = column + j;
|
||||
|
||||
float4 pixel;
|
||||
// Read a pixel from the image. A single channel image
|
||||
// stores the pixel in the 'x' coordinate of the returned
|
||||
// vector.
|
||||
pixel = read_imagef(sourceImage, sampler, coords);
|
||||
sum.x += pixel.x * filter[filterIdx++];
|
||||
}
|
||||
}
|
||||
|
||||
// Copy the data to the output image if the
|
||||
// work-item is in bounds
|
||||
if(row < rows && column < cols) {
|
||||
coords.x = column;
|
||||
coords.y = row;
|
||||
write_imagef(outputImage, coords, sum);
|
||||
}
|
||||
}
|
||||
@@ -1,261 +0,0 @@
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <CL/cl.h>
|
||||
|
||||
#include "utils.h"
|
||||
|
||||
// Rounds `value` up to the next multiple of `multiple`; a value that already
// is a multiple is returned unchanged. `multiple` must be non-zero.
unsigned int roundUp(unsigned int value, unsigned int multiple) {

    // Distance of the value past the previous multiple
    const unsigned int excess = value % multiple;

    return (excess == 0) ? value : value + (multiple - excess);
}
|
||||
|
||||
// Reads the whole file at kernelPath into a newly malloc'ed, NUL-terminated
// buffer and returns it; the caller owns the buffer and releases it with
// free(). Any failure (open, seek, allocation, short read) prints a message
// and terminates the program with exit(-1).
char* readSource(const char* kernelPath) {

    FILE *fp;
    char *source;
    long int size;

    printf("Program file is: %s\n", kernelPath);

    fp = fopen(kernelPath, "rb");
    if(!fp) {
        printf("Could not open kernel file\n");
        exit(-1);
    }

    // The file length is the stream position after seeking to the end
    if(fseek(fp, 0, SEEK_END) != 0) {
        printf("Error seeking to end of file\n");
        exit(-1);
    }
    size = ftell(fp);
    if(size < 0) {
        printf("Error getting file position\n");
        exit(-1);
    }

    rewind(fp);

    // One extra byte for the terminator; checked before the buffer is touched
    source = (char *)malloc((size_t)size + 1);
    if(source == NULL) {
        printf("Error allocating space for the kernel source\n");
        exit(-1);
    }

    // A short read would hand a truncated program to the OpenCL compiler
    if(fread(source, 1, (size_t)size, fp) != (size_t)size) {
        printf("Error reading the kernel source\n");
        exit(-1);
    }
    source[size] = '\0';

    fclose(fp);

    return source;
}
|
||||
|
||||
// Aborts the program when an OpenCL call did not succeed: prints the name of
// the call together with the status code and exits with -1. Returns normally
// when status is CL_SUCCESS.
void chk(cl_int status, const char* cmd) {

    if(status == CL_SUCCESS) {
        return;
    }

    printf("%s failed (%d)\n", cmd, status);
    exit(-1);
}
|
||||
|
||||
int main() {
|
||||
|
||||
int i, j, k, l;
|
||||
|
||||
// Rows and columns in the input image
|
||||
int imageHeight;
|
||||
int imageWidth;
|
||||
|
||||
const char* inputFile = "input.bmp";
|
||||
const char* outputFile = "output.bmp";
|
||||
|
||||
// Homegrown function to read a BMP from file
|
||||
float* inputImage = readImage(inputFile, &imageWidth,
|
||||
&imageHeight);
|
||||
|
||||
// Size of the input and output images on the host
|
||||
int dataSize = imageHeight*imageWidth*sizeof(float);
|
||||
|
||||
// Output image on the host
|
||||
float* outputImage = NULL;
|
||||
outputImage = (float*)malloc(dataSize);
|
||||
float* refImage = NULL;
|
||||
refImage = (float*)malloc(dataSize);
|
||||
|
||||
// 45 degree motion blur
|
||||
float filter[49] =
|
||||
{0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, -1, 0, 1, 0, 0,
|
||||
0, 0, -2, 0, 2, 0, 0,
|
||||
0, 0, -1, 0, 1, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0};
|
||||
|
||||
// The convolution filter is 7x7
|
||||
int filterWidth = 7;
|
||||
int filterSize = filterWidth*filterWidth; // Assume a square kernel
|
||||
|
||||
// Set up the OpenCL environment
|
||||
cl_int status;
|
||||
|
||||
// Discovery platform
|
||||
cl_platform_id platform;
|
||||
status = clGetPlatformIDs(1, &platform, NULL);
|
||||
chk(status, "clGetPlatformIDs");
|
||||
|
||||
// Discover device
|
||||
cl_device_id device;
|
||||
clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, NULL);
|
||||
chk(status, "clGetDeviceIDs");
|
||||
|
||||
// Create context
|
||||
cl_context_properties props[3] = {CL_CONTEXT_PLATFORM,
|
||||
(cl_context_properties)(platform), 0};
|
||||
cl_context context;
|
||||
context = clCreateContext(props, 1, &device, NULL, NULL, &status);
|
||||
chk(status, "clCreateContext");
|
||||
|
||||
// Create command queue
|
||||
cl_command_queue queue;
|
||||
queue = clCreateCommandQueue(context, device, 0, &status);
|
||||
chk(status, "clCreateCommandQueue");
|
||||
|
||||
// The image format describes how the data will be stored in memory
|
||||
cl_image_format format;
|
||||
format.image_channel_order = CL_R; // single channel
|
||||
format.image_channel_data_type = CL_FLOAT; // float data type
|
||||
|
||||
// Create space for the source image on the device
|
||||
cl_mem d_inputImage = clCreateImage2D(context, 0, &format, imageWidth,
|
||||
imageHeight, 0, NULL, &status);
|
||||
chk(status, "clCreateImage2D");
|
||||
|
||||
// Create space for the output image on the device
|
||||
cl_mem d_outputImage = clCreateImage2D(context, 0, &format, imageWidth,
|
||||
imageHeight, 0, NULL, &status);
|
||||
chk(status, "clCreateImage2D");
|
||||
|
||||
// Create space for the 7x7 filter on the device
|
||||
cl_mem d_filter = clCreateBuffer(context, 0, filterSize*sizeof(float),
|
||||
NULL, &status);
|
||||
chk(status, "clCreateBuffer");
|
||||
|
||||
// Copy the source image to the device
|
||||
size_t origin[3] = {0, 0, 0}; // Offset within the image to copy from
|
||||
size_t region[3] = {imageWidth, imageHeight, 1}; // Elements to per dimension
|
||||
status = clEnqueueWriteImage(queue, d_inputImage, CL_FALSE, origin, region,
|
||||
0, 0, inputImage, 0, NULL, NULL);
|
||||
chk(status, "clEnqueueWriteImage");
|
||||
|
||||
// Copy the 7x7 filter to the device
|
||||
status = clEnqueueWriteBuffer(queue, d_filter, CL_FALSE, 0,
|
||||
filterSize*sizeof(float), filter, 0, NULL, NULL);
|
||||
chk(status, "clEnqueueWriteBuffer");
|
||||
|
||||
// Create the image sampler
|
||||
cl_sampler sampler = clCreateSampler(context, CL_FALSE,
|
||||
CL_ADDRESS_CLAMP_TO_EDGE, CL_FILTER_NEAREST, &status);
|
||||
chk(status, "clCreateSampler");
|
||||
|
||||
const char* source = readSource("kernel.cl");
|
||||
|
||||
// Create a program object with source and build it
|
||||
cl_program program;
|
||||
program = clCreateProgramWithSource(context, 1, &source, NULL, NULL);
|
||||
chk(status, "clCreateProgramWithSource");
|
||||
status = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
|
||||
chk(status, "clBuildProgram");
|
||||
|
||||
// Create the kernel object
|
||||
cl_kernel kernel;
|
||||
kernel = clCreateKernel(program, "convolution", &status);
|
||||
chk(status, "clCreateKernel");
|
||||
|
||||
// Set the kernel arguments
|
||||
status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_inputImage);
|
||||
status |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_outputImage);
|
||||
status |= clSetKernelArg(kernel, 2, sizeof(int), &imageHeight);
|
||||
status |= clSetKernelArg(kernel, 3, sizeof(int), &imageWidth);
|
||||
status |= clSetKernelArg(kernel, 4, sizeof(cl_mem), &d_filter);
|
||||
status |= clSetKernelArg(kernel, 5, sizeof(int), &filterWidth);
|
||||
status |= clSetKernelArg(kernel, 6, sizeof(cl_sampler), &sampler);
|
||||
chk(status, "clSetKernelArg");
|
||||
|
||||
// Set the work item dimensions
|
||||
size_t globalSize[2] = {imageWidth, imageHeight};
|
||||
status = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, globalSize, NULL, 0,
|
||||
NULL, NULL);
|
||||
chk(status, "clEnqueueNDRange");
|
||||
|
||||
// Read the image back to the host
|
||||
status = clEnqueueReadImage(queue, d_outputImage, CL_TRUE, origin,
|
||||
region, 0, 0, outputImage, 0, NULL, NULL);
|
||||
chk(status, "clEnqueueReadImage");
|
||||
|
||||
// Write the output image to file
|
||||
storeImage(outputImage, outputFile, imageHeight, imageWidth, inputFile);
|
||||
|
||||
// Compute the reference image
|
||||
for(i = 0; i < imageHeight; i++) {
|
||||
for(j = 0; j < imageWidth; j++) {
|
||||
refImage[i*imageWidth+j] = 0;
|
||||
}
|
||||
}
|
||||
|
||||
// Iterate over the rows of the source image
|
||||
int halfFilterWidth = filterWidth/2;
|
||||
float sum;
|
||||
for(i = 0; i < imageHeight; i++) {
|
||||
// Iterate over the columns of the source image
|
||||
for(j = 0; j < imageWidth; j++) {
|
||||
sum = 0; // Reset sum for new source pixel
|
||||
// Apply the filter to the neighborhood
|
||||
for(k = - halfFilterWidth; k <= halfFilterWidth; k++) {
|
||||
for(l = - halfFilterWidth; l <= halfFilterWidth; l++) {
|
||||
if(i+k >= 0 && i+k < imageHeight &&
|
||||
j+l >= 0 && j+l < imageWidth) {
|
||||
sum += inputImage[(i+k)*imageWidth + j+l] *
|
||||
filter[(k+halfFilterWidth)*filterWidth +
|
||||
l+halfFilterWidth];
|
||||
}
|
||||
}
|
||||
}
|
||||
refImage[i*imageWidth+j] = sum;
|
||||
}
|
||||
}
|
||||
|
||||
int failed = 0;
|
||||
for(i = 0; i < imageHeight; i++) {
|
||||
for(j = 0; j < imageWidth; j++) {
|
||||
if(abs(outputImage[i*imageWidth+j]-refImage[i*imageWidth+j]) > 0.01) {
|
||||
printf("Results are INCORRECT\n");
|
||||
printf("Pixel mismatch at <%d,%d> (%f vs. %f)\n", i, j,
|
||||
outputImage[i*imageWidth+j], refImage[i*imageWidth+j]);
|
||||
failed = 1;
|
||||
}
|
||||
if(failed) break;
|
||||
}
|
||||
if(failed) break;
|
||||
}
|
||||
if(!failed) {
|
||||
printf("Results are correct\n");
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -1,180 +0,0 @@
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include "utils.h"
|
||||
|
||||
// Writes imageOut to `filename` as a BMP that reuses the header of
// refFilename.
//
// The first `offset` bytes of refFilename (offset = 32-bit value at byte 10)
// are copied verbatim, and the width/height stored at bytes 18 and 22 of that
// header decide how many pixels are written. imageOut is indexed as
// imageOut[row*cols + column]; rows are written bottom-up, one byte per pixel,
// each row padded with zero bytes to a multiple of four. Pixel values are
// clamped to [0, 255] before conversion to a byte.
// The header fields are read straight into `int`, which assumes a 4-byte int
// with the file's byte order. Any error prints a message and calls exit(-1).
void storeImage(float *imageOut,
                const char *filename,
                int rows,
                int cols,
                const char* refFilename) {

    FILE *ifp, *ofp;
    unsigned char tmp;
    const unsigned char pad = 0;
    int offset;
    unsigned char *buffer;
    int i, j;

    int height, width;

    ifp = fopen(refFilename, "rb");
    if(ifp == NULL) {
        // Report the file that actually failed to open
        perror(refFilename);
        exit(-1);
    }

    if(fseek(ifp, 10, SEEK_SET) != 0 || fread(&offset, 4, 1, ifp) != 1 ||
       fseek(ifp, 18, SEEK_SET) != 0 || fread(&width, 4, 1, ifp) != 1 ||
       fread(&height, 4, 1, ifp) != 1) {
        printf("error reading header!\n");
        exit(-1);
    }

    // The pixel loop below walks width x height elements of imageOut, so the
    // reference header must not describe more data than the caller supplied
    if(offset <= 0 || width > cols || height > rows) {
        printf("reference header does not match the image!\n");
        exit(-1);
    }

    fseek(ifp, 0, SEEK_SET);

    buffer = (unsigned char *)malloc(offset);
    if(buffer == NULL) {
        perror("malloc");
        exit(-1);
    }

    if(fread(buffer, 1, offset, ifp) != (size_t)offset) {
        printf("error reading header!\n");
        exit(-1);
    }

    printf("Writing output image to %s\n", filename);
    ofp = fopen(filename, "wb");
    if(ofp == NULL) {
        perror("opening output file");
        exit(-1);
    }
    if(fwrite(buffer, 1, offset, ofp) != (size_t)offset) {
        printf("error writing header!\n");
        exit(-1);
    }

    // NOTE bmp formats store data in reverse raster order (see comment in
    // readImage function), so we need to flip it upside down here.
    int mod = width % 4;
    if(mod != 0) {
        mod = 4 - mod;
    }
    for(i = height-1; i >= 0; i--) {
        for(j = 0; j < width; j++) {
            const float value = imageOut[i*cols+j];
            // Converting an out-of-range float to unsigned char is undefined,
            // so saturate first; the first test also catches NaN
            if(!(value > 0.0f)) {
                tmp = 0;
            } else if(value > 255.0f) {
                tmp = 255;
            } else {
                tmp = (unsigned char)value;
            }
            fwrite(&tmp, sizeof(char), 1, ofp);
        }
        // In bmp format, rows must be a multiple of 4-bytes.
        // So if we're not at a multiple of 4, add zero padding.
        for(j = 0; j < mod; j++) {
            fwrite(&pad, sizeof(char), 1, ofp);
        }
    }

    if(ferror(ofp)) {
        printf("error writing image data!\n");
        exit(-1);
    }

    fclose(ofp);
    fclose(ifp);

    free(buffer);
}
|
||||
|
||||
/*
 * Read a bmp image and convert it to a float array with one float per pixel
 * byte. Also output the width and height found in the file header.
 *
 * The pixel-data offset is the 32-bit value at byte 10 of the file, width and
 * height are the 32-bit values at bytes 18 and 22. They are read straight
 * into `int`, which assumes a 4-byte int with the file's byte order. Exactly
 * one byte is read per pixel and no bit-depth field is checked, so presumably
 * only 8-bit images are intended - TODO confirm.
 *
 * The returned buffer is malloc'ed and owned by the caller. Row 0 of the
 * result is the LAST row stored in the file, i.e. the image is flipped
 * vertically while it is read. Any error prints a message and calls exit(-1).
 */
float* readImage(const char *filename, int* widthOut, int* heightOut) {

    int height, width;
    int offset;
    int i, j;
    unsigned char *rowData;
    float *floatImage;

    printf("Reading input image from %s\n", filename);
    FILE *fp = fopen(filename, "rb");
    if(fp == NULL) {
        perror(filename);
        exit(-1);
    }

    if(fseek(fp, 10, SEEK_SET) != 0 || fread(&offset, 4, 1, fp) != 1 ||
       fseek(fp, 18, SEEK_SET) != 0 || fread(&width, 4, 1, fp) != 1 ||
       fread(&height, 4, 1, fp) != 1) {
        printf("error reading header!\n");
        exit(-1);
    }

    printf("width = %d\n", width);
    printf("height = %d\n", height);

    // Non-positive dimensions would turn into bogus allocation sizes below
    if(offset < 0 || width <= 0 || height <= 0) {
        printf("unsupported image header!\n");
        exit(-1);
    }

    *widthOut = width;
    *heightOut = height;

    // One scratch row of raw bytes plus the final float image
    rowData = (unsigned char*)malloc(width);
    floatImage = (float*)malloc(sizeof(float)*width*height);
    if(rowData == NULL || floatImage == NULL) {
        perror("malloc");
        exit(-1);
    }

    if(fseek(fp, offset, SEEK_SET) != 0) {
        printf("error seeking to image data!\n");
        exit(-1);
    }

    // For the bmp format, each row has to be a multiple of 4 bytes
    int mod = width % 4;
    if(mod != 0) {
        mod = 4 - mod;
    }

    // NOTE bitmaps are stored in upside-down raster order: the first row in
    // the file is the bottom row of the picture. For image analysis we want
    // the image right-side up, so file row i is stored as row height-1-i.
    for(i = 0; i < height; i++) {
        float *dst = floatImage + (size_t)(height - 1 - i)*width;

        if(fread(rowData, 1, width, fp) != (size_t)width) {
            printf("error reading image data!\n");
            exit(-1);
        }
        for(j = 0; j < width; j++) {
            dst[j] = (float)rowData[j];
        }

        // Skip the padding bytes at the end of the row
        fseek(fp, mod, SEEK_CUR);
    }

    fclose(fp);
    free(rowData);

    return floatImage;
}
|
||||
@@ -1,11 +0,0 @@
|
||||
#ifndef __UTILS__
#define __UTILS__

/* Shorthand for a raw pixel byte. */
typedef unsigned char uchar;

/*
 * Writes imageOut to `filename`, reusing the header of the BMP file
 * refFilename. imageOut is indexed as imageOut[row*cols + column].
 */
void storeImage(float *imageOut, const char *filename, int rows, int cols,
                const char* refFilename);

/*
 * Reads the BMP file `filename` into a newly allocated float array and
 * reports its dimensions through widthOut and heightOut.
 */
float* readImage(const char *filename, int* widthOut, int* heightOut);

#endif
|
||||
@@ -1,69 +1,9 @@
|
||||
XLEN ?= 32
|
||||
|
||||
RISCV_TOOLCHAIN_PATH ?= $(wildcard ../../../../riscv-gnu-toolchain/drops)
|
||||
POCL_CC_PATH ?= $(wildcard ../../../../pocl/drops_riscv_cc)
|
||||
POCL_INC_PATH ?= $(wildcard ../include)
|
||||
POCL_LIB_PATH ?= $(wildcard ../lib)
|
||||
VORTEX_RT_PATH ?= $(wildcard ../../../runtime)
|
||||
VX_SIMX_PATH ?= $(wildcard ../../../simx/obj_dir)
|
||||
|
||||
CC = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gcc
|
||||
CXX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-g++
|
||||
DMP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objdump
|
||||
HEX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objcopy
|
||||
GDB = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gdb
|
||||
|
||||
VX_SRCS = $(VORTEX_RT_PATH)/newlib/newlib.c
|
||||
VX_SRCS += $(VORTEX_RT_PATH)/startup/vx_start.S
|
||||
VX_SRCS += $(VORTEX_RT_PATH)/intrinsics/vx_intrinsics.S
|
||||
VX_SRCS += $(VORTEX_RT_PATH)/io/vx_io.S $(VORTEX_RT_PATH)/io/vx_io.c
|
||||
VX_SRCS += $(VORTEX_RT_PATH)/fileio/fileio.S
|
||||
VX_SRCS += $(VORTEX_RT_PATH)/tests/tests.c
|
||||
VX_SRCS += $(VORTEX_RT_PATH)/vx_api/vx_api.c
|
||||
|
||||
VX_CFLAGS = -nostartfiles -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/startup/vx_link$(XLEN).ld
|
||||
|
||||
CXXFLAGS = -g -O0 -march=rv32im -mabi=ilp32
|
||||
CXXFLAGS += -ffreestanding # program may not begin at main()
|
||||
CXXFLAGS += -Wl,--gc-sections # enable garbage collection of unused input sections
|
||||
CXXFLAGS += -fno-rtti -fno-non-call-exceptions # disable RTTI and exceptions
|
||||
CXXFLAGS += -I$(POCL_INC_PATH) -I.
|
||||
|
||||
VX_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
|
||||
QEMU_LIBS = $(VORTEX_RT_PATH)/qemu/vx_api.c -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
|
||||
|
||||
PROJECT = cutcp
|
||||
|
||||
SRCS = main.cc args.c parboil_opencl.c ocl.c gpu_info.c cutoff.c cutcpu.c output.c readatom.c excl.c
|
||||
|
||||
all: $(PROJECT).dump $(PROJECT).hex
|
||||
CXXFLAGS += -I.
|
||||
|
||||
lib$(PROJECT).a: kernel.cl
|
||||
POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOLCHAIN_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o lib$(PROJECT).a kernel.cl
|
||||
OPTS ?=
|
||||
|
||||
$(PROJECT).elf: $(SRCS) lib$(PROJECT).a
|
||||
$(CXX) $(CXXFLAGS) $(VX_CFLAGS) $(VX_SRCS) $(SRCS) $(VX_LIBS) -o $(PROJECT).elf
|
||||
|
||||
$(PROJECT).qemu: $(SRCS) lib$(PROJECT).a
|
||||
$(CXX) $(CXXFLAGS) $(SRCS) $(QEMU_LIBS) -o $(PROJECT).qemu
|
||||
|
||||
$(PROJECT).hex: $(PROJECT).elf
|
||||
$(HEX) -O ihex $(PROJECT).elf $(PROJECT).hex
|
||||
|
||||
$(PROJECT).dump: $(PROJECT).elf
|
||||
$(DMP) -D $(PROJECT).elf > $(PROJECT).dump
|
||||
|
||||
run: $(PROJECT).hex
|
||||
POCL_DEBUG=all $(VX_SIMX_PATH)/Vcache_simX -E $(PROJECT).hex -s -b 1> emulator.debug
|
||||
|
||||
qemu: $(PROJECT).qemu
|
||||
POCL_DEBUG=all $(RISCV_TOOLCHAIN_PATH)/bin/qemu-riscv32 -d in_asm -D debug.log $(PROJECT).qemu
|
||||
|
||||
gdb-s: $(PROJECT).qemu
|
||||
POCL_DEBUG=all $(RISCV_TOOLCHAIN_PATH)/bin/qemu-riscv32 -g 1234 -d in_asm -D debug.log $(PROJECT).qemu
|
||||
|
||||
gdb-c: $(PROJECT).qemu
|
||||
$(GDB) $(PROJECT).qemu
|
||||
|
||||
clean:
|
||||
rm -rf *.o *.elf *.dump *.hex *.qemu *.log *.debug
|
||||
include ../common.mk
|
||||
|
||||
@@ -19,6 +19,27 @@
|
||||
#include "macros.h"
|
||||
#include "ocl.h"
|
||||
|
||||
static int read_kernel_file(const char* filename, uint8_t** data, size_t* size) {
|
||||
if (NULL == filename || NULL == data || 0 == size)
|
||||
return CL_INVALID_VALUE;
|
||||
|
||||
FILE* fp = fopen(filename, "r");
|
||||
if (NULL == fp) {
|
||||
fprintf(stderr, "Failed to load kernel.");
|
||||
return CL_INVALID_VALUE;
|
||||
}
|
||||
fseek(fp , 0 , SEEK_END);
|
||||
long fsize = ftell(fp);
|
||||
rewind(fp);
|
||||
|
||||
*data = (uint8_t*)malloc(fsize);
|
||||
*size = fread(*data, 1, fsize, fp);
|
||||
|
||||
fclose(fp);
|
||||
|
||||
return CL_SUCCESS;
|
||||
}
|
||||
|
||||
// OpenCL 1.1 support for int3 is not uniform on all implementations, so
|
||||
// we use int4 instead. Only the 'x', 'y', and 'z' fields of xyz are used.
|
||||
typedef cl_int4 xyz;
|
||||
@@ -294,8 +315,6 @@ int gpu_compute_cutoff_potential_lattice(
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
printf("Ok!\n");
|
||||
|
||||
pb_Context* pb_context;
|
||||
pb_context = pb_InitOpenCLContext(parameters);
|
||||
if (pb_context == NULL) {
|
||||
@@ -303,8 +322,6 @@ int gpu_compute_cutoff_potential_lattice(
|
||||
return -1;
|
||||
}
|
||||
|
||||
printf("Ok!\n");
|
||||
|
||||
cl_int clStatus;
|
||||
cl_device_id clDevice = (cl_device_id) pb_context->clDeviceId;
|
||||
cl_platform_id clPlatform = (cl_platform_id) pb_context->clPlatformId;
|
||||
@@ -317,8 +334,13 @@ int gpu_compute_cutoff_potential_lattice(
|
||||
|
||||
//const char* clSource[] = {readFile("src/opencl_base/kernel.cl")};
|
||||
//cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
|
||||
cl_program clProgram = clCreateProgramWithBuiltInKernels(
|
||||
clContext, 1, &clDevice, "opencl_cutoff_potential_lattice", &clStatus);
|
||||
uint8_t *kernel_bin = NULL;
|
||||
size_t kernel_size;
|
||||
cl_int binary_status = 0;
|
||||
clStatus = read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size);
|
||||
CHECK_ERROR("read_kernel_file")
|
||||
cl_program clProgram = clCreateProgramWithBinary(
|
||||
clContext, 1, &clDevice, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &clStatus);
|
||||
CHECK_ERROR("clCreateProgramWithSource")
|
||||
|
||||
char clOptions[50];
|
||||
@@ -399,9 +421,6 @@ int gpu_compute_cutoff_potential_lattice(
|
||||
clStatus = clSetKernelArg(clKernel,10,sizeof(cl_mem),&NbrList);
|
||||
CHECK_ERROR("clSetKernelArg")
|
||||
|
||||
printf("Ok!!\n");
|
||||
|
||||
|
||||
/* loop over z-dimension, invoke OpenCL kernel for each x-y plane */
|
||||
pb_SwitchToTimer(timers, pb_TimerID_KERNEL);
|
||||
printf("Invoking OpenCL kernel on %d region planes...\n", zRegionDim);
|
||||
@@ -412,26 +431,16 @@ int gpu_compute_cutoff_potential_lattice(
|
||||
clStatus = clSetKernelArg(clKernel,8,sizeof(int),&zRegionIndex);
|
||||
CHECK_ERROR("clSetKernelArg")
|
||||
|
||||
printf("Ok**!2\n");
|
||||
|
||||
clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,3,NULL,gridDim,blockDim,0,NULL,NULL);
|
||||
|
||||
printf("Ok**!2\n");
|
||||
|
||||
CHECK_ERROR("clEnqueueNDRangeKernel")
|
||||
|
||||
printf("Ok**!2\n");
|
||||
|
||||
clStatus = clFinish(clCommandQueue);
|
||||
|
||||
printf("Ok**!2\n");
|
||||
|
||||
CHECK_ERROR("clFinish")
|
||||
}
|
||||
|
||||
printf("Ok++!\n");
|
||||
|
||||
printf("Finished OpenCL kernel calls \n");
|
||||
printf("Finished OpenCL kernel calls\n");
|
||||
|
||||
/* copy result regions from OpenCL device */
|
||||
pb_SwitchToTimer(timers, pb_TimerID_COPY);
|
||||
|
||||
@@ -9,6 +9,10 @@
|
||||
#ifndef __GPUINFOH__
|
||||
#define __GPUINFOH__
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
void compute_active_thread(size_t *thread,
|
||||
size_t *grid,
|
||||
int task,
|
||||
@@ -17,4 +21,8 @@ void compute_active_thread(size_t *thread,
|
||||
int minor,
|
||||
int sm);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
Binary file not shown.
@@ -124,8 +124,6 @@ int main(int argc, char *argv[]) {
|
||||
pb_InitializeTimerSet(&timers);
|
||||
pb_SwitchToTimer(&timers, pb_TimerID_IO);
|
||||
|
||||
printf("OK\n");
|
||||
|
||||
{
|
||||
const char *pqrfilename = parameters->inpFiles[0];
|
||||
|
||||
@@ -136,8 +134,6 @@ int main(int argc, char *argv[]) {
|
||||
printf("read %d atoms from file '%s'\n", atom->size, pqrfilename);
|
||||
}
|
||||
|
||||
printf("OK\n");
|
||||
|
||||
/* find extent of domain */
|
||||
pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
|
||||
get_atom_extent(&min_ext, &max_ext, atom);
|
||||
|
||||
@@ -3,6 +3,10 @@
|
||||
|
||||
#include <stdlib.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
void clMemSet(cl_command_queue, cl_mem, int, size_t);
|
||||
char* readFile(const char*);
|
||||
|
||||
@@ -14,4 +18,8 @@ char* readFile(const char*);
|
||||
exit(1); \
|
||||
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
7
tests/opencl/dotproduct/Makefile
Normal file
7
tests/opencl/dotproduct/Makefile
Normal file
@@ -0,0 +1,7 @@
|
||||
# dotproduct OpenCL test; the build and run rules come from ../common.mk

# Name of the host executable
PROJECT := dotproduct

# Host sources
SRCS := main.cc oclUtils.cpp shrUtils.cpp cmd_arg_reader.cpp

# Default command-line options for the run targets; override with OPTS=...
OPTS ?= -n64

include ../common.mk
|
||||
152
tests/opencl/dotproduct/cmd_arg_reader.cpp
Normal file
152
tests/opencl/dotproduct/cmd_arg_reader.cpp
Normal file
@@ -0,0 +1,152 @@
|
||||
/*
|
||||
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||
* with this source code for terms and conditions that govern your use of
|
||||
* this software. Any use, reproduction, disclosure, or distribution of
|
||||
* this software and related documentation outside the terms of the EULA
|
||||
* is strictly prohibited.
|
||||
*
|
||||
*/
|
||||
|
||||
/* CUda UTility Library */
|
||||
|
||||
// includes, file
|
||||
#include "cmd_arg_reader.h"
|
||||
|
||||
// includes, system
|
||||
#include <vector>
|
||||
|
||||
// internal unnamed namespace
|
||||
|
||||
namespace
|
||||
{
|
||||
// types, internal (class, enum, struct, union, typedef)
|
||||
|
||||
// variables, internal
|
||||
|
||||
} // namespace {
|
||||
|
||||
// variables, exported
|
||||
|
||||
/*static*/ CmdArgReader* CmdArgReader::self;
|
||||
/*static*/ char** CmdArgReader::rargv;
|
||||
/*static*/ int CmdArgReader::rargc;
|
||||
|
||||
// functions, exported
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Public construction interface
|
||||
//! @return a handle to the class instance
|
||||
//! @param argc number of command line arguments (as given to main())
|
||||
//! @param argv command line argument string (as given to main())
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
/*static*/ void
|
||||
CmdArgReader::init( const int argc, const char** argv)
|
||||
{
|
||||
if ( NULL != self)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
// command line arguments
|
||||
if (( 0 == argc) || ( 0 == argv))
|
||||
{
|
||||
LOGIC_EXCEPTION( "No command line arguments given.");
|
||||
}
|
||||
|
||||
self = new CmdArgReader();
|
||||
|
||||
self->createArgsMaps( argc, argv);
|
||||
|
||||
rargc = argc;
|
||||
rargv = const_cast<char**>( argv);
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Constructor, default
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
CmdArgReader::CmdArgReader() :
|
||||
args(),
|
||||
unprocessed(),
|
||||
iter(),
|
||||
iter_unprocessed()
|
||||
{ }
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Destructor
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
CmdArgReader::~CmdArgReader()
|
||||
{
|
||||
for( iter = args.begin(); iter != args.end(); ++iter)
|
||||
{
|
||||
if( *(iter->second.first) == typeid( int))
|
||||
{
|
||||
delete static_cast<int*>( iter->second.second);
|
||||
break;
|
||||
}
|
||||
else if( *(iter->second.first) == typeid( bool))
|
||||
{
|
||||
delete static_cast<bool*>( iter->second.second);
|
||||
break;
|
||||
}
|
||||
else if( *(iter->second.first) == typeid( std::string))
|
||||
{
|
||||
delete static_cast<std::string*>( iter->second.second);
|
||||
break;
|
||||
}
|
||||
else if( *(iter->second.first) == typeid( std::vector< std::string>) )
|
||||
{
|
||||
delete static_cast< std::vector< std::string>* >( iter->second.second);
|
||||
break;
|
||||
}
|
||||
else if( *(iter->second.first) == typeid( std::vector<int>) )
|
||||
{
|
||||
delete static_cast< std::vector<int>* >( iter->second.second);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Read args as token value pair into map for better processing (Even the
|
||||
//! values remain strings until the parameter values is requested by the
|
||||
//! program.)
|
||||
//! @param argc the argument count (as given to 'main')
|
||||
//! @param argv the char* array containing the command line arguments
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
void
|
||||
CmdArgReader::createArgsMaps( const int argc, const char** argv) {
|
||||
|
||||
std::string token;
|
||||
std::string val_str;
|
||||
|
||||
std::map< std::string, std::string> args;
|
||||
|
||||
std::string::size_type pos;
|
||||
std::string arg;
|
||||
for( int i=1; i<argc; ++i)
|
||||
{
|
||||
arg = argv[i];
|
||||
|
||||
// check if valid command line argument: all arguments begin with - or --
|
||||
if (arg[0] != '-')
|
||||
{
|
||||
RUNTIME_EXCEPTION("Invalid command line argument.");
|
||||
}
|
||||
|
||||
int numDashes = (arg[1] == '-' ? 2 : 1);
|
||||
|
||||
// check if only flag or if a value is given
|
||||
if ( (pos = arg.find( '=')) == std::string::npos)
|
||||
{
|
||||
unprocessed[ std::string( arg, numDashes, arg.length()-numDashes)] = "FLAG";
|
||||
}
|
||||
else
|
||||
{
|
||||
unprocessed[ std::string( arg, numDashes, pos-numDashes)] =
|
||||
std::string( arg, pos+1, arg.length()-1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
488
tests/opencl/dotproduct/cmd_arg_reader.h
Normal file
488
tests/opencl/dotproduct/cmd_arg_reader.h
Normal file
@@ -0,0 +1,488 @@
|
||||
/*
|
||||
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||
* with this source code for terms and conditions that govern your use of
|
||||
* this software. Any use, reproduction, disclosure, or distribution of
|
||||
* this software and related documentation outside the terms of the EULA
|
||||
* is strictly prohibited.
|
||||
*
|
||||
*/
|
||||
|
||||
/* CUda UTility Library */
|
||||
|
||||
#ifndef _CMDARGREADER_H_
|
||||
#define _CMDARGREADER_H_
|
||||
|
||||
// includes, system
|
||||
#include <map>
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include <algorithm>
|
||||
#include <typeinfo>
|
||||
|
||||
// includes, project
|
||||
#include "exception.h"
|
||||
|
||||
//! Preprocessed command line arguments
|
||||
//! @note Lazy evaluation: The arguments are converted from strings to
|
||||
//! the correct data type upon request. Converted values are stored
|
||||
//! in an additional map so that no additional conversion is
|
||||
//! necessary. Arrays of command line arguments are stored in
|
||||
//! std::vectors
|
||||
//! @note Usage:
|
||||
//! const std::string* file =
|
||||
//! CmdArgReader::getArg< std::string>( "model")
|
||||
//! const std::vector< std::string>* files =
|
||||
//! CmdArgReader::getArg< std::vector< std::string> >( "model")
|
||||
//! @note All command line arguments begin with '--' followed by the token;
|
||||
//! token and value are separated by '='; example --samples=50
|
||||
//! @note Arrays have the form --model=[one.obj,two.obj,three.obj]
|
||||
//! (without whitespaces)
|
||||
|
||||
//! Command line argument parser
//! @note The public interface is static; the instance the static functions
//!       work on is referenced by 'self'.
class CmdArgReader
{
    template<class> friend class TestCmdArgReader;

public:

    // typedefs internal

    //! Container for a processed command line argument: the typeid of the
    //! converted object plus an untyped pointer to it. The typeid makes it
    //! possible to decide whether a re-requested token-value pair matches the
    //! type of the first conversion.
    typedef std::pair< const std::type_info*, void*>  ValType;

    //! Map of already converted values (token -> converted value)
    typedef std::map< std::string, ValType >  ArgsMap;
    //! Iterators for the map of already converted values
    typedef ArgsMap::iterator        ArgsMapIter;
    typedef ArgsMap::const_iterator  ConstArgsMapIter;

    //! Map of unprocessed (means unconverted) token-value pairs
    typedef std::map< std::string, std::string>  UnpMap;
    //! Iterator for the map of unprocessed token-value pairs
    typedef UnpMap::iterator  UnpMapIter;

    // static interface

    //! Public construction interface
    //! @param argc  number of command line arguments (as given to main())
    //! @param argv  command line argument string (as given to main())
    static void  init( const int argc, const char** argv);

    //! Get the value of the command line argument with given name
    //! @return A const handle to the requested argument, or NULL if the
    //!         argument does not exist or is not of type T
    //! @param name  the name of the requested argument
    //! @note T is the type of the argument requested
    template<class T>
    static inline const T* getArg( const std::string& name);

    //! Check if a command line argument with the given name exists
    //! @return true if a command line argument with name \a name exists,
    //!         otherwise false
    //! @param name  name of the command line argument in question
    static inline bool existArg( const std::string& name);

    //! Get the original / raw argc program argument
    static inline int& getRArgc();

    //! Get the original / raw argv program argument
    static inline char**& getRArgv();

    //! Destructor
    ~CmdArgReader();

protected:

    //! Handle to the only instance of this class
    static CmdArgReader*  self;

    //! Constructor, default
    CmdArgReader();

private:

    // private helper functions

    //! Instance-level worker behind 'getArg'
    //! @return A const handle to the requested argument, or a NULL pointer
    //!         if the argument does not exist or is not of type T
    //! @param name  the name of the requested argument
    //! @note T is the type of the argument requested
    template<class T>
    inline const T* getArgHelper( const std::string& name);

    //! Instance-level worker behind 'existArg'
    //! @return true if a command line argument of name \a name exists,
    //!         otherwise false
    //! @param name  the name of the requested argument
    inline bool existArgHelper( const std::string& name) const;

    //! Read args as token value pair into map for better processing
    //! (the values remain strings until a parameter value is requested by
    //! the program)
    //! @param argc  the argument count (as given to 'main')
    //! @param argv  the char* array containing the command line arguments
    void  createArgsMaps( const int argc, const char** argv);

    //! Helper for "casting" a string from the map with the unprocessed
    //! values to the correct data type
    //! @return true if conversion succeeded, otherwise false
    //! @param element  the value as string
    //! @param val  the value as type T
    template<class T>
    static inline bool convertToT( const std::string& element, T& val);

#ifdef _WIN32
#  pragma warning( disable: 4251)
#endif

    //! Original value of argc
    static int  rargc;

    //! Command line arguments in raw format
    static char**  rargv;

    //! Map containing the already converted token-value pairs
    ArgsMap  args;

    //! Map containing the unprocessed / unconverted token-value pairs
    UnpMap  unprocessed;

    //! Iterator for the map with the already converted token-value pairs
    //! (kept as member to avoid frequent reallocation)
    ArgsMapIter  iter;

    //! Iterator for the map with the unconverted token-value pairs
    //! (kept as member to avoid frequent reallocation)
    UnpMapIter  iter_unprocessed;

#ifdef _WIN32
#  pragma warning( default: 4251)
#endif

    //! Constructor, copy (not implemented)
    CmdArgReader( const CmdArgReader&);

    //! Assignment operator (not implemented)
    CmdArgReader& operator=( const CmdArgReader&);
};
|
||||
|
||||
// variables, exported (extern)
|
||||
|
||||
// functions, inlined (inline)
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Conversion function for command line argument arrays
|
||||
//! @note This function is used each type for which no template specialization
|
||||
//! exist (which will cause errors if the type does not fulfill the std::vector
|
||||
//! interface).
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
template<class T>
|
||||
/*static*/ inline bool
|
||||
CmdArgReader::convertToT( const std::string& element, T& val)
|
||||
{
|
||||
// preallocate storage
|
||||
val.resize( std::count( element.begin(), element.end(), ',') + 1);
|
||||
|
||||
unsigned int i = 0;
|
||||
std::string::size_type pos_start = 1; // leave array prefix '['
|
||||
std::string::size_type pos_end = 0;
|
||||
|
||||
// do for all elements of the comma seperated list
|
||||
while( std::string::npos != ( pos_end = element.find(',', pos_end+1)) )
|
||||
{
|
||||
// convert each element by the appropriate function
|
||||
if ( ! convertToT< typename T::value_type >(
|
||||
std::string( element, pos_start, pos_end - pos_start), val[i]))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
pos_start = pos_end + 1;
|
||||
++i;
|
||||
}
|
||||
|
||||
std::string tmp1( element, pos_start, element.length() - pos_start - 1);
|
||||
|
||||
// process last element (leave array postfix ']')
|
||||
if ( ! convertToT< typename T::value_type >( std::string( element,
|
||||
pos_start,
|
||||
element.length() - pos_start - 1),
|
||||
val[i]))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
// possible to process all elements?
|
||||
return true;
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Conversion function for command line arguments of type int
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
template<>
|
||||
inline bool
|
||||
CmdArgReader::convertToT<int>( const std::string& element, int& val)
|
||||
{
|
||||
std::istringstream ios( element);
|
||||
ios >> val;
|
||||
|
||||
bool ret_val = false;
|
||||
if ( ios.eof())
|
||||
{
|
||||
ret_val = true;
|
||||
}
|
||||
|
||||
return ret_val;
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Conversion function for command line arguments of type float
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
template<>
|
||||
inline bool
|
||||
CmdArgReader::convertToT<float>( const std::string& element, float& val)
|
||||
{
|
||||
std::istringstream ios( element);
|
||||
ios >> val;
|
||||
|
||||
bool ret_val = false;
|
||||
if ( ios.eof())
|
||||
{
|
||||
ret_val = true;
|
||||
}
|
||||
|
||||
return ret_val;
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Conversion function for command line arguments of type double
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
template<>
|
||||
inline bool
|
||||
CmdArgReader::convertToT<double>( const std::string& element, double& val)
|
||||
{
|
||||
std::istringstream ios( element);
|
||||
ios >> val;
|
||||
|
||||
bool ret_val = false;
|
||||
if ( ios.eof())
|
||||
{
|
||||
ret_val = true;
|
||||
}
|
||||
|
||||
return ret_val;
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Conversion function for command line arguments of type string
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
template<>
|
||||
inline bool
|
||||
CmdArgReader::convertToT<std::string>( const std::string& element,
|
||||
std::string& val)
|
||||
{
|
||||
val = element;
|
||||
return true;
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Conversion function for command line arguments of type bool
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
template<>
|
||||
inline bool
|
||||
CmdArgReader::convertToT<bool>( const std::string& element, bool& val)
|
||||
{
|
||||
// check if value is given as string-type { true | false }
|
||||
if ( "true" == element)
|
||||
{
|
||||
val = true;
|
||||
return true;
|
||||
}
|
||||
else if ( "false" == element)
|
||||
{
|
||||
val = false;
|
||||
return true;
|
||||
}
|
||||
// check if argument is given as integer { 0 | 1 }
|
||||
else
|
||||
{
|
||||
int tmp;
|
||||
if ( convertToT<int>( element, tmp))
|
||||
{
|
||||
if ( 1 == tmp)
|
||||
{
|
||||
val = true;
|
||||
return true;
|
||||
}
|
||||
else if ( 0 == tmp)
|
||||
{
|
||||
val = false;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Get the value of the command line argument with given name
|
||||
//! @return A const handle to the requested argument. If the argument does
|
||||
//! not exist or if it is not from type T NULL is returned
|
||||
//! @param T the type of the argument requested
|
||||
//! @param name the name of the requested argument
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
template<class T>
|
||||
/*static*/ const T*
|
||||
CmdArgReader::getArg( const std::string& name)
|
||||
{
|
||||
if( ! self)
|
||||
{
|
||||
RUNTIME_EXCEPTION("CmdArgReader::getArg(): CmdArgReader not initialized.");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
return self->getArgHelper<T>( name);
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Check if a command line argument with the given name exists
|
||||
//! @return true if a command line argument with name \a name exists,
|
||||
//! otherwise false
|
||||
//! @param name name of the command line argument in question
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
/*static*/ inline bool
|
||||
CmdArgReader::existArg( const std::string& name)
|
||||
{
|
||||
if( ! self)
|
||||
{
|
||||
RUNTIME_EXCEPTION("CmdArgReader::getArg(): CmdArgReader not initialized.");
|
||||
return false;
|
||||
}
|
||||
|
||||
return self->existArgHelper( name);
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! @brief Get the value of the command line argument with given name
|
||||
//! @return A const handle to the requested argument. If the argument does
|
||||
//! not exist or if it is not from type T NULL is returned
|
||||
//! @param T the type of the argument requested
|
||||
//! @param name the name of the requested argument
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
template<class T>
|
||||
const T*
|
||||
CmdArgReader::getArgHelper( const std::string& name)
|
||||
{
|
||||
// check if argument already processed and stored in correct type
|
||||
if ( args.end() != (iter = args.find( name)))
|
||||
{
|
||||
if ( (*(iter->second.first)) == typeid( T) )
|
||||
{
|
||||
return (T*) iter->second.second;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
T* tmp = new T;
|
||||
|
||||
// check the array with unprocessed values
|
||||
if ( unprocessed.end() != (iter_unprocessed = unprocessed.find( name)))
|
||||
{
|
||||
// try to "cast" the string to the type requested
|
||||
if ( convertToT< T >( iter_unprocessed->second, *tmp))
|
||||
{
|
||||
// add the token element pair to map of already converted values
|
||||
args[name] = std::make_pair( &(typeid( T)), (void*) tmp);
|
||||
|
||||
return tmp;
|
||||
}
|
||||
}
|
||||
|
||||
// not used while not inserted into the map -> cleanup
|
||||
delete tmp;
|
||||
}
|
||||
|
||||
// failed, argument not available
|
||||
return NULL;
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Check if a command line argument with name \a name exists
|
||||
//! @return true if a command line argument of name \a name exists,
|
||||
//! otherwise false
|
||||
//! @param name the name of the requested argument
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
inline bool
|
||||
CmdArgReader::existArgHelper( const std::string& name) const
|
||||
{
|
||||
bool ret_val = false;
|
||||
|
||||
// check if argument already processed and stored in correct type
|
||||
if( args.end() != args.find( name))
|
||||
{
|
||||
ret_val = true;
|
||||
}
|
||||
else
|
||||
{
|
||||
|
||||
// check the array with unprocessed values
|
||||
if ( unprocessed.end() != unprocessed.find( name))
|
||||
{
|
||||
ret_val = true;
|
||||
}
|
||||
}
|
||||
|
||||
return ret_val;
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Get the original / raw argc program argument
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
/*static*/ inline int&
|
||||
CmdArgReader::getRArgc()
|
||||
{
|
||||
if( ! self)
|
||||
{
|
||||
RUNTIME_EXCEPTION("CmdArgReader::getRArgc(): CmdArgReader not initialized.");
|
||||
}
|
||||
|
||||
return rargc;
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Get the original / raw argv program argument
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
/*static*/ inline char**&
|
||||
CmdArgReader::getRArgv()
|
||||
{
|
||||
if( ! self)
|
||||
{
|
||||
RUNTIME_EXCEPTION("CmdArgReader::getRArgc(): CmdArgReader not initialized.");
|
||||
}
|
||||
|
||||
return rargv;
|
||||
}
|
||||
|
||||
// functions, exported (extern)
|
||||
|
||||
#endif // #ifndef _CMDARGREADER_H_
|
||||
151
tests/opencl/dotproduct/exception.h
Normal file
151
tests/opencl/dotproduct/exception.h
Normal file
@@ -0,0 +1,151 @@
|
||||
/*
|
||||
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||
* with this source code for terms and conditions that govern your use of
|
||||
* this software. Any use, reproduction, disclosure, or distribution of
|
||||
* this software and related documentation outside the terms of the EULA
|
||||
* is strictly prohibited.
|
||||
*
|
||||
*/
|
||||
|
||||
/* CUda UTility Library */
|
||||
#ifndef _EXCEPTION_H_
|
||||
#define _EXCEPTION_H_
|
||||
|
||||
// includes, system
|
||||
#include <exception>
|
||||
#include <stdexcept>
|
||||
#include <iostream>
|
||||
#include <stdlib.h>
|
||||
|
||||
//! Exception wrapper.
//! Objects are created only through the static throw_it() functions; both
//! constructors are private.
//! @param Std_Exception  Exception out of namespace std for easy typing.
template<class Std_Exception>
class Exception : public Std_Exception
{
public:

    //! @brief Static construction interface
    //! @return Never returns normally, always throws an Exception
    //! @param file  file in which the Exception occurs
    //! @param line  line in which the Exception occurs
    //! @param detailed  details on the code fragment causing the Exception
    static void throw_it( const char* file,
                          const int line,
                          const char* detailed = "-" );

    //! @brief Static construction interface, std::string flavour
    //! @return Never returns normally, always throws an Exception
    //! @param file  file in which the Exception occurs
    //! @param line  line in which the Exception occurs
    //! @param detailed  details on the code fragment causing the Exception
    static void throw_it( const char* file,
                          const int line,
                          const std::string& detailed);

    //! Destructor
    virtual ~Exception() throw();

private:

    //! Constructor, default (private)
    Exception();

    //! Constructor, standard (private)
    //! @param str  string returned by what()
    Exception( const std::string& str);
};
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
//! Exception handler function for arbitrary exceptions: prints what() to
//! std::cerr and terminates the program with EXIT_FAILURE
//! @param ex  exception to handle
////////////////////////////////////////////////////////////////////////////////
template<class Exception_Typ>
inline void
handleException( const Exception_Typ& ex)
{
    // report ...
    std::cerr << ex.what();
    std::cerr << std::endl;

    // ... and give up
    exit( EXIT_FAILURE);
}
|
||||
|
||||
//! Convenience macros
//! Each macro forwards the location of its use (__FILE__, __LINE__) and the
//! given message to Exception<...>::throw_it.

//! Exception caused by dynamic program behavior, e.g. file does not exist
#define RUNTIME_EXCEPTION( msg) Exception<std::runtime_error>::throw_it( __FILE__, __LINE__, msg)

//! Logic exception in program, e.g. an assert failed
#define LOGIC_EXCEPTION( msg) Exception<std::logic_error>::throw_it( __FILE__, __LINE__, msg)

//! Out of range exception
#define RANGE_EXCEPTION( msg) Exception<std::range_error>::throw_it( __FILE__, __LINE__, msg)
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Implementation
|
||||
|
||||
// includes, system
|
||||
#include <sstream>
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Static construction interface.
|
||||
//! @param Exception causing code fragment (file and line) and detailed infos.
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
/*static*/ template<class Std_Exception>
|
||||
void
|
||||
Exception<Std_Exception>::
|
||||
throw_it( const char* file, const int line, const char* detailed)
|
||||
{
|
||||
std::stringstream s;
|
||||
|
||||
// Quiet heavy-weight but exceptions are not for
|
||||
// performance / release versions
|
||||
s << "Exception in file '" << file << "' in line " << line << "\n"
|
||||
<< "Detailed description: " << detailed << "\n";
|
||||
|
||||
throw Exception( s.str());
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Static construction interface.
|
||||
//! @param Exception causing code fragment (file and line) and detailed infos.
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
/*static*/ template<class Std_Exception>
|
||||
void
|
||||
Exception<Std_Exception>::
|
||||
throw_it( const char* file, const int line, const std::string& msg)
|
||||
{
|
||||
throw_it( file, line, msg.c_str());
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Constructor, default (private).
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
template<class Std_Exception>
|
||||
Exception<Std_Exception>::Exception() :
|
||||
Exception("Unknown Exception.\n")
|
||||
{ }
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Constructor, standard (private).
|
||||
//! String returned by what().
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
template<class Std_Exception>
|
||||
Exception<Std_Exception>::Exception( const std::string& s) :
|
||||
Std_Exception( s)
|
||||
{ }
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Destructor
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
template<class Std_Exception>
|
||||
Exception<Std_Exception>::~Exception() throw() { }
|
||||
|
||||
// functions, exported
|
||||
|
||||
#endif // #ifndef _EXCEPTION_H_
|
||||
|
||||
22
tests/opencl/dotproduct/kernel.cl
Normal file
22
tests/opencl/dotproduct/kernel.cl
Normal file
@@ -0,0 +1,22 @@
|
||||
__kernel void DotProduct (__global float* a, __global float* b, __global float* c, int iNumElements)
{
    // index of this work item in the global range
    int iGID = get_global_id(0);

    // only work items inside the valid range produce a result (equivalent to
    // the limit on a 'for' loop in standard/serial C code)
    if (iGID < iNumElements)
    {
        // each work item reads four consecutive floats of a and b,
        // starting at 4 * iGID
        int iInOffset = iGID << 2;

        // dot product of the two 4-element groups, one result per work item
        c[iGID] = a[iInOffset] * b[iInOffset]
                + a[iInOffset + 1] * b[iInOffset + 1]
                + a[iInOffset + 2] * b[iInOffset + 2]
                + a[iInOffset + 3] * b[iInOffset + 3];
    }
}
|
||||
@@ -1,3 +1,4 @@
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/*
|
||||
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
@@ -10,25 +11,26 @@
|
||||
*/
|
||||
|
||||
// *********************************************************************
|
||||
// oclDotProduct Notes:
|
||||
// oclDotProduct Notes:
|
||||
//
|
||||
// A simple OpenCL API demo application that implements a
|
||||
// vector dot product computation between 2 float arrays.
|
||||
// vector dot product computation between 2 float arrays.
|
||||
//
|
||||
// Runs computations with OpenCL on the GPU device and then checks results
|
||||
// Runs computations with OpenCL on the GPU device and then checks results
|
||||
// against basic host CPU/C++ computation.
|
||||
//
|
||||
// Uses 'shr' and 'ocl' functions from oclUtils and shrUtils libraries for compactness.
|
||||
// Uses 'shr' and 'ocl' functions from oclUtils and shrUtils libraries for compactness.
|
||||
// But these are NOT required libs for OpenCL developement in general.
|
||||
// *********************************************************************
|
||||
|
||||
// standard utilities and systems includes
|
||||
#include <oclUtils.h>
|
||||
#include <shrQATest.h>
|
||||
|
||||
#include "oclUtils.h"
|
||||
#include "shrQATest.h"
|
||||
|
||||
// Name of the file with the source code for the computation kernel
|
||||
// *********************************************************************
|
||||
const char* cSourceFile = "DotProduct.cl";
|
||||
const char* cSourceFile = "kernel.pocl";
|
||||
|
||||
// Host buffers for demo
|
||||
// *********************************************************************
|
||||
@@ -43,20 +45,20 @@ cl_command_queue cqCommandQueue;// OpenCL command que
|
||||
cl_program program; // OpenCL program
|
||||
cl_kernel ckKernel; // OpenCL kernel
|
||||
cl_mem cmDevSrcA; // OpenCL device source buffer A
|
||||
cl_mem cmDevSrcB; // OpenCL device source buffer B
|
||||
cl_mem cmDevDst; // OpenCL device destination buffer
|
||||
cl_mem cmDevSrcB; // OpenCL device source buffer B
|
||||
cl_mem cmDevDst; // OpenCL device destination buffer
|
||||
size_t szGlobalWorkSize; // Total # of work items in the 1D range
|
||||
size_t szLocalWorkSize; // # of work items in the 1D work group
|
||||
size_t szLocalWorkSize; // # of work items in the 1D work group
|
||||
size_t szParmDataBytes; // Byte size of context information
|
||||
size_t szKernelLength; // Byte size of kernel code
|
||||
cl_int ciErrNum; // Error code var
|
||||
char* cPathAndName = NULL; // var for full paths to data, src, etc.
|
||||
char* cSourceCL = NULL; // Buffer to hold source for compilation
|
||||
char* cSourceCL = NULL; // Buffer to hold source for compilation
|
||||
const char* cExecutableName = NULL;
|
||||
|
||||
// demo config vars
|
||||
int iNumElements= 1277944; // Length of float arrays to process (odd # for illustration)
|
||||
shrBOOL bNoPrompt = shrFALSE;
|
||||
int iNumElements= 1024; // Length of float arrays to process (odd # for illustration)
|
||||
shrBOOL bNoPrompt = shrFALSE;
|
||||
|
||||
// Forward Declarations
|
||||
// *********************************************************************
|
||||
@@ -67,7 +69,7 @@ void (*pCleanup)(int) = &Cleanup;
|
||||
int *gp_argc = NULL;
|
||||
char ***gp_argv = NULL;
|
||||
|
||||
// Main function
|
||||
// Main function
|
||||
// *********************************************************************
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
@@ -76,38 +78,29 @@ int main(int argc, char **argv)
|
||||
|
||||
shrQAStart(argc, argv);
|
||||
|
||||
// Get the NVIDIA platform
|
||||
ciErrNum = oclGetPlatformID(&cpPlatform);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
|
||||
shrLog("clGetPlatformID...\n");
|
||||
cl_uint uiNumComputeUnits;
|
||||
|
||||
// Get the NVIDIA platform
|
||||
ciErrNum = oclGetPlatformID(&cpPlatform);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
|
||||
shrLog("clGetPlatformID...\n");
|
||||
ciErrNum = clGetPlatformIDs(1, &cpPlatform, NULL);
|
||||
oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
|
||||
|
||||
cl_uint uiNumDevices = 1;
|
||||
cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id));
|
||||
cl_uint uiTargetDevice = 0;
|
||||
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, 1, &cdDevices[uiTargetDevice], NULL);
|
||||
oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
|
||||
|
||||
//Get all the devices
|
||||
cl_uint uiNumDevices = 0; // Number of devices available
|
||||
cl_uint uiTargetDevice = 0; // Default Device to compute on
|
||||
cl_uint uiNumComputeUnits; // Number of compute units (SM's on NV GPU)
|
||||
shrLog("Get the Device info and select Device...\n");
|
||||
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, 0, NULL, &uiNumDevices);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
|
||||
cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) );
|
||||
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, uiNumDevices, cdDevices, NULL);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
|
||||
|
||||
// Get command line device options and config accordingly
|
||||
shrLog(" # of Devices Available = %u\n", uiNumDevices);
|
||||
if(shrGetCmdLineArgumentu(argc, (const char**)argv, "device", &uiTargetDevice)== shrTRUE)
|
||||
shrLog(" # of Devices Available = %u\n", uiNumDevices);
|
||||
if(shrGetCmdLineArgumentu(argc, (const char**)argv, "device", &uiTargetDevice)== shrTRUE)
|
||||
{
|
||||
uiTargetDevice = CLAMP(uiTargetDevice, 0, (uiNumDevices - 1));
|
||||
}
|
||||
shrLog(" Using Device %u: ", uiTargetDevice);
|
||||
shrLog(" Using Device %u: ", uiTargetDevice);
|
||||
oclPrintDevName(LOGBOTH, cdDevices[uiTargetDevice]);
|
||||
ciErrNum = clGetDeviceInfo(cdDevices[uiTargetDevice], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(uiNumComputeUnits), &uiNumComputeUnits, NULL);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
|
||||
shrLog("\n # of Compute Units = %u\n", uiNumComputeUnits);
|
||||
oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
|
||||
shrLog("\n # of Compute Units = %u\n", uiNumComputeUnits);
|
||||
|
||||
// get command line arg for quick test, if provided
|
||||
bNoPrompt = shrCheckCmdLineFlag(argc, (const char**)argv, "noprompt");
|
||||
@@ -115,16 +108,16 @@ int main(int argc, char **argv)
|
||||
// start logs
|
||||
cExecutableName = argv[0];
|
||||
shrSetLogFileName ("oclDotProduct.txt");
|
||||
shrLog("%s Starting...\n\n# of float elements per Array \t= %u\n", argv[0], iNumElements);
|
||||
shrLog("%s Starting...\n\n# of float elements per Array \t= %u\n", argv[0], iNumElements);
|
||||
|
||||
// set and log Global and Local work size dimensions
|
||||
szLocalWorkSize = 256;
|
||||
szLocalWorkSize = 16;
|
||||
szGlobalWorkSize = shrRoundUp((int)szLocalWorkSize, iNumElements); // rounded up to the nearest multiple of the LocalWorkSize
|
||||
shrLog("Global Work Size \t\t= %u\nLocal Work Size \t\t= %u\n# of Work Groups \t\t= %u\n\n",
|
||||
szGlobalWorkSize, szLocalWorkSize, (szGlobalWorkSize % szLocalWorkSize + szGlobalWorkSize/szLocalWorkSize));
|
||||
shrLog("Global Work Size \t\t= %u\nLocal Work Size \t\t= %u\n# of Work Groups \t\t= %u\n\n",
|
||||
szGlobalWorkSize, szLocalWorkSize, (szGlobalWorkSize % szLocalWorkSize + szGlobalWorkSize/szLocalWorkSize));
|
||||
|
||||
// Allocate and initialize host arrays
|
||||
shrLog( "Allocate and Init Host Mem...\n");
|
||||
shrLog( "Allocate and Init Host Mem...\n");
|
||||
srcA = (void *)malloc(sizeof(cl_float4) * szGlobalWorkSize);
|
||||
srcB = (void *)malloc(sizeof(cl_float4) * szGlobalWorkSize);
|
||||
dst = (void *)malloc(sizeof(cl_float) * szGlobalWorkSize);
|
||||
@@ -134,49 +127,50 @@ int main(int argc, char **argv)
|
||||
|
||||
// Get the NVIDIA platform
|
||||
ciErrNum = oclGetPlatformID(&cpPlatform);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
|
||||
// Get a GPU device
|
||||
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, 1, &cdDevices[uiTargetDevice], NULL);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
|
||||
// Create the context
|
||||
cxGPUContext = clCreateContext(0, 1, &cdDevices[uiTargetDevice], NULL, NULL, &ciErrNum);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
|
||||
// Create a command-queue
|
||||
shrLog("clCreateCommandQueue...\n");
|
||||
shrLog("clCreateCommandQueue...\n");
|
||||
cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevices[uiTargetDevice], 0, &ciErrNum);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
|
||||
// Allocate the OpenCL buffer memory objects for source and result on the device GMEM
|
||||
shrLog("clCreateBuffer (SrcA, SrcB and Dst in Device GMEM)...\n");
|
||||
shrLog("clCreateBuffer (SrcA, SrcB and Dst in Device GMEM)...\n");
|
||||
cmDevSrcA = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, sizeof(cl_float) * szGlobalWorkSize * 4, NULL, &ciErrNum);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
cmDevSrcB = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, sizeof(cl_float) * szGlobalWorkSize * 4, NULL, &ciErrNum);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
cmDevDst = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, sizeof(cl_float) * szGlobalWorkSize, NULL, &ciErrNum);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
|
||||
// Read the OpenCL kernel in from source file
|
||||
shrLog("oclLoadProgSource (%s)...\n", cSourceFile);
|
||||
shrLog("oclLoadProgSource (%s)...\n", cSourceFile);
|
||||
cPathAndName = shrFindFilePath(cSourceFile, argv[0]);
|
||||
//oclCheckErrorEX(cPathAndName != NULL, shrTRUE, pCleanup);
|
||||
oclCheckErrorEX(cPathAndName != NULL, shrTRUE, pCleanup);
|
||||
cSourceCL = oclLoadProgSource(cPathAndName, "", &szKernelLength);
|
||||
//oclCheckErrorEX(cSourceCL != NULL, shrTRUE, pCleanup);
|
||||
oclCheckErrorEX(cSourceCL != NULL, shrTRUE, pCleanup);
|
||||
|
||||
// Create the program
|
||||
shrLog("clCreateProgramWithSource...\n");
|
||||
//program = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &szKernelLength, &ciErrNum);
|
||||
shrLog("clCreateProgramWithSource...\n");
|
||||
cl_int binary_status;
|
||||
cl_program program =
|
||||
clCreateProgramWithBuiltInKernels(context, 1, &device_id, "sgemm", NULL);
|
||||
clCreateProgramWithBinary(cxGPUContext, 1, cdDevices, &szKernelLength, (const uint8_t**)&cSourceCL, &binary_status, &ciErrNum);
|
||||
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
// Build the program with 'mad' Optimization option
|
||||
#ifdef MAC
|
||||
char* flags = "-cl-fast-relaxed-math -DMAC";
|
||||
#else
|
||||
char* flags = "-cl-fast-relaxed-math";
|
||||
#endif
|
||||
shrLog("clBuildProgram...\n");
|
||||
shrLog("clBuildProgram...\n");
|
||||
ciErrNum = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
|
||||
if (ciErrNum != CL_SUCCESS)
|
||||
{
|
||||
@@ -184,47 +178,50 @@ int main(int argc, char **argv)
|
||||
shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
|
||||
oclLogBuildInfo(program, oclGetFirstDev(cxGPUContext));
|
||||
oclLogPtx(program, oclGetFirstDev(cxGPUContext), "oclDotProduct.ptx");
|
||||
Cleanup(EXIT_FAILURE);
|
||||
Cleanup(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
// Create the kernel
|
||||
shrLog("clCreateKernel (DotProduct)...\n");
|
||||
shrLog("clCreateKernel (nDotProduct)...\n");
|
||||
ckKernel = clCreateKernel(program, "DotProduct", &ciErrNum);
|
||||
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
|
||||
// Set the Argument values
|
||||
shrLog("clSetKernelArg 0 - 3...\n\n");
|
||||
shrLog("clSetKernelArg 0 - 3...\n\n");
|
||||
ciErrNum = clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void*)&cmDevSrcA);
|
||||
ciErrNum |= clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void*)&cmDevSrcB);
|
||||
ciErrNum |= clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (void*)&cmDevDst);
|
||||
ciErrNum |= clSetKernelArg(ckKernel, 3, sizeof(cl_int), (void*)&iNumElements);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
|
||||
// --------------------------------------------------------
|
||||
// Core sequence... copy input data to GPU, compute, copy results back
|
||||
|
||||
// Asynchronous write of data to GPU device
|
||||
shrLog("clEnqueueWriteBuffer (SrcA and SrcB)...\n");
|
||||
shrLog("clEnqueueWriteBuffer (SrcA and SrcB)...\n");
|
||||
ciErrNum = clEnqueueWriteBuffer(cqCommandQueue, cmDevSrcA, CL_FALSE, 0, sizeof(cl_float) * szGlobalWorkSize * 4, srcA, 0, NULL, NULL);
|
||||
ciErrNum |= clEnqueueWriteBuffer(cqCommandQueue, cmDevSrcB, CL_FALSE, 0, sizeof(cl_float) * szGlobalWorkSize * 4, srcB, 0, NULL, NULL);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
|
||||
// Launch kernel
|
||||
shrLog("clEnqueueNDRangeKernel (DotProduct)...\n");
|
||||
shrLog("clEnqueueNDRangeKernel (DotProduct)...\n");
|
||||
ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, ckKernel, 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
|
||||
// Read back results and check accumulated errors
|
||||
shrLog("clEnqueueReadBuffer (Dst)...\n\n");
|
||||
shrLog("clEnqueueReadBuffer (Dst)...\n\n");
|
||||
ciErrNum = clEnqueueReadBuffer(cqCommandQueue, cmDevDst, CL_TRUE, 0, sizeof(cl_float) * szGlobalWorkSize, dst, 0, NULL, NULL);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
|
||||
// Compute and compare results for golden-host and report errors and pass/fail
|
||||
shrLog("Comparing against Host/C++ computation...\n\n");
|
||||
shrLog("Comparing against Host/C++ computation...\n\n");
|
||||
DotProductHost ((const float*)srcA, (const float*)srcB, (float*)Golden, iNumElements);
|
||||
shrBOOL bMatch = shrComparefet((const float*)Golden, (const float*)dst, (unsigned int)iNumElements, 0.0f, 0);
|
||||
|
||||
// Cleanup and leave
|
||||
Cleanup (EXIT_SUCCESS);
|
||||
|
||||
return (bMatch == shrTRUE) ? 0 : 1;
|
||||
}
|
||||
|
||||
// "Golden" Host processing dot product function for comparison purposes
|
||||
@@ -232,13 +229,13 @@ int main(int argc, char **argv)
|
||||
// Reference ("golden") host implementation used to validate the device result.
//
// Both inputs are read as iNumElements groups of 4 consecutive floats; the
// dot product of the i-th group of pfData1 with the i-th group of pfData2 is
// written to pfResult[i].
//
// pfData1, pfData2  input arrays, each read for 4 * iNumElements floats
// pfResult          output array, written for iNumElements floats
// iNumElements      number of 4-float groups; nothing is written when <= 0
void DotProductHost(const float* pfData1, const float* pfData2, float* pfResult, int iNumElements)
{
    int i, j, k;
    // j walks the flat input arrays; it advances 4 positions per output element.
    for (i = 0, j = 0; i < iNumElements; i++)
    {
        pfResult[i] = 0.0f;
        for (k = 0; k < 4; k++, j++)
        {
            pfResult[i] += pfData1[j] * pfData2[j];
        }
    }
}
|
||||
|
||||
@@ -250,7 +247,7 @@ void Cleanup(int iExitCode)
|
||||
shrLog("Starting Cleanup...\n\n");
|
||||
if(cPathAndName)free(cPathAndName);
|
||||
if(cSourceCL)free(cSourceCL);
|
||||
if(ckKernel)clReleaseKernel(ckKernel);
|
||||
if(ckKernel)clReleaseKernel(ckKernel);
|
||||
if(program)clReleaseProgram(program);
|
||||
if(cqCommandQueue)clReleaseCommandQueue(cqCommandQueue);
|
||||
if(cxGPUContext)clReleaseContext(cxGPUContext);
|
||||
@@ -259,7 +256,7 @@ void Cleanup(int iExitCode)
|
||||
if (cmDevDst)clReleaseMemObject(cmDevDst);
|
||||
|
||||
// Free host memory
|
||||
free(srcA);
|
||||
free(srcA);
|
||||
free(srcB);
|
||||
free (dst);
|
||||
free(Golden);
|
||||
@@ -267,4 +264,4 @@ void Cleanup(int iExitCode)
|
||||
if (cdDevices) free(cdDevices);
|
||||
|
||||
shrQAFinishExit(*gp_argc, (const char **)*gp_argv, (iExitCode == EXIT_SUCCESS) ? QA_PASSED : QA_FAILED);
|
||||
}
|
||||
}
|
||||
806
tests/opencl/dotproduct/oclUtils.cpp
Normal file
806
tests/opencl/dotproduct/oclUtils.cpp
Normal file
@@ -0,0 +1,806 @@
|
||||
/*
|
||||
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||
* with this source code for terms and conditions that govern your use of
|
||||
* this software. Any use, reproduction, disclosure, or distribution of
|
||||
* this software and related documentation outside the terms of the EULA
|
||||
* is strictly prohibited.
|
||||
*
|
||||
*/
|
||||
|
||||
// *********************************************************************
|
||||
// Utilities specific to OpenCL samples in NVIDIA GPU Computing SDK
|
||||
// *********************************************************************
|
||||
|
||||
#include <fstream>
|
||||
#include <vector>
|
||||
#include <iostream>
|
||||
#include <algorithm>
|
||||
#include <stdarg.h>
|
||||
#include "oclUtils.h"
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Gets the platform ID for NVIDIA if available, otherwise default
|
||||
//!
|
||||
//! @return the id
|
||||
//! @param clSelectedPlatformID OpenCL platoform ID
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
cl_int oclGetPlatformID(cl_platform_id* clSelectedPlatformID)
|
||||
{
|
||||
char chBuffer[1024];
|
||||
cl_uint num_platforms;
|
||||
cl_platform_id* clPlatformIDs;
|
||||
cl_int ciErrNum;
|
||||
*clSelectedPlatformID = NULL;
|
||||
|
||||
// Get OpenCL platform count
|
||||
ciErrNum = clGetPlatformIDs (0, NULL, &num_platforms);
|
||||
if (ciErrNum != CL_SUCCESS)
|
||||
{
|
||||
shrLog(" Error %i in clGetPlatformIDs Call !!!\n\n", ciErrNum);
|
||||
return -1000;
|
||||
}
|
||||
else
|
||||
{
|
||||
if(num_platforms == 0)
|
||||
{
|
||||
shrLog("No OpenCL platform found!\n\n");
|
||||
return -2000;
|
||||
}
|
||||
else
|
||||
{
|
||||
// if there's a platform or more, make space for ID's
|
||||
if ((clPlatformIDs = (cl_platform_id*)malloc(num_platforms * sizeof(cl_platform_id))) == NULL)
|
||||
{
|
||||
shrLog("Failed to allocate memory for cl_platform ID's!\n\n");
|
||||
return -3000;
|
||||
}
|
||||
|
||||
// get platform info for each platform and trap the NVIDIA platform if found
|
||||
ciErrNum = clGetPlatformIDs (num_platforms, clPlatformIDs, NULL);
|
||||
for(cl_uint i = 0; i < num_platforms; ++i)
|
||||
{
|
||||
ciErrNum = clGetPlatformInfo (clPlatformIDs[i], CL_PLATFORM_NAME, 1024, &chBuffer, NULL);
|
||||
if(ciErrNum == CL_SUCCESS)
|
||||
{
|
||||
if(strstr(chBuffer, "NVIDIA") != NULL)
|
||||
{
|
||||
*clSelectedPlatformID = clPlatformIDs[i];
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// default to zeroeth platform if NVIDIA not found
|
||||
if(*clSelectedPlatformID == NULL)
|
||||
{
|
||||
shrLog("WARNING: NVIDIA OpenCL platform not found - defaulting to first platform!\n\n");
|
||||
*clSelectedPlatformID = clPlatformIDs[0];
|
||||
}
|
||||
|
||||
free(clPlatformIDs);
|
||||
}
|
||||
}
|
||||
|
||||
return CL_SUCCESS;
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Print the device name
|
||||
//!
|
||||
//! @param iLogMode enum LOGBOTH, LOGCONSOLE, LOGFILE
|
||||
//! @param device OpenCL id of the device
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
void oclPrintDevName(int iLogMode, cl_device_id device)
|
||||
{
|
||||
char device_string[1024];
|
||||
clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_string), &device_string, NULL);
|
||||
shrLogEx(iLogMode, 0, "%s", device_string);
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Print info about the device
|
||||
//!
|
||||
//! @param iLogMode enum LOGBOTH, LOGCONSOLE, LOGFILE
|
||||
//! @param device OpenCL id of the device
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
void oclPrintDevInfo(int iLogMode, cl_device_id device)
|
||||
{
|
||||
char device_string[1024];
|
||||
bool nv_device_attibute_query = false;
|
||||
|
||||
// CL_DEVICE_NAME
|
||||
clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_string), &device_string, NULL);
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_NAME: \t\t\t%s\n", device_string);
|
||||
|
||||
// CL_DEVICE_VENDOR
|
||||
clGetDeviceInfo(device, CL_DEVICE_VENDOR, sizeof(device_string), &device_string, NULL);
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_VENDOR: \t\t\t%s\n", device_string);
|
||||
|
||||
// CL_DRIVER_VERSION
|
||||
clGetDeviceInfo(device, CL_DRIVER_VERSION, sizeof(device_string), &device_string, NULL);
|
||||
shrLogEx(iLogMode, 0, " CL_DRIVER_VERSION: \t\t\t%s\n", device_string);
|
||||
|
||||
// CL_DEVICE_VERSION
|
||||
clGetDeviceInfo(device, CL_DEVICE_VERSION, sizeof(device_string), &device_string, NULL);
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_VERSION: \t\t\t%s\n", device_string);
|
||||
|
||||
// CL_DEVICE_OPENCL_C_VERSION (if CL_DEVICE_VERSION version > 1.0)
|
||||
if(strncmp("OpenCL 1.0", device_string, 10) != 0)
|
||||
{
|
||||
// This code is unused for devices reporting OpenCL 1.0, but a def is needed anyway to allow compilation using v 1.0 headers
|
||||
// This constant isn't #defined in 1.0
|
||||
#ifndef CL_DEVICE_OPENCL_C_VERSION
|
||||
#define CL_DEVICE_OPENCL_C_VERSION 0x103D
|
||||
#endif
|
||||
|
||||
clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_VERSION, sizeof(device_string), &device_string, NULL);
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_OPENCL_C_VERSION: \t\t%s\n", device_string);
|
||||
}
|
||||
|
||||
// CL_DEVICE_TYPE
|
||||
cl_device_type type;
|
||||
clGetDeviceInfo(device, CL_DEVICE_TYPE, sizeof(type), &type, NULL);
|
||||
if( type & CL_DEVICE_TYPE_CPU )
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_TYPE:\t\t\t%s\n", "CL_DEVICE_TYPE_CPU");
|
||||
if( type & CL_DEVICE_TYPE_GPU )
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_TYPE:\t\t\t%s\n", "CL_DEVICE_TYPE_GPU");
|
||||
if( type & CL_DEVICE_TYPE_ACCELERATOR )
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_TYPE:\t\t\t%s\n", "CL_DEVICE_TYPE_ACCELERATOR");
|
||||
if( type & CL_DEVICE_TYPE_DEFAULT )
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_TYPE:\t\t\t%s\n", "CL_DEVICE_TYPE_DEFAULT");
|
||||
|
||||
// CL_DEVICE_MAX_COMPUTE_UNITS
|
||||
cl_uint compute_units;
|
||||
clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(compute_units), &compute_units, NULL);
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_MAX_COMPUTE_UNITS:\t\t%u\n", compute_units);
|
||||
|
||||
// CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS
|
||||
size_t workitem_dims;
|
||||
clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(workitem_dims), &workitem_dims, NULL);
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS:\t%u\n", workitem_dims);
|
||||
|
||||
// CL_DEVICE_MAX_WORK_ITEM_SIZES
|
||||
size_t workitem_size[3];
|
||||
clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(workitem_size), &workitem_size, NULL);
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_MAX_WORK_ITEM_SIZES:\t%u / %u / %u \n", workitem_size[0], workitem_size[1], workitem_size[2]);
|
||||
|
||||
// CL_DEVICE_MAX_WORK_GROUP_SIZE
|
||||
size_t workgroup_size;
|
||||
clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(workgroup_size), &workgroup_size, NULL);
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_MAX_WORK_GROUP_SIZE:\t%u\n", workgroup_size);
|
||||
|
||||
// CL_DEVICE_MAX_CLOCK_FREQUENCY
|
||||
cl_uint clock_frequency;
|
||||
clGetDeviceInfo(device, CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(clock_frequency), &clock_frequency, NULL);
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_MAX_CLOCK_FREQUENCY:\t%u MHz\n", clock_frequency);
|
||||
|
||||
// CL_DEVICE_ADDRESS_BITS
|
||||
cl_uint addr_bits;
|
||||
clGetDeviceInfo(device, CL_DEVICE_ADDRESS_BITS, sizeof(addr_bits), &addr_bits, NULL);
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_ADDRESS_BITS:\t\t%u\n", addr_bits);
|
||||
|
||||
// CL_DEVICE_MAX_MEM_ALLOC_SIZE
|
||||
cl_ulong max_mem_alloc_size;
|
||||
clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(max_mem_alloc_size), &max_mem_alloc_size, NULL);
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_MAX_MEM_ALLOC_SIZE:\t\t%u MByte\n", (unsigned int)(max_mem_alloc_size / (1024 * 1024)));
|
||||
|
||||
// CL_DEVICE_GLOBAL_MEM_SIZE
|
||||
cl_ulong mem_size;
|
||||
clGetDeviceInfo(device, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(mem_size), &mem_size, NULL);
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_GLOBAL_MEM_SIZE:\t\t%u MByte\n", (unsigned int)(mem_size / (1024 * 1024)));
|
||||
|
||||
// CL_DEVICE_ERROR_CORRECTION_SUPPORT
|
||||
cl_bool error_correction_support;
|
||||
clGetDeviceInfo(device, CL_DEVICE_ERROR_CORRECTION_SUPPORT, sizeof(error_correction_support), &error_correction_support, NULL);
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_ERROR_CORRECTION_SUPPORT:\t%s\n", error_correction_support == CL_TRUE ? "yes" : "no");
|
||||
|
||||
// CL_DEVICE_LOCAL_MEM_TYPE
|
||||
cl_device_local_mem_type local_mem_type;
|
||||
clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_TYPE, sizeof(local_mem_type), &local_mem_type, NULL);
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_LOCAL_MEM_TYPE:\t\t%s\n", local_mem_type == 1 ? "local" : "global");
|
||||
|
||||
// CL_DEVICE_LOCAL_MEM_SIZE
|
||||
clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(mem_size), &mem_size, NULL);
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_LOCAL_MEM_SIZE:\t\t%u KByte\n", (unsigned int)(mem_size / 1024));
|
||||
|
||||
// CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE
|
||||
clGetDeviceInfo(device, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof(mem_size), &mem_size, NULL);
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE:\t%u KByte\n", (unsigned int)(mem_size / 1024));
|
||||
|
||||
// CL_DEVICE_QUEUE_PROPERTIES
|
||||
cl_command_queue_properties queue_properties;
|
||||
clGetDeviceInfo(device, CL_DEVICE_QUEUE_PROPERTIES, sizeof(queue_properties), &queue_properties, NULL);
|
||||
if( queue_properties & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE )
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_QUEUE_PROPERTIES:\t\t%s\n", "CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE");
|
||||
if( queue_properties & CL_QUEUE_PROFILING_ENABLE )
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_QUEUE_PROPERTIES:\t\t%s\n", "CL_QUEUE_PROFILING_ENABLE");
|
||||
|
||||
// CL_DEVICE_IMAGE_SUPPORT
|
||||
cl_bool image_support;
|
||||
clGetDeviceInfo(device, CL_DEVICE_IMAGE_SUPPORT, sizeof(image_support), &image_support, NULL);
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_IMAGE_SUPPORT:\t\t%u\n", image_support);
|
||||
|
||||
// CL_DEVICE_MAX_READ_IMAGE_ARGS
|
||||
cl_uint max_read_image_args;
|
||||
clGetDeviceInfo(device, CL_DEVICE_MAX_READ_IMAGE_ARGS, sizeof(max_read_image_args), &max_read_image_args, NULL);
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_MAX_READ_IMAGE_ARGS:\t%u\n", max_read_image_args);
|
||||
|
||||
// CL_DEVICE_MAX_WRITE_IMAGE_ARGS
|
||||
cl_uint max_write_image_args;
|
||||
clGetDeviceInfo(device, CL_DEVICE_MAX_WRITE_IMAGE_ARGS, sizeof(max_write_image_args), &max_write_image_args, NULL);
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_MAX_WRITE_IMAGE_ARGS:\t%u\n", max_write_image_args);
|
||||
|
||||
// CL_DEVICE_SINGLE_FP_CONFIG
|
||||
cl_device_fp_config fp_config;
|
||||
clGetDeviceInfo(device, CL_DEVICE_SINGLE_FP_CONFIG, sizeof(cl_device_fp_config), &fp_config, NULL);
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_SINGLE_FP_CONFIG:\t\t%s%s%s%s%s%s\n",
|
||||
fp_config & CL_FP_DENORM ? "denorms " : "",
|
||||
fp_config & CL_FP_INF_NAN ? "INF-quietNaNs " : "",
|
||||
fp_config & CL_FP_ROUND_TO_NEAREST ? "round-to-nearest " : "",
|
||||
fp_config & CL_FP_ROUND_TO_ZERO ? "round-to-zero " : "",
|
||||
fp_config & CL_FP_ROUND_TO_INF ? "round-to-inf " : "",
|
||||
fp_config & CL_FP_FMA ? "fma " : "");
|
||||
|
||||
// CL_DEVICE_IMAGE2D_MAX_WIDTH, CL_DEVICE_IMAGE2D_MAX_HEIGHT, CL_DEVICE_IMAGE3D_MAX_WIDTH, CL_DEVICE_IMAGE3D_MAX_HEIGHT, CL_DEVICE_IMAGE3D_MAX_DEPTH
|
||||
size_t szMaxDims[5];
|
||||
shrLogEx(iLogMode, 0, "\n CL_DEVICE_IMAGE <dim>");
|
||||
clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_WIDTH, sizeof(size_t), &szMaxDims[0], NULL);
|
||||
shrLogEx(iLogMode, 0, "\t\t\t2D_MAX_WIDTH\t %u\n", szMaxDims[0]);
|
||||
clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_HEIGHT, sizeof(size_t), &szMaxDims[1], NULL);
|
||||
shrLogEx(iLogMode, 0, "\t\t\t\t\t2D_MAX_HEIGHT\t %u\n", szMaxDims[1]);
|
||||
clGetDeviceInfo(device, CL_DEVICE_IMAGE3D_MAX_WIDTH, sizeof(size_t), &szMaxDims[2], NULL);
|
||||
shrLogEx(iLogMode, 0, "\t\t\t\t\t3D_MAX_WIDTH\t %u\n", szMaxDims[2]);
|
||||
clGetDeviceInfo(device, CL_DEVICE_IMAGE3D_MAX_HEIGHT, sizeof(size_t), &szMaxDims[3], NULL);
|
||||
shrLogEx(iLogMode, 0, "\t\t\t\t\t3D_MAX_HEIGHT\t %u\n", szMaxDims[3]);
|
||||
clGetDeviceInfo(device, CL_DEVICE_IMAGE3D_MAX_DEPTH, sizeof(size_t), &szMaxDims[4], NULL);
|
||||
shrLogEx(iLogMode, 0, "\t\t\t\t\t3D_MAX_DEPTH\t %u\n", szMaxDims[4]);
|
||||
|
||||
// CL_DEVICE_EXTENSIONS: get device extensions, and if any then parse & log the string onto separate lines
|
||||
clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, sizeof(device_string), &device_string, NULL);
|
||||
if (device_string != 0)
|
||||
{
|
||||
shrLogEx(iLogMode, 0, "\n CL_DEVICE_EXTENSIONS:");
|
||||
std::string stdDevString;
|
||||
stdDevString = std::string(device_string);
|
||||
size_t szOldPos = 0;
|
||||
size_t szSpacePos = stdDevString.find(' ', szOldPos); // extensions string is space delimited
|
||||
while (szSpacePos != stdDevString.npos)
|
||||
{
|
||||
if( strcmp("cl_nv_device_attribute_query", stdDevString.substr(szOldPos, szSpacePos - szOldPos).c_str()) == 0 )
|
||||
nv_device_attibute_query = true;
|
||||
|
||||
if (szOldPos > 0)
|
||||
{
|
||||
shrLogEx(iLogMode, 0, "\t\t");
|
||||
}
|
||||
shrLogEx(iLogMode, 0, "\t\t\t%s\n", stdDevString.substr(szOldPos, szSpacePos - szOldPos).c_str());
|
||||
|
||||
do {
|
||||
szOldPos = szSpacePos + 1;
|
||||
szSpacePos = stdDevString.find(' ', szOldPos);
|
||||
} while (szSpacePos == szOldPos);
|
||||
}
|
||||
shrLogEx(iLogMode, 0, "\n");
|
||||
}
|
||||
else
|
||||
{
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_EXTENSIONS: None\n");
|
||||
}
|
||||
|
||||
if(nv_device_attibute_query)
|
||||
{
|
||||
cl_uint compute_capability_major, compute_capability_minor;
|
||||
clGetDeviceInfo(device, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, sizeof(cl_uint), &compute_capability_major, NULL);
|
||||
clGetDeviceInfo(device, CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV, sizeof(cl_uint), &compute_capability_minor, NULL);
|
||||
shrLogEx(iLogMode, 0, "\n CL_DEVICE_COMPUTE_CAPABILITY_NV:\t%u.%u\n", compute_capability_major, compute_capability_minor);
|
||||
|
||||
shrLogEx(iLogMode, 0, " NUMBER OF MULTIPROCESSORS:\t\t%u\n", compute_units); // this is the same value reported by CL_DEVICE_MAX_COMPUTE_UNITS
|
||||
shrLogEx(iLogMode, 0, " NUMBER OF CUDA CORES:\t\t\t%u\n", ConvertSMVer2Cores(compute_capability_major, compute_capability_minor) * compute_units);
|
||||
|
||||
cl_uint regs_per_block;
|
||||
clGetDeviceInfo(device, CL_DEVICE_REGISTERS_PER_BLOCK_NV, sizeof(cl_uint), ®s_per_block, NULL);
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_REGISTERS_PER_BLOCK_NV:\t%u\n", regs_per_block);
|
||||
|
||||
cl_uint warp_size;
|
||||
clGetDeviceInfo(device, CL_DEVICE_WARP_SIZE_NV, sizeof(cl_uint), &warp_size, NULL);
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_WARP_SIZE_NV:\t\t%u\n", warp_size);
|
||||
|
||||
cl_bool gpu_overlap;
|
||||
clGetDeviceInfo(device, CL_DEVICE_GPU_OVERLAP_NV, sizeof(cl_bool), &gpu_overlap, NULL);
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_GPU_OVERLAP_NV:\t\t%s\n", gpu_overlap == CL_TRUE ? "CL_TRUE" : "CL_FALSE");
|
||||
|
||||
cl_bool exec_timeout;
|
||||
clGetDeviceInfo(device, CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV, sizeof(cl_bool), &exec_timeout, NULL);
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV:\t%s\n", exec_timeout == CL_TRUE ? "CL_TRUE" : "CL_FALSE");
|
||||
|
||||
cl_bool integrated_memory;
|
||||
clGetDeviceInfo(device, CL_DEVICE_INTEGRATED_MEMORY_NV, sizeof(cl_bool), &integrated_memory, NULL);
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_INTEGRATED_MEMORY_NV:\t%s\n", integrated_memory == CL_TRUE ? "CL_TRUE" : "CL_FALSE");
|
||||
}
|
||||
|
||||
// CL_DEVICE_PREFERRED_VECTOR_WIDTH_<type>
|
||||
shrLogEx(iLogMode, 0, " CL_DEVICE_PREFERRED_VECTOR_WIDTH_<t>\t");
|
||||
cl_uint vec_width [6];
|
||||
clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR, sizeof(cl_uint), &vec_width[0], NULL);
|
||||
clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT, sizeof(cl_uint), &vec_width[1], NULL);
|
||||
clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, sizeof(cl_uint), &vec_width[2], NULL);
|
||||
clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG, sizeof(cl_uint), &vec_width[3], NULL);
|
||||
clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT, sizeof(cl_uint), &vec_width[4], NULL);
|
||||
clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE, sizeof(cl_uint), &vec_width[5], NULL);
|
||||
shrLogEx(iLogMode, 0, "CHAR %u, SHORT %u, INT %u, LONG %u, FLOAT %u, DOUBLE %u\n\n\n",
|
||||
vec_width[0], vec_width[1], vec_width[2], vec_width[3], vec_width[4], vec_width[5]);
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Get and return device capability
|
||||
//!
|
||||
//! @return the 2 digit integer representation of device Cap (major minor). return -1 if NA
|
||||
//! @param device OpenCL id of the device
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
int oclGetDevCap(cl_device_id device)
|
||||
{
|
||||
char cDevString[1024];
|
||||
bool bDevAttributeQuery = false;
|
||||
int iDevArch = -1;
|
||||
|
||||
// Get device extensions, and if any then search for cl_nv_device_attribute_query
|
||||
clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, sizeof(cDevString), &cDevString, NULL);
|
||||
if (cDevString != 0)
|
||||
{
|
||||
std::string stdDevString;
|
||||
stdDevString = std::string(cDevString);
|
||||
size_t szOldPos = 0;
|
||||
size_t szSpacePos = stdDevString.find(' ', szOldPos); // extensions string is space delimited
|
||||
while (szSpacePos != stdDevString.npos)
|
||||
{
|
||||
if( strcmp("cl_nv_device_attribute_query", stdDevString.substr(szOldPos, szSpacePos - szOldPos).c_str()) == 0 )
|
||||
{
|
||||
bDevAttributeQuery = true;
|
||||
}
|
||||
|
||||
do {
|
||||
szOldPos = szSpacePos + 1;
|
||||
szSpacePos = stdDevString.find(' ', szOldPos);
|
||||
} while (szSpacePos == szOldPos);
|
||||
}
|
||||
}
|
||||
|
||||
// if search succeeded, get device caps
|
||||
if(bDevAttributeQuery)
|
||||
{
|
||||
cl_int iComputeCapMajor, iComputeCapMinor;
|
||||
clGetDeviceInfo(device, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, sizeof(cl_uint), (void*)&iComputeCapMajor, NULL);
|
||||
clGetDeviceInfo(device, CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV, sizeof(cl_uint), (void*)&iComputeCapMinor, NULL);
|
||||
iDevArch = (10 * iComputeCapMajor) + iComputeCapMinor;
|
||||
}
|
||||
|
||||
return iDevArch;
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Gets the id of the first device from the context
|
||||
//!
|
||||
//! @return the id
|
||||
//! @param cxGPUContext OpenCL context
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
cl_device_id oclGetFirstDev(cl_context cxGPUContext)
|
||||
{
|
||||
size_t szParmDataBytes;
|
||||
cl_device_id* cdDevices;
|
||||
|
||||
// get the list of GPU devices associated with context
|
||||
clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &szParmDataBytes);
|
||||
cdDevices = (cl_device_id*) malloc(szParmDataBytes);
|
||||
|
||||
clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, szParmDataBytes, cdDevices, NULL);
|
||||
|
||||
cl_device_id first = cdDevices[0];
|
||||
free(cdDevices);
|
||||
|
||||
return first;
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Gets the id of device with maximal FLOPS from the context
|
||||
//!
|
||||
//! @return the id
|
||||
//! @param cxGPUContext OpenCL context
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
cl_device_id oclGetMaxFlopsDev(cl_context cxGPUContext)
|
||||
{
|
||||
size_t szParmDataBytes;
|
||||
cl_device_id* cdDevices;
|
||||
|
||||
// get the list of GPU devices associated with context
|
||||
clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &szParmDataBytes);
|
||||
cdDevices = (cl_device_id*) malloc(szParmDataBytes);
|
||||
size_t device_count = szParmDataBytes / sizeof(cl_device_id);
|
||||
|
||||
clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, szParmDataBytes, cdDevices, NULL);
|
||||
|
||||
cl_device_id max_flops_device = cdDevices[0];
|
||||
int max_flops = 0;
|
||||
|
||||
size_t current_device = 0;
|
||||
|
||||
// CL_DEVICE_MAX_COMPUTE_UNITS
|
||||
cl_uint compute_units;
|
||||
clGetDeviceInfo(cdDevices[current_device], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(compute_units), &compute_units, NULL);
|
||||
|
||||
// CL_DEVICE_MAX_CLOCK_FREQUENCY
|
||||
cl_uint clock_frequency;
|
||||
clGetDeviceInfo(cdDevices[current_device], CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(clock_frequency), &clock_frequency, NULL);
|
||||
|
||||
max_flops = compute_units * clock_frequency;
|
||||
++current_device;
|
||||
|
||||
while( current_device < device_count )
|
||||
{
|
||||
// CL_DEVICE_MAX_COMPUTE_UNITS
|
||||
cl_uint compute_units;
|
||||
clGetDeviceInfo(cdDevices[current_device], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(compute_units), &compute_units, NULL);
|
||||
|
||||
// CL_DEVICE_MAX_CLOCK_FREQUENCY
|
||||
cl_uint clock_frequency;
|
||||
clGetDeviceInfo(cdDevices[current_device], CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(clock_frequency), &clock_frequency, NULL);
|
||||
|
||||
int flops = compute_units * clock_frequency;
|
||||
if( flops > max_flops )
|
||||
{
|
||||
max_flops = flops;
|
||||
max_flops_device = cdDevices[current_device];
|
||||
}
|
||||
++current_device;
|
||||
}
|
||||
|
||||
free(cdDevices);
|
||||
|
||||
return max_flops_device;
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
//! Loads a Program file and prepends the cPreamble to the code.
//!
//! The file is read in binary mode and the result is returned as a single
//! NUL-terminated buffer allocated with malloc(); the caller releases it with
//! free().
//!
//! @return the source string if succeeded, 0 otherwise (file cannot be
//!         opened, sized or read, the file is empty, or allocation fails)
//! @param cFilename        program filename
//! @param cPreamble        code that is prepended to the loaded file, typically a set of #defines or a header;
//!                         NULL is treated as an empty preamble
//! @param szFinalLength    returned length of the code string (preamble + file, excluding the
//!                         terminating NUL); may be NULL if the length is not needed
//////////////////////////////////////////////////////////////////////////////
char* oclLoadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength)
{
    if (cFilename == NULL)
    {
        return NULL;
    }
    if (cPreamble == NULL)
    {
        cPreamble = "";
    }

    // open the OpenCL source code file
    FILE* pFileStream = NULL;
    #ifdef _WIN32   // Windows version
        if (fopen_s(&pFileStream, cFilename, "rb") != 0)
        {
            return NULL;
        }
    #else           // Linux version
        pFileStream = fopen(cFilename, "rb");
        if (pFileStream == NULL)
        {
            return NULL;
        }
    #endif

    const size_t szPreambleLength = strlen(cPreamble);

    // get the length of the source code; ftell reports failure as -1, which
    // must not be converted into a (huge) size_t
    long lSourceLength = -1;
    if (fseek(pFileStream, 0, SEEK_END) == 0)
    {
        lSourceLength = ftell(pFileStream);
    }
    if (lSourceLength < 0 || fseek(pFileStream, 0, SEEK_SET) != 0)
    {
        fclose(pFileStream);
        return NULL;
    }
    const size_t szSourceLength = (size_t)lSourceLength;

    // allocate a buffer for the source code string and read it in
    char* cSourceString = (char*)malloc(szSourceLength + szPreambleLength + 1);
    if (cSourceString == NULL)
    {
        fclose(pFileStream);
        return NULL;
    }
    memcpy(cSourceString, cPreamble, szPreambleLength);

    // The file is read as one item of szSourceLength bytes, so a short read
    // (and a zero-length file, for which fread returns 0) counts as a failure.
    if (fread(cSourceString + szPreambleLength, szSourceLength, 1, pFileStream) != 1)
    {
        fclose(pFileStream);
        free(cSourceString);
        return NULL;
    }

    // close the file and return the total length of the combined (preamble + source) string
    fclose(pFileStream);
    if (szFinalLength != NULL)
    {
        *szFinalLength = szSourceLength + szPreambleLength;
    }
    cSourceString[szSourceLength + szPreambleLength] = '\0';

    return cSourceString;
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Gets the id of the nth device from the context
|
||||
//!
|
||||
//! @return the id or -1 when out of range
|
||||
//! @param cxGPUContext OpenCL context
|
||||
//! @param device_idx index of the device of interest
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
cl_device_id oclGetDev(cl_context cxGPUContext, unsigned int nr)
|
||||
{
|
||||
size_t szParmDataBytes;
|
||||
cl_device_id* cdDevices;
|
||||
|
||||
// get the list of GPU devices associated with context
|
||||
clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &szParmDataBytes);
|
||||
|
||||
if( szParmDataBytes / sizeof(cl_device_id) <= nr ) {
|
||||
return (cl_device_id)-1;
|
||||
}
|
||||
|
||||
cdDevices = (cl_device_id*) malloc(szParmDataBytes);
|
||||
|
||||
clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, szParmDataBytes, cdDevices, NULL);
|
||||
|
||||
cl_device_id device = cdDevices[nr];
|
||||
free(cdDevices);
|
||||
|
||||
return device;
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Get the binary (PTX) of the program associated with the device
|
||||
//!
|
||||
//! @param cpProgram OpenCL program
|
||||
//! @param cdDevice device of interest
|
||||
//! @param binary returned code
|
||||
//! @param length length of returned code
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
void oclGetProgBinary( cl_program cpProgram, cl_device_id cdDevice, char** binary, size_t* length)
|
||||
{
|
||||
// Grab the number of devices associated witht the program
|
||||
cl_uint num_devices;
|
||||
clGetProgramInfo(cpProgram, CL_PROGRAM_NUM_DEVICES, sizeof(cl_uint), &num_devices, NULL);
|
||||
|
||||
// Grab the device ids
|
||||
cl_device_id* devices = (cl_device_id*) malloc(num_devices * sizeof(cl_device_id));
|
||||
clGetProgramInfo(cpProgram, CL_PROGRAM_DEVICES, num_devices * sizeof(cl_device_id), devices, 0);
|
||||
|
||||
// Grab the sizes of the binaries
|
||||
size_t* binary_sizes = (size_t*)malloc(num_devices * sizeof(size_t));
|
||||
clGetProgramInfo(cpProgram, CL_PROGRAM_BINARY_SIZES, num_devices * sizeof(size_t), binary_sizes, NULL);
|
||||
|
||||
// Now get the binaries
|
||||
char** ptx_code = (char**) malloc(num_devices * sizeof(char*));
|
||||
for( unsigned int i=0; i<num_devices; ++i) {
|
||||
ptx_code[i]= (char*)malloc(binary_sizes[i]);
|
||||
}
|
||||
clGetProgramInfo(cpProgram, CL_PROGRAM_BINARIES, 0, ptx_code, NULL);
|
||||
|
||||
// Find the index of the device of interest
|
||||
unsigned int idx = 0;
|
||||
while( idx<num_devices && devices[idx] != cdDevice ) ++idx;
|
||||
|
||||
// If it is associated prepare the result
|
||||
if( idx < num_devices )
|
||||
{
|
||||
*binary = ptx_code[idx];
|
||||
*length = binary_sizes[idx];
|
||||
}
|
||||
|
||||
// Cleanup
|
||||
free( devices );
|
||||
free( binary_sizes );
|
||||
for( unsigned int i=0; i<num_devices; ++i) {
|
||||
if( i != idx ) free(ptx_code[i]);
|
||||
}
|
||||
free( ptx_code );
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Get and log the binary (PTX) from the OpenCL compiler for the requested program & device
|
||||
//!
|
||||
//! @param cpProgram OpenCL program
|
||||
//! @param cdDevice device of interest
|
||||
//! @param const char* cPtxFileName optional PTX file name
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
void oclLogPtx(cl_program cpProgram, cl_device_id cdDevice, const char* cPtxFileName)
|
||||
{
|
||||
// Grab the number of devices associated with the program
|
||||
cl_uint num_devices;
|
||||
clGetProgramInfo(cpProgram, CL_PROGRAM_NUM_DEVICES, sizeof(cl_uint), &num_devices, NULL);
|
||||
|
||||
// Grab the device ids
|
||||
cl_device_id* devices = (cl_device_id*) malloc(num_devices * sizeof(cl_device_id));
|
||||
clGetProgramInfo(cpProgram, CL_PROGRAM_DEVICES, num_devices * sizeof(cl_device_id), devices, 0);
|
||||
|
||||
// Grab the sizes of the binaries
|
||||
size_t* binary_sizes = (size_t*)malloc(num_devices * sizeof(size_t));
|
||||
clGetProgramInfo(cpProgram, CL_PROGRAM_BINARY_SIZES, num_devices * sizeof(size_t), binary_sizes, NULL);
|
||||
|
||||
// Now get the binaries
|
||||
char** ptx_code = (char**)malloc(num_devices * sizeof(char*));
|
||||
for( unsigned int i=0; i<num_devices; ++i)
|
||||
{
|
||||
ptx_code[i] = (char*)malloc(binary_sizes[i]);
|
||||
}
|
||||
clGetProgramInfo(cpProgram, CL_PROGRAM_BINARIES, 0, ptx_code, NULL);
|
||||
|
||||
// Find the index of the device of interest
|
||||
unsigned int idx = 0;
|
||||
while((idx < num_devices) && (devices[idx] != cdDevice))
|
||||
{
|
||||
++idx;
|
||||
}
|
||||
|
||||
// If the index is associated, log the result
|
||||
if(idx < num_devices)
|
||||
{
|
||||
|
||||
// if a separate filename is supplied, dump ptx there
|
||||
if (NULL != cPtxFileName)
|
||||
{
|
||||
shrLog("\nWriting ptx to separate file: %s ...\n\n", cPtxFileName);
|
||||
FILE* pFileStream = NULL;
|
||||
#ifdef _WIN32
|
||||
fopen_s(&pFileStream, cPtxFileName, "wb");
|
||||
#else
|
||||
pFileStream = fopen(cPtxFileName, "wb");
|
||||
#endif
|
||||
|
||||
fwrite(ptx_code[idx], binary_sizes[idx], 1, pFileStream);
|
||||
fclose(pFileStream);
|
||||
}
|
||||
else // log to logfile and console if no ptx file specified
|
||||
{
|
||||
shrLog("\n%s\nProgram Binary:\n%s\n%s\n", HDASHLINE, ptx_code[idx], HDASHLINE);
|
||||
}
|
||||
}
|
||||
|
||||
// Cleanup
|
||||
free(devices);
|
||||
free(binary_sizes);
|
||||
for(unsigned int i = 0; i < num_devices; ++i)
|
||||
{
|
||||
free(ptx_code[i]);
|
||||
}
|
||||
free( ptx_code );
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Get and log the binary (PTX) from the OpenCL compiler for the requested program & device
|
||||
//!
|
||||
//! @param cpProgram OpenCL program
|
||||
//! @param cdDevice device of interest
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
void oclLogBuildInfo(cl_program cpProgram, cl_device_id cdDevice)
|
||||
{
|
||||
// write out the build log and ptx, then exit
|
||||
char cBuildLog[10240];
|
||||
clGetProgramBuildInfo(cpProgram, cdDevice, CL_PROGRAM_BUILD_LOG,
|
||||
sizeof(cBuildLog), cBuildLog, NULL );
|
||||
shrLog("\n%s\nBuild Log:\n%s\n%s\n", HDASHLINE, cBuildLog, HDASHLINE);
|
||||
}
|
||||
|
||||
// Helper function for De-allocating cl objects
|
||||
// *********************************************************************
|
||||
void oclDeleteMemObjs(cl_mem* cmMemObjs, int iNumObjs)
|
||||
{
|
||||
int i;
|
||||
for (i = 0; i < iNumObjs; i++)
|
||||
{
|
||||
if (cmMemObjs[i])clReleaseMemObject(cmMemObjs[i]);
|
||||
}
|
||||
}
|
||||
|
||||
// Helper function to get OpenCL error string from constant
|
||||
// *********************************************************************
|
||||
const char* oclErrorString(cl_int error)
|
||||
{
|
||||
static const char* errorString[] = {
|
||||
"CL_SUCCESS",
|
||||
"CL_DEVICE_NOT_FOUND",
|
||||
"CL_DEVICE_NOT_AVAILABLE",
|
||||
"CL_COMPILER_NOT_AVAILABLE",
|
||||
"CL_MEM_OBJECT_ALLOCATION_FAILURE",
|
||||
"CL_OUT_OF_RESOURCES",
|
||||
"CL_OUT_OF_HOST_MEMORY",
|
||||
"CL_PROFILING_INFO_NOT_AVAILABLE",
|
||||
"CL_MEM_COPY_OVERLAP",
|
||||
"CL_IMAGE_FORMAT_MISMATCH",
|
||||
"CL_IMAGE_FORMAT_NOT_SUPPORTED",
|
||||
"CL_BUILD_PROGRAM_FAILURE",
|
||||
"CL_MAP_FAILURE",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
"CL_INVALID_VALUE",
|
||||
"CL_INVALID_DEVICE_TYPE",
|
||||
"CL_INVALID_PLATFORM",
|
||||
"CL_INVALID_DEVICE",
|
||||
"CL_INVALID_CONTEXT",
|
||||
"CL_INVALID_QUEUE_PROPERTIES",
|
||||
"CL_INVALID_COMMAND_QUEUE",
|
||||
"CL_INVALID_HOST_PTR",
|
||||
"CL_INVALID_MEM_OBJECT",
|
||||
"CL_INVALID_IMAGE_FORMAT_DESCRIPTOR",
|
||||
"CL_INVALID_IMAGE_SIZE",
|
||||
"CL_INVALID_SAMPLER",
|
||||
"CL_INVALID_BINARY",
|
||||
"CL_INVALID_BUILD_OPTIONS",
|
||||
"CL_INVALID_PROGRAM",
|
||||
"CL_INVALID_PROGRAM_EXECUTABLE",
|
||||
"CL_INVALID_KERNEL_NAME",
|
||||
"CL_INVALID_KERNEL_DEFINITION",
|
||||
"CL_INVALID_KERNEL",
|
||||
"CL_INVALID_ARG_INDEX",
|
||||
"CL_INVALID_ARG_VALUE",
|
||||
"CL_INVALID_ARG_SIZE",
|
||||
"CL_INVALID_KERNEL_ARGS",
|
||||
"CL_INVALID_WORK_DIMENSION",
|
||||
"CL_INVALID_WORK_GROUP_SIZE",
|
||||
"CL_INVALID_WORK_ITEM_SIZE",
|
||||
"CL_INVALID_GLOBAL_OFFSET",
|
||||
"CL_INVALID_EVENT_WAIT_LIST",
|
||||
"CL_INVALID_EVENT",
|
||||
"CL_INVALID_OPERATION",
|
||||
"CL_INVALID_GL_OBJECT",
|
||||
"CL_INVALID_BUFFER_SIZE",
|
||||
"CL_INVALID_MIP_LEVEL",
|
||||
"CL_INVALID_GLOBAL_WORK_SIZE",
|
||||
};
|
||||
|
||||
const int errorCount = sizeof(errorString) / sizeof(errorString[0]);
|
||||
|
||||
const int index = -error;
|
||||
|
||||
return (index >= 0 && index < errorCount) ? errorString[index] : "Unspecified Error";
|
||||
}
|
||||
|
||||
// Helper function to get OpenCL image format string (channel order and type) from constant
|
||||
// *********************************************************************
|
||||
const char* oclImageFormatString(cl_uint uiImageFormat)
|
||||
{
|
||||
// cl_channel_order
|
||||
if (uiImageFormat == CL_R)return "CL_R";
|
||||
if (uiImageFormat == CL_A)return "CL_A";
|
||||
if (uiImageFormat == CL_RG)return "CL_RG";
|
||||
if (uiImageFormat == CL_RA)return "CL_RA";
|
||||
if (uiImageFormat == CL_RGB)return "CL_RGB";
|
||||
if (uiImageFormat == CL_RGBA)return "CL_RGBA";
|
||||
if (uiImageFormat == CL_BGRA)return "CL_BGRA";
|
||||
if (uiImageFormat == CL_ARGB)return "CL_ARGB";
|
||||
if (uiImageFormat == CL_INTENSITY)return "CL_INTENSITY";
|
||||
if (uiImageFormat == CL_LUMINANCE)return "CL_LUMINANCE";
|
||||
|
||||
// cl_channel_type
|
||||
if (uiImageFormat == CL_SNORM_INT8)return "CL_SNORM_INT8";
|
||||
if (uiImageFormat == CL_SNORM_INT16)return "CL_SNORM_INT16";
|
||||
if (uiImageFormat == CL_UNORM_INT8)return "CL_UNORM_INT8";
|
||||
if (uiImageFormat == CL_UNORM_INT16)return "CL_UNORM_INT16";
|
||||
if (uiImageFormat == CL_UNORM_SHORT_565)return "CL_UNORM_SHORT_565";
|
||||
if (uiImageFormat == CL_UNORM_SHORT_555)return "CL_UNORM_SHORT_555";
|
||||
if (uiImageFormat == CL_UNORM_INT_101010)return "CL_UNORM_INT_101010";
|
||||
if (uiImageFormat == CL_SIGNED_INT8)return "CL_SIGNED_INT8";
|
||||
if (uiImageFormat == CL_SIGNED_INT16)return "CL_SIGNED_INT16";
|
||||
if (uiImageFormat == CL_SIGNED_INT32)return "CL_SIGNED_INT32";
|
||||
if (uiImageFormat == CL_UNSIGNED_INT8)return "CL_UNSIGNED_INT8";
|
||||
if (uiImageFormat == CL_UNSIGNED_INT16)return "CL_UNSIGNED_INT16";
|
||||
if (uiImageFormat == CL_UNSIGNED_INT32)return "CL_UNSIGNED_INT32";
|
||||
if (uiImageFormat == CL_HALF_FLOAT)return "CL_HALF_FLOAT";
|
||||
if (uiImageFormat == CL_FLOAT)return "CL_FLOAT";
|
||||
|
||||
// unknown constant
|
||||
return "Unknown";
|
||||
}
|
||||
@@ -17,7 +17,7 @@
|
||||
// *********************************************************************
|
||||
|
||||
// Common headers: Cross-API utilities and OpenCL header
|
||||
#include <shrUtils.h>
|
||||
#include "shrUtils.h"
|
||||
|
||||
// All OpenCL headers
|
||||
#if defined (__APPLE__) || defined(MACOSX)
|
||||
1954
tests/opencl/dotproduct/shrUtils.cpp
Normal file
1954
tests/opencl/dotproduct/shrUtils.cpp
Normal file
File diff suppressed because it is too large
Load Diff
@@ -13,7 +13,7 @@
|
||||
#define SHR_UTILS_H
|
||||
|
||||
// *********************************************************************
|
||||
// Generic utilities for NVIDIA GPU Computing SDK
|
||||
// Generic utilities for NVIDIA GPU Computing SDK
|
||||
// *********************************************************************
|
||||
|
||||
// reminders for output window and build log
|
||||
@@ -639,4 +639,4 @@ inline void __shrExitEX(int argc, const char** argv, int iExitCode)
|
||||
exit(iExitCode);
|
||||
}
|
||||
|
||||
#endif
|
||||
#endif
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -1,70 +1,7 @@
|
||||
XLEN ?= 32
|
||||
|
||||
LLVM_PREFIX ?= /opt/llvm-riscv
|
||||
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
|
||||
SYSROOT ?= $(RISCV_TOOLCHAIN_PATH)/riscv32-unknown-elf
|
||||
POCL_CC_PATH ?= /opt/pocl/compiler
|
||||
POCL_RT_PATH ?= /opt/pocl/runtime
|
||||
|
||||
VORTEX_DRV_PATH ?= $(realpath ../../../driver)
|
||||
VORTEX_RT_PATH ?= $(realpath ../../../runtime)
|
||||
|
||||
K_LLCFLAGS += "-O3 -march=riscv32 -target-abi=ilp32f -mcpu=generic-rv32 -mattr=+m,+f -mattr=+vortex -float-abi=hard -code-model=small"
|
||||
K_CFLAGS += "-v -O3 --sysroot=$(SYSROOT) --gcc-toolchain=$(RISCV_TOOLCHAIN_PATH) -march=rv32imf -mabi=ilp32f -Xclang -target-feature -Xclang +vortex -I$(VORTEX_RT_PATH)/include -fno-rtti -fno-exceptions -ffreestanding -nostartfiles -fdata-sections -ffunction-sections"
|
||||
K_LDFLAGS += "-Wl,-Bstatic,-T$(VORTEX_RT_PATH)/linker/vx_link$(XLEN).ld -Wl,--gc-sections $(VORTEX_RT_PATH)/libvortexrt.a -lm"
|
||||
|
||||
CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors
|
||||
|
||||
CXXFLAGS += -Wno-deprecated-declarations -Wno-unused-parameter
|
||||
CXXFLAGS += -Wno-unused-variable -Wno-narrowing -Wno-unused-result -Wno-unused-but-set-variable
|
||||
|
||||
CXXFLAGS += -I$(POCL_RT_PATH)/include
|
||||
|
||||
LDFLAGS += -L$(POCL_RT_PATH)/lib -L$(VORTEX_DRV_PATH)/stub -lOpenCL -lvortex
|
||||
|
||||
# Debugging
|
||||
ifdef DEBUG
|
||||
CXXFLAGS += -g -O0
|
||||
else
|
||||
CXXFLAGS += -O2 -DNDEBUG
|
||||
endif
|
||||
|
||||
PROJECT = guassian
|
||||
|
||||
SRCS = main.cc clutils.cpp utils.cpp
|
||||
|
||||
all: $(PROJECT) kernel.pocl
|
||||
OPTS ?=
|
||||
|
||||
kernel.pocl: kernel.cl
|
||||
LLVM_PREFIX=$(LLVM_PREFIX) POCL_DEBUG=all LD_LIBRARY_PATH=$(LLVM_PREFIX)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -LLCFLAGS $(K_LLCFLAGS) -CFLAGS $(K_CFLAGS) -LDFLAGS $(K_LDFLAGS) -o kernel.pocl kernel.cl
|
||||
|
||||
$(PROJECT): $(SRCS)
|
||||
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
|
||||
|
||||
run-fpga: $(PROJECT) kernel.pocl
|
||||
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/fpga:$(LD_LIBRARY_PATH) ./$(PROJECT)
|
||||
|
||||
run-asesim: $(PROJECT) kernel.pocl
|
||||
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/asesim:$(LD_LIBRARY_PATH) ./$(PROJECT)
|
||||
|
||||
run-vlsim: $(PROJECT) kernel.pocl
|
||||
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/vlsim:$(LD_LIBRARY_PATH) ./$(PROJECT)
|
||||
|
||||
run-simx: $(PROJECT) kernel.pocl
|
||||
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT)
|
||||
|
||||
run-rtlsim: $(PROJECT) kernel.pocl
|
||||
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT)
|
||||
|
||||
.depend: $(SRCS)
|
||||
$(CXX) $(CXXFLAGS) -MM $^ > .depend;
|
||||
|
||||
clean:
|
||||
rm -rf $(PROJECT) *.o .depend
|
||||
|
||||
clean-all: clean
|
||||
rm -rf *.pocl *.dump
|
||||
|
||||
ifneq ($(MAKECMDGOALS),clean)
|
||||
-include .depend
|
||||
endif
|
||||
include ../common.mk
|
||||
|
||||
Binary file not shown.
@@ -31,8 +31,6 @@ int main(int argc, char *argv[]) {
|
||||
|
||||
a = (float *)malloc(size * size * sizeof(float));
|
||||
|
||||
printf("OK\n");
|
||||
|
||||
InitMat(fp, size, a, size, size);
|
||||
// printf("The input matrix a is:\n");
|
||||
// PrintMat(a, size, size, size);
|
||||
|
||||
@@ -1,67 +1,7 @@
|
||||
XLEN ?= 32
|
||||
|
||||
LLVM_PREFIX ?= /opt/llvm-riscv
|
||||
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
|
||||
SYSROOT ?= $(RISCV_TOOLCHAIN_PATH)/riscv32-unknown-elf
|
||||
POCL_CC_PATH ?= /opt/pocl/compiler
|
||||
POCL_RT_PATH ?= /opt/pocl/runtime
|
||||
|
||||
VORTEX_DRV_PATH ?= $(realpath ../../../driver)
|
||||
VORTEX_RT_PATH ?= $(realpath ../../../runtime)
|
||||
|
||||
K_LLCFLAGS += "-O3 -march=riscv32 -target-abi=ilp32f -mcpu=generic-rv32 -mattr=+m,+f -mattr=+vortex -float-abi=hard -code-model=small"
|
||||
K_CFLAGS += "-v -O3 --sysroot=$(SYSROOT) --gcc-toolchain=$(RISCV_TOOLCHAIN_PATH) -march=rv32imf -mabi=ilp32f -Xclang -target-feature -Xclang +vortex -I$(VORTEX_RT_PATH)/include -fno-rtti -fno-exceptions -ffreestanding -nostartfiles -fdata-sections -ffunction-sections"
|
||||
K_LDFLAGS += "-Wl,-Bstatic,-T$(VORTEX_RT_PATH)/linker/vx_link$(XLEN).ld -Wl,--gc-sections $(VORTEX_RT_PATH)/libvortexrt.a -lm"
|
||||
|
||||
CXXFLAGS += -std=c++11 -Wall -Wextra -pedantic -Wfatal-errors
|
||||
|
||||
CXXFLAGS += -I$(POCL_RT_PATH)/include
|
||||
|
||||
LDFLAGS += -L$(POCL_RT_PATH)/lib -L$(VORTEX_DRV_PATH)/stub -lOpenCL -lvortex
|
||||
|
||||
# Debugging
|
||||
ifdef DEBUG
|
||||
CXXFLAGS += -g -O0
|
||||
else
|
||||
CXXFLAGS += -O2 -DNDEBUG
|
||||
endif
|
||||
|
||||
PROJECT = kmeans
|
||||
|
||||
SRCS = main.cc read_input.c rmse.c kmeans_clustering.c cluster.c getopt.c
|
||||
|
||||
all: $(PROJECT) kernel.pocl
|
||||
OPTS ?=
|
||||
|
||||
kernel.pocl: kernel.cl
|
||||
LLVM_PREFIX=$(LLVM_PREFIX) POCL_DEBUG=all LD_LIBRARY_PATH=$(LLVM_PREFIX)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -LLCFLAGS $(K_LLCFLAGS) -CFLAGS $(K_CFLAGS) -LDFLAGS $(K_LDFLAGS) -o kernel.pocl kernel.cl
|
||||
|
||||
$(PROJECT): $(SRCS)
|
||||
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
|
||||
|
||||
run-fpga: $(PROJECT) kernel.pocl
|
||||
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/fpga:$(LD_LIBRARY_PATH) ./$(PROJECT)
|
||||
|
||||
run-asesim: $(PROJECT) kernel.pocl
|
||||
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/asesim:$(LD_LIBRARY_PATH) ./$(PROJECT)
|
||||
|
||||
run-vlsim: $(PROJECT) kernel.pocl
|
||||
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/vlsim:$(LD_LIBRARY_PATH) ./$(PROJECT)
|
||||
|
||||
run-simx: $(PROJECT) kernel.pocl
|
||||
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT)
|
||||
|
||||
run-rtlsim: $(PROJECT) kernel.pocl
|
||||
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT)
|
||||
|
||||
.depend: $(SRCS)
|
||||
$(CXX) $(CXXFLAGS) -MM $^ > .depend;
|
||||
|
||||
clean:
|
||||
rm -rf $(PROJECT) *.o .depend
|
||||
|
||||
clean-all: clean
|
||||
rm -rf *.pocl *.dump
|
||||
|
||||
ifneq ($(MAKECMDGOALS),clean)
|
||||
-include .depend
|
||||
endif
|
||||
include ../common.mk
|
||||
|
||||
Binary file not shown.
@@ -170,6 +170,7 @@ float** kmeans_clustering(float **feature, /* in: [npoints][nfeatures] */
|
||||
free(new_centers[0]);
|
||||
free(new_centers);
|
||||
free(new_centers_len);
|
||||
free(initial);
|
||||
|
||||
return clusters;
|
||||
}
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -104,8 +104,8 @@ static int initialize(int use_gpu) {
|
||||
context = clCreateContext(NULL, 1, device_list, NULL, NULL, &result);
|
||||
|
||||
// create command queue for the first device
|
||||
cmd_queue = clCreateCommandQueue(context, device_list[0], 0, NULL);
|
||||
if (!cmd_queue) {
|
||||
cmd_queue = clCreateCommandQueue(context, device_list[0], 0, &result);
|
||||
if (!cmd_queue || result != CL_SUCCESS) {
|
||||
printf("ERROR: clCreateCommandQueue() failed\n");
|
||||
return -1;
|
||||
}
|
||||
@@ -120,7 +120,7 @@ static int shutdown() {
|
||||
if (context)
|
||||
clReleaseContext(context);
|
||||
if (device_list)
|
||||
delete device_list;
|
||||
delete [] device_list;
|
||||
|
||||
// reset all variables
|
||||
cmd_queue = 0;
|
||||
@@ -188,7 +188,7 @@ int allocate(int n_points, int n_features, int n_clusters, float **feature) {
|
||||
fread(source + strlen(source), sourcesize, 1, fp);
|
||||
fclose(fp);*/
|
||||
|
||||
// OpenCL initialization
|
||||
// OpenCL initialization
|
||||
int use_gpu = 1;
|
||||
if (initialize(use_gpu))
|
||||
return -1;
|
||||
@@ -197,12 +197,25 @@ int allocate(int n_points, int n_features, int n_clusters, float **feature) {
|
||||
cl_int err = 0;
|
||||
//const char *slist[2] = {source, 0};
|
||||
//cl_program prog = clCreateProgramWithSource(context, 1, slist, NULL, &err);
|
||||
cl_program prog = clCreateProgramWithBuiltInKernels(context, 1, device_list, "kmeans_kernel_c;kmeans_swap", &err);
|
||||
uint8_t *kernel_bin = NULL;
|
||||
size_t kernel_size;
|
||||
cl_int binary_status = 0;
|
||||
err = read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size);
|
||||
if (err != CL_SUCCESS) {
|
||||
printf("ERROR: clCreateProgramWithSource() => %d\n", err);
|
||||
printf("ERROR: read_kernel_file() => %d\n", err);
|
||||
return -1;
|
||||
}
|
||||
err = clBuildProgram(prog, 0, NULL, NULL, NULL, NULL);
|
||||
|
||||
cl_program prog = clCreateProgramWithBinary(
|
||||
context, 1, device_list, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &err);
|
||||
if (err != CL_SUCCESS) {
|
||||
printf("ERROR: clCreateProgramWithBinary() => %d\n", err);
|
||||
return -1;
|
||||
}
|
||||
|
||||
free(kernel_bin);
|
||||
|
||||
err = clBuildProgram(prog, 1, &device_list[0], NULL, NULL, NULL);
|
||||
{ // show warnings/errors
|
||||
// static char log[65536]; memset(log, 0, sizeof(log));
|
||||
// cl_device_id device_id = 0;
|
||||
@@ -226,6 +239,7 @@ int allocate(int n_points, int n_features, int n_clusters, float **feature) {
|
||||
printf("ERROR: clCreateKernel() 0 => %d\n", err);
|
||||
return -1;
|
||||
}
|
||||
|
||||
kernel2 = clCreateKernel(prog, kernel_swap, &err);
|
||||
if (err != CL_SUCCESS) {
|
||||
printf("ERROR: clCreateKernel() 0 => %d\n", err);
|
||||
@@ -241,6 +255,7 @@ int allocate(int n_points, int n_features, int n_clusters, float **feature) {
|
||||
n_points * n_features, err);
|
||||
return -1;
|
||||
}
|
||||
|
||||
d_feature_swap =
|
||||
clCreateBuffer(context, CL_MEM_READ_WRITE,
|
||||
n_points * n_features * sizeof(float), NULL, &err);
|
||||
@@ -249,6 +264,7 @@ int allocate(int n_points, int n_features, int n_clusters, float **feature) {
|
||||
n_points * n_features, err);
|
||||
return -1;
|
||||
}
|
||||
|
||||
d_cluster =
|
||||
clCreateBuffer(context, CL_MEM_READ_WRITE,
|
||||
n_clusters * n_features * sizeof(float), NULL, &err);
|
||||
@@ -257,6 +273,7 @@ int allocate(int n_points, int n_features, int n_clusters, float **feature) {
|
||||
n_clusters * n_features, err);
|
||||
return -1;
|
||||
}
|
||||
|
||||
d_membership = clCreateBuffer(context, CL_MEM_READ_WRITE,
|
||||
n_points * sizeof(int), NULL, &err);
|
||||
if (err != CL_SUCCESS) {
|
||||
@@ -296,6 +313,8 @@ int allocate(int n_points, int n_features, int n_clusters, float **feature) {
|
||||
}
|
||||
|
||||
membership_OCL = (int *)malloc(n_points * sizeof(int));
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
void deallocateMemory() {
|
||||
|
||||
@@ -331,7 +331,9 @@ int setup(int argc, char **argv) {
|
||||
}
|
||||
}
|
||||
|
||||
/* free up memory */
|
||||
/* free up memory */
|
||||
free(cluster_centres[0]);
|
||||
free(cluster_centres);
|
||||
free(features[0]);
|
||||
free(features);
|
||||
return (0);
|
||||
|
||||
@@ -1,69 +1,9 @@
|
||||
XLEN ?= 32
|
||||
|
||||
RISCV_TOOLCHAIN_PATH ?= $(wildcard ../../../../riscv-gnu-toolchain/drops)
|
||||
POCL_CC_PATH ?= $(wildcard ../../../../pocl/drops_riscv_cc)
|
||||
POCL_INC_PATH ?= $(wildcard ../include)
|
||||
POCL_LIB_PATH ?= $(wildcard ../lib)
|
||||
VORTEX_RT_PATH ?= $(wildcard ../../../runtime)
|
||||
VX_SIMX_PATH ?= $(wildcard ../../../simx/obj_dir)
|
||||
|
||||
CC = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gcc
|
||||
CXX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-g++
|
||||
DMP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objdump
|
||||
HEX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objcopy
|
||||
GDB = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gdb
|
||||
|
||||
VX_SRCS = $(VORTEX_RT_PATH)/newlib/newlib.c
|
||||
VX_SRCS += $(VORTEX_RT_PATH)/startup/vx_start.S
|
||||
VX_SRCS += $(VORTEX_RT_PATH)/intrinsics/vx_intrinsics.S
|
||||
VX_SRCS += $(VORTEX_RT_PATH)/io/vx_io.S $(VORTEX_RT_PATH)/io/vx_io.c
|
||||
VX_SRCS += $(VORTEX_RT_PATH)/fileio/fileio.S
|
||||
VX_SRCS += $(VORTEX_RT_PATH)/tests/tests.c
|
||||
VX_SRCS += $(VORTEX_RT_PATH)/vx_api/vx_api.c
|
||||
|
||||
VX_CFLAGS = -nostartfiles -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/startup/vx_link$(XLEN).ld
|
||||
|
||||
CXXFLAGS = -g -O0 -march=rv32im -mabi=ilp32
|
||||
CXXFLAGS += -ffreestanding # program may not begin at main()
|
||||
CXXFLAGS += -Wl,--gc-sections # enable garbage collection of unused input sections
|
||||
CXXFLAGS += -fno-rtti -fno-non-call-exceptions # disable RTTI and exceptions
|
||||
CXXFLAGS += -I$(POCL_INC_PATH) -I.
|
||||
|
||||
VX_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
|
||||
QEMU_LIBS = $(VORTEX_RT_PATH)/qemu/vx_api.c -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
|
||||
|
||||
PROJECT = lbm
|
||||
|
||||
|
||||
SRCS = main.cc args.c parboil_opencl.c gpu_info.c lbm.c ocl.c
|
||||
|
||||
all: $(PROJECT).dump $(PROJECT).hex
|
||||
CXXFLAGS += -I.
|
||||
|
||||
lib$(PROJECT).a: kernel.cl
|
||||
POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOLCHAIN_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o lib$(PROJECT).a kernel.cl
|
||||
OPTS ?=
|
||||
|
||||
$(PROJECT).elf: $(SRCS) lib$(PROJECT).a
|
||||
$(CXX) $(CXXFLAGS) $(VX_CFLAGS) $(VX_SRCS) $(SRCS) $(VX_LIBS) -o $(PROJECT).elf
|
||||
|
||||
$(PROJECT).qemu: $(SRCS) lib$(PROJECT).a
|
||||
$(CXX) $(CXXFLAGS) $(SRCS) $(QEMU_LIBS) -o $(PROJECT).qemu
|
||||
|
||||
$(PROJECT).hex: $(PROJECT).elf
|
||||
$(HEX) -O ihex $(PROJECT).elf $(PROJECT).hex
|
||||
|
||||
$(PROJECT).dump: $(PROJECT).elf
|
||||
$(DMP) -D $(PROJECT).elf > $(PROJECT).dump
|
||||
|
||||
run: $(PROJECT).hex
|
||||
POCL_DEBUG=all $(VX_SIMX_PATH)/Vcache_simX -E $(PROJECT).hex -s -b 1> emulator.debug
|
||||
|
||||
qemu: $(PROJECT).qemu
|
||||
POCL_DEBUG=all $(RISCV_TOOLCHAIN_PATH)/bin/qemu-riscv32 -d in_asm -D debug.log $(PROJECT).qemu
|
||||
|
||||
gdb-s: $(PROJECT).qemu
|
||||
POCL_DEBUG=all $(RISCV_TOOLCHAIN_PATH)/bin/qemu-riscv32 -g 1234 -d in_asm -D debug.log $(PROJECT).qemu
|
||||
|
||||
gdb-c: $(PROJECT).qemu
|
||||
$(GDB) $(PROJECT).qemu
|
||||
|
||||
clean:
|
||||
rm -rf *.o *.elf *.dump *.hex *.qemu *.log *.debug
|
||||
include ../common.mk
|
||||
|
||||
@@ -9,6 +9,10 @@
|
||||
#ifndef __GPUINFOH__
|
||||
#define __GPUINFOH__
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
void compute_active_thread(size_t *thread,
|
||||
size_t *grid,
|
||||
int task,
|
||||
@@ -17,4 +21,8 @@ void compute_active_thread(size_t *thread,
|
||||
int minor,
|
||||
int sm);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
@@ -6,16 +6,16 @@
|
||||
*cr
|
||||
***************************************************************************/
|
||||
|
||||
/*############################################################################*/
|
||||
|
||||
#ifndef _LBM_H_
|
||||
#define _LBM_H_
|
||||
|
||||
/*############################################################################*/
|
||||
|
||||
#include "ocl.h"
|
||||
#include "lbm_macros.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
void LBM_allocateGrid( float** ptr );
|
||||
void LBM_freeGrid( float** ptr );
|
||||
void LBM_initializeGrid( LBM_Grid grid );
|
||||
@@ -34,6 +34,8 @@ void OpenCL_LBM_initializeGrid( const OpenCL_Param* prm, cl_mem d_grid, LBM_Grid
|
||||
void OpenCL_LBM_getDeviceGrid( const OpenCL_Param* prm, cl_mem d_grid, LBM_Grid h_grid );
|
||||
void OpenCL_LBM_performStreamCollide( const OpenCL_Param* prm, cl_mem srcGrid, cl_mem dstGrid );
|
||||
|
||||
/*############################################################################*/
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* _LBM_H_ */
|
||||
|
||||
Binary file not shown.
@@ -21,6 +21,26 @@
|
||||
#include "main.h"
|
||||
#include "ocl.h"
|
||||
|
||||
static int read_kernel_file(const char* filename, uint8_t** data, size_t* size) {
|
||||
if (nullptr == filename || nullptr == data || 0 == size)
|
||||
return CL_INVALID_VALUE;
|
||||
|
||||
FILE* fp = fopen(filename, "r");
|
||||
if (NULL == fp) {
|
||||
fprintf(stderr, "Failed to load kernel.");
|
||||
return CL_INVALID_VALUE;
|
||||
}
|
||||
fseek(fp , 0 , SEEK_END);
|
||||
long fsize = ftell(fp);
|
||||
rewind(fp);
|
||||
|
||||
*data = (uint8_t*)malloc(fsize);
|
||||
*size = fread(*data, 1, fsize, fp);
|
||||
|
||||
fclose(fp);
|
||||
|
||||
return CL_SUCCESS;
|
||||
}
|
||||
|
||||
/*############################################################################*/
|
||||
|
||||
@@ -170,8 +190,6 @@ void MAIN_initialize(const MAIN_Param *param, const OpenCL_Param *prm) {
|
||||
|
||||
LBM_freeGrid((float **)&TEMP_srcGrid);
|
||||
LBM_freeGrid((float **)&TEMP_dstGrid);
|
||||
|
||||
printf("OK\n");
|
||||
}
|
||||
|
||||
/*############################################################################*/
|
||||
@@ -188,7 +206,9 @@ void MAIN_finalize(const MAIN_Param *param, const OpenCL_Param *prm) {
|
||||
pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
|
||||
LBM_showGridStatistics(TEMP_srcGrid);
|
||||
|
||||
LBM_storeVelocityField(TEMP_srcGrid, param->resultFilename, TRUE);
|
||||
if (param->resultFilename) {
|
||||
LBM_storeVelocityField(TEMP_srcGrid, param->resultFilename, TRUE);
|
||||
}
|
||||
|
||||
LBM_freeGrid((float **)&TEMP_srcGrid);
|
||||
OpenCL_LBM_freeGrid(OpenCL_srcGrid);
|
||||
@@ -220,8 +240,14 @@ void OpenCL_initialize(struct pb_Parameters *p, OpenCL_Param *prm) {
|
||||
|
||||
//const char *clSource[] = {readFile("src/opencl_base/kernel.cl")};
|
||||
//prm->clProgram = clCreateProgramWithSource(prm->clContext, 1, clSource, NULL, &clStatus);
|
||||
prm->clProgram = clCreateProgramWithBuiltInKernels(
|
||||
prm->clContext, 1, &prm->clDevice, "performStreamCollide_kernel", &clStatus);
|
||||
// read kernel binary from file
|
||||
uint8_t *kernel_bin = NULL;
|
||||
size_t kernel_size;
|
||||
cl_int binary_status = 0;
|
||||
clStatus = read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size);
|
||||
CHECK_ERROR("read_kernel_file")
|
||||
prm->clProgram = clCreateProgramWithBinary(
|
||||
prm->clContext, 1, &prm->clDevice, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &clStatus);
|
||||
CHECK_ERROR("clCreateProgramWithSource")
|
||||
|
||||
//char clOptions[100];
|
||||
|
||||
@@ -1,6 +1,10 @@
|
||||
#ifndef __OCLH__
|
||||
#define __OCLH__
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
typedef struct {
|
||||
cl_platform_id clPlatform;
|
||||
cl_context_properties clCps[3];
|
||||
@@ -22,4 +26,8 @@ typedef struct {
|
||||
|
||||
char* readFile(char*);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
@@ -1216,9 +1216,24 @@ pb_InitOpenCLContext(struct pb_Parameters* parameters) {
|
||||
cl_platform_id platform_id;
|
||||
cl_device_id device_id;
|
||||
cl_context context;
|
||||
clGetPlatformIDs(1, &platform_id, NULL);
|
||||
clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, NULL);
|
||||
context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &_err);
|
||||
|
||||
_err = clGetPlatformIDs(1, &platform_id, NULL);
|
||||
if (_err != CL_SUCCESS) {
|
||||
fprintf(stderr, "Error querying platform!\n");
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
_err = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, NULL);
|
||||
if (_err != CL_SUCCESS) {
|
||||
fprintf(stderr, "Error querying device IDs!\n");
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &_err);
|
||||
if (_err != CL_SUCCESS) {
|
||||
fprintf(stderr, "Error Creating device context!\n");
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
pb_Context* c = (pb_Context*)malloc(sizeof(pb_Context));
|
||||
c->clContext = context;
|
||||
|
||||
7
tests/opencl/matmul/Makefile
Normal file
7
tests/opencl/matmul/Makefile
Normal file
@@ -0,0 +1,7 @@
|
||||
# Per-test settings for the matmul OpenCL test; the build and run rules
# themselves come from the shared ../common.mk included at the bottom.
# NOTE(review): common.mk is not visible here - presumably it consumes
# PROJECT, SRCS and OPTS; confirm against that file.

# Name of this test project.
PROJECT = matmul
|
||||
|
||||
# Host-side source files.
SRCS = main.cc
|
||||
|
||||
# Default program arguments, overridable from the command line or environment
# (?=). main.cc parses -n as the matrix size.
OPTS ?= -n16
|
||||
|
||||
include ../common.mk
|
||||
73
tests/opencl/matmul/kernel.cl
Normal file
73
tests/opencl/matmul/kernel.cl
Normal file
@@ -0,0 +1,73 @@
|
||||
// Tiled matrix multiply: C = A * B, where A, B and C are N x N matrices
// indexed as [row * N + col].
//
// Each work-item produces one element of C. The work-group cooperatively
// stages one tile of A and one tile of B in local memory per iteration
// (localA / localB must each hold local_size(0) * local_size(0) floats).
//
// No bounds checks are performed, so N is assumed to be a multiple of the
// work-group width and the work-group is assumed to be square.
__kernel void matmul(__global float *A,
                     __global float *B,
                     __global float *C,
                     const unsigned int N,
                     __local float *localA,
                     __local float *localB)
{
  const int tileDim = get_local_size(0); // assuming square local size
  const int gRow = get_global_id(1);
  const int gCol = get_global_id(0);
  const int tRow = get_local_id(1);
  const int tCol = get_local_id(0);

  // Slot owned by this work-item inside each local tile.
  const int tileSlot = tRow * tileDim + tCol;

  float acc = 0.0f;

  // Walk the shared dimension one tile at a time.
  for (int tileStart = 0; tileStart < N; tileStart += tileDim) {
    // Stage the current tile of A: same row, columns tileStart..tileStart+tileDim-1.
    localA[tileSlot] = A[gRow * N + tileStart + tCol];

    // Stage the current tile of B: rows tileStart..tileStart+tileDim-1, same column.
    localB[tileSlot] = B[(tileStart + tRow) * N + gCol];

    // Both tiles must be fully written before any work-item reads them.
    barrier(CLK_LOCAL_MEM_FENCE);

    // Partial dot product of this tile's row of A with its column of B.
    for (int j = 0; j < tileDim; j++) {
      acc += localA[tRow * tileDim + j] * localB[j * tileDim + tCol];
    }

    // Do not overwrite the tiles while other work-items are still reading.
    barrier(CLK_LOCAL_MEM_FENCE);
  }

  C[gRow * N + gCol] = acc;
}
|
||||
|
||||
/*__kernel void matmul(__global float *A, __global float *B, __global float *C, const unsigned int N)
|
||||
{
|
||||
int globalRow = get_global_id(1);
|
||||
int globalCol = get_global_id(0);
|
||||
int localRow = get_local_id(1);
|
||||
int localCol = get_local_id(0);
|
||||
|
||||
// Static local memory declaration
|
||||
__local float localA[16][16];
|
||||
__local float localB[16][16];
|
||||
|
||||
float sum = 0.0f;
|
||||
|
||||
// Iterate over blocks
|
||||
for (int k = 0; k < N; k += 16) {
|
||||
// Load a block of matrix A into local memory
|
||||
localA[localRow][localCol] = A[globalRow * N + k + localCol];
|
||||
|
||||
// Load a block of matrix B into local memory
|
||||
localB[localRow][localCol] = B[(k + localRow) * N + globalCol];
|
||||
|
||||
// Ensure the entire block is loaded
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
// Compute multiplication for this block
|
||||
for (int j = 0; j < 16; j++) {
|
||||
sum += localA[localRow][j] * localB[j][localCol];
|
||||
}
|
||||
|
||||
// Wait until all threads have computed before loading the next block
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
C[globalRow * N + globalCol] = sum;
|
||||
}*/
|
||||
246
tests/opencl/matmul/main.cc
Normal file
246
tests/opencl/matmul/main.cc
Normal file
@@ -0,0 +1,246 @@
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <assert.h>
|
||||
#include <CL/opencl.h>
|
||||
#include <string.h>
|
||||
#include <time.h>
|
||||
#include <unistd.h>
|
||||
#include <chrono>
|
||||
#include <vector>
|
||||
|
||||
#define LOCAL_SIZE 16
|
||||
|
||||
#define KERNEL_NAME "matmul"
|
||||
|
||||
#define CL_CHECK(_expr) \
|
||||
do { \
|
||||
cl_int _err = _expr; \
|
||||
if (_err == CL_SUCCESS) \
|
||||
break; \
|
||||
printf("OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err); \
|
||||
cleanup(); \
|
||||
exit(-1); \
|
||||
} while (0)
|
||||
|
||||
#define CL_CHECK2(_expr) \
|
||||
({ \
|
||||
cl_int _err = CL_INVALID_VALUE; \
|
||||
decltype(_expr) _ret = _expr; \
|
||||
if (_err != CL_SUCCESS) { \
|
||||
printf("OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err); \
|
||||
cleanup(); \
|
||||
exit(-1); \
|
||||
} \
|
||||
_ret; \
|
||||
})
|
||||
|
||||
// Reads the entire file `filename` into a newly malloc'ed buffer.
//
// On success returns 0, stores the buffer in *data (the caller releases it
// with free()) and the number of bytes in *size.
// On any failure returns -1 and leaves *data and *size untouched.
static int read_kernel_file(const char* filename, uint8_t** data, size_t* size) {
  if (nullptr == filename || nullptr == data || nullptr == size)
    return -1;

  // Kernel images are binary: open in binary mode so that no newline
  // translation or end-of-file marker handling alters the bytes on
  // platforms that distinguish text from binary streams.
  FILE* fp = fopen(filename, "rb");
  if (NULL == fp) {
    fprintf(stderr, "Failed to load kernel.");
    return -1;
  }

  // Determine the file length; both calls can fail (e.g. non-seekable file).
  if (0 != fseek(fp, 0, SEEK_END)) {
    fclose(fp);
    return -1;
  }
  long fsize = ftell(fp);
  if (fsize < 0) {
    fclose(fp);
    return -1;
  }
  rewind(fp);

  // Never request zero bytes: malloc(0) may legitimately return NULL.
  uint8_t* buffer = (uint8_t*)malloc(fsize > 0 ? (size_t)fsize : 1);
  if (NULL == buffer) {
    fclose(fp);
    return -1;
  }

  size_t nread = fread(buffer, 1, (size_t)fsize, fp);
  fclose(fp);

  // A short read would hand a truncated kernel to the OpenCL runtime.
  if (nread != (size_t)fsize) {
    free(buffer);
    return -1;
  }

  *data = buffer;
  *size = nread;
  return 0;
}
|
||||
|
||||
// Returns true when `a` and `b` are within `ulp` units-in-the-last-place of
// each other, comparing their IEEE-754 bit patterns.
//
// The raw bit patterns are sign-magnitude, so they are first mapped onto a
// monotonic integer line (negative floats become negative keys). The
// distance is computed in 64-bit arithmetic: subtracting the raw 32-bit
// patterns overflows for operands of opposite sign (1.0f and -1.0f differ
// by exactly 2^31), which previously made such pairs compare as equal.
static bool compare_equal(float a, float b, int ulp = 21) {
  static_assert(sizeof(int) == sizeof(float), "float/int size mismatch");

  // Numerically equal values (including +0.0f vs -0.0f) always match.
  if (a == b)
    return true;

  // memcpy is the well-defined way to read a float's bit pattern in C++.
  int ia, ib;
  memcpy(&ia, &a, sizeof(ia));
  memcpy(&ib, &b, sizeof(ib));

  // Sign bit set -> negative key of the same magnitude.
  long long ka = (ia < 0) ? -(long long)(ia & 0x7fffffff) : (long long)ia;
  long long kb = (ib < 0) ? -(long long)(ib & 0x7fffffff) : (long long)ib;

  long long dist = ka - kb;
  if (dist < 0)
    dist = -dist;
  return dist <= ulp;
}
|
||||
|
||||
// Reference CPU implementation: C = A * B for N x N matrices stored as
// flat arrays indexed [row * N + col]. Each output element is accumulated
// in ascending k order starting from 0.0f.
static void matrix_multiply_cpu(float *A, float *B, float *C, int N) {
  for (int row = 0; row < N; ++row) {
    const float *a_row = A + row * N;  // row `row` of A
    float *c_row = C + row * N;        // row `row` of C
    for (int col = 0; col < N; ++col) {
      float acc = 0.0f;
      for (int k = 0; k < N; ++k) {
        acc += a_row[k] * B[k * N + col];
      }
      c_row[col] = acc;
    }
  }
}
|
||||
|
||||
cl_device_id device_id = NULL;
|
||||
cl_context context = NULL;
|
||||
cl_command_queue commandQueue = NULL;
|
||||
cl_program program = NULL;
|
||||
cl_kernel kernel = NULL;
|
||||
cl_mem a_memobj = NULL;
|
||||
cl_mem b_memobj = NULL;
|
||||
cl_mem c_memobj = NULL;
|
||||
uint8_t *kernel_bin = NULL;
|
||||
|
||||
static void cleanup() {
|
||||
if (commandQueue) clReleaseCommandQueue(commandQueue);
|
||||
if (kernel) clReleaseKernel(kernel);
|
||||
if (program) clReleaseProgram(program);
|
||||
if (a_memobj) clReleaseMemObject(a_memobj);
|
||||
if (b_memobj) clReleaseMemObject(b_memobj);
|
||||
if (c_memobj) clReleaseMemObject(c_memobj);
|
||||
if (context) clReleaseContext(context);
|
||||
if (device_id) clReleaseDevice(device_id);
|
||||
if (kernel_bin) free(kernel_bin);
|
||||
}
|
||||
|
||||
int size = 64;
|
||||
|
||||
// Prints the one-line command-line synopsis to stdout.
static void show_usage() {
  fputs("Usage: [-n size] [-h: help]\n", stdout);
}
|
||||
|
||||
static void parse_args(int argc, char **argv) {
|
||||
int c;
|
||||
while ((c = getopt(argc, argv, "fn:h?")) != -1) {
|
||||
switch (c) {
|
||||
case 'n':
|
||||
size = atoi(optarg);
|
||||
break;
|
||||
case 'h':
|
||||
case '?': {
|
||||
show_usage();
|
||||
exit(0);
|
||||
} break;
|
||||
default:
|
||||
show_usage();
|
||||
exit(-1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int main (int argc, char **argv) {
|
||||
// parse command arguments
|
||||
parse_args(argc, argv);
|
||||
|
||||
printf("Matrix size=%d\n", size);
|
||||
if ((size / LOCAL_SIZE) * LOCAL_SIZE != size) {
|
||||
printf("Error: matrix size must be a multiple of %d\n", LOCAL_SIZE);
|
||||
return -1;
|
||||
}
|
||||
|
||||
cl_platform_id platform_id;
|
||||
size_t kernel_size;
|
||||
|
||||
// Getting platform and device information
|
||||
CL_CHECK(clGetPlatformIDs(1, &platform_id, NULL));
|
||||
CL_CHECK(clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, NULL));
|
||||
|
||||
printf("Create context\n");
|
||||
context = CL_CHECK2(clCreateContext(NULL, 1, &device_id, NULL, NULL, &_err));
|
||||
|
||||
char device_string[1024];
|
||||
clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(device_string), &device_string, NULL);
|
||||
printf("Using device: %s\n", device_string);
|
||||
|
||||
printf("Allocate device buffers\n");
|
||||
size_t nbytes = size * size * sizeof(float);
|
||||
a_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY, nbytes, NULL, &_err));
|
||||
b_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY, nbytes, NULL, &_err));
|
||||
c_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_WRITE_ONLY, nbytes, NULL, &_err));
|
||||
|
||||
printf("Create program from kernel source\n");
|
||||
#ifdef HOSTGPU
|
||||
if (0 != read_kernel_file("kernel.cl", &kernel_bin, &kernel_size))
|
||||
return -1;
|
||||
program = CL_CHECK2(clCreateProgramWithSource(
|
||||
context, 1, (const char**)&kernel_bin, &kernel_size, &_err));
|
||||
#else
|
||||
if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
|
||||
return -1;
|
||||
program = CL_CHECK2(clCreateProgramWithBinary(
|
||||
context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, NULL, &_err));
|
||||
#endif
|
||||
if (program == NULL) {
|
||||
cleanup();
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Build program
|
||||
CL_CHECK(clBuildProgram(program, 1, &device_id, NULL, NULL, NULL));
|
||||
|
||||
// Create kernel
|
||||
kernel = CL_CHECK2(clCreateKernel(program, KERNEL_NAME, &_err));
|
||||
|
||||
size_t local_size[2] = {LOCAL_SIZE, LOCAL_SIZE};
|
||||
size_t global_size[2] = {size, size};
|
||||
|
||||
// Set kernel arguments
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_memobj));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&b_memobj));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&c_memobj));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(uint32_t), &size));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, local_size[0]*local_size[1]*sizeof(float), NULL));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, local_size[0]*local_size[1]*sizeof(float), NULL));
|
||||
|
||||
// Allocate memories for input arrays and output arrays.
|
||||
std::vector<float> h_a(size * size);
|
||||
std::vector<float> h_b(size * size);
|
||||
std::vector<float> h_c(size * size);
|
||||
|
||||
// Initialize values for array members.
|
||||
for (int i = 0; i < (size * size); ++i) {
|
||||
#ifdef USE_FLOAT
|
||||
h_a[i] = (float)rand() / (float)RAND_MAX;
|
||||
h_b[i] = (float)rand() / (float)RAND_MAX;
|
||||
#else
|
||||
h_a[i] = rand();
|
||||
h_b[i] = rand();
|
||||
#endif
|
||||
h_c[i] = 0xdeadbeef;
|
||||
}
|
||||
|
||||
// Creating command queue
|
||||
commandQueue = CL_CHECK2(clCreateCommandQueue(context, device_id, 0, &_err));
|
||||
|
||||
printf("Upload source buffers\n");
|
||||
CL_CHECK(clEnqueueWriteBuffer(commandQueue, a_memobj, CL_TRUE, 0, nbytes, h_a.data(), 0, NULL, NULL));
|
||||
CL_CHECK(clEnqueueWriteBuffer(commandQueue, b_memobj, CL_TRUE, 0, nbytes, h_b.data(), 0, NULL, NULL));
|
||||
|
||||
printf("Execute the kernel\n");
|
||||
auto time_start = std::chrono::high_resolution_clock::now();
|
||||
CL_CHECK(clEnqueueNDRangeKernel(commandQueue, kernel, 2, NULL, global_size, local_size, 0, NULL, NULL));
|
||||
CL_CHECK(clFinish(commandQueue));
|
||||
auto time_end = std::chrono::high_resolution_clock::now();
|
||||
double elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(time_end - time_start).count();
|
||||
printf("Elapsed time: %lg ms\n", elapsed);
|
||||
|
||||
printf("Download destination buffer\n");
|
||||
CL_CHECK(clEnqueueReadBuffer(commandQueue, c_memobj, CL_TRUE, 0, nbytes, h_c.data(), 0, NULL, NULL));
|
||||
|
||||
printf("Verify result\n");
|
||||
std::vector<float> ref_vec(size * size);
|
||||
matrix_multiply_cpu(h_a.data(), h_b.data(), ref_vec.data(), size);
|
||||
int errors = 0;
|
||||
for (int i = 0; i < (size * size); i++) {
|
||||
if (!compare_equal(h_c[i], ref_vec[i])) {
|
||||
if (errors < 100)
|
||||
printf("*** error: [%d] expected=%f, actual=%f\n", i, ref_vec[i], h_c[i]);
|
||||
++errors;
|
||||
}
|
||||
}
|
||||
if (errors != 0) {
|
||||
printf("FAILED! - %d errors\n", errors);
|
||||
} else {
|
||||
printf("PASSED!\n");
|
||||
}
|
||||
|
||||
// Clean up
|
||||
cleanup();
|
||||
|
||||
return errors;
|
||||
}
|
||||
@@ -1,69 +1,9 @@
|
||||
XLEN ?= 32
|
||||
|
||||
RISCV_TOOLCHAIN_PATH ?= $(wildcard ../../../../riscv-gnu-toolchain/drops)
|
||||
POCL_CC_PATH ?= $(wildcard ../../../../pocl/drops_riscv_cc)
|
||||
POCL_INC_PATH ?= $(wildcard ../include)
|
||||
POCL_LIB_PATH ?= $(wildcard ../lib)
|
||||
VORTEX_RT_PATH ?= $(wildcard ../../../runtime)
|
||||
VX_SIMX_PATH ?= $(wildcard ../../../simx/obj_dir)
|
||||
|
||||
CC = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gcc
|
||||
CXX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-g++
|
||||
DMP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objdump
|
||||
HEX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objcopy
|
||||
GDB = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gdb
|
||||
|
||||
VX_SRCS = $(VORTEX_RT_PATH)/newlib/newlib.c
|
||||
VX_SRCS += $(VORTEX_RT_PATH)/startup/vx_start.S
|
||||
VX_SRCS += $(VORTEX_RT_PATH)/intrinsics/vx_intrinsics.S
|
||||
VX_SRCS += $(VORTEX_RT_PATH)/io/vx_io.S $(VORTEX_RT_PATH)/io/vx_io.c
|
||||
VX_SRCS += $(VORTEX_RT_PATH)/fileio/fileio.S
|
||||
VX_SRCS += $(VORTEX_RT_PATH)/tests/tests.c
|
||||
VX_SRCS += $(VORTEX_RT_PATH)/vx_api/vx_api.c
|
||||
|
||||
VX_CFLAGS = -nostartfiles -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/startup/vx_link$(XLEN).ld
|
||||
|
||||
CXXFLAGS = -g -O0 -march=rv32im -mabi=ilp32
|
||||
CXXFLAGS += -ffreestanding # program may not begin at main()
|
||||
CXXFLAGS += -Wl,--gc-sections # enable garbage collection of unused input sections
|
||||
CXXFLAGS += -fno-rtti -fno-non-call-exceptions # disable RTTI and exceptions
|
||||
CXXFLAGS += -I$(POCL_INC_PATH) -I.
|
||||
|
||||
VX_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
|
||||
QEMU_LIBS = $(VORTEX_RT_PATH)/qemu/vx_api.c -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
|
||||
|
||||
PROJECT = mri-q
|
||||
|
||||
SRCS = main.cc args.c parboil_opencl.c ocl.c gpu_info.c file.cc computeQ.c
|
||||
|
||||
all: $(PROJECT).dump $(PROJECT).hex
|
||||
CXXFLAGS += -I.
|
||||
|
||||
lib$(PROJECT).a: kernel.cl
|
||||
POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOLCHAIN_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o lib$(PROJECT).a kernel.cl
|
||||
OPTS ?=
|
||||
|
||||
$(PROJECT).elf: $(SRCS) lib$(PROJECT).a
|
||||
$(CXX) $(CXXFLAGS) $(VX_CFLAGS) $(VX_SRCS) $(SRCS) $(VX_LIBS) -o $(PROJECT).elf
|
||||
|
||||
$(PROJECT).qemu: $(SRCS) lib$(PROJECT).a
|
||||
$(CXX) $(CXXFLAGS) $(SRCS) $(QEMU_LIBS) -o $(PROJECT).qemu
|
||||
|
||||
$(PROJECT).hex: $(PROJECT).elf
|
||||
$(HEX) -O ihex $(PROJECT).elf $(PROJECT).hex
|
||||
|
||||
$(PROJECT).dump: $(PROJECT).elf
|
||||
$(DMP) -D $(PROJECT).elf > $(PROJECT).dump
|
||||
|
||||
run: $(PROJECT).hex
|
||||
POCL_DEBUG=all $(VX_SIMX_PATH)/Vcache_simX -E $(PROJECT).hex -s -b 1> emulator.debug
|
||||
|
||||
qemu: $(PROJECT).qemu
|
||||
POCL_DEBUG=all $(RISCV_TOOLCHAIN_PATH)/bin/qemu-riscv32 -d in_asm -D debug.log $(PROJECT).qemu
|
||||
|
||||
gdb-s: $(PROJECT).qemu
|
||||
POCL_DEBUG=all $(RISCV_TOOLCHAIN_PATH)/bin/qemu-riscv32 -g 1234 -d in_asm -D debug.log $(PROJECT).qemu
|
||||
|
||||
gdb-c: $(PROJECT).qemu
|
||||
$(GDB) $(PROJECT).qemu
|
||||
|
||||
clean:
|
||||
rm -rf *.o *.elf *.dump *.hex *.qemu *.log *.debug
|
||||
include ../common.mk
|
||||
|
||||
@@ -1,6 +1,10 @@
|
||||
#ifndef __COMPUTEQ__
|
||||
#define __COMPUTEQ__
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
void computePhiMag_GPU(int numK,cl_mem phiR_d,cl_mem phiI_d,cl_mem phiMag_d,clPrmtr* clPrm);
|
||||
void computeQ_GPU (int numK,int numX,
|
||||
cl_mem x_d, cl_mem y_d, cl_mem z_d,
|
||||
@@ -11,4 +15,8 @@ void computeQ_GPU (int numK,int numX,
|
||||
void createDataStructsCPU(int numK, int numX, float** phiMag,
|
||||
float** Qr, float** Qi);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
@@ -9,6 +9,10 @@
|
||||
#ifndef __GPUINFOH__
|
||||
#define __GPUINFOH__
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
void compute_active_thread(size_t *thread,
|
||||
size_t *grid,
|
||||
int task,
|
||||
@@ -17,4 +21,8 @@ void compute_active_thread(size_t *thread,
|
||||
int minor,
|
||||
int sm);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
Binary file not shown.
Binary file not shown.
@@ -34,6 +34,27 @@
|
||||
#include "macros.h"
|
||||
#include "computeQ.h"
|
||||
|
||||
static int read_kernel_file(const char* filename, uint8_t** data, size_t* size) {
|
||||
if (nullptr == filename || nullptr == data || 0 == size)
|
||||
return CL_INVALID_VALUE;
|
||||
|
||||
FILE* fp = fopen(filename, "r");
|
||||
if (NULL == fp) {
|
||||
fprintf(stderr, "Failed to load kernel.");
|
||||
return CL_INVALID_VALUE;
|
||||
}
|
||||
fseek(fp , 0 , SEEK_END);
|
||||
long fsize = ftell(fp);
|
||||
rewind(fp);
|
||||
|
||||
*data = (uint8_t*)malloc(fsize);
|
||||
*size = fread(*data, 1, fsize, fp);
|
||||
|
||||
fclose(fp);
|
||||
|
||||
return CL_SUCCESS;
|
||||
}
|
||||
|
||||
static void
|
||||
setupMemoryGPU(int num, int size, cl_mem* dev_ptr, float* host_ptr,clPrmtr* clPrm)
|
||||
{
|
||||
@@ -93,8 +114,6 @@ main (int argc, char *argv[]) {
|
||||
&x, &y, &z,
|
||||
&phiR, &phiI);
|
||||
|
||||
printf("OK\n");
|
||||
|
||||
/* Reduce the number of k-space samples if a number is given
|
||||
* on the command line */
|
||||
if (argc < 2)
|
||||
@@ -137,13 +156,20 @@ main (int argc, char *argv[]) {
|
||||
|
||||
pb_SetOpenCL(&(clPrm.clContext), &(clPrm.clCommandQueue));
|
||||
|
||||
printf("OK\n");
|
||||
|
||||
//const char* clSource[] = {readFile("src/opencl_base/kernels.cl")};
|
||||
//cl_program clProgram = clCreateProgramWithSource(clPrm.clContext,1,clSource,NULL,&clStatus);
|
||||
cl_program clProgram = clCreateProgramWithBuiltInKernels(
|
||||
clPrm.clContext, 1, &clDevice, "ComputePhiMag_GPU;ComputeQ_GPU", &clStatus);
|
||||
#ifdef HOSTGPU
|
||||
const char* clSource[] = {readFile("kernel.cl")};
|
||||
CHECK_ERROR("clCreateProgramWithSource")
|
||||
cl_program clProgram = clCreateProgramWithSource(clPrm.clContext,1,clSource,NULL,&clStatus);
|
||||
#else
|
||||
uint8_t *kernel_bin = NULL;
|
||||
size_t kernel_size;
|
||||
cl_int binary_status = 0;
|
||||
CHECK_ERROR("read_kernel_file")
|
||||
clStatus = read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size);
|
||||
CHECK_ERROR("clCreateProgramWithSource")
|
||||
cl_program clProgram = clCreateProgramWithBinary(
|
||||
clPrm.clContext, 1, &clDevice, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &clStatus);
|
||||
#endif
|
||||
|
||||
char options[50];
|
||||
sprintf(options,"-I src/opencl_nvidia");
|
||||
|
||||
@@ -3,6 +3,10 @@
|
||||
|
||||
#include <stdlib.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
typedef struct {
|
||||
cl_context clContext;
|
||||
cl_command_queue clCommandQueue;
|
||||
@@ -20,4 +24,8 @@ char* readFile(const char*);
|
||||
exit(1); \
|
||||
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
@@ -1,72 +1,7 @@
|
||||
XLEN ?= 32
|
||||
|
||||
LLVM_PREFIX ?= /opt/llvm-riscv
|
||||
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
|
||||
SYSROOT ?= $(RISCV_TOOLCHAIN_PATH)/riscv32-unknown-elf
|
||||
POCL_CC_PATH ?= /opt/pocl/compiler
|
||||
POCL_RT_PATH ?= /opt/pocl/runtime
|
||||
|
||||
OPTS ?= filelist.txt
|
||||
|
||||
VORTEX_DRV_PATH ?= $(realpath ../../../driver)
|
||||
VORTEX_RT_PATH ?= $(realpath ../../../runtime)
|
||||
|
||||
K_LLCFLAGS += "-O3 -march=riscv32 -target-abi=ilp32f -mcpu=generic-rv32 -mattr=+m,+f -mattr=+vortex -float-abi=hard -code-model=small"
|
||||
K_CFLAGS += "-v -O3 --sysroot=$(SYSROOT) --gcc-toolchain=$(RISCV_TOOLCHAIN_PATH) -march=rv32imf -mabi=ilp32f -Xclang -target-feature -Xclang +vortex -I$(VORTEX_RT_PATH)/include -fno-rtti -fno-exceptions -ffreestanding -nostartfiles -fdata-sections -ffunction-sections"
|
||||
K_LDFLAGS += "-Wl,-Bstatic,-T$(VORTEX_RT_PATH)/linker/vx_link$(XLEN).ld -Wl,--gc-sections $(VORTEX_RT_PATH)/libvortexrt.a -lm"
|
||||
|
||||
CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors
|
||||
|
||||
CXXFLAGS += -Wno-deprecated-declarations -Wno-unused-parameter
|
||||
CXXFLAGS += -Wno-unused-variable -Wno-narrowing -Wno-unused-result -Wno-unused-but-set-variable
|
||||
|
||||
CXXFLAGS += -I$(POCL_RT_PATH)/include
|
||||
|
||||
LDFLAGS += -L$(POCL_RT_PATH)/lib -L$(VORTEX_DRV_PATH)/stub -lOpenCL -lvortex
|
||||
|
||||
# Debugging
|
||||
ifdef DEBUG
|
||||
CXXFLAGS += -g -O0
|
||||
else
|
||||
CXXFLAGS += -O2 -DNDEBUG
|
||||
endif
|
||||
|
||||
PROJECT = nearn
|
||||
|
||||
SRCS = main.cc clutils.cpp utils.cpp
|
||||
|
||||
all: $(PROJECT) kernel.pocl
|
||||
OPTS ?= filelist.txt
|
||||
|
||||
kernel.pocl: kernel.cl
|
||||
LLVM_PREFIX=$(LLVM_PREFIX) POCL_DEBUG=all LD_LIBRARY_PATH=$(LLVM_PREFIX)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -LLCFLAGS $(K_LLCFLAGS) -CFLAGS $(K_CFLAGS) -LDFLAGS $(K_LDFLAGS) -o kernel.pocl kernel.cl
|
||||
|
||||
$(PROJECT): $(SRCS)
|
||||
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
|
||||
|
||||
run-fpga: $(PROJECT) kernel.pocl
|
||||
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/fpga:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
|
||||
|
||||
run-asesim: $(PROJECT) kernel.pocl
|
||||
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/asesim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
|
||||
|
||||
run-vlsim: $(PROJECT) kernel.pocl
|
||||
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/vlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
|
||||
|
||||
run-simx: $(PROJECT) kernel.pocl
|
||||
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
|
||||
|
||||
run-rtlsim: $(PROJECT) kernel.pocl
|
||||
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
|
||||
|
||||
.depend: $(SRCS)
|
||||
$(CXX) $(CXXFLAGS) -MM $^ > .depend;
|
||||
|
||||
clean:
|
||||
rm -rf $(PROJECT) *.o .depend
|
||||
|
||||
clean-all: clean
|
||||
rm -rf *.pocl *.dump
|
||||
|
||||
ifneq ($(MAKECMDGOALS),clean)
|
||||
-include .depend
|
||||
endif
|
||||
include ../common.mk
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,4 +1,2 @@
|
||||
cane4_0.db
|
||||
cane4_1.db
|
||||
cane4_2.db
|
||||
cane4_3.db
|
||||
cane4_1.db
|
||||
Binary file not shown.
@@ -172,6 +172,7 @@ float *OpenClFindNearestNeighbors(cl_context context, int numRecords,
|
||||
|
||||
printf("%f\n\n", (float)(totalTime / 1e9));
|
||||
}
|
||||
|
||||
// 6. return finalized data and release buffers
|
||||
clReleaseEvent(writeEvent);
|
||||
clReleaseEvent(kernelEvent);
|
||||
|
||||
@@ -1,71 +1,7 @@
|
||||
XLEN ?= 32
|
||||
|
||||
LLVM_PREFIX ?= /opt/llvm-riscv
|
||||
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
|
||||
SYSROOT ?= $(RISCV_TOOLCHAIN_PATH)/riscv32-unknown-elf
|
||||
POCL_CC_PATH ?= /opt/pocl/compiler
|
||||
POCL_RT_PATH ?= /opt/pocl/runtime
|
||||
|
||||
OPTS ?= -n1
|
||||
|
||||
VORTEX_DRV_PATH ?= $(realpath ../../../driver)
|
||||
VORTEX_RT_PATH ?= $(realpath ../../../runtime)
|
||||
|
||||
K_LLCFLAGS += "-O3 -march=riscv32 -target-abi=ilp32f -mcpu=generic-rv32 -mattr=+m,+f -mattr=+vortex -float-abi=hard -code-model=small"
|
||||
K_CFLAGS += "-v -O3 --sysroot=$(SYSROOT) --gcc-toolchain=$(RISCV_TOOLCHAIN_PATH) -march=rv32imf -mabi=ilp32f -Xclang -target-feature -Xclang +vortex -I$(VORTEX_RT_PATH)/include -fno-rtti -fno-exceptions -ffreestanding -nostartfiles -fdata-sections -ffunction-sections"
|
||||
K_LDFLAGS += "-Wl,-Bstatic,-T$(VORTEX_RT_PATH)/linker/vx_link$(XLEN).ld -Wl,--gc-sections $(VORTEX_RT_PATH)/libvortexrt.a -lm"
|
||||
|
||||
CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors
|
||||
|
||||
CXXFLAGS += -Wno-deprecated-declarations -Wno-unused-parameter -Wno-narrowing
|
||||
|
||||
CXXFLAGS += -I$(POCL_RT_PATH)/include
|
||||
|
||||
LDFLAGS += -L$(POCL_RT_PATH)/lib -L$(VORTEX_DRV_PATH)/stub -lOpenCL -lvortex
|
||||
|
||||
# Debugging
|
||||
ifdef DEBUG
|
||||
CXXFLAGS += -g -O0
|
||||
else
|
||||
CXXFLAGS += -O2 -DNDEBUG
|
||||
endif
|
||||
|
||||
PROJECT = oclprintf
|
||||
|
||||
SRCS = main.cc
|
||||
|
||||
all: $(PROJECT) kernel.pocl
|
||||
OPTS ?= -n1
|
||||
|
||||
kernel.pocl: kernel.cl
|
||||
LLVM_PREFIX=$(LLVM_PREFIX) POCL_DEBUG=all LD_LIBRARY_PATH=$(LLVM_PREFIX)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -LLCFLAGS $(K_LLCFLAGS) -CFLAGS $(K_CFLAGS) -LDFLAGS $(K_LDFLAGS) -o kernel.pocl kernel.cl
|
||||
|
||||
$(PROJECT): $(SRCS)
|
||||
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
|
||||
|
||||
run-fpga: $(PROJECT) kernel.pocl
|
||||
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/fpga:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
|
||||
|
||||
run-asesim: $(PROJECT) kernel.pocl
|
||||
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/asesim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
|
||||
|
||||
run-vlsim: $(PROJECT) kernel.pocl
|
||||
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/vlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
|
||||
|
||||
run-simx: $(PROJECT) kernel.pocl
|
||||
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
|
||||
|
||||
run-rtlsim: $(PROJECT) kernel.pocl
|
||||
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
|
||||
|
||||
.depend: $(SRCS)
|
||||
$(CXX) $(CXXFLAGS) -MM $^ > .depend;
|
||||
|
||||
clean:
|
||||
rm -rf $(PROJECT) *.o .depend
|
||||
|
||||
clean-all: clean
|
||||
rm -rf *.pocl *.dump
|
||||
|
||||
ifneq ($(MAKECMDGOALS),clean)
|
||||
-include .depend
|
||||
endif
|
||||
include ../common.mk
|
||||
|
||||
Binary file not shown.
File diff suppressed because it is too large
Load Diff
@@ -1,71 +1,7 @@
|
||||
XLEN ?= 32
|
||||
|
||||
LLVM_PREFIX ?= /opt/llvm-riscv
|
||||
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
|
||||
SYSROOT ?= $(RISCV_TOOLCHAIN_PATH)/riscv32-unknown-elf
|
||||
POCL_CC_PATH ?= /opt/pocl/compiler
|
||||
POCL_RT_PATH ?= /opt/pocl/runtime
|
||||
|
||||
OPTS ?= -f -n16
|
||||
|
||||
VORTEX_DRV_PATH ?= $(realpath ../../../driver)
|
||||
VORTEX_RT_PATH ?= $(realpath ../../../runtime)
|
||||
|
||||
K_LLCFLAGS += "-O3 -march=riscv32 -target-abi=ilp32f -mcpu=generic-rv32 -mattr=+m,+f -mattr=+vortex -float-abi=hard -code-model=small"
|
||||
K_CFLAGS += "-v -O3 --sysroot=$(SYSROOT) --gcc-toolchain=$(RISCV_TOOLCHAIN_PATH) -march=rv32imf -mabi=ilp32f -Xclang -target-feature -Xclang +vortex -I$(VORTEX_RT_PATH)/include -fno-rtti -fno-exceptions -ffreestanding -nostartfiles -fdata-sections -ffunction-sections"
|
||||
K_LDFLAGS += "-Wl,-Bstatic,-T$(VORTEX_RT_PATH)/linker/vx_link$(XLEN).ld -Wl,--gc-sections $(VORTEX_RT_PATH)/libvortexrt.a -lm"
|
||||
|
||||
CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors
|
||||
|
||||
CXXFLAGS += -Wno-deprecated-declarations -Wno-unused-parameter -Wno-strict-aliasing -Wno-narrowing
|
||||
|
||||
CXXFLAGS += -I$(POCL_RT_PATH)/include
|
||||
|
||||
LDFLAGS += -L$(POCL_RT_PATH)/lib -L$(VORTEX_DRV_PATH)/stub -lOpenCL -lvortex
|
||||
|
||||
# Debugging
|
||||
ifdef DEBUG
|
||||
CXXFLAGS += -g -O0
|
||||
else
|
||||
CXXFLAGS += -O2 -DNDEBUG
|
||||
endif
|
||||
|
||||
PROJECT = psort
|
||||
|
||||
SRCS = main.cc
|
||||
|
||||
all: $(PROJECT) kernel.pocl
|
||||
OPTS ?= -f -n16
|
||||
|
||||
kernel.pocl: kernel.cl
|
||||
LLVM_PREFIX=$(LLVM_PREFIX) POCL_DEBUG=all LD_LIBRARY_PATH=$(LLVM_PREFIX)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -LLCFLAGS $(K_LLCFLAGS) -CFLAGS $(K_CFLAGS) -LDFLAGS $(K_LDFLAGS) -o kernel.pocl kernel.cl
|
||||
|
||||
$(PROJECT): $(SRCS)
|
||||
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
|
||||
|
||||
run-fpga: $(PROJECT) kernel.pocl
|
||||
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/fpga:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
|
||||
|
||||
run-asesim: $(PROJECT) kernel.pocl
|
||||
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/asesim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
|
||||
|
||||
run-vlsim: $(PROJECT) kernel.pocl
|
||||
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/vlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
|
||||
|
||||
run-simx: $(PROJECT) kernel.pocl
|
||||
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
|
||||
|
||||
run-rtlsim: $(PROJECT) kernel.pocl
|
||||
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
|
||||
|
||||
.depend: $(SRCS)
|
||||
$(CXX) $(CXXFLAGS) -MM $^ > .depend;
|
||||
|
||||
clean:
|
||||
rm -rf $(PROJECT) *.o .depend
|
||||
|
||||
clean-all: clean
|
||||
rm -rf *.pocl *.dump
|
||||
|
||||
ifneq ($(MAKECMDGOALS),clean)
|
||||
-include .depend
|
||||
endif
|
||||
include ../common.mk
|
||||
|
||||
Binary file not shown.
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -1,67 +0,0 @@
|
||||
XLEN ?= 32
|
||||
|
||||
RISCV_TOOLCHAIN_PATH ?= $(wildcard ../../../../riscv-gnu-toolchain/drops)
|
||||
POCL_CC_PATH ?= $(wildcard ../../../../pocl/drops_riscv_cc)
|
||||
POCL_INC_PATH ?= $(wildcard ../include)
|
||||
POCL_LIB_PATH ?= $(wildcard ../lib)
|
||||
VORTEX_RT_PATH ?= $(wildcard ../../../runtime)
|
||||
VX_SIMX_PATH ?= $(wildcard ../../../simx/obj_dir)
|
||||
|
||||
CC = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gcc
|
||||
CXX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-g++
|
||||
DMP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objdump
|
||||
HEX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objcopy
|
||||
GDB = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gdb
|
||||
|
||||
VX_SRCS = $(VORTEX_RT_PATH)/newlib/newlib.c
|
||||
VX_SRCS += $(VORTEX_RT_PATH)/startup/vx_start.S
|
||||
VX_SRCS += $(VORTEX_RT_PATH)/intrinsics/vx_intrinsics.S
|
||||
VX_SRCS += $(VORTEX_RT_PATH)/io/vx_io.S $(VORTEX_RT_PATH)/io/vx_io.c
|
||||
VX_SRCS += $(VORTEX_RT_PATH)/fileio/fileio.S
|
||||
VX_SRCS += $(VORTEX_RT_PATH)/tests/tests.c
|
||||
VX_SRCS += $(VORTEX_RT_PATH)/vx_api/vx_api.c
|
||||
|
||||
VX_CFLAGS = -nostartfiles -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/startup/vx_link$(XLEN).ld
|
||||
|
||||
CXXFLAGS = -g -O0 -march=rv32im -mabi=ilp32
|
||||
CXXFLAGS += -ffreestanding # program may not begin at main()
|
||||
CXXFLAGS += -Wl,--gc-sections # enable garbage collection of unused input sections
|
||||
CXXFLAGS += -fno-rtti -fno-non-call-exceptions # disable RTTI and exceptions
|
||||
CXXFLAGS += -I$(POCL_INC_PATH) -I.
|
||||
|
||||
VX_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
|
||||
QEMU_LIBS = $(VORTEX_RT_PATH)/qemu/vx_api.c -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
|
||||
|
||||
PROJECT=reduce0
|
||||
|
||||
all: $(PROJECT).dump $(PROJECT).hex
|
||||
|
||||
lib$(PROJECT).a: oclReduction_kernel.cl
|
||||
POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOLCHAIN_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o lib$(PROJECT).a kernel.cl
|
||||
|
||||
$(PROJECT).elf: main.cc lib$(PROJECT).a
|
||||
$(CXX) $(CXXFLAGS) $(VX_CFLAGS) $(VX_SRCS) main.cc $(VX_LIBS) -o $(PROJECT).elf
|
||||
|
||||
$(PROJECT).qemu: main.cc lib$(PROJECT).a
|
||||
$(CXX) $(CXXFLAGS) main.cc $(QEMU_LIBS) -o $(PROJECT).qemu
|
||||
|
||||
$(PROJECT).hex: $(PROJECT).elf
|
||||
$(HEX) -O ihex $(PROJECT).elf $(PROJECT).hex
|
||||
|
||||
$(PROJECT).dump: $(PROJECT).elf
|
||||
$(DMP) -D $(PROJECT).elf > $(PROJECT).dump
|
||||
|
||||
run: $(PROJECT).hex
|
||||
POCL_DEBUG=all $(VX_SIMX_PATH)/Vcache_simX -E $(PROJECT).hex -s -b 1> emulator.debug
|
||||
|
||||
qemu: $(PROJECT).qemu
|
||||
POCL_DEBUG=all $(RISCV_TOOLCHAIN_PATH)/bin/qemu-riscv32 -d in_asm -D debug.log $(PROJECT).qemu
|
||||
|
||||
gdb-s: $(PROJECT).qemu
|
||||
POCL_DEBUG=all $(RISCV_TOOLCHAIN_PATH)/bin/qemu-riscv32 -g 1234 -d in_asm -D debug.log $(PROJECT).qemu
|
||||
|
||||
gdb-c: $(PROJECT).qemu
|
||||
$(GDB) $(PROJECT).qemu
|
||||
|
||||
clean:
|
||||
rm -rf *.elf *.dump *.hex
|
||||
@@ -1,638 +0,0 @@
|
||||
/*
|
||||
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||
* with this source code for terms and conditions that govern your use of
|
||||
* this software. Any use, reproduction, disclosure, or distribution of
|
||||
* this software and related documentation outside the terms of the EULA
|
||||
* is strictly prohibited.
|
||||
*
|
||||
*/
|
||||
|
||||
/*
|
||||
Parallel reduction
|
||||
|
||||
This sample shows how to perform a reduction operation on an array of values
|
||||
to produce a single value.
|
||||
|
||||
Reductions are a very common computation in parallel algorithms. Any time
|
||||
an array of values needs to be reduced to a single value using a binary
|
||||
associative operator, a reduction can be used. Example applications include
|
||||
statistics computations such as mean and standard deviation, and image
|
||||
processing applications such as finding the total luminance of an
|
||||
image.
|
||||
|
||||
This code performs sum reductions, but any associative operator such as
|
||||
min() or max() could also be used.
|
||||
|
||||
It assumes the input size is a power of 2.
|
||||
|
||||
COMMAND LINE ARGUMENTS
|
||||
|
||||
"--shmoo": Test performance for 1 to 32M elements with each of the 7 different kernels
|
||||
"--n=<N>": Specify the number of elements to reduce (default 1048576)
|
||||
"--threads=<N>": Specify the number of threads per block (default 128)
|
||||
"--kernel=<N>": Specify which kernel to run (0-6, default 6)
|
||||
"--maxblocks=<N>": Specify the maximum number of thread blocks to launch (kernel 6 only, default 64)
|
||||
"--cpufinal": Read back the per-block results and do final sum of block sums on CPU (default false)
|
||||
"--cputhresh=<N>": The threshold of number of blocks sums below which to perform a CPU final reduction (default 1)
|
||||
|
||||
*/
|
||||
|
||||
// Common system and utility includes
|
||||
#include <oclUtils.h>
|
||||
#include <shrQATest.h>
|
||||
|
||||
// additional includes
|
||||
#include <sstream>
|
||||
#include <oclReduction.h>
|
||||
|
||||
// Forward declarations and sample-specific defines
|
||||
// *********************************************************************
|
||||
// Element type of the array being reduced.
enum ReduceType { REDUCE_INT, REDUCE_FLOAT, REDUCE_DOUBLE };
|
||||
|
||||
template <class T>
|
||||
bool runTest( int argc, const char** argv, ReduceType datatype);
|
||||
|
||||
#define MAX_BLOCK_DIM_SIZE 65535
|
||||
|
||||
// True when x has at most one bit set. Note that this also holds for x == 0.
extern "C"
bool isPow2(unsigned int x)
{
    // Clearing the lowest set bit leaves nothing behind only for 0 and powers of two.
    const unsigned int withoutLowestBit = x & (x - 1);
    return withoutLowestBit == 0;
}
|
||||
|
||||
cl_kernel getReductionKernel(ReduceType datatype, int whichKernel, int blockSize, int isPowOf2);
|
||||
|
||||
// Main function
|
||||
// *********************************************************************
|
||||
int main( int argc, const char** argv)
|
||||
{
|
||||
shrQAStart(argc, (char **)argv);
|
||||
|
||||
// start logs
|
||||
shrSetLogFileName ("oclReduction.txt");
|
||||
shrLog("%s Starting...\n\n", argv[0]);
|
||||
|
||||
char *typeChoice;
|
||||
shrGetCmdLineArgumentstr(argc, argv, "type", &typeChoice);
|
||||
|
||||
// determine type of array from command line args
|
||||
if (0 == typeChoice)
|
||||
{
|
||||
typeChoice = (char*)malloc(7 * sizeof(char));
|
||||
#ifdef WIN32
|
||||
strcpy_s(typeChoice, 7 * sizeof(char) + 1, "int");
|
||||
#else
|
||||
strcpy(typeChoice, "int");
|
||||
#endif
|
||||
}
|
||||
ReduceType datatype = REDUCE_INT;
|
||||
|
||||
#ifdef WIN32
|
||||
if (!_strcmpi(typeChoice, "float"))
|
||||
datatype = REDUCE_FLOAT;
|
||||
else if (!_strcmpi(typeChoice, "double"))
|
||||
datatype = REDUCE_DOUBLE;
|
||||
else
|
||||
datatype = REDUCE_INT;
|
||||
#else
|
||||
if (!strcmp(typeChoice, "float"))
|
||||
datatype = REDUCE_FLOAT;
|
||||
else if (!strcmp(typeChoice, "double"))
|
||||
datatype = REDUCE_DOUBLE;
|
||||
else
|
||||
datatype = REDUCE_INT;
|
||||
#endif
|
||||
|
||||
shrLog("Reducing array of type %s.\n", typeChoice);
|
||||
|
||||
//Get the NVIDIA platform
|
||||
ciErrNum = oclGetPlatformID(&cpPlatform);
|
||||
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||
|
||||
//Get the devices
|
||||
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, 0, NULL, &uiNumDevices);
|
||||
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||
cl_device_id *cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) );
|
||||
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, uiNumDevices, cdDevices, NULL);
|
||||
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||
|
||||
//Create the context
|
||||
cxGPUContext = clCreateContext(0, uiNumDevices, cdDevices, NULL, NULL, &ciErrNum);
|
||||
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||
|
||||
// get and log the device info
|
||||
if( shrCheckCmdLineFlag(argc, (const char**)argv, "device") ) {
|
||||
int device_nr = 0;
|
||||
shrGetCmdLineArgumenti(argc, (const char**)argv, "device", &device_nr);
|
||||
if( device_nr < uiNumDevices ) {
|
||||
device = oclGetDev(cxGPUContext, device_nr);
|
||||
} else {
|
||||
shrLog("Invalid Device %d Requested.\n", device_nr);
|
||||
shrExitEX(argc, argv, EXIT_FAILURE);
|
||||
}
|
||||
} else {
|
||||
device = oclGetMaxFlopsDev(cxGPUContext);
|
||||
}
|
||||
oclPrintDevName(LOGBOTH, device);
|
||||
shrLog("\n");
|
||||
|
||||
// create a command-queue
|
||||
cqCommandQueue = clCreateCommandQueue(cxGPUContext, device, 0, &ciErrNum);
|
||||
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||
|
||||
source_path = shrFindFilePath("oclReduction_kernel.cl", argv[0]);
|
||||
|
||||
bool bSuccess = false;
|
||||
switch (datatype)
|
||||
{
|
||||
default:
|
||||
case REDUCE_INT:
|
||||
bSuccess = runTest<int>( argc, argv, datatype);
|
||||
break;
|
||||
case REDUCE_FLOAT:
|
||||
bSuccess = runTest<float>( argc, argv, datatype);
|
||||
break;
|
||||
}
|
||||
|
||||
// finish
|
||||
shrQAFinishExit(argc, (const char **)argv, bSuccess ? QA_PASSED : QA_FAILED);
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Compute sum reduction on CPU
|
||||
//! We use Kahan summation for an accurate sum of large arrays.
|
||||
//! http://en.wikipedia.org/wiki/Kahan_summation_algorithm
|
||||
//!
|
||||
//! @param data pointer to input data
|
||||
//! @param size number of input data elements
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Host reference sum of data[0..size-1] using compensated (Kahan) summation.
// data[0] is read unconditionally, so the caller must pass size >= 1.
template<class T>
T reduceCPU(T *data, int size)
{
    T total = data[0];
    T compensation = (T)0.0;   // running estimate of the lost low-order part

    int idx = 1;
    while (idx < size)
    {
        const T corrected = data[idx] - compensation;
        const T next = total + corrected;
        compensation = (next - total) - corrected;
        total = next;
        ++idx;
    }
    return total;
}
|
||||
|
||||
// Rounds x up to the next power of two; powers of two map to themselves.
// x == 0 yields 0, because the decrement and the final increment both wrap.
unsigned int nextPow2( unsigned int x ) {
    x -= 1;
    // Smear the highest set bit into every lower position (shifts 1, 2, 4, 8, 16).
    for (unsigned int shift = 1; shift <= 16; shift <<= 1)
    {
        x |= x >> shift;
    }
    x += 1;
    return x;
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Compute the number of threads and blocks to use for the given reduction kernel
|
||||
// For the kernels >= 3, we set threads / block to the minimum of maxThreads and
|
||||
// n/2. For kernels < 3, we set to the minimum of maxThreads and n. For kernel
|
||||
// 6, we observe the maximum specified number of blocks, because each thread in
|
||||
// that kernel can process a variable number of elements.
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
void getNumBlocksAndThreads(int whichKernel, int n, int maxBlocks, int maxThreads, int &blocks, int &threads)
|
||||
{
|
||||
if (whichKernel < 3)
|
||||
{
|
||||
threads = (n < maxThreads) ? nextPow2(n) : maxThreads;
|
||||
blocks = (n + threads - 1) / threads;
|
||||
}
|
||||
else
|
||||
{
|
||||
threads = (n < maxThreads*2) ? nextPow2((n + 1)/ 2) : maxThreads;
|
||||
blocks = (n + (threads * 2 - 1)) / (threads * 2);
|
||||
}
|
||||
|
||||
|
||||
if (whichKernel == 6)
|
||||
blocks = MIN(maxBlocks, blocks);
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// This function performs a reduction of the input data multiple times and
|
||||
// measures the average reduction time.
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
template <class T>
|
||||
T profileReduce(ReduceType datatype,
|
||||
cl_int n,
|
||||
int numThreads,
|
||||
int numBlocks,
|
||||
int maxThreads,
|
||||
int maxBlocks,
|
||||
int whichKernel,
|
||||
int testIterations,
|
||||
bool cpuFinalReduction,
|
||||
int cpuFinalThreshold,
|
||||
double* dTotalTime,
|
||||
T* h_odata,
|
||||
cl_mem d_idata,
|
||||
cl_mem d_odata)
|
||||
{
|
||||
|
||||
|
||||
T gpu_result = 0;
|
||||
bool needReadBack = true;
|
||||
cl_kernel finalReductionKernel[10];
|
||||
int finalReductionIterations=0;
|
||||
|
||||
//shrLog("Profile Kernel %d\n", whichKernel);
|
||||
|
||||
cl_kernel reductionKernel = getReductionKernel(datatype, whichKernel, numThreads, isPow2(n) );
|
||||
clSetKernelArg(reductionKernel, 0, sizeof(cl_mem), (void *) &d_idata);
|
||||
clSetKernelArg(reductionKernel, 1, sizeof(cl_mem), (void *) &d_odata);
|
||||
clSetKernelArg(reductionKernel, 2, sizeof(cl_int), &n);
|
||||
clSetKernelArg(reductionKernel, 3, sizeof(T) * numThreads, NULL);
|
||||
|
||||
if( !cpuFinalReduction ) {
|
||||
int s=numBlocks;
|
||||
int threads = 0, blocks = 0;
|
||||
int kernel = (whichKernel == 6) ? 5 : whichKernel;
|
||||
|
||||
while(s > cpuFinalThreshold)
|
||||
{
|
||||
getNumBlocksAndThreads(kernel, s, maxBlocks, maxThreads, blocks, threads);
|
||||
|
||||
finalReductionKernel[finalReductionIterations] = getReductionKernel(datatype, kernel, threads, isPow2(s) );
|
||||
clSetKernelArg(finalReductionKernel[finalReductionIterations], 0, sizeof(cl_mem), (void *) &d_odata);
|
||||
clSetKernelArg(finalReductionKernel[finalReductionIterations], 1, sizeof(cl_mem), (void *) &d_odata);
|
||||
clSetKernelArg(finalReductionKernel[finalReductionIterations], 2, sizeof(cl_int), &n);
|
||||
clSetKernelArg(finalReductionKernel[finalReductionIterations], 3, sizeof(T) * numThreads, NULL);
|
||||
|
||||
if (kernel < 3)
|
||||
s = (s + threads - 1) / threads;
|
||||
else
|
||||
s = (s + (threads*2-1)) / (threads*2);
|
||||
|
||||
finalReductionIterations++;
|
||||
}
|
||||
}
|
||||
|
||||
size_t globalWorkSize[1];
|
||||
size_t localWorkSize[1];
|
||||
|
||||
for (int i = 0; i < testIterations; ++i)
|
||||
{
|
||||
gpu_result = 0;
|
||||
|
||||
clFinish(cqCommandQueue);
|
||||
if(i>0) shrDeltaT(1);
|
||||
|
||||
// execute the kernel
|
||||
globalWorkSize[0] = numBlocks * numThreads;
|
||||
localWorkSize[0] = numThreads;
|
||||
|
||||
ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue,reductionKernel, 1, 0, globalWorkSize, localWorkSize,
|
||||
0, NULL, NULL);
|
||||
|
||||
// check if kernel execution generated an error
|
||||
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||
|
||||
if (cpuFinalReduction)
|
||||
{
|
||||
// sum partial sums from each block on CPU
|
||||
// copy result from device to host
|
||||
clEnqueueReadBuffer(cqCommandQueue, d_odata, CL_TRUE, 0, numBlocks * sizeof(T),
|
||||
h_odata, 0, NULL, NULL);
|
||||
|
||||
for(int i=0; i<numBlocks; i++)
|
||||
{
|
||||
gpu_result += h_odata[i];
|
||||
}
|
||||
|
||||
needReadBack = false;
|
||||
}
|
||||
else
|
||||
{
|
||||
// sum partial block sums on GPU
|
||||
int s=numBlocks;
|
||||
int kernel = (whichKernel == 6) ? 5 : whichKernel;
|
||||
int it = 0;
|
||||
|
||||
|
||||
while(s > cpuFinalThreshold)
|
||||
{
|
||||
int threads = 0, blocks = 0;
|
||||
getNumBlocksAndThreads(kernel, s, maxBlocks, maxThreads, blocks, threads);
|
||||
|
||||
globalWorkSize[0] = threads * blocks;
|
||||
localWorkSize[0] = threads;
|
||||
|
||||
ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, finalReductionKernel[it], 1, 0,
|
||||
globalWorkSize, localWorkSize, 0, NULL, NULL);
|
||||
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||
|
||||
if (kernel < 3)
|
||||
s = (s + threads - 1) / threads;
|
||||
else
|
||||
s = (s + (threads*2-1)) / (threads*2);
|
||||
|
||||
it++;
|
||||
}
|
||||
|
||||
if (s > 1)
|
||||
{
|
||||
// copy result from device to host
|
||||
clEnqueueReadBuffer(cqCommandQueue, d_odata, CL_TRUE, 0, s * sizeof(T),
|
||||
h_odata, 0, NULL, NULL);
|
||||
|
||||
for(int i=0; i < s; i++)
|
||||
{
|
||||
gpu_result += h_odata[i];
|
||||
}
|
||||
|
||||
needReadBack = false;
|
||||
}
|
||||
}
|
||||
|
||||
clFinish(cqCommandQueue);
|
||||
if(i>0) *dTotalTime += shrDeltaT(1);
|
||||
}
|
||||
|
||||
if (needReadBack)
|
||||
{
|
||||
// copy final sum from device to host
|
||||
clEnqueueReadBuffer(cqCommandQueue, d_odata, CL_TRUE, 0, sizeof(T),
|
||||
&gpu_result, 0, NULL, NULL);
|
||||
}
|
||||
|
||||
// Release the kernels
|
||||
clReleaseKernel(reductionKernel);
|
||||
if( !cpuFinalReduction ) {
|
||||
for(int it=0; it<finalReductionIterations; ++it) {
|
||||
clReleaseKernel(finalReductionKernel[it]);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
return gpu_result;
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// This function calls profileReduce multiple times for a range of array sizes
|
||||
// and prints a report in CSV (comma-separated value) format that can be used for
|
||||
// generating a "shmoo" plot showing the performance for each kernel variation
|
||||
// over a wide range of input sizes.
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
template <class T>
|
||||
void shmoo(int minN, int maxN, int maxThreads, int maxBlocks, ReduceType datatype)
|
||||
{
|
||||
// create random input data on CPU
|
||||
unsigned int bytes = maxN * sizeof(T);
|
||||
|
||||
T* h_idata = (T*)malloc(bytes);
|
||||
|
||||
for(int i = 0; i < maxN; i++) {
|
||||
// Keep the numbers small so we don't get truncation error in the sum
|
||||
if (datatype == REDUCE_INT)
|
||||
h_idata[i] = (T)(rand() & 0xFF);
|
||||
else
|
||||
h_idata[i] = (rand() & 0xFF) / (T)RAND_MAX;
|
||||
}
|
||||
|
||||
int maxNumBlocks = MIN( maxN / maxThreads, MAX_BLOCK_DIM_SIZE);
|
||||
|
||||
// allocate mem for the result on host side
|
||||
T* h_odata = (T*) malloc(maxNumBlocks*sizeof(T));
|
||||
|
||||
// allocate device memory and data
|
||||
cl_mem d_idata = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, bytes, h_idata, NULL);
|
||||
cl_mem d_odata = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, maxNumBlocks * sizeof(T), NULL, NULL);
|
||||
|
||||
int testIterations = 100;
|
||||
double dTotalTime = 0.0;
|
||||
|
||||
// print headers
|
||||
shrLog("Time in seconds for various numbers of elements for each kernel\n");
|
||||
shrLog("\n\n");
|
||||
shrLog("Kernel");
|
||||
for (int i = minN; i <= maxN; i *= 2)
|
||||
{
|
||||
shrLog(", %d", i);
|
||||
}
|
||||
|
||||
for (int kernel = 0; kernel < 7; kernel++)
|
||||
{
|
||||
shrLog("\n");
|
||||
shrLog("%d", kernel);
|
||||
for (int i = minN; i <= maxN; i *= 2)
|
||||
{
|
||||
int numBlocks = 0;
|
||||
int numThreads = 0;
|
||||
getNumBlocksAndThreads(kernel, i, maxBlocks, maxThreads, numBlocks, numThreads);
|
||||
|
||||
double reduceTime;
|
||||
if( numBlocks <= MAX_BLOCK_DIM_SIZE ) {
|
||||
profileReduce(datatype, i, numThreads, numBlocks, maxThreads, maxBlocks, kernel,
|
||||
testIterations, false, 1, &dTotalTime, h_odata, d_idata, d_odata);
|
||||
reduceTime = dTotalTime/(double)testIterations;
|
||||
} else {
|
||||
reduceTime = -1.0;
|
||||
}
|
||||
shrLog(", %.4f m", reduceTime);
|
||||
}
|
||||
}
|
||||
|
||||
// cleanup
|
||||
free(h_idata);
|
||||
free(h_odata);
|
||||
clReleaseMemObject(d_idata);
|
||||
clReleaseMemObject(d_odata);
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// The main function which runs the reduction test.
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
template <class T>
|
||||
bool
|
||||
runTest( int argc, const char** argv, ReduceType datatype)
|
||||
{
|
||||
int size = 1<<24; // number of elements to reduce
|
||||
int maxThreads;
|
||||
|
||||
cl_kernel reductionKernel = getReductionKernel(datatype, 0, 64, 1);
|
||||
clReleaseKernel(reductionKernel);
|
||||
|
||||
if (smallBlock)
|
||||
maxThreads = 64; // number of threads per block
|
||||
else
|
||||
maxThreads = 128;
|
||||
|
||||
int whichKernel = 6;
|
||||
int maxBlocks = 64;
|
||||
bool cpuFinalReduction = false;
|
||||
int cpuFinalThreshold = 1;
|
||||
|
||||
shrGetCmdLineArgumenti( argc, (const char**) argv, "n", &size);
|
||||
shrGetCmdLineArgumenti( argc, (const char**) argv, "threads", &maxThreads);
|
||||
shrGetCmdLineArgumenti( argc, (const char**) argv, "kernel", &whichKernel);
|
||||
shrGetCmdLineArgumenti( argc, (const char**) argv, "maxblocks", &maxBlocks);
|
||||
|
||||
shrLog(" %d elements\n", size);
|
||||
shrLog(" %d threads (max)\n", maxThreads);
|
||||
|
||||
cpuFinalReduction = (shrCheckCmdLineFlag( argc, (const char**) argv, "cpufinal") == shrTRUE);
|
||||
shrGetCmdLineArgumenti( argc, (const char**) argv, "cputhresh", &cpuFinalThreshold);
|
||||
|
||||
bool runShmoo = (shrCheckCmdLineFlag(argc, (const char**) argv, "shmoo") == shrTRUE);
|
||||
|
||||
#ifdef GPU_PROFILING
|
||||
if (runShmoo)
|
||||
{
|
||||
shmoo<T>(1, 33554432, maxThreads, maxBlocks, datatype);
|
||||
return true;
|
||||
}
|
||||
else
|
||||
#endif
|
||||
{
|
||||
// create random input data on CPU
|
||||
unsigned int bytes = size * sizeof(T);
|
||||
T* h_idata = (T*)malloc(bytes);
|
||||
|
||||
for(int i=0; i<size; i++)
|
||||
{
|
||||
// Keep the numbers small so we don't get truncation error in the sum
|
||||
if (datatype == REDUCE_INT)
|
||||
h_idata[i] = (T)(rand() & 0xFF);
|
||||
else
|
||||
h_idata[i] = (rand() & 0xFF) / (T)RAND_MAX;
|
||||
}
|
||||
|
||||
int numBlocks = 0;
|
||||
int numThreads = 0;
|
||||
getNumBlocksAndThreads(whichKernel, size, maxBlocks, maxThreads, numBlocks, numThreads);
|
||||
if (numBlocks == 1) cpuFinalThreshold = 1;
|
||||
shrLog(" %d blocks\n\n", numBlocks);
|
||||
|
||||
// allocate mem for the result on host side
|
||||
T* h_odata = (T*)malloc(numBlocks * sizeof(T));
|
||||
|
||||
// allocate device memory and data
|
||||
cl_mem d_idata = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, bytes, h_idata, NULL);
|
||||
cl_mem d_odata = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, numBlocks * sizeof(T), NULL, NULL);
|
||||
|
||||
int testIterations = 100;
|
||||
double dTotalTime = 0.0;
|
||||
T gpu_result = 0;
|
||||
gpu_result = profileReduce<T>(datatype, size, numThreads, numBlocks, maxThreads, maxBlocks,
|
||||
whichKernel, testIterations, cpuFinalReduction,
|
||||
cpuFinalThreshold, &dTotalTime,
|
||||
h_odata, d_idata, d_odata);
|
||||
|
||||
#ifdef GPU_PROFILING
|
||||
double reduceTime = dTotalTime/(double)testIterations;
|
||||
shrLogEx(LOGBOTH | MASTER, 0, "oclReduction, Throughput = %.4f GB/s, Time = %.5f s, Size = %u Elements, NumDevsUsed = %d, Workgroup = %u\n",
|
||||
1.0e-9 * ((double)bytes)/reduceTime, reduceTime, size, 1, numThreads);
|
||||
#endif
|
||||
|
||||
// compute reference solution
|
||||
shrLog("\nComparing against Host/C++ computation...\n");
|
||||
T cpu_result = reduceCPU<T>(h_idata, size);
|
||||
if (datatype == REDUCE_INT)
|
||||
{
|
||||
shrLog(" GPU result = %d\n", gpu_result);
|
||||
shrLog(" CPU result = %d\n\n", cpu_result);
|
||||
shrLog("%s\n\n", (gpu_result == cpu_result) ? "PASSED" : "FAILED");
|
||||
}
|
||||
else
|
||||
{
|
||||
shrLog(" GPU result = %.9f\n", gpu_result);
|
||||
shrLog(" CPU result = %.9f\n\n", cpu_result);
|
||||
|
||||
double threshold = (datatype == REDUCE_FLOAT) ? 1e-8 * size : 1e-12;
|
||||
double diff = abs((double)gpu_result - (double)cpu_result);
|
||||
shrLog("%s\n\n", (diff < threshold) ? "PASSED" : "FAILED");
|
||||
}
|
||||
|
||||
// cleanup
|
||||
free(h_idata);
|
||||
free(h_odata);
|
||||
clReleaseMemObject(d_idata);
|
||||
clReleaseMemObject(d_odata);
|
||||
|
||||
return (gpu_result == cpu_result);
|
||||
}
|
||||
}
|
||||
|
||||
// Helper function to create and build program and kernel
|
||||
// *********************************************************************
|
||||
// Creates the kernel "reduce<whichKernel>".
//
//   datatype   selects the "#define T ..." line of the preamble
//   blockSize  emitted as "#define blockSize ..."
//   isPowOf2   emitted as "#define nIsPow2 ..."
//
// Side effect: sets the global smallBlock to true when the kernel's reported
// work-group size is exactly 64, false otherwise.
// The returned kernel must be released by the caller with clReleaseKernel.
cl_kernel getReductionKernel(ReduceType datatype, int whichKernel, int blockSize, int isPowOf2)
{
    // compile cl program
    size_t program_length;
    char *source;

    std::ostringstream preamble;

    // create the program
    // with type specification depending on datatype argument
    switch (datatype)
    {
    default:
    case REDUCE_INT:
        preamble << "#define T int" << std::endl;
        break;
    case REDUCE_FLOAT:
        preamble << "#define T float" << std::endl;
        break;
    }

    // set blockSize at compile time
    preamble << "#define blockSize " << blockSize << std::endl;

    // set isPow2 at compile time
    preamble << "#define nIsPow2 " << isPowOf2 << std::endl;

    // Load the source code and prepend the preamble
    source = oclLoadProgSource(source_path, preamble.str().c_str(), &program_length);
    //oclCheckError(source != NULL, shrTRUE);

    // The program is declared here and built from the file's own globals
    // (cxGPUContext, device); the previous code referred to `program`,
    // `context`, `device_id` and `rv_program`, none of which is declared in
    // the visible sources.
    // NOTE(review): the built-in kernel list is always "reduce0" while the
    // kernel requested below is "reduce<whichKernel>" -- confirm the runtime
    // resolves the other kernel names from this program.
    cl_program rv_program =
        clCreateProgramWithBuiltInKernels(cxGPUContext, 1, &device, "reduce0", &ciErrNum);
    //oclCheckError(ciErrNum, CL_SUCCESS);
    free(source);

    // build the program
    ciErrNum = clBuildProgram(rv_program, 0, NULL, "-cl-fast-relaxed-math", NULL, NULL);
    if (ciErrNum != CL_SUCCESS)
    {
        // write out standard error, Build Log and PTX, then cleanup and exit
        shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
        oclLogBuildInfo(rv_program, oclGetFirstDev(cxGPUContext));
        oclLogPtx(rv_program, oclGetFirstDev(cxGPUContext), "oclReduction.ptx");
        //oclCheckError(ciErrNum, CL_SUCCESS);
    }

    // create Kernel
    std::ostringstream kernelName;
    kernelName << "reduce" << whichKernel;
    cl_kernel ckKernel = clCreateKernel(rv_program, kernelName.str().c_str(), &ciErrNum);
    //oclCheckError(ciErrNum, CL_SUCCESS);

    // Zero-initialised so a failed query cannot leave an indeterminate value
    // to be compared against 64.
    size_t wgSize = 0;
    ciErrNum = clGetKernelWorkGroupInfo(ckKernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &wgSize, NULL);
    smallBlock = (wgSize == 64);

    // NOTE: the program will get deleted when the kernel is also released
    clReleaseProgram(rv_program);

    return ckKernel;
}
|
||||
@@ -1,34 +0,0 @@
|
||||
/*
|
||||
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||
* with this source code for terms and conditions that govern your use of
|
||||
* this software. Any use, reproduction, disclosure, or distribution of
|
||||
* this software and related documentation outside the terms of the EULA
|
||||
* is strictly prohibited.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef __REDUCTION_H__
|
||||
#define __REDUCTION_H__
|
||||
|
||||
template <class T>
|
||||
void reduce_sm10(int size, int threads, int blocks,
|
||||
int whichKernel, T *d_idata, T *d_odata);
|
||||
|
||||
template <class T>
|
||||
void reduce_sm13(int size, int threads, int blocks,
|
||||
int whichKernel, T *d_idata, T *d_odata);
|
||||
|
||||
// CL objects
|
||||
cl_platform_id cpPlatform;
|
||||
cl_uint uiNumDevices;
|
||||
cl_device_id* cdDevices;
|
||||
cl_context cxGPUContext;
|
||||
cl_command_queue cqCommandQueue;
|
||||
cl_device_id device;
|
||||
cl_int ciErrNum;
|
||||
const char* source_path;
|
||||
bool smallBlock = true;
|
||||
|
||||
#endif
|
||||
@@ -1,273 +0,0 @@
|
||||
/*
|
||||
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||
* with this source code for terms and conditions that govern your use of
|
||||
* this software. Any use, reproduction, disclosure, or distribution of
|
||||
* this software and related documentation outside the terms of the EULA
|
||||
* is strictly prohibited.
|
||||
*
|
||||
*/
|
||||
|
||||
/*
|
||||
Parallel reduction kernels
|
||||
*/
|
||||
|
||||
// The following defines are set during runtime compilation, see reduction.cpp
|
||||
// #define T float
|
||||
// #define blockSize 128
|
||||
// #define nIsPow2 1
|
||||
|
||||
#ifndef _REDUCE_KERNEL_H_
|
||||
#define _REDUCE_KERNEL_H_
|
||||
|
||||
/*
|
||||
Parallel sum reduction using shared memory
|
||||
- takes log(n) steps for n input elements
|
||||
- uses n threads
|
||||
- only works for power-of-2 arrays
|
||||
*/
|
||||
|
||||
/* This reduction interleaves which threads are active by using the modulo
|
||||
operator. This operator is very expensive on GPUs, and the interleaved
|
||||
inactivity means that no whole warps are active, which is also very
|
||||
inefficient */
|
||||
__kernel void reduce0(__global T *g_idata, __global T *g_odata, unsigned int n, __local T* sdata)
{
    const unsigned int lid = get_local_id(0);
    const unsigned int gid = get_global_id(0);

    // Stage one input element per work-item in local memory; work-items past
    // the end of the input contribute 0.
    if (gid < n)
        sdata[lid] = g_idata[gid];
    else
        sdata[lid] = 0;

    barrier(CLK_LOCAL_MEM_FENCE);

    // Interleaved tree: at each step the work-items whose id is a multiple of
    // twice the stride add in the element one stride away.
    for (unsigned int stride = 1; stride < get_local_size(0); stride *= 2) {
        const unsigned int span = 2 * stride;
        // modulo arithmetic is slow!
        if ((lid % span) == 0) {
            sdata[lid] += sdata[lid + stride];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    // Work-item 0 publishes this group's sum.
    if (lid == 0)
        g_odata[get_group_id(0)] = sdata[0];
}
|
||||
|
||||
|
||||
/* This version uses contiguous threads, but its interleaved
|
||||
addressing results in many shared memory bank conflicts. */
|
||||
__kernel void reduce1(__global T *g_idata, __global T *g_odata, unsigned int n, __local T* sdata)
{
    const unsigned int lid = get_local_id(0);
    const unsigned int gid = get_global_id(0);

    // Stage one input element per work-item in local memory (0 past the end).
    if (gid < n)
        sdata[lid] = g_idata[gid];
    else
        sdata[lid] = 0;

    barrier(CLK_LOCAL_MEM_FENCE);

    // Work-item k handles slot 2*stride*k, so the active work-items are the
    // lowest-numbered ones at every step.
    for (unsigned int stride = 1; stride < get_local_size(0); stride *= 2)
    {
        int slot = 2 * stride * lid;

        if (slot < get_local_size(0))
        {
            sdata[slot] += sdata[slot + stride];
        }

        barrier(CLK_LOCAL_MEM_FENCE);
    }

    // Work-item 0 publishes this group's sum.
    if (lid == 0)
        g_odata[get_group_id(0)] = sdata[0];
}
|
||||
|
||||
/*
|
||||
This version uses sequential addressing -- no divergence or bank conflicts.
|
||||
*/
|
||||
__kernel void reduce2(__global T *g_idata, __global T *g_odata, unsigned int n, __local T* sdata)
{
    const unsigned int lid = get_local_id(0);
    const unsigned int gid = get_global_id(0);

    // Stage one input element per work-item in local memory (0 past the end).
    if (gid < n)
        sdata[lid] = g_idata[gid];
    else
        sdata[lid] = 0;

    barrier(CLK_LOCAL_MEM_FENCE);

    // Sequential addressing: the lower half adds in the upper half, and the
    // active range is halved at every step.
    unsigned int half = get_local_size(0) / 2;
    while (half > 0)
    {
        if (lid < half)
        {
            sdata[lid] += sdata[lid + half];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
        half >>= 1;
    }

    // Work-item 0 publishes this group's sum.
    if (lid == 0)
        g_odata[get_group_id(0)] = sdata[0];
}
|
||||
|
||||
/*
|
||||
This version uses n/2 threads --
|
||||
it performs the first level of reduction when reading from global memory
|
||||
*/
|
||||
__kernel void reduce3(__global T *g_idata, __global T *g_odata, unsigned int n, __local T* sdata)
{
    const unsigned int lid = get_local_id(0);
    // Each group covers two group-widths of input; this is the index of the
    // first of this work-item's two elements.
    const unsigned int first = get_group_id(0)*(get_local_size(0)*2) + get_local_id(0);

    // First reduction level while loading: sum up to two global elements
    // into one local slot.
    T partial = (first < n) ? g_idata[first] : 0;
    if (first + get_local_size(0) < n)
        partial += g_idata[first+get_local_size(0)];
    sdata[lid] = partial;

    barrier(CLK_LOCAL_MEM_FENCE);

    // Sequential-addressing tree over local memory.
    unsigned int half = get_local_size(0) / 2;
    while (half > 0)
    {
        if (lid < half)
        {
            sdata[lid] += sdata[lid + half];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
        half >>= 1;
    }

    // Work-item 0 publishes this group's sum.
    if (lid == 0)
        g_odata[get_group_id(0)] = sdata[0];
}
|
||||
|
||||
/*
|
||||
This version unrolls the last warp to avoid synchronization where it
|
||||
isn't needed
|
||||
*/
|
||||
__kernel void reduce4(__global T *g_idata, __global T *g_odata, unsigned int n, __local volatile T* sdata)
{
    // perform first level of reduction,
    // reading from global memory, writing to shared memory
    unsigned int tid = get_local_id(0);
    unsigned int i = get_group_id(0)*(get_local_size(0)*2) + get_local_id(0);

    sdata[tid] = (i < n) ? g_idata[i] : 0;
    if (i + get_local_size(0) < n)
        sdata[tid] += g_idata[i+get_local_size(0)];

    barrier(CLK_LOCAL_MEM_FENCE);

    // do reduction in shared mem
    #pragma unroll 1
    for(unsigned int s=get_local_size(0)/2; s>32; s>>=1)
    {
        if (tid < s)
        {
            sdata[tid] += sdata[tid + s];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    // Last six steps, unrolled. The earlier form ran them with no barrier
    // inside "if (tid < 32)", which is only correct if those 32 work-items
    // execute in lock-step; OpenCL gives no such guarantee. Each step now
    // limits itself to the work-items that produce a needed value and is
    // followed by a barrier. The outer tests are on the compile-time
    // blockSize, so every work-item reaches the same barriers. Limiting each
    // step to tid < offset also keeps tid + offset inside the work-group's
    // slots for small groups.
    if (blockSize >= 64) { if (tid < 32) { sdata[tid] += sdata[tid + 32]; } barrier(CLK_LOCAL_MEM_FENCE); }
    if (blockSize >= 32) { if (tid < 16) { sdata[tid] += sdata[tid + 16]; } barrier(CLK_LOCAL_MEM_FENCE); }
    if (blockSize >= 16) { if (tid <  8) { sdata[tid] += sdata[tid +  8]; } barrier(CLK_LOCAL_MEM_FENCE); }
    if (blockSize >=  8) { if (tid <  4) { sdata[tid] += sdata[tid +  4]; } barrier(CLK_LOCAL_MEM_FENCE); }
    if (blockSize >=  4) { if (tid <  2) { sdata[tid] += sdata[tid +  2]; } barrier(CLK_LOCAL_MEM_FENCE); }
    if (blockSize >=  2) { if (tid <  1) { sdata[tid] += sdata[tid +  1]; } barrier(CLK_LOCAL_MEM_FENCE); }

    // write result for this block to global mem
    if (tid == 0) g_odata[get_group_id(0)] = sdata[0];
}
|
||||
|
||||
/*
|
||||
This version is completely unrolled. It uses a template parameter to achieve
|
||||
optimal code for any (power of 2) number of threads. This requires a switch
|
||||
statement in the host code to handle all the different thread block sizes at
|
||||
compile time.
|
||||
*/
|
||||
__kernel void reduce5(__global T *g_idata, __global T *g_odata, unsigned int n, __local volatile T* sdata)
{
    // perform first level of reduction,
    // reading from global memory, writing to shared memory
    unsigned int tid = get_local_id(0);
    unsigned int i = get_group_id(0)*(get_local_size(0)*2) + get_local_id(0);

    sdata[tid] = (i < n) ? g_idata[i] : 0;
    if (i + blockSize < n)
        sdata[tid] += g_idata[i+blockSize];

    barrier(CLK_LOCAL_MEM_FENCE);

    // do reduction in shared mem
    // Every step is guarded by the compile-time blockSize, so all work-items
    // reach the same barriers. The steps below 128 previously ran with no
    // barrier inside "if (tid < 32)", relying on lock-step execution that
    // OpenCL does not guarantee; they now follow the same pattern as the
    // larger steps, which also keeps tid + offset inside the work-group's
    // slots for small groups.
    if (blockSize >= 512) { if (tid < 256) { sdata[tid] += sdata[tid + 256]; } barrier(CLK_LOCAL_MEM_FENCE); }
    if (blockSize >= 256) { if (tid < 128) { sdata[tid] += sdata[tid + 128]; } barrier(CLK_LOCAL_MEM_FENCE); }
    if (blockSize >= 128) { if (tid <  64) { sdata[tid] += sdata[tid +  64]; } barrier(CLK_LOCAL_MEM_FENCE); }
    if (blockSize >=  64) { if (tid <  32) { sdata[tid] += sdata[tid +  32]; } barrier(CLK_LOCAL_MEM_FENCE); }
    if (blockSize >=  32) { if (tid <  16) { sdata[tid] += sdata[tid +  16]; } barrier(CLK_LOCAL_MEM_FENCE); }
    if (blockSize >=  16) { if (tid <   8) { sdata[tid] += sdata[tid +   8]; } barrier(CLK_LOCAL_MEM_FENCE); }
    if (blockSize >=   8) { if (tid <   4) { sdata[tid] += sdata[tid +   4]; } barrier(CLK_LOCAL_MEM_FENCE); }
    if (blockSize >=   4) { if (tid <   2) { sdata[tid] += sdata[tid +   2]; } barrier(CLK_LOCAL_MEM_FENCE); }
    if (blockSize >=   2) { if (tid <   1) { sdata[tid] += sdata[tid +   1]; } barrier(CLK_LOCAL_MEM_FENCE); }

    // write result for this block to global mem
    if (tid == 0) g_odata[get_group_id(0)] = sdata[0];
}
|
||||
|
||||
/*
|
||||
This version adds multiple elements per thread sequentially. This reduces the overall
|
||||
cost of the algorithm while keeping the work complexity O(n) and the step complexity O(log n).
|
||||
(Brent's Theorem optimization)
|
||||
*/
|
||||
__kernel void reduce6(__global T *g_idata, __global T *g_odata, unsigned int n, __local volatile T* sdata)
{
    // perform first level of reduction,
    // reading from global memory, writing to shared memory
    unsigned int tid = get_local_id(0);
    unsigned int i = get_group_id(0)*(get_local_size(0)*2) + get_local_id(0);
    unsigned int gridSize = blockSize*2*get_num_groups(0);
    sdata[tid] = 0;

    // we reduce multiple elements per thread. The number is determined by the
    // number of active thread blocks (via gridDim). More blocks will result
    // in a larger gridSize and therefore fewer elements per thread
    while (i < n)
    {
        sdata[tid] += g_idata[i];
        // The second element is always bounds-checked. Skipping the check
        // when nIsPow2 is set reads g_idata[1] for n == 1 (a power of two
        // with blockSize 1), which is past the end of the input.
        if (i + blockSize < n)
            sdata[tid] += g_idata[i+blockSize];
        i += gridSize;
    }

    barrier(CLK_LOCAL_MEM_FENCE);

    // do reduction in shared mem
    // Every step is guarded by the compile-time blockSize, so all work-items
    // reach the same barriers. The steps below 128 previously ran with no
    // barrier inside "if (tid < 32)", relying on lock-step execution that
    // OpenCL does not guarantee; they now follow the same pattern as the
    // larger steps, which also keeps tid + offset inside the work-group's
    // slots for small groups.
    if (blockSize >= 512) { if (tid < 256) { sdata[tid] += sdata[tid + 256]; } barrier(CLK_LOCAL_MEM_FENCE); }
    if (blockSize >= 256) { if (tid < 128) { sdata[tid] += sdata[tid + 128]; } barrier(CLK_LOCAL_MEM_FENCE); }
    if (blockSize >= 128) { if (tid <  64) { sdata[tid] += sdata[tid +  64]; } barrier(CLK_LOCAL_MEM_FENCE); }
    if (blockSize >=  64) { if (tid <  32) { sdata[tid] += sdata[tid +  32]; } barrier(CLK_LOCAL_MEM_FENCE); }
    if (blockSize >=  32) { if (tid <  16) { sdata[tid] += sdata[tid +  16]; } barrier(CLK_LOCAL_MEM_FENCE); }
    if (blockSize >=  16) { if (tid <   8) { sdata[tid] += sdata[tid +   8]; } barrier(CLK_LOCAL_MEM_FENCE); }
    if (blockSize >=   8) { if (tid <   4) { sdata[tid] += sdata[tid +   4]; } barrier(CLK_LOCAL_MEM_FENCE); }
    if (blockSize >=   4) { if (tid <   2) { sdata[tid] += sdata[tid +   2]; } barrier(CLK_LOCAL_MEM_FENCE); }
    if (blockSize >=   2) { if (tid <   1) { sdata[tid] += sdata[tid +   1]; } barrier(CLK_LOCAL_MEM_FENCE); }

    // write result for this block to global mem
    if (tid == 0) g_odata[get_group_id(0)] = sdata[0];
}
|
||||
|
||||
#endif // #ifndef _REDUCE_KERNEL_H_
|
||||
@@ -1,198 +0,0 @@
|
||||
/*
|
||||
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||
* with this source code for terms and conditions that govern your use of
|
||||
* this software. Any use, reproduction, disclosure, or distribution of
|
||||
* this software and related documentation outside the terms of the EULA
|
||||
* is strictly prohibited.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef OCL_UTILS_H
|
||||
#define OCL_UTILS_H
|
||||
|
||||
// *********************************************************************
|
||||
// Utilities specific to OpenCL samples in NVIDIA GPU Computing SDK
|
||||
// *********************************************************************
|
||||
|
||||
// Common headers:  Cross-API utilities and OpenCL header
|
||||
#include <shrUtils.h>
|
||||
|
||||
// All OpenCL headers
|
||||
#if defined (__APPLE__) || defined(MACOSX)
|
||||
#include <OpenCL/opencl.h>
|
||||
#else
|
||||
#include <CL/opencl.h>
|
||||
#endif
|
||||
|
||||
// Includes
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
// For systems with CL_EXT that are not updated with these extensions, we copied these
|
||||
// extensions from <CL/cl_ext.h>
|
||||
#ifndef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV
|
||||
/* cl_nv_device_attribute_query extension - no extension #define since it has no functions */
|
||||
#define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV 0x4000
|
||||
#define CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV 0x4001
|
||||
#define CL_DEVICE_REGISTERS_PER_BLOCK_NV 0x4002
|
||||
#define CL_DEVICE_WARP_SIZE_NV 0x4003
|
||||
#define CL_DEVICE_GPU_OVERLAP_NV 0x4004
|
||||
#define CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV 0x4005
|
||||
#define CL_DEVICE_INTEGRATED_MEMORY_NV 0x4006
|
||||
#endif
|
||||
|
||||
// reminders for build output window and log
|
||||
#ifdef _WIN32
|
||||
#pragma message ("Note: including shrUtils.h")
|
||||
#pragma message ("Note: including opencl.h")
|
||||
#endif
|
||||
|
||||
// SDK Revision #
|
||||
#define OCL_SDKREVISION "7027912"
|
||||
|
||||
// Error and Exit Handling Macros...
|
||||
// *********************************************************************
|
||||
// Full error handling macro with Cleanup() callback (if supplied)...
|
||||
// (Companion Inline Function lower on page)
|
||||
#define oclCheckErrorEX(a, b, c) __oclCheckErrorEX(a, b, c, __FILE__ , __LINE__)
|
||||
|
||||
// Short version without Cleanup() callback pointer
|
||||
// Both Input (a) and Reference (b) are specified as args
|
||||
#define oclCheckError(a, b) oclCheckErrorEX(a, b, 0)
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Gets the platform ID for NVIDIA if available, otherwise default to platform 0
|
||||
//!
|
||||
//! @return the id
|
||||
//! @param clSelectedPlatformID OpenCL platform ID
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" cl_int oclGetPlatformID(cl_platform_id* clSelectedPlatformID);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Print info about the device
|
||||
//!
|
||||
//! @param iLogMode enum LOGBOTH, LOGCONSOLE, LOGFILE
|
||||
//! @param device OpenCL id of the device
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" void oclPrintDevInfo(int iLogMode, cl_device_id device);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Get and return device capability
|
||||
//!
|
||||
//! @return the 2 digit integer representation of device Cap (major minor). return -1 if NA
|
||||
//! @param device OpenCL id of the device
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" int oclGetDevCap(cl_device_id device);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Print the device name
|
||||
//!
|
||||
//! @param iLogMode enum LOGBOTH, LOGCONSOLE, LOGFILE
|
||||
//! @param device OpenCL id of the device
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" void oclPrintDevName(int iLogMode, cl_device_id device);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Gets the id of the first device from the context
|
||||
//!
|
||||
//! @return the id
|
||||
//! @param cxGPUContext OpenCL context
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" cl_device_id oclGetFirstDev(cl_context cxGPUContext);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Gets the id of the nth device from the context
|
||||
//!
|
||||
//! @return the id or -1 when out of range
|
||||
//! @param cxGPUContext OpenCL context
|
||||
//! @param device_idx index of the device of interest
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" cl_device_id oclGetDev(cl_context cxGPUContext, unsigned int device_idx);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Gets the id of device with maximal FLOPS from the context
|
||||
//!
|
||||
//! @return the id
|
||||
//! @param cxGPUContext OpenCL context
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" cl_device_id oclGetMaxFlopsDev(cl_context cxGPUContext);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Loads a Program file and prepends the cPreamble to the code.
|
||||
//!
|
||||
//! @return the source string if succeeded, 0 otherwise
|
||||
//! @param cFilename program filename
|
||||
//! @param cPreamble code that is prepended to the loaded file, typically a set of #defines or a header
|
||||
//! @param szFinalLength returned length of the code string
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" char* oclLoadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Get the binary (PTX) of the program associated with the device
|
||||
//!
|
||||
//! @param cpProgram OpenCL program
|
||||
//! @param cdDevice device of interest
|
||||
//! @param binary returned code
|
||||
//! @param length length of returned code
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" void oclGetProgBinary( cl_program cpProgram, cl_device_id cdDevice, char** binary, size_t* length);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Get and log the binary (PTX) from the OpenCL compiler for the requested program & device
|
||||
//!
|
||||
//! @param cpProgram OpenCL program
|
||||
//! @param cdDevice device of interest
|
||||
//! @param cPtxFileName   optional PTX file name
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" void oclLogPtx(cl_program cpProgram, cl_device_id cdDevice, const char* cPtxFileName);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Get and log the Build Log from the OpenCL compiler for the requested program & device
|
||||
//!
|
||||
//! @param cpProgram OpenCL program
|
||||
//! @param cdDevice device of interest
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" void oclLogBuildInfo(cl_program cpProgram, cl_device_id cdDevice);
|
||||
|
||||
// Helper function for De-allocating cl objects
|
||||
// *********************************************************************
|
||||
extern "C" void oclDeleteMemObjs(cl_mem* cmMemObjs, int iNumObjs);
|
||||
|
||||
// Helper function to get OpenCL error string from constant
|
||||
// *********************************************************************
|
||||
extern "C" const char* oclErrorString(cl_int error);
|
||||
|
||||
// Helper function to get OpenCL image format string (channel order and type) from constant
|
||||
// *********************************************************************
|
||||
extern "C" const char* oclImageFormatString(cl_uint uiImageFormat);
|
||||
|
||||
// companion inline function for error checking and exit on error WITH Cleanup Callback (if supplied)
|
||||
// *********************************************************************
|
||||
inline void __oclCheckErrorEX(cl_int iSample, cl_int iReference, void (*pCleanup)(int), const char* cFile, const int iLine)
|
||||
{
|
||||
// An error condition is defined by the sample/test value not equal to the reference
|
||||
if (iReference != iSample)
|
||||
{
|
||||
// If the sample/test value isn't equal to the ref, it's an error by defnition, so override 0 sample/test value
|
||||
iSample = (iSample == 0) ? -9999 : iSample;
|
||||
|
||||
// Log the error info
|
||||
shrLog("\n !!! Error # %i (%s) at line %i , in file %s !!!\n\n", iSample, oclErrorString(iSample), iLine, cFile);
|
||||
|
||||
// Cleanup and exit, or just exit if no cleanup function pointer provided. Use iSample (error code in this case) as process exit code.
|
||||
if (pCleanup != NULL)
|
||||
{
|
||||
pCleanup(iSample);
|
||||
}
|
||||
else
|
||||
{
|
||||
shrLogEx(LOGBOTH | CLOSELOG, 0, "Exiting...\n");
|
||||
exit(iSample);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -1,238 +0,0 @@
|
||||
/*
|
||||
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||
* with this source code for terms and conditions that govern your use of
|
||||
* this software. Any use, reproduction, disclosure, or distribution of
|
||||
* this software and related documentation outside the terms of the EULA
|
||||
* is strictly prohibited.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef SHR_QATEST_H
|
||||
#define SHR_QATEST_H
|
||||
|
||||
// *********************************************************************
|
||||
// Generic utilities for NVIDIA GPU Computing SDK
|
||||
// *********************************************************************
|
||||
|
||||
// OS dependent includes
|
||||
#ifdef _WIN32
|
||||
#pragma message ("Note: including windows.h")
|
||||
#pragma message ("Note: including math.h")
|
||||
#pragma message ("Note: including assert.h")
|
||||
#pragma message ("Note: including time.h")
|
||||
|
||||
// Headers needed for Windows
|
||||
#include <windows.h>
|
||||
#include <time.h>
|
||||
#else
|
||||
// Headers needed for Linux
|
||||
#include <sys/stat.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/time.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <stdarg.h>
|
||||
#include <unistd.h>
|
||||
#include <time.h>
|
||||
#endif
|
||||
|
||||
#ifndef STRCASECMP
|
||||
#ifdef _WIN32
|
||||
#define STRCASECMP _stricmp
|
||||
#else
|
||||
#define STRCASECMP strcasecmp
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef STRNCASECMP
|
||||
#ifdef _WIN32
|
||||
#define STRNCASECMP _strnicmp
|
||||
#else
|
||||
#define STRNCASECMP strncasecmp
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
// Standardized QA Start/Finish for CUDA SDK tests
|
||||
#define shrQAStart(a, b) __shrQAStart(a, b)
|
||||
#define shrQAFinish(a, b, c) __shrQAFinish(a, b, c)
|
||||
#define shrQAFinish2(a, b, c, d) __shrQAFinish2(a, b, c, d)
|
||||
|
||||
// Returns the index in exec_name of the first character after the last
// path separator ('\\' or '/'), or 0 when the string has no separator.
// The result is the offset at which the bare executable name begins.
inline int findExeNameStart(const char *exec_name)
{
    // Scan backwards, starting on the terminating NUL
    for (int pos = (int)strlen(exec_name); ; --pos)
    {
        const char ch = exec_name[pos];

        // The name starts right after the last separator
        if (ch == '\\' || ch == '/')
        {
            return pos + 1;
        }

        // Reached the front without meeting a separator
        if (pos <= 0)
        {
            return pos;
        }
    }
}
|
||||
|
||||
inline int __shrQAStart(int argc, char **argv)
|
||||
{
|
||||
bool bQATest = false;
|
||||
// First clear the output buffer
|
||||
fflush(stdout);
|
||||
fflush(stdout);
|
||||
|
||||
for (int i=1; i < argc; i++) {
|
||||
int string_start = 0;
|
||||
while (argv[i][string_start] == '-')
|
||||
string_start++;
|
||||
char *string_argv = &argv[i][string_start];
|
||||
|
||||
if (!STRCASECMP(string_argv, "qatest")) {
|
||||
bQATest = true;
|
||||
}
|
||||
}
|
||||
|
||||
// We don't want to print the entire path, so we search for the first
|
||||
int exename_start = findExeNameStart(argv[0]);
|
||||
if (bQATest) {
|
||||
fprintf(stdout, "&&&& RUNNING %s", &(argv[0][exename_start]));
|
||||
for (int i=1; i < argc; i++) fprintf(stdout, " %s", argv[i]);
|
||||
fprintf(stdout, "\n");
|
||||
} else {
|
||||
fprintf(stdout, "[%s] starting...\n", &(argv[0][exename_start]));
|
||||
}
|
||||
fflush(stdout);
|
||||
printf("\n"); fflush(stdout);
|
||||
return exename_start;
|
||||
}
|
||||
|
||||
// Result codes accepted by the shrQAFinish helpers. The enumerators take
// the consecutive values 0, 1 and 2.
enum eQAstatus {
    QA_FAILED,  // 0
    QA_PASSED,  // 1
    QA_WAIVED   // 2
};
|
||||
|
||||
// Prints a countdown ("<n>...") roughly once per second until `seconds`
// seconds of wall-clock time have elapsed, then prints "done!".
// Despite its name the function only waits; it does not exit the process.
//
//   seconds  length of the wait; a value <= 0 returns without sleeping
inline void __ExitInTime(int seconds)
{
    fprintf(stdout, "> exiting in %d seconds: ", seconds);
    fflush(stdout);
    time_t t;
    int count;
    for (t=time(0)+seconds, count=seconds; time(0) < t; count--) {
        fprintf(stdout, "%d...", count);
        // flush each tick: the line has no newline yet, so without this the
        // whole countdown would only appear once the wait is over
        fflush(stdout);
// accept both spellings: the includes in this header are guarded by _WIN32
#if defined(_WIN32) || defined(WIN32)
        Sleep(1000);
#else
        sleep(1);
#endif
    }
    fprintf(stdout,"done!\n\n");
    fflush(stdout);
}
|
||||
|
||||
|
||||
inline void __shrQAFinish(int argc, const char **argv, int iStatus)
|
||||
{
|
||||
// By default QATest is disabled and NoPrompt is Enabled (times out at seconds passed into __ExitInTime() )
|
||||
bool bQATest = false, bNoPrompt = true, bQuitInTime = true;
|
||||
const char *sStatus[] = { "FAILED", "PASSED", "WAIVED", NULL };
|
||||
|
||||
for (int i=1; i < argc; i++) {
|
||||
int string_start = 0;
|
||||
while (argv[i][string_start] == '-')
|
||||
string_start++;
|
||||
|
||||
const char *string_argv = &argv[i][string_start];
|
||||
if (!STRCASECMP(string_argv, "qatest")) {
|
||||
bQATest = true;
|
||||
}
|
||||
// For SDK individual samples that don't specify -noprompt or -prompt,
|
||||
// a 3 second delay will happen before exiting, giving a user time to view results
|
||||
if (!STRCASECMP(string_argv, "noprompt") || !STRCASECMP(string_argv, "help")) {
|
||||
bNoPrompt = true;
|
||||
bQuitInTime = false;
|
||||
}
|
||||
if (!STRCASECMP(string_argv, "prompt")) {
|
||||
bNoPrompt = false;
|
||||
bQuitInTime = false;
|
||||
}
|
||||
}
|
||||
|
||||
int exename_start = findExeNameStart(argv[0]);
|
||||
if (bQATest) {
|
||||
fprintf(stdout, "&&&& %s %s", sStatus[iStatus], &(argv[0][exename_start]));
|
||||
for (int i=1; i < argc; i++) fprintf(stdout, " %s", argv[i]);
|
||||
fprintf(stdout, "\n");
|
||||
} else {
|
||||
fprintf(stdout, "[%s] test results...\n%s\n", &(argv[0][exename_start]), sStatus[iStatus]);
|
||||
}
|
||||
fflush(stdout);
|
||||
printf("\n"); fflush(stdout);
|
||||
if (bQuitInTime) {
|
||||
__ExitInTime(3);
|
||||
} else {
|
||||
if (!bNoPrompt) {
|
||||
fprintf(stdout, "\nPress <Enter> to exit...\n");
|
||||
fflush(stdout);
|
||||
getchar();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
inline void __shrQAFinish2(bool bQATest, int argc, const char **argv, int iStatus)
|
||||
{
|
||||
bool bQuitInTime = true;
|
||||
const char *sStatus[] = { "FAILED", "PASSED", "WAIVED", NULL };
|
||||
|
||||
for (int i=1; i < argc; i++) {
|
||||
int string_start = 0;
|
||||
while (argv[i][string_start] == '-')
|
||||
string_start++;
|
||||
|
||||
const char *string_argv = &argv[i][string_start];
|
||||
// For SDK individual samples that don't specify -noprompt or -prompt,
|
||||
// a 3 second delay will happen before exiting, giving a user time to view results
|
||||
if (!STRCASECMP(string_argv, "noprompt") || !STRCASECMP(string_argv, "help")) {
|
||||
bQuitInTime = false;
|
||||
}
|
||||
if (!STRCASECMP(string_argv, "prompt")) {
|
||||
bQuitInTime = false;
|
||||
}
|
||||
}
|
||||
|
||||
int exename_start = findExeNameStart(argv[0]);
|
||||
if (bQATest) {
|
||||
fprintf(stdout, "&&&& %s %s", sStatus[iStatus], &(argv[0][exename_start]));
|
||||
for (int i=1; i < argc; i++) fprintf(stdout, " %s", argv[i]);
|
||||
fprintf(stdout, "\n");
|
||||
} else {
|
||||
fprintf(stdout, "[%s] test results...\n%s\n", &(argv[0][exename_start]), sStatus[iStatus]);
|
||||
}
|
||||
fflush(stdout);
|
||||
|
||||
if (bQuitInTime) {
|
||||
__ExitInTime(3);
|
||||
}
|
||||
}
|
||||
|
||||
inline void shrQAFinishExit(int argc, const char **argv, int iStatus)
|
||||
{
|
||||
__shrQAFinish(argc, argv, iStatus);
|
||||
|
||||
exit(iStatus ? EXIT_SUCCESS : EXIT_FAILURE);
|
||||
}
|
||||
|
||||
inline void shrQAFinishExit2(bool bQAtest, int argc, const char **argv, int iStatus)
|
||||
{
|
||||
__shrQAFinish2(bQAtest, argc, argv, iStatus);
|
||||
|
||||
exit(iStatus ? EXIT_SUCCESS : EXIT_FAILURE);
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -1,642 +0,0 @@
|
||||
/*
|
||||
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||
* with this source code for terms and conditions that govern your use of
|
||||
* this software. Any use, reproduction, disclosure, or distribution of
|
||||
* this software and related documentation outside the terms of the EULA
|
||||
* is strictly prohibited.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef SHR_UTILS_H
|
||||
#define SHR_UTILS_H
|
||||
|
||||
// *********************************************************************
|
||||
// Generic utilities for NVIDIA GPU Computing SDK
|
||||
// *********************************************************************
|
||||
|
||||
// reminders for output window and build log
|
||||
#ifdef _WIN32
|
||||
#pragma message ("Note: including windows.h")
|
||||
#pragma message ("Note: including math.h")
|
||||
#pragma message ("Note: including assert.h")
|
||||
#endif
|
||||
|
||||
// OS dependent includes
|
||||
#ifdef _WIN32
|
||||
// Headers needed for Windows
|
||||
#include <windows.h>
|
||||
#else
|
||||
// Headers needed for Linux
|
||||
#include <sys/stat.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/time.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <stdarg.h>
|
||||
#endif
|
||||
|
||||
// Other headers needed for both Windows and Linux
|
||||
#include <math.h>
|
||||
#include <assert.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
// Un-comment the following #define to enable profiling code in SDK apps
|
||||
//#define GPU_PROFILING
|
||||
|
||||
// Beginning of GPU Architecture definitions
|
||||
// Looks up the number of cores per SM for a given SM version.
//
//   major, minor  SM version, e.g. (2, 1) for SM 2.1
//
// Returns the core count for a listed version. For any other version a
// diagnostic is printed and -1 is returned.
inline int ConvertSMVer2Cores(int major, int minor)
{
    // One row per known SM version
    typedef struct {
        int SM;     // 0xMm (hexadecimal notation), M = SM Major version, and m = SM minor version
        int Cores;  // number of cores per SM
    } sSMtoCores;

    const sSMtoCores coresPerSM[] =
    {
        { 0x10,   8 }, // Tesla Generation  (SM 1.0) G80 class
        { 0x11,   8 }, // Tesla Generation  (SM 1.1) G8x class
        { 0x12,   8 }, // Tesla Generation  (SM 1.2) G9x class
        { 0x13,   8 }, // Tesla Generation  (SM 1.3) GT200 class
        { 0x20,  32 }, // Fermi Generation  (SM 2.0) GF100 class
        { 0x21,  48 }, // Fermi Generation  (SM 2.1) GF10x class
        { 0x30, 192 }  // Kepler Generation (SM 3.0) GK10x class
    };
    const int entryCount = (int)(sizeof(coresPerSM) / sizeof(coresPerSM[0]));

    // Pack the version the same way the table keys are written
    const int wanted = (major << 4) + minor;

    for (int k = 0; k < entryCount; ++k) {
        if (coresPerSM[k].SM == wanted) {
            return coresPerSM[k].Cores;
        }
    }

    printf("MapSMtoCores SM %d.%d is undefined (please update to the latest SDK)!\n", major, minor);
    return -1;
}
|
||||
// end of GPU Architecture definitions
|
||||
|
||||
|
||||
// Defines and enum for use with logging functions
|
||||
// *********************************************************************
|
||||
#define DEFAULTLOGFILE "SdkConsoleLog.txt"
|
||||
#define MASTERLOGFILE "SdkMasterLog.csv"
|
||||
// Bit flags for the logging functions; combine them with bitwise OR.
enum LOGMODES
{
    LOGCONSOLE = 1 << 0,                // bit to signal "log to console"
    LOGFILE    = 1 << 1,                // bit to signal "log to file"
    LOGBOTH    = LOGCONSOLE | LOGFILE,  // convenience union of first 2 bits to signal "log to both"
    APPENDMODE = 1 << 2,                // bit to set "file append" mode instead of "replace mode" on open
    MASTER     = 1 << 3,                // bit to signal master .csv log output
    ERRORMSG   = 1 << 4,                // bit to signal "pre-pend Error"
    CLOSELOG   = 1 << 5                 // bit to close log file, if open, after any requested file write
};
|
||||
#define HDASHLINE "-----------------------------------------------------------\n"
|
||||
|
||||
// Standardized boolean
|
||||
// Boolean result type used by the shr* helpers: shrFALSE is 0, shrTRUE is 1.
enum shrBOOL
{
    shrFALSE,   // 0
    shrTRUE     // 1
};
|
||||
|
||||
// Standardized MAX, MIN and CLAMP
// Every argument is parenthesized so that an expression argument (for
// example "x & mask") is compared as a whole rather than being split by
// operator precedence. Arguments are still evaluated more than once, so do
// not pass expressions with side effects.
#define MAX(a, b) (((a) > (b)) ? (a) : (b))
#define MIN(a, b) (((a) < (b)) ? (a) : (b))
#define CLAMP(a, b, c) MIN(MAX(a, b), c)            // double sided clip of input a
#define TOPCLAMP(a, b) (((a) < (b)) ? (a) : (b))    // single top side clip of input a
|
||||
|
||||
// Error and Exit Handling Macros...
|
||||
// *********************************************************************
|
||||
// Full error handling macro with Cleanup() callback (if supplied)...
|
||||
// (Companion Inline Function lower on page)
|
||||
#define shrCheckErrorEX(a, b, c) __shrCheckErrorEX(a, b, c, __FILE__ , __LINE__)
|
||||
|
||||
// Short version without Cleanup() callback pointer
|
||||
// Both Input (a) and Reference (b) are specified as args
|
||||
#define shrCheckError(a, b) shrCheckErrorEX(a, b, 0)
|
||||
|
||||
// Standardized Exit Macro for leaving main()... extended version
|
||||
// (Companion Inline Function lower on page)
|
||||
#define shrExitEX(a, b, c) __shrExitEX(a, b, c)
|
||||
|
||||
// Standardized Exit Macro for leaving main()... short version
|
||||
// (Companion Inline Function lower on page)
|
||||
#define shrEXIT(a, b) __shrExitEX(a, b, EXIT_SUCCESS)
|
||||
|
||||
// Simple argument checker macro
|
||||
// Returns shrFALSE from the enclosing function unless (a) equals shrTRUE.
// Wrapped in do { } while (0) so the macro acts as one statement and cannot
// capture an "else" that follows it. Use as: ARGCHECK(expr);
#define ARGCHECK(a) do { if ((a) != shrTRUE) return shrFALSE; } while (0)
|
||||
|
||||
// Define for user-customized error handling
|
||||
#define STDERROR "file %s, line %i\n\n" , __FILE__ , __LINE__
|
||||
|
||||
// Function to deallocate memory allocated within shrUtils
|
||||
// *********************************************************************
|
||||
extern "C" void shrFree(void* ptr);
|
||||
|
||||
// *********************************************************************
|
||||
// Helper function to log standardized information to Console, to File or to both
|
||||
//! Examples: shrLogEx(LOGBOTH, 0, "Function A\n");
|
||||
//! : shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
|
||||
//!
|
||||
//! Automatically opens file and stores handle if needed and not done yet
|
||||
//! Closes file and nulls handle on request
|
||||
//!
|
||||
//! @param 0 iLogMode: LOGCONSOLE, LOGFILE, LOGBOTH, APPENDMODE, MASTER, ERRORMSG, CLOSELOG.
|
||||
//! LOGFILE and LOGBOTH may be | 'd with APPENDMODE to select file append mode instead of overwrite mode
|
||||
//! LOGFILE and LOGBOTH may be | 'd with CLOSELOG to "write and close"
|
||||
//! First 3 options may be | 'd with MASTER to enable independent write to master data log file
|
||||
//! First 3 options may be | 'd with ERRORMSG to start line with standard error message
|
||||
//! @param 2 dValue:
|
||||
//! Positive val = double value for time in secs to be formatted to 6 decimals.
|
||||
//!                          Negative val is an error code and this gives error preformatting.
|
||||
//! @param 3 cFormatString: String with formatting specifiers like printf or fprintf.
|
||||
//! ALL printf flags, width, precision and type specifiers are supported with this exception:
|
||||
//! Wide char type specifiers intended for wprintf (%S and %C) are NOT supported
|
||||
//! Single byte char type specifiers (%s and %c) ARE supported
|
||||
//! @param 4... variable args: like printf or fprintf.  Must match format specifier type above.
|
||||
//! @return 0 if OK, negative value on error or if error occurs or was passed in.
|
||||
// *********************************************************************
|
||||
extern "C" int shrLogEx(int iLogMode, int iErrNum, const char* cFormatString, ...);
|
||||
|
||||
// Short version of shrLogEx defaulting to shrLogEx(LOGBOTH, 0,
|
||||
// *********************************************************************
|
||||
extern "C" int shrLog(const char* cFormatString, ...);
|
||||
|
||||
// *********************************************************************
|
||||
// Delta timer function for up to 3 independent timers using host high performance counters
|
||||
// Maintains state for 3 independent counters
|
||||
//! Example: double dElapsedTime = shrDeltaT(0);
|
||||
//!
|
||||
//! @param 0 iCounterID: Which timer to check/reset. (0, 1, 2)
|
||||
//! @return delta time of specified counter since last call in seconds. Otherwise -9999.0 if error
|
||||
// *********************************************************************
|
||||
extern "C" double shrDeltaT(int iCounterID);
|
||||
|
||||
// Optional LogFileNameOverride function
|
||||
// *********************************************************************
|
||||
extern "C" void shrSetLogFileName (const char* cOverRideName);
|
||||
|
||||
// Helper function to init data arrays
|
||||
// *********************************************************************
|
||||
extern "C" void shrFillArray(float* pfData, int iSize);
|
||||
|
||||
// Helper function to print data arrays
|
||||
// *********************************************************************
|
||||
extern "C" void shrPrintArray(float* pfData, int iSize);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Find the path for a filename
|
||||
//! @return the path if succeeded, otherwise 0
|
||||
//! @param filename name of the file
|
||||
//! @param executablePath optional absolute path of the executable
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" char* shrFindFilePath(const char* filename, const char* executablePath);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Read file \filename containing single precision floating point data
|
||||
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the source file
|
||||
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||
//! the data read
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrReadFilef( const char* filename, float** data, unsigned int* len,
|
||||
bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Read file \filename containing double precision floating point data
|
||||
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the source file
|
||||
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||
//! the data read
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
//! @note If a NULL pointer is passed to this function and it is
|
||||
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrReadFiled( const char* filename, double** data, unsigned int* len,
|
||||
bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Read file \filename containing integer data
|
||||
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the source file
|
||||
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||
//! the data read
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
//! @note If a NULL pointer is passed to this function and it is
|
||||
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrReadFilei( const char* filename, int** data, unsigned int* len, bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Read file \filename containing unsigned integer data
|
||||
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the source file
|
||||
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||
//! the data read
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
//! @note If a NULL pointer is passed to this function and it is
|
||||
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrReadFileui( const char* filename, unsigned int** data,
|
||||
unsigned int* len, bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Read file \filename containing char / byte data
|
||||
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the source file
|
||||
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||
//! the data read
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrReadFileb( const char* filename, char** data, unsigned int* len,
|
||||
bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Read file \filename containing unsigned char / byte data
|
||||
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the source file
|
||||
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||
//! the data read
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrReadFileub( const char* filename, unsigned char** data,
|
||||
unsigned int* len, bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Write a data file \filename containing single precision floating point
|
||||
//! data
|
||||
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the file to write
|
||||
//! @param data pointer to data to write
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
//! @param epsilon epsilon for comparison
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrWriteFilef( const char* filename, const float* data, unsigned int len,
|
||||
const float epsilon, bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Write a data file \filename containing double precision floating point
|
||||
//! data
|
||||
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the file to write
|
||||
//! @param data pointer to data to write
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
//! @param epsilon epsilon for comparison
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrWriteFiled( const char* filename, const float* data, unsigned int len,
|
||||
const double epsilon, bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Write a data file \filename containing integer data
|
||||
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the file to write
|
||||
//! @param data pointer to data to write
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrWriteFilei( const char* filename, const int* data, unsigned int len,
|
||||
bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Write a data file \filename containing unsigned integer data
|
||||
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the file to write
|
||||
//! @param data pointer to data to write
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrWriteFileui( const char* filename, const unsigned int* data,
|
||||
unsigned int len, bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Write a data file \filename containing char / byte data
|
||||
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the file to write
|
||||
//! @param data pointer to data to write
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrWriteFileb( const char* filename, const char* data, unsigned int len,
|
||||
bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Write a data file \filename containing unsigned char / byte data
|
||||
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the file to write
|
||||
//! @param data pointer to data to write
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrWriteFileub( const char* filename, const unsigned char* data,
|
||||
unsigned int len, bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Load PPM image file (with unsigned char as data element type), padding
|
||||
//! 4th component
|
||||
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||
//! @param file name of the image file
|
||||
//! @param OutData handle to the data read
|
||||
//! @param w width of the image
|
||||
//! @param h height of the image
|
||||
//!
|
||||
//! Note: If *OutData is NULL this function allocates buffer that must be freed by caller
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrLoadPPM4ub(const char* file, unsigned char** OutData,
|
||||
unsigned int *w, unsigned int *h);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Save PPM image file (with unsigned char as data element type, padded to
|
||||
//! 4 bytes)
|
||||
//! @return shrTRUE if saving the file succeeded, otherwise shrFALSE
|
||||
//! @param file name of the image file
|
||||
//! @param data handle to the data read
|
||||
//! @param w width of the image
|
||||
//! @param h height of the image
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrSavePPM4ub( const char* file, unsigned char *data,
|
||||
unsigned int w, unsigned int h);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Save PGM image file (with unsigned char as data element type)
|
||||
//! @return shrTRUE if saving the file succeeded, otherwise shrFALSE
|
||||
//! @param file name of the image file
|
||||
//! @param data handle to the data read
|
||||
//! @param w width of the image
|
||||
//! @param h height of the image
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrSavePGMub( const char* file, unsigned char *data,
|
||||
unsigned int w, unsigned int h);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Load PGM image file (with unsigned char as data element type)
|
||||
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||
//! @param file name of the image file
|
||||
//! @param data handle to the data read
|
||||
//! @param w width of the image
|
||||
//! @param h height of the image
|
||||
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrLoadPGMub( const char* file, unsigned char** data,
|
||||
unsigned int *w,unsigned int *h);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
// Command line arguments: General notes
|
||||
// * All command line arguments begin with '--' followed by the token;
|
||||
// token and value are separated by '='; example --samples=50
|
||||
// * Arrays have the form --model=[one.obj,two.obj,three.obj]
|
||||
// (without whitespaces)
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Check if command line argument \a flag_name is given
|
||||
//! @return shrTRUE if command line argument \a flag_name has been given,
|
||||
//! otherwise shrFALSE
|
||||
//! @param argc argc as passed to main()
|
||||
//! @param argv argv as passed to main()
|
||||
//! @param flag_name name of command line flag
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrCheckCmdLineFlag( const int argc, const char** argv,
|
||||
const char* flag_name);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Get the value of a command line argument of type int
|
||||
//! @return shrTRUE if command line argument \a arg_name has been given and
|
||||
//! is of the requested type, otherwise shrFALSE
|
||||
//! @param argc argc as passed to main()
|
||||
//! @param argv argv as passed to main()
|
||||
//! @param arg_name name of the command line argument
|
||||
//! @param val value of the command line argument
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrGetCmdLineArgumenti( const int argc, const char** argv,
|
||||
const char* arg_name, int* val);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Get the value of a command line argument of type unsigned int
|
||||
//! @return shrTRUE if command line argument \a arg_name has been given and
|
||||
//! is of the requested type, otherwise shrFALSE
|
||||
//! @param argc argc as passed to main()
|
||||
//! @param argv argv as passed to main()
|
||||
//! @param arg_name name of the command line argument
|
||||
//! @param val value of the command line argument
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrGetCmdLineArgumentu( const int argc, const char** argv,
|
||||
const char* arg_name, unsigned int* val);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Get the value of a command line argument of type float
|
||||
//! @return shrTRUE if command line argument \a arg_name has been given and
|
||||
//! is of the requested type, otherwise shrFALSE
|
||||
//! @param argc argc as passed to main()
|
||||
//! @param argv argv as passed to main()
|
||||
//! @param arg_name name of the command line argument
|
||||
//! @param val value of the command line argument
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrGetCmdLineArgumentf( const int argc, const char** argv,
|
||||
const char* arg_name, float* val);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Get the value of a command line argument of type string
|
||||
//! @return shrTRUE if command line argument \a arg_name has been given and
|
||||
//! is of the requested type, otherwise shrFALSE
|
||||
//! @param argc argc as passed to main()
|
||||
//! @param argv argv as passed to main()
|
||||
//! @param arg_name name of the command line argument
|
||||
//! @param val value of the command line argument
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrGetCmdLineArgumentstr( const int argc, const char** argv,
|
||||
const char* arg_name, char** val);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Get the value of a command line argument list whose elements are strings
|
||||
//! @return shrTRUE if command line argument \a arg_name has been given and
|
||||
//! is of the requested type, otherwise shrFALSE
|
||||
//! @param argc argc as passed to main()
|
||||
//! @param argv argv as passed to main()
|
||||
//! @param arg_name name of the command line argument
|
||||
//! @param val command line argument list
|
||||
//! @param len length of the list / number of elements
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrGetCmdLineArgumentListstr( const int argc, const char** argv,
|
||||
const char* arg_name, char** val,
|
||||
unsigned int* len);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two float arrays
|
||||
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrComparef( const float* reference, const float* data,
|
||||
const unsigned int len);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two integer arrays
|
||||
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrComparei( const int* reference, const int* data,
|
||||
const unsigned int len );
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two unsigned integer arrays, with epsilon and threshold
|
||||
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
//! @param threshold tolerance % # of comparison errors (0.15f = 15%)
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrCompareuit( const unsigned int* reference, const unsigned int* data,
|
||||
const unsigned int len, const float epsilon, const float threshold );
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two unsigned char arrays
|
||||
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrCompareub( const unsigned char* reference, const unsigned char* data,
|
||||
const unsigned int len );
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two unsigned char arrays with a tolerance for # of byte errors
|
||||
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
//! @param epsilon epsilon to use for the comparison
|
||||
//! @param threshold tolerance % # of comparison errors (0.15f = 15%)
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrCompareubt( const unsigned char* reference, const unsigned char* data,
|
||||
const unsigned int len, const float epsilon, const float threshold );
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two unsigned char arrays with an epsilon tolerance for equality
|
||||
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
//! @param epsilon epsilon to use for the comparison
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrCompareube( const unsigned char* reference, const unsigned char* data,
|
||||
const unsigned int len, const float epsilon );
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two float arrays with an epsilon tolerance for equality
|
||||
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
//! @param epsilon epsilon to use for the comparison
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrComparefe( const float* reference, const float* data,
|
||||
const unsigned int len, const float epsilon );
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two float arrays with an epsilon tolerance for equality and a
|
||||
//! threshold for # pixel errors
|
||||
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
//! @param epsilon epsilon to use for the comparison
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrComparefet( const float* reference, const float* data,
|
||||
const unsigned int len, const float epsilon, const float threshold );
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two float arrays using L2-norm with an epsilon tolerance for
|
||||
//! equality
|
||||
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
//! @param epsilon epsilon to use for the comparison
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrCompareL2fe( const float* reference, const float* data,
|
||||
const unsigned int len, const float epsilon );
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two PPM image files with an epsilon tolerance for equality
|
||||
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||
//! @param src_file filename for the image to be compared
|
||||
//! @param ref_file filename for the reference data / gold image
|
||||
//! @param epsilon epsilon to use for the comparison
|
||||
//! @param threshold threshold of pixels that can still mismatch to pass (i.e. 0.15f = 15% must pass)
|
||||
//! $param verboseErrors output details of image mismatch to std::err
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrComparePPM( const char *src_file, const char *ref_file, const float epsilon, const float threshold);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two PGM image files with an epsilon tolerance for equality
|
||||
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||
//! @param src_file filename for the image to be compared
|
||||
//! @param ref_file filename for the reference data / gold image
|
||||
//! @param epsilon epsilon to use for the comparison
|
||||
//! @param threshold threshold of pixels that can still mismatch to pass (i.e. 0.15f = 15% must pass)
|
||||
//! $param verboseErrors output details of image mismatch to std::err
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrComparePGM( const char *src_file, const char *ref_file, const float epsilon, const float threshold);
|
||||
|
||||
extern "C" unsigned char* shrLoadRawFile(const char* filename, size_t size);
|
||||
|
||||
extern "C" size_t shrRoundUp(int group_size, int global_size);
|
||||
|
||||
// companion inline function for error checking and exit on error WITH Cleanup Callback (if supplied)
|
||||
// *********************************************************************
|
||||
inline void __shrCheckErrorEX(int iSample, int iReference, void (*pCleanup)(int), const char* cFile, const int iLine)
|
||||
{
|
||||
if (iReference != iSample)
|
||||
{
|
||||
shrLogEx(LOGBOTH | ERRORMSG, iSample, "line %i , in file %s !!!\n\n" , iLine, cFile);
|
||||
if (pCleanup != NULL)
|
||||
{
|
||||
pCleanup(EXIT_FAILURE);
|
||||
}
|
||||
else
|
||||
{
|
||||
shrLogEx(LOGBOTH | CLOSELOG, 0, "Exiting...\n");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Standardized Exit
|
||||
// *********************************************************************
|
||||
inline void __shrExitEX(int argc, const char** argv, int iExitCode)
|
||||
{
|
||||
#ifdef WIN32
|
||||
if (!shrCheckCmdLineFlag(argc, argv, "noprompt") && !shrCheckCmdLineFlag(argc, argv, "qatest"))
|
||||
#else
|
||||
if (shrCheckCmdLineFlag(argc, argv, "prompt") && !shrCheckCmdLineFlag(argc, argv, "qatest"))
|
||||
#endif
|
||||
{
|
||||
shrLogEx(LOGBOTH | CLOSELOG, 0, "\nPress <Enter> to Quit...\n");
|
||||
getchar();
|
||||
}
|
||||
else
|
||||
{
|
||||
shrLogEx(LOGBOTH | CLOSELOG, 0, "%s Exiting...\n", argv[0]);
|
||||
}
|
||||
fflush(stderr);
|
||||
exit(iExitCode);
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -1 +0,0 @@
|
||||
Inputs: reference.bin frame.bin
|
||||
@@ -1,69 +0,0 @@
|
||||
# Build script for the "sad" OpenCL sample: compiles kernel.cl into a static
# library with poclcc, links it with the host sources and runtime sources into
# an ELF, and derives .hex / .dump files from it.

# Delete a target whose recipe failed (e.g. a truncated .dump from a failed
# objdump redirect) so it is not mistaken for an up-to-date file.
.DELETE_ON_ERROR:

# These names are commands, not files; never let a same-named file mask them.
.PHONY: all run qemu gdb-s gdb-c clean

XLEN ?= 32

# Tool and library locations; each can be overridden from the environment or
# the command line.
RISCV_TOOLCHAIN_PATH ?= $(wildcard ../../../../riscv-gnu-toolchain/drops)
POCL_CC_PATH ?= $(wildcard ../../../../pocl/drops_riscv_cc)
POCL_INC_PATH ?= $(wildcard ../include)
POCL_LIB_PATH ?= $(wildcard ../lib)
VORTEX_RT_PATH ?= $(wildcard ../../../runtime)
VX_SIMX_PATH ?= $(wildcard ../../../simx/obj_dir)

CC = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gcc
CXX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-g++
DMP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objdump
HEX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objcopy
GDB = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gdb

# Runtime sources compiled into the ELF together with the application sources.
VX_SRCS = $(VORTEX_RT_PATH)/newlib/newlib.c
VX_SRCS += $(VORTEX_RT_PATH)/startup/vx_start.S
VX_SRCS += $(VORTEX_RT_PATH)/intrinsics/vx_intrinsics.S
VX_SRCS += $(VORTEX_RT_PATH)/io/vx_io.S $(VORTEX_RT_PATH)/io/vx_io.c
VX_SRCS += $(VORTEX_RT_PATH)/fileio/fileio.S
VX_SRCS += $(VORTEX_RT_PATH)/tests/tests.c
VX_SRCS += $(VORTEX_RT_PATH)/vx_api/vx_api.c

# Static link against the linker script selected by XLEN.
VX_CFLAGS = -nostartfiles -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/startup/vx_link$(XLEN).ld

CXXFLAGS = -g -O0 -march=rv32im -mabi=ilp32
# program may not begin at main()
CXXFLAGS += -ffreestanding
# enable garbage collection of unused input sections
CXXFLAGS += -Wl,--gc-sections
# disable RTTI and exceptions
CXXFLAGS += -fno-rtti -fno-non-call-exceptions
CXXFLAGS += -I$(POCL_INC_PATH) -I.

# Recursive (=) assignment is required here: PROJECT is defined further down.
VX_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
QEMU_LIBS = $(VORTEX_RT_PATH)/qemu/vx_api.c -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a

PROJECT = sad

SRCS = main.cc args.c parboil_opencl.c ocl.c gpu_info.c file.c image.c OpenCL_common.cpp

all: $(PROJECT).dump $(PROJECT).hex

# Compile the OpenCL kernel into a static library with poclcc.
lib$(PROJECT).a: kernel.cl
	POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOLCHAIN_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o $@ $<

$(PROJECT).elf: $(SRCS) lib$(PROJECT).a
	$(CXX) $(CXXFLAGS) $(VX_CFLAGS) $(VX_SRCS) $(SRCS) $(VX_LIBS) -o $@

$(PROJECT).qemu: $(SRCS) lib$(PROJECT).a
	$(CXX) $(CXXFLAGS) $(SRCS) $(QEMU_LIBS) -o $@

$(PROJECT).hex: $(PROJECT).elf
	$(HEX) -O ihex $< $@

$(PROJECT).dump: $(PROJECT).elf
	$(DMP) -D $< > $@

# Run the hex image in the simulator; stdout is captured in emulator.debug.
run: $(PROJECT).hex
	POCL_DEBUG=all $(VX_SIMX_PATH)/Vcache_simX -E $< -s -b 1> emulator.debug

qemu: $(PROJECT).qemu
	POCL_DEBUG=all $(RISCV_TOOLCHAIN_PATH)/bin/qemu-riscv32 -d in_asm -D debug.log $<

# gdb-s starts qemu with -g 1234; gdb-c launches gdb on the same binary.
gdb-s: $(PROJECT).qemu
	POCL_DEBUG=all $(RISCV_TOOLCHAIN_PATH)/bin/qemu-riscv32 -g 1234 -d in_asm -D debug.log $<

gdb-c: $(PROJECT).qemu
	$(GDB) $<

clean:
	rm -rf *.o *.elf *.dump *.hex *.qemu *.log *.debug
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user