regression restructure

This commit is contained in:
Richard Yan
2025-01-29 18:28:09 -08:00
parent 3de51577ef
commit 5ba132e87b
101 changed files with 1052 additions and 7795 deletions

View File

@@ -34,7 +34,7 @@ DP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objdump
CP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objcopy
CFLAGS += -O3 -mcmodel=medany -fno-exceptions -nostartfiles -fdata-sections -ffunction-sections
CFLAGS += -I./include -I../hw
CFLAGS += -I./include
CFLAGS += -DXLEN_$(XLEN)
PROJECT = libvortexrt

685
kernel/include/VX_config.h Normal file
View File

@@ -0,0 +1,685 @@
// auto-generated by gen_config.py. DO NOT EDIT
// Generated at 2024-05-07 13:55:58.398687
// Translated from ./rtl/VX_config.vh:
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef VX_CONFIG_VH
#define VX_CONFIG_VH
// Generic arithmetic helper macros. Each definition is skipped when the
// including translation unit already provides a macro with the same name.
// Arguments are expanded more than once, so pass expressions without side
// effects.

// Smaller of x and y.
#if !defined(MIN)
#define MIN(x, y) (((x) < (y)) ? (x) : (y))
#endif

// Larger of x and y.
#if !defined(MAX)
#define MAX(x, y) (((x) > (y)) ? (x) : (y))
#endif

// x limited to the closed interval [lo, hi].
#if !defined(CLAMP)
#define CLAMP(x, lo, hi) (((x) > (hi)) ? (hi) : (((x) < (lo)) ? (lo) : (x)))
#endif

// x, or 1 when x is zero (keeps derived counts/sizes from collapsing to 0).
#if !defined(UP)
#define UP(x) (((x) != 0) ? (x) : 1)
#endif
///////////////////////////////////////////////////////////////////////////////
// M (integer multiply/divide) extension: on unless EXT_M_DISABLE is defined.
#if !defined(EXT_M_DISABLE)
#define EXT_M_ENABLE
#endif

// F (single-precision floating-point) extension: on unless EXT_F_DISABLE is
// defined.
#if !defined(EXT_F_DISABLE)
#define EXT_F_ENABLE
#endif

// Integer register width. When the build selects neither XLEN_32 nor XLEN_64,
// fall back to 32 bits.
#if !defined(XLEN_32) && !defined(XLEN_64)
#define XLEN_32
#endif

#if defined(XLEN_64)
#define XLEN 64
#endif

#if defined(XLEN_32)
#define XLEN 32
#endif

// Floating-point register width: 64 bits when the D extension is enabled,
// 32 bits otherwise.
#if defined(EXT_D_ENABLE)
#define FLEN_64
#else
#define FLEN_32
#endif

#if defined(FLEN_64)
#define FLEN 64
#endif

#if defined(FLEN_32)
#define FLEN 32
#endif

// FPU_RV64F marks the combination of 64-bit integer registers with 32-bit
// floating-point registers.
#if defined(XLEN_64) && defined(FLEN_32)
#define FPU_RV64F
#endif
// Default machine topology. Every value can be overridden by defining the
// macro before this header is included (e.g. with -DNUM_CORES=4).

// Number of clusters
#if !defined(NUM_CLUSTERS)
#define NUM_CLUSTERS 1
#endif

// Number of cores
#if !defined(NUM_CORES)
#define NUM_CORES 8
#endif

// Number of warps
#if !defined(NUM_WARPS)
#define NUM_WARPS 8
#endif

// Number of threads
#if !defined(NUM_THREADS)
#define NUM_THREADS 8
#endif

// Number of barriers
#if !defined(NUM_BARRIERS)
#define NUM_BARRIERS 8
#endif

// Cores per socket: at most 4, fewer when NUM_CORES is smaller.
#if !defined(SOCKET_SIZE)
#define SOCKET_SIZE MIN(4, NUM_CORES)
#endif

// Number of sockets; UP() keeps the result at 1 or more.
#define NUM_SOCKETS UP(NUM_CORES / SOCKET_SIZE)
// Numeric (0/1) mirrors of the L2_ENABLE / L3_ENABLE switches so they can be
// used inside expressions.
#if defined(L2_ENABLE)
#define L2_ENABLED 1
#else
#define L2_ENABLED 0
#endif

#if defined(L3_ENABLE)
#define L3_ENABLED 1
#else
#define L3_ENABLED 0
#endif

// L1_DISABLE switches off both first-level caches.
#if defined(L1_DISABLE)
#define ICACHE_DISABLE
#define DCACHE_DISABLE
#endif

// Memory block size
#if !defined(MEM_BLOCK_SIZE)
#define MEM_BLOCK_SIZE 64
#endif

// Memory address width: 48 for XLEN_64 builds, 32 otherwise.
#if !defined(MEM_ADDR_WIDTH)
#if defined(XLEN_64)
#define MEM_ADDR_WIDTH 48
#else
#define MEM_ADDR_WIDTH 32
#endif
#endif

// L1 line size: equals MEM_BLOCK_SIZE when neither L2 nor L3 is enabled;
// otherwise 4 with L1 disabled and 16 with L1 enabled.
#if !defined(L1_LINE_SIZE)
#if defined(L1_DISABLE)
#define L1_LINE_SIZE ((L2_ENABLED || L3_ENABLED) ? 4 : MEM_BLOCK_SIZE)
#else
#define L1_LINE_SIZE ((L2_ENABLED || L3_ENABLED) ? 16 : MEM_BLOCK_SIZE)
#endif
#endif

// L2 line size: MEM_BLOCK_SIZE when L2 is enabled, otherwise the L1 line size.
#if defined(L2_ENABLE)
#define L2_LINE_SIZE MEM_BLOCK_SIZE
#else
#define L2_LINE_SIZE L1_LINE_SIZE
#endif

// L3 line size: MEM_BLOCK_SIZE when L3 is enabled, otherwise the L2 line size.
#if defined(L3_ENABLE)
#define L3_LINE_SIZE MEM_BLOCK_SIZE
#else
#define L3_LINE_SIZE L2_LINE_SIZE
#endif
// Address map defaults. The startup and stack base addresses depend on the
// register width; all values may be overridden before inclusion.

// Startup address
#if !defined(STARTUP_ADDR)
#if defined(XLEN_64)
#define STARTUP_ADDR 0x180000000
#else
#define STARTUP_ADDR 0x80000000
#endif
#endif

// Stack base address
#if !defined(STACK_BASE_ADDR)
#if defined(XLEN_64)
#define STACK_BASE_ADDR 0x1FF000000
#else
#define STACK_BASE_ADDR 0xFF000000
#endif
#endif

// Shared memory base address (defaults to the stack base address)
#if !defined(SMEM_BASE_ADDR)
#define SMEM_BASE_ADDR STACK_BASE_ADDR
#endif

// log2 of the shared memory size
#if !defined(SMEM_LOG_SIZE)
#define SMEM_LOG_SIZE 19
#endif

// I/O region: begins immediately after the shared memory window
#if !defined(IO_BASE_ADDR)
#define IO_BASE_ADDR (SMEM_BASE_ADDR + (1 << SMEM_LOG_SIZE))
#endif

// Console output region: first block of the I/O region
#if !defined(IO_COUT_ADDR)
#define IO_COUT_ADDR IO_BASE_ADDR
#endif
#define IO_COUT_SIZE MEM_BLOCK_SIZE

// CSR I/O region: follows the console output block
#if !defined(IO_CSR_ADDR)
#define IO_CSR_ADDR (IO_COUT_ADDR + IO_COUT_SIZE)
#endif
#define IO_CSR_SIZE (4 * 64 * NUM_CORES * NUM_CLUSTERS)

// Stack size, expressed as a power of two
#if !defined(STACK_LOG2_SIZE)
#define STACK_LOG2_SIZE 13
#endif
#define STACK_SIZE (1 << STACK_LOG2_SIZE)

// Reset delay
#define RESET_DELAY 8
// Stall timeout.
// The Verilog source defines this as
//   100000 * (1 ** (L2_ENABLED + L3_ENABLED))
// where `**` is the Verilog power operator. C/C++ has no such operator: the
// text parses as `1 * *(...)`, a dereference of an integer, and fails to
// compile as soon as the macro is expanded. Since 1 raised to any power is 1,
// the Verilog expression always evaluates to 100000, which is what is used
// here.
// NOTE(review): this header is generated by gen_config.py; the translation of
// `**` should also be fixed in the generator so the change is not lost.
#ifndef STALL_TIMEOUT
#define STALL_TIMEOUT 100000
#endif
// DPI is treated as disabled unless SV_DPI is defined.
#if !defined(SV_DPI)
#define DPI_DISABLE
#endif

// FPU backend selection. When the build has not already picked one of
// FPU_FPNEW / FPU_DSP / FPU_DPI, choose FPU_DPI for non-synthesis builds with
// DPI available, and FPU_DSP in every other case.
#if !defined(FPU_FPNEW) && !defined(FPU_DSP) && !defined(FPU_DPI)
#if !defined(SYNTHESIS) && !defined(DPI_DISABLE)
#define FPU_DPI
#else
#define FPU_DSP
#endif
#endif

// DPI-based integer multiply/divide: only for non-synthesis builds with DPI
// available.
#if !defined(SYNTHESIS) && !defined(DPI_DISABLE)
#define IMUL_DPI
#define IDIV_DPI
#endif

// Debug level
#if !defined(DEBUG_LEVEL)
#define DEBUG_LEVEL 3
#endif
// Pipeline Configuration /////////////////////////////////////////////////////

// Issue width (defaults to the number of warps)
#if !defined(ISSUE_WIDTH)
#define ISSUE_WIDTH NUM_WARPS
#endif

// ALU units: lanes default to the thread count
#if !defined(NUM_ALU_LANES)
#define NUM_ALU_LANES NUM_THREADS
#endif
#if !defined(NUM_ALU_BLOCKS)
#define NUM_ALU_BLOCKS 4
#endif

// FPU units: lanes default to the thread count
#if !defined(NUM_FPU_LANES)
#define NUM_FPU_LANES NUM_THREADS
#endif
#if !defined(NUM_FPU_BLOCKS)
#define NUM_FPU_BLOCKS 2
#endif

// LSU units: lanes default to the thread count
#if !defined(NUM_LSU_LANES)
#define NUM_LSU_LANES NUM_THREADS
#endif

// SFU units: at most 4 lanes
#if !defined(NUM_SFU_LANES)
#define NUM_SFU_LANES MIN(NUM_THREADS, 4)
#endif

// Instruction buffer size
#if !defined(IBUF_SIZE)
#define IBUF_SIZE (4 * ISSUE_WIDTH)
#endif

// LSU request queue size
#if !defined(LSUQ_SIZE)
#define LSUQ_SIZE (4 * NUM_WARPS * (NUM_THREADS / NUM_LSU_LANES))
#endif

// LSU duplicate address check: on unless LSU_DUP_DISABLE is defined;
// LSU_DUP_ENABLED is the 0/1 form usable in expressions.
#if !defined(LSU_DUP_DISABLE)
#define LSU_DUP_ENABLE
#endif
#if defined(LSU_DUP_ENABLE)
#define LSU_DUP_ENABLED 1
#else
#define LSU_DUP_ENABLED 0
#endif

// 0/1 form of the GBAR_ENABLE switch
#if defined(GBAR_ENABLE)
#define GBAR_ENABLED 1
#else
#define GBAR_ENABLED 0
#endif

// Integer multiplier latency: 4 for VIVADO, 3 for QUARTUS, 4 when neither
// toolchain macro is defined.
#if !defined(LATENCY_IMUL)
#if defined(VIVADO)
#define LATENCY_IMUL 4
#endif
#if defined(QUARTUS)
#define LATENCY_IMUL 3
#endif
#if !defined(LATENCY_IMUL)
#define LATENCY_IMUL 4
#endif
#endif
// Floating-Point Units ///////////////////////////////////////////////////////

// FPU request queue size
#if !defined(FPUQ_SIZE)
#define FPUQ_SIZE (2 * (NUM_THREADS / NUM_FPU_LANES))
#endif

// FNCP latency
#if !defined(LATENCY_FNCP)
#define LATENCY_FNCP 2
#endif

// The FMA / FDIV / FSQRT latencies below depend on the selected FPU backend
// (FPU_DPI, FPU_FPNEW or FPU_DSP); for FPU_DSP they additionally depend on the
// toolchain macro (QUARTUS or VIVADO), with a fallback when neither is set.

// FMA latency
#if !defined(LATENCY_FMA)
#if defined(FPU_DPI)
#define LATENCY_FMA 4
#endif
#if defined(FPU_FPNEW)
#define LATENCY_FMA 4
#endif
#if defined(FPU_DSP)
#if defined(QUARTUS)
#define LATENCY_FMA 4
#endif
#if defined(VIVADO)
#define LATENCY_FMA 16
#endif
#if !defined(LATENCY_FMA)
#define LATENCY_FMA 4
#endif
#endif
#endif

// FDIV latency
#if !defined(LATENCY_FDIV)
#if defined(FPU_DPI)
#define LATENCY_FDIV 15
#endif
#if defined(FPU_FPNEW)
#define LATENCY_FDIV 16
#endif
#if defined(FPU_DSP)
#if defined(QUARTUS)
#define LATENCY_FDIV 15
#endif
#if defined(VIVADO)
#define LATENCY_FDIV 28
#endif
#if !defined(LATENCY_FDIV)
#define LATENCY_FDIV 16
#endif
#endif
#endif

// FSQRT latency
#if !defined(LATENCY_FSQRT)
#if defined(FPU_DPI)
#define LATENCY_FSQRT 10
#endif
#if defined(FPU_FPNEW)
#define LATENCY_FSQRT 16
#endif
#if defined(FPU_DSP)
#if defined(QUARTUS)
#define LATENCY_FSQRT 10
#endif
#if defined(VIVADO)
#define LATENCY_FSQRT 28
#endif
#if !defined(LATENCY_FSQRT)
#define LATENCY_FSQRT 16
#endif
#endif
#endif

// FCVT latency
#if !defined(LATENCY_FCVT)
#define LATENCY_FCVT 5
#endif
// Icache Configurable Knobs //////////////////////////////////////////////////

// Cache enable: on unless ICACHE_DISABLE is defined. A disabled icache also
// forces the number of icache units to 0.
#if !defined(ICACHE_DISABLE)
#define ICACHE_ENABLE
#endif
#if defined(ICACHE_ENABLE)
#define ICACHE_ENABLED 1
#else
#define ICACHE_ENABLED 0
#define NUM_ICACHES 0
#endif

// Number of cache units
#if !defined(NUM_ICACHES)
#define NUM_ICACHES UP(SOCKET_SIZE / 4)
#endif

// Cache size
#if !defined(ICACHE_SIZE)
#define ICACHE_SIZE 16384
#endif

// Core response queue size
#if !defined(ICACHE_CRSQ_SIZE)
#define ICACHE_CRSQ_SIZE 2
#endif

// Miss handling register size
#if !defined(ICACHE_MSHR_SIZE)
#define ICACHE_MSHR_SIZE 16
#endif

// Memory request queue size
#if !defined(ICACHE_MREQ_SIZE)
#define ICACHE_MREQ_SIZE 4
#endif

// Memory response queue size
#if !defined(ICACHE_MRSQ_SIZE)
#define ICACHE_MRSQ_SIZE 0
#endif

// Number of associative ways
#if !defined(ICACHE_NUM_WAYS)
#define ICACHE_NUM_WAYS 1
#endif
// Dcache Configurable Knobs //////////////////////////////////////////////////

// Cache enable: on unless DCACHE_DISABLE is defined. A disabled dcache also
// forces 0 cache units and a single bank.
#if !defined(DCACHE_DISABLE)
#define DCACHE_ENABLE
#endif
#if defined(DCACHE_ENABLE)
#define DCACHE_ENABLED 1
#else
#define DCACHE_ENABLED 0
#define NUM_DCACHES 0
#define DCACHE_NUM_BANKS 1
#endif

// Number of cache units
#if !defined(NUM_DCACHES)
#define NUM_DCACHES UP(SOCKET_SIZE / 4)
#endif

// Cache size
#if !defined(DCACHE_SIZE)
#define DCACHE_SIZE 16384
#endif

// Number of banks (defaults to the number of LSU lanes)
#if !defined(DCACHE_NUM_BANKS)
#define DCACHE_NUM_BANKS NUM_LSU_LANES
#endif

// Core response queue size
#if !defined(DCACHE_CRSQ_SIZE)
#define DCACHE_CRSQ_SIZE 2
#endif

// Miss handling register size
#if !defined(DCACHE_MSHR_SIZE)
#define DCACHE_MSHR_SIZE 8
#endif

// Memory request queue size
#if !defined(DCACHE_MREQ_SIZE)
#define DCACHE_MREQ_SIZE 4
#endif

// Memory response queue size
#if !defined(DCACHE_MRSQ_SIZE)
#define DCACHE_MRSQ_SIZE 0
#endif

// Number of associative ways
#if !defined(DCACHE_NUM_WAYS)
#define DCACHE_NUM_WAYS 1
#endif
// SM Configurable Knobs //////////////////////////////////////////////////////

// Shared memory enable: on unless SM_DISABLE is defined. When it is disabled
// the bank count is pinned to 1.
#if !defined(SM_DISABLE)
#define SM_ENABLE
#endif
#if defined(SM_ENABLE)
#define SM_ENABLED 1
#else
#define SM_ENABLED 0
#define SMEM_NUM_BANKS 1
#endif

// Number of banks (defaults to the number of LSU lanes)
#if !defined(SMEM_NUM_BANKS)
#define SMEM_NUM_BANKS (NUM_LSU_LANES)
#endif
// L2cache Configurable Knobs /////////////////////////////////////////////////

// Cache size: 2 MiB (2097152) when ALTERA_S10 is defined, 1 MiB (1048576)
// otherwise.
#if !defined(L2_CACHE_SIZE)
#if defined(ALTERA_S10)
#define L2_CACHE_SIZE 2097152
#else
#define L2_CACHE_SIZE 1048576
#endif
#endif

// Number of banks: at most 4, fewer when there are fewer sockets
#if !defined(L2_NUM_BANKS)
#define L2_NUM_BANKS MIN(4, NUM_SOCKETS)
#endif

// Core response queue size
#if !defined(L2_CRSQ_SIZE)
#define L2_CRSQ_SIZE 2
#endif

// Miss handling register size
#if !defined(L2_MSHR_SIZE)
#define L2_MSHR_SIZE 16
#endif

// Memory request queue size
#if !defined(L2_MREQ_SIZE)
#define L2_MREQ_SIZE 4
#endif

// Memory response queue size
#if !defined(L2_MRSQ_SIZE)
#define L2_MRSQ_SIZE 0
#endif

// Number of associative ways
#if !defined(L2_NUM_WAYS)
#define L2_NUM_WAYS 2
#endif
// L3cache Configurable Knobs /////////////////////////////////////////////////

// Cache size: 2 MiB (2097152) when ALTERA_S10 is defined, 1 MiB (1048576)
// otherwise.
#if !defined(L3_CACHE_SIZE)
#if defined(ALTERA_S10)
#define L3_CACHE_SIZE 2097152
#else
#define L3_CACHE_SIZE 1048576
#endif
#endif

// Number of banks: at most 4, fewer when there are fewer clusters
#if !defined(L3_NUM_BANKS)
#define L3_NUM_BANKS MIN(4, NUM_CLUSTERS)
#endif

// Core response queue size
#if !defined(L3_CRSQ_SIZE)
#define L3_CRSQ_SIZE 2
#endif

// Miss handling register size
#if !defined(L3_MSHR_SIZE)
#define L3_MSHR_SIZE 16
#endif

// Memory request queue size
#if !defined(L3_MREQ_SIZE)
#define L3_MREQ_SIZE 4
#endif

// Memory response queue size
#if !defined(L3_MRSQ_SIZE)
#define L3_MRSQ_SIZE 0
#endif

// Number of associative ways
#if !defined(L3_NUM_WAYS)
#define L3_NUM_WAYS 4
#endif
// ISA Extensions /////////////////////////////////////////////////////////////

// Numeric (0/1) mirrors of the EXT_x_ENABLE switches so they can be shifted
// into the MISA_STD bit field below.
#ifdef EXT_A_ENABLE
#define EXT_A_ENABLED 1
#else
#define EXT_A_ENABLED 0
#endif
#ifdef EXT_C_ENABLE
#define EXT_C_ENABLED 1
#else
#define EXT_C_ENABLED 0
#endif
#ifdef EXT_D_ENABLE
#define EXT_D_ENABLED 1
#else
#define EXT_D_ENABLED 0
#endif
#ifdef EXT_F_ENABLE
#define EXT_F_ENABLED 1
#else
#define EXT_F_ENABLED 0
#endif
#ifdef EXT_M_ENABLE
#define EXT_M_ENABLED 1
#else
#define EXT_M_ENABLED 0
#endif

// Bit positions of standard extension letters ('A' = bit 0 ... 'Z' = bit 25)
#define ISA_STD_A 0
#define ISA_STD_C 2
#define ISA_STD_D 3
#define ISA_STD_E 4
#define ISA_STD_F 5
#define ISA_STD_H 7
#define ISA_STD_I 8
#define ISA_STD_N 13
#define ISA_STD_Q 16
#define ISA_STD_S 18
#define ISA_STD_U 20

// Bit positions of the non-standard feature flags packed into MISA_EXT
#define ISA_EXT_ICACHE 0
#define ISA_EXT_DCACHE 1
#define ISA_EXT_L2CACHE 2
#define ISA_EXT_L3CACHE 3
#define ISA_EXT_SMEM 4

// Bit mask of the enabled non-standard features.
// The whole expression is parenthesized so that the macro can be combined
// with other operators (e.g. `MISA_EXT & mask`, `MISA_EXT << n`) without the
// operator binding to the last OR term only.
#define MISA_EXT ((ICACHE_ENABLED << ISA_EXT_ICACHE) \
| (DCACHE_ENABLED << ISA_EXT_DCACHE) \
| (L2_ENABLED << ISA_EXT_L2CACHE) \
| (L3_ENABLED << ISA_EXT_L3CACHE) \
| (SM_ENABLED << ISA_EXT_SMEM))

// Bit mask of the implemented standard extensions, one bit per letter.
// Parenthesized as a whole for the same reason as MISA_EXT.
#define MISA_STD ((EXT_A_ENABLED << 0) /* A - Atomic Instructions extension */ \
| (0 << 1) /* B - Tentatively reserved for Bit operations extension */ \
| (EXT_C_ENABLED << 2) /* C - Compressed extension */ \
| (EXT_D_ENABLED << 3) /* D - Double precision floating-point extension */ \
| (0 << 4) /* E - RV32E base ISA */ \
| (EXT_F_ENABLED << 5) /* F - Single precision floating-point extension */ \
| (0 << 6) /* G - Additional standard extensions present */ \
| (0 << 7) /* H - Hypervisor mode implemented */ \
| (1 << 8) /* I - RV32I/64I/128I base ISA */ \
| (0 << 9) /* J - Reserved */ \
| (0 << 10) /* K - Reserved */ \
| (0 << 11) /* L - Tentatively reserved for Decimal Floating-Point extension */ \
| (EXT_M_ENABLED << 12) /* M - Integer Multiply/Divide extension */ \
| (0 << 13) /* N - User level interrupts supported */ \
| (0 << 14) /* O - Reserved */ \
| (0 << 15) /* P - Tentatively reserved for Packed-SIMD extension */ \
| (0 << 16) /* Q - Quad-precision floating-point extension */ \
| (0 << 17) /* R - Reserved */ \
| (0 << 18) /* S - Supervisor mode implemented */ \
| (0 << 19) /* T - Tentatively reserved for Transactional Memory extension */ \
| (1 << 20) /* U - User mode implemented */ \
| (0 << 21) /* V - Tentatively reserved for Vector extension */ \
| (0 << 22) /* W - Reserved */ \
| (1 << 23) /* X - Non-standard extensions present */ \
| (0 << 24) /* Y - Reserved */ \
| (0 << 25)) /* Z - Reserved */

// Device identification //////////////////////////////////////////////////////
#define VENDOR_ID 0
#define ARCHITECTURE_ID 0
#define IMPLEMENTATION_ID 0
#endif // VX_CONFIG_VH

193
kernel/include/VX_types.h Normal file
View File

@@ -0,0 +1,193 @@
// auto-generated by gen_config.py. DO NOT EDIT
// Generated at 2024-06-15 00:25:12.935689
// Translated from ./rtl/VX_types.vh:
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef VX_TYPES_VH
#define VX_TYPES_VH
// Device configuration registers
// Width, in bits, of CSR and DCR addresses.
#define VX_CSR_ADDR_BITS 12
#define VX_DCR_ADDR_BITS 12
// Base DCR address range: [STATE_BEGIN, STATE_END). STATE_END is one past the
// last register, so the range holds STARTUP_ADDR0, STARTUP_ADDR1 and MPM_CLASS.
#define VX_DCR_BASE_STATE_BEGIN 0x001
#define VX_DCR_BASE_STARTUP_ADDR0 0x001
#define VX_DCR_BASE_STARTUP_ADDR1 0x002
#define VX_DCR_BASE_MPM_CLASS 0x003
#define VX_DCR_BASE_STATE_END 0x004
// Converts a base DCR address into a zero-based index relative to STATE_BEGIN.
#define VX_DCR_BASE_STATE(addr) ((addr) - VX_DCR_BASE_STATE_BEGIN)
// Number of base DCRs in the range above.
#define VX_DCR_BASE_STATE_COUNT (VX_DCR_BASE_STATE_END-VX_DCR_BASE_STATE_BEGIN)
// Machine Performance-monitoring counters classes
// NOTE(review): presumably the values written to VX_DCR_BASE_MPM_CLASS to
// choose which counter set the VX_CSR_MPM_* addresses expose - confirm.
#define VX_DCR_MPM_CLASS_NONE 0
#define VX_DCR_MPM_CLASS_CORE 1
#define VX_DCR_MPM_CLASS_MEM 2
// User Floating-Point CSRs
#define VX_CSR_FFLAGS 0x001
#define VX_CSR_FRM 0x002
#define VX_CSR_FCSR 0x003
// Supervisor- and machine-level CSRs (addresses match the RISC-V privileged
// specification CSR map)
#define VX_CSR_SATP 0x180
#define VX_CSR_PMPCFG0 0x3A0
#define VX_CSR_PMPADDR0 0x3B0
#define VX_CSR_MSTATUS 0x300
#define VX_CSR_MISA 0x301
#define VX_CSR_MEDELEG 0x302
#define VX_CSR_MIDELEG 0x303
#define VX_CSR_MIE 0x304
#define VX_CSR_MTVEC 0x305
#define VX_CSR_MEPC 0x341
#define VX_CSR_MNSTATUS 0x744
// Machine performance-monitoring (MPM) counter address ranges. Each "_H"
// address is the matching base address + 0x80. MPM_USER marks the first
// counter after the three standard slots (0xB00-0xB02) defined below.
#define VX_CSR_MPM_BASE 0xB00
#define VX_CSR_MPM_BASE_H 0xB80
#define VX_CSR_MPM_USER 0xB03
#define VX_CSR_MPM_USER_H 0xB83
// Machine Performance-monitoring core counters
// Every counter has a second address with an "_H" suffix located 0x80 above
// the base address.
// NOTE(review): presumably the "_H" address reads the upper half of a 64-bit
// counter, as with the standard mcycleh/minstreth CSRs - confirm.
// PERF: Standard
#define VX_CSR_MCYCLE 0xB00
#define VX_CSR_MCYCLE_H 0xB80
#define VX_CSR_MPM_RESERVED 0xB01
#define VX_CSR_MPM_RESERVED_H 0xB81
#define VX_CSR_MINSTRET 0xB02
#define VX_CSR_MINSTRET_H 0xB82
// PERF: pipeline
// These start at VX_CSR_MPM_USER (0xB03); the memory-class counters further
// down in this file reuse the same addresses.
#define VX_CSR_MPM_SCHED_ID 0xB03
#define VX_CSR_MPM_SCHED_ID_H 0xB83
#define VX_CSR_MPM_SCHED_ST 0xB04
#define VX_CSR_MPM_SCHED_ST_H 0xB84
#define VX_CSR_MPM_IBUF_ST 0xB05
#define VX_CSR_MPM_IBUF_ST_H 0xB85
#define VX_CSR_MPM_SCRB_ST 0xB06
#define VX_CSR_MPM_SCRB_ST_H 0xB86
#define VX_CSR_MPM_SCRB_ALU 0xB07
#define VX_CSR_MPM_SCRB_ALU_H 0xB87
#define VX_CSR_MPM_SCRB_FPU 0xB08
#define VX_CSR_MPM_SCRB_FPU_H 0xB88
#define VX_CSR_MPM_SCRB_LSU 0xB09
#define VX_CSR_MPM_SCRB_LSU_H 0xB89
#define VX_CSR_MPM_SCRB_SFU 0xB0A
#define VX_CSR_MPM_SCRB_SFU_H 0xB8A
// PERF: memory
#define VX_CSR_MPM_IFETCHES 0xB0B
#define VX_CSR_MPM_IFETCHES_H 0xB8B
#define VX_CSR_MPM_LOADS 0xB0C
#define VX_CSR_MPM_LOADS_H 0xB8C
#define VX_CSR_MPM_STORES 0xB0D
#define VX_CSR_MPM_STORES_H 0xB8D
#define VX_CSR_MPM_IFETCH_LT 0xB0E
#define VX_CSR_MPM_IFETCH_LT_H 0xB8E
#define VX_CSR_MPM_LOAD_LT 0xB0F
#define VX_CSR_MPM_LOAD_LT_H 0xB8F
// SFU: scoreboard
#define VX_CSR_MPM_SCRB_WCTL 0xB10
#define VX_CSR_MPM_SCRB_WCTL_H 0xB90
#define VX_CSR_MPM_SCRB_CSRS 0xB11
#define VX_CSR_MPM_SCRB_CSRS_H 0xB91
// Machine Performance-monitoring memory counters
// These occupy the same CSR addresses (0xB03 upward) as the core-class
// counters, so one address names a different counter in each class.
// NOTE(review): presumably the active class is the one selected through
// VX_DCR_BASE_MPM_CLASS (VX_DCR_MPM_CLASS_MEM for this set) - confirm.
// Each "_H" address is the matching base address + 0x80.
// PERF: icache
#define VX_CSR_MPM_ICACHE_READS 0xB03 // total reads
#define VX_CSR_MPM_ICACHE_READS_H 0xB83
#define VX_CSR_MPM_ICACHE_MISS_R 0xB04 // read misses
#define VX_CSR_MPM_ICACHE_MISS_R_H 0xB84
#define VX_CSR_MPM_ICACHE_MSHR_ST 0xB05 // MSHR stalls
#define VX_CSR_MPM_ICACHE_MSHR_ST_H 0xB85
// PERF: dcache
#define VX_CSR_MPM_DCACHE_READS 0xB06 // total reads
#define VX_CSR_MPM_DCACHE_READS_H 0xB86
#define VX_CSR_MPM_DCACHE_WRITES 0xB07 // total writes
#define VX_CSR_MPM_DCACHE_WRITES_H 0xB87
#define VX_CSR_MPM_DCACHE_MISS_R 0xB08 // read misses
#define VX_CSR_MPM_DCACHE_MISS_R_H 0xB88
#define VX_CSR_MPM_DCACHE_MISS_W 0xB09 // write misses
#define VX_CSR_MPM_DCACHE_MISS_W_H 0xB89
#define VX_CSR_MPM_DCACHE_BANK_ST 0xB0A // bank conflicts
#define VX_CSR_MPM_DCACHE_BANK_ST_H 0xB8A
#define VX_CSR_MPM_DCACHE_MSHR_ST 0xB0B // MSHR stalls
#define VX_CSR_MPM_DCACHE_MSHR_ST_H 0xB8B
// PERF: l2cache
#define VX_CSR_MPM_L2CACHE_READS 0xB0C // total reads
#define VX_CSR_MPM_L2CACHE_READS_H 0xB8C
#define VX_CSR_MPM_L2CACHE_WRITES 0xB0D // total writes
#define VX_CSR_MPM_L2CACHE_WRITES_H 0xB8D
#define VX_CSR_MPM_L2CACHE_MISS_R 0xB0E // read misses
#define VX_CSR_MPM_L2CACHE_MISS_R_H 0xB8E
#define VX_CSR_MPM_L2CACHE_MISS_W 0xB0F // write misses
#define VX_CSR_MPM_L2CACHE_MISS_W_H 0xB8F
#define VX_CSR_MPM_L2CACHE_BANK_ST 0xB10 // bank conflicts
#define VX_CSR_MPM_L2CACHE_BANK_ST_H 0xB90
#define VX_CSR_MPM_L2CACHE_MSHR_ST 0xB11 // MSHR stalls
#define VX_CSR_MPM_L2CACHE_MSHR_ST_H 0xB91
// PERF: l3cache
#define VX_CSR_MPM_L3CACHE_READS 0xB12 // total reads
#define VX_CSR_MPM_L3CACHE_READS_H 0xB92
#define VX_CSR_MPM_L3CACHE_WRITES 0xB13 // total writes
#define VX_CSR_MPM_L3CACHE_WRITES_H 0xB93
#define VX_CSR_MPM_L3CACHE_MISS_R 0xB14 // read misses
#define VX_CSR_MPM_L3CACHE_MISS_R_H 0xB94
#define VX_CSR_MPM_L3CACHE_MISS_W 0xB15 // write misses
#define VX_CSR_MPM_L3CACHE_MISS_W_H 0xB95
#define VX_CSR_MPM_L3CACHE_BANK_ST 0xB16 // bank conflicts
#define VX_CSR_MPM_L3CACHE_BANK_ST_H 0xB96
#define VX_CSR_MPM_L3CACHE_MSHR_ST 0xB17 // MSHR stalls
#define VX_CSR_MPM_L3CACHE_MSHR_ST_H 0xB97
// PERF: memory
#define VX_CSR_MPM_MEM_READS 0xB18 // total reads
#define VX_CSR_MPM_MEM_READS_H 0xB98
#define VX_CSR_MPM_MEM_WRITES 0xB19 // total writes
#define VX_CSR_MPM_MEM_WRITES_H 0xB99
#define VX_CSR_MPM_MEM_LT 0xB1A // memory latency
#define VX_CSR_MPM_MEM_LT_H 0xB9A
// PERF: smem
#define VX_CSR_MPM_SMEM_READS 0xB1B // memory reads
#define VX_CSR_MPM_SMEM_READS_H 0xB9B
#define VX_CSR_MPM_SMEM_WRITES 0xB1C // memory writes
#define VX_CSR_MPM_SMEM_WRITES_H 0xB9C
#define VX_CSR_MPM_SMEM_BANK_ST 0xB1D // bank conflicts
#define VX_CSR_MPM_SMEM_BANK_ST_H 0xB9D
// Machine Information Registers
// (addresses match the standard RISC-V mvendorid/marchid/mimpid/mhartid CSRs)
#define VX_CSR_MVENDORID 0xF11
#define VX_CSR_MARCHID 0xF12
#define VX_CSR_MIMPID 0xF13
#define VX_CSR_MHARTID 0xF14
// GPGPU CSRs
// 0xCC0-0xCC4: identifiers and masks (thread, warp, core).
#define VX_CSR_THREAD_ID 0xCC0
#define VX_CSR_WARP_ID 0xCC1
#define VX_CSR_CORE_ID 0xCC2
#define VX_CSR_WARP_MASK 0xCC3
#define VX_CSR_THREAD_MASK 0xCC4 // warning! this value is also used in LLVM
// 0xFC0-0xFC2: thread, warp and core counts.
#define VX_CSR_NUM_THREADS 0xFC0
#define VX_CSR_NUM_WARPS 0xFC1
#define VX_CSR_NUM_CORES 0xFC2
#endif // VX_TYPES_VH

View File

@@ -1,89 +1,17 @@
all:
$(MAKE) -C basic
$(MAKE) -C demo
$(MAKE) -C dogfood
$(MAKE) -C mstress
$(MAKE) -C io_addr
$(MAKE) -C printf
$(MAKE) -C diverge
$(MAKE) -C sort
$(MAKE) -C fence
$(MAKE) -C no_mf_ext
$(MAKE) -C no_smem
$(MAKE) -C vecaddx
$(MAKE) -C sgemmx
# Find all subdirectories containing a Makefile.
# Only immediate children of this directory are considered (-mindepth 1
# -maxdepth 1), and `test -e {}/Makefile` keeps just those that can be built.
# `:=` evaluates the find once, when this Makefile is parsed.
SUBDIRS := $(shell find . -mindepth 1 -maxdepth 1 -type d -exec test -e {}/Makefile \; -print)
run-simx:
$(MAKE) -C basic run-simx
$(MAKE) -C demo run-simx
$(MAKE) -C dogfood run-simx
$(MAKE) -C mstress run-simx
$(MAKE) -C io_addr run-simx
$(MAKE) -C printf run-simx
$(MAKE) -C diverge run-simx
$(MAKE) -C sort run-simx
$(MAKE) -C fence run-simx
$(MAKE) -C no_mf_ext run-simx
$(MAKE) -C no_smem run-simx
$(MAKE) -C vecaddx run-simx
$(MAKE) -C sgemmx run-simx
.PHONY: all $(SUBDIRS) clean clean-all
run-rtlsim:
$(MAKE) -C basic run-rtlsim
$(MAKE) -C demo run-rtlsim
$(MAKE) -C dogfood run-rtlsim
$(MAKE) -C mstress run-rtlsim
$(MAKE) -C io_addr run-rtlsim
$(MAKE) -C printf run-rtlsim
$(MAKE) -C diverge run-rtlsim
$(MAKE) -C sort run-rtlsim
$(MAKE) -C fence run-rtlsim
$(MAKE) -C no_mf_ext run-rtlsim
$(MAKE) -C no_smem run-rtlsim
$(MAKE) -C vecaddx run-rtlsim
$(MAKE) -C sgemmx run-rtlsim
# Default target: run make in all subdirectories
all: $(SUBDIRS)
run-opae:
$(MAKE) -C basic run-opae
$(MAKE) -C demo run-opae
$(MAKE) -C dogfood run-opae
$(MAKE) -C mstress run-opae
$(MAKE) -C io_addr run-opae
$(MAKE) -C printf run-opae
$(MAKE) -C diverge run-opae
$(MAKE) -C sort run-opae
$(MAKE) -C fence run-opae
$(MAKE) -C no_mf_ext run-opae
$(MAKE) -C no_smem run-opae
$(MAKE) -C vecaddx run-opae
$(MAKE) -C sgemmx run-opae
# Build a single test directory: $@ expands to the directory name, so this
# runs that directory's default goal in a sub-make.
$(SUBDIRS):
$(MAKE) -C $@
# Clean target: run make clean in all subdirectories
clean:
$(MAKE) -C basic clean
$(MAKE) -C demo clean
$(MAKE) -C dogfood clean
$(MAKE) -C mstress clean
$(MAKE) -C io_addr clean
$(MAKE) -C printf clean
$(MAKE) -C diverge clean
$(MAKE) -C sort clean
$(MAKE) -C fence clean
$(MAKE) -C no_mf_ext clean
$(MAKE) -C no_smem clean
$(MAKE) -C vecaddx clean
$(MAKE) -C sgemmx clean
for dir in $(SUBDIRS); do $(MAKE) -C $$dir clean; done
clean-all:
$(MAKE) -C basic clean-all
$(MAKE) -C demo clean-all
$(MAKE) -C dogfood clean-all
$(MAKE) -C mstress clean-all
$(MAKE) -C io_addr clean-all
$(MAKE) -C printf clean-all
$(MAKE) -C diverge clean-all
$(MAKE) -C sort clean-all
$(MAKE) -C fence clean-all
$(MAKE) -C no_mf_ext clean-all
$(MAKE) -C no_smem clean-all
$(MAKE) -C vecaddx clean-all
$(MAKE) -C sgemmx clean-all
for dir in $(SUBDIRS); do $(MAKE) -C $$dir clean-all; done

View File

@@ -1,7 +1,5 @@
PROJECT = bad_apple
SRCS = main.cpp common.h
VX_SRCS = kernel.cpp
OPTS ?= -n16

Binary file not shown.

View File

@@ -1,274 +0,0 @@
#include <iostream>
#include <fstream>
#include <unistd.h>
#include <string.h>
#include <vortex.h>
#include <vector>
#include "common.h"
// Evaluates a runtime call that reports success as 0. On any other value the
// failing expression and its return code are printed, cleanup() is invoked
// and the process exits with status -1.
#define RT_CHECK(call)                                                    \
  do {                                                                    \
    const int rt_check_status_ = call;                                    \
    if (rt_check_status_ != 0) {                                          \
      printf("Error: '%s' returned %d!\n", #call, (int)rt_check_status_); \
      cleanup();                                                          \
      exit(-1);                                                           \
    }                                                                     \
  } while (false)
///////////////////////////////////////////////////////////////////////////////
// Path of the kernel image to upload; replaced by the -k option.
const char* kernel_file = "kernel.bin";
// Value of the -n option; main() raises 0 to 1.
uint32_t count = 0;
// Host-side input matrices, filled by generate_source_matrix().
std::vector<float> src_a_data;
std::vector<float> src_b_data;
// Host-computed expected product, filled by generate_reference_matmul().
std::vector<float> ref_data;
// Device handle; stays nullptr until vx_dev_open() succeeds, which is what
// cleanup() checks before releasing anything.
vx_device_h device = nullptr;
// Host staging buffer used for every host<->device copy.
std::vector<uint8_t> staging_buf;
// Kernel arguments (buffer addresses and matrix dimensions), zero-initialized.
kernel_arg_t kernel_arg = {};
// Prints the program banner followed by the command-line synopsis to stdout.
static void show_usage() {
  std::cout << "Vortex Test." << std::endl
            << "Usage: [-k: kernel] [-n words] [-h: help]" << std::endl;
}
// Parses the command line with getopt():
//   -n <words>   stored in `count` (converted with atoi)
//   -k <kernel>  stored in `kernel_file`
//   -h / -?      print usage and exit with status 0
// Any other value returned by getopt() prints usage and exits with status -1.
static void parse_args(int argc, char **argv) {
  for (int opt = getopt(argc, argv, "n:k:h?"); opt != -1;
       opt = getopt(argc, argv, "n:k:h?")) {
    if (opt == 'n') {
      count = atoi(optarg);
    } else if (opt == 'k') {
      kernel_file = optarg;
    } else if (opt == 'h' || opt == '?') {
      show_usage();
      exit(0);
    } else {
      show_usage();
      exit(-1);
    }
  }
}
// Releases the three device buffers named in kernel_arg and closes the device.
// Does nothing when no device has been opened.
void cleanup() {
  if (!device) {
    return;
  }
  vx_mem_free(device, kernel_arg.addr_a);
  vx_mem_free(device, kernel_arg.addr_b);
  vx_mem_free(device, kernel_arg.addr_c);
  vx_dev_close(device);
}
// Sizes the two input matrices (A: dim_m x dim_k, B: dim_k x dim_n) and fills
// each with the sequence 0, 1, 2, ... in storage order, echoing every element
// to stdout as it is written.
void generate_source_matrix(uint32_t dim_m, uint32_t dim_n, uint32_t dim_k) {
  src_a_data.resize(dim_m * dim_k);
  src_b_data.resize(dim_k * dim_n);

  // Fills one matrix with its own element indices and prints each entry
  // behind the given prefix.
  auto fill_and_print = [](std::vector<float>& matrix, const char* prefix) {
    uint32_t index = 0;
    for (float& element : matrix) {
      element = static_cast<float>(index);
      std::cout << prefix << index << ": value=" << element << std::endl;
      ++index;
    }
  };

  fill_and_print(src_a_data, "A: ");
  fill_and_print(src_b_data, "B: ");
}
// Computes the expected product C = A * B on the host and stores it in
// ref_data (dim_m x dim_n). Both inputs are read in row-major order:
// A[row][k] is src_a_data[dim_k * row + k], B[k][col] is
// src_b_data[dim_n * k + col].
void generate_reference_matmul(uint32_t dim_m, uint32_t dim_n, uint32_t dim_k) {
  ref_data.resize(dim_m * dim_n);
  for (uint32_t row = 0; row < dim_m; ++row) {
    const uint32_t a_row_base = dim_k * row;
    for (uint32_t col = 0; col < dim_n; ++col) {
      float acc = 0.0f;
      // b_index walks down column `col` of B, one row (dim_n elements) per step.
      uint32_t b_index = col;
      for (uint32_t k = 0; k < dim_k; ++k, b_index += dim_n) {
        acc += src_a_data[a_row_base + k] * src_b_data[b_index];
      }
      ref_data.at(dim_n * row + col) = acc;
    }
  }
}
// Runs the uploaded kernel and checks its output against ref_data.
//
// kernel_arg  kernel arguments; addr_c is the device address of the result
// buf_size    number of bytes to download from addr_c into staging_buf
// dim_m/dim_n result matrix dimensions (dim_m * dim_n floats are compared)
//
// Returns 0 when every element matches within a relative error of 1e-6 and 1
// otherwise. A failing runtime call goes through RT_CHECK, which calls
// cleanup() and exits the process.
int run_test(const kernel_arg_t& kernel_arg,
             uint32_t buf_size,
             uint32_t dim_m, uint32_t dim_n) {
  // start device
  std::cout << "start device" << std::endl;
  RT_CHECK(vx_start(device));

  // wait for completion
  std::cout << "wait for completion" << std::endl;
  RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));

  // download destination buffer
  std::cout << "download destination buffer" << std::endl;
  RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.addr_c, buf_size));

  // verify result
  std::cout << "verify result" << std::endl;
  int errors = 0;
  const float* results = reinterpret_cast<const float*>(staging_buf.data());
  const uint32_t num_results = dim_m * dim_n;
  for (uint32_t i = 0; i < num_results; ++i) {
    const float ref = ref_data.at(i);
    const float cur = results[i];
    // Relative error; when the reference is exactly zero the division would
    // produce NaN/inf, so compare the absolute value instead.
    const float err = (ref != 0.0f) ? std::abs((cur - ref) / ref) : std::abs(cur);
    // Written as !(err <= tol) so that a NaN result counts as a mismatch;
    // `err > tol` is false for NaN and would let it pass silently.
    if (!(err <= 1e-6)) {
      std::cout << "error at result #" << std::dec << i
                << ": actual=" << cur << ", expected=" << ref << std::endl;
      ++errors;
    }
  }

  if (errors != 0) {
    std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
    std::cout << "FAILED!" << std::endl;
    return 1;
  }

  return 0;
}
// Writes `size` bytes starting at `data` to the binary file `path`.
// If the file cannot be opened, reports the failing path, releases the device
// resources through cleanup() and exits with EXIT_FAILURE.
static void dump_to_file(const char* path, const void* data, size_t size) {
  std::ofstream file(path, std::ios::binary | std::ios::out);
  if (!file) {
    std::cerr << "error: failed to open " << path << " for writing\n";
    cleanup();
    exit(EXIT_FAILURE);
  }
  file.write(static_cast<const char*>(data), static_cast<std::streamsize>(size));
  file.close();
}

// Test driver: builds two 64x64 input matrices and the host reference product,
// uploads the kernel, its arguments and the inputs to the device, dumps the
// same payloads to args.bin / input.a.bin / input.b.bin, runs the kernel and
// verifies the result.
// Returns 0 on success; every failure path exits the process.
int main(int argc, char *argv[]) {
  // parse command arguments
  parse_args(argc, argv);

  if (count == 0) {
    count = 1;
  }

  std::srand(50);

  // open device connection
  std::cout << "open device connection" << std::endl;
  RT_CHECK(vx_dev_open(&device));

  // FIXME: hardcoded
  uint32_t dim_m = 64;
  uint32_t dim_n = 64;
  uint32_t dim_k = 64;

  generate_source_matrix(dim_m, dim_n, dim_k);
  generate_reference_matmul(dim_m, dim_n, dim_k);

  uint32_t src_a_buf_size = src_a_data.size() * sizeof(src_a_data[0]);
  uint32_t src_b_buf_size = src_b_data.size() * sizeof(src_b_data[0]);
  uint32_t dst_buf_size = ref_data.size() * sizeof(ref_data[0]);

  std::cout << "buffer size: " << dst_buf_size << " bytes" << std::endl;

  // upload program
  std::cout << "upload program" << std::endl;
  RT_CHECK(vx_upload_kernel_file(device, kernel_file));

  // allocate device memory
  std::cout << "allocate device memory" << std::endl;
  RT_CHECK(vx_mem_alloc(device, src_a_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.addr_a));
  RT_CHECK(vx_mem_alloc(device, src_b_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.addr_b));
  RT_CHECK(vx_mem_alloc(device, dst_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.addr_c));

  kernel_arg.dim_m = dim_m;
  kernel_arg.dim_n = dim_n;
  kernel_arg.dim_k = dim_k;

  std::cout << "dev_addr_a=0x" << std::hex << kernel_arg.addr_a << std::endl;
  std::cout << "dev_addr_b=0x" << std::hex << kernel_arg.addr_b << std::endl;
  std::cout << "dev_addr_c=0x" << std::hex << kernel_arg.addr_c << std::endl;

  // allocate staging buffer: large enough for the biggest single transfer
  {
    std::cout << "allocate staging buffer" << std::endl;
    uint32_t staging_buf_size = std::max<uint32_t>(
        src_a_buf_size,
        std::max<uint32_t>(
            src_b_buf_size,
            std::max<uint32_t>(dst_buf_size, sizeof(kernel_arg_t))));
    staging_buf.resize(staging_buf_size);
  }

  // upload kernel argument
  {
    std::cout << "upload kernel argument" << std::endl;

    // NOTE(review): the addresses returned by vx_mem_alloc() above are
    // replaced by fixed addresses here, so every later copy and the
    // vx_mem_free() calls in cleanup() use these values instead of the
    // allocated ones. Kept as in the original; confirm this is intended.
    kernel_arg.addr_a = (uint64_t) 0x20000;
    kernel_arg.addr_b = (uint64_t) 0x28000;
    kernel_arg.addr_c = (uint64_t) 0xc0000000ULL;

    memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t));

    std::cout << "uploading argument buffer to device, device mem address="
              << std::hex << KERNEL_ARG_DEV_MEM_ADDR << ", size=" << std::dec
              << sizeof(kernel_arg_t) << " bytes\n";

    dump_to_file("args.bin", staging_buf.data(), sizeof(kernel_arg_t));

    RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
  }

  // upload source buffers
  {
    memcpy(staging_buf.data(), src_a_data.data(), src_a_data.size() * sizeof(float));
    RT_CHECK(vx_copy_to_dev(device, kernel_arg.addr_a, staging_buf.data(),
                            src_a_buf_size));

    std::cout << "uploading source A matrix to device, device mem address="
              << std::hex << kernel_arg.addr_a << ", size=" << std::dec
              << src_a_buf_size << " bytes\n";

    dump_to_file("input.a.bin", staging_buf.data(), src_a_buf_size);
  }
  {
    memcpy(staging_buf.data(), src_b_data.data(), src_b_data.size() * sizeof(float));
    RT_CHECK(vx_copy_to_dev(device, kernel_arg.addr_b, staging_buf.data(),
                            src_b_buf_size));

    std::cout << "uploading source B matrix to device, device mem address="
              << std::hex << kernel_arg.addr_b << ", size=" << std::dec
              << src_b_buf_size << " bytes\n";

    dump_to_file("input.b.bin", staging_buf.data(), src_b_buf_size);
  }

  // clear destination buffer: fill it with a recognizable pattern so that
  // elements the kernel never writes fail verification
  {
    std::cout << "clear destination buffer" << std::endl;
    // unsigned element type: 0xdeadbeef does not fit in int32_t
    auto buf_ptr = reinterpret_cast<uint32_t*>(staging_buf.data());
    for (uint32_t i = 0; i < ref_data.size(); ++i) {
      buf_ptr[i] = 0xdeadbeef;
    }
    RT_CHECK(vx_copy_to_dev(device, kernel_arg.addr_c, staging_buf.data(), dst_buf_size));
  }

  // run tests
  std::cout << "run tests" << std::endl;
  RT_CHECK(run_test(kernel_arg, dst_buf_size, kernel_arg.dim_m, kernel_arg.dim_n));
  std::cout << "PASSED!" << std::endl;

  // cleanup
  std::cout << "cleanup" << std::endl;
  cleanup();

  return 0;
}

View File

@@ -1,16 +0,0 @@
# Build description for the "basic" regression test.
PROJECT = basic
# Host-side sources
SRCS = main.cpp
# Kernel-side sources
VX_SRCS = kernel.cpp ../../../kernel/src/vx_perf.c start.S
# Default run options; `?=` only applies when OPTS is not already set.
OPTS ?= -n256
include ../common.mk
# The assignments below come after the include of ../common.mk.
# NOTE(review): presumably they replace values set there so that this test
# builds its kernel with the GCC toolchain and its own linker flags - confirm
# against common.mk.
VX_LDFLAGS = -Wl,-Bstatic,--gc-sections,-T,$(VORTEX_KN_PATH)/linker/vx_link$(XLEN).ld,--defsym=STARTUP_ADDR=$(STARTUP_ADDR)
VX_CC = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-gcc
VX_CXX = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-g++
VX_DP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objdump
VX_CP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objcopy

View File

@@ -1,12 +0,0 @@
#ifndef _COMMON_H_
#define _COMMON_H_

// The fixed-width integer types used below come from <stdint.h>; including it
// here keeps this header usable regardless of the includer's include order.
#include <stdint.h>

// Device address at which the kernel reads its kernel_arg_t.
#define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000

// Arguments shared between the host program and the kernel.
typedef struct {
  uint32_t count;     // number of 32-bit words each core copies
  uint64_t src_addr;  // device address of the source buffer
  uint64_t dst_addr;  // device address of the destination buffer
} kernel_arg_t;

#endif

View File

@@ -1,18 +0,0 @@
#include <stdint.h>
#include <vx_intrinsics.h>
#include "common.h"
// Kernel entry point: copies `count` 32-bit words from the source buffer to
// the destination buffer. The slice handled here starts at element
// vx_core_id() * count, so each core id works on its own range.
int main() {
  kernel_arg_t* __UNIFORM__ arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
  const uint32_t words = arg->count;
  int32_t* const src = (int32_t*)arg->src_addr;
  int32_t* const dst = (int32_t*)arg->dst_addr;

  // [first, last) is this core's element range.
  const uint32_t first = vx_core_id() * words;
  const uint32_t last = first + words;
  for (uint32_t idx = first; idx != last; ++idx) {
    dst[idx] = src[idx];
  }

  return 0;
}

View File

@@ -1,279 +0,0 @@
#include <iostream>
#include <unistd.h>
#include <string.h>
#include <vortex.h>
#include <chrono>
#include <vector>
#include "common.h"
// Wraps a runtime call whose return value is 0 on success. A non-zero result
// is reported together with the text of the call, then cleanup() runs and the
// process terminates with exit status -1.
#define RT_CHECK(call)                                                      \
  do {                                                                      \
    const int rt_check_result_ = call;                                      \
    if (rt_check_result_ != 0) {                                            \
      printf("Error: '%s' returned %d!\n", #call, (int)rt_check_result_);   \
      cleanup();                                                            \
      exit(-1);                                                             \
    }                                                                       \
  } while (false)
///////////////////////////////////////////////////////////////////////////////
// Path of the kernel image; replaced by the -k option.
const char* kernel_file = "kernel.bin";
// Test number from the -t option; -1 when the option is not given.
int test = -1;
// Value of the -n option.
uint32_t count = 0;
// Device handle; cleanup() only releases resources when it is non-null.
vx_device_h device = nullptr;
// Host staging buffer used for host<->device copies.
std::vector<uint8_t> staging_buf;
// Kernel arguments, zero-initialized.
kernel_arg_t kernel_arg = {};
// Prints the program banner followed by the command-line synopsis to stdout.
static void show_usage() {
  std::cout << "Vortex Test." << std::endl
            << "Usage: [-t testno][-k: kernel][-n words][-h: help]" << std::endl;
}
// Parses the command line with getopt():
//   -n <words>   stored in `count` (converted with atoi)
//   -t <testno>  stored in `test` (converted with atoi)
//   -k <kernel>  stored in `kernel_file`
//   -h / -?      print usage and exit with status 0
// Any other value returned by getopt() prints usage and exits with status -1.
static void parse_args(int argc, char **argv) {
  for (int opt = getopt(argc, argv, "n:t:k:h?"); opt != -1;
       opt = getopt(argc, argv, "n:t:k:h?")) {
    if (opt == 'n') {
      count = atoi(optarg);
    } else if (opt == 't') {
      test = atoi(optarg);
    } else if (opt == 'k') {
      kernel_file = optarg;
    } else if (opt == 'h' || opt == '?') {
      show_usage();
      exit(0);
    } else {
      show_usage();
      exit(-1);
    }
  }
}
// Free both device buffers recorded in kernel_arg and close the device.
// Does nothing when no device handle is held.
void cleanup() {
  if (!device) {
    return;
  }
  vx_mem_free(device, kernel_arg.src_addr);
  vx_mem_free(device, kernel_arg.dst_addr);
  vx_dev_close(device);
}
// Derive the i-th test pattern from `value`: the value shifted left by s bits,
// with its own low s bits OR-ed back into the vacated positions, where
// s = i mod 64.
//
// The shift amount is reduced modulo 64 and the mask is built in 64-bit
// arithmetic, so the result is well defined for every index. Callers pass
// indices up to 8 * num_blocks - 1, which exceeds the width of both int and
// uint64_t. For 0 <= i <= 30 the result equals the previous formula.
uint64_t shuffle(int i, uint64_t value) {
  const unsigned shift = static_cast<unsigned>(i) & 63u;
  const uint64_t low_mask = (uint64_t{1} << shift) - 1;
  return (value << shift) | (value & low_mask);
}
// Round-trip test of host<->device copies.
//
// Fills the staging buffer with shuffle(i, value) patterns, writes
// 64 * num_blocks bytes to device address dev_addr, clears the staging buffer,
// reads the same range back and compares every 64-bit word with its pattern.
//
// dev_addr is 64 bits wide: the caller passes kernel_arg.src_addr, a uint64_t,
// and a 32-bit parameter would silently drop the upper half of the address.
//
// Returns 0 on success (after printing the timings) and 1 when any word
// differs. A failing runtime call terminates the program through RT_CHECK.
int run_memcopy_test(uint64_t dev_addr, uint64_t value, int num_blocks) {
  using hr_clock = std::chrono::high_resolution_clock;

  int errors = 0;

  auto time_start = hr_clock::now();

  // number of 64-bit words covered by num_blocks 64-byte blocks
  int num_blocks_8 = (64 * num_blocks) / 8;
  auto words = reinterpret_cast<uint64_t*>(staging_buf.data());

  // update source buffer
  for (int i = 0; i < num_blocks_8; ++i) {
    words[i] = shuffle(i, value);
  }

  // write source buffer to local memory
  std::cout << "write source buffer to local memory" << std::endl;
  auto t0 = hr_clock::now();
  RT_CHECK(vx_copy_to_dev(device, dev_addr, staging_buf.data(), 64 * num_blocks));
  auto t1 = hr_clock::now();

  // clear destination buffer so stale host data cannot pass verification
  for (int i = 0; i < num_blocks_8; ++i) {
    words[i] = 0;
  }

  // read destination buffer from local memory
  std::cout << "read destination buffer from local memory" << std::endl;
  auto t2 = hr_clock::now();
  RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), dev_addr, 64 * num_blocks));
  auto t3 = hr_clock::now();

  // verify result
  std::cout << "verify result" << std::endl;
  for (int i = 0; i < num_blocks_8; ++i) {
    auto curr = words[i];
    auto ref = shuffle(i, value);
    if (curr != ref) {
      std::cout << "error at 0x" << std::hex << (dev_addr + 8 * i)
                << ": actual 0x" << curr << ", expected 0x" << ref << std::endl;
      ++errors;
    }
  }

  if (errors != 0) {
    std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
    std::cout << "FAILED!" << std::endl;
    return 1;
  }

  auto time_end = hr_clock::now();

  // timings are reported in whole milliseconds
  double elapsed;
  elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(t1 - t0).count();
  printf("upload time: %lg ms\n", elapsed);
  elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(t3 - t2).count();
  printf("download time: %lg ms\n", elapsed);
  elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(time_end - time_start).count();
  printf("Total elapsed time: %lg ms\n", elapsed);

  return 0;
}
// Run the copy kernel once and check its output.
//
// Uploads the sequence 0..num_points-1 to kernel_arg.src_addr, pre-fills
// kernel_arg.dst_addr with 0xdeadbeef, starts the device, waits for it, reads
// the destination back and expects word i to equal i.
//
// Returns 0 on success (after printing the timings) and 1 on any mismatch.
// A failing runtime call terminates the program through RT_CHECK.
int run_kernel_test(const kernel_arg_t& kernel_arg,
                    uint32_t buf_size,
                    uint32_t num_points) {
  using hr_clock = std::chrono::high_resolution_clock;

  // whole milliseconds of a time span, widened to double for printf
  const auto to_ms = [](hr_clock::duration span) -> double {
    return std::chrono::duration_cast<std::chrono::milliseconds>(span).count();
  };

  const auto time_start = hr_clock::now();
  const auto host_words = (int32_t*)staging_buf.data();

  // update source buffer
  std::cout << "upload source buffer" << std::endl;
  for (uint32_t n = 0; n != num_points; ++n) {
    host_words[n] = n;
  }
  const auto t0 = hr_clock::now();
  RT_CHECK(vx_copy_to_dev(device, kernel_arg.src_addr, staging_buf.data(), buf_size));
  const auto t1 = hr_clock::now();

  // clear destination buffer
  std::cout << "clear destination buffer" << std::endl;
  for (uint32_t n = 0; n != num_points; ++n) {
    host_words[n] = 0xdeadbeef;
  }
  RT_CHECK(vx_copy_to_dev(device, kernel_arg.dst_addr, staging_buf.data(), buf_size));

  // start device
  std::cout << "start execution" << std::endl;
  const auto t2 = hr_clock::now();
  RT_CHECK(vx_start(device));
  RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
  const auto t3 = hr_clock::now();

  // read destination buffer from local memory
  std::cout << "read destination buffer from local memory" << std::endl;
  const auto t4 = hr_clock::now();
  RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.dst_addr, buf_size));
  const auto t5 = hr_clock::now();

  // verify result
  std::cout << "verify result" << std::endl;
  int mismatches = 0;
  for (uint32_t n = 0; n != num_points; ++n) {
    const int32_t actual = host_words[n];
    const int32_t expected = n;
    if (actual == expected) {
      continue;
    }
    std::cout << "error at result #" << std::dec << n
              << std::hex << ": actual 0x" << actual << ", expected 0x" << expected << std::endl;
    ++mismatches;
  }

  if (mismatches != 0) {
    std::cout << "Found " << std::dec << mismatches << " errors!" << std::endl;
    std::cout << "FAILED!" << std::endl;
    return 1;
  }

  const auto time_end = hr_clock::now();

  printf("upload time: %lg ms\n", to_ms(t1 - t0));
  printf("execute time: %lg ms\n", to_ms(t3 - t2));
  printf("download time: %lg ms\n", to_ms(t5 - t4));
  printf("Total elapsed time: %lg ms\n", to_ms(time_end - time_start));

  return 0;
}
// Host driver for the basic regression test.
//
// Opens the device, allocates one source and one destination buffer sized for
// count * num_cores 32-bit words (rounded up to 64-byte blocks), then runs the
// memcopy test and/or the kernel test according to the -t selector. Any
// failing runtime call or test terminates the program through RT_CHECK.
int main(int argc, char *argv[]) {
  // parse command arguments
  parse_args(argc, argv);

  if (count == 0) {
    count = 1;
  }

  // open device connection
  std::cout << "open device connection" << std::endl;
  RT_CHECK(vx_dev_open(&device));

  uint64_t num_cores;
  RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_CORES, &num_cores));

  uint32_t num_points = count * num_cores;
  uint32_t num_blocks = (num_points * sizeof(int32_t) + 63) / 64;
  uint32_t buf_size = num_blocks * 64;

  std::cout << "number of points: " << num_points << std::endl;
  std::cout << "buffer size: " << buf_size << " bytes" << std::endl;

  // allocate device memory
  std::cout << "allocate device memory" << std::endl;
  RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.src_addr));
  RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.dst_addr));

  // The kernel copies arg->count words starting at vx_core_id() * arg->count,
  // so it must receive the per-core word count. Passing the total
  // (count * num_cores) would push every core after the first past the end of
  // both buffers.
  kernel_arg.count = count;

  std::cout << "dev_src=0x" << std::hex << kernel_arg.src_addr << std::endl;
  std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::endl;

  // allocate staging buffer, large enough for a data buffer or the argument block
  std::cout << "allocate staging buffer" << std::endl;
  uint32_t alloc_size = std::max<uint32_t>(buf_size, sizeof(kernel_arg_t));
  staging_buf.resize(alloc_size);

  // run tests
  if (0 == test || -1 == test) {
    std::cout << "run memcopy test" << std::endl;
    RT_CHECK(run_memcopy_test(kernel_arg.src_addr, 0x0badf00d40ff40ff, num_blocks));
  }

  if (1 == test || -1 == test) {
    // upload program
    std::cout << "upload program" << std::endl;
    RT_CHECK(vx_upload_kernel_file(device, kernel_file));

    // upload kernel argument
    std::cout << "upload kernel argument" << std::endl;
    memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t));
    RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));

    std::cout << "run kernel test" << std::endl;
    RT_CHECK(run_kernel_test(kernel_arg, buf_size, num_points));
  }

  // cleanup
  std::cout << "cleanup" << std::endl;
  cleanup();

  std::cout << "Test PASSED" << std::endl;

  return 0;
}

View File

@@ -1,13 +0,0 @@
# Kernel startup stub, placed in the allocatable, executable .init section.
# It calls main, then vx_perf_dump, then issues the instruction that ends
# execution.
.section .init, "ax"
.global _start
.type _start, @function
_start:
# call main routine
call main
# dump perf counter
call vx_perf_dump
# end execution
# NOTE(review): raw R-type encoding on opcode 0x0b with every other field zero;
# presumably a Vortex custom instruction that disables all threads - confirm.
.insn r 0x0b, 0, 0, x0, x0, x0
.size _start, .-_start

View File

@@ -2,11 +2,6 @@ XLEN ?= 32
TOOLDIR ?= /opt
TARGET ?= opaesim
XRT_SYN_DIR ?= ../../../hw/syn/xilinx/xrt
XRT_DEVICE_INDEX ?= 0
ifeq ($(XLEN),64)
RISCV_TOOLCHAIN_PATH ?= $(TOOLDIR)/riscv64-gnu-toolchain
VX_CFLAGS += -march=rv64imafd -mabi=lp64d
@@ -24,13 +19,12 @@ VORTEX_RT_PATH ?= $(realpath ../../../runtime)
VORTEX_KN_PATH ?= $(realpath ../../../kernel)
GEMMINI_SW_PATH ?= $(realpath ../../../gemmini)
FPGA_BIN_DIR ?= $(VORTEX_RT_PATH)/opae
LLVM_VORTEX ?= $(TOOLDIR)/llvm-vortex
LLVM_CFLAGS += --sysroot=$(RISCV_SYSROOT)
LLVM_CFLAGS += --gcc-toolchain=$(RISCV_TOOLCHAIN_PATH)
LLVM_CFLAGS += -Xclang -target-feature -Xclang +vortex
#LLVM_CFLAGS += -mllvm -vortex-branch-divergence=2
#LLVM_CFLAGS += -mllvm -print-after-all
#LLVM_CFLAGS += -I$(RISCV_SYSROOT)/include/c++/9.2.0/$(RISCV_PREFIX)
@@ -53,14 +47,14 @@ VX_CFLAGS += -mcmodel=medany -fno-rtti -fno-exceptions -nostartfiles -fdata-sect
# comment out below for regression/basic, which uses GCC that doesn't
# understand these flags
VX_CFLAGS += -mllvm -inline-threshold=262144
VX_CFLAGS += -I$(VORTEX_KN_PATH)/include -I$(VORTEX_KN_PATH)/../hw -I$(GEMMINI_SW_PATH)
VX_CFLAGS += -I$(VORTEX_KN_PATH)/include -I$(GEMMINI_SW_PATH)
VX_CFLAGS += -DNDEBUG -DLLVM_VORTEX
# VX_LDFLAGS += -Wl,-Bstatic,--gc-sections,-T,$(VORTEX_KN_PATH)/linker/vx_link$(XLEN).ld,--defsym=STARTUP_ADDR=$(STARTUP_ADDR) $(VORTEX_KN_PATH)/libvortexrt.a
VX_LDFLAGS += -Wl,-Bstatic,-T,$(VORTEX_KN_PATH)/linker/vx_link$(XLEN).ld,--defsym=STARTUP_ADDR=$(STARTUP_ADDR) $(VORTEX_KN_PATH)/libvortexrt.a $(VORTEX_KN_PATH)/tohost.S
CXXFLAGS += -std=c++17 -Wall -Wextra -pedantic -Wfatal-errors
CXXFLAGS += -I$(VORTEX_RT_PATH)/include -I$(VORTEX_KN_PATH)/../hw
CXXFLAGS += -I$(VORTEX_RT_PATH)/include
LDFLAGS += -L$(VORTEX_RT_PATH)/stub -lvortex
@@ -71,53 +65,22 @@ else
CXXFLAGS += -O2 -DNDEBUG
endif
ifeq ($(TARGET), fpga)
OPAE_DRV_PATHS ?= libopae-c.so
else
ifeq ($(TARGET), asesim)
OPAE_DRV_PATHS ?= libopae-c-ase.so
else
ifeq ($(TARGET), opaesim)
OPAE_DRV_PATHS ?= libopae-c-sim.so
endif
endif
endif
# CONFIG is supplied from the command line to differentiate ELF files with custom suffixes
CONFIGEXT = $(if $(CONFIG),.$(CONFIG),)
all: $(PROJECT) kernel.bin kernel.dump kernel.radiance.dump kernel$(CONFIGEXT).dump kernel.radiance$(CONFIGEXT).dump
kernel.dump: kernel.elf
$(VX_DP) -D kernel.elf > kernel.dump
all: kernel.radiance.dump kernel.radiance$(CONFIGEXT).dump
kernel.radiance.dump: kernel.radiance.elf
$(VX_DP) -D kernel.radiance.elf > kernel.radiance.dump
ifneq ($(CONFIG),)
kernel$(CONFIGEXT).dump: kernel$(CONFIGEXT).elf
$(VX_DP) -D kernel$(CONFIGEXT).elf > kernel$(CONFIGEXT).dump
kernel.radiance$(CONFIGEXT).dump: kernel.radiance$(CONFIGEXT).elf
$(VX_DP) -D kernel.radiance$(CONFIGEXT).elf > kernel.radiance$(CONFIGEXT).dump
endif
kernel.bin: kernel.elf kernel.radiance.elf
$(VX_CP) -O binary kernel.elf kernel.bin
OBJCOPY ?= $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objcopy
OBJCOPY_FLAGS ?= "LOAD,ALLOC,DATA,CONTENTS"
BINFILES := args.bin input.a.bin input.b.bin input.c.bin
kernel.elf: $(VX_SRCS) $(VX_INCLUDES) $(BINFILES)
$(VX_CXX) $(VX_CFLAGS) -o $@ $(VX_SRCS) $(VX_LDFLAGS)
$(OBJCOPY) --set-section-flags .operand.a=$(OBJCOPY_FLAGS) $@
$(OBJCOPY) --set-section-flags .operand.b=$(OBJCOPY_FLAGS) $@
$(OBJCOPY) --set-section-flags .operand.c=$(OBJCOPY_FLAGS) $@
$(OBJCOPY) --set-section-flags .args=$(OBJCOPY_FLAGS) $@
$(OBJCOPY) --update-section .operand.a=input.a.bin $@ || true
$(OBJCOPY) --update-section .operand.b=input.b.bin $@ || true
$(OBJCOPY) --update-section .operand.c=input.c.bin $@ || true
$(OBJCOPY) --update-section .args=args.bin $@ || true
kernel.radiance.elf: $(VX_SRCS) $(VX_INCLUDES) $(BINFILES)
$(VX_CXX) $(VX_CFLAGS) $(VX_SRCS) $(VX_LDFLAGS) -DRADIANCE -o $@
@@ -131,41 +94,12 @@ kernel.radiance.elf: $(VX_SRCS) $(VX_INCLUDES) $(BINFILES)
$(OBJCOPY) --update-section .args=args.bin $@ || true
ifneq ($(CONFIG),)
kernel$(CONFIGEXT).elf: kernel.elf
cp $< $@
kernel.radiance$(CONFIGEXT).elf: kernel.radiance.elf
cp $< $@
endif
$(PROJECT): $(SRCS)
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
run-simx: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(VORTEX_RT_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-opae: $(PROJECT) kernel.bin
SCOPE_JSON_PATH=$(FPGA_BIN_DIR)/scope.json OPAE_DRV_PATHS=$(OPAE_DRV_PATHS) LD_LIBRARY_PATH=$(VORTEX_RT_PATH)/opae:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-rtlsim: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(VORTEX_RT_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-xrt: $(PROJECT) kernel.bin
ifeq ($(TARGET), hw)
SCOPE_JSON_PATH=$(FPGA_BIN_DIR)/scope.json XRT_INI_PATH=$(XRT_SYN_DIR)/xrt.ini EMCONFIG_PATH=$(FPGA_BIN_DIR) XRT_DEVICE_INDEX=$(XRT_DEVICE_INDEX) XRT_XCLBIN_PATH=$(FPGA_BIN_DIR)/vortex_afu.xclbin LD_LIBRARY_PATH=$(XILINX_XRT)/lib:$(VORTEX_RT_PATH)/xrt:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
else
XCL_EMULATION_MODE=$(TARGET) XRT_INI_PATH=$(XRT_SYN_DIR)/xrt.ini EMCONFIG_PATH=$(FPGA_BIN_DIR) XRT_DEVICE_INDEX=$(XRT_DEVICE_INDEX) XRT_XCLBIN_PATH=$(FPGA_BIN_DIR)/vortex_afu.xclbin LD_LIBRARY_PATH=$(XILINX_XRT)/lib:$(VORTEX_RT_PATH)/xrt:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
endif
.depend: $(SRCS)
$(CXX) $(CXXFLAGS) -MM $^ > .depend;
clean:
rm -rf $(PROJECT) *.o .depend
rm -rf *.o
clean-all: clean
rm -rf kernel.elf kernel.dump
ifneq ($(MAKECMDGOALS),clean)
-include .depend
endif
rm -rf kernel*.elf kernel*.dump

View File

@@ -0,0 +1,145 @@
# Target register width (32 or 64) and the root directory the toolchains live in.
XLEN ?= 32
TOOLDIR ?= /opt
# Choose GNU toolchain location, ISA/ABI flags and kernel start address by XLEN.
ifeq ($(XLEN),64)
RISCV_TOOLCHAIN_PATH ?= $(TOOLDIR)/riscv64-gnu-toolchain
VX_CFLAGS += -march=rv64imafd -mabi=lp64d
STARTUP_ADDR ?= 0x180000000
else
RISCV_TOOLCHAIN_PATH ?= $(TOOLDIR)/riscv-gnu-toolchain
VX_CFLAGS += -march=rv32imafd -mabi=ilp32f
STARTUP_ADDR ?= 0x80000000
endif
# Tool name prefix and sysroot inside the GNU toolchain.
RISCV_PREFIX ?= riscv$(XLEN)-unknown-elf
RISCV_SYSROOT ?= $(RISCV_TOOLCHAIN_PATH)/$(RISCV_PREFIX)
# Repository-relative locations; realpath resolves them against the directory make runs in.
VORTEX_RT_PATH ?= $(realpath ../../../runtime)
VORTEX_KN_PATH ?= $(realpath ../../../kernel)
GEMMINI_SW_PATH ?= $(realpath ../../../third_party/gemmini-rocc-tests)
# LLVM installations: one per kernel flavour built below (vortex, radiance, radiance.32r).
LLVM_VORTEX ?= $(TOOLDIR)/llvm-vortex
LLVM_MUON ?= $(TOOLDIR)/llvm-muon
LLVM_MUON_32R ?= $(TOOLDIR)/llvm-muon-baseline
# Flags appended to every clang/clang++ invocation: GNU sysroot/toolchain and the +vortex target feature.
LLVM_CFLAGS += --sysroot=$(RISCV_SYSROOT)
LLVM_CFLAGS += --gcc-toolchain=$(RISCV_TOOLCHAIN_PATH)
LLVM_CFLAGS += -Xclang -target-feature -Xclang +vortex
# Optional switches, disabled by default.
#LLVM_CFLAGS += -mllvm -vortex-branch-divergence=2
#LLVM_CFLAGS += -mllvm -print-after-all
#LLVM_CFLAGS += -I$(RISCV_SYSROOT)/include/c++/9.2.0/$(RISCV_PREFIX)
#LLVM_CFLAGS += -I$(RISCV_SYSROOT)/include/c++/9.2.0
#LLVM_CFLAGS += -Wl,-L$(RISCV_TOOLCHAIN_PATH)/lib/gcc/$(RISCV_PREFIX)/9.2.0
#LLVM_CFLAGS += --rtlib=libgcc
# Vortex flavour: LLVM compiler and objdump, objcopy from the RISC-V GNU toolchain.
VX_CC = $(LLVM_VORTEX)/bin/clang $(LLVM_CFLAGS)
VX_CXX = $(LLVM_VORTEX)/bin/clang++ $(LLVM_CFLAGS)
VX_DP = $(LLVM_VORTEX)/bin/llvm-objdump
# Name objcopy through $(RISCV_PREFIX) so it follows XLEN. A hard-coded
# riscv32 prefix does not match the riscv64 toolchain selected for XLEN=64.
VX_CP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objcopy
# Radiance flavour (llvm-muon).
MU_CC = $(LLVM_MUON)/bin/clang $(LLVM_CFLAGS)
MU_CXX = $(LLVM_MUON)/bin/clang++ $(LLVM_CFLAGS)
MU_DP = $(LLVM_MUON)/bin/llvm-objdump
MU_CP = $(LLVM_MUON)/bin/llvm-objcopy
# Radiance 32r flavour (llvm-muon-baseline).
MU_32R_CC = $(LLVM_MUON_32R)/bin/clang $(LLVM_CFLAGS)
MU_32R_CXX = $(LLVM_MUON_32R)/bin/clang++ $(LLVM_CFLAGS)
MU_32R_DP = $(LLVM_MUON_32R)/bin/llvm-objdump
MU_32R_CP = $(LLVM_MUON_32R)/bin/llvm-objcopy
# Kernel compile flags shared by all flavours.
VX_CFLAGS += -v -O2 -std=c++17
VX_CFLAGS += -mcmodel=medany -fno-rtti -fno-exceptions -nostartfiles -fdata-sections -ffunction-sections
# comment out below for regression/basic, which uses GCC that doesn't
# understand these flags
VX_CFLAGS += -mllvm -inline-threshold=262144
VX_CFLAGS += -I$(VORTEX_KN_PATH)/include -I$(GEMMINI_SW_PATH)
VX_CFLAGS += -DNDEBUG -DLLVM_VORTEX
# MU_CFLAGS is a snapshot (:=) of VX_CFLAGS as of this line, plus the lld linker selection.
MU_CFLAGS := $(VX_CFLAGS)
MU_CFLAGS += -fuse-ld=lld
# Common link flags: static link with the XLEN-specific linker script and the start address symbol.
VX_LDFLAGS += -Wl,-Bstatic,-T,$(VORTEX_KN_PATH)/linker/vx_link$(XLEN).ld,--defsym=STARTUP_ADDR=$(STARTUP_ADDR)
# MU_LDFLAGS is snapshotted before the Vortex runtime library is appended, so each flavour links its own runtime.
MU_LDFLAGS := $(VX_LDFLAGS)
VX_LDFLAGS += $(VORTEX_KN_PATH)/libvortexrt.a
MU_LDFLAGS += $(VORTEX_KN_PATH)/libvortexrtmuon.a $(VORTEX_KN_PATH)/tohost.S
# Host-side compile and link flags.
CXXFLAGS += -std=c++17 -Wall -Wextra -pedantic -Wfatal-errors
CXXFLAGS += -I$(VORTEX_RT_PATH)/include
LDFLAGS += -L$(VORTEX_RT_PATH)/stub -lvortex
# Debugging
ifdef DEBUG
CXXFLAGS += -g -O0
else
CXXFLAGS += -O2 -DNDEBUG
endif
# CONFIG is supplied from the command line to differentiate ELF files with custom suffixes
CONFIGEXT = $(if $(CONFIG),.$(CONFIG),)
# Default goal: disassembly listings of every kernel flavour. When CONFIG is
# empty, kernel.radiance$(CONFIGEXT).dump names the same file as kernel.radiance.dump.
all: kernel.radiance.dump kernel.radiance$(CONFIGEXT).dump kernel.radiance.32r.dump kernel.vortex.dump
# Each listing is the complete (-D) disassembly of the matching ELF, produced
# by the objdump belonging to the toolchain that built it.
kernel.vortex.dump: kernel.vortex.elf
$(VX_DP) -D kernel.vortex.elf > kernel.vortex.dump
kernel.radiance.dump: kernel.radiance.elf
$(MU_DP) -D kernel.radiance.elf > kernel.radiance.dump
kernel.radiance.32r.dump: kernel.radiance.32r.elf
$(MU_32R_DP) -D kernel.radiance.32r.elf > kernel.radiance.32r.dump
# The CONFIG-suffixed listing only gets its own rule when CONFIG is set.
ifneq ($(CONFIG),)
kernel.radiance$(CONFIGEXT).dump: kernel.radiance$(CONFIGEXT).elf
$(MU_DP) -D kernel.radiance$(CONFIGEXT).elf > kernel.radiance$(CONFIGEXT).dump
endif
# Section flags applied to the operand and argument sections of the ELF files.
OBJCOPY_FLAGS ?= "LOAD,ALLOC,DATA,CONTENTS"
# Binary payloads copied into those sections; every ELF lists them as prerequisites.
BINFILES := args.bin input.a.bin input.b.bin input.c.bin
# Vortex flavour: compile and link, set the flags of .operand.a/.b/.c and .args,
# then replace each section's contents with the matching .bin file.
# `|| true` lets the build continue when a section update fails.
kernel.vortex.elf: $(VX_SRCS) $(VX_INCLUDES) $(BINFILES)
$(VX_CXX) $(VX_CFLAGS) $(VX_SRCS) $(VX_LDFLAGS) -DRADIANCE -o $@
$(VX_CP) --set-section-flags .operand.a=$(OBJCOPY_FLAGS) $@
$(VX_CP) --set-section-flags .operand.b=$(OBJCOPY_FLAGS) $@
$(VX_CP) --set-section-flags .operand.c=$(OBJCOPY_FLAGS) $@
$(VX_CP) --set-section-flags .args=$(OBJCOPY_FLAGS) $@
$(VX_CP) --update-section .operand.a=input.a.bin $@ || true
$(VX_CP) --update-section .operand.b=input.b.bin $@ || true
$(VX_CP) --update-section .operand.c=input.c.bin $@ || true
$(VX_CP) --update-section .args=args.bin $@ || true
# Radiance flavour. The compiler runs three times: with -S, with -c, and
# finally to link the ELF. The section patching is commented out for this flavour.
# NOTE(review): the -S and -c runs also receive the link flags - confirm they
# are intentional debugging aids.
kernel.radiance.elf: $(VX_SRCS) $(VX_INCLUDES) $(BINFILES)
$(MU_CXX) $(MU_CFLAGS) $(VX_SRCS) $(MU_LDFLAGS) -DRADIANCE -S
$(MU_CXX) $(MU_CFLAGS) $(VX_SRCS) $(MU_LDFLAGS) -DRADIANCE -c
$(MU_CXX) $(MU_CFLAGS) $(VX_SRCS) $(MU_LDFLAGS) -DRADIANCE -o $@
# $(MU_CP) --set-section-flags .operand.a=$(OBJCOPY_FLAGS) $@
# $(MU_CP) --set-section-flags .operand.b=$(OBJCOPY_FLAGS) $@
# $(MU_CP) --set-section-flags .operand.c=$(OBJCOPY_FLAGS) $@
# $(MU_CP) --set-section-flags .args=$(OBJCOPY_FLAGS) $@
# $(MU_CP) --update-section .operand.a=input.a.bin $@ || true
# $(MU_CP) --update-section .operand.b=input.b.bin $@ || true
# $(MU_CP) --update-section .operand.c=input.c.bin $@ || true
# $(MU_CP) --update-section .args=args.bin $@ || true
# Radiance 32r flavour: same compile and section-patching steps as the Vortex
# flavour, using the llvm-muon-baseline tools and the MU_* flags.
kernel.radiance.32r.elf: $(VX_SRCS) $(VX_INCLUDES) $(BINFILES)
$(MU_32R_CXX) $(MU_CFLAGS) $(VX_SRCS) $(MU_LDFLAGS) -DRADIANCE -o $@
$(MU_32R_CP) --set-section-flags .operand.a=$(OBJCOPY_FLAGS) $@
$(MU_32R_CP) --set-section-flags .operand.b=$(OBJCOPY_FLAGS) $@
$(MU_32R_CP) --set-section-flags .operand.c=$(OBJCOPY_FLAGS) $@
$(MU_32R_CP) --set-section-flags .args=$(OBJCOPY_FLAGS) $@
$(MU_32R_CP) --update-section .operand.a=input.a.bin $@ || true
$(MU_32R_CP) --update-section .operand.b=input.b.bin $@ || true
$(MU_32R_CP) --update-section .operand.c=input.c.bin $@ || true
$(MU_32R_CP) --update-section .args=args.bin $@ || true
# With CONFIG set, the suffixed ELF is a plain copy of the Radiance ELF.
ifneq ($(CONFIG),)
kernel.radiance$(CONFIGEXT).elf: kernel.radiance.elf
cp $< $@
endif
# Neither target produces a file of its own name. Declaring them phony keeps
# them runnable even if a file called "clean" or "clean-all" appears in the
# directory.
.PHONY: clean clean-all

# Remove object files.
clean:
	rm -rf *.o

# Also remove every generated kernel ELF and disassembly listing.
clean-all: clean
	rm -rf kernel*.elf kernel*.dump

View File

@@ -1,9 +0,0 @@
# Regression test "demo": host program from main.cpp, device kernel from kernel.cpp.
PROJECT = demo
SRCS = main.cpp
VX_SRCS = kernel.cpp
# Default command-line options for the host program (may be overridden from the environment or command line).
OPTS ?= -n64
# Shared build rules.
include ../common.mk

View File

@@ -1,18 +0,0 @@
#ifndef _COMMON_H_
#define _COMMON_H_

// Fixed-width integer types used by kernel_arg_t. Included here so the header
// compiles regardless of what the including file pulled in beforehand.
#include <stdint.h>

// Device address where the host writes the kernel_arg_t block and from which
// the kernel reads it.
#define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000

// Element type of the operand and result buffers; define TYPE before
// including this header (or with -DTYPE=...) to override the default.
#ifndef TYPE
#define TYPE float
#endif

// Argument block shared by the host program and the device kernel.
typedef struct {
  uint32_t num_tasks;  // number of tasks handed to vx_spawn_tasks()
  uint32_t task_size;  // elements processed by each task
  uint64_t src0_addr;  // device address of the first operand buffer
  uint64_t src1_addr;  // device address of the second operand buffer
  uint64_t dst_addr;   // device address of the result buffer
} kernel_arg_t;

#endif

View File

@@ -1,23 +0,0 @@
#include <stdint.h>
#include <vx_intrinsics.h>
#include <vx_spawn.h>
#include "common.h"
// Per-task body: element-wise sum of the two source buffers over this task's
// slice of task_size elements, beginning at element task_id * task_size.
void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) {
  const uint32_t slice_len = arg->task_size;
  const uint32_t slice_begin = task_id * slice_len;

  auto lhs = reinterpret_cast<TYPE*>(arg->src0_addr);
  auto rhs = reinterpret_cast<TYPE*>(arg->src1_addr);
  auto out = reinterpret_cast<TYPE*>(arg->dst_addr);

  for (uint32_t n = 0; n != slice_len; ++n) {
    const uint32_t at = slice_begin + n;
    out[at] = lhs[at] + rhs[at];
  }
}
// Kernel entry point: read the argument block from its fixed device address
// and spawn num_tasks instances of kernel_body over it.
int main() {
  auto arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
  const auto body = (vx_spawn_tasks_cb)kernel_body;
  vx_spawn_tasks(arg->num_tasks, body, arg);
  return 0;
}

View File

@@ -1,245 +0,0 @@
#include <iostream>
#include <unistd.h>
#include <string.h>
#include <vector>
#include <vortex.h>
#include "common.h"
// Largest difference between the bit patterns of two floats that
// Comparator<float>::compare() still accepts as equal.
#define FLOAT_ULP 6

// Evaluate a runtime call once. On a non-zero status, print the failing
// expression and its status, call cleanup() and terminate with exit(-1).
#define RT_CHECK(_expr)                                         \
   do {                                                         \
     int _ret = _expr;                                          \
     if (0 == _ret)                                             \
       break;                                                   \
     printf("Error: '%s' returned %d!\n", #_expr, (int)_ret);   \
     cleanup();                                                 \
     exit(-1);                                                  \
   } while (false)

///////////////////////////////////////////////////////////////////////////////

// Per-element-type helpers used by the test: a printable type name, a random
// sample generator and a result comparison. Only the specializations below
// are usable.
template <typename Type>
class Comparator {};

template <>
class Comparator<int> {
public:
  static const char* type_str() {
    return "integer";
  }
  // Next value from rand().
  static int generate() {
    return rand();
  }
  // Exact comparison. `a` is reported as the expected value and `b` as the
  // actual one; only the first 100 mismatches (per the caller's running
  // `errors` count) are printed.
  static bool compare(int a, int b, int index, int errors) {
    if (a != b) {
      if (errors < 100) {
        printf("*** error: [%d] expected=%d, actual=%d\n", index, a, b);
      }
      return false;
    }
    return true;
  }
};

template <>
class Comparator<float> {
public:
  static const char* type_str() {
    return "float";
  }
  // Random sample in [0, 1]. The return type must be float: returning int
  // would truncate every sample to 0, or 1 when rand() == RAND_MAX.
  static float generate() {
    return static_cast<float>(rand()) / RAND_MAX;
  }
  // Approximate comparison: values are equal when they compare equal as
  // floats, or when their 32-bit patterns differ by at most FLOAT_ULP.
  // `a` is reported as the expected value and `b` as the actual one; only the
  // first 100 mismatches (per the caller's running `errors` count) are printed.
  static bool compare(float a, float b, int index, int errors) {
    if (a == b) {
      return true;  // also covers +0.0f versus -0.0f
    }
    // Read the bit patterns through memcpy rather than a union, which is not
    // a defined way to reinterpret an object in C++.
    int32_t bits_a, bits_b;
    memcpy(&bits_a, &a, sizeof(bits_a));
    memcpy(&bits_b, &b, sizeof(bits_b));
    // Subtract in 64 bits so operands of opposite sign cannot overflow.
    int64_t distance = static_cast<int64_t>(bits_a) - static_cast<int64_t>(bits_b);
    if (distance < 0) {
      distance = -distance;
    }
    if (distance > FLOAT_ULP) {
      if (errors < 100) {
        printf("*** error: [%d] expected=%f, actual=%f\n", index, a, b);
      }
      return false;
    }
    return true;
  }
};
// Kernel image uploaded to the device; overridden by -k.
const char* kernel_file = "kernel.bin";
// Elements per task (-n); main() stores it in kernel_arg.task_size.
uint32_t count = 16;
// Device handle filled in by vx_dev_open(); cleanup() does nothing while it is null.
vx_device_h device = nullptr;
// Host copy of both operands, interleaved: element 2*i belongs to source
// buffer 0 and element 2*i+1 to source buffer 1 of point i.
std::vector<TYPE> source_data;
// Host-side scratch buffer used for every transfer to and from the device.
std::vector<uint8_t> staging_buf;
// Argument block sent to the kernel; also holds the device buffer addresses cleanup() frees.
kernel_arg_t kernel_arg = {};
// Print the program banner and the command-line synopsis to standard output.
static void show_usage() {
  std::cout << "Vortex Test." << std::endl
            << "Usage: [-k: kernel] [-n words] [-h: help]" << std::endl;
}
// Parse the command line into the file-level settings.
//   -n <words> -> count
//   -k <file>  -> kernel_file
//   -h / -?    -> print usage and exit(0); getopt() also returns '?' for
//                 options it does not recognise, so those take this path too
// Any other value returned by getopt() prints usage and exits with -1.
static void parse_args(int argc, char **argv) {
  for (;;) {
    const int opt = getopt(argc, argv, "n:k:h?");
    if (opt == -1) {
      return;
    }
    if (opt == 'n') {
      count = atoi(optarg);
    } else if (opt == 'k') {
      kernel_file = optarg;
    } else if (opt == 'h' || opt == '?') {
      show_usage();
      exit(0);
    } else {
      show_usage();
      exit(-1);
    }
  }
}
// Free the three device buffers recorded in kernel_arg and close the device.
// Does nothing when no device handle is held.
void cleanup() {
  if (!device) {
    return;
  }
  vx_mem_free(device, kernel_arg.src0_addr);
  vx_mem_free(device, kernel_arg.src1_addr);
  vx_mem_free(device, kernel_arg.dst_addr);
  vx_dev_close(device);
}
// Start the device, wait for the kernel to finish, download the destination
// buffer and compare every element with the host-computed sum of the two
// operands.
//
// Returns 0 when all num_points elements match and 1 otherwise. A failing
// runtime call terminates the program through RT_CHECK.
int run_test(const kernel_arg_t& kernel_arg,
             uint32_t buf_size,
             uint32_t num_points) {
  // start device
  std::cout << "start device" << std::endl;
  RT_CHECK(vx_start(device));

  // wait for completion
  std::cout << "wait for completion" << std::endl;
  RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));

  // download destination buffer
  std::cout << "download destination buffer" << std::endl;
  RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.dst_addr, buf_size));

  // verify result
  std::cout << "verify result" << std::endl;
  {
    int errors = 0;
    auto buf_ptr = (TYPE*)staging_buf.data();
    for (uint32_t i = 0; i < num_points; ++i) {
      // source_data interleaves the operands: 2*i is operand 0, 2*i+1 operand 1.
      auto ref = source_data[2 * i + 0] + source_data[2 * i + 1];
      auto cur = buf_ptr[i];
      // compare() prints its first argument as "expected" and its second as
      // "actual", so the reference value goes first.
      if (!Comparator<TYPE>::compare(ref, cur, i, errors)) {
        ++errors;
      }
    }
    if (errors != 0) {
      std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
      std::cout << "FAILED!" << std::endl;
      return 1;
    }
  }

  return 0;
}
// Host driver for the demo test: sizes the problem from the device
// capabilities, uploads the kernel, its arguments and two operand buffers of
// random data, runs the kernel through run_test() and reports the outcome.
// Any failing runtime call or test terminates the program through RT_CHECK.
int main(int argc, char *argv[]) {
  // parse command arguments
  parse_args(argc, argv);

  // fixed seed so every run generates the same operands
  std::srand(50);

  // open device connection
  std::cout << "open device connection" << std::endl;
  RT_CHECK(vx_dev_open(&device));

  // one task per hardware thread, `count` elements per task
  uint64_t num_cores, num_warps, num_threads;
  RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_CORES, &num_cores));
  RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_WARPS, &num_warps));
  RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_THREADS, &num_threads));

  const uint32_t num_tasks = num_cores * num_warps * num_threads;
  const uint32_t num_points = count * num_tasks;
  const uint32_t buf_size = num_points * sizeof(TYPE);

  std::cout << "data type: " << Comparator<TYPE>::type_str() << std::endl;
  std::cout << "number of points: " << num_points << std::endl;
  std::cout << "buffer size: " << buf_size << " bytes" << std::endl;

  // upload program
  std::cout << "upload program" << std::endl;
  RT_CHECK(vx_upload_kernel_file(device, kernel_file));

  // allocate device memory
  std::cout << "allocate device memory" << std::endl;
  RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.src0_addr));
  RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.src1_addr));
  RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.dst_addr));

  kernel_arg.num_tasks = num_tasks;
  kernel_arg.task_size = count;

  std::cout << "dev_src0=0x" << std::hex << kernel_arg.src0_addr << std::endl;
  std::cout << "dev_src1=0x" << std::hex << kernel_arg.src1_addr << std::endl;
  std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::endl;

  // allocate staging buffer, large enough for a data buffer or the argument block
  std::cout << "allocate staging buffer" << std::endl;
  staging_buf.resize(std::max<uint32_t>(buf_size, sizeof(kernel_arg_t)));

  // upload kernel argument
  std::cout << "upload kernel argument" << std::endl;
  memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t));
  RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));

  // generate source data (both operands interleaved)
  source_data.resize(2 * num_points);
  for (auto& sample : source_data) {
    sample = Comparator<TYPE>::generate();
  }

  // upload source buffer0 (even elements of source_data)
  std::cout << "upload source buffer0" << std::endl;
  {
    auto operand = (TYPE*)staging_buf.data();
    for (uint32_t p = 0; p != num_points; ++p) {
      operand[p] = source_data[2 * p + 0];
    }
  }
  RT_CHECK(vx_copy_to_dev(device, kernel_arg.src0_addr, staging_buf.data(), buf_size));

  // upload source buffer1 (odd elements of source_data)
  std::cout << "upload source buffer1" << std::endl;
  {
    auto operand = (TYPE*)staging_buf.data();
    for (uint32_t p = 0; p != num_points; ++p) {
      operand[p] = source_data[2 * p + 1];
    }
  }
  RT_CHECK(vx_copy_to_dev(device, kernel_arg.src1_addr, staging_buf.data(), buf_size));

  // clear destination buffer
  std::cout << "clear destination buffer" << std::endl;
  memset(staging_buf.data(), 0, num_points * sizeof(TYPE));
  RT_CHECK(vx_copy_to_dev(device, kernel_arg.dst_addr, staging_buf.data(), buf_size));

  // run tests
  std::cout << "run tests" << std::endl;
  RT_CHECK(run_test(kernel_arg, buf_size, num_points));

  // cleanup
  std::cout << "cleanup" << std::endl;
  cleanup();

  std::cout << "PASSED!" << std::endl;
  return 0;
}

View File

@@ -1,9 +0,0 @@
# Regression test "diverge": host program from main.cpp, device kernel from kernel.cpp.
PROJECT = diverge
SRCS = main.cpp
VX_SRCS = kernel.cpp
# Default command-line options for the host program (may be overridden from the environment or command line).
OPTS ?= -n16
# Shared build rules.
include ../common.mk

View File

@@ -1,12 +0,0 @@
#ifndef _COMMON_H_
#define _COMMON_H_

// Fixed-width integer types used by kernel_arg_t. Included here so the header
// compiles regardless of what the including file pulled in beforehand.
#include <stdint.h>

// Device address where the host writes the kernel_arg_t block and from which
// the kernel reads it.
#define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000

// Argument block shared by the host program and the device kernel.
typedef struct {
  uint32_t num_points;  // number of tasks spawned; one output value per task
  uint64_t src_addr;    // device address of the input buffer
  uint64_t dst_addr;    // device address of the output buffer
} kernel_arg_t;

#endif

View File

@@ -1,83 +0,0 @@
#include <stdint.h>
#include <assert.h>
#include <algorithm>
#include <vx_intrinsics.h>
#include <vx_spawn.h>
#include "common.h"
// Control-flow test kernel: each task derives one value from src_ptr by
// running a fixed sequence of branch shapes keyed on task_id, then stores it
// to dst_ptr[task_id]. The host's gen_ref_data() repeats the same sequence to
// build the expected values, so the two must be kept in step.
void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) {
int32_t* src_ptr = (int32_t*)arg->src_addr;
int32_t* dst_ptr = (int32_t*)arg->dst_addr;
int value = src_ptr[task_id];
// none taken
if (task_id >= 0x7fffffff) {
value = 0;
} else {
value += 2;
}
// diverge
if (task_id > 1) {
if (task_id > 2) {
value += 6;
} else {
value += 5;
}
} else {
if (task_id > 0) {
value += 4;
} else {
value += 3;
}
}
// all taken
if (task_id >= 0) {
value += 7;
} else {
value = 0;
}
// loop
// (trip count equals task_id, so it differs from task to task)
for (int i = 0, n = task_id; i < n; ++i) {
value += src_ptr[i];
}
// switch
switch (task_id) {
case 0:
value += 1;
break;
case 1:
value -= 1;
break;
case 2:
value *= 3;
break;
case 3:
value *= 5;
break;
default:
assert(task_id < arg->num_points);
break;
}
// select
value += (task_id >= 0) ? ((task_id > 5) ? src_ptr[0] : task_id) : ((task_id < 5) ? src_ptr[1] : -task_id);
// min/max
value += std::min(src_ptr[task_id], value);
value += std::max(src_ptr[task_id], value);
dst_ptr[task_id] = value;
}
// Kernel entry point: read the argument block from its fixed device address
// and spawn num_points instances of kernel_body over it.
int main() {
  auto arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
  const auto body = (vx_spawn_tasks_cb)kernel_body;
  vx_spawn_tasks(arg->num_points, body, arg);
  return 0;
}

View File

@@ -1,268 +0,0 @@
#include <iostream>
#include <unistd.h>
#include <string.h>
#include <vortex.h>
#include <vector>
#include <assert.h>
#include "common.h"
// Evaluate a runtime call once. On a non-zero status, print the failing
// expression and its status, call cleanup() and terminate with exit(-1).
#define RT_CHECK(_expr)                                         \
  do {                                                          \
    int _ret = _expr;                                           \
    if (0 != _ret) {                                            \
      printf("Error: '%s' returned %d!\n", #_expr, (int)_ret);  \
      cleanup();                                                \
      exit(-1);                                                 \
    }                                                           \
  } while (false)
///////////////////////////////////////////////////////////////////////////////
// Kernel image uploaded to the device; overridden by -k.
const char* kernel_file = "kernel.bin";
// Value of -n; main() substitutes 1 when it is left at 0.
uint32_t count = 0;
// Random input values, filled by gen_input_data().
std::vector<int> src_data;
// Expected output values, filled by gen_ref_data().
std::vector<int> ref_data;
// Device handle filled in by vx_dev_open(); cleanup() does nothing while it is null.
vx_device_h device = nullptr;
// Host-side scratch buffer used for every transfer to and from the device.
std::vector<uint8_t> staging_buf;
// Argument block sent to the kernel; also holds the device buffer addresses cleanup() frees.
kernel_arg_t kernel_arg = {};
// Print the program banner and the command-line synopsis to standard output.
static void show_usage() {
  std::cout << "Vortex Test." << std::endl
            << "Usage: [-k: kernel] [-n words] [-h: help]" << std::endl;
}
// Parse the command line into the file-level settings.
//   -n <words> -> count
//   -k <file>  -> kernel_file
//   -h / -?    -> print usage and exit(0); getopt() also returns '?' for
//                 options it does not recognise, so those take this path too
// Any other value returned by getopt() prints usage and exits with -1.
static void parse_args(int argc, char **argv) {
  for (;;) {
    const int opt = getopt(argc, argv, "n:k:h?");
    if (opt == -1) {
      return;
    }
    if (opt == 'n') {
      count = atoi(optarg);
    } else if (opt == 'k') {
      kernel_file = optarg;
    } else if (opt == 'h' || opt == '?') {
      show_usage();
      exit(0);
    } else {
      show_usage();
      exit(-1);
    }
  }
}
// Free both device buffers recorded in kernel_arg and close the device.
// Does nothing when no device handle is held.
void cleanup() {
  if (!device) {
    return;
  }
  vx_mem_free(device, kernel_arg.src_addr);
  vx_mem_free(device, kernel_arg.dst_addr);
  vx_dev_close(device);
}
// Resize src_data to num_points entries and fill each one, in order, with the
// next value from std::rand().
void gen_input_data(uint32_t num_points) {
  src_data.resize(num_points);
  for (auto& slot : src_data) {
    slot = std::rand();
  }
}
// Build the expected output for every point into ref_data by repeating, on
// the host, the same sequence of operations the device kernel applies per
// task (with the loop index i standing in for task_id and src_data for the
// source buffer). The statement sequence is kept parallel to the kernel on
// purpose.
// NOTE(review): `value` is an int seeded from std::rand(); the additions and
// multiplications below can exceed the int range, which is undefined for
// signed arithmetic on the host - confirm that wrap-around is what the
// comparison relies on.
void gen_ref_data(uint32_t num_points) {
ref_data.resize(num_points);
for (int i = 0; i < (int)ref_data.size(); ++i) {
int value = src_data.at(i);
// none taken
if (i >= 0x7fffffff) {
value = 0;
} else {
value += 2;
}
// diverge
if (i > 1) {
if (i > 2) {
value += 6;
} else {
value += 5;
}
} else {
if (i > 0) {
value += 4;
} else {
value += 3;
}
}
// all taken
if (i >= 0) {
value += 7;
} else {
value = 0;
}
// loop
for (int j = 0, n = i; j < n; ++j) {
value += src_data.at(j);
}
// switch
switch (i) {
case 0:
value += 1;
break;
case 1:
value -= 1;
break;
case 2:
value *= 3;
break;
case 3:
value *= 5;
break;
default:
assert(i < (int)num_points);
break;
}
// select
value += (i >= 0) ? ((i > 5) ? src_data.at(0) : i) : ((i < 5) ? src_data.at(1) : -i);
// min/max
value += std::min(src_data.at(i), value);
value += std::max(src_data.at(i), value);
ref_data[i] = value;
}
}
// Start the device, wait for the kernel to finish, download the destination
// buffer and compare each of the num_points words with ref_data.
//
// Returns 0 when everything matches and 1 otherwise. A failing runtime call
// terminates the program through RT_CHECK.
int run_test(const kernel_arg_t& kernel_arg,
             uint32_t buf_size,
             uint32_t num_points) {
  // start device
  std::cout << "start device" << std::endl;
  RT_CHECK(vx_start(device));

  // wait for completion
  std::cout << "wait for completion" << std::endl;
  RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));

  // download destination buffer
  std::cout << "download destination buffer" << std::endl;
  RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.dst_addr, buf_size));

  // verify result
  std::cout << "verify result" << std::endl;
  const auto results = (int32_t*)staging_buf.data();
  int mismatches = 0;
  for (uint32_t idx = 0; idx != num_points; ++idx) {
    const int expected = ref_data.at(idx);
    const int actual = results[idx];
    if (actual == expected) {
      continue;
    }
    std::cout << "error at result #" << std::dec << idx
              << std::hex << ": actual 0x" << actual << ", expected 0x" << expected << std::endl;
    ++mismatches;
  }

  if (mismatches == 0) {
    return 0;
  }
  std::cout << "Found " << std::dec << mismatches << " errors!" << std::endl;
  std::cout << "FAILED!" << std::endl;
  return 1;
}
// Host entry point: generates input/reference data, uploads the kernel, its
// argument block and the source buffer to the device, pre-fills the
// destination buffer with a marker pattern, then hands over to run_test().
// Returns 0 on success; any runtime failure exits through RT_CHECK.
int main(int argc, char *argv[]) {
  // parse command arguments
  parse_args(argc, argv);
  if (count == 0) {
    count = 1;
  }

  // fixed seed (presumably consumed by the data generators below)
  std::srand(50);

  // open device connection
  std::cout << "open device connection" << std::endl;
  RT_CHECK(vx_dev_open(&device));

  uint32_t num_points = count;

  // generate input data
  gen_input_data(num_points);

  // generate reference data
  gen_ref_data(num_points);

  uint32_t src_buf_size = src_data.size() * sizeof(int32_t);
  uint32_t dst_buf_size = ref_data.size() * sizeof(int32_t);

  std::cout << "number of points: " << num_points << std::endl;
  std::cout << "buffer size: " << dst_buf_size << " bytes" << std::endl;

  // upload program
  std::cout << "upload program" << std::endl;
  RT_CHECK(vx_upload_kernel_file(device, kernel_file));

  // allocate device memory
  std::cout << "allocate device memory" << std::endl;
  RT_CHECK(vx_mem_alloc(device, src_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.src_addr));
  RT_CHECK(vx_mem_alloc(device, dst_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.dst_addr));

  kernel_arg.num_points = num_points;

  // restore decimal output afterwards: std::hex is sticky on the stream
  std::cout << "dev_src=0x" << std::hex << kernel_arg.src_addr << std::dec << std::endl;
  std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::dec << std::endl;

  // allocate staging buffer (large enough for the biggest single transfer)
  std::cout << "allocate staging buffer" << std::endl;
  uint32_t staging_buf_size = std::max<uint32_t>(src_buf_size,
                                std::max<uint32_t>(dst_buf_size,
                                  sizeof(kernel_arg_t)));
  staging_buf.resize(staging_buf_size);

  // upload kernel argument
  std::cout << "upload kernel argument" << std::endl;
  memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t));
  RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));

  // upload source buffer
  {
    std::cout << "upload source buffer" << std::endl;
    // stage exactly the number of bytes that is uploaded, so the transfer
    // never carries stale staging content and never over-reads src_data
    memcpy(staging_buf.data(), src_data.data(), src_buf_size);
    RT_CHECK(vx_copy_to_dev(device, kernel_arg.src_addr, staging_buf.data(), src_buf_size));
  }

  // clear destination buffer
  {
    std::cout << "clear destination buffer" << std::endl;
    auto buf_ptr = (int32_t*)staging_buf.data();
    // fill every word that is about to be uploaded with the marker value
    const uint32_t num_words = dst_buf_size / sizeof(int32_t);
    for (uint32_t i = 0; i < num_words; ++i) {
      buf_ptr[i] = 0xdeadbeef;
    }
    RT_CHECK(vx_copy_to_dev(device, kernel_arg.dst_addr, staging_buf.data(), dst_buf_size));
  }

  // run tests
  std::cout << "run tests" << std::endl;
  RT_CHECK(run_test(kernel_arg, dst_buf_size, num_points));

  // cleanup
  std::cout << "cleanup" << std::endl;
  cleanup();

  std::cout << "PASSED!" << std::endl;

  return 0;
}

View File

@@ -1,9 +0,0 @@
PROJECT = dogfood
SRCS = main.cpp
VX_SRCS = kernel.cpp
OPTS ?= -n64 -x19 -x20
include ../common.mk

View File

@@ -1,15 +0,0 @@
#ifndef _COMMON_H_
#define _COMMON_H_

// Fixed-width integer types used by kernel_arg_t; included here so the header
// compiles regardless of what the including file pulled in before it.
#include <stdint.h>

// Device address at which the host writes the kernel argument block and from
// which the device kernel reads it.
#define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000

// Argument block shared between the host driver and the device kernel.
typedef struct {
  uint32_t testid;     // index of the kernel to run
  uint32_t num_tasks;  // number of tasks to spawn
  uint32_t task_size;  // number of elements processed by each task
  uint64_t src0_addr;  // device address of the first source buffer
  uint64_t src1_addr;  // device address of the second source buffer
  uint64_t dst_addr;   // device address of the destination buffer
} kernel_arg_t;

#endif

View File

@@ -1,396 +0,0 @@
#include <stdint.h>
#include <math.h>
#include <vx_intrinsics.h>
#include <vx_spawn.h>
#include "common.h"
// Signature shared by every test kernel in this file: the task index plus the
// shared argument block.
typedef void (*PFN_Kernel)(int task_id, kernel_arg_t* __UNIFORM__ arg);
// Single-precision square root issued directly as an `fsqrt.s` instruction.
// `x` is both the input and the output operand of the asm statement, so the
// result overwrites the argument before it is returned.
inline float __ieee754_sqrtf (float x) {
asm ("fsqrt.s %0, %1" : "=f" (x) : "f" (x));
return x;
}
// Integer add: dst[i] = src0[i] + src1[i] over this task's slice of
// `task_size` consecutive int32 elements.
void kernel_iadd(int task_id, kernel_arg_t* __UNIFORM__ arg) {
  const auto n    = arg->task_size;
  const auto base = task_id * n;  // first element owned by this task
  const auto lhs  = reinterpret_cast<int32_t*>(arg->src0_addr);
  const auto rhs  = reinterpret_cast<int32_t*>(arg->src1_addr);
  const auto out  = reinterpret_cast<int32_t*>(arg->dst_addr);
  for (uint32_t k = 0; k < n; ++k) {
    const auto idx = base + k;
    const int32_t x = lhs[idx];
    const int32_t y = rhs[idx];
    out[idx] = x + y;
  }
}
// Integer multiply: dst[i] = src0[i] * src1[i] over this task's slice of
// `task_size` consecutive int32 elements.
void kernel_imul(int task_id, kernel_arg_t* __UNIFORM__ arg) {
  const auto n    = arg->task_size;
  const auto base = task_id * n;  // first element owned by this task
  const auto lhs  = reinterpret_cast<int32_t*>(arg->src0_addr);
  const auto rhs  = reinterpret_cast<int32_t*>(arg->src1_addr);
  const auto out  = reinterpret_cast<int32_t*>(arg->dst_addr);
  for (uint32_t k = 0; k < n; ++k) {
    const auto idx = base + k;
    const auto x = lhs[idx];
    const auto y = rhs[idx];
    out[idx] = x * y;
  }
}
// Integer divide: dst[i] = src0[i] / src1[i] over this task's slice of
// `task_size` consecutive int32 elements.
void kernel_idiv(int task_id, kernel_arg_t* __UNIFORM__ arg) {
  const auto n    = arg->task_size;
  const auto base = task_id * n;  // first element owned by this task
  const auto lhs  = reinterpret_cast<int32_t*>(arg->src0_addr);
  const auto rhs  = reinterpret_cast<int32_t*>(arg->src1_addr);
  const auto out  = reinterpret_cast<int32_t*>(arg->dst_addr);
  for (uint32_t k = 0; k < n; ++k) {
    const auto idx = base + k;
    const auto x = lhs[idx];
    const auto y = rhs[idx];
    out[idx] = x / y;
  }
}
// Integer divide + multiply: dst[i] = (src0[i] / src1[i]) + (src0[i] * src1[i])
// over this task's slice of `task_size` consecutive int32 elements.
void kernel_idiv_mul(int task_id, kernel_arg_t* __UNIFORM__ arg) {
  const auto n    = arg->task_size;
  const auto base = task_id * n;  // first element owned by this task
  const auto lhs  = reinterpret_cast<int32_t*>(arg->src0_addr);
  const auto rhs  = reinterpret_cast<int32_t*>(arg->src1_addr);
  const auto out  = reinterpret_cast<int32_t*>(arg->dst_addr);
  for (uint32_t k = 0; k < n; ++k) {
    const auto idx = base + k;
    const auto x = lhs[idx];
    const auto y = rhs[idx];
    const auto quotient = x / y;
    const auto product  = x * y;
    out[idx] = quotient + product;
  }
}
// Float add: dst[i] = src0[i] + src1[i] over this task's slice of
// `task_size` consecutive float elements.
void kernel_fadd(int task_id, kernel_arg_t* __UNIFORM__ arg) {
  const auto n    = arg->task_size;
  const auto base = task_id * n;  // first element owned by this task
  const auto lhs  = reinterpret_cast<float*>(arg->src0_addr);
  const auto rhs  = reinterpret_cast<float*>(arg->src1_addr);
  const auto out  = reinterpret_cast<float*>(arg->dst_addr);
  for (uint32_t k = 0; k < n; ++k) {
    const auto idx = base + k;
    const float x = lhs[idx];
    const float y = rhs[idx];
    out[idx] = x + y;
  }
}
// Float subtract: dst[i] = src0[i] - src1[i] over this task's slice of
// `task_size` consecutive float elements.
void kernel_fsub(int task_id, kernel_arg_t* __UNIFORM__ arg) {
  const auto n    = arg->task_size;
  const auto base = task_id * n;  // first element owned by this task
  const auto lhs  = reinterpret_cast<float*>(arg->src0_addr);
  const auto rhs  = reinterpret_cast<float*>(arg->src1_addr);
  const auto out  = reinterpret_cast<float*>(arg->dst_addr);
  for (uint32_t k = 0; k < n; ++k) {
    const auto idx = base + k;
    const auto x = lhs[idx];
    const auto y = rhs[idx];
    out[idx] = x - y;
  }
}
// Float multiply: dst[i] = src0[i] * src1[i] over this task's slice of
// `task_size` consecutive float elements.
void kernel_fmul(int task_id, kernel_arg_t* __UNIFORM__ arg) {
  const auto n    = arg->task_size;
  const auto base = task_id * n;  // first element owned by this task
  const auto lhs  = reinterpret_cast<float*>(arg->src0_addr);
  const auto rhs  = reinterpret_cast<float*>(arg->src1_addr);
  const auto out  = reinterpret_cast<float*>(arg->dst_addr);
  for (uint32_t k = 0; k < n; ++k) {
    const auto idx = base + k;
    const auto x = lhs[idx];
    const auto y = rhs[idx];
    out[idx] = x * y;
  }
}
// Float multiply-add: dst[i] = src0[i] * src1[i] + src1[i] over this task's
// slice of `task_size` consecutive float elements.
void kernel_fmadd(int task_id, kernel_arg_t* __UNIFORM__ arg) {
  const auto n    = arg->task_size;
  const auto base = task_id * n;  // first element owned by this task
  const auto lhs  = reinterpret_cast<float*>(arg->src0_addr);
  const auto rhs  = reinterpret_cast<float*>(arg->src1_addr);
  const auto out  = reinterpret_cast<float*>(arg->dst_addr);
  for (uint32_t k = 0; k < n; ++k) {
    const auto idx = base + k;
    const auto x = lhs[idx];
    const auto y = rhs[idx];
    // kept as one expression, exactly as written in the original kernel
    out[idx] = x * y + y;
  }
}
// Float multiply-subtract: dst[i] = src0[i] * src1[i] - src1[i] over this
// task's slice of `task_size` consecutive float elements.
void kernel_fmsub(int task_id, kernel_arg_t* __UNIFORM__ arg) {
  const auto n    = arg->task_size;
  const auto base = task_id * n;  // first element owned by this task
  const auto lhs  = reinterpret_cast<float*>(arg->src0_addr);
  const auto rhs  = reinterpret_cast<float*>(arg->src1_addr);
  const auto out  = reinterpret_cast<float*>(arg->dst_addr);
  for (uint32_t k = 0; k < n; ++k) {
    const auto idx = base + k;
    const auto x = lhs[idx];
    const auto y = rhs[idx];
    // kept as one expression, exactly as written in the original kernel
    out[idx] = x * y - y;
  }
}
// Negated multiply-add: dst[i] = -src0[i] * src1[i] - src1[i] over this
// task's slice of `task_size` consecutive float elements.
void kernel_fnmadd(int task_id, kernel_arg_t* __UNIFORM__ arg) {
  const auto n    = arg->task_size;
  const auto base = task_id * n;  // first element owned by this task
  const auto lhs  = reinterpret_cast<float*>(arg->src0_addr);
  const auto rhs  = reinterpret_cast<float*>(arg->src1_addr);
  const auto out  = reinterpret_cast<float*>(arg->dst_addr);
  for (uint32_t k = 0; k < n; ++k) {
    const auto idx = base + k;
    const auto x = lhs[idx];
    const auto y = rhs[idx];
    // kept as one expression, exactly as written in the original kernel
    out[idx] = -x * y - y;
  }
}
// Negated multiply-subtract: dst[i] = -src0[i] * src1[i] + src1[i] over this
// task's slice of `task_size` consecutive float elements.
void kernel_fnmsub(int task_id, kernel_arg_t* __UNIFORM__ arg) {
  const auto n    = arg->task_size;
  const auto base = task_id * n;  // first element owned by this task
  const auto lhs  = reinterpret_cast<float*>(arg->src0_addr);
  const auto rhs  = reinterpret_cast<float*>(arg->src1_addr);
  const auto out  = reinterpret_cast<float*>(arg->dst_addr);
  for (uint32_t k = 0; k < n; ++k) {
    const auto idx = base + k;
    const auto x = lhs[idx];
    const auto y = rhs[idx];
    // kept as one expression, exactly as written in the original kernel
    out[idx] = -x * y + y;
  }
}
// Combined negated and plain multiply-add:
// dst[i] = (-src0[i] * src1[i] - src1[i]) + (src0[i] * src1[i] + src1[i])
// over this task's slice of `task_size` consecutive float elements.
void kernel_fnmadd_madd(int task_id, kernel_arg_t* __UNIFORM__ arg) {
  const auto n    = arg->task_size;
  const auto base = task_id * n;  // first element owned by this task
  const auto lhs  = reinterpret_cast<float*>(arg->src0_addr);
  const auto rhs  = reinterpret_cast<float*>(arg->src1_addr);
  const auto out  = reinterpret_cast<float*>(arg->dst_addr);
  for (uint32_t k = 0; k < n; ++k) {
    const auto idx = base + k;
    const auto x = lhs[idx];
    const auto y = rhs[idx];
    const auto negated = -x * y - y;
    const auto plain   = x * y + y;
    out[idx] = negated + plain;
  }
}
// Float divide: dst[i] = src0[i] / src1[i] over this task's slice of
// `task_size` consecutive float elements.
void kernel_fdiv(int task_id, kernel_arg_t* __UNIFORM__ arg) {
  const auto n    = arg->task_size;
  const auto base = task_id * n;  // first element owned by this task
  const auto lhs  = reinterpret_cast<float*>(arg->src0_addr);
  const auto rhs  = reinterpret_cast<float*>(arg->src1_addr);
  const auto out  = reinterpret_cast<float*>(arg->dst_addr);
  for (uint32_t k = 0; k < n; ++k) {
    const auto idx = base + k;
    const auto x = lhs[idx];
    const auto y = rhs[idx];
    out[idx] = x / y;
  }
}
// Two float divides: dst[i] = (src0[i] / src1[i]) + (src1[i] / src0[i]) over
// this task's slice of `task_size` consecutive float elements.
void kernel_fdiv2(int task_id, kernel_arg_t* __UNIFORM__ arg) {
  const auto n    = arg->task_size;
  const auto base = task_id * n;  // first element owned by this task
  const auto lhs  = reinterpret_cast<float*>(arg->src0_addr);
  const auto rhs  = reinterpret_cast<float*>(arg->src1_addr);
  const auto out  = reinterpret_cast<float*>(arg->dst_addr);
  for (uint32_t k = 0; k < n; ++k) {
    const auto idx = base + k;
    const auto x = lhs[idx];
    const auto y = rhs[idx];
    const auto forward = x / y;
    const auto inverse = y / x;
    out[idx] = forward + inverse;
  }
}
// Float square root: dst[i] = sqrt(src0[i] * src1[i]) over this task's slice
// of `task_size` consecutive float elements.
void kernel_fsqrt(int task_id, kernel_arg_t* __UNIFORM__ arg) {
  const auto n    = arg->task_size;
  const auto base = task_id * n;  // first element owned by this task
  const auto lhs  = reinterpret_cast<float*>(arg->src0_addr);
  const auto rhs  = reinterpret_cast<float*>(arg->src1_addr);
  const auto out  = reinterpret_cast<float*>(arg->dst_addr);
  for (uint32_t k = 0; k < n; ++k) {
    const auto idx = base + k;
    const auto x = lhs[idx];
    const auto y = rhs[idx];
    out[idx] = __ieee754_sqrtf(x * y);
  }
}
// Float to signed int: dst[i] = (int32_t)(src0[i] + src1[i]) over this task's
// slice of `task_size` consecutive elements.
void kernel_ftoi(int task_id, kernel_arg_t* __UNIFORM__ arg) {
  const auto n    = arg->task_size;
  const auto base = task_id * n;  // first element owned by this task
  const auto lhs  = reinterpret_cast<float*>(arg->src0_addr);
  const auto rhs  = reinterpret_cast<float*>(arg->src1_addr);
  const auto out  = reinterpret_cast<int32_t*>(arg->dst_addr);
  for (uint32_t k = 0; k < n; ++k) {
    const auto idx = base + k;
    const auto sum = lhs[idx] + rhs[idx];
    out[idx] = static_cast<int32_t>(sum);
  }
}
// Float to unsigned int: dst[i] = (uint32_t)(src0[i] + src1[i]) over this
// task's slice of `task_size` consecutive elements.
void kernel_ftou(int task_id, kernel_arg_t* __UNIFORM__ arg) {
  const auto n    = arg->task_size;
  const auto base = task_id * n;  // first element owned by this task
  const auto lhs  = reinterpret_cast<float*>(arg->src0_addr);
  const auto rhs  = reinterpret_cast<float*>(arg->src1_addr);
  const auto out  = reinterpret_cast<uint32_t*>(arg->dst_addr);
  for (uint32_t k = 0; k < n; ++k) {
    const auto idx = base + k;
    const auto sum = lhs[idx] + rhs[idx];
    out[idx] = static_cast<uint32_t>(sum);
  }
}
// Signed int to float: dst[i] = (float)(src0[i] + src1[i]) over this task's
// slice of `task_size` consecutive elements.
void kernel_itof(int task_id, kernel_arg_t* __UNIFORM__ arg) {
  const auto n    = arg->task_size;
  const auto base = task_id * n;  // first element owned by this task
  const auto lhs  = reinterpret_cast<int32_t*>(arg->src0_addr);
  const auto rhs  = reinterpret_cast<int32_t*>(arg->src1_addr);
  const auto out  = reinterpret_cast<float*>(arg->dst_addr);
  for (uint32_t k = 0; k < n; ++k) {
    const auto idx = base + k;
    const auto sum = lhs[idx] + rhs[idx];
    out[idx] = static_cast<float>(sum);
  }
}
// Unsigned int to float: dst[i] = (float)(src0[i] + src1[i]) over this task's
// slice of `task_size` consecutive elements.
//
// The operands are read as uint32_t so that the sum is unsigned and the cast
// is an unsigned-to-float conversion. Reading them as int32_t made this
// kernel a duplicate of kernel_itof, which does a signed conversion.
void kernel_utof(int task_id, kernel_arg_t* __UNIFORM__ arg) {
  auto count = arg->task_size;
  auto src0_ptr = (uint32_t*)arg->src0_addr;
  auto src1_ptr = (uint32_t*)arg->src1_addr;
  auto dst_ptr = (float*)arg->dst_addr;
  auto offset = task_id * count;  // first element owned by this task
  for (uint32_t i = 0; i < count; ++i) {
    auto a = src0_ptr[offset+i];
    auto b = src1_ptr[offset+i];
    auto c = a + b;      // uint32_t sum
    auto d = (float)c;   // unsigned -> float
    dst_ptr[offset+i] = d;
  }
}
// Local-barrier test kernel. Each thread does an amount of work that grows
// with its warp id, issues a memory fence, waits on barrier 0 together with
// `num_warps` participants, and then writes to dst[task_id] the src0 element
// that belongs to the mirrored warp (num_warps - 1 - wid) of the same core and
// thread lane, plus the accumulated `barrier_stall` value.
void kernel_bar(int task_id, kernel_arg_t* __UNIFORM__ arg) {
auto num_warps = vx_num_warps();
auto num_threads = vx_num_threads();
auto cid = vx_core_id();
auto wid = vx_warp_id();
auto tid = vx_thread_id();
auto src0_ptr = (uint32_t*)arg->src0_addr;
auto dst_ptr = (uint32_t*)arg->dst_addr;
// per warp delay
// (wid + 1 iterations; every term is multiplied by src0[0], so the sum is
// zero whenever src0[0] is zero)
uint32_t barrier_stall = 0;
for (int i = 0; i <= wid; ++i) {
barrier_stall += src0_ptr[0] * src0_ptr[i];
}
// memory fence
vx_fence();
// local barrier
vx_barrier(0, num_warps);
// update destination
auto src_idx = (cid * num_warps + (num_warps - 1 - wid)) * num_threads + tid;
dst_ptr[task_id] = src0_ptr[src_idx] + barrier_stall;
}
// Global-barrier test kernel. Each thread does an amount of work that grows
// with its core id and warp id, issues a memory fence, waits on the barrier
// identified by 0x80000000 with `num_cores` participants, and then writes to
// dst[task_id] the src0 element that belongs to the mirrored core
// (num_cores - 1 - cid) and mirrored warp (num_warps - 1 - wid) at the same
// thread lane, plus the accumulated `barrier_stall` value.
void kernel_gbar(int task_id, kernel_arg_t* __UNIFORM__ arg) {
auto num_cores = vx_num_cores();
auto num_warps = vx_num_warps();
auto num_threads = vx_num_threads();
auto cid = vx_core_id();
auto wid = vx_warp_id();
auto tid = vx_thread_id();
auto src0_ptr = (uint32_t*)arg->src0_addr;
auto dst_ptr = (uint32_t*)arg->dst_addr;
// per core delay
// ((cid + 1) * (wid + 1) iterations; every term is multiplied by src0[0])
uint32_t barrier_stall = 0;
for (int i = 0; i <= cid; ++i) {
for (int j = 0; j <= wid; ++j) {
barrier_stall += src0_ptr[0] * src0_ptr[i + j];
}
}
// memory fence
vx_fence();
// global barrier
// NOTE(review): 0x80000000 presumably flags the barrier id as global - confirm
// against the vx_barrier documentation.
vx_barrier(0x80000000, num_cores);
// update destination
auto src_idx = ((num_cores - 1 - cid) * num_warps + (num_warps - 1 - wid)) * num_threads + tid;
dst_ptr[task_id] = src0_ptr[src_idx] + barrier_stall;
}
// Kernel dispatch table, indexed by kernel_arg_t::testid in main().
// The position of an entry is its test id (kernel_iadd = 0 ... kernel_gbar = 20),
// so entries must not be reordered.
// NOTE(review): the host test suite presumably registers its cases in the same
// order - confirm before inserting or removing entries.
static const PFN_Kernel sc_tests[] = {
kernel_iadd,
kernel_imul,
kernel_idiv,
kernel_idiv_mul,
kernel_fadd,
kernel_fsub,
kernel_fmul,
kernel_fmadd,
kernel_fmsub,
kernel_fnmadd,
kernel_fnmsub,
kernel_fnmadd_madd,
kernel_fdiv,
kernel_fdiv2,
kernel_fsqrt,
kernel_ftoi,
kernel_ftou,
kernel_itof,
kernel_utof,
kernel_bar,
kernel_gbar
};
int main() {
auto arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
vx_spawn_tasks(arg->num_tasks, (vx_spawn_tasks_cb)sc_tests[arg->testid], arg);
return 0;
}

View File

@@ -1,218 +0,0 @@
#include <iostream>
#include <vector>
#include <unordered_set>
#include <unistd.h>
#include <string.h>
#include <vector>
#include <vortex.h>
#include "testcases.h"
#include "common.h"
///////////////////////////////////////////////////////////////////////////////
// Program-wide state shared by parse_args(), cleanup() and main().

// Owned test suite; created in main() and deleted in cleanup().
TestSuite* testSuite = nullptr;
// Kernel image to upload (-k).
const char* kernel_file = "kernel.bin";
// Work-item size (-n); main() replaces 0 with 1.
int count = 0;
// Test ids selected with -t; when non-empty only these ids run.
std::unordered_set<int> included;
// Test ids skipped with -x.
std::unordered_set<int> excluded;
// First (-s) and last (-e) test id to run; main() replaces an end id of 0
// with the last test of the suite.
int testid_s = 0;
int testid_e = 0;
// Cleared by -c so that the run continues after a failing test.
bool stop_on_error = true;
// Device handle opened in main() and closed in cleanup().
vx_device_h device = nullptr;
// Host staging buffers. Note the naming offset: src1_buf is uploaded to
// kernel_arg.src0_addr and src2_buf to kernel_arg.src1_addr.
std::vector<uint8_t> arg_buf;
std::vector<uint8_t> src1_buf;
std::vector<uint8_t> src2_buf;
std::vector<uint8_t> dst_buf;
// Argument block uploaded to the device before every test.
kernel_arg_t kernel_arg = {};
// Prints the command-line help text to standard output, one line at a time.
static void show_usage() {
  static const char* const kLines[] = {
    "Vortex Test.",
    "Usage: [-t<testid>: selected test] [-s<testid>: start test] [-e<testid>: end test] [-x<testid>: excluded tests]",
    " [-k<kernel>] [-n<words>] [-c] [-h: help]",
  };
  for (const char* line : kLines) {
    std::cout << line << std::endl;
  }
}
// Parses the command line into the file-level option variables.
//   -n<count>  work-item size          -t<id>  add a test id to `included`
//   -x<id>     add a test id to `excluded`
//   -s<id>     first test id           -e<id>  last test id
//   -k<file>   kernel image            -c      continue after a failing test
//   -h / -?    print usage and exit with status 0
// Any other value returned by getopt prints usage and exits with status -1.
static void parse_args(int argc, char **argv) {
  static const char* const kOptions = "n:t:x:s:e:k:ch?";
  int opt;
  while (-1 != (opt = getopt(argc, argv, kOptions))) {
    if (opt == 'h' || opt == '?') {
      show_usage();
      exit(0);
    }
    switch (opt) {
    case 'n': count = atoi(optarg);           break;
    case 't': included.insert(atoi(optarg));  break;
    case 'x': excluded.insert(atoi(optarg));  break;
    case 's': testid_s = atoi(optarg);        break;
    case 'e': testid_e = atoi(optarg);        break;
    case 'k': kernel_file = optarg;           break;
    case 'c': stop_on_error = false;          break;
    default:
      show_usage();
      exit(-1);
    }
  }
}
// Releases everything main() acquired: the test suite, the three device
// buffers and the device connection. Device resources are only touched when a
// device handle exists.
void cleanup() {
  delete testSuite;  // deleting a null pointer is a no-op
  if (!device) {
    return;
  }
  vx_mem_free(device, kernel_arg.src0_addr);
  vx_mem_free(device, kernel_arg.src1_addr);
  vx_mem_free(device, kernel_arg.dst_addr);
  vx_dev_close(device);
}
// Host entry point of the test runner.
//
// Opens the device, sizes the buffers from the device capabilities
// (cores * warps * threads tasks, `count` elements per task), uploads the
// kernel, then for every selected test id: uploads the argument block and the
// two source buffers, pre-fills the destination with a marker pattern, runs
// the device and verifies the downloaded result.
//
// Returns 0 when every executed test passed and 1 when at least one failed
// (only reachable with -c; otherwise the first failure exits immediately).
int main(int argc, char *argv[]) {
  int exitcode = 0;

  // parse command arguments
  parse_args(argc, argv);
  // a missing or non-positive -n would produce an empty or negative workload
  if (count <= 0) {
    count = 1;
  }

  std::cout << std::dec;
  std::cout << "test ids: " << testid_s << " - " << testid_e << std::endl;
  std::cout << "workitem size: " << count << std::endl;
  std::cout << "using kernel: " << kernel_file << std::endl;

  // open device connection
  std::cout << "open device connection" << std::endl;
  RT_CHECK(vx_dev_open(&device));

  uint64_t num_cores, num_warps, num_threads;
  RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_CORES, &num_cores));
  RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_WARPS, &num_warps));
  RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_THREADS, &num_threads));

  int num_tasks = static_cast<int>(num_cores * num_warps * num_threads);
  int num_points = count * num_tasks;
  size_t buf_size = num_points * sizeof(uint32_t);

  std::cout << "number of points: " << num_points << std::endl;
  std::cout << "buffer size: " << buf_size << " bytes" << std::endl;

  // upload program
  std::cout << "upload kernel" << std::endl;
  RT_CHECK(vx_upload_kernel_file(device, kernel_file));

  // allocate device memory
  std::cout << "allocate device memory" << std::endl;
  RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.src0_addr));
  RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.src1_addr));
  RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.dst_addr));

  kernel_arg.num_tasks = num_tasks;
  kernel_arg.task_size = count;

  std::cout << "dev_src0=0x" << std::hex << kernel_arg.src0_addr << std::dec << std::endl;
  std::cout << "dev_src1=0x" << std::hex << kernel_arg.src1_addr << std::dec << std::endl;
  std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::dec << std::endl;

  // allocate staging buffer
  std::cout << "allocate staging buffer" << std::endl;
  arg_buf.resize(sizeof(kernel_arg_t));
  src1_buf.resize(buf_size);
  src2_buf.resize(buf_size);
  dst_buf.resize(buf_size);

  // allocate test suite
  testSuite = new TestSuite(device);

  // Clamp the requested id range to the tests that actually exist, so that an
  // out-of-range -s/-e never reaches get_test(). An end id of 0 keeps its
  // original meaning of "run up to the last test".
  const int last_testid = static_cast<int>(testSuite->size()) - 1;
  if (testid_e == 0 || testid_e > last_testid) {
    testid_e = last_testid;
  }
  if (testid_s < 0) {
    testid_s = 0;
  }

  // execute tests
  for (int t = testid_s; t <= testid_e; ++t) {
    if (!included.empty()) {
      if (included.count(t) == 0)
        continue;
    }
    if (!excluded.empty()) {
      if (excluded.count(t) != 0)
        continue;
    }

    auto test = testSuite->get_test(t);
    auto name = test->name();

    std::cout << "Test" << t << ": " << name << std::endl;

    // upload kernel argument
    std::cout << "upload kernel argument" << std::endl;
    kernel_arg.testid = t;
    memcpy(arg_buf.data(), &kernel_arg, sizeof(kernel_arg_t));
    RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, arg_buf.data(), sizeof(kernel_arg_t)));

    // get test arguments
    std::cout << "get test arguments" << std::endl;
    RT_CHECK(test->setup(num_points, (void*)src1_buf.data(), (void*)src2_buf.data()));

    // upload source buffer0
    std::cout << "upload source buffer0" << std::endl;
    RT_CHECK(vx_copy_to_dev(device, kernel_arg.src0_addr, src1_buf.data(), buf_size));

    // upload source buffer1
    std::cout << "upload source buffer1" << std::endl;
    RT_CHECK(vx_copy_to_dev(device, kernel_arg.src1_addr, src2_buf.data(), buf_size));

    // clear destination buffer (marker value makes unwritten results obvious)
    std::cout << "clear destination buffer" << std::endl;
    for (int i = 0; i < num_points; ++i) {
      ((uint32_t*)dst_buf.data())[i] = 0xdeadbeef;
    }
    RT_CHECK(vx_copy_to_dev(device, kernel_arg.dst_addr, dst_buf.data(), buf_size));

    // start device
    std::cout << "start device" << std::endl;
    RT_CHECK(vx_start(device));

    // wait for completion
    std::cout << "wait for completion" << std::endl;
    RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));

    // download destination buffer
    std::cout << "download destination buffer" << std::endl;
    RT_CHECK(vx_copy_from_dev(device, dst_buf.data(), kernel_arg.dst_addr, buf_size));

    // verify destination
    std::cout << "verify test result" << std::endl;
    int errors = test->verify(num_points, dst_buf.data(), src1_buf.data(), src2_buf.data());
    if (errors != 0) {
      std::cout << "found " << std::dec << errors << " errors!" << std::endl;
      std::cout << "Test" << t << "-" << name << " FAILED!" << std::endl << std::flush;
      if (stop_on_error) {
        cleanup();
        exit(1);
      }
      exitcode = 1;
    } else {
      std::cout << "Test" << t << "-" << name << " PASSED!" << std::endl << std::flush;
    }
  }

  // cleanup
  std::cout << "cleanup" << std::endl;
  cleanup();

  return exitcode;
}

View File

@@ -1,821 +0,0 @@
#pragma once
#include <iostream>
#include <math.h>
#include <limits>
#include <assert.h>
// Implemented by the test driver; releases device and host resources.
void cleanup();

// Evaluates `_expr` exactly once. A zero result means success and execution
// continues; any other result is reported, cleanup() is called and the
// process exits with status -1.
#define RT_CHECK(_expr)                                        \
  do {                                                         \
    int _ret = _expr;                                          \
    if (0 != _ret) {                                           \
      printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \
      cleanup();                                               \
      exit(-1);                                                \
    }                                                          \
  } while (false)
// Bit-level view of a 32-bit float: the value itself, its raw bits as a
// signed integer, and the mantissa / exponent / sign bit fields.
union Float_t {
  float f;
  int i;
  struct {
    uint32_t man : 23;
    uint32_t exp : 8;
    uint32_t sign : 1;
  } parts;
};

// Rounds `x` to `precision` decimal digits (computed in double precision).
inline float fround(float x, int32_t precision = 8) {
  auto power_of_10 = std::pow(10, precision);
  return std::round(x * power_of_10) / power_of_10;
}

// Relative comparison: returns true when |a - b| does not exceed `ulp` machine
// epsilons scaled by the larger magnitude. Logs the mismatch and returns false
// otherwise.
inline bool almost_equal_eps(float a, float b, int ulp = 128) {
  auto eps = std::numeric_limits<float>::epsilon() * (std::max(std::fabs(a), std::fabs(b)) * ulp);
  auto d = std::fabs(a - b);
  if (d > eps) {
    std::cout << "*** almost_equal_eps: a=" << a << ", b=" << b << ", d=" << d << ", eps=" << eps << std::endl;
    return false;
  }
  return true;
}

// ULP comparison: returns true when `a` and `b` are at most `ulp`
// representable floats apart. Logs the mismatch and returns false otherwise.
//
// The raw bit patterns are sign-magnitude, so they are first mapped onto a
// monotonic integer line (negative floats become negative integers, and -0.0
// maps onto +0.0). The distance is computed in 64 bits: subtracting the raw
// 32-bit patterns directly overflows for operands of opposite sign, which
// made e.g. 1.0f and -1.0f compare as equal.
inline bool almost_equal_ulp(float a, float b, int32_t ulp = 6) {
  Float_t fa{a}, fb{b};
  auto ordered = [](int bits) -> long long {
    return (bits < 0)
        ? static_cast<long long>(std::numeric_limits<int>::min()) - bits
        : static_cast<long long>(bits);
  };
  const long long delta = ordered(fa.i) - ordered(fb.i);
  const long long d = (delta < 0) ? -delta : delta;
  if (d > ulp) {
    // std::hex is sticky: switch back to decimal so that later diagnostics
    // (result indices, integer values) are not printed in hexadecimal
    std::cout << "*** almost_equal_ulp: a=" << a << ", b=" << b << ", ulp=" << d << ", ia=" << std::hex << fa.i << ", ib=" << fb.i << std::dec << std::endl;
    return false;
  }
  return true;
}
// Float comparison used by the test cases: an exact match passes right away,
// anything else is decided by the ULP-distance check.
// (An epsilon-based check via almost_equal_eps is intentionally not applied.)
inline bool almost_equal(float a, float b) {
  return (a == b) || almost_equal_ulp(a, b);
}
class ITestCase;
// Collection of test cases bound to one device handle. Only the declaration
// is visible here; the member functions are defined elsewhere.
class TestSuite {
public:
// Stores the device handle.
// NOTE(review): presumably also registers the test cases - confirm in the
// definition.
TestSuite(vx_device_h device);
~TestSuite();
// Looks up a test case by id (the driver passes ids in [0, size())).
ITestCase* get_test(int testid) const;
// Appends a test case to the suite.
// NOTE(review): ownership presumably transfers to the suite - confirm.
void add_test(ITestCase* test);
// Number of registered test cases.
size_t size() const;
// Device handle the suite was created with.
vx_device_h device() const;
private:
std::vector<ITestCase*> _tests;
vx_device_h device_;
};
// Abstract base of every host-side test case. A test case knows the suite it
// belongs to and its display name, can fill the two source buffers, and can
// check the device result against a host-computed reference.
class ITestCase {
public:
  ITestCase(TestSuite* suite, const char* name)
    : suite_(suite), name_(name) {}

  virtual ~ITestCase() = default;

  // Suite this test case was created for.
  TestSuite* suite() const { return suite_; }

  // Display name passed to the constructor.
  const char* name() const { return name_; }

  // Fills `src1` / `src2` with `n` input elements each; returns 0 on success.
  virtual int setup(uint32_t n, void* src1, void* src2) = 0;

  // Compares the `n` results in `dst` with the reference computed from
  // `src1` / `src2`; returns the number of mismatching elements.
  virtual int verify(uint32_t n, void* dst, const void* src1, const void* src2) = 0;

protected:
  TestSuite* suite_;
  const char* const name_;
};
// Host-side case for the "iadd" kernel: expects c[i] == a[i] + b[i] (int32).
class Test_IADD : public ITestCase {
public:
  Test_IADD(TestSuite* suite) : ITestCase(suite, "iadd") {}

  // Inputs: a[i] = n/2 - i, b[i] = n/2 + i.
  int setup(uint32_t n, void* src1, void* src2) override {
    const auto lhs = static_cast<int32_t*>(src1);
    const auto rhs = static_cast<int32_t*>(src2);
    for (uint32_t k = 0; k < n; ++k) {
      lhs[k] = n/2 - k;
      rhs[k] = n/2 + k;
    }
    return 0;
  }

  // Returns the number of elements whose device result differs from the sum.
  int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
    const auto lhs = static_cast<const int32_t*>(src1);
    const auto rhs = static_cast<const int32_t*>(src2);
    const auto out = static_cast<const int32_t*>(dst);
    int mismatches = 0;
    for (uint32_t k = 0; k < n; ++k) {
      const auto expected = lhs[k] + rhs[k];
      if (out[k] == expected)
        continue;
      std::cout << "error at result #" << k << ": expected=" << expected << ", actual=" << out[k] << ", a=" << lhs[k] << ", b=" << rhs[k] << std::endl;
      ++mismatches;
    }
    return mismatches;
  }
};
// Host-side case for the "imul" kernel: expects c[i] == a[i] * b[i] (int32).
class Test_IMUL : public ITestCase {
public:
  Test_IMUL(TestSuite* suite) : ITestCase(suite, "imul") {}

  // Inputs: a[i] = n/2 - i, b[i] = n/2 + i.
  int setup(uint32_t n, void* src1, void* src2) override {
    auto a = (int32_t*)src1;
    auto b = (int32_t*)src2;
    for (uint32_t i = 0; i < n; ++i) {
      a[i] = n/2 - i;
      b[i] = n/2 + i;
    }
    return 0;
  }

  // Returns the number of elements whose device result differs from the
  // product. The reference is computed with unsigned arithmetic so that a
  // product that does not fit in 32 bits wraps (keeping the low 32 bits)
  // instead of being signed-overflow undefined behavior on the host.
  int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
    int errors = 0;
    auto a = (const int32_t*)src1;
    auto b = (const int32_t*)src2;
    auto c = (const int32_t*)dst;
    for (uint32_t i = 0; i < n; ++i) {
      auto ref = (int32_t)((uint32_t)a[i] * (uint32_t)b[i]);
      if (c[i] != ref) {
        std::cout << "error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
        ++errors;
      }
    }
    return errors;
  }
};
// Host-side case for the "idiv" kernel: expects c[i] == a[i] / b[i] (int32).
class Test_IDIV : public ITestCase {
public:
  Test_IDIV(TestSuite* suite) : ITestCase(suite, "idiv") {}

  // Inputs: a[i] = n/2 - i, b[i] = n/2 + i. Note that b[0] is 0 when n < 2.
  int setup(uint32_t n, void* src1, void* src2) override {
    auto a = (int32_t*)src1;
    auto b = (int32_t*)src2;
    for (uint32_t i = 0; i < n; ++i) {
      a[i] = n/2 - i;
      b[i] = n/2 + i;
    }
    return 0;
  }

  // Returns the number of elements whose device result differs from the
  // reference quotient.
  int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
    int errors = 0;
    auto a = (const int32_t*)src1;
    auto b = (const int32_t*)src2;
    auto c = (const int32_t*)dst;
    for (uint32_t i = 0; i < n; ++i) {
      auto ref = reference_div(a[i], b[i]);
      if (c[i] != ref) {
        std::cout << "error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
        ++errors;
      }
    }
    return errors;
  }

private:
  // Quotient with the results the RISC-V DIV instruction defines for the two
  // cases in which host division would trap or be undefined: division by
  // zero yields -1, and INT32_MIN / -1 yields INT32_MIN.
  static int32_t reference_div(int32_t num, int32_t den) {
    if (den == 0)
      return -1;
    if (den == -1 && num == std::numeric_limits<int32_t>::min())
      return num;
    return num / den;
  }
};
// Host-side case for the "idiv-mul" kernel: expects
// c[i] == (a[i] / b[i]) + (a[i] * b[i]) (int32).
class Test_IDIV_MUL : public ITestCase {
public:
  Test_IDIV_MUL(TestSuite* suite) : ITestCase(suite, "idiv-mul") {}

  // Inputs: a[i] = n/2 - i, b[i] = n/2 + i. Note that b[0] is 0 when n < 2.
  int setup(uint32_t n, void* src1, void* src2) override {
    auto a = (int32_t*)src1;
    auto b = (int32_t*)src2;
    for (uint32_t i = 0; i < n; ++i) {
      a[i] = n/2 - i;
      b[i] = n/2 + i;
    }
    return 0;
  }

  // Returns the number of elements whose device result differs from the
  // reference value. The product and the final sum are computed with unsigned
  // arithmetic so that they wrap to 32 bits instead of overflowing a signed
  // integer on the host.
  int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
    int errors = 0;
    auto a = (const int32_t*)src1;
    auto b = (const int32_t*)src2;
    auto c = (const int32_t*)dst;
    for (uint32_t i = 0; i < n; ++i) {
      auto x = (uint32_t)reference_div(a[i], b[i]);
      auto y = (uint32_t)a[i] * (uint32_t)b[i];
      auto ref = (int32_t)(x + y);
      if (c[i] != ref) {
        std::cout << "error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
        ++errors;
      }
    }
    return errors;
  }

private:
  // Quotient with the results the RISC-V DIV instruction defines for the two
  // cases in which host division would trap or be undefined: division by
  // zero yields -1, and INT32_MIN / -1 yields INT32_MIN.
  static int32_t reference_div(int32_t num, int32_t den) {
    if (den == 0)
      return -1;
    if (den == -1 && num == std::numeric_limits<int32_t>::min())
      return num;
    return num / den;
  }
};
// Host-side case for the "fadd" kernel: expects c[i] ~= a[i] + b[i] (float).
class Test_FADD : public ITestCase {
public:
  Test_FADD(TestSuite* suite) : ITestCase(suite, "fadd") {}

  // Inputs: a[i] = fround((n - i) / n), b[i] = fround((n + i) / n).
  int setup(uint32_t n, void* src1, void* src2) override {
    const auto lhs = static_cast<float*>(src1);
    const auto rhs = static_cast<float*>(src2);
    for (uint32_t k = 0; k < n; ++k) {
      lhs[k] = fround((n - k) * (1.0f/n));
      rhs[k] = fround((n + k) * (1.0f/n));
    }
    return 0;
  }

  // Returns the number of elements that are not almost_equal to the reference.
  int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
    const auto lhs = static_cast<const float*>(src1);
    const auto rhs = static_cast<const float*>(src2);
    const auto out = static_cast<const float*>(dst);
    int mismatches = 0;
    for (uint32_t k = 0; k < n; ++k) {
      const auto expected = lhs[k] + rhs[k];
      if (almost_equal(out[k], expected))
        continue;
      std::cout << "error at result #" << k << ": expected=" << expected << ", actual=" << out[k] << ", a=" << lhs[k] << ", b=" << rhs[k] << std::endl;
      ++mismatches;
    }
    return mismatches;
  }
};
// Host-side case for the "fsub" kernel: expects c[i] ~= a[i] - b[i] (float).
class Test_FSUB : public ITestCase {
public:
  Test_FSUB(TestSuite* suite) : ITestCase(suite, "fsub") {}

  // Inputs: a[i] = fround((n - i) / n), b[i] = fround((n + i) / n).
  int setup(uint32_t n, void* src1, void* src2) override {
    const auto lhs = static_cast<float*>(src1);
    const auto rhs = static_cast<float*>(src2);
    for (uint32_t k = 0; k < n; ++k) {
      lhs[k] = fround((n - k) * (1.0f/n));
      rhs[k] = fround((n + k) * (1.0f/n));
    }
    return 0;
  }

  // Returns the number of elements that are not almost_equal to the reference.
  int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
    const auto lhs = static_cast<const float*>(src1);
    const auto rhs = static_cast<const float*>(src2);
    const auto out = static_cast<const float*>(dst);
    int mismatches = 0;
    for (uint32_t k = 0; k < n; ++k) {
      const auto expected = lhs[k] - rhs[k];
      if (almost_equal(out[k], expected))
        continue;
      std::cout << "error at result #" << k << ": expected=" << expected << ", actual=" << out[k] << ", a=" << lhs[k] << ", b=" << rhs[k] << std::endl;
      ++mismatches;
    }
    return mismatches;
  }
};
// Host-side case for the "fmul" kernel: expects c[i] ~= a[i] * b[i] (float).
class Test_FMUL : public ITestCase {
public:
  Test_FMUL(TestSuite* suite) : ITestCase(suite, "fmul") {}

  // Inputs: a[i] = fround((n - i) / n), b[i] = fround((n + i) / n).
  int setup(uint32_t n, void* src1, void* src2) override {
    const auto lhs = static_cast<float*>(src1);
    const auto rhs = static_cast<float*>(src2);
    for (uint32_t k = 0; k < n; ++k) {
      lhs[k] = fround((n - k) * (1.0f/n));
      rhs[k] = fround((n + k) * (1.0f/n));
    }
    return 0;
  }

  // Returns the number of elements that are not almost_equal to the reference.
  int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
    const auto lhs = static_cast<const float*>(src1);
    const auto rhs = static_cast<const float*>(src2);
    const auto out = static_cast<const float*>(dst);
    int mismatches = 0;
    for (uint32_t k = 0; k < n; ++k) {
      const auto expected = lhs[k] * rhs[k];
      if (almost_equal(out[k], expected))
        continue;
      std::cout << "error at result #" << k << ": expected=" << expected << ", actual=" << out[k] << ", a=" << lhs[k] << ", b=" << rhs[k] << std::endl;
      ++mismatches;
    }
    return mismatches;
  }
};
// Host-side case for the "fmadd" kernel: expects c[i] ~= a[i] * b[i] + b[i].
class Test_FMADD : public ITestCase {
public:
  Test_FMADD(TestSuite* suite) : ITestCase(suite, "fmadd") {}

  // Inputs: a[i] = fround((n - i) / n), b[i] = fround((n + i) / n).
  int setup(uint32_t n, void* src1, void* src2) override {
    const auto lhs = static_cast<float*>(src1);
    const auto rhs = static_cast<float*>(src2);
    for (uint32_t k = 0; k < n; ++k) {
      lhs[k] = fround((n - k) * (1.0f/n));
      rhs[k] = fround((n + k) * (1.0f/n));
    }
    return 0;
  }

  // Returns the number of elements that are not almost_equal to the reference.
  int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
    const auto lhs = static_cast<const float*>(src1);
    const auto rhs = static_cast<const float*>(src2);
    const auto out = static_cast<const float*>(dst);
    int mismatches = 0;
    for (uint32_t k = 0; k < n; ++k) {
      const auto expected = lhs[k] * rhs[k] + rhs[k];
      if (almost_equal(out[k], expected))
        continue;
      std::cout << "error at result #" << k << ": expected=" << expected << ", actual=" << out[k] << ", a=" << lhs[k] << ", b=" << rhs[k] << std::endl;
      ++mismatches;
    }
    return mismatches;
  }
};
// Host-side case for the "fmsub" kernel: expects c[i] ~= a[i] * b[i] - b[i].
class Test_FMSUB : public ITestCase {
public:
  Test_FMSUB(TestSuite* suite) : ITestCase(suite, "fmsub") {}

  // Inputs: a[i] = fround((n - i) / n), b[i] = fround((n + i) / n).
  int setup(uint32_t n, void* src1, void* src2) override {
    const auto lhs = static_cast<float*>(src1);
    const auto rhs = static_cast<float*>(src2);
    for (uint32_t k = 0; k < n; ++k) {
      lhs[k] = fround((n - k) * (1.0f/n));
      rhs[k] = fround((n + k) * (1.0f/n));
    }
    return 0;
  }

  // Returns the number of elements that are not almost_equal to the reference.
  int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
    const auto lhs = static_cast<const float*>(src1);
    const auto rhs = static_cast<const float*>(src2);
    const auto out = static_cast<const float*>(dst);
    int mismatches = 0;
    for (uint32_t k = 0; k < n; ++k) {
      const auto expected = lhs[k] * rhs[k] - rhs[k];
      if (almost_equal(out[k], expected))
        continue;
      std::cout << "error at result #" << k << ": expected=" << expected << ", actual=" << out[k] << ", a=" << lhs[k] << ", b=" << rhs[k] << std::endl;
      ++mismatches;
    }
    return mismatches;
  }
};
// Host-side case for the "fnmadd" kernel: expects c[i] ~= -a[i] * b[i] - b[i].
class Test_FNMADD : public ITestCase {
public:
  Test_FNMADD(TestSuite* suite) : ITestCase(suite, "fnmadd") {}

  // Inputs: a[i] = fround((n - i) / n), b[i] = fround((n + i) / n).
  int setup(uint32_t n, void* src1, void* src2) override {
    const auto lhs = static_cast<float*>(src1);
    const auto rhs = static_cast<float*>(src2);
    for (uint32_t k = 0; k < n; ++k) {
      lhs[k] = fround((n - k) * (1.0f/n));
      rhs[k] = fround((n + k) * (1.0f/n));
    }
    return 0;
  }

  // Returns the number of elements that are not almost_equal to the reference.
  int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
    const auto lhs = static_cast<const float*>(src1);
    const auto rhs = static_cast<const float*>(src2);
    const auto out = static_cast<const float*>(dst);
    int mismatches = 0;
    for (uint32_t k = 0; k < n; ++k) {
      const auto expected = -lhs[k] * rhs[k] - rhs[k];
      if (almost_equal(out[k], expected))
        continue;
      std::cout << "error at result #" << k << ": expected=" << expected << ", actual=" << out[k] << ", a=" << lhs[k] << ", b=" << rhs[k] << std::endl;
      ++mismatches;
    }
    return mismatches;
  }
};
// Host-side case for the "fnmsub" kernel: expects c[i] ~= -a[i] * b[i] + b[i].
class Test_FNMSUB : public ITestCase {
public:
  Test_FNMSUB(TestSuite* suite) : ITestCase(suite, "fnmsub") {}

  // Inputs: a[i] = fround((n - i) / n), b[i] = fround((n + i) / n).
  int setup(uint32_t n, void* src1, void* src2) override {
    const auto lhs = static_cast<float*>(src1);
    const auto rhs = static_cast<float*>(src2);
    for (uint32_t k = 0; k < n; ++k) {
      lhs[k] = fround((n - k) * (1.0f/n));
      rhs[k] = fround((n + k) * (1.0f/n));
    }
    return 0;
  }

  // Returns the number of elements that are not almost_equal to the reference.
  int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
    const auto lhs = static_cast<const float*>(src1);
    const auto rhs = static_cast<const float*>(src2);
    const auto out = static_cast<const float*>(dst);
    int mismatches = 0;
    for (uint32_t k = 0; k < n; ++k) {
      const auto expected = -lhs[k] * rhs[k] + rhs[k];
      if (almost_equal(out[k], expected))
        continue;
      std::cout << "error at result #" << k << ": expected=" << expected << ", actual=" << out[k] << ", a=" << lhs[k] << ", b=" << rhs[k] << std::endl;
      ++mismatches;
    }
    return mismatches;
  }
};
// Host-side case for the "fnmadd-madd" kernel: expects
// c[i] ~= (-a[i] * b[i] - b[i]) + (a[i] * b[i] + b[i]).
class Test_FNMADD_MADD : public ITestCase {
public:
  Test_FNMADD_MADD(TestSuite* suite) : ITestCase(suite, "fnmadd-madd") {}

  // Inputs: a[i] = fround((n - i) / n), b[i] = fround((n + i) / n).
  int setup(uint32_t n, void* src1, void* src2) override {
    const auto lhs = static_cast<float*>(src1);
    const auto rhs = static_cast<float*>(src2);
    for (uint32_t k = 0; k < n; ++k) {
      lhs[k] = fround((n - k) * (1.0f/n));
      rhs[k] = fround((n + k) * (1.0f/n));
    }
    return 0;
  }

  // Returns the number of elements that are not almost_equal to the reference.
  int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
    const auto lhs = static_cast<const float*>(src1);
    const auto rhs = static_cast<const float*>(src2);
    const auto out = static_cast<const float*>(dst);
    int mismatches = 0;
    for (uint32_t k = 0; k < n; ++k) {
      const auto negated = -lhs[k] * rhs[k] - rhs[k];
      const auto plain = lhs[k] * rhs[k] + rhs[k];
      const auto expected = negated + plain;
      if (almost_equal(out[k], expected))
        continue;
      std::cout << "error at result #" << k << ": expected=" << expected << ", actual=" << out[k] << ", a=" << lhs[k] << ", b=" << rhs[k] << std::endl;
      ++mismatches;
    }
    return mismatches;
  }
};
// Host-side case for the "fdiv" kernel: expects c[i] ~= a[i] / b[i] (float).
class Test_FDIV : public ITestCase {
public:
  Test_FDIV(TestSuite* suite) : ITestCase(suite, "fdiv") {}

  // Inputs: a[i] = fround((n - i) / n), b[i] = fround((n + i) / n).
  int setup(uint32_t n, void* src1, void* src2) override {
    const auto lhs = static_cast<float*>(src1);
    const auto rhs = static_cast<float*>(src2);
    for (uint32_t k = 0; k < n; ++k) {
      lhs[k] = fround((n - k) * (1.0f/n));
      rhs[k] = fround((n + k) * (1.0f/n));
    }
    return 0;
  }

  // Returns the number of elements that are not almost_equal to the reference.
  int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
    const auto lhs = static_cast<const float*>(src1);
    const auto rhs = static_cast<const float*>(src2);
    const auto out = static_cast<const float*>(dst);
    int mismatches = 0;
    for (uint32_t k = 0; k < n; ++k) {
      const auto expected = lhs[k] / rhs[k];
      if (almost_equal(out[k], expected))
        continue;
      std::cout << "error at result #" << k << ": expected=" << expected << ", actual=" << out[k] << ", a=" << lhs[k] << ", b=" << rhs[k] << std::endl;
      ++mismatches;
    }
    return mismatches;
  }
};
// Host-side case for the "fdiv2" kernel: expects
// c[i] ~= (a[i] / b[i]) + (b[i] / a[i]).
class Test_FDIV2 : public ITestCase {
public:
  Test_FDIV2(TestSuite* suite) : ITestCase(suite, "fdiv2") {}

  // Inputs: a[i] = fround((n - i) / n), b[i] = fround((n + i) / n).
  int setup(uint32_t n, void* src1, void* src2) override {
    const auto lhs = static_cast<float*>(src1);
    const auto rhs = static_cast<float*>(src2);
    for (uint32_t k = 0; k < n; ++k) {
      lhs[k] = fround((n - k) * (1.0f/n));
      rhs[k] = fround((n + k) * (1.0f/n));
    }
    return 0;
  }

  // Returns the number of elements that are not almost_equal to the reference.
  int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
    const auto lhs = static_cast<const float*>(src1);
    const auto rhs = static_cast<const float*>(src2);
    const auto out = static_cast<const float*>(dst);
    int mismatches = 0;
    for (uint32_t k = 0; k < n; ++k) {
      const auto forward = lhs[k] / rhs[k];
      const auto inverse = rhs[k] / lhs[k];
      const auto expected = forward + inverse;
      if (almost_equal(out[k], expected))
        continue;
      std::cout << "error at result #" << k << ": expected=" << expected << ", actual=" << out[k] << ", a=" << lhs[k] << ", b=" << rhs[k] << std::endl;
      ++mismatches;
    }
    return mismatches;
  }
};
// Host-side case for the "fsqrt" kernel: expects c[i] ~= sqrt(a[i] * b[i]).
class Test_FSQRT : public ITestCase {
public:
  Test_FSQRT(TestSuite* suite) : ITestCase(suite, "fsqrt") {}

  // Inputs: both operands hold the same value 1 + (i % 64), so the product is
  // a perfect square.
  int setup(uint32_t n, void* src1, void* src2) override {
    const auto lhs = static_cast<float*>(src1);
    const auto rhs = static_cast<float*>(src2);
    for (uint32_t k = 0; k < n; ++k) {
      const float value = 1.0f + (k % 64);
      lhs[k] = value;
      rhs[k] = value;
    }
    return 0;
  }

  // Returns the number of elements that are not almost_equal to the reference.
  int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
    const auto lhs = static_cast<const float*>(src1);
    const auto rhs = static_cast<const float*>(src2);
    const auto out = static_cast<const float*>(dst);
    int mismatches = 0;
    for (uint32_t k = 0; k < n; ++k) {
      const auto expected = sqrt(lhs[k] * rhs[k]);
      if (almost_equal(out[k], expected))
        continue;
      std::cout << "error at result #" << k << ": expected=" << expected << ", actual=" << out[k] << ", a=" << lhs[k] << ", b=" << rhs[k] << std::endl;
      ++mismatches;
    }
    return mismatches;
  }
};
// Host-side case for the "ftoi" kernel: expects
// c[i] == (int32_t)(a[i] + b[i]).
class Test_FTOI : public ITestCase {
public:
  Test_FTOI(TestSuite* suite) : ITestCase(suite, "ftoi") {}

  // Inputs: both operands hold fround(n/2 - i + i/n), i.e. positive and
  // negative values with a fractional part.
  int setup(uint32_t n, void* src1, void* src2) override {
    const auto lhs = static_cast<float*>(src1);
    const auto rhs = static_cast<float*>(src2);
    for (uint32_t k = 0; k < n; ++k) {
      const float value = fround(float(n/2) - k + (float(k) / n));
      lhs[k] = value;
      rhs[k] = value;
    }
    return 0;
  }

  // Returns the number of elements that differ from the converted sum.
  int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
    const auto lhs = static_cast<const float*>(src1);
    const auto rhs = static_cast<const float*>(src2);
    const auto out = static_cast<const int32_t*>(dst);
    int mismatches = 0;
    for (uint32_t k = 0; k < n; ++k) {
      const auto sum = lhs[k] + rhs[k];
      const auto expected = static_cast<int32_t>(sum);
      if (out[k] == expected)
        continue;
      std::cout << "error at result #" << k << ": expected=" << expected << ", actual=" << out[k] << ", a=" << lhs[k] << ", b=" << rhs[k] << std::endl;
      ++mismatches;
    }
    return mismatches;
  }
};
// Host-side case for the "ftou" kernel: expects
// c[i] == (uint32_t)(a[i] + b[i]).
class Test_FTOU : public ITestCase {
public:
  Test_FTOU(TestSuite* suite) : ITestCase(suite, "ftou") {}

  // Inputs: both operands hold fround(i + i/n), i.e. non-negative values with
  // a fractional part.
  int setup(uint32_t n, void* src1, void* src2) override {
    const auto lhs = static_cast<float*>(src1);
    const auto rhs = static_cast<float*>(src2);
    for (uint32_t k = 0; k < n; ++k) {
      const float value = fround(k + (float(k) / n));
      lhs[k] = value;
      rhs[k] = value;
    }
    return 0;
  }

  // Returns the number of elements that differ from the converted sum.
  int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
    const auto lhs = static_cast<const float*>(src1);
    const auto rhs = static_cast<const float*>(src2);
    const auto out = static_cast<const uint32_t*>(dst);
    int mismatches = 0;
    for (uint32_t k = 0; k < n; ++k) {
      const auto sum = lhs[k] + rhs[k];
      const auto expected = static_cast<uint32_t>(sum);
      if (out[k] == expected)
        continue;
      std::cout << "error at result #" << k << ": expected=" << expected << ", actual=" << out[k] << ", a=" << lhs[k] << ", b=" << rhs[k] << std::endl;
      ++mismatches;
    }
    return mismatches;
  }
};
// Host-side case for the "itof" kernel: expects
// c[i] ~= (float)(a[i] + b[i]) with signed int32 operands.
class Test_ITOF : public ITestCase {
public:
  Test_ITOF(TestSuite* suite) : ITestCase(suite, "itof") {}

  // Inputs: both operands hold n/2 - i (positive and negative values).
  int setup(uint32_t n, void* src1, void* src2) override {
    const auto lhs = static_cast<int32_t*>(src1);
    const auto rhs = static_cast<int32_t*>(src2);
    for (uint32_t k = 0; k < n; ++k) {
      lhs[k] = n/2 - k;
      rhs[k] = n/2 - k;
    }
    return 0;
  }

  // Returns the number of elements that are not almost_equal to the reference.
  int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
    const auto lhs = static_cast<const int32_t*>(src1);
    const auto rhs = static_cast<const int32_t*>(src2);
    const auto out = static_cast<const float*>(dst);
    int mismatches = 0;
    for (uint32_t k = 0; k < n; ++k) {
      const auto sum = lhs[k] + rhs[k];
      const auto expected = static_cast<float>(sum);
      if (almost_equal(out[k], expected))
        continue;
      std::cout << "error at result #" << k << ": expected=" << expected << ", actual=" << out[k] << ", a=" << lhs[k] << ", b=" << rhs[k] << std::endl;
      ++mismatches;
    }
    return mismatches;
  }
};
// "utof" test case: unsigned 32-bit integer -> float conversion.
class Test_UTOF : public ITestCase {
public:
  Test_UTOF(TestSuite* suite) : ITestCase(suite, "utof") {}

  // Stores the element index into both uint32 source buffers. Always returns 0.
  int setup(uint32_t n, void* src1, void* src2) override {
    auto* lhs = static_cast<uint32_t*>(src1);
    auto* rhs = static_cast<uint32_t*>(src2);
    for (uint32_t idx = 0; idx < n; ++idx) {
      lhs[idx] = idx;
      rhs[idx] = idx;
    }
    return 0;
  }

  // Compares each float result against (float)(a[i] + b[i]) using
  // almost_equal() and returns the number of mismatches, logging each one.
  int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
    const auto* lhs = static_cast<const uint32_t*>(src1);
    const auto* rhs = static_cast<const uint32_t*>(src2);
    const auto* out = static_cast<const float*>(dst);
    int mismatches = 0;
    for (uint32_t idx = 0; idx < n; ++idx) {
      const auto sum = lhs[idx] + rhs[idx];
      const auto expected = static_cast<float>(sum);
      if (almost_equal(out[idx], expected)) {
        continue;
      }
      std::cout << "error at result #" << idx << ": expected=" << expected << ", actual=" << out[idx] << ", a=" << lhs[idx] << ", b=" << rhs[idx] << std::endl;
      ++mismatches;
    }
    return mismatches;
  }
};
// "bar" test case: the expected output is the input with the warp order
// reversed inside each core (see the index computation in verify()).
class Test_BAR : public ITestCase {
public:
  Test_BAR(TestSuite* suite) : ITestCase(suite, "bar") {}

  // Queries the warp and thread counts from the device and fills src1 with
  // the element index. Returns -1 when the device reports a single warp,
  // 0 otherwise.
  int setup(uint32_t n, void* src1, void* /*src2*/) override {
    RT_CHECK(vx_dev_caps(suite_->device(), VX_CAPS_NUM_WARPS, &num_warps_));
    if (num_warps_ == 1) {
      std::cout << "Error: multiple warps configuration required!" << std::endl;
      return -1;
    }
    RT_CHECK(vx_dev_caps(suite_->device(), VX_CAPS_NUM_THREADS, &num_threads_));
    auto a = (uint32_t*)src1;
    for (uint32_t i = 0; i < n; ++i) {
      a[i] = i;
    }
    return 0;
  }

  // Returns the number of elements of dst that differ from the source element
  // belonging to the mirrored warp of the same core and thread.
  // NOTE(review): src_idx can exceed n when n is not a multiple of
  // num_warps_ * num_threads_ - confirm callers always pass such an n.
  int verify(uint32_t n, void* dst, const void* src1, const void* /*src2*/) override {
    int errors = 0;
    auto a = static_cast<const uint32_t*>(src1);
    auto c = static_cast<const uint32_t*>(dst);
    for (uint32_t i = 0; i < n; ++i) {
      auto tid = i % num_threads_;
      auto wid = (i / num_threads_) % num_warps_;
      auto cid = i / (num_warps_ * num_threads_);
      auto src_idx = (cid * num_warps_ + (num_warps_ - 1 - wid)) * num_threads_ + tid;
      uint32_t ref = a[src_idx];
      if (c[i] != ref) {
        // std::hex is sticky: print the index in decimal explicitly and put
        // the stream back in decimal so later output is not affected.
        std::cout << "error at result #" << std::dec << i << ": expected=" << std::hex << ref << ", actual=" << c[i] << std::dec << std::endl;
        ++errors;
      }
    }
    return errors;
  }

  // Device capabilities captured by setup(); zero until then.
  uint64_t num_warps_ = 0;
  uint64_t num_threads_ = 0;
};
// "gbar" test case: the expected output is the input with both the core order
// and the warp order reversed (see the index computation in verify()).
class Test_GBAR : public ITestCase {
public:
  Test_GBAR(TestSuite* suite) : ITestCase(suite, "gbar") {}

  // Queries core, warp and thread counts from the device and fills src1 with
  // the element index. Returns -1 when the device reports a single core,
  // 0 otherwise.
  int setup(uint32_t n, void* src1, void* /*src2*/) override {
    RT_CHECK(vx_dev_caps(suite_->device(), VX_CAPS_NUM_CORES, &num_cores_));
    if (num_cores_ == 1) {
      std::cout << "Error: multiple cores configuration required!" << std::endl;
      return -1;
    }
    RT_CHECK(vx_dev_caps(suite_->device(), VX_CAPS_NUM_WARPS, &num_warps_));
    RT_CHECK(vx_dev_caps(suite_->device(), VX_CAPS_NUM_THREADS, &num_threads_));
    auto a = (uint32_t*)src1;
    for (uint32_t i = 0; i < n; ++i) {
      a[i] = i;
    }
    return 0;
  }

  // Returns the number of elements of dst that differ from the source element
  // belonging to the mirrored core and mirrored warp with the same thread id.
  // NOTE(review): the index math assumes n does not exceed
  // num_cores_ * num_warps_ * num_threads_ - confirm against the caller.
  int verify(uint32_t n, void* dst, const void* src1, const void* /*src2*/) override {
    int errors = 0;
    auto a = static_cast<const uint32_t*>(src1);
    auto c = static_cast<const uint32_t*>(dst);
    for (uint32_t i = 0; i < n; ++i) {
      auto tid = i % num_threads_;
      auto wid = (i / num_threads_) % num_warps_;
      auto cid = i / (num_warps_ * num_threads_);
      auto src_idx = ((num_cores_ - 1 - cid) * num_warps_ + (num_warps_ - 1 - wid)) * num_threads_ + tid;
      uint32_t ref = a[src_idx];
      if (c[i] != ref) {
        // std::hex is sticky: print the index in decimal explicitly and put
        // the stream back in decimal so later output is not affected.
        std::cout << "error at result #" << std::dec << i << ": expected=" << std::hex << ref << ", actual=" << c[i] << std::dec << std::endl;
        ++errors;
      }
    }
    return errors;
  }

  // Device capabilities captured by setup(); zero until then.
  uint64_t num_cores_ = 0;
  uint64_t num_warps_ = 0;
  uint64_t num_threads_ = 0;
};
///////////////////////////////////////////////////////////////////////////////
// Builds the suite for the given device and registers every test case.
// Tests are appended in the order written here, which is the index order
// later used by get_test().
TestSuite::TestSuite(vx_device_h device)
  : device_(device) {
  auto enlist = [this](ITestCase* test_case) { this->add_test(test_case); };

  // integer arithmetic
  enlist(new Test_IADD(this));
  enlist(new Test_IMUL(this));
  enlist(new Test_IDIV(this));
  enlist(new Test_IDIV_MUL(this));

  // floating-point arithmetic
  enlist(new Test_FADD(this));
  enlist(new Test_FSUB(this));
  enlist(new Test_FMUL(this));
  enlist(new Test_FMADD(this));
  enlist(new Test_FMSUB(this));
  enlist(new Test_FNMADD(this));
  enlist(new Test_FNMSUB(this));
  enlist(new Test_FNMADD_MADD(this));
  enlist(new Test_FDIV(this));
  enlist(new Test_FDIV2(this));
  enlist(new Test_FSQRT(this));

  // int <-> float conversions
  enlist(new Test_FTOI(this));
  enlist(new Test_FTOU(this));
  enlist(new Test_ITOF(this));
  enlist(new Test_UTOF(this));

  // barriers
  enlist(new Test_BAR(this));
  enlist(new Test_GBAR(this));
}
// Deletes every test case registered through add_test(); the suite owns them.
TestSuite::~TestSuite() {
  for (auto* owned_test : _tests) {
    delete owned_test;
  }
}
// Returns the test registered at position testid. Lookup goes through
// the container's at(), so the index is range-checked by the container.
ITestCase* TestSuite::get_test(int testid) const {
  ITestCase* const selected = _tests.at(testid);
  return selected;
}
// Appends a test case to the suite; the destructor deletes it later.
void TestSuite::add_test(ITestCase* test) {
  _tests.emplace_back(test);
}
// Number of registered test cases.
size_t TestSuite::size() const {
  const size_t registered = _tests.size();
  return registered;
}
// Device handle the suite was constructed with.
vx_device_h TestSuite::device() const {
  return this->device_;
}

View File

@@ -1,9 +0,0 @@
# Build description for the "fence" regression test.
PROJECT = fence
# Source built for the host program (main.cpp opens the device and checks results).
SRCS = main.cpp
# Source built for the device; presumably consumed by common.mk - confirm there.
VX_SRCS = kernel.cpp
# Default program options, overridable from the environment or command line;
# main.cpp parses -n as the per-task element count.
OPTS ?= -n64
# Shared rules for all regression tests.
include ../common.mk

View File

@@ -1,14 +0,0 @@
#ifndef _COMMON_H_
#define _COMMON_H_

// The fixed-width types below come from <stdint.h>; include it here so this
// header compiles regardless of what the including file pulled in before it.
#include <stdint.h>

// Device address where the host writes the kernel_arg_t block and the kernel
// reads it back.
#define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000

// Arguments shared between the host program and the device kernel.
typedef struct {
  uint32_t num_tasks;  // number of tasks to spawn
  uint32_t task_size;  // elements processed by each task
  uint64_t src0_addr;  // device address of the first source buffer
  uint64_t src1_addr;  // device address of the second source buffer
  uint64_t dst_addr;   // device address of the destination buffer
} kernel_arg_t;

#endif

View File

@@ -1,24 +0,0 @@
#include <stdint.h>
#include <vx_intrinsics.h>
#include <vx_spawn.h>
#include "common.h"
// Per-task body: adds task_size consecutive int32 elements of the two source
// buffers into the destination buffer, starting at task_id * task_size, and
// then issues vx_fence().
void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) {
  const uint32_t per_task = arg->task_size;
  int32_t* const lhs = (int32_t*)arg->src0_addr;
  int32_t* const rhs = (int32_t*)arg->src1_addr;
  int32_t* const out = (int32_t*)arg->dst_addr;

  // Element range [first, last) owned by this task, in 32-bit index space.
  const uint32_t first = task_id * per_task;
  const uint32_t last = first + per_task;
  for (uint32_t idx = first; idx != last; ++idx) {
    out[idx] = lhs[idx] + rhs[idx];
  }

  vx_fence();
}
int main() {
kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
vx_spawn_tasks(arg->num_tasks, (vx_spawn_tasks_cb)kernel_body, arg);
return 0;
}

View File

@@ -1,194 +0,0 @@
#include <iostream>
#include <unistd.h>
#include <string.h>
#include <vector>
#include <vortex.h>
#include "common.h"
// Evaluates _expr once. A zero result is success; any other result is printed
// together with the expression text, then cleanup() runs and the process
// exits with status -1.
#define RT_CHECK(_expr) \
do { \
int _ret = _expr; \
if (0 == _ret) \
break; \
printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \
cleanup(); \
exit(-1); \
} while (false)
///////////////////////////////////////////////////////////////////////////////
// Kernel image path; replaced by the -k option in parse_args().
const char* kernel_file = "kernel.bin";
// Value of the -n option; main() substitutes 1 when it is still 0.
uint32_t count = 0;
// Device handle filled in by vx_dev_open(); cleanup() does nothing while it is null.
vx_device_h device = nullptr;
// Host-side scratch buffer reused for every upload and download.
std::vector<uint8_t> staging_buf;
// Argument block uploaded to the device; its address fields are also what cleanup() frees.
kernel_arg_t kernel_arg = {};
// Prints the two-line usage banner to standard output.
static void show_usage() {
  std::cout << "Vortex Test." << std::endl
            << "Usage: [-k: kernel] [-n words] [-h: help]" << std::endl;
}
// Parses the command line with getopt():
//   -n <value>  stored in the global `count` (converted with atoi)
//   -k <path>   stored in the global `kernel_file`
//   -h / '?'    print usage and exit with status 0
// Any other value returned by getopt prints usage and exits with status -1.
static void parse_args(int argc, char **argv) {
  while (true) {
    const int opt = getopt(argc, argv, "n:k:h?");
    if (opt == -1) {
      break;
    }
    if (opt == 'n') {
      count = atoi(optarg);
    } else if (opt == 'k') {
      kernel_file = optarg;
    } else if (opt == 'h' || opt == '?') {
      show_usage();
      exit(0);
    } else {
      show_usage();
      exit(-1);
    }
  }
}
// Frees the three device buffers recorded in kernel_arg and closes the
// device. Does nothing when no device handle is set.
void cleanup() {
  if (!device) {
    return;
  }
  vx_mem_free(device, kernel_arg.src0_addr);
  vx_mem_free(device, kernel_arg.src1_addr);
  vx_mem_free(device, kernel_arg.dst_addr);
  vx_dev_close(device);
}
// Starts the device, waits for it to finish, downloads buf_size bytes of the
// destination buffer into staging_buf and checks that element i equals i + i.
// Returns 0 when every element matches and 1 otherwise; device API failures
// terminate the process through RT_CHECK.
int run_test(const kernel_arg_t& kernel_arg,
             uint32_t buf_size,
             uint32_t num_points) {
  std::cout << "start device" << std::endl;
  RT_CHECK(vx_start(device));

  std::cout << "wait for completion" << std::endl;
  RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));

  std::cout << "download destination buffer" << std::endl;
  RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.dst_addr, buf_size));

  std::cout << "verify result" << std::endl;
  const auto* results = (int32_t*)staging_buf.data();
  int mismatches = 0;
  for (uint32_t idx = 0; idx < num_points; ++idx) {
    const int expected = idx + idx;
    const int actual = results[idx];
    if (actual == expected) {
      continue;
    }
    std::cout << "error at result #" << std::dec << idx
              << std::hex << ": actual 0x" << actual << ", expected 0x" << expected << std::endl;
    ++mismatches;
  }

  if (mismatches == 0) {
    return 0;
  }
  std::cout << "Found " << std::dec << mismatches << " errors!" << std::endl;
  std::cout << "FAILED!" << std::endl;
  return 1;
}
// Host entry point of the fence test. Sequence: parse options, open the
// device, size the problem from the device capabilities, upload the kernel,
// allocate and fill the device buffers, run the kernel and verify the result.
// Any failing device call terminates the process through RT_CHECK.
int main(int argc, char *argv[]) {
// parse command arguments
parse_args(argc, argv);
// a missing or zero -n falls back to one element per task
if (count == 0) {
count = 1;
}
// open device connection
std::cout << "open device connection" << std::endl;
RT_CHECK(vx_dev_open(&device));
uint64_t num_cores, num_warps, num_threads;
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_CORES, &num_cores));
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_WARPS, &num_warps));
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_THREADS, &num_threads));
// one task per hardware thread; the 64-bit product is narrowed to 32 bits here
uint32_t num_tasks = num_cores * num_warps * num_threads;
uint32_t num_points = count * num_tasks;
uint32_t buf_size = num_points * sizeof(int32_t);
std::cout << "number of points: " << num_points << std::endl;
std::cout << "buffer size: " << buf_size << " bytes" << std::endl;
// upload program
std::cout << "upload program" << std::endl;
RT_CHECK(vx_upload_kernel_file(device, kernel_file));
// allocate device memory
std::cout << "allocate device memory" << std::endl;
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.src0_addr));
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.src1_addr));
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.dst_addr));
kernel_arg.num_tasks = num_tasks;
kernel_arg.task_size = count;
// note: std::hex stays set on std::cout after these lines
std::cout << "dev_src0=0x" << std::hex << kernel_arg.src0_addr << std::endl;
std::cout << "dev_src1=0x" << std::hex << kernel_arg.src1_addr << std::endl;
std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::endl;
// allocate staging buffer
std::cout << "allocate staging buffer" << std::endl;
// large enough for either a data buffer or the argument block
uint32_t alloc_size = std::max<uint32_t>(buf_size, sizeof(kernel_arg_t));
staging_buf.resize(alloc_size);
// upload kernel argument
std::cout << "upload kernel argument" << std::endl;
memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
// upload source buffer0
{
std::cout << "upload source buffer0" << std::endl;
auto buf_ptr = (int32_t*)staging_buf.data();
// src0[i] = i - 1; together with src1[i] = i + 1 the expected sum is 2 * i
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = i-1;
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src0_addr, staging_buf.data(), buf_size));
}
// upload source buffer1
{
std::cout << "upload source buffer1" << std::endl;
auto buf_ptr = (int32_t*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = i+1;
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src1_addr, staging_buf.data(), buf_size));
}
// clear destination buffer
{
std::cout << "clear destination buffer" << std::endl;
auto buf_ptr = (int32_t*)staging_buf.data();
// recognizable fill pattern so untouched elements fail verification
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = 0xdeadbeef;
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.dst_addr, staging_buf.data(), buf_size));
}
// run tests
std::cout << "run tests" << std::endl;
// a non-zero return from run_test is treated like any other failure: cleanup, exit(-1)
RT_CHECK(run_test(kernel_arg, buf_size, num_points));
// cleanup
std::cout << "cleanup" << std::endl;
cleanup();
std::cout << "PASSED!" << std::endl;
return 0;
}

View File

@@ -1,7 +1,5 @@
PROJECT = flash_attention
SRCS = main.cpp common.h
# VX_SRCS = kernel.cpp
# VX_SRCS = kernel.gemmini.warpspec.cpp
VX_SRCS = kernel.gemmini.cpp

View File

@@ -1,166 +0,0 @@
#include <iostream>
#include <fstream>
#include <unistd.h>
#include <string.h>
#include <vortex.h>
#include <vector>
#include <cassert>
#include "common.h"
#include "half.hpp"
using half_float::half;
using half_float::half_cast;
// Evaluates _expr once. A zero result is success; any other result is printed
// together with the expression text, then cleanup() runs and the process
// exits with status -1.
#define RT_CHECK(_expr) \
do { \
int _ret = _expr; \
if (0 == _ret) \
break; \
printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \
cleanup(); \
exit(-1); \
} while (false)
///////////////////////////////////////////////////////////////////////////////
// Kernel image path; replaced by the -k option in parse_args().
const char* kernel_file = "kernel.bin";
// Value of the -n option; main() substitutes 1 when it is still 0.
uint32_t count = 0;
// Never filled in this file; main() only uses sizeof(ref_data[0]) as the output element size.
std::vector<float> ref_data;
// Device handle filled in by vx_dev_open(); cleanup() does nothing while it is null.
vx_device_h device = nullptr;
// Host-side scratch buffer used for the argument upload and the result download.
std::vector<uint8_t> staging_buf;
// Argument block uploaded to the device.
kernel_arg_t kernel_arg = {};
// Prints the two-line usage banner to standard output.
static void show_usage() {
  std::cout << "Vortex Test." << std::endl
            << "Usage: [-k: kernel] [-n words] [-h: help]" << std::endl;
}
// Parses the command line with getopt():
//   -n <value>  stored in the global `count` (converted with atoi)
//   -k <path>   stored in the global `kernel_file`
//   -h / '?'    print usage and exit with status 0
// Any other value returned by getopt prints usage and exits with status -1.
static void parse_args(int argc, char **argv) {
  while (true) {
    const int opt = getopt(argc, argv, "n:k:h?");
    if (opt == -1) {
      break;
    }
    if (opt == 'n') {
      count = atoi(optarg);
    } else if (opt == 'k') {
      kernel_file = optarg;
    } else if (opt == 'h' || opt == '?') {
      show_usage();
      exit(0);
    } else {
      show_usage();
      exit(-1);
    }
  }
}
// Closes the device when a handle is set. No device memory is freed here:
// this test uses fixed device addresses, and the original free calls are
// kept below only as commented-out reference.
void cleanup() {
  if (!device) {
    return;
  }
  // vx_mem_free(device, kernel_arg.addr_a);
  // vx_mem_free(device, kernel_arg.addr_b);
  // vx_mem_free(device, kernel_arg.addr_c);
  vx_dev_close(device);
}
// Starts the device, waits for completion and downloads buf_size bytes from
// kernel_arg.addr_o into staging_buf. No verification is performed.
// Returns 0; device API failures terminate the process through RT_CHECK.
int run_test(const kernel_arg_t& kernel_arg,
             uint32_t buf_size) {
  // start device
  std::cout << "start device" << std::endl;
  RT_CHECK(vx_start(device));

  // wait for completion
  std::cout << "wait for completion" << std::endl;
  RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));

  // download destination buffer
  std::cout << "download destination buffer" << std::endl;
  // The download writes buf_size bytes through staging_buf.data(); grow the
  // buffer first so a caller that sized it for something smaller (such as
  // the argument block) cannot cause a heap overflow.
  if (staging_buf.size() < buf_size) {
    staging_buf.resize(buf_size);
  }
  RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.addr_o, buf_size));

  return 0;
}
// Host entry point of the flash_attention test. Sequence: parse options, open
// the device, upload the kernel, publish the fixed device addresses and
// dimensions through the argument block (also dumped to args.bin), run the
// kernel and download the output. The result is not verified here.
int main(int argc, char *argv[]) {
  // parse command arguments
  parse_args(argc, argv);
  if (count == 0) {
    count = 1;
  }

  std::srand(50);

  // open device connection
  std::cout << "open device connection" << std::endl;
  RT_CHECK(vx_dev_open(&device));

  uint32_t dim_seqlen = 128;
  uint32_t dim_headdim = 64;
  // NOTE(review): float_type is declared but unused; the output size below is
  // computed from ref_data's element type (float) - confirm which is intended.
  using float_type = half;

  uint32_t dst_buf_size =
      dim_seqlen * dim_headdim * sizeof(ref_data[0]);

  // upload program
  std::cout << "upload program" << std::endl;
  RT_CHECK(vx_upload_kernel_file(device, kernel_file));

  // allocate device memory
  // (fixed addresses are used instead of vx_mem_alloc)
  std::cout << "allocate device memory" << std::endl;
  kernel_arg.addr_q = 0xa0000000;
  kernel_arg.addr_k = 0xa1000000;
  kernel_arg.addr_v = 0xa2000000;
  kernel_arg.addr_o = 0xc0000000;

  kernel_arg.dim_seqlen = dim_seqlen;
  kernel_arg.dim_headdim = dim_headdim;

  std::cout << "dev_addr_q=0x" << std::hex << kernel_arg.addr_q << std::endl;
  std::cout << "dev_addr_k=0x" << std::hex << kernel_arg.addr_k << std::endl;
  std::cout << "dev_addr_v=0x" << std::hex << kernel_arg.addr_v << std::endl;
  std::cout << "dev_addr_o=0x" << std::hex << kernel_arg.addr_o << std::endl;

  // allocate staging buffer
  {
    std::cout << "allocate staging buffer" << std::endl;
    // The buffer carries the argument block now and receives dst_buf_size
    // bytes from the device in run_test(), so it must fit the larger of the
    // two; sizing it for the argument block alone overflows on download.
    uint32_t staging_buf_size = (dst_buf_size > sizeof(kernel_arg_t))
                                    ? dst_buf_size
                                    : static_cast<uint32_t>(sizeof(kernel_arg_t));
    staging_buf.resize(staging_buf_size);
  }

  // upload kernel argument
  {
    std::cout << "upload kernel argument" << std::endl;
    auto buf_ptr = staging_buf.data();
    memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
    RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
    std::cout << "uploading argument buffer to device, device mem address="
              << std::hex << KERNEL_ARG_DEV_MEM_ADDR << ", size=" << std::dec
              << sizeof(kernel_arg_t) << " bytes\n";

    // keep a host-side copy of the argument block
    std::ofstream file("args.bin", std::ios::binary | std::ios::out);
    if (!file) {
      std::cerr << "error: failed to open args.bin for writing\n";
      // release the device before bailing out, as RT_CHECK does
      cleanup();
      exit(EXIT_FAILURE);
    }
    file.write(reinterpret_cast<char *>(staging_buf.data()),
               sizeof(kernel_arg_t));
    file.close();
  }

  // run tests
  std::cout << "run tests" << std::endl;
  RT_CHECK(run_test(kernel_arg, dst_buf_size));
  std::cout << "PASSED!" << std::endl;

  // cleanup
  std::cout << "cleanup" << std::endl;
  cleanup();

  return 0;
}

View File

@@ -1,5 +0,0 @@
*.bin
*.dump
*.elf
flops
.depend

View File

@@ -1,9 +0,0 @@
# Build description for the "flops" regression test.
PROJECT = flops
# Files listed for the host program; common.h is shared with the kernel source.
SRCS = main.cpp common.h
# Source built for the device; presumably consumed by common.mk - confirm there.
VX_SRCS = kernel.cpp
# Default program options, overridable from the environment or command line.
OPTS ?= -n16
# Shared rules for all regression tests.
include ../common.mk

View File

@@ -1,15 +0,0 @@
#ifndef _COMMON_H_
#define _COMMON_H_

#include <cstdint>

// Device address where the host writes the argument block for the kernel.
#define KERNEL_ARG_DEV_MEM_ADDR 0x7fff0000
// Start address of device shared memory (not referenced by the flops kernel body).
#define DEV_SMEM_START_ADDR 0xff000000

// Arguments shared between the host program and the device kernel.
struct kernel_arg_t {
  uint32_t size;      // number of elements / tasks
  uint32_t addr_src;  // 32-bit device address of the source buffer
  uint32_t addr_dst;  // 32-bit device address of the destination buffer
};

#endif

Binary file not shown.

View File

@@ -1,41 +0,0 @@
#include <stdint.h>
#include <vx_intrinsics.h>
#include <vx_spawn.h>
#include "common.h"
// Per-task body of the flops kernel: runs a fixed chain of dependent float
// additions and stores the combined sum to C[task_id].
void kernel_body(int task_id, kernel_arg_t *__UNIFORM__ arg) {
const float *A = (const float *)arg->addr_src;
float *C = (float *)arg->addr_dst;
// NOTE(review): incr is read from the source buffer but never used below,
// so the stored result does not depend on the input data - confirm intent.
int incr = A[task_id];
float sum = 0.0f;
float sum1 = 0.0f;
float sum2 = 0.0f;
float sum3 = 0.0f;
float sum4 = 0.0f;
float sum5 = 0.0f;
// unroll hint for the compiler
#pragma unroll 8
for (int i = 0; i < 5000; i++) {
// each statement feeds from the next accumulator; sum5 closes the ring via sum1
sum1 = sum2 + 5.0f;
sum2 = sum3 + 5.0f;
sum3 = sum4 + 5.0f;
sum4 = sum5 + 5.0f;
sum5 = sum1 + 5.0f;
}
sum = sum1 + sum2 + sum3 + sum4 + sum5;
C[task_id] = static_cast<float>(sum);
}
int main() {
kernel_arg_t *arg = (kernel_arg_t *)KERNEL_ARG_DEV_MEM_ADDR;
const uint32_t grid_size = arg->size;
#ifdef RADIANCE
vx_spawn_tasks_cluster(grid_size, (vx_spawn_tasks_cb)kernel_body, arg);
#else
// NOTE: This kernel assumes contiguous thread scheduling for efficient shared
// memory allocation, and therefore does not work with original vx_spawn_tasks
vx_spawn_tasks_contiguous(grid_size, (vx_spawn_tasks_cb)kernel_body, arg);
#endif
return 0;
}

View File

@@ -1,252 +0,0 @@
#include <iostream>
#include <fstream>
#include <unistd.h>
#include <string.h>
#include <vortex.h>
#include <vector>
#include "common.h"
// Evaluates _expr once. A zero result is success; any other result is printed
// together with the expression text, then cleanup() runs and the process
// exits with status -1.
#define RT_CHECK(_expr) \
do { \
int _ret = _expr; \
if (0 == _ret) \
break; \
printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \
cleanup(); \
exit(-1); \
} while (false)
///////////////////////////////////////////////////////////////////////////////
// Kernel image path; replaced by the -k option in parse_args().
const char* kernel_file = "kernel.bin";
// Value of the -n option; main() substitutes 1 when it is still 0.
uint32_t count = 0;
// Input values uploaded to the device; filled by generate_source_data().
std::vector<float> src_data;
// Expected output values; filled by generate_reference_data().
std::vector<float> ref_data;
// Device handle filled in by vx_dev_open(); cleanup() does nothing while it is null.
vx_device_h device = nullptr;
// Host-side scratch buffer reused for every upload and download.
std::vector<uint8_t> staging_buf;
// Argument block uploaded to the device.
kernel_arg_t kernel_arg = {};
// Prints the two-line usage banner to standard output.
static void show_usage() {
  std::cout << "Vortex Test." << std::endl
            << "Usage: [-k: kernel] [-n words] [-h: help]" << std::endl;
}
// Parses the command line with getopt():
//   -n <value>  stored in the global `count` (converted with atoi)
//   -k <path>   stored in the global `kernel_file`
//   -h / '?'    print usage and exit with status 0
// Any other value returned by getopt prints usage and exits with status -1.
static void parse_args(int argc, char **argv) {
  while (true) {
    const int opt = getopt(argc, argv, "n:k:h?");
    if (opt == -1) {
      break;
    }
    if (opt == 'n') {
      count = atoi(optarg);
    } else if (opt == 'k') {
      kernel_file = optarg;
    } else if (opt == 'h' || opt == '?') {
      show_usage();
      exit(0);
    } else {
      show_usage();
      exit(-1);
    }
  }
}
// Closes the device when a handle is set. No device memory is freed here:
// this test uses fixed device addresses, and the original free calls are
// kept below only as commented-out reference.
void cleanup() {
  if (!device) {
    return;
  }
  // vx_mem_free(device, kernel_arg.addr_a);
  // vx_mem_free(device, kernel_arg.addr_b);
  // vx_mem_free(device, kernel_arg.addr_c);
  vx_dev_close(device);
}
// Resizes the global src_data to `size` elements and sets element i to
// the float value of i.
void generate_source_data(size_t size) {
  src_data.resize(size);
  uint32_t idx = 0;
  while (idx < src_data.size()) {
    src_data[idx] = static_cast<float>(idx);
    ++idx;
  }
}
// Resizes the global ref_data to `size` elements and sets element i to
// i * 1000 (computed in float).
void generate_reference_data(size_t size) {
  ref_data.resize(size);
  uint32_t idx = 0;
  while (idx < ref_data.size()) {
    const float as_float = static_cast<float>(idx);
    ref_data[idx] = as_float * 1000.0f;
    ++idx;
  }
}
// Starts the device, waits for completion, downloads buf_size bytes from
// kernel_arg.addr_dst, dumps the result and the reference to output.bin /
// reference.bin, and compares the first `size` floats against ref_data with
// a relative tolerance of 1e-6.
// Returns 0 when everything matches and 1 otherwise; device API failures
// terminate the process through RT_CHECK, file-open failures through exit().
int run_test(const kernel_arg_t& kernel_arg,
             uint32_t buf_size,
             uint32_t size) {
  // start device
  std::cout << "start device" << std::endl;
  RT_CHECK(vx_start(device));

  // wait for completion
  std::cout << "wait for completion" << std::endl;
  RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));

  // download destination buffer
  std::cout << "download destination buffer" << std::endl;
  RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.addr_dst, buf_size));
  std::cout << "downloading result C matrix from device, device mem address="
            << std::hex << kernel_arg.addr_dst << ", size=" << std::dec
            << buf_size << " bytes\n";

  // keep host-side copies of the result and of the reference
  std::ofstream file("output.bin", std::ios::binary | std::ios::out);
  if (!file) {
    std::cerr << "error: failed to open output.bin for writing\n";
    exit(EXIT_FAILURE);
  }
  file.write(reinterpret_cast<char *>(staging_buf.data()), buf_size);
  file.close();

  std::ofstream ref_file("reference.bin", std::ios::binary | std::ios::out);
  if (!ref_file) {
    std::cerr << "error: failed to open reference.bin for writing\n";
    exit(EXIT_FAILURE);
  }
  ref_file.write(reinterpret_cast<char *>(ref_data.data()), buf_size);
  ref_file.close();

  // verify result
  std::cout << "verify result" << std::endl;
  {
    int errors = 0;
    auto buf_ptr = (float*)staging_buf.data();
    for (uint32_t i = 0; i < size; ++i) {
      float ref = ref_data.at(i);
      float cur = buf_ptr[i];
      // Scale the tolerance by |ref| instead of dividing by ref: no division
      // by zero when ref == 0, and the negated "<=" makes a NaN result count
      // as a mismatch rather than slipping through a false ">" comparison.
      const double tolerance = 1e-6 * std::abs(ref);
      if (!(std::abs(cur - ref) <= tolerance)) {
        std::cout << "error at result #" << std::dec << i
                  << std::hex << ": actual=" << cur << ", expected=" << ref << std::endl;
        ++errors;
      }
    }
    if (errors != 0) {
      std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
      std::cout << "FAILED!" << std::endl;
      return 1;
    }
  }

  return 0;
}
// Host entry point of the flops test. Sequence: parse options, open the
// device, generate 64 source and reference values, upload the kernel, publish
// fixed device addresses through the argument block, upload the inputs
// (dumping each upload to a .bin file), run the kernel and verify the result.
int main(int argc, char *argv[]) {
// parse command arguments
parse_args(argc, argv);
if (count == 0) {
count = 1;
}
std::srand(50);
// open device connection
std::cout << "open device connection" << std::endl;
RT_CHECK(vx_dev_open(&device));
// problem size is fixed here; the -n option does not change it
size_t size = 64;
generate_source_data(size);
generate_reference_data(size);
uint32_t src_buf_size = src_data.size() * sizeof(src_data[0]);
uint32_t dst_buf_size = ref_data.size() * sizeof(ref_data[0]);
std::cout << "buffer size: " << dst_buf_size << " bytes" << std::endl;
// upload program
std::cout << "upload program" << std::endl;
RT_CHECK(vx_upload_kernel_file(device, kernel_file));
// allocate device memory
std::cout << "allocate device memory" << std::endl;
// dynamic allocation is disabled; fixed device addresses are used instead
// RT_CHECK(vx_mem_alloc(device, src_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.addr_src));
// RT_CHECK(vx_mem_alloc(device, dst_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.addr_dst));
kernel_arg.addr_src = 0x20000UL;
kernel_arg.addr_dst = 0xc0000000UL;
kernel_arg.size = size;
std::cout << "dev_addr_src=0x" << std::hex << kernel_arg.addr_src << std::endl;
std::cout << "dev_addr_dst=0x" << std::hex << kernel_arg.addr_dst << std::endl;
// allocate staging buffer
{
std::cout << "allocate staging buffer" << std::endl;
// largest of source size, destination size and argument block size
// (src_buf_size appears twice in the nested max; harmless)
uint32_t staging_buf_size = std::max<uint32_t>(
src_buf_size,
std::max<uint32_t>(
src_buf_size,
std::max<uint32_t>(dst_buf_size, sizeof(kernel_arg_t))));
staging_buf.resize(staging_buf_size);
}
// upload kernel argument
{
std::cout << "upload kernel argument" << std::endl;
auto buf_ptr = staging_buf.data();
memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
std::cout << "uploading argument buffer to device, device mem address="
<< std::hex << KERNEL_ARG_DEV_MEM_ADDR << ", size=" << std::dec
<< sizeof(kernel_arg_t) << " bytes\n";
// keep a host-side copy of the argument block
std::ofstream file("args.bin", std::ios::binary | std::ios::out);
if (!file) {
std::cerr << "error: failed to open args.bin for writing\n";
// NOTE(review): exits without calling cleanup(), unlike RT_CHECK
exit(EXIT_FAILURE);
}
file.write(reinterpret_cast<char *>(staging_buf.data()),
sizeof(kernel_arg_t));
file.close();
}
// upload source buffer
{
{
auto buf_ptr = staging_buf.data();
memcpy(buf_ptr, src_data.data(), src_data.size() * sizeof(float));
RT_CHECK(vx_copy_to_dev(device, kernel_arg.addr_src, staging_buf.data(),
src_buf_size));
std::cout << "uploading source data to device, device mem address="
<< std::hex << kernel_arg.addr_src << ", size=" << std::dec
<< src_buf_size << " bytes\n";
// keep a host-side copy of the uploaded input
std::ofstream file("input.a.bin", std::ios::binary | std::ios::out);
if (!file) {
std::cerr << "error: failed to open input.a.bin for writing\n";
exit(EXIT_FAILURE);
}
file.write(reinterpret_cast<char *>(buf_ptr), src_buf_size);
file.close();
}
}
// clear destination buffer
{
std::cout << "clear destination buffer" << std::endl;
auto buf_ptr = (int32_t*)staging_buf.data();
// recognizable fill pattern so untouched elements fail verification
for (uint32_t i = 0; i < ref_data.size(); ++i) {
buf_ptr[i] = 0xdeadbeef;
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.addr_dst, staging_buf.data(), dst_buf_size));
}
// run tests
std::cout << "run tests" << std::endl;
// a non-zero return from run_test is treated like any other failure: cleanup, exit(-1)
RT_CHECK(run_test(kernel_arg, dst_buf_size, kernel_arg.size));
std::cout << "PASSED!" << std::endl;
// cleanup
std::cout << "cleanup" << std::endl;
cleanup();
return 0;
}

View File

@@ -1,7 +1,5 @@
PROJECT = idle
SRCS = main.cpp common.h
VX_SRCS = kernel.cpp
OPTS ?= -n16

View File

@@ -1,274 +0,0 @@
#include <iostream>
#include <fstream>
#include <unistd.h>
#include <string.h>
#include <vortex.h>
#include <vector>
#include "common.h"
// Evaluates _expr once. A zero result is success; any other result is printed
// together with the expression text, then cleanup() runs and the process
// exits with status -1.
#define RT_CHECK(_expr) \
do { \
int _ret = _expr; \
if (0 == _ret) \
break; \
printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \
cleanup(); \
exit(-1); \
} while (false)
///////////////////////////////////////////////////////////////////////////////
// Kernel image path; replaced by the -k option in parse_args().
const char* kernel_file = "kernel.bin";
// Value of the -n option; main() substitutes 1 when it is still 0.
uint32_t count = 0;
// Row-major input matrices A (dim_m x dim_k) and B (dim_k x dim_n); filled by generate_source_matrix().
std::vector<float> src_a_data;
std::vector<float> src_b_data;
// Row-major expected product (dim_m x dim_n); filled by generate_reference_matmul().
std::vector<float> ref_data;
// Device handle filled in by vx_dev_open(); cleanup() does nothing while it is null.
vx_device_h device = nullptr;
// Host-side scratch buffer reused for every upload and download.
std::vector<uint8_t> staging_buf;
// Argument block uploaded to the device; its address fields are also what cleanup() frees.
kernel_arg_t kernel_arg = {};
// Prints the two-line usage banner to standard output.
static void show_usage() {
  std::cout << "Vortex Test." << std::endl
            << "Usage: [-k: kernel] [-n words] [-h: help]" << std::endl;
}
// Parses the command line with getopt():
//   -n <value>  stored in the global `count` (converted with atoi)
//   -k <path>   stored in the global `kernel_file`
//   -h / '?'    print usage and exit with status 0
// Any other value returned by getopt prints usage and exits with status -1.
static void parse_args(int argc, char **argv) {
  while (true) {
    const int opt = getopt(argc, argv, "n:k:h?");
    if (opt == -1) {
      break;
    }
    if (opt == 'n') {
      count = atoi(optarg);
    } else if (opt == 'k') {
      kernel_file = optarg;
    } else if (opt == 'h' || opt == '?') {
      show_usage();
      exit(0);
    } else {
      show_usage();
      exit(-1);
    }
  }
}
// Frees the three device addresses recorded in kernel_arg (A, B and C) and
// closes the device. Does nothing when no device handle is set.
void cleanup() {
  if (!device) {
    return;
  }
  vx_mem_free(device, kernel_arg.addr_a);
  vx_mem_free(device, kernel_arg.addr_b);
  vx_mem_free(device, kernel_arg.addr_c);
  vx_dev_close(device);
}
// Sizes the global input matrices (A: dim_m * dim_k, B: dim_k * dim_n
// elements), sets element i of each to the float value of i and prints
// every element as it is written.
void generate_source_matrix(uint32_t dim_m, uint32_t dim_n, uint32_t dim_k) {
  src_a_data.resize(dim_m * dim_k);
  src_b_data.resize(dim_k * dim_n);

  auto fill_and_log = [](std::vector<float>& matrix, const char* label) {
    for (uint32_t idx = 0; idx < matrix.size(); ++idx) {
      matrix[idx] = static_cast<float>(idx);
      std::cout << label << idx << ": value=" << matrix[idx] << std::endl;
    }
  };

  fill_and_log(src_a_data, "A: ");
  fill_and_log(src_b_data, "B: ");
}
// Computes the row-major product of the global matrices A (dim_m x dim_k)
// and B (dim_k x dim_n) into the global ref_data (dim_m x dim_n), summing
// over k in ascending order in single precision.
void generate_reference_matmul(uint32_t dim_m, uint32_t dim_n, uint32_t dim_k) {
  ref_data.resize(dim_m * dim_n);
  for (uint32_t row = 0; row < dim_m; ++row) {
    const uint32_t a_row_base = dim_k * row;
    const uint32_t c_row_base = dim_n * row;
    for (uint32_t col = 0; col < dim_n; ++col) {
      float dot = 0.0f;
      for (uint32_t inner = 0; inner < dim_k; ++inner) {
        dot += src_a_data[a_row_base + inner] * src_b_data[dim_n * inner + col];
      }
      ref_data.at(c_row_base + col) = dot;
    }
  }
}
// Starts the device, waits for completion, downloads buf_size bytes from
// kernel_arg.addr_c and compares the first dim_m * dim_n floats against
// ref_data with a relative tolerance of 1e-6.
// Returns 0 when everything matches and 1 otherwise; device API failures
// terminate the process through RT_CHECK.
int run_test(const kernel_arg_t& kernel_arg,
             uint32_t buf_size,
             uint32_t dim_m, uint32_t dim_n) {
  // start device
  std::cout << "start device" << std::endl;
  RT_CHECK(vx_start(device));

  // wait for completion
  std::cout << "wait for completion" << std::endl;
  RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));

  // download destination buffer
  std::cout << "download destination buffer" << std::endl;
  RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.addr_c, buf_size));

  // verify result
  std::cout << "verify result" << std::endl;
  {
    int errors = 0;
    auto buf_ptr = (float*)staging_buf.data();
    for (uint32_t i = 0; i < dim_m * dim_n; ++i) {
      float ref = ref_data.at(i);
      float cur = buf_ptr[i];
      // Scale the tolerance by |ref| instead of dividing by ref: no division
      // by zero when ref == 0, and the negated "<=" makes a NaN result count
      // as a mismatch rather than slipping through a false ">" comparison.
      const double tolerance = 1e-6 * std::abs(ref);
      if (!(std::abs(cur - ref) <= tolerance)) {
        std::cout << "error at result #" << std::dec << i
                  << std::hex << ": actual=" << cur << ", expected=" << ref << std::endl;
        ++errors;
      }
    }
    if (errors != 0) {
      std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
      std::cout << "FAILED!" << std::endl;
      return 1;
    }
  }

  return 0;
}
// Host entry point of the idle test. Sequence: parse options, open the
// device, generate 64x64x64 matmul inputs and reference, upload the kernel,
// publish the buffer addresses and dimensions through the argument block,
// upload both inputs (dumping each upload to a .bin file), run the kernel
// and verify the result.
int main(int argc, char *argv[]) {
  // parse command arguments
  parse_args(argc, argv);
  if (count == 0) {
    count = 1;
  }

  std::srand(50);

  // open device connection
  std::cout << "open device connection" << std::endl;
  RT_CHECK(vx_dev_open(&device));

  // FIXME: hardcoded
  uint32_t dim_m = 64;
  uint32_t dim_n = 64;
  uint32_t dim_k = 64;

  generate_source_matrix(dim_m, dim_n, dim_k);
  generate_reference_matmul(dim_m, dim_n, dim_k);

  uint32_t src_a_buf_size = src_a_data.size() * sizeof(src_a_data[0]);
  uint32_t src_b_buf_size = src_b_data.size() * sizeof(src_b_data[0]);
  uint32_t dst_buf_size = ref_data.size() * sizeof(src_a_data[0]);

  std::cout << "buffer size: " << dst_buf_size << " bytes" << std::endl;

  // upload program
  std::cout << "upload program" << std::endl;
  RT_CHECK(vx_upload_kernel_file(device, kernel_file));

  // allocate device memory
  std::cout << "allocate device memory" << std::endl;
  RT_CHECK(vx_mem_alloc(device, src_a_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.addr_a));
  RT_CHECK(vx_mem_alloc(device, src_b_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.addr_b));
  RT_CHECK(vx_mem_alloc(device, dst_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.addr_c));

  kernel_arg.dim_m = dim_m;
  kernel_arg.dim_n = dim_n;
  kernel_arg.dim_k = dim_k;

  std::cout << "dev_addr_a=0x" << std::hex << kernel_arg.addr_a << std::endl;
  std::cout << "dev_addr_b=0x" << std::hex << kernel_arg.addr_b << std::endl;
  std::cout << "dev_addr_c=0x" << std::hex << kernel_arg.addr_c << std::endl;

  // allocate staging buffer
  {
    std::cout << "allocate staging buffer" << std::endl;
    // largest of the two inputs, the output and the argument block
    uint32_t staging_buf_size = std::max<uint32_t>(
        src_a_buf_size,
        std::max<uint32_t>(
            src_b_buf_size,
            std::max<uint32_t>(dst_buf_size, sizeof(kernel_arg_t))));
    staging_buf.resize(staging_buf_size);
  }

  // upload kernel argument
  {
    std::cout << "upload kernel argument" << std::endl;
    auto buf_ptr = staging_buf.data();
    // NOTE(review): these fixed addresses replace the ones returned by
    // vx_mem_alloc above, so the allocated blocks are no longer tracked and
    // cleanup() later frees the fixed addresses instead - confirm whether the
    // allocation or the override is the intended behavior.
    kernel_arg.addr_a = (uint64_t) 0xa0000000ULL;
    kernel_arg.addr_b = (uint64_t) 0xa1000000ULL;
    kernel_arg.addr_c = (uint64_t) 0xc0000000ULL;
    memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
    std::cout << "uploading argument buffer to device, device mem address="
              << std::hex << KERNEL_ARG_DEV_MEM_ADDR << ", size=" << std::dec
              << sizeof(kernel_arg_t) << " bytes\n";

    // keep a host-side copy of the argument block
    std::ofstream file("args.bin", std::ios::binary | std::ios::out);
    if (!file) {
      std::cerr << "error: failed to open args.bin for writing\n";
      exit(EXIT_FAILURE);
    }
    file.write(reinterpret_cast<char *>(staging_buf.data()),
               sizeof(kernel_arg_t));
    file.close();

    RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
  }

  // upload source buffer
  {
    {
      auto buf_ptr = staging_buf.data();
      memcpy(buf_ptr, src_a_data.data(), src_a_data.size() * sizeof(float));
      RT_CHECK(vx_copy_to_dev(device, kernel_arg.addr_a, staging_buf.data(),
                              src_a_buf_size));
      std::cout << "uploading source A matrix to device, device mem address="
                << std::hex << kernel_arg.addr_a << ", size=" << std::dec
                << src_a_buf_size << " bytes\n";

      std::ofstream file("input.a.bin", std::ios::binary | std::ios::out);
      if (!file) {
        // the message names the file that actually failed to open
        std::cerr << "error: failed to open input.a.bin for writing\n";
        exit(EXIT_FAILURE);
      }
      file.write(reinterpret_cast<char *>(buf_ptr), src_a_buf_size);
      file.close();
    }
    {
      auto buf_ptr = staging_buf.data();
      memcpy(buf_ptr, src_b_data.data(), src_b_data.size() * sizeof(float));
      RT_CHECK(vx_copy_to_dev(device, kernel_arg.addr_b, staging_buf.data(),
                              src_b_buf_size));
      std::cout << "uploading source B matrix to device, device mem address="
                << std::hex << kernel_arg.addr_b << ", size=" << std::dec
                << src_b_buf_size << " bytes\n";

      std::ofstream file("input.b.bin", std::ios::binary | std::ios::out);
      if (!file) {
        // the message names the file that actually failed to open
        std::cerr << "error: failed to open input.b.bin for writing\n";
        exit(EXIT_FAILURE);
      }
      file.write(reinterpret_cast<char *>(buf_ptr), src_b_buf_size);
      file.close();
    }
  }

  // clear destination buffer
  {
    std::cout << "clear destination buffer" << std::endl;
    auto buf_ptr = (int32_t*)staging_buf.data();
    // recognizable fill pattern so untouched elements fail verification
    for (uint32_t i = 0; i < ref_data.size(); ++i) {
      buf_ptr[i] = 0xdeadbeef;
    }
    RT_CHECK(vx_copy_to_dev(device, kernel_arg.addr_c, staging_buf.data(), dst_buf_size));
  }

  // run tests
  std::cout << "run tests" << std::endl;
  RT_CHECK(run_test(kernel_arg, dst_buf_size, kernel_arg.dim_m, kernel_arg.dim_n));
  std::cout << "PASSED!" << std::endl;

  // cleanup
  std::cout << "cleanup" << std::endl;
  cleanup();

  return 0;
}

View File

@@ -1,9 +0,0 @@
# Build description for the "io_addr" regression test.
PROJECT = io_addr
# Source built for the host program.
SRCS = main.cpp
# Source built for the device; presumably consumed by common.mk - confirm there.
VX_SRCS = kernel.cpp
# Default program options, overridable from the environment or command line.
OPTS ?= -n16
# Shared rules for all regression tests.
include ../common.mk

View File

@@ -1,12 +0,0 @@
#ifndef _COMMON_H_
#define _COMMON_H_

// The fixed-width types below come from <stdint.h>; include it here so this
// header compiles regardless of what the including file pulled in before it.
#include <stdint.h>

// Device address where the host writes the kernel_arg_t block and the kernel
// reads it back.
#define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000

// Arguments shared between the host program and the device kernel.
typedef struct {
  uint32_t num_points;  // number of tasks / elements
  uint64_t src_addr;    // device address of the table of 64-bit addresses to read
  uint64_t dst_addr;    // device address of the 32-bit result buffer
} kernel_arg_t;

#endif

View File

@@ -1,19 +0,0 @@
#include <stdint.h>
#include <vx_intrinsics.h>
#include <vx_spawn.h>
#include "common.h"
// Per-task body: takes the task's entry from the 64-bit address table at
// src_addr, reads the 32-bit word stored at that address and writes it to
// slot task_id of the destination buffer.
void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) {
  uint64_t* const address_table = (uint64_t*)arg->src_addr;
  uint32_t* const results = (uint32_t*)arg->dst_addr;

  int32_t* const target = (int32_t*)(address_table[task_id]);
  results[task_id] = *target;
}
int main() {
kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
vx_spawn_tasks(arg->num_points, (vx_spawn_tasks_cb)kernel_body, arg);
return 0;
}

View File

@@ -1,237 +0,0 @@
#include <iostream>
#include <unistd.h>
#include <string.h>
#include <vortex.h>
#include <vector>
#include <VX_config.h>
#include "common.h"
// Number of distinct 32-bit slots cycled through by gen_src_addrs() and gen_ref_data().
#define NUM_ADDRS 16
// Evaluates _expr once. A zero result is success; any other result is printed
// together with the expression text, then cleanup() runs and the process
// exits with status -1.
#define RT_CHECK(_expr) \
do { \
int _ret = _expr; \
if (0 == _ret) \
break; \
printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \
cleanup(); \
exit(-1); \
} while (false)
///////////////////////////////////////////////////////////////////////////////
// Kernel image path; replaced by the -k option in parse_args().
const char* kernel_file = "kernel.bin";
// Value of the -n option.
uint32_t count = 0;
// First address past the IO CSR range (IO_CSR_ADDR + IO_CSR_SIZE); base of the I/O-side slots.
static uint64_t io_base_addr = IO_CSR_ADDR + IO_CSR_SIZE;
// Device address used as the base of the non-I/O slots; freed by cleanup().
uint64_t usr_test_mem;
// Address table built by gen_src_addrs().
std::vector<uint64_t> src_addrs;
// Expected values built by gen_ref_data().
std::vector<int32_t> ref_data;
// Device handle; cleanup() does nothing while it is null.
vx_device_h device = nullptr;
// Host-side scratch buffer for uploads and downloads.
std::vector<uint8_t> staging_buf;
// Argument block for the kernel; its address fields are also what cleanup() frees.
kernel_arg_t kernel_arg = {};
// Writes the program banner and the list of accepted options to stdout.
static void show_usage() {
  std::cout << "Vortex Test." << std::endl
            << "Usage: [-k: kernel] [-n words] [-h: help]" << std::endl;
}
// Consumes the command line with getopt:
//   -n <value>  stores atoi(value) in `count`
//   -k <path>   stores the path in `kernel_file`
//   -h or '?'   prints usage and exits with status 0
// Any other value returned by getopt prints usage and exits with status -1.
static void parse_args(int argc, char **argv) {
  for (int opt = getopt(argc, argv, "n:k:h?"); opt != -1;
       opt = getopt(argc, argv, "n:k:h?")) {
    if (opt == 'n') {
      count = atoi(optarg);
    } else if (opt == 'k') {
      kernel_file = optarg;
    } else {
      show_usage();
      exit((opt == 'h' || opt == '?') ? 0 : -1);
    }
  }
}
// Frees the source, destination and user test allocations and closes the
// device. Does nothing when no device handle is open.
void cleanup() {
  if (!device)
    return;
  vx_mem_free(device, kernel_arg.src_addr);
  vx_mem_free(device, kernel_arg.dst_addr);
  vx_mem_free(device, usr_test_mem);
  vx_dev_close(device);
}
// Fills src_addrs with one device address per point. Point i targets word
// (i % NUM_ADDRS) of either usr_test_mem or io_base_addr: the usr_test_mem
// region is used only for the slot selected for the current group of four
// points, and that slot is recomputed (and shifted by one more) at the start
// of every group. Each generated address is also logged to stdout.
void gen_src_addrs(uint32_t num_points) {
  src_addrs.resize(num_points);
  uint32_t group = 0;
  uint32_t user_slot = 0;
  for (uint32_t idx = 0; idx != num_points; ++idx) {
    if ((idx % 4) == 0) {
      user_slot = (idx + group) % NUM_ADDRS;
      ++group;
    }
    const uint32_t slot = idx % NUM_ADDRS;
    uint64_t base;
    if (slot == user_slot) {
      base = usr_test_mem;
    } else {
      base = io_base_addr;
    }
    const uint64_t addr = base + slot * sizeof(uint32_t);
    std::cout << std::dec << idx << "," << user_slot << ": value=0x" << std::hex << addr << std::endl;
    src_addrs[idx] = addr;
  }
}
// Fills ref_data with the expected result for every point: the square of the
// point's slot number (index modulo NUM_ADDRS).
void gen_ref_data(uint32_t num_points) {
  ref_data.resize(num_points);
  uint32_t idx = 0;
  for (auto& expected : ref_data) {
    const int32_t slot = idx % NUM_ADDRS;
    expected = slot * slot;
    ++idx;
  }
}
// Starts the device, waits for it to finish, downloads `buf_size` bytes from
// kernel_arg.dst_addr into the staging buffer and compares the first
// `num_points` 32-bit results against ref_data.
// Returns 0 when every value matches, 1 otherwise (each mismatch is logged).
// A failing runtime call terminates the process through RT_CHECK.
int run_test(const kernel_arg_t& kernel_arg,
             uint32_t buf_size,
             uint32_t num_points) {
  // start device
  std::cout << "start device" << std::endl;
  RT_CHECK(vx_start(device));

  // wait for completion
  std::cout << "wait for completion" << std::endl;
  RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));

  // download destination buffer
  std::cout << "download destination buffer" << std::endl;
  RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.dst_addr, buf_size));

  // verify result
  std::cout << "verify result" << std::endl;
  const auto* results = (int32_t*)staging_buf.data();
  int mismatches = 0;
  for (uint32_t idx = 0; idx < num_points; ++idx) {
    const int expected = ref_data.at(idx);
    const int actual = results[idx];
    if (actual == expected)
      continue;
    std::cout << "error at result #" << std::dec << idx
              << std::hex << ": actual 0x" << actual << ", expected 0x" << expected << std::endl;
    ++mismatches;
  }
  if (mismatches == 0)
    return 0;

  std::cout << "Found " << std::dec << mismatches << " errors!" << std::endl;
  std::cout << "FAILED!" << std::endl;
  return 1;
}
// Host driver for the io_addr test. Sequence: parse options, open the device,
// allocate the user test region, generate the address table and the expected
// results, upload kernel + arguments + data, run the kernel and verify.
// Any failing runtime call (including a non-zero run_test result) terminates
// the process through RT_CHECK, so "PASSED!" is only printed on success.
int main(int argc, char *argv[]) {
// scratch output slot for vx_mem_alloc
uint64_t value;
// parse command arguments
parse_args(argc, argv);
// always process at least one point
if (count == 0) {
count = 1;
}
std::srand(50);
// open device connection
std::cout << "open device connection" << std::endl;
RT_CHECK(vx_dev_open(&device));
uint32_t num_points = count;
// second test region: NUM_ADDRS 32-bit words in device global memory
RT_CHECK(vx_mem_alloc(device, NUM_ADDRS * sizeof(int32_t), VX_MEM_TYPE_GLOBAL, &usr_test_mem));
// generate input data
gen_src_addrs(num_points);
// generate reference data
gen_ref_data(num_points);
// source entries are 64-bit addresses, results are 32-bit words
uint32_t src_buf_size = num_points * sizeof(uint64_t);
uint32_t dst_buf_size = num_points * sizeof(int32_t);
std::cout << "number of points: " << std::dec << num_points << std::endl;
// upload program
std::cout << "upload program" << std::endl;
RT_CHECK(vx_upload_kernel_file(device, kernel_file));
// allocate device memory
std::cout << "allocate device memory" << std::endl;
RT_CHECK(vx_mem_alloc(device, src_buf_size, VX_MEM_TYPE_GLOBAL, &value));
kernel_arg.src_addr = value;
RT_CHECK(vx_mem_alloc(device, dst_buf_size, VX_MEM_TYPE_GLOBAL, &value));
kernel_arg.dst_addr = value;
kernel_arg.num_points = num_points;
std::cout << "dev_src=0x" << std::hex << kernel_arg.src_addr << std::endl;
std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::endl;
// allocate staging buffer
// sized for the largest transfer that goes through it
std::cout << "allocate staging buffer" << std::endl;
uint32_t staging_buf_size = std::max<uint32_t>(NUM_ADDRS * sizeof(uint64_t),
std::max<uint32_t>(src_buf_size,
std::max<uint32_t>(dst_buf_size,
sizeof(kernel_arg_t))));
staging_buf.resize(staging_buf_size);
// upload kernel argument
std::cout << "upload kernel argument" << std::endl;
memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
// upload test address data
// The same table of squares (word i holds i*i) is written to both regions, so
// a load from slot j returns j*j whichever region the address points into;
// this is what gen_ref_data expects.
{
std::cout << "upload test address data" << std::endl;
auto buf_ptr = (int32_t*)staging_buf.data();
for (uint32_t i = 0; i < NUM_ADDRS; ++i) {
buf_ptr[i] = i * i;
}
RT_CHECK(vx_copy_to_dev(device, io_base_addr, staging_buf.data(), NUM_ADDRS * sizeof(int32_t)));
RT_CHECK(vx_copy_to_dev(device, usr_test_mem, staging_buf.data(), NUM_ADDRS * sizeof(int32_t)));
}
// upload source buffer
{
std::cout << "upload source buffer" << std::endl;
auto buf_ptr = (uint64_t*)staging_buf.data();
memcpy(buf_ptr, src_addrs.data(), src_buf_size);
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src_addr, staging_buf.data(), src_buf_size));
}
// clear destination buffer
// pre-fill with a sentinel so untouched results are detected as mismatches
{
std::cout << "clear destination buffer" << std::endl;
auto buf_ptr = (int32_t*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = 0xdeadbeef;
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.dst_addr, staging_buf.data(), dst_buf_size));
}
// run tests
std::cout << "run tests" << std::endl;
RT_CHECK(run_test(kernel_arg, dst_buf_size, num_points));
// cleanup
std::cout << "cleanup" << std::endl;
cleanup();
std::cout << "PASSED!" << std::endl;
return 0;
}

View File

@@ -1,9 +0,0 @@
# Build description for the mstress regression test.
PROJECT = mstress
# Host-side test driver source.
SRCS = main.cpp
# Device-side kernel source.
VX_SRCS = kernel.cpp
# Default command-line options; `?=` lets the caller override them.
# NOTE(review): presumably forwarded to the test binary by common.mk -- confirm.
OPTS ?= -n64
# Shared rules come from the parent directory.
include ../common.mk

View File

@@ -1,17 +0,0 @@
#ifndef _COMMON_H_
#define _COMMON_H_
#define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000
#define NUM_LOADS 8
typedef struct {
uint32_t num_tasks;
uint32_t size;
uint32_t stride;
uint64_t src0_addr;
uint64_t src1_addr;
uint64_t dst_addr;
} kernel_arg_t;
#endif

View File

@@ -1,29 +0,0 @@
#include <stdint.h>
#include <vx_intrinsics.h>
#include <vx_spawn.h>
#include "common.h"
// Per-task body: handles `stride` consecutive output elements starting at
// task_id * stride. For each element it performs NUM_LOADS indexed loads
// (index table -> data array) and folds the loaded values into a product
// that is written to the destination array.
// NOTE(review): the product starts at 0.0f and is only ever multiplied, so
// the stored value never reflects the loaded data beyond 0/NaN propagation;
// the host reference does the same. Confirm this is intentional.
void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) {
  const uint32_t span = arg->stride;
  uint32_t* index_table = (uint32_t*)arg->src0_addr;
  float* samples = (float*)arg->src1_addr;
  float* results = (float*)arg->dst_addr;

  const uint32_t first = task_id * span;
  for (uint32_t step = 0; step < span; ++step) {
    const uint32_t slot = first + step;
    float product = 0.0f;
    uint32_t load = 0;
    while (load < NUM_LOADS) {
      const uint32_t index = index_table[slot + load];
      product *= samples[index];
      ++load;
    }
    results[slot] = product;
  }
}
// Kernel entry point: reads the argument block from its fixed device address
// and spawns kernel_body once per task.
int main() {
  auto* const args = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
  const auto task_count = args->num_tasks;
  vx_spawn_tasks(task_count, (vx_spawn_tasks_cb)kernel_body, args);
  return 0;
}

View File

@@ -1,280 +0,0 @@
#include <iostream>
#include <unistd.h>
#include <string.h>
#include <vortex.h>
#include "common.h"
#include <assert.h>
#include <limits>
#include <math.h>
#include <vector>
// RT_CHECK(expr): evaluates expr once and treats 0 as success. On any other
// value it prints the failing expression text and the returned code, calls
// cleanup() and terminates the process with exit(-1).
#define RT_CHECK(_expr) \
do { \
int _ret = _expr; \
if (0 == _ret) \
break; \
printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \
cleanup(); \
exit(-1); \
} while (false)
///////////////////////////////////////////////////////////////////////////////
// Overlays a float with an int and with a three-field bit-field view so the
// raw bits of the value can be inspected.
// NOTE(review): the field names/widths (23/8/1) look like the IEEE-754
// single-precision mantissa/exponent/sign split; the actual bit positions
// depend on the compiler's bit-field layout -- confirm before relying on `parts`.
union Float_t {
float f;
int i;
struct {
uint32_t man : 23;
uint32_t exp : 8;
uint32_t sign : 1;
} parts;
};
// Rounds x to `precision` decimal digits (default 8) by scaling with a power
// of ten, rounding to the nearest integer and scaling back.
inline float fround(float x, int32_t precision = 8) {
  const auto scale = std::pow(10, precision);
  const auto scaled = std::round(x * scale);
  return scaled / scale;
}
// Relative comparison: a and b are considered equal when their absolute
// difference does not exceed float epsilon scaled by the larger magnitude
// and by `ulp` (default 128). A rejected pair is reported on stdout.
inline bool almost_equal_eps(float a, float b, int ulp = 128) {
  const auto magnitude = std::max(fabs(a), fabs(b));
  const auto eps = std::numeric_limits<float>::epsilon() * (magnitude * ulp);
  const auto d = fabs(a - b);
  // keep the "greater than" form so the outcome for unordered values is unchanged
  if (!(d > eps))
    return true;
  std::cout << "*** almost_equal_eps: d=" << d << ", eps=" << eps << std::endl;
  return false;
}
// Compares two floats by the distance between their raw 32-bit patterns.
// Returns true when that distance is at most `ulp` (default 6); otherwise
// logs both values and their bit patterns to stdout and returns false.
//
// The bit patterns are extracted with memcpy (well-defined, unlike reading an
// inactive union member) and the difference is computed in 64 bits. With a
// 32-bit subtraction, operands of opposite sign overflow, which made values
// such as x and -x compare as equal.
inline bool almost_equal_ulp(float a, float b, int32_t ulp = 6) {
  int32_t ia;
  int32_t ib;
  memcpy(&ia, &a, sizeof(ia));
  memcpy(&ib, &b, sizeof(ib));
  // widen before subtracting so the difference cannot overflow
  const int64_t diff = static_cast<int64_t>(ia) - static_cast<int64_t>(ib);
  const int64_t d = (diff < 0) ? -diff : diff;
  if (d > ulp) {
    std::cout << "*** almost_equal_ulp: a=" << a << ", b=" << b << ", ulp=" << d << ", ia=" << std::hex << ia << ", ib=" << ib << std::endl;
    return false;
  }
  return true;
}
// Returns true when a and b are exactly equal or pass the ULP-distance check.
// The epsilon-based check (almost_equal_eps) is deliberately not consulted.
inline bool almost_equal(float a, float b) {
  return (a == b) || almost_equal_ulp(a, b);
}
///////////////////////////////////////////////////////////////////////////////
// Kernel image to upload; overridden by -k.
const char* kernel_file = "kernel.bin";
// Value of -n (main() bumps 0 to 1); used as the per-task stride.
uint32_t count = 0;
// Random float samples read by the kernel (see gen_input_data).
std::vector<float> test_data;
// Random indices into test_data; NUM_LOADS - 1 entries longer than test_data.
std::vector<uint32_t> addr_table;
// Device handle; nullptr until vx_dev_open succeeds.
vx_device_h device = nullptr;
// Host-side scratch buffer reused for every upload and download.
std::vector<uint8_t> staging_buf;
// Argument block uploaded to KERNEL_ARG_DEV_MEM_ADDR.
kernel_arg_t kernel_arg = {};
// Writes the program banner and the list of accepted options to stdout.
static void show_usage() {
  std::cout << "Vortex Test." << std::endl
            << "Usage: [-k: kernel] [-n words] [-h: help]" << std::endl;
}
// Consumes the command line with getopt:
//   -n <value>  stores atoi(value) in `count`
//   -k <path>   stores the path in `kernel_file`
//   -h or '?'   prints usage and exits with status 0
// Any other value returned by getopt prints usage and exits with status -1.
static void parse_args(int argc, char **argv) {
  for (int opt = getopt(argc, argv, "n:k:h?"); opt != -1;
       opt = getopt(argc, argv, "n:k:h?")) {
    if (opt == 'n') {
      count = atoi(optarg);
    } else if (opt == 'k') {
      kernel_file = optarg;
    } else {
      show_usage();
      exit((opt == 'h' || opt == '?') ? 0 : -1);
    }
  }
}
// Frees the index, source and destination allocations and closes the device.
// Does nothing when no device handle is open.
void cleanup() {
  if (!device)
    return;
  vx_mem_free(device, kernel_arg.src0_addr);
  vx_mem_free(device, kernel_arg.src1_addr);
  vx_mem_free(device, kernel_arg.dst_addr);
  vx_dev_close(device);
}
// Generates the test inputs:
//  - test_data:  num_points random floats in [0, 1]
//  - addr_table: num_points + NUM_LOADS - 1 random indices into test_data
//    (the extra entries let the last element still perform NUM_LOADS loads)
//
// std::rand() may return RAND_MAX (and values close to it round up to the
// same float), which makes the ratio exactly 1.0 and the scaled index equal
// to num_points. That value is clamped to the last valid index instead of
// tripping the assert (or indexing out of range when asserts are disabled).
void gen_input_data(uint32_t num_points) {
  test_data.resize(num_points);
  addr_table.resize(num_points + NUM_LOADS - 1);

  for (uint32_t i = 0; i < num_points; ++i) {
    float r = static_cast<float>(std::rand()) / RAND_MAX;
    test_data[i] = r;
  }

  for (uint32_t i = 0; i < addr_table.size(); ++i) {
    float r = static_cast<float>(std::rand()) / RAND_MAX;
    uint32_t index = static_cast<uint32_t>(r * num_points);
    // r == 1.0 maps one past the end; fold it back onto the last element
    if (num_points != 0 && index >= num_points) {
      index = num_points - 1;
    }
    assert(index < num_points);
    addr_table[i] = index;
  }
}
// Starts the device, waits for it to finish, downloads `dst_buf_size` bytes
// from kernel_arg.dst_addr into the staging buffer and checks the first
// `num_points` float results. The expected value for element i is rebuilt on
// the host with the same NUM_LOADS indexed multiplications the kernel does.
// Returns 0 when every value matches (per almost_equal), 1 otherwise.
// A failing runtime call terminates the process through RT_CHECK.
int run_test(const kernel_arg_t& kernel_arg,
             uint32_t dst_buf_size,
             uint32_t num_points) {
  // start device
  std::cout << "start device" << std::endl;
  RT_CHECK(vx_start(device));

  // wait for completion
  std::cout << "wait for completion" << std::endl;
  RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));

  // download destination buffer
  std::cout << "download destination buffer" << std::endl;
  RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.dst_addr, dst_buf_size));

  // verify result
  std::cout << "verify result" << std::endl;
  const auto* results = (float*)staging_buf.data();
  int mismatches = 0;
  for (uint32_t idx = 0; idx < num_points; ++idx) {
    float expected = 0.0f;
    for (uint32_t load = 0; load < NUM_LOADS; ++load) {
      const uint32_t index = addr_table.at(idx + load);
      expected *= test_data.at(index);
    }
    const float actual = results[idx];
    if (almost_equal(actual, expected))
      continue;
    std::cout << "error at result #" << std::dec << idx
              << ": actual " << actual << ", expected " << expected << std::endl;
    ++mismatches;
  }
  if (mismatches == 0)
    return 0;

  std::cout << "Found " << std::dec << mismatches << " errors!" << std::endl;
  std::cout << "FAILED!" << std::endl;
  return 1;
}
// Host driver for the mstress test. Sequence: parse options, open the device,
// size the problem from the device's core/warp/thread counts, generate the
// inputs, upload kernel + arguments + data, run the kernel and verify.
// Any failing runtime call (including a non-zero run_test result) terminates
// the process through RT_CHECK, so "PASSED!" is only printed on success.
int main(int argc, char *argv[]) {
// parse command arguments
parse_args(argc, argv);
// always process at least one element per task
if (count == 0) {
count = 1;
}
// fixed seed keeps the generated inputs reproducible
std::srand(50);
// open device connection
std::cout << "open device connection" << std::endl;
RT_CHECK(vx_dev_open(&device));
uint64_t num_cores, num_warps, num_threads;
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_CORES, &num_cores));
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_WARPS, &num_warps));
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_THREADS, &num_threads));
// one task per hardware thread, `count` elements per task
uint32_t num_tasks = num_cores * num_warps * num_threads;
uint32_t num_points = count * num_tasks;
// generate input data
gen_input_data(num_points);
uint32_t addr_buf_size = addr_table.size() * sizeof(int32_t);
uint32_t src_buf_size = test_data.size() * sizeof(int32_t);
uint32_t dst_buf_size = test_data.size() * sizeof(int32_t);
std::cout << "number of points: " << num_points << std::endl;
std::cout << "buffer size: " << dst_buf_size << " bytes" << std::endl;
// upload program
std::cout << "upload program" << std::endl;
RT_CHECK(vx_upload_kernel_file(device, kernel_file));
// allocate device memory
std::cout << "allocate device memory" << std::endl;
RT_CHECK(vx_mem_alloc(device, addr_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.src0_addr));
RT_CHECK(vx_mem_alloc(device, src_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.src1_addr));
RT_CHECK(vx_mem_alloc(device, dst_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.dst_addr));
// kernel_arg.size is not assigned here and keeps its zero initial value
kernel_arg.num_tasks = num_tasks;
kernel_arg.stride = count;
std::cout << "dev_addr=0x" << std::hex << kernel_arg.src0_addr << std::endl;
std::cout << "dev_src=0x" << std::hex << kernel_arg.src1_addr << std::endl;
std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::endl;
// allocate staging buffer
// sized for the largest transfer that goes through it
std::cout << "allocate staging buffer" << std::endl;
uint32_t staging_buf_size = std::max<uint32_t>(src_buf_size,
std::max<uint32_t>(addr_buf_size,
std::max<uint32_t>(dst_buf_size,
sizeof(kernel_arg_t))));
staging_buf.resize(staging_buf_size);
// upload kernel argument
std::cout << "upload kernel argument" << std::endl;
memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
// upload source buffer0
{
std::cout << "upload address buffer" << std::endl;
auto buf_ptr = staging_buf.data();
memcpy(buf_ptr, addr_table.data(), addr_table.size() * sizeof(int32_t));
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src0_addr, staging_buf.data(), addr_buf_size));
}
// upload source buffer1
{
std::cout << "upload source buffer" << std::endl;
auto buf_ptr = staging_buf.data();
memcpy(buf_ptr, test_data.data(), test_data.size() * sizeof(int32_t));
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src1_addr, staging_buf.data(), src_buf_size));
}
// clear destination buffer
// pre-fill with a sentinel bit pattern so untouched results stand out
{
std::cout << "clear destination buffer" << std::endl;
auto buf_ptr = (int32_t*)staging_buf.data();
for (uint32_t i = 0; i < test_data.size(); ++i) {
buf_ptr[i] = 0xdeadbeef;
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.dst_addr, staging_buf.data(), dst_buf_size));
}
// run tests
std::cout << "run tests" << std::endl;
RT_CHECK(run_test(kernel_arg, dst_buf_size, num_points));
// cleanup
std::cout << "cleanup" << std::endl;
cleanup();
std::cout << "PASSED!" << std::endl;
return 0;
}

View File

@@ -1,9 +0,0 @@
# Build description for the no_mf_ext regression test.
PROJECT = no_mf_ext
# Host-side test driver source.
SRCS = main.cpp
# Device-side kernel source.
VX_SRCS = kernel.cpp
# Default command-line options; `?=` lets the caller override them.
# NOTE(review): presumably forwarded to the test binary by common.mk -- confirm.
OPTS ?= -n8
# Shared rules come from the parent directory.
include ../common.mk

View File

@@ -1,12 +0,0 @@
#ifndef _COMMON_H_
#define _COMMON_H_
#define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000
typedef struct {
uint32_t size;
uint64_t src_addr;
uint64_t dst_addr;
} kernel_arg_t;
#endif

View File

@@ -1,18 +0,0 @@
#include <stdint.h>
#include <vx_intrinsics.h>
#include <vx_spawn.h>
#include "common.h"
// Kernel entry point: reads the argument block from its fixed device address
// and copies `size` 32-bit words from the source array to the destination
// array, one element at a time.
int main() {
  auto* const args = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
  const int32_t* in = (int32_t*)args->src_addr;
  int32_t* out = (int32_t*)args->dst_addr;
  for (uint32_t remaining = args->size; remaining != 0; --remaining) {
    *out++ = *in++;
  }
  return 0;
}

View File

@@ -1,174 +0,0 @@
#include <iostream>
#include <unistd.h>
#include <string.h>
#include <vector>
#include <vortex.h>
#include "common.h"
// RT_CHECK(expr): evaluates expr once and treats 0 as success. On any other
// value it prints the failing expression text and the returned code, calls
// cleanup() and terminates the process with exit(-1).
#define RT_CHECK(_expr) \
do { \
int _ret = _expr; \
if (0 == _ret) \
break; \
printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \
cleanup(); \
exit(-1); \
} while (false)
///////////////////////////////////////////////////////////////////////////////
// Kernel image to upload; overridden by -k.
const char* kernel_file = "kernel.bin";
// Number of elements requested with -n (main() bumps 0 to 1).
uint32_t count = 0;
// Device handle; nullptr until vx_dev_open succeeds.
vx_device_h device = nullptr;
// Host-side scratch buffer reused for every upload and download.
std::vector<uint8_t> staging_buf;
// Argument block uploaded to KERNEL_ARG_DEV_MEM_ADDR.
kernel_arg_t kernel_arg = {};
// Writes the program banner and the list of accepted options to stdout.
static void show_usage() {
  std::cout << "Vortex Test." << std::endl
            << "Usage: [-k: kernel] [-n words] [-h: help]" << std::endl;
}
// Consumes the command line with getopt:
//   -n <value>  stores atoi(value) in `count`
//   -k <path>   stores the path in `kernel_file`
//   -h or '?'   prints usage and exits with status 0
// Any other value returned by getopt prints usage and exits with status -1.
static void parse_args(int argc, char **argv) {
  for (int opt = getopt(argc, argv, "n:k:h?"); opt != -1;
       opt = getopt(argc, argv, "n:k:h?")) {
    if (opt == 'n') {
      count = atoi(optarg);
    } else if (opt == 'k') {
      kernel_file = optarg;
    } else {
      show_usage();
      exit((opt == 'h' || opt == '?') ? 0 : -1);
    }
  }
}
// Frees the source and destination allocations and closes the device.
// Does nothing when no device handle is open.
void cleanup() {
  if (!device)
    return;
  vx_mem_free(device, kernel_arg.src_addr);
  vx_mem_free(device, kernel_arg.dst_addr);
  vx_dev_close(device);
}
// Starts the device, waits for it to finish, downloads `buf_size` bytes from
// kernel_arg.dst_addr into the staging buffer and checks that element i of
// the result equals i - 1 (the pattern the host uploaded as source data).
// Returns 0 when every value matches, 1 otherwise (each mismatch is logged).
// A failing runtime call terminates the process through RT_CHECK.
int run_test(const kernel_arg_t& kernel_arg,
             uint32_t buf_size,
             uint32_t num_points) {
  // start device
  std::cout << "start device" << std::endl;
  RT_CHECK(vx_start(device));

  // wait for completion
  std::cout << "wait for completion" << std::endl;
  RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));

  // download destination buffer
  std::cout << "download destination buffer" << std::endl;
  RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.dst_addr, buf_size));

  // verify result
  std::cout << "verify result" << std::endl;
  const auto* results = (int32_t*)staging_buf.data();
  int mismatches = 0;
  for (uint32_t idx = 0; idx < num_points; ++idx) {
    const int expected = idx - 1;
    const int actual = results[idx];
    if (actual == expected)
      continue;
    std::cout << "error at result #" << std::dec << idx
              << std::hex << ": actual 0x" << actual << ", expected 0x" << expected << std::endl;
    ++mismatches;
  }
  if (mismatches == 0)
    return 0;

  std::cout << "Found " << std::dec << mismatches << " errors!" << std::endl;
  std::cout << "FAILED!" << std::endl;
  return 1;
}
// Host driver for the copy test. Sequence: parse options, open the device,
// upload kernel + arguments + source data, run the kernel and verify that the
// destination buffer matches the source pattern.
// Any failing runtime call (including a non-zero run_test result) terminates
// the process through RT_CHECK, so "PASSED!" is only printed on success.
int main(int argc, char *argv[]) {
// parse command arguments
parse_args(argc, argv);
// always process at least one element
if (count == 0) {
count = 1;
}
// open device connection
std::cout << "open device connection" << std::endl;
RT_CHECK(vx_dev_open(&device));
uint32_t num_points = count;
// source and destination hold the same number of 32-bit elements
uint32_t buf_size = num_points * sizeof(int32_t);
std::cout << "number of points: " << num_points << std::endl;
std::cout << "buffer size: " << buf_size << " bytes" << std::endl;
// upload program
std::cout << "upload program" << std::endl;
RT_CHECK(vx_upload_kernel_file(device, kernel_file));
// allocate device memory
std::cout << "allocate device memory" << std::endl;
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.src_addr));
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.dst_addr));
kernel_arg.size = num_points;
std::cout << "dev_src=0x" << std::hex << kernel_arg.src_addr << std::endl;
std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::endl;
// allocate staging buffer
// must hold either a data buffer or the argument block, whichever is larger
std::cout << "allocate staging buffer" << std::endl;
uint32_t alloc_size = std::max<uint32_t>(buf_size, sizeof(kernel_arg_t));
staging_buf.resize(alloc_size);
// upload kernel argument
std::cout << "upload kernel argument" << std::endl;
memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
// upload source buffer0
// element i holds i - 1; run_test expects the same pattern back
{
std::cout << "upload source buffer" << std::endl;
auto buf_ptr = (int32_t*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = i-1;
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src_addr, staging_buf.data(), buf_size));
}
// clear destination buffer
// pre-fill with a sentinel so untouched results are detected as mismatches
{
std::cout << "clear destination buffer" << std::endl;
auto buf_ptr = (int32_t*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = 0xdeadbeef;
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.dst_addr, staging_buf.data(), buf_size));
}
// run tests
std::cout << "run tests" << std::endl;
RT_CHECK(run_test(kernel_arg, buf_size, num_points));
// cleanup
std::cout << "cleanup" << std::endl;
cleanup();
std::cout << "PASSED!" << std::endl;
return 0;
}

View File

@@ -1,17 +0,0 @@
# Build description for the no_smem regression test.
PROJECT = no_smem
# Default command-line options; `?=` lets the caller override them.
OPTS ?= -n8
# Device-side sources: the kernel plus selected runtime sources taken directly
# from $(VORTEX_KN_PATH)/src.
VX_SRCS = kernel.cpp $(VORTEX_KN_PATH)/src/vx_perf.c $(VORTEX_KN_PATH)/src/vx_syscalls.c $(VORTEX_KN_PATH)/src/vx_print.S $(VORTEX_KN_PATH)/src/vx_start.S
# Host-side test driver source.
SRCS = main.cpp
# Shared rules come from the parent directory.
include ../common.mk
# The assignments below come after the include.
# NOTE(review): presumably they adjust/override values set by common.mk -- confirm.
# Builds the device code with SM_DISABLE defined.
VX_CFLAGS += -DSM_DISABLE
# Static link with section garbage collection, the XLEN-specific linker script
# and the STARTUP_ADDR symbol.
VX_LDFLAGS = -Wl,-Bstatic,--gc-sections,-T,$(VORTEX_KN_PATH)/linker/vx_link$(XLEN).ld,--defsym=STARTUP_ADDR=$(STARTUP_ADDR)
# Device tools taken from the RISC-V GCC toolchain.
VX_CC = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-gcc
VX_CXX = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-g++
VX_DP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objdump
VX_CP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objcopy

View File

@@ -1,12 +0,0 @@
#ifndef _COMMON_H_
#define _COMMON_H_
#define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000
typedef struct {
uint32_t size;
uint64_t src_addr;
uint64_t dst_addr;
} kernel_arg_t;
#endif

View File

@@ -1,18 +0,0 @@
#include <stdint.h>
#include <vx_intrinsics.h>
#include <vx_spawn.h>
#include "common.h"
// Kernel entry point: reads the argument block from its fixed device address
// and copies `size` 32-bit words from the source array to the destination
// array, one element at a time.
int main() {
  auto* const args = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
  const int32_t* in = (int32_t*)args->src_addr;
  int32_t* out = (int32_t*)args->dst_addr;
  for (uint32_t remaining = args->size; remaining != 0; --remaining) {
    *out++ = *in++;
  }
  return 0;
}

View File

@@ -1,174 +0,0 @@
#include <iostream>
#include <unistd.h>
#include <string.h>
#include <vector>
#include <vortex.h>
#include "common.h"
// RT_CHECK(expr): evaluates expr once and treats 0 as success. On any other
// value it prints the failing expression text and the returned code, calls
// cleanup() and terminates the process with exit(-1).
#define RT_CHECK(_expr) \
do { \
int _ret = _expr; \
if (0 == _ret) \
break; \
printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \
cleanup(); \
exit(-1); \
} while (false)
///////////////////////////////////////////////////////////////////////////////
// Kernel image to upload; overridden by -k.
const char* kernel_file = "kernel.bin";
// Number of elements requested with -n (main() bumps 0 to 1).
uint32_t count = 0;
// Device handle; nullptr until vx_dev_open succeeds.
vx_device_h device = nullptr;
// Host-side scratch buffer reused for every upload and download.
std::vector<uint8_t> staging_buf;
// Argument block uploaded to KERNEL_ARG_DEV_MEM_ADDR.
kernel_arg_t kernel_arg = {};
// Writes the program banner and the list of accepted options to stdout.
static void show_usage() {
  std::cout << "Vortex Test." << std::endl
            << "Usage: [-k: kernel] [-n words] [-h: help]" << std::endl;
}
// Consumes the command line with getopt:
//   -n <value>  stores atoi(value) in `count`
//   -k <path>   stores the path in `kernel_file`
//   -h or '?'   prints usage and exits with status 0
// Any other value returned by getopt prints usage and exits with status -1.
static void parse_args(int argc, char **argv) {
  for (int opt = getopt(argc, argv, "n:k:h?"); opt != -1;
       opt = getopt(argc, argv, "n:k:h?")) {
    if (opt == 'n') {
      count = atoi(optarg);
    } else if (opt == 'k') {
      kernel_file = optarg;
    } else {
      show_usage();
      exit((opt == 'h' || opt == '?') ? 0 : -1);
    }
  }
}
// Frees the source and destination allocations and closes the device.
// Does nothing when no device handle is open.
void cleanup() {
  if (!device)
    return;
  vx_mem_free(device, kernel_arg.src_addr);
  vx_mem_free(device, kernel_arg.dst_addr);
  vx_dev_close(device);
}
// Starts the device, waits for it to finish, downloads `buf_size` bytes from
// kernel_arg.dst_addr into the staging buffer and checks that element i of
// the result equals i - 1 (the pattern the host uploaded as source data).
// Returns 0 when every value matches, 1 otherwise (each mismatch is logged).
// A failing runtime call terminates the process through RT_CHECK.
int run_test(const kernel_arg_t& kernel_arg,
             uint32_t buf_size,
             uint32_t num_points) {
  // start device
  std::cout << "start device" << std::endl;
  RT_CHECK(vx_start(device));

  // wait for completion
  std::cout << "wait for completion" << std::endl;
  RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));

  // download destination buffer
  std::cout << "download destination buffer" << std::endl;
  RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.dst_addr, buf_size));

  // verify result
  std::cout << "verify result" << std::endl;
  const auto* results = (int32_t*)staging_buf.data();
  int mismatches = 0;
  for (uint32_t idx = 0; idx < num_points; ++idx) {
    const int expected = idx - 1;
    const int actual = results[idx];
    if (actual == expected)
      continue;
    std::cout << "error at result #" << std::dec << idx
              << std::hex << ": actual 0x" << actual << ", expected 0x" << expected << std::endl;
    ++mismatches;
  }
  if (mismatches == 0)
    return 0;

  std::cout << "Found " << std::dec << mismatches << " errors!" << std::endl;
  std::cout << "FAILED!" << std::endl;
  return 1;
}
// Host driver for the copy test. Sequence: parse options, open the device,
// upload kernel + arguments + source data, run the kernel and verify that the
// destination buffer matches the source pattern.
// Any failing runtime call (including a non-zero run_test result) terminates
// the process through RT_CHECK, so "PASSED!" is only printed on success.
int main(int argc, char *argv[]) {
// parse command arguments
parse_args(argc, argv);
// always process at least one element
if (count == 0) {
count = 1;
}
// open device connection
std::cout << "open device connection" << std::endl;
RT_CHECK(vx_dev_open(&device));
uint32_t num_points = count;
// source and destination hold the same number of 32-bit elements
uint32_t buf_size = num_points * sizeof(int32_t);
std::cout << "number of points: " << num_points << std::endl;
std::cout << "buffer size: " << buf_size << " bytes" << std::endl;
// upload program
std::cout << "upload program" << std::endl;
RT_CHECK(vx_upload_kernel_file(device, kernel_file));
// allocate device memory
std::cout << "allocate device memory" << std::endl;
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.src_addr));
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.dst_addr));
kernel_arg.size = num_points;
std::cout << "dev_src=0x" << std::hex << kernel_arg.src_addr << std::endl;
std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::endl;
// allocate staging buffer
// must hold either a data buffer or the argument block, whichever is larger
std::cout << "allocate staging buffer" << std::endl;
uint32_t alloc_size = std::max<uint32_t>(buf_size, sizeof(kernel_arg_t));
staging_buf.resize(alloc_size);
// upload kernel argument
std::cout << "upload kernel argument" << std::endl;
memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
// upload source buffer0
// element i holds i - 1; run_test expects the same pattern back
{
std::cout << "upload source buffer" << std::endl;
auto buf_ptr = (int32_t*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = i-1;
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src_addr, staging_buf.data(), buf_size));
}
// clear destination buffer
// pre-fill with a sentinel so untouched results are detected as mismatches
{
std::cout << "clear destination buffer" << std::endl;
auto buf_ptr = (int32_t*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = 0xdeadbeef;
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.dst_addr, staging_buf.data(), buf_size));
}
// run tests
std::cout << "run tests" << std::endl;
RT_CHECK(run_test(kernel_arg, buf_size, num_points));
// cleanup
std::cout << "cleanup" << std::endl;
cleanup();
std::cout << "PASSED!" << std::endl;
return 0;
}

View File

@@ -1,9 +0,0 @@
# Build description for the printf regression test.
PROJECT = printf
# Host-side test driver source.
SRCS = main.cpp
# Device-side kernel source.
VX_SRCS = kernel.cpp
# Default command-line options; `?=` lets the caller override them.
# NOTE(review): presumably forwarded to the test binary by common.mk -- confirm.
OPTS ?= -n4
# Shared rules come from the parent directory.
include ../common.mk

View File

@@ -1,11 +0,0 @@
#ifndef _COMMON_H_
#define _COMMON_H_
#define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000
typedef struct {
uint32_t num_points;
uint64_t src_addr;
} kernel_arg_t;
#endif

View File

@@ -1,18 +0,0 @@
#include <stdint.h>
#include <vx_intrinsics.h>
#include <vx_print.h>
#include <vx_spawn.h>
#include "common.h"
// Per-task body: queries the core id, turns the task's input integer into a
// letter by adding it to 'A', and prints the core id, task id and letter
// with vx_printf.
void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) {
  const int core = vx_core_id();
  const int* offsets = (int*)arg->src_addr;
  const char letter = 'A' + offsets[task_id];
  vx_printf("cid=%d: task=%d, value=%c\n", core, task_id, letter);
}
// Kernel entry point: reads the argument block from its fixed device address
// and spawns one kernel_body task per point.
int main() {
  auto* const args = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
  const auto task_count = args->num_points;
  vx_spawn_tasks(task_count, (vx_spawn_tasks_cb)kernel_body, args);
  return 0;
}

View File

@@ -1,138 +0,0 @@
#include <iostream>
#include <unistd.h>
#include <string.h>
#include <vector>
#include <vortex.h>
#include "common.h"
// RT_CHECK(expr): evaluates expr once and treats 0 as success. On any other
// value it prints the failing expression text and the returned code, calls
// cleanup() and terminates the process with exit(-1).
#define RT_CHECK(_expr) \
do { \
int _ret = _expr; \
if (0 == _ret) \
break; \
printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \
cleanup(); \
exit(-1); \
} while (false)
///////////////////////////////////////////////////////////////////////////////
// Kernel image to upload; overridden by -k.
const char* kernel_file = "kernel.bin";
// Number of points; defaults to 4 and is overridden by -n.
uint32_t count = 4;
// Device handle; nullptr until vx_dev_open succeeds.
vx_device_h device = nullptr;
// Host-side scratch buffer reused for every upload.
std::vector<uint8_t> staging_buf;
// Argument block uploaded to KERNEL_ARG_DEV_MEM_ADDR.
kernel_arg_t kernel_arg = {};
// Writes the program banner and the list of accepted options to stdout.
static void show_usage() {
  std::cout << "Vortex Test." << std::endl
            << "Usage: [-k: kernel] [-n words] [-h: help]" << std::endl;
}
// Consumes the command line with getopt:
//   -n <value>  stores atoi(value) in `count`
//   -k <path>   stores the path in `kernel_file`
//   -h or '?'   prints usage and exits with status 0
// Any other value returned by getopt prints usage and exits with status -1.
static void parse_args(int argc, char **argv) {
  for (int opt = getopt(argc, argv, "n:k:h?"); opt != -1;
       opt = getopt(argc, argv, "n:k:h?")) {
    if (opt == 'n') {
      count = atoi(optarg);
    } else if (opt == 'k') {
      kernel_file = optarg;
    } else {
      show_usage();
      exit((opt == 'h' || opt == '?') ? 0 : -1);
    }
  }
}
// Frees the source allocation and closes the device.
// Does nothing when no device handle is open.
void cleanup() {
  if (!device)
    return;
  vx_mem_free(device, kernel_arg.src_addr);
  vx_dev_close(device);
}
// Starts the device and blocks until it reports completion (or VX_MAX_TIMEOUT
// elapses). There is no result buffer to verify in this test.
// Returns 0 on success; a failing runtime call never returns because
// RT_CHECK calls cleanup() and exits the process.
int run_test() {
// start device
std::cout << "start device" << std::endl;
RT_CHECK(vx_start(device));
// wait for completion
std::cout << "wait for completion" << std::endl;
RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
return 0;
}
// Host driver for the printf test. Sequence: parse options, open the device,
// upload kernel + arguments + one integer per point, then run the kernel.
// Nothing is downloaded or compared afterwards; the test passes when every
// runtime call succeeds (any failure exits through RT_CHECK).
int main(int argc, char *argv[]) {
// parse command arguments
parse_args(argc, argv);
// always process at least one point
if (count == 0) {
count = 1;
}
// open device connection
std::cout << "open device connection" << std::endl;
RT_CHECK(vx_dev_open(&device));
// queried but not used further in this function
uint64_t num_warps, num_threads;
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_WARPS, &num_warps));
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_THREADS, &num_threads));
uint32_t num_points = count;
uint32_t buf_size = count * sizeof(int32_t);
std::cout << "number of points: " << count << std::endl;
std::cout << "buffer size: " << buf_size << " bytes" << std::endl;
// upload program
std::cout << "upload program" << std::endl;
RT_CHECK(vx_upload_kernel_file(device, kernel_file));
// allocate device memory
std::cout << "allocate device memory" << std::endl;
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.src_addr));
kernel_arg.num_points = num_points;
std::cout << "dev_src=0x" << std::hex << kernel_arg.src_addr << std::endl;
// allocate staging buffer
// must hold either the data buffer or the argument block, whichever is larger
std::cout << "allocate staging buffer" << std::endl;
uint32_t alloc_size = std::max<uint32_t>(buf_size, sizeof(kernel_arg_t));
staging_buf.resize(alloc_size);
// upload kernel argument
std::cout << "upload kernel argument" << std::endl;
memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
// upload source buffer0
// element i holds i; the kernel adds it to 'A' to pick the printed letter
{
std::cout << "upload source buffer" << std::endl;
auto buf_ptr = (int32_t*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = i;
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src_addr, staging_buf.data(), buf_size));
}
// run tests
std::cout << "run tests" << std::endl;
RT_CHECK(run_test());
// cleanup
std::cout << "cleanup" << std::endl;
cleanup();
std::cout << "PASSED!" << std::endl;
return 0;
}

View File

@@ -1,7 +1,5 @@
PROJECT = rickroll
SRCS = main.cpp common.h
VX_SRCS = kernel.cpp
OPTS ?= -n16

View File

@@ -1,274 +0,0 @@
#include <iostream>
#include <fstream>
#include <unistd.h>
#include <string.h>
#include <vortex.h>
#include <vector>
#include "common.h"
// RT_CHECK(expr): evaluates expr once and treats 0 as success. On any other
// value it prints the failing expression text and the returned code, calls
// cleanup() and terminates the process with exit(-1).
#define RT_CHECK(_expr) \
do { \
int _ret = _expr; \
if (0 == _ret) \
break; \
printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \
cleanup(); \
exit(-1); \
} while (false)
///////////////////////////////////////////////////////////////////////////////
// Kernel image to upload; overridden by -k.
const char* kernel_file = "kernel.bin";
// Value of -n (main() bumps 0 to 1).
uint32_t count = 0;
// Row-major A (dim_m x dim_k) and B (dim_k x dim_n) input matrices.
std::vector<float> src_a_data;
std::vector<float> src_b_data;
// Host-computed A*B (dim_m x dim_n), used as the expected result.
std::vector<float> ref_data;
// Device handle; nullptr until vx_dev_open succeeds.
vx_device_h device = nullptr;
// Host-side scratch buffer reused for every upload and download.
std::vector<uint8_t> staging_buf;
// Argument block uploaded to KERNEL_ARG_DEV_MEM_ADDR.
kernel_arg_t kernel_arg = {};
// Writes the program banner and the list of accepted options to stdout.
static void show_usage() {
  std::cout << "Vortex Test." << std::endl
            << "Usage: [-k: kernel] [-n words] [-h: help]" << std::endl;
}
// Consumes the command line with getopt:
//   -n <value>  stores atoi(value) in `count`
//   -k <path>   stores the path in `kernel_file`
//   -h or '?'   prints usage and exits with status 0
// Any other value returned by getopt prints usage and exits with status -1.
static void parse_args(int argc, char **argv) {
  for (int opt = getopt(argc, argv, "n:k:h?"); opt != -1;
       opt = getopt(argc, argv, "n:k:h?")) {
    if (opt == 'n') {
      count = atoi(optarg);
    } else if (opt == 'k') {
      kernel_file = optarg;
    } else {
      show_usage();
      exit((opt == 'h' || opt == '?') ? 0 : -1);
    }
  }
}
// Frees the A, B and C matrix allocations recorded in kernel_arg and closes
// the device. Does nothing when no device handle is open.
void cleanup() {
  if (!device)
    return;
  vx_mem_free(device, kernel_arg.addr_a);
  vx_mem_free(device, kernel_arg.addr_b);
  vx_mem_free(device, kernel_arg.addr_c);
  vx_dev_close(device);
}
// Sizes the A (dim_m x dim_k) and B (dim_k x dim_n) matrices and fills each
// with its own element index converted to float (0, 1, 2, ...). Every value
// is logged to stdout, all of A first and then all of B.
void generate_source_matrix(uint32_t dim_m, uint32_t dim_n, uint32_t dim_k) {
  auto fill_sequential = [](std::vector<float>& matrix, const char* label) {
    for (uint32_t idx = 0; idx < matrix.size(); ++idx) {
      matrix[idx] = static_cast<float>(idx);
      std::cout << label << idx << ": value=" << matrix[idx] << std::endl;
    }
  };

  src_a_data.resize(dim_m * dim_k);
  src_b_data.resize(dim_k * dim_n);
  fill_sequential(src_a_data, "A: ");
  fill_sequential(src_b_data, "B: ");
}
// Computes the expected product C = A * B on the host with the plain triple
// loop (row-major storage, accumulation over k in ascending order) and stores
// it in ref_data as a dim_m x dim_n matrix.
void generate_reference_matmul(uint32_t dim_m, uint32_t dim_n, uint32_t dim_k) {
  ref_data.resize(dim_m * dim_n);
  for (uint32_t row = 0; row < dim_m; ++row) {
    const uint32_t a_row = dim_k * row;
    for (uint32_t col = 0; col < dim_n; ++col) {
      float acc = 0.0f;
      for (uint32_t inner = 0; inner < dim_k; ++inner) {
        acc += src_a_data[a_row + inner] * src_b_data[dim_n * inner + col];
      }
      ref_data.at(dim_n * row + col) = acc;
    }
  }
}
// Starts the device, waits for it to finish, downloads `buf_size` bytes from
// kernel_arg.addr_c into the staging buffer and compares the dim_m * dim_n
// float results against ref_data with a relative tolerance of 1e-6.
// Returns 0 when every value matches, 1 otherwise (each mismatch is logged).
// A failing runtime call terminates the process through RT_CHECK.
//
// The comparison is phrased as "is within tolerance" rather than "exceeds
// tolerance": a NaN result makes every ordered comparison false, so the old
// `> 1e-6` test silently accepted NaN. Exact equality is checked first so a
// 0 vs 0 pair (whose relative error is 0/0) still passes.
int run_test(const kernel_arg_t& kernel_arg,
             uint32_t buf_size,
             uint32_t dim_m, uint32_t dim_n) {
  // start device
  std::cout << "start device" << std::endl;
  RT_CHECK(vx_start(device));

  // wait for completion
  std::cout << "wait for completion" << std::endl;
  RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));

  // download destination buffer
  std::cout << "download destination buffer" << std::endl;
  RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.addr_c, buf_size));

  // verify result
  std::cout << "verify result" << std::endl;
  {
    int errors = 0;
    auto buf_ptr = (float*)staging_buf.data();
    for (uint32_t i = 0; i < dim_m * dim_n; ++i) {
      float ref = ref_data.at(i);
      float cur = buf_ptr[i];
      // false for NaN and for any relative error above the tolerance
      bool within_tolerance = (cur == ref) || (std::abs((cur - ref) / ref) <= 1e-6);
      if (!within_tolerance) {
        std::cout << "error at result #" << std::dec << i
                  << std::hex << ": actual=" << cur << ", expected=" << ref << std::endl;
        ++errors;
      }
    }
    if (errors != 0) {
      std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
      std::cout << "FAILED!" << std::endl;
      return 1;
    }
  }
  return 0;
}
// Writes `size` bytes starting at `data` to the binary file `path`. If the
// file cannot be opened, the error is reported naming that file, cleanup() is
// run, and the process exits with EXIT_FAILURE.
static void dump_binary_file(const char* path, const void* data, size_t size) {
  std::ofstream file(path, std::ios::binary | std::ios::out);
  if (!file) {
    std::cerr << "error: failed to open " << path << " for writing\n";
    cleanup();
    exit(EXIT_FAILURE);
  }
  file.write(reinterpret_cast<const char*>(data), size);
  file.close();
}

// Host driver: generates the A/B inputs and the reference result, uploads the
// kernel, its arguments and the inputs to the device, runs the kernel and
// verifies the output. The argument block and both inputs are also dumped to
// args.bin, input.a.bin and input.b.bin in the working directory.
// Returns 0 on success; failures exit the process.
int main(int argc, char *argv[]) {
  // parse command arguments
  parse_args(argc, argv);
  if (count == 0) {
    count = 1;
  }

  std::srand(50);

  // open device connection
  std::cout << "open device connection" << std::endl;
  RT_CHECK(vx_dev_open(&device));

  // FIXME: hardcoded
  uint32_t dim_m = 64;
  uint32_t dim_n = 64;
  uint32_t dim_k = 64;

  generate_source_matrix(dim_m, dim_n, dim_k);
  generate_reference_matmul(dim_m, dim_n, dim_k);

  uint32_t src_a_buf_size = src_a_data.size() * sizeof(src_a_data[0]);
  uint32_t src_b_buf_size = src_b_data.size() * sizeof(src_b_data[0]);
  // The destination holds ref_data.size() results, so it is sized from the
  // result element type rather than from A's element type.
  uint32_t dst_buf_size = ref_data.size() * sizeof(ref_data[0]);
  std::cout << "buffer size: " << dst_buf_size << " bytes" << std::endl;

  // upload program
  std::cout << "upload program" << std::endl;
  RT_CHECK(vx_upload_kernel_file(device, kernel_file));

  // allocate device memory
  std::cout << "allocate device memory" << std::endl;
  RT_CHECK(vx_mem_alloc(device, src_a_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.addr_a));
  RT_CHECK(vx_mem_alloc(device, src_b_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.addr_b));
  RT_CHECK(vx_mem_alloc(device, dst_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.addr_c));

  kernel_arg.dim_m = dim_m;
  kernel_arg.dim_n = dim_n;
  kernel_arg.dim_k = dim_k;

  std::cout << "dev_addr_a=0x" << std::hex << kernel_arg.addr_a << std::endl;
  std::cout << "dev_addr_b=0x" << std::hex << kernel_arg.addr_b << std::endl;
  std::cout << "dev_addr_c=0x" << std::hex << kernel_arg.addr_c << std::endl;

  // allocate staging buffer
  {
    std::cout << "allocate staging buffer" << std::endl;
    // One host buffer is reused for every transfer, so it must hold the
    // largest of them.
    uint32_t staging_buf_size = std::max<uint32_t>(
        src_a_buf_size,
        std::max<uint32_t>(
            src_b_buf_size,
            std::max<uint32_t>(dst_buf_size, sizeof(kernel_arg_t))));
    staging_buf.resize(staging_buf_size);
  }

  // upload kernel argument
  {
    std::cout << "upload kernel argument" << std::endl;
    auto buf_ptr = staging_buf.data();
    // NOTE(review): the addresses returned by vx_mem_alloc() above are
    // overwritten with fixed values here, so the allocated addresses are no
    // longer tracked and the printed dev_addr_* values are stale. Looks like
    // a bring-up workaround - confirm it is intended.
    kernel_arg.addr_a = (uint64_t) 0x20000;
    kernel_arg.addr_b = (uint64_t) 0x28000;
    kernel_arg.addr_c = (uint64_t) 0xc0000000ULL;
    memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
    std::cout << "uploading argument buffer to device, device mem address="
              << std::hex << KERNEL_ARG_DEV_MEM_ADDR << ", size=" << std::dec
              << sizeof(kernel_arg_t) << " bytes\n";
    dump_binary_file("args.bin", staging_buf.data(), sizeof(kernel_arg_t));
    RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
  }

  // upload source buffer
  {
    {
      auto buf_ptr = staging_buf.data();
      memcpy(buf_ptr, src_a_data.data(), src_a_data.size() * sizeof(float));
      RT_CHECK(vx_copy_to_dev(device, kernel_arg.addr_a, staging_buf.data(),
                              src_a_buf_size));
      std::cout << "uploading source A matrix to device, device mem address="
                << std::hex << kernel_arg.addr_a << ", size=" << std::dec
                << src_a_buf_size << " bytes\n";
      dump_binary_file("input.a.bin", buf_ptr, src_a_buf_size);
    }
    {
      auto buf_ptr = staging_buf.data();
      memcpy(buf_ptr, src_b_data.data(), src_b_data.size() * sizeof(float));
      RT_CHECK(vx_copy_to_dev(device, kernel_arg.addr_b, staging_buf.data(),
                              src_b_buf_size));
      std::cout << "uploading source B matrix to device, device mem address="
                << std::hex << kernel_arg.addr_b << ", size=" << std::dec
                << src_b_buf_size << " bytes\n";
      dump_binary_file("input.b.bin", buf_ptr, src_b_buf_size);
    }
  }

  // clear destination buffer
  {
    std::cout << "clear destination buffer" << std::endl;
    // Fill the destination with a recognizable pattern so elements the kernel
    // never writes fail verification.
    auto buf_ptr = reinterpret_cast<uint32_t*>(staging_buf.data());
    for (uint32_t i = 0; i < ref_data.size(); ++i) {
      buf_ptr[i] = 0xdeadbeef;
    }
    RT_CHECK(vx_copy_to_dev(device, kernel_arg.addr_c, staging_buf.data(), dst_buf_size));
  }

  // run tests
  std::cout << "run tests" << std::endl;
  RT_CHECK(run_test(kernel_arg, dst_buf_size, kernel_arg.dim_m, kernel_arg.dim_n));
  std::cout << "PASSED!" << std::endl;

  // cleanup
  std::cout << "cleanup" << std::endl;
  cleanup();

  return 0;
}

Binary file not shown.

View File

@@ -1,7 +1,5 @@
PROJECT = sgemm_gemmini
SRCS = main.cpp common.h
VX_SRCS = kernel.cpp
OPTS ?= -n16

View File

@@ -1,274 +0,0 @@
#include <iostream>
#include <fstream>
#include <unistd.h>
#include <string.h>
#include <vortex.h>
#include <vector>
#include "common.h"
// Evaluates `_expr` once and treats a zero result as success. Any other
// result is printed together with the expression text, after which cleanup()
// runs and the process exits with status -1.
#define RT_CHECK(_expr)                                        \
  do {                                                         \
    int _ret = _expr;                                          \
    if (0 != _ret) {                                           \
      printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \
      cleanup();                                               \
      exit(-1);                                                \
    }                                                          \
  } while (false)
///////////////////////////////////////////////////////////////////////////////
// Global state shared by the helpers and main() in this file.

// Kernel image passed to vx_upload_kernel_file(); overridden by -k.
const char* kernel_file = "kernel.bin";
// Value of -n; main() forces it to 1 when it is 0. It is not read anywhere
// else in this file's visible code.
uint32_t count = 0;
// Input matrices A and B, filled by generate_source_matrix().
std::vector<float> src_a_data;
std::vector<float> src_b_data;
// Host-computed expected result, filled by generate_reference_matmul().
std::vector<float> ref_data;
// Device handle opened in main() and closed in cleanup(); nullptr until then.
vx_device_h device = nullptr;
// Host byte buffer reused for every transfer to and from the device.
std::vector<uint8_t> staging_buf;
// Kernel argument block (device addresses and matrix dimensions).
kernel_arg_t kernel_arg = {};
// Prints the program banner and the supported command-line options to stdout.
static void show_usage() {
  std::cout << "Vortex Test." << std::endl
            << "Usage: [-k: kernel] [-n words] [-h: help]" << std::endl;
}
// Parses the command line with getopt:
//   -n <value>  stored in the global `count` (converted with atoi)
//   -k <path>   stored in the global `kernel_file`
//   -h, -?      print usage and exit with status 0
// Any other value returned by getopt prints usage and exits with status -1.
static void parse_args(int argc, char **argv) {
  while (true) {
    const int opt = getopt(argc, argv, "n:k:h?");
    if (opt == -1) {
      break;
    }
    if (opt == 'n') {
      count = atoi(optarg);
      continue;
    }
    if (opt == 'k') {
      kernel_file = optarg;
      continue;
    }
    // Everything else ends the program after showing the usage text; only
    // the exit status differs.
    show_usage();
    exit((opt == 'h' || opt == '?') ? 0 : -1);
  }
}
// Releases the three device buffers named in `kernel_arg` and closes the
// device connection. Does nothing when no device has been opened.
void cleanup() {
  if (!device) {
    return;
  }
  vx_mem_free(device, kernel_arg.addr_a);
  vx_mem_free(device, kernel_arg.addr_b);
  vx_mem_free(device, kernel_arg.addr_c);
  vx_dev_close(device);
}
// Sizes the global input matrices (A: dim_m * dim_k elements, B: dim_k * dim_n
// elements) and sets every element to its own linear index, echoing each
// value to stdout.
void generate_source_matrix(uint32_t dim_m, uint32_t dim_n, uint32_t dim_k) {
  // Fills `matrix` with 0, 1, 2, ... and prints each element after `prefix`.
  const auto fill_with_index = [](std::vector<float>& matrix, const char* prefix) {
    for (uint32_t idx = 0; idx < matrix.size(); ++idx) {
      matrix[idx] = static_cast<float>(idx);
      std::cout << prefix << idx << ": value=" << matrix[idx] << std::endl;
    }
  };

  src_a_data.resize(dim_m * dim_k);
  src_b_data.resize(dim_k * dim_n);
  fill_with_index(src_a_data, "A: ");
  fill_with_index(src_b_data, "B: ");
}
// Computes the host-side reference product into the global `ref_data`.
// The indexing treats `src_a_data` as a dim_m x dim_k matrix and `src_b_data`
// as a dim_k x dim_n matrix, both stored row by row; the dim_m x dim_n result
// is stored the same way.
void generate_reference_matmul(uint32_t dim_m, uint32_t dim_n, uint32_t dim_k) {
  ref_data.resize(dim_m * dim_n);
  for (uint32_t row = 0; row < dim_m; ++row) {
    const uint32_t a_row_base = dim_k * row;
    for (uint32_t col = 0; col < dim_n; ++col) {
      float acc = 0.0f;
      for (uint32_t inner = 0; inner < dim_k; ++inner) {
        acc += src_a_data[a_row_base + inner] * src_b_data[dim_n * inner + col];
      }
      // .at() keeps the bounds check on the write.
      ref_data.at(dim_n * row + col) = acc;
    }
  }
}
// Starts the device, waits for completion, downloads the result matrix from
// `kernel_arg.addr_c` into the global `staging_buf`, and compares it element
// by element against the host reference in `ref_data`.
//
//   kernel_arg  - kernel arguments; only `addr_c` is read here
//   buf_size    - number of bytes to download from the device
//   dim_m/dim_n - result dimensions; dim_m * dim_n floats are verified
//
// Returns 0 when every element matches and 1 otherwise. A failing runtime
// call does not return: RT_CHECK runs cleanup() and exits the process.
int run_test(const kernel_arg_t& kernel_arg,
             uint32_t buf_size,
             uint32_t dim_m, uint32_t dim_n) {
  // start device
  std::cout << "start device" << std::endl;
  RT_CHECK(vx_start(device));

  // wait for completion
  std::cout << "wait for completion" << std::endl;
  RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));

  // download destination buffer
  std::cout << "download destination buffer" << std::endl;
  RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.addr_c, buf_size));

  // verify result
  std::cout << "verify result" << std::endl;
  {
    int errors = 0;
    auto buf_ptr = reinterpret_cast<const float*>(staging_buf.data());
    for (uint32_t i = 0; i < dim_m * dim_n; ++i) {
      float ref = ref_data.at(i);
      float cur = buf_ptr[i];
      // Exact matches pass outright (this also covers 0 == 0, where the
      // relative error would be 0/0). Otherwise the tolerance test is written
      // as "not within tolerance" instead of "greater than tolerance": every
      // ordered comparison with NaN is false, so the latter would let a NaN
      // result pass silently.
      const bool matches =
          (cur == ref) || (std::abs((cur - ref) / ref) <= 1e-6);
      if (!matches) {
        std::cout << "error at result #" << std::dec << i
                  << std::hex << ": actual=" << cur << ", expected=" << ref << std::endl;
        ++errors;
      }
    }
    if (errors != 0) {
      std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
      std::cout << "FAILED!" << std::endl;
      return 1;
    }
  }
  return 0;
}
// Writes `size` bytes starting at `data` to the binary file `path`. If the
// file cannot be opened, the error is reported naming that file, cleanup() is
// run so the device is released, and the process exits with EXIT_FAILURE.
static void dump_binary_file(const char* path, const void* data, size_t size) {
  std::ofstream file(path, std::ios::binary | std::ios::out);
  if (!file) {
    std::cerr << "error: failed to open " << path << " for writing\n";
    cleanup();
    exit(EXIT_FAILURE);
  }
  file.write(reinterpret_cast<const char*>(data), size);
  file.close();
}

// Host driver: generates the A/B inputs and the reference result, uploads the
// kernel, its arguments and the inputs to the device, runs the kernel and
// verifies the output. The argument block and both inputs are also dumped to
// args.bin, input.a.bin and input.b.bin in the working directory.
// Returns 0 on success; failures exit the process.
int main(int argc, char *argv[]) {
  // parse command arguments
  parse_args(argc, argv);
  if (count == 0) {
    count = 1;
  }

  std::srand(50);

  // open device connection
  std::cout << "open device connection" << std::endl;
  RT_CHECK(vx_dev_open(&device));

  // FIXME: hardcoded
  uint32_t dim_m = 64;
  uint32_t dim_n = 64;
  uint32_t dim_k = 64;

  generate_source_matrix(dim_m, dim_n, dim_k);
  generate_reference_matmul(dim_m, dim_n, dim_k);

  uint32_t src_a_buf_size = src_a_data.size() * sizeof(src_a_data[0]);
  uint32_t src_b_buf_size = src_b_data.size() * sizeof(src_b_data[0]);
  // The destination holds ref_data.size() results, so it is sized from the
  // result element type rather than from A's element type.
  uint32_t dst_buf_size = ref_data.size() * sizeof(ref_data[0]);
  std::cout << "buffer size: " << dst_buf_size << " bytes" << std::endl;

  // upload program
  std::cout << "upload program" << std::endl;
  RT_CHECK(vx_upload_kernel_file(device, kernel_file));

  // allocate device memory
  std::cout << "allocate device memory" << std::endl;
  RT_CHECK(vx_mem_alloc(device, src_a_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.addr_a));
  RT_CHECK(vx_mem_alloc(device, src_b_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.addr_b));
  RT_CHECK(vx_mem_alloc(device, dst_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.addr_c));

  kernel_arg.dim_m = dim_m;
  kernel_arg.dim_n = dim_n;
  kernel_arg.dim_k = dim_k;

  std::cout << "dev_addr_a=0x" << std::hex << kernel_arg.addr_a << std::endl;
  std::cout << "dev_addr_b=0x" << std::hex << kernel_arg.addr_b << std::endl;
  std::cout << "dev_addr_c=0x" << std::hex << kernel_arg.addr_c << std::endl;

  // allocate staging buffer
  {
    std::cout << "allocate staging buffer" << std::endl;
    // One host buffer is reused for every transfer, so it must hold the
    // largest of them.
    uint32_t staging_buf_size = std::max<uint32_t>(
        src_a_buf_size,
        std::max<uint32_t>(
            src_b_buf_size,
            std::max<uint32_t>(dst_buf_size, sizeof(kernel_arg_t))));
    staging_buf.resize(staging_buf_size);
  }

  // upload kernel argument
  {
    std::cout << "upload kernel argument" << std::endl;
    auto buf_ptr = staging_buf.data();
    // NOTE(review): the addresses returned by vx_mem_alloc() above are
    // overwritten with fixed values here, so cleanup() later passes these
    // fixed values (not the allocated ones) to vx_mem_free(), and the printed
    // dev_addr_* values are stale. Looks like a bring-up workaround - confirm
    // it is intended.
    kernel_arg.addr_a = (uint64_t) 0x20000;
    kernel_arg.addr_b = (uint64_t) 0x28000;
    kernel_arg.addr_c = (uint64_t) 0xc0000000ULL;
    memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
    std::cout << "uploading argument buffer to device, device mem address="
              << std::hex << KERNEL_ARG_DEV_MEM_ADDR << ", size=" << std::dec
              << sizeof(kernel_arg_t) << " bytes\n";
    dump_binary_file("args.bin", staging_buf.data(), sizeof(kernel_arg_t));
    RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
  }

  // upload source buffer
  {
    {
      auto buf_ptr = staging_buf.data();
      memcpy(buf_ptr, src_a_data.data(), src_a_data.size() * sizeof(float));
      RT_CHECK(vx_copy_to_dev(device, kernel_arg.addr_a, staging_buf.data(),
                              src_a_buf_size));
      std::cout << "uploading source A matrix to device, device mem address="
                << std::hex << kernel_arg.addr_a << ", size=" << std::dec
                << src_a_buf_size << " bytes\n";
      dump_binary_file("input.a.bin", buf_ptr, src_a_buf_size);
    }
    {
      auto buf_ptr = staging_buf.data();
      memcpy(buf_ptr, src_b_data.data(), src_b_data.size() * sizeof(float));
      RT_CHECK(vx_copy_to_dev(device, kernel_arg.addr_b, staging_buf.data(),
                              src_b_buf_size));
      std::cout << "uploading source B matrix to device, device mem address="
                << std::hex << kernel_arg.addr_b << ", size=" << std::dec
                << src_b_buf_size << " bytes\n";
      dump_binary_file("input.b.bin", buf_ptr, src_b_buf_size);
    }
  }

  // clear destination buffer
  {
    std::cout << "clear destination buffer" << std::endl;
    // Fill the destination with a recognizable pattern so elements the kernel
    // never writes fail verification.
    auto buf_ptr = reinterpret_cast<uint32_t*>(staging_buf.data());
    for (uint32_t i = 0; i < ref_data.size(); ++i) {
      buf_ptr[i] = 0xdeadbeef;
    }
    RT_CHECK(vx_copy_to_dev(device, kernel_arg.addr_c, staging_buf.data(), dst_buf_size));
  }

  // run tests
  std::cout << "run tests" << std::endl;
  RT_CHECK(run_test(kernel_arg, dst_buf_size, kernel_arg.dim_m, kernel_arg.dim_n));
  std::cout << "PASSED!" << std::endl;

  // cleanup
  std::cout << "cleanup" << std::endl;
  cleanup();

  return 0;
}

View File

@@ -1,7 +1,5 @@
PROJECT = sgemm_gemmini_dma
SRCS = main.cpp common.h
VX_SRCS = kernel.cpp
OPTS ?= -n16

View File

@@ -0,0 +1,11 @@
#!/bin/sh
# Builds one kernel ELF per test case found under args/.
#
# For every file args/<name>, the files args/<name>, input.a/<name> and
# input.b/<name> are copied to args.bin, input.a.bin and input.b.bin, `make`
# is run, and the resulting kernel.radiance.elf is moved to
# binaries/gemmini_debug_dma<name>.elf.
#
# Exits non-zero if any case could not be staged, built or moved.

# -f: a clean tree (no previous ELF) is not an error.
rm -f kernel.radiance.elf
rm -rf binaries
mkdir binaries || exit 1

status=0
for a in args/*; do
    # With no matches the glob stays literal; skip it rather than trying to
    # copy a file that does not exist.
    [ -e "$a" ] || continue
    aa=$(basename "$a")

    # Stage all three inputs; skip the build if any is missing so the output
    # is never built from a previous case's leftovers.
    if ! cp -f "$a" args.bin ||
       ! cp -f input.a/"$aa" input.a.bin ||
       ! cp -f input.b/"$aa" input.b.bin; then
        echo "error: could not stage inputs for case '$aa'" >&2
        status=1
        continue
    fi

    # Build output is still discarded, so failures are reported explicitly.
    if ! make > /dev/null; then
        echo "error: make failed for case '$aa'" >&2
        status=1
        continue
    fi

    mv kernel.radiance.elf binaries/gemmini_debug_dma"$aa".elf || status=1
done

exit "$status"

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@@ -1,274 +0,0 @@
#include <iostream>
#include <fstream>
#include <unistd.h>
#include <string.h>
#include <vortex.h>
#include <vector>
#include "common.h"
// Evaluates `_expr` once and treats a zero result as success. Any other
// result is printed together with the expression text, after which cleanup()
// runs and the process exits with status -1.
#define RT_CHECK(_expr)                                        \
  do {                                                         \
    int _ret = _expr;                                          \
    if (0 != _ret) {                                           \
      printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \
      cleanup();                                               \
      exit(-1);                                                \
    }                                                          \
  } while (false)
///////////////////////////////////////////////////////////////////////////////
// Global state shared by the helpers and main() in this file.

// Kernel image passed to vx_upload_kernel_file(); overridden by -k.
const char* kernel_file = "kernel.bin";
// Value of -n; main() forces it to 1 when it is 0. It is not read anywhere
// else in this file's visible code.
uint32_t count = 0;
// Input matrices A and B, filled by generate_source_matrix().
std::vector<float> src_a_data;
std::vector<float> src_b_data;
// Host-computed expected result, filled by generate_reference_matmul().
std::vector<float> ref_data;
// Device handle opened in main() and closed in cleanup(); nullptr until then.
vx_device_h device = nullptr;
// Host byte buffer reused for every transfer to and from the device.
std::vector<uint8_t> staging_buf;
// Kernel argument block (device addresses and matrix dimensions).
kernel_arg_t kernel_arg = {};
// Prints the program banner and the supported command-line options to stdout.
static void show_usage() {
  std::cout << "Vortex Test." << std::endl
            << "Usage: [-k: kernel] [-n words] [-h: help]" << std::endl;
}
// Parses the command line with getopt:
//   -n <value>  stored in the global `count` (converted with atoi)
//   -k <path>   stored in the global `kernel_file`
//   -h, -?      print usage and exit with status 0
// Any other value returned by getopt prints usage and exits with status -1.
static void parse_args(int argc, char **argv) {
  while (true) {
    const int opt = getopt(argc, argv, "n:k:h?");
    if (opt == -1) {
      break;
    }
    if (opt == 'n') {
      count = atoi(optarg);
      continue;
    }
    if (opt == 'k') {
      kernel_file = optarg;
      continue;
    }
    // Everything else ends the program after showing the usage text; only
    // the exit status differs.
    show_usage();
    exit((opt == 'h' || opt == '?') ? 0 : -1);
  }
}
// Releases the three device buffers named in `kernel_arg` and closes the
// device connection. Does nothing when no device has been opened.
void cleanup() {
  if (!device) {
    return;
  }
  vx_mem_free(device, kernel_arg.addr_a);
  vx_mem_free(device, kernel_arg.addr_b);
  vx_mem_free(device, kernel_arg.addr_c);
  vx_dev_close(device);
}
// Sizes the global input matrices (A: dim_m * dim_k elements, B: dim_k * dim_n
// elements) and sets every element to its own linear index, echoing each
// value to stdout.
void generate_source_matrix(uint32_t dim_m, uint32_t dim_n, uint32_t dim_k) {
  // Fills `matrix` with 0, 1, 2, ... and prints each element after `prefix`.
  const auto fill_with_index = [](std::vector<float>& matrix, const char* prefix) {
    for (uint32_t idx = 0; idx < matrix.size(); ++idx) {
      matrix[idx] = static_cast<float>(idx);
      std::cout << prefix << idx << ": value=" << matrix[idx] << std::endl;
    }
  };

  src_a_data.resize(dim_m * dim_k);
  src_b_data.resize(dim_k * dim_n);
  fill_with_index(src_a_data, "A: ");
  fill_with_index(src_b_data, "B: ");
}
// Computes the host-side reference product into the global `ref_data`.
// The indexing treats `src_a_data` as a dim_m x dim_k matrix and `src_b_data`
// as a dim_k x dim_n matrix, both stored row by row; the dim_m x dim_n result
// is stored the same way.
void generate_reference_matmul(uint32_t dim_m, uint32_t dim_n, uint32_t dim_k) {
  ref_data.resize(dim_m * dim_n);
  for (uint32_t row = 0; row < dim_m; ++row) {
    const uint32_t a_row_base = dim_k * row;
    for (uint32_t col = 0; col < dim_n; ++col) {
      float acc = 0.0f;
      for (uint32_t inner = 0; inner < dim_k; ++inner) {
        acc += src_a_data[a_row_base + inner] * src_b_data[dim_n * inner + col];
      }
      // .at() keeps the bounds check on the write.
      ref_data.at(dim_n * row + col) = acc;
    }
  }
}
// Starts the device, waits for completion, downloads the result matrix from
// `kernel_arg.addr_c` into the global `staging_buf`, and compares it element
// by element against the host reference in `ref_data`.
//
//   kernel_arg  - kernel arguments; only `addr_c` is read here
//   buf_size    - number of bytes to download from the device
//   dim_m/dim_n - result dimensions; dim_m * dim_n floats are verified
//
// Returns 0 when every element matches and 1 otherwise. A failing runtime
// call does not return: RT_CHECK runs cleanup() and exits the process.
int run_test(const kernel_arg_t& kernel_arg,
             uint32_t buf_size,
             uint32_t dim_m, uint32_t dim_n) {
  // start device
  std::cout << "start device" << std::endl;
  RT_CHECK(vx_start(device));

  // wait for completion
  std::cout << "wait for completion" << std::endl;
  RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));

  // download destination buffer
  std::cout << "download destination buffer" << std::endl;
  RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.addr_c, buf_size));

  // verify result
  std::cout << "verify result" << std::endl;
  {
    int errors = 0;
    auto buf_ptr = reinterpret_cast<const float*>(staging_buf.data());
    for (uint32_t i = 0; i < dim_m * dim_n; ++i) {
      float ref = ref_data.at(i);
      float cur = buf_ptr[i];
      // Exact matches pass outright (this also covers 0 == 0, where the
      // relative error would be 0/0). Otherwise the tolerance test is written
      // as "not within tolerance" instead of "greater than tolerance": every
      // ordered comparison with NaN is false, so the latter would let a NaN
      // result pass silently.
      const bool matches =
          (cur == ref) || (std::abs((cur - ref) / ref) <= 1e-6);
      if (!matches) {
        std::cout << "error at result #" << std::dec << i
                  << std::hex << ": actual=" << cur << ", expected=" << ref << std::endl;
        ++errors;
      }
    }
    if (errors != 0) {
      std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
      std::cout << "FAILED!" << std::endl;
      return 1;
    }
  }
  return 0;
}
// Writes `size` bytes starting at `data` to the binary file `path`. If the
// file cannot be opened, the error is reported naming that file, cleanup() is
// run so the device is released, and the process exits with EXIT_FAILURE.
static void dump_binary_file(const char* path, const void* data, size_t size) {
  std::ofstream file(path, std::ios::binary | std::ios::out);
  if (!file) {
    std::cerr << "error: failed to open " << path << " for writing\n";
    cleanup();
    exit(EXIT_FAILURE);
  }
  file.write(reinterpret_cast<const char*>(data), size);
  file.close();
}

// Host driver: generates the A/B inputs and the reference result, uploads the
// kernel, its arguments and the inputs to the device, runs the kernel and
// verifies the output. The argument block and both inputs are also dumped to
// args.bin, input.a.bin and input.b.bin in the working directory.
// Returns 0 on success; failures exit the process.
int main(int argc, char *argv[]) {
  // parse command arguments
  parse_args(argc, argv);
  if (count == 0) {
    count = 1;
  }

  std::srand(50);

  // open device connection
  std::cout << "open device connection" << std::endl;
  RT_CHECK(vx_dev_open(&device));

  // FIXME: hardcoded
  uint32_t dim_m = 64;
  uint32_t dim_n = 64;
  uint32_t dim_k = 64;

  generate_source_matrix(dim_m, dim_n, dim_k);
  generate_reference_matmul(dim_m, dim_n, dim_k);

  uint32_t src_a_buf_size = src_a_data.size() * sizeof(src_a_data[0]);
  uint32_t src_b_buf_size = src_b_data.size() * sizeof(src_b_data[0]);
  // The destination holds ref_data.size() results, so it is sized from the
  // result element type rather than from A's element type.
  uint32_t dst_buf_size = ref_data.size() * sizeof(ref_data[0]);
  std::cout << "buffer size: " << dst_buf_size << " bytes" << std::endl;

  // upload program
  std::cout << "upload program" << std::endl;
  RT_CHECK(vx_upload_kernel_file(device, kernel_file));

  // allocate device memory
  std::cout << "allocate device memory" << std::endl;
  RT_CHECK(vx_mem_alloc(device, src_a_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.addr_a));
  RT_CHECK(vx_mem_alloc(device, src_b_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.addr_b));
  RT_CHECK(vx_mem_alloc(device, dst_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.addr_c));

  kernel_arg.dim_m = dim_m;
  kernel_arg.dim_n = dim_n;
  kernel_arg.dim_k = dim_k;

  std::cout << "dev_addr_a=0x" << std::hex << kernel_arg.addr_a << std::endl;
  std::cout << "dev_addr_b=0x" << std::hex << kernel_arg.addr_b << std::endl;
  std::cout << "dev_addr_c=0x" << std::hex << kernel_arg.addr_c << std::endl;

  // allocate staging buffer
  {
    std::cout << "allocate staging buffer" << std::endl;
    // One host buffer is reused for every transfer, so it must hold the
    // largest of them.
    uint32_t staging_buf_size = std::max<uint32_t>(
        src_a_buf_size,
        std::max<uint32_t>(
            src_b_buf_size,
            std::max<uint32_t>(dst_buf_size, sizeof(kernel_arg_t))));
    staging_buf.resize(staging_buf_size);
  }

  // upload kernel argument
  {
    std::cout << "upload kernel argument" << std::endl;
    auto buf_ptr = staging_buf.data();
    // NOTE(review): the addresses returned by vx_mem_alloc() above are
    // overwritten with fixed values here, so cleanup() later passes these
    // fixed values (not the allocated ones) to vx_mem_free(), and the printed
    // dev_addr_* values are stale. Looks like a bring-up workaround - confirm
    // it is intended.
    kernel_arg.addr_a = (uint64_t) 0xa0000000ULL;
    kernel_arg.addr_b = (uint64_t) 0xa1000000ULL;
    kernel_arg.addr_c = (uint64_t) 0xc0000000ULL;
    memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
    std::cout << "uploading argument buffer to device, device mem address="
              << std::hex << KERNEL_ARG_DEV_MEM_ADDR << ", size=" << std::dec
              << sizeof(kernel_arg_t) << " bytes\n";
    dump_binary_file("args.bin", staging_buf.data(), sizeof(kernel_arg_t));
    RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
  }

  // upload source buffer
  {
    {
      auto buf_ptr = staging_buf.data();
      memcpy(buf_ptr, src_a_data.data(), src_a_data.size() * sizeof(float));
      RT_CHECK(vx_copy_to_dev(device, kernel_arg.addr_a, staging_buf.data(),
                              src_a_buf_size));
      std::cout << "uploading source A matrix to device, device mem address="
                << std::hex << kernel_arg.addr_a << ", size=" << std::dec
                << src_a_buf_size << " bytes\n";
      dump_binary_file("input.a.bin", buf_ptr, src_a_buf_size);
    }
    {
      auto buf_ptr = staging_buf.data();
      memcpy(buf_ptr, src_b_data.data(), src_b_data.size() * sizeof(float));
      RT_CHECK(vx_copy_to_dev(device, kernel_arg.addr_b, staging_buf.data(),
                              src_b_buf_size));
      std::cout << "uploading source B matrix to device, device mem address="
                << std::hex << kernel_arg.addr_b << ", size=" << std::dec
                << src_b_buf_size << " bytes\n";
      dump_binary_file("input.b.bin", buf_ptr, src_b_buf_size);
    }
  }

  // clear destination buffer
  {
    std::cout << "clear destination buffer" << std::endl;
    // Fill the destination with a recognizable pattern so elements the kernel
    // never writes fail verification.
    auto buf_ptr = reinterpret_cast<uint32_t*>(staging_buf.data());
    for (uint32_t i = 0; i < ref_data.size(); ++i) {
      buf_ptr[i] = 0xdeadbeef;
    }
    RT_CHECK(vx_copy_to_dev(device, kernel_arg.addr_c, staging_buf.data(), dst_buf_size));
  }

  // run tests
  std::cout << "run tests" << std::endl;
  RT_CHECK(run_test(kernel_arg, dst_buf_size, kernel_arg.dim_m, kernel_arg.dim_n));
  std::cout << "PASSED!" << std::endl;

  // cleanup
  std::cout << "cleanup" << std::endl;
  cleanup();

  return 0;
}

View File

@@ -1,7 +1,5 @@
PROJECT = sgemm_gemmini_duo
SRCS = main.cpp common.h
VX_SRCS = kernel.cpp
OPTS ?= -n16

View File

@@ -1,282 +0,0 @@
#include <iostream>
#include <fstream>
#include <unistd.h>
#include <string.h>
#include <vortex.h>
#include <vector>
#include "common.h"
// Evaluates `_expr` once and treats a zero result as success. Any other
// result is printed together with the expression text, after which cleanup()
// runs and the process exits with status -1.
#define RT_CHECK(_expr)                                        \
  do {                                                         \
    int _ret = _expr;                                          \
    if (0 != _ret) {                                           \
      printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \
      cleanup();                                               \
      exit(-1);                                                \
    }                                                          \
  } while (false)
///////////////////////////////////////////////////////////////////////////////
// Global state shared by the helpers and main() in this file.

// Kernel image passed to vx_upload_kernel_file(); overridden by -k.
const char* kernel_file = "kernel.bin";
// Value of -n; main() forces it to 1 when it is 0. It is not read anywhere
// else in this file's visible code.
uint32_t count = 0;
// Input matrices A and B, filled by generate_source_matrix().
std::vector<float> src_a_data;
std::vector<float> src_b_data;
// Host-computed expected result, filled by generate_reference_matmul().
std::vector<float> ref_data;
// Device handle opened in main() and closed in cleanup(); nullptr until then.
vx_device_h device = nullptr;
// Host byte buffer reused for every transfer to and from the device.
std::vector<uint8_t> staging_buf;
// Kernel argument block (device addresses and matrix dimensions).
kernel_arg_t kernel_arg = {};
// Prints the program banner and the supported command-line options to stdout.
static void show_usage() {
  std::cout << "Vortex Test." << std::endl
            << "Usage: [-k: kernel] [-n words] [-h: help]" << std::endl;
}
// Parses the command line with getopt:
//   -n <value>  stored in the global `count` (converted with atoi)
//   -k <path>   stored in the global `kernel_file`
//   -h, -?      print usage and exit with status 0
// Any other value returned by getopt prints usage and exits with status -1.
static void parse_args(int argc, char **argv) {
  while (true) {
    const int opt = getopt(argc, argv, "n:k:h?");
    if (opt == -1) {
      break;
    }
    if (opt == 'n') {
      count = atoi(optarg);
      continue;
    }
    if (opt == 'k') {
      kernel_file = optarg;
      continue;
    }
    // Everything else ends the program after showing the usage text; only
    // the exit status differs.
    show_usage();
    exit((opt == 'h' || opt == '?') ? 0 : -1);
  }
}
// Closes the device connection if one was opened. The per-buffer vx_mem_free
// calls stay disabled: main() in this file assigns fixed device addresses
// instead of obtaining them from vx_mem_alloc.
void cleanup() {
  if (!device) {
    return;
  }
  // vx_mem_free(device, kernel_arg.addr_a);
  // vx_mem_free(device, kernel_arg.addr_b);
  // vx_mem_free(device, kernel_arg.addr_c);
  vx_dev_close(device);
}
// Sizes the global input matrices (A: dim_m * dim_k elements, B: dim_k * dim_n
// elements) and sets every element to its own linear index, echoing each
// value to stdout.
void generate_source_matrix(uint32_t dim_m, uint32_t dim_n, uint32_t dim_k) {
  // Fills `matrix` with 0, 1, 2, ... and prints each element after `prefix`.
  const auto fill_with_index = [](std::vector<float>& matrix, const char* prefix) {
    for (uint32_t idx = 0; idx < matrix.size(); ++idx) {
      matrix[idx] = static_cast<float>(idx);
      std::cout << prefix << idx << ": value=" << matrix[idx] << std::endl;
    }
  };

  src_a_data.resize(dim_m * dim_k);
  src_b_data.resize(dim_k * dim_n);
  fill_with_index(src_a_data, "A: ");
  fill_with_index(src_b_data, "B: ");
}
// Computes the host-side reference product into the global `ref_data`.
// The indexing treats `src_a_data` as a dim_m x dim_k matrix and `src_b_data`
// as a dim_k x dim_n matrix, both stored row by row; the dim_m x dim_n result
// is stored the same way.
void generate_reference_matmul(uint32_t dim_m, uint32_t dim_n, uint32_t dim_k) {
  ref_data.resize(dim_m * dim_n);
  for (uint32_t row = 0; row < dim_m; ++row) {
    const uint32_t a_row_base = dim_k * row;
    for (uint32_t col = 0; col < dim_n; ++col) {
      float acc = 0.0f;
      for (uint32_t inner = 0; inner < dim_k; ++inner) {
        acc += src_a_data[a_row_base + inner] * src_b_data[dim_n * inner + col];
      }
      // .at() keeps the bounds check on the write.
      ref_data.at(dim_n * row + col) = acc;
    }
  }
}
// Starts the device, waits for completion, downloads the result matrix from
// `kernel_arg.addr_c` into the global `staging_buf`, and compares it element
// by element against the host reference in `ref_data`.
//
//   kernel_arg  - kernel arguments; only `addr_c` is read here
//   buf_size    - number of bytes to download from the device
//   dim_m/dim_n - result dimensions; dim_m * dim_n floats are verified
//
// Returns 0 when every element matches and 1 otherwise. A failing runtime
// call does not return: RT_CHECK runs cleanup() and exits the process.
int run_test(const kernel_arg_t& kernel_arg,
             uint32_t buf_size,
             uint32_t dim_m, uint32_t dim_n) {
  // start device
  std::cout << "start device" << std::endl;
  RT_CHECK(vx_start(device));

  // wait for completion
  std::cout << "wait for completion" << std::endl;
  RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));

  // download destination buffer
  std::cout << "download destination buffer" << std::endl;
  RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.addr_c, buf_size));

  // verify result
  std::cout << "verify result" << std::endl;
  {
    int errors = 0;
    auto buf_ptr = reinterpret_cast<const float*>(staging_buf.data());
    for (uint32_t i = 0; i < dim_m * dim_n; ++i) {
      float ref = ref_data.at(i);
      float cur = buf_ptr[i];
      // Exact matches pass outright (this also covers 0 == 0, where the
      // relative error would be 0/0). Otherwise the tolerance test is written
      // as "not within tolerance" instead of "greater than tolerance": every
      // ordered comparison with NaN is false, so the latter would let a NaN
      // result pass silently.
      const bool matches =
          (cur == ref) || (std::abs((cur - ref) / ref) <= 1e-6);
      if (!matches) {
        std::cout << "error at result #" << std::dec << i
                  << std::hex << ": actual=" << cur << ", expected=" << ref << std::endl;
        ++errors;
      }
    }
    if (errors != 0) {
      std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
      std::cout << "FAILED!" << std::endl;
      return 1;
    }
  }
  return 0;
}
int main(int argc, char *argv[]) {
// parse command arguments
parse_args(argc, argv);
if (count == 0) {
count = 1;
}
std::srand(50);
// open device connection
std::cout << "open device connection" << std::endl;
RT_CHECK(vx_dev_open(&device));
// FIXME: hardcoded
uint32_t dim_m = 128;
uint32_t dim_n = 128;
uint32_t dim_k = 128;
generate_source_matrix(dim_m, dim_n, dim_k);
generate_reference_matmul(dim_m, dim_n, dim_k);
std::cout << "write reference output" << std::endl;
std::ofstream ref_file("reference.c.bin", std::ios::binary | std::ios::out);
if (!ref_file) {
std::cerr << "error: failed to open reference.c.bin for writing\n";
exit(EXIT_FAILURE);
}
ref_file.write(reinterpret_cast<char *>(ref_data.data()), ref_data.size() * sizeof(ref_data[0]));
ref_file.close();
uint32_t src_a_buf_size = src_a_data.size() * sizeof(src_a_data[0]);
uint32_t src_b_buf_size = src_b_data.size() * sizeof(src_b_data[0]);
uint32_t dst_buf_size = ref_data.size() * sizeof(src_a_data[0]);
std::cout << "buffer size: " << dst_buf_size << " bytes" << std::endl;
// upload program
std::cout << "upload program" << std::endl;
RT_CHECK(vx_upload_kernel_file(device, kernel_file));
// allocate device memory
std::cout << "allocate device memory" << std::endl;
// RT_CHECK(vx_mem_alloc(device, src_a_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.addr_a));
// RT_CHECK(vx_mem_alloc(device, src_b_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.addr_b));
// RT_CHECK(vx_mem_alloc(device, dst_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.addr_c));
kernel_arg.addr_a = 0xa0000000;
kernel_arg.addr_b = 0xa1000000;
kernel_arg.addr_c = 0xc0000000;
kernel_arg.dim_m = dim_m;
kernel_arg.dim_n = dim_n;
kernel_arg.dim_k = dim_k;
std::cout << "dev_addr_a=0x" << std::hex << kernel_arg.addr_a << std::endl;
std::cout << "dev_addr_b=0x" << std::hex << kernel_arg.addr_b << std::endl;
std::cout << "dev_addr_c=0x" << std::hex << kernel_arg.addr_c << std::endl;
// allocate staging buffer
{
std::cout << "allocate staging buffer" << std::endl;
uint32_t staging_buf_size = std::max<uint32_t>(
src_a_buf_size,
std::max<uint32_t>(
src_b_buf_size,
std::max<uint32_t>(dst_buf_size, sizeof(kernel_arg_t))));
staging_buf.resize(staging_buf_size);
}
// upload kernel argument
{
std::cout << "upload kernel argument" << std::endl;
auto buf_ptr = staging_buf.data();
memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
std::cout << "uploading argument buffer to device, device mem address="
<< std::hex << KERNEL_ARG_DEV_MEM_ADDR << ", size=" << std::dec
<< sizeof(kernel_arg_t) << " bytes\n";
std::ofstream file("args.bin", std::ios::binary | std::ios::out);
if (!file) {
std::cerr << "error: failed to open args.bin for writing\n";
exit(EXIT_FAILURE);
}
file.write(reinterpret_cast<char *>(staging_buf.data()),
sizeof(kernel_arg_t));
file.close();
}
// upload source buffer
{
{
auto buf_ptr = staging_buf.data();
memcpy(buf_ptr, src_a_data.data(), src_a_data.size() * sizeof(float));
RT_CHECK(vx_copy_to_dev(device, kernel_arg.addr_a, staging_buf.data(),
src_a_buf_size));
std::cout << "uploading source A matrix to device, device mem address="
<< std::hex << kernel_arg.addr_a << ", size=" << std::dec
<< src_a_buf_size << " bytes\n";
std::ofstream file("input.a.bin", std::ios::binary | std::ios::out);
if (!file) {
std::cerr << "error: failed to open args.bin for writing\n";
exit(EXIT_FAILURE);
}
file.write(reinterpret_cast<char *>(buf_ptr), src_a_buf_size);
file.close();
}
{
auto buf_ptr = staging_buf.data();
memcpy(buf_ptr, src_b_data.data(), src_b_data.size() * sizeof(float));
RT_CHECK(vx_copy_to_dev(device, kernel_arg.addr_b, staging_buf.data(),
src_b_buf_size));
std::cout << "uploading source B matrix to device, device mem address="
<< std::hex << kernel_arg.addr_b << ", size=" << std::dec
<< src_b_buf_size << " bytes\n";
std::ofstream file("input.b.bin", std::ios::binary | std::ios::out);
if (!file) {
std::cerr << "error: failed to open args.bin for writing\n";
exit(EXIT_FAILURE);
}
file.write(reinterpret_cast<char *>(buf_ptr), src_b_buf_size);
file.close();
}
}
// clear destination buffer
{
std::cout << "clear destination buffer" << std::endl;
auto buf_ptr = (int32_t*)staging_buf.data();
for (uint32_t i = 0; i < ref_data.size(); ++i) {
buf_ptr[i] = 0xdeadbeef;
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.addr_c, staging_buf.data(), dst_buf_size));
}
// run tests
std::cout << "run tests" << std::endl;
RT_CHECK(run_test(kernel_arg, dst_buf_size, kernel_arg.dim_m, kernel_arg.dim_n));
std::cout << "PASSED!" << std::endl;
// cleanup
std::cout << "cleanup" << std::endl;
cleanup();
return 0;
}

View File

@@ -1,7 +1,5 @@
PROJECT = sgemm_tcore
SRCS = main.cpp common.h
VX_SRCS = kernel.cpp
VX_INCLUDES = sgemm_impl.hpp

View File

@@ -1,308 +0,0 @@
#include <iostream>
#include <fstream>
#include <unistd.h>
#include <string.h>
#include <vortex.h>
#include <vector>
#include <cassert>
#include "common.h"
#include "half.hpp"
using half_float::half;
using half_float::half_cast;
// Evaluates `_expr` once and treats a zero result as success. Any other
// result is printed together with the expression text, after which cleanup()
// runs and the process exits with status -1.
#define RT_CHECK(_expr)                                        \
  do {                                                         \
    int _ret = _expr;                                          \
    if (0 != _ret) {                                           \
      printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \
      cleanup();                                               \
      exit(-1);                                                \
    }                                                          \
  } while (false)
///////////////////////////////////////////////////////////////////////////////
// Global state shared by the helpers and main() in this file.

// Kernel image passed to vx_upload_kernel_file(); overridden by -k.
const char* kernel_file = "kernel.bin";
// Value of -n; main() forces it to 1 when it is 0. It is not read anywhere
// else in this file's visible code.
uint32_t count = 0;
// Input matrices A and B, one instance per element type T (variable
// templates); filled by generate_source_matrix<T>().
template <typename T> std::vector<T> src_a_data;
template <typename T> std::vector<T> src_b_data;
// Host-computed expected result, always accumulated in float by
// generate_reference_matmul<T>().
std::vector<float> ref_data;
// Device handle opened in main() and closed in cleanup(); nullptr until then.
vx_device_h device = nullptr;
// Host byte buffer reused for transfers to and from the device.
std::vector<uint8_t> staging_buf;
// Kernel argument block (device addresses and matrix dimensions).
kernel_arg_t kernel_arg = {};
// Prints the program banner and the supported command-line options to stdout.
static void show_usage() {
  std::cout << "Vortex Test." << std::endl
            << "Usage: [-k: kernel] [-n words] [-h: help]" << std::endl;
}
// Parses the command line with getopt:
//   -n <value>  stored in the global `count` (converted with atoi)
//   -k <path>   stored in the global `kernel_file`
//   -h, -?      print usage and exit with status 0
// Any other value returned by getopt prints usage and exits with status -1.
static void parse_args(int argc, char **argv) {
  while (true) {
    const int opt = getopt(argc, argv, "n:k:h?");
    if (opt == -1) {
      break;
    }
    if (opt == 'n') {
      count = atoi(optarg);
      continue;
    }
    if (opt == 'k') {
      kernel_file = optarg;
      continue;
    }
    // Everything else ends the program after showing the usage text; only
    // the exit status differs.
    show_usage();
    exit((opt == 'h' || opt == '?') ? 0 : -1);
  }
}
// Closes the device connection if one was opened. The per-buffer vx_mem_free
// calls stay disabled: main() in this file assigns fixed device addresses
// instead of obtaining them from vx_mem_alloc.
void cleanup() {
  if (!device) {
    return;
  }
  // vx_mem_free(device, kernel_arg.addr_a);
  // vx_mem_free(device, kernel_arg.addr_b);
  // vx_mem_free(device, kernel_arg.addr_c);
  vx_dev_close(device);
}
template <typename T>
void generate_source_matrix(uint32_t dim_m, uint32_t dim_n, uint32_t dim_k) {
static_assert(std::is_same_v<half, T> || std::is_same_v<float, T>,
"unsupported floating point datatype");
src_a_data<T>.resize(dim_m * dim_k);
src_b_data<T>.resize(dim_k * dim_n);
for (uint32_t i = 0; i < src_a_data<T>.size(); ++i) {
if constexpr (std::is_same_v<half, T>) {
src_a_data<T>[i] = half_cast<half>(static_cast<float>(i));
} else if (std::is_same_v<float, T>) {
src_a_data<T>[i] = static_cast<float>(i);
}
std::cout << "A: " << i << ": value=" << src_a_data<T>[i] << std::endl;
}
for (uint32_t i = 0; i < src_b_data<T>.size(); ++i) {
if constexpr (std::is_same_v<half, T>) {
src_b_data<T>[i] = half_cast<half>(static_cast<float>(i));
} else if (std::is_same_v<float, T>) {
src_b_data<T>[i] = static_cast<float>(i);
}
std::cout << "B: " << i << ": value=" << src_b_data<T>[i] << std::endl;
}
}
template <typename T>
void generate_reference_matmul(uint32_t dim_m, uint32_t dim_n, uint32_t dim_k) {
static_assert(std::is_same_v<half, T> || std::is_same_v<float, T>,
"unsupported floating point datatype");
ref_data.resize(dim_m * dim_n);
for (uint32_t i = 0; i < dim_m; ++i) {
for (uint32_t j = 0; j < dim_n; ++j) {
float ref = 0.0f;
for (uint32_t k = 0; k < dim_k; ++k) {
ref += static_cast<float>(src_a_data<T>[dim_k * i + k]) *
static_cast<float>(src_b_data<T>[dim_n * k + j]);
}
ref_data.at(dim_n * i + j) = ref;
}
}
}
// Starts the device, waits for completion, downloads the result matrix from
// `kernel_arg.addr_c` into the global `staging_buf`, and compares it element
// by element (as float) against the host reference in `ref_data`.
//
//   kernel_arg  - kernel arguments; only `addr_c` is read here
//   buf_size    - number of bytes to download from the device
//   dim_m/dim_n - result dimensions; dim_m * dim_n floats are verified
//
// Returns 0 when every element matches and 1 otherwise. A failing runtime
// call does not return: RT_CHECK runs cleanup() and exits the process.
int run_test(const kernel_arg_t& kernel_arg,
             uint32_t buf_size,
             uint32_t dim_m, uint32_t dim_n) {
  // start device
  std::cout << "start device" << std::endl;
  RT_CHECK(vx_start(device));

  // wait for completion
  std::cout << "wait for completion" << std::endl;
  RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));

  // download destination buffer
  std::cout << "download destination buffer" << std::endl;
  RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.addr_c, buf_size));

  // verify result
  std::cout << "verify result" << std::endl;
  {
    int errors = 0;
    auto buf_ptr = reinterpret_cast<const float*>(staging_buf.data());
    for (uint32_t i = 0; i < dim_m * dim_n; ++i) {
      float ref = ref_data.at(i);
      float cur = buf_ptr[i];
      // Exact matches pass outright (this also covers 0 == 0, where the
      // relative error would be 0/0). Otherwise the tolerance test is written
      // as "not within tolerance" instead of "greater than tolerance": every
      // ordered comparison with NaN is false, so the latter would let a NaN
      // result pass silently.
      const bool matches =
          (cur == ref) || (std::abs((cur - ref) / ref) <= 1e-6);
      if (!matches) {
        std::cout << "error at result #" << std::dec << i
                  << std::hex << ": actual=" << cur << ", expected=" << ref << std::endl;
        ++errors;
      }
    }
    if (errors != 0) {
      std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
      std::cout << "FAILED!" << std::endl;
      return 1;
    }
  }
  return 0;
}
// Host driver for the half-precision matmul test: generates the inputs and a
// CPU reference, uploads kernel arguments and both source matrices to fixed
// device addresses (dumping each to a .bin file as well), poisons the
// destination buffer and hands over to run_test() for execution and checking.
// Returns 0 on success; every failure path exits the process.
int main(int argc, char *argv[]) {
  // parse command arguments
  parse_args(argc, argv);
  if (count == 0) {
    count = 1;
  }

  std::srand(50);

  // open device connection
  std::cout << "open device connection" << std::endl;
  RT_CHECK(vx_dev_open(&device));

  // FIXME: hardcoded
  uint32_t dim_m = 64;
  uint32_t dim_n = 64;
  uint32_t dim_k = 64;

  using float_type = half;

  generate_source_matrix<float_type>(dim_m, dim_n, dim_k);
  generate_reference_matmul<float_type>(dim_m, dim_n, dim_k);

  std::cout << "write reference output" << std::endl;
  std::ofstream ref_file("reference.c.bin", std::ios::binary | std::ios::out);
  if (!ref_file) {
    std::cerr << "error: failed to open reference.c.bin for writing\n";
    exit(EXIT_FAILURE);
  }
  ref_file.write(reinterpret_cast<char *>(ref_data.data()), ref_data.size() * sizeof(ref_data[0]));
  ref_file.close();

  uint32_t src_a_buf_size = src_a_data<float_type>.size() * sizeof(src_a_data<float_type>[0]);
  uint32_t src_b_buf_size = src_b_data<float_type>.size() * sizeof(src_b_data<float_type>[0]);
  // Size the destination by the reference element type, not the input type:
  // the clear loop below writes one 32-bit word per reference element into the
  // staging buffer, so sizing with sizeof(float_type) under-allocates whenever
  // the input type is narrower than 4 bytes.
  // NOTE(review): ref_data is declared outside this function - confirm its
  // element type matches what the kernel writes.
  uint32_t dst_buf_size = ref_data.size() * sizeof(ref_data[0]);

  std::cout << "buffer size: " << dst_buf_size << " bytes" << std::endl;

  // upload program
  std::cout << "upload program" << std::endl;
  RT_CHECK(vx_upload_kernel_file(device, kernel_file));

  // allocate device memory
  std::cout << "allocate device memory" << std::endl;
  // RT_CHECK(vx_mem_alloc(device, src_a_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.addr_a));
  // RT_CHECK(vx_mem_alloc(device, src_b_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.addr_b));
  // RT_CHECK(vx_mem_alloc(device, dst_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.addr_c));
  kernel_arg.addr_a = 0xa0000000;
  kernel_arg.addr_b = 0xa1000000;
  kernel_arg.addr_c = 0xc0000000;

  kernel_arg.dim_m = dim_m;
  kernel_arg.dim_n = dim_n;
  kernel_arg.dim_k = dim_k;

  std::cout << "dev_addr_a=0x" << std::hex << kernel_arg.addr_a << std::endl;
  std::cout << "dev_addr_b=0x" << std::hex << kernel_arg.addr_b << std::endl;
  std::cout << "dev_addr_c=0x" << std::hex << kernel_arg.addr_c << std::endl;

  // allocate staging buffer
  {
    std::cout << "allocate staging buffer" << std::endl;
    uint32_t staging_buf_size = std::max<uint32_t>(
        src_a_buf_size,
        std::max<uint32_t>(
            src_b_buf_size,
            std::max<uint32_t>(dst_buf_size, sizeof(kernel_arg_t))));
    staging_buf.resize(staging_buf_size);
  }

  // upload kernel argument
  {
    std::cout << "upload kernel argument" << std::endl;
    auto buf_ptr = staging_buf.data();
    memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
    RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
    std::cout << "uploading argument buffer to device, device mem address="
              << std::hex << KERNEL_ARG_DEV_MEM_ADDR << ", size=" << std::dec
              << sizeof(kernel_arg_t) << " bytes\n";

    std::ofstream file("args.bin", std::ios::binary | std::ios::out);
    if (!file) {
      std::cerr << "error: failed to open args.bin for writing\n";
      exit(EXIT_FAILURE);
    }
    file.write(reinterpret_cast<char *>(staging_buf.data()),
               sizeof(kernel_arg_t));
    file.close();
  }

  // upload source buffer
  {
    {
      auto buf_ptr = staging_buf.data();
      memcpy(buf_ptr, src_a_data<float_type>.data(),
             src_a_data<float_type>.size() * sizeof(float_type));
      RT_CHECK(vx_copy_to_dev(device, kernel_arg.addr_a, staging_buf.data(),
                              src_a_buf_size));
      std::cout << "uploading source A matrix to device, device mem address="
                << std::hex << kernel_arg.addr_a << ", size=" << std::dec
                << src_a_buf_size << " bytes\n";

      std::ofstream file("input.a.bin", std::ios::binary | std::ios::out);
      if (!file) {
        // report the file that actually failed (was a copy-pasted "args.bin")
        std::cerr << "error: failed to open input.a.bin for writing\n";
        exit(EXIT_FAILURE);
      }
      file.write(reinterpret_cast<char *>(buf_ptr), src_a_buf_size);
      file.close();
    }
    {
      auto buf_ptr = staging_buf.data();
      memcpy(buf_ptr, src_b_data<float_type>.data(),
             src_b_data<float_type>.size() * sizeof(float_type));
      RT_CHECK(vx_copy_to_dev(device, kernel_arg.addr_b, staging_buf.data(),
                              src_b_buf_size));
      std::cout << "uploading source B matrix to device, device mem address="
                << std::hex << kernel_arg.addr_b << ", size=" << std::dec
                << src_b_buf_size << " bytes\n";

      std::ofstream file("input.b.bin", std::ios::binary | std::ios::out);
      if (!file) {
        // report the file that actually failed (was a copy-pasted "args.bin")
        std::cerr << "error: failed to open input.b.bin for writing\n";
        exit(EXIT_FAILURE);
      }
      file.write(reinterpret_cast<char *>(buf_ptr), src_b_buf_size);
      file.close();
    }
  }

  // clear destination buffer
  {
    std::cout << "clear destination buffer" << std::endl;
    auto buf_ptr = (int32_t*)staging_buf.data();
    // poison value so untouched outputs are easy to spot
    for (uint32_t i = 0; i < ref_data.size(); ++i) {
      buf_ptr[i] = 0xdeadbeef;
    }
    RT_CHECK(vx_copy_to_dev(device, kernel_arg.addr_c, staging_buf.data(), dst_buf_size));
  }

  // run tests
  std::cout << "run tests" << std::endl;
  RT_CHECK(run_test(kernel_arg, dst_buf_size, kernel_arg.dim_m, kernel_arg.dim_n));
  std::cout << "PASSED!" << std::endl;

  // cleanup
  std::cout << "cleanup" << std::endl;
  cleanup();

  return 0;
}

View File

@@ -1,7 +1,5 @@
PROJECT = sgemm_wg
SRCS = main.cpp common.h
VX_SRCS = kernel.cpp
OPTS ?= -n16

View File

@@ -1,292 +0,0 @@
#include <iostream>
#include <fstream>
#include <unistd.h>
#include <string.h>
#include <vortex.h>
#include <vector>
#include "common.h"
// Evaluates _expr once and treats a zero result as success. On any non-zero
// result it prints the failing expression text and the returned code, calls
// cleanup(), and terminates the process with exit(-1).
#define RT_CHECK(_expr) \
do { \
int _ret = _expr; \
if (0 == _ret) \
break; \
printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \
cleanup(); \
exit(-1); \
} while (false)
///////////////////////////////////////////////////////////////////////////////
// Path of the kernel binary passed to vx_upload_kernel_file(); replaced by -k.
const char* kernel_file = "kernel.bin";
// Value of the -n option; main() raises 0 to 1.
uint32_t count = 0;
// Host copies of the A and B input matrices, filled by generate_source_matrix().
std::vector<float> src_a_data;
std::vector<float> src_b_data;
// CPU reference product, filled by generate_reference_matmul().
std::vector<float> ref_data;
// Device handle set by vx_dev_open() in main(); cleanup() does nothing while it is null.
vx_device_h device = nullptr;
// Host-side scratch buffer that every host<->device copy goes through.
std::vector<uint8_t> staging_buf;
// Kernel arguments; main() fills them in and copies them to KERNEL_ARG_DEV_MEM_ADDR.
kernel_arg_t kernel_arg = {};
// Prints the program banner and the accepted command-line options to stdout.
static void show_usage() {
  static const char* const kUsageLines[] = {
      "Vortex Test.",
      "Usage: [-k: kernel] [-n words] [-h: help]",
  };
  for (const char* line : kUsageLines) {
    std::cout << line << std::endl;
  }
}
// Parses the command line with getopt():
//   -n <value>  stored in `count` (converted with atoi)
//   -k <path>   stored in `kernel_file`
//   -h or '?'   prints usage and exits with status 0
// Any other value returned by getopt prints usage and exits with status -1.
static void parse_args(int argc, char **argv) {
  for (;;) {
    const int opt = getopt(argc, argv, "n:k:h?");
    if (opt == -1) {
      return;
    }
    if (opt == 'n') {
      count = atoi(optarg);
    } else if (opt == 'k') {
      kernel_file = optarg;
    } else {
      const bool help_requested = (opt == 'h') || (opt == '?');
      show_usage();
      exit(help_requested ? 0 : -1);
    }
  }
}
// Closes the device connection if one is open; does nothing otherwise.
void cleanup() {
  if (!device) {
    return;
  }
  // The vx_mem_free calls stay disabled, matching the disabled vx_mem_alloc
  // calls in main(), which assigns fixed device addresses instead.
  // vx_mem_free(device, kernel_arg.addr_a);
  // vx_mem_free(device, kernel_arg.addr_b);
  // vx_mem_free(device, kernel_arg.addr_c);
  vx_dev_close(device);
}
// Sizes src_a_data to dim_m * dim_k and src_b_data to dim_k * dim_n, sets
// every element to its own linear index (as a float), and logs each value.
void generate_source_matrix(uint32_t dim_m, uint32_t dim_n, uint32_t dim_k) {
  src_a_data.resize(dim_m * dim_k);
  src_b_data.resize(dim_k * dim_n);

  uint32_t a_index = 0;
  for (float& element : src_a_data) {
    element = static_cast<float>(a_index);
    std::cout << "A: " << a_index << ": value=" << element << std::endl;
    ++a_index;
  }

  uint32_t b_index = 0;
  for (float& element : src_b_data) {
    element = static_cast<float>(b_index);
    std::cout << "B: " << b_index << ": value=" << element << std::endl;
    ++b_index;
  }
}
// Computes the CPU reference product into ref_data (dim_m x dim_n, row-major)
// from src_a_data (dim_m x dim_k) and src_b_data (dim_k x dim_n), summing the
// k-products in increasing k order in single precision.
void generate_reference_matmul(uint32_t dim_m, uint32_t dim_n, uint32_t dim_k) {
  ref_data.resize(dim_m * dim_n);
  for (uint32_t row = 0; row < dim_m; ++row) {
    const uint32_t a_row_base = dim_k * row;
    const uint32_t c_row_base = dim_n * row;
    for (uint32_t col = 0; col < dim_n; ++col) {
      float acc = 0.0f;
      for (uint32_t inner = 0; inner < dim_k; ++inner) {
        acc += src_a_data[a_row_base + inner] * src_b_data[dim_n * inner + col];
      }
      ref_data.at(c_row_base + col) = acc;
    }
  }
}
// Runs the uploaded kernel and checks its output.
//
// Starts the device, waits for completion, downloads buf_size bytes from
// kernel_arg.addr_c into staging_buf, dumps the device output and the CPU
// reference to output.c.bin / reference.c.bin, then compares the first
// dim_m * dim_n floats against ref_data with a 1e-6 relative tolerance.
//
// Returns 0 when every element matches, 1 otherwise. Device errors go through
// RT_CHECK (cleanup + exit); file-open failures exit with EXIT_FAILURE.
int run_test(const kernel_arg_t& kernel_arg,
             uint32_t buf_size,
             uint32_t dim_m, uint32_t dim_n) {
  // start device
  std::cout << "start device" << std::endl;
  RT_CHECK(vx_start(device));

  // wait for completion
  std::cout << "wait for completion" << std::endl;
  RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));

  // download destination buffer
  std::cout << "download destination buffer" << std::endl;
  RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.addr_c, buf_size));
  std::cout << "downloading result C matrix from device, device mem address="
            << std::hex << kernel_arg.addr_c << ", size=" << std::dec
            << buf_size << " bytes\n";

  std::ofstream file("output.c.bin", std::ios::binary | std::ios::out);
  if (!file) {
    std::cerr << "error: failed to open output.c.bin for writing\n";
    exit(EXIT_FAILURE);
  }
  file.write(reinterpret_cast<char *>(staging_buf.data()), buf_size);
  file.close();

  std::ofstream ref_file("reference.c.bin", std::ios::binary | std::ios::out);
  if (!ref_file) {
    std::cerr << "error: failed to open reference.c.bin for writing\n";
    exit(EXIT_FAILURE);
  }
  ref_file.write(reinterpret_cast<char *>(ref_data.data()), buf_size);
  ref_file.close();

  // verify result
  std::cout << "verify result" << std::endl;
  {
    int errors = 0;
    auto buf_ptr = (float*)staging_buf.data();
    for (uint32_t i = 0; i < dim_m * dim_n; ++i) {
      float ref = ref_data.at(i);
      float cur = buf_ptr[i];
      // Relative-error test written as |cur - ref| <= tol * |ref| and then
      // negated. The previous form, |(cur - ref) / ref| > tol, evaluates to
      // false for NaN, so a NaN device result (or 0/0 when ref == 0) was
      // silently accepted. With this form NaN always counts as an error and a
      // zero reference requires an exactly-zero result.
      const bool within_tolerance = std::abs(cur - ref) <= 1e-6 * std::abs(ref);
      if (!within_tolerance) {
        std::cout << "error at result #" << std::dec << i
                  << std::hex << ": actual=" << cur << ", expected=" << ref << std::endl;
        ++errors;
      }
    }
    if (errors != 0) {
      std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
      std::cout << "FAILED!" << std::endl;
      return 1;
    }
  }

  return 0;
}
// Host driver for the float matmul test: generates inputs and a CPU
// reference, uploads kernel arguments and both source matrices to fixed device
// addresses (dumping each to a .bin file), poisons the destination buffer and
// calls run_test(). Returns 0 on success; every failure path exits the process.
int main(int argc, char *argv[]) {
  // parse command arguments
  parse_args(argc, argv);
  if (count == 0) {
    count = 1;
  }

  std::srand(50);

  // open device connection
  std::cout << "open device connection" << std::endl;
  RT_CHECK(vx_dev_open(&device));

  // FIXME: hardcoded
  uint32_t dim_m = 128;
  uint32_t dim_n = 128;
  uint32_t dim_k = 128;

  generate_source_matrix(dim_m, dim_n, dim_k);
  generate_reference_matmul(dim_m, dim_n, dim_k);

  uint32_t src_a_buf_size = src_a_data.size() * sizeof(src_a_data[0]);
  uint32_t src_b_buf_size = src_b_data.size() * sizeof(src_b_data[0]);
  uint32_t dst_buf_size = ref_data.size() * sizeof(src_a_data[0]);

  std::cout << "buffer size: " << dst_buf_size << " bytes" << std::endl;

  // upload program
  std::cout << "upload program" << std::endl;
  RT_CHECK(vx_upload_kernel_file(device, kernel_file));

  // allocate device memory
  std::cout << "allocate device memory" << std::endl;
  // RT_CHECK(vx_mem_alloc(device, src_a_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.addr_a));
  // RT_CHECK(vx_mem_alloc(device, src_b_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.addr_b));
  // RT_CHECK(vx_mem_alloc(device, dst_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.addr_c));
  {
    // A and B were hardcoded 0x8000 bytes apart, but with the dimensions above
    // A occupies 0x10000 bytes, so uploading B overwrote the upper half of A.
    // Keep B at its original address whenever A fits below it; otherwise move
    // B to the next 0x8000-aligned address past the end of A.
    // NOTE(review): confirm nothing else depends on B living at 0x28000.
    const uint64_t addr_a = 0x20000UL;
    const uint64_t addr_b_default = 0x28000UL;
    const uint64_t a_end = addr_a + src_a_buf_size;
    const uint64_t addr_b_after_a = (a_end + 0x7fffUL) & ~uint64_t{0x7fff};
    kernel_arg.addr_a = addr_a;
    kernel_arg.addr_b = std::max<uint64_t>(addr_b_default, addr_b_after_a);
  }
  kernel_arg.addr_c = 0xc0000000UL;

  kernel_arg.dim_m = dim_m;
  kernel_arg.dim_n = dim_n;
  kernel_arg.dim_k = dim_k;

  std::cout << "dev_addr_a=0x" << std::hex << kernel_arg.addr_a << std::endl;
  std::cout << "dev_addr_b=0x" << std::hex << kernel_arg.addr_b << std::endl;
  std::cout << "dev_addr_c=0x" << std::hex << kernel_arg.addr_c << std::endl;

  // allocate staging buffer
  {
    std::cout << "allocate staging buffer" << std::endl;
    uint32_t staging_buf_size = std::max<uint32_t>(
        src_a_buf_size,
        std::max<uint32_t>(
            src_b_buf_size,
            std::max<uint32_t>(dst_buf_size, sizeof(kernel_arg_t))));
    staging_buf.resize(staging_buf_size);
  }

  // upload kernel argument
  {
    std::cout << "upload kernel argument" << std::endl;
    auto buf_ptr = staging_buf.data();
    memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
    RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
    std::cout << "uploading argument buffer to device, device mem address="
              << std::hex << KERNEL_ARG_DEV_MEM_ADDR << ", size=" << std::dec
              << sizeof(kernel_arg_t) << " bytes\n";

    std::ofstream file("args.bin", std::ios::binary | std::ios::out);
    if (!file) {
      std::cerr << "error: failed to open args.bin for writing\n";
      exit(EXIT_FAILURE);
    }
    file.write(reinterpret_cast<char *>(staging_buf.data()),
               sizeof(kernel_arg_t));
    file.close();
  }

  // upload source buffer
  {
    {
      auto buf_ptr = staging_buf.data();
      memcpy(buf_ptr, src_a_data.data(), src_a_data.size() * sizeof(float));
      RT_CHECK(vx_copy_to_dev(device, kernel_arg.addr_a, staging_buf.data(),
                              src_a_buf_size));
      std::cout << "uploading source A matrix to device, device mem address="
                << std::hex << kernel_arg.addr_a << ", size=" << std::dec
                << src_a_buf_size << " bytes\n";

      std::ofstream file("input.a.bin", std::ios::binary | std::ios::out);
      if (!file) {
        std::cerr << "error: failed to open input.a.bin for writing\n";
        exit(EXIT_FAILURE);
      }
      file.write(reinterpret_cast<char *>(buf_ptr), src_a_buf_size);
      file.close();
    }
    {
      auto buf_ptr = staging_buf.data();
      memcpy(buf_ptr, src_b_data.data(), src_b_data.size() * sizeof(float));
      RT_CHECK(vx_copy_to_dev(device, kernel_arg.addr_b, staging_buf.data(),
                              src_b_buf_size));
      std::cout << "uploading source B matrix to device, device mem address="
                << std::hex << kernel_arg.addr_b << ", size=" << std::dec
                << src_b_buf_size << " bytes\n";

      std::ofstream file("input.b.bin", std::ios::binary | std::ios::out);
      if (!file) {
        std::cerr << "error: failed to open input.b.bin for writing\n";
        exit(EXIT_FAILURE);
      }
      file.write(reinterpret_cast<char *>(buf_ptr), src_b_buf_size);
      file.close();
    }
  }

  // clear destination buffer
  {
    std::cout << "clear destination buffer" << std::endl;
    auto buf_ptr = (int32_t*)staging_buf.data();
    // poison value so untouched outputs are easy to spot
    for (uint32_t i = 0; i < ref_data.size(); ++i) {
      buf_ptr[i] = 0xdeadbeef;
    }
    RT_CHECK(vx_copy_to_dev(device, kernel_arg.addr_c, staging_buf.data(), dst_buf_size));
  }

  // run tests
  std::cout << "run tests" << std::endl;
  RT_CHECK(run_test(kernel_arg, dst_buf_size, kernel_arg.dim_m, kernel_arg.dim_n));
  std::cout << "PASSED!" << std::endl;

  // cleanup
  std::cout << "cleanup" << std::endl;
  cleanup();

  return 0;
}

View File

@@ -1,9 +0,0 @@
PROJECT = sgemmx
SRCS = main.cpp
VX_SRCS = kernel.cpp
OPTS ?= -n32
include ../common.mk

View File

@@ -1,18 +0,0 @@
#ifndef _COMMON_H_
#define _COMMON_H_

// The fixed-width integer types used below come from here, so this header no
// longer relies on the including file having pulled them in first.
#include <stdint.h>

// Device address at which the host writes kernel_arg_t and from which the
// kernel's main() reads it.
#define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000

// Matrix element type; may be overridden at compile time with -DTYPE=...
#ifndef TYPE
#define TYPE float
#endif

// Arguments shared between the host program and the device kernel.
typedef struct {
  uint32_t num_tasks;  // number of tasks to spawn (host sets size * size)
  uint32_t size;       // width/height of the square matrices
  uint64_t A_addr;     // device address of input matrix A
  uint64_t B_addr;     // device address of input matrix B
  uint64_t C_addr;     // device address of output matrix C
} kernel_arg_t;

#endif

View File

@@ -1,41 +0,0 @@
#include <stdint.h>
#include <vx_intrinsics.h>
#include <vx_spawn.h>
#include "common.h"
// Returns non-zero when x is an exact power of two (exactly one bit set).
// Zero is rejected explicitly: (0 & (0 - 1)) == 0 would otherwise report it as
// a power of two and send callers into log2_fast(0), whose __builtin_clz(0) is
// undefined.
inline char is_log2(uint32_t x) {
  return (x != 0) && ((x & (x - 1)) == 0);
}
// Returns the index of the highest set bit of x, i.e. floor(log2(x)).
// x must be non-zero (__builtin_clz is undefined for 0).
inline uint32_t log2_fast(uint32_t x) {
  const int leading_zeros = __builtin_clz(x);
  const int top_bit_index = 31 - leading_zeros;
  return top_bit_index;
}
// Computes one element of C = A * B for square size x size row-major matrices.
// task_id is the linear index of the output element; it is split into
// (row, col) with shift/mask when size is a power of two and with
// divide/modulo otherwise.
void kernel_body(uint32_t task_id, kernel_arg_t* __UNIFORM__ arg) {
  auto A = reinterpret_cast<TYPE*>(arg->A_addr);
  auto B = reinterpret_cast<TYPE*>(arg->B_addr);
  auto C = reinterpret_cast<TYPE*>(arg->C_addr);
  auto size = arg->size;

  uint32_t row, col;
  if (is_log2(size)) {
    uint32_t log_size = log2_fast(size);
    row = task_id >> log_size;
    col = task_id & (size - 1);
  } else {
    row = task_id / size;
    col = task_id % size;
  }

  TYPE sum(0);
  // Unsigned index: the previous `int e` was compared against the unsigned
  // size (implicit sign conversion) and would overflow for size > INT_MAX.
  for (uint32_t e = 0; e < size; ++e) {
    sum += A[row * size + e] * B[e * size + col];
  }

  C[row * size + col] = sum;
}
int main() {
kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
vx_spawn_tasks(arg->num_tasks, (vx_spawn_tasks_cb)kernel_body, arg);
return 0;
}

View File

@@ -1,251 +0,0 @@
#include <iostream>
#include <unistd.h>
#include <string.h>
#include <vector>
#include <chrono>
#include <vortex.h>
#include "common.h"
// Largest difference, in units in the last place, at which two floats are
// still treated as equal by Comparator<float>::compare.
#define FLOAT_ULP 6

// Evaluates _expr once and treats a zero result as success. On any non-zero
// result it prints the failing expression text and the returned code, calls
// cleanup(), and terminates the process with exit(-1).
#define RT_CHECK(_expr) \
  do { \
    int _ret = _expr; \
    if (0 == _ret) \
      break; \
    printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \
    cleanup(); \
    exit(-1); \
  } while (false)

///////////////////////////////////////////////////////////////////////////////

// Per-element-type helpers for naming, generating and comparing test values.
// Only the int and float specializations below carry any members.
template <typename Type>
class Comparator {};

template <>
class Comparator<int> {
public:
  // Human-readable name of the element type.
  static const char* type_str() {
    return "integer";
  }
  // Returns the next rand() value.
  static int generate() {
    return rand();
  }
  // Returns true when a == b. The caller passes a = device result and
  // b = reference, so a mismatch is reported with b as "expected" and a as
  // "actual" (the labels used to be the wrong way round). Only the first 100
  // mismatches (errors < 100) are printed.
  static bool compare(int a, int b, int index, int errors) {
    if (a != b) {
      if (errors < 100) {
        printf("*** error: [%d] expected=%d, actual=%d\n", index, b, a);
      }
      return false;
    }
    return true;
  }
};

template <>
class Comparator<float> {
public:
  // Human-readable name of the element type.
  static const char* type_str() {
    return "float";
  }
  // Returns a value in [0, 1]. The return type used to be int, which
  // truncated every sample to 0 (or 1).
  static float generate() {
    return static_cast<float>(rand()) / RAND_MAX;
  }
  // Returns true when a and b are numerically equal or their bit patterns
  // differ by at most FLOAT_ULP. The caller passes a = device result and
  // b = reference; mismatches are reported accordingly, and only the first
  // 100 (errors < 100) are printed.
  static bool compare(float a, float b, int index, int errors) {
    // Numerically equal values (including +0 vs -0) always match.
    if (a == b) {
      return true;
    }
    // memcpy instead of a union: reading the inactive union member is
    // undefined behaviour in C++.
    int32_t bits_a;
    int32_t bits_b;
    memcpy(&bits_a, &a, sizeof(bits_a));
    memcpy(&bits_b, &b, sizeof(bits_b));
    // Widen before subtracting: operands of opposite sign overflow int32_t.
    const int64_t delta = static_cast<int64_t>(bits_a) - static_cast<int64_t>(bits_b);
    const int64_t distance = (delta < 0) ? -delta : delta;
    if (distance > FLOAT_ULP) {
      if (errors < 100) {
        printf("*** error: [%d] expected=%f, actual=%f\n", index, b, a);
      }
      return false;
    }
    return true;
  }
};
// CPU reference: out = A * B for row-major matrices, where A and out have
// `height` rows of `width` elements and B is indexed as width x width.
// Products are accumulated in increasing inner-index order.
static void matmul_cpu(TYPE* out, const TYPE* A, const TYPE* B, uint32_t width, uint32_t height) {
  for (uint32_t r = 0; r < height; ++r) {
    const uint32_t row_base = r * width;
    for (uint32_t c = 0; c < width; ++c) {
      TYPE acc(0);
      for (uint32_t inner = 0; inner < width; ++inner) {
        acc += A[row_base + inner] * B[inner * width + c];
      }
      out[row_base + c] = acc;
    }
  }
}
// Path of the kernel binary passed to vx_upload_kernel_file(); replaced by -k.
const char* kernel_file = "kernel.bin";
// Matrix width/height (the matrices are size x size); replaced by -n.
uint32_t size = 32;
// Device handle set by vx_dev_open() in main(); cleanup() does nothing while it is null.
vx_device_h device = nullptr;
// Host-side scratch buffer that every host<->device copy goes through.
std::vector<uint8_t> staging_buf;
// Kernel arguments; main() fills them in and copies them to KERNEL_ARG_DEV_MEM_ADDR.
kernel_arg_t kernel_arg = {};
// Prints the program banner and the accepted command-line options to stdout.
static void show_usage() {
  static const char* const kUsageLines[] = {
      "Vortex Test.",
      "Usage: [-k: kernel] [-n size] [-h: help]",
  };
  for (const char* line : kUsageLines) {
    std::cout << line << std::endl;
  }
}
// Parses the command line with getopt():
//   -n <value>  stored in `size` (converted with atoi)
//   -k <path>   stored in `kernel_file`
//   -h or '?'   prints usage and exits with status 0
// Any other value returned by getopt prints usage and exits with status -1.
static void parse_args(int argc, char **argv) {
  for (;;) {
    const int opt = getopt(argc, argv, "n:k:h?");
    if (opt == -1) {
      return;
    }
    if (opt == 'n') {
      size = atoi(optarg);
    } else if (opt == 'k') {
      kernel_file = optarg;
    } else {
      const bool help_requested = (opt == 'h') || (opt == '?');
      show_usage();
      exit(help_requested ? 0 : -1);
    }
  }
}
// Frees the A, B and C device buffers (in that order) and closes the device.
// Does nothing when no device is open.
void cleanup() {
  if (!device) {
    return;
  }
  const uint64_t buffers[] = {kernel_arg.A_addr, kernel_arg.B_addr, kernel_arg.C_addr};
  for (const uint64_t addr : buffers) {
    vx_mem_free(device, addr);
  }
  vx_dev_close(device);
}
// Host driver for the size x size matmul test: allocates device buffers,
// uploads kernel arguments and random inputs, runs and times the kernel, then
// compares the device output against matmul_cpu().
// Returns 0 on success and 1 when any element mismatches; device API failures
// exit through RT_CHECK.
int main(int argc, char *argv[]) {
  // parse command arguments
  parse_args(argc, argv);

  std::srand(50);

  // open device connection
  std::cout << "open device connection" << std::endl;
  RT_CHECK(vx_dev_open(&device));

  uint32_t num_points = size * size;
  uint32_t buf_size = num_points * sizeof(TYPE);

  std::cout << "data type: " << Comparator<TYPE>::type_str() << std::endl;
  std::cout << "matrix size: " << size << "x" << size << std::endl;
  std::cout << "buffer size: " << buf_size << " bytes" << std::endl;

  // upload program
  std::cout << "upload program" << std::endl;
  RT_CHECK(vx_upload_kernel_file(device, kernel_file));

  // allocate device memory
  std::cout << "allocate device memory" << std::endl;
  RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.A_addr));
  RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.B_addr));
  RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.C_addr));

  kernel_arg.num_tasks = num_points;
  kernel_arg.size = size;

  std::cout << "dev_src0=0x" << std::hex << kernel_arg.A_addr << std::endl;
  std::cout << "dev_src1=0x" << std::hex << kernel_arg.B_addr << std::endl;
  std::cout << "dev_dst=0x" << std::hex << kernel_arg.C_addr << std::endl;

  // allocate staging buffer
  std::cout << "allocate staging buffer" << std::endl;
  uint32_t alloc_size = std::max<uint32_t>(buf_size, sizeof(kernel_arg_t));
  staging_buf.resize(alloc_size);

  // upload kernel argument
  std::cout << "upload kernel argument" << std::endl;
  memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t));
  RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));

  // generate source data
  std::vector<TYPE> src_A(num_points);
  std::vector<TYPE> src_B(num_points);
  std::vector<TYPE> refs(num_points);
  for (uint32_t i = 0; i < num_points; ++i) {
    auto a = static_cast<float>(std::rand()) / RAND_MAX;
    auto b = static_cast<float>(std::rand()) / RAND_MAX;
    src_A[i] = static_cast<TYPE>(a * size);
    src_B[i] = static_cast<TYPE>(b * size);
  }
  matmul_cpu(refs.data(), src_A.data(), src_B.data(), size, size);

  // upload source buffer0
  {
    std::cout << "upload source buffer0" << std::endl;
    auto buf_ptr = (TYPE*)staging_buf.data();
    for (uint32_t i = 0; i < num_points; ++i) {
      buf_ptr[i] = src_A[i];
    }
    RT_CHECK(vx_copy_to_dev(device, kernel_arg.A_addr, staging_buf.data(), buf_size));
  }

  // upload source buffer1
  {
    std::cout << "upload source buffer1" << std::endl;
    auto buf_ptr = (TYPE*)staging_buf.data();
    for (uint32_t i = 0; i < num_points; ++i) {
      buf_ptr[i] = src_B[i];
    }
    RT_CHECK(vx_copy_to_dev(device, kernel_arg.B_addr, staging_buf.data(), buf_size));
  }

  // clear destination buffer
  std::cout << "clear destination buffer" << std::endl;
  memset(staging_buf.data(), 0, num_points * sizeof(TYPE));
  RT_CHECK(vx_copy_to_dev(device, kernel_arg.C_addr, staging_buf.data(), buf_size));

  auto time_start = std::chrono::high_resolution_clock::now();

  // start device
  std::cout << "start device" << std::endl;
  RT_CHECK(vx_start(device));

  // wait for completion
  std::cout << "wait for completion" << std::endl;
  RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));

  auto time_end = std::chrono::high_resolution_clock::now();
  double elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(time_end - time_start).count();
  printf("Elapsed time: %lg ms\n", elapsed);

  // download destination buffer
  std::cout << "download destination buffer" << std::endl;
  RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.C_addr, buf_size));

  // verify result
  std::cout << "verify result" << std::endl;
  {
    int errors = 0;
    auto buf_ptr = (TYPE*)staging_buf.data();
    for (uint32_t i = 0; i < refs.size(); ++i) {
      auto ref = refs[i];
      auto cur = buf_ptr[i];
      if (!Comparator<TYPE>::compare(cur, ref, i, errors)) {
        ++errors;
      }
    }
    if (errors != 0) {
      std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
      std::cout << "FAILED!" << std::endl;
      // Release the device buffers and close the device on the failure path
      // too; previously this return skipped cleanup().
      cleanup();
      return 1;
    }
  }

  // cleanup
  std::cout << "cleanup" << std::endl;
  cleanup();

  std::cout << "PASSED!" << std::endl;

  return 0;
}

View File

@@ -1,9 +0,0 @@
PROJECT = sort
SRCS = main.cpp
VX_SRCS = kernel.cpp
OPTS ?= -n16
include ../common.mk

View File

@@ -1,16 +0,0 @@
#ifndef _COMMON_H_
#define _COMMON_H_

// The fixed-width integer types used below come from here, so this header no
// longer relies on the including file having pulled them in first.
#include <stdint.h>

// Device address at which the host writes kernel_arg_t and from which the
// kernel's main() reads it.
#define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000

// Element type being sorted; may be overridden at compile time with -DTYPE=...
#ifndef TYPE
#define TYPE int
#endif

// Arguments shared between the host program and the device kernel.
typedef struct {
  uint32_t num_points;  // number of elements (also the number of tasks spawned)
  uint64_t src_addr;    // device address of the input array
  uint64_t dst_addr;    // device address of the output array
} kernel_arg_t;

#endif

View File

@@ -1,25 +0,0 @@
#include <stdint.h>
#include <vx_intrinsics.h>
#include <vx_spawn.h>
#include "common.h"
void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) {
uint32_t num_points = arg->num_points;
auto src_ptr = (TYPE*)arg->src_addr;
auto dst_ptr = (TYPE*)arg->dst_addr;
auto ref_value = src_ptr[task_id];
uint32_t pos = 0;
for (uint32_t i = 0; i < num_points; ++i) {
auto cur_value = src_ptr[i];
pos += (cur_value < ref_value) || ((cur_value == ref_value) && (i < task_id));
}
dst_ptr[pos] = ref_value;
}
int main() {
kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
vx_spawn_tasks(arg->num_points, (vx_spawn_tasks_cb)kernel_body, arg);
return 0;
}

View File

@@ -1,214 +0,0 @@
#include <iostream>
#include <unistd.h>
#include <string.h>
#include <vortex.h>
#include <vector>
#include "common.h"
// Evaluates _expr once and treats a zero result as success. On any non-zero
// result it prints the failing expression text and the returned code, calls
// cleanup(), and terminates the process with exit(-1).
#define RT_CHECK(_expr) \
do { \
int _ret = _expr; \
if (0 == _ret) \
break; \
printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \
cleanup(); \
exit(-1); \
} while (false)
///////////////////////////////////////////////////////////////////////////////
// Path of the kernel binary passed to vx_upload_kernel_file(); replaced by -k.
const char* kernel_file = "kernel.bin";
// Number of elements to sort, from -n; main() raises 0 to 1.
uint32_t count = 0;
// Unsorted input values, filled by gen_input_data().
std::vector<TYPE> src_data;
// Expected sorted output, filled by gen_ref_data().
std::vector<TYPE> ref_data;
// Device handle set by vx_dev_open() in main(); cleanup() does nothing while it is null.
vx_device_h device = nullptr;
// Host-side scratch buffer that every host<->device copy goes through.
std::vector<uint8_t> staging_buf;
// Kernel arguments; main() fills them in and copies them to KERNEL_ARG_DEV_MEM_ADDR.
kernel_arg_t kernel_arg = {};
// Prints the program banner and the accepted command-line options to stdout.
static void show_usage() {
  const char* const banner = "Vortex Test.";
  const char* const options = "Usage: [-k: kernel] [-n words] [-h: help]";
  std::cout << banner << std::endl;
  std::cout << options << std::endl;
}
// Parses the command line with getopt():
//   -n <value>  stored in `count` (converted with atoi)
//   -k <path>   stored in `kernel_file`
//   -h or '?'   prints usage and exits with status 0
// Any other value returned by getopt prints usage and exits with status -1.
static void parse_args(int argc, char **argv) {
  int opt = getopt(argc, argv, "n:k:h?");
  while (opt != -1) {
    if (opt == 'n') {
      count = atoi(optarg);
    } else if (opt == 'k') {
      kernel_file = optarg;
    } else if (opt == 'h' || opt == '?') {
      show_usage();
      exit(0);
    } else {
      show_usage();
      exit(-1);
    }
    opt = getopt(argc, argv, "n:k:h?");
  }
}
// Frees the source and destination device buffers (in that order) and closes
// the device. Does nothing when no device is open.
void cleanup() {
  if (!device) {
    return;
  }
  const uint64_t buffers[] = {kernel_arg.src_addr, kernel_arg.dst_addr};
  for (const uint64_t addr : buffers) {
    vx_mem_free(device, addr);
  }
  vx_dev_close(device);
}
void gen_input_data(uint32_t num_points) {
src_data.resize(num_points);
for (uint32_t i = 0; i < num_points; ++i) {
auto r = static_cast<float>(std::rand()) / RAND_MAX;
auto value = static_cast<TYPE>(r * num_points);
src_data[i] = value;
std::cout << std::dec << i << ": value=" << value << std::endl;
}
}
void gen_ref_data(uint32_t num_points) {
ref_data.resize(num_points);
for (uint32_t i = 0; i < num_points; ++i) {
TYPE ref_value = src_data.at(i);
uint32_t pos = 0;
for (uint32_t j = 0; j < num_points; ++j) {
TYPE cur_value = src_data.at(j);
pos += (cur_value < ref_value) || (cur_value == ref_value && j < i);
}
ref_data.at(pos) = ref_value;
}
}
int run_test(const kernel_arg_t& kernel_arg,
uint32_t buf_size,
uint32_t num_points) {
// start device
std::cout << "start device" << std::endl;
RT_CHECK(vx_start(device));
// wait for completion
std::cout << "wait for completion" << std::endl;
RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
// download destination buffer
std::cout << "download destination buffer" << std::endl;
RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.dst_addr, buf_size));
// verify result
std::cout << "verify result" << std::endl;
{
int errors = 0;
auto buf_ptr = (TYPE*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
TYPE ref = ref_data.at(i);
TYPE cur = buf_ptr[i];
if (cur != ref) {
std::cout << "error at result #" << std::dec << i
<< std::hex << ": actual=" << cur << ", expected=" << ref << std::endl;
++errors;
}
}
if (errors != 0) {
std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
std::cout << "FAILED!" << std::endl;
return 1;
}
}
return 0;
}
// Host driver for the sort test: generates `count` input values and the
// expected sorted output, allocates device buffers, uploads kernel arguments
// and input, poisons the destination buffer and calls run_test().
// Returns 0 on success; every failure path exits through RT_CHECK.
int main(int argc, char *argv[]) {
  // parse command arguments
  parse_args(argc, argv);

  if (count == 0) {
    count = 1;
  }

  std::srand(50);

  // open device connection
  std::cout << "open device connection" << std::endl;
  RT_CHECK(vx_dev_open(&device));

  uint32_t num_points = count;

  // generate input data
  gen_input_data(num_points);

  // generate reference data
  gen_ref_data(num_points);

  // Size the buffers by the element type actually stored in src_data and
  // ref_data. These used sizeof(int32_t), which only matches when TYPE is a
  // 4-byte type, while the upload below copies num_points * sizeof(TYPE).
  uint32_t src_buf_size = src_data.size() * sizeof(TYPE);
  uint32_t dst_buf_size = ref_data.size() * sizeof(TYPE);

  std::cout << "number of points: " << num_points << std::endl;
  std::cout << "buffer size: " << dst_buf_size << " bytes" << std::endl;

  // upload program
  std::cout << "upload program" << std::endl;
  RT_CHECK(vx_upload_kernel_file(device, kernel_file));

  // allocate device memory
  std::cout << "allocate device memory" << std::endl;
  RT_CHECK(vx_mem_alloc(device, src_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.src_addr));
  RT_CHECK(vx_mem_alloc(device, dst_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.dst_addr));

  kernel_arg.num_points = num_points;

  std::cout << "dev_src=0x" << std::hex << kernel_arg.src_addr << std::endl;
  std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::endl;

  // allocate staging buffer
  {
    std::cout << "allocate staging buffer" << std::endl;
    uint32_t staging_buf_size = std::max<uint32_t>(src_buf_size,
                                  std::max<uint32_t>(dst_buf_size,
                                    sizeof(kernel_arg_t)));
    staging_buf.resize(staging_buf_size);
  }

  // upload kernel argument
  std::cout << "upload kernel argument" << std::endl;
  memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t));
  RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));

  // upload source buffer
  {
    std::cout << "upload source buffer" << std::endl;
    auto buf_ptr = staging_buf.data();
    memcpy(buf_ptr, src_data.data(), num_points * sizeof(TYPE));
    RT_CHECK(vx_copy_to_dev(device, kernel_arg.src_addr, staging_buf.data(), src_buf_size));
  }

  // clear destination buffer
  {
    std::cout << "clear destination buffer" << std::endl;
    auto buf_ptr = (int32_t*)staging_buf.data();
    // Poison only the 32-bit words that fit inside the destination size, so
    // the fill stays within the staging buffer whatever sizeof(TYPE) is.
    const uint32_t num_words = dst_buf_size / sizeof(int32_t);
    for (uint32_t i = 0; i < num_words; ++i) {
      buf_ptr[i] = 0xdeadbeef;
    }
    RT_CHECK(vx_copy_to_dev(device, kernel_arg.dst_addr, staging_buf.data(), dst_buf_size));
  }

  // run tests
  std::cout << "run tests" << std::endl;
  RT_CHECK(run_test(kernel_arg, dst_buf_size, num_points));

  // cleanup
  std::cout << "cleanup" << std::endl;
  cleanup();

  std::cout << "PASSED!" << std::endl;

  return 0;
}

View File

@@ -1,7 +1,5 @@
PROJECT = unaligned
SRCS = main.cpp common.h
VX_SRCS = kernel.cpp
OPTS ?= -n16

View File

@@ -1,92 +0,0 @@
#include <iostream>
#include <fstream>
#include <unistd.h>
#include <string.h>
#include <vortex.h>
#include <vector>
#include "common.h"
// Evaluates _expr once and treats a zero result as success. On any non-zero
// result it prints the failing expression text and the returned code, calls
// cleanup(), and terminates the process with exit(-1).
#define RT_CHECK(_expr) \
do { \
int _ret = _expr; \
if (0 == _ret) \
break; \
printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \
cleanup(); \
exit(-1); \
} while (false)
///////////////////////////////////////////////////////////////////////////////
// Path of the kernel binary passed to vx_upload_kernel_file(); replaced by -k.
const char* kernel_file = "kernel.bin";
// Value of the -n option; main() raises 0 to 1.
uint32_t count = 0;
// Device handle set by vx_dev_open() in main(); cleanup() does nothing while it is null.
vx_device_h device = nullptr;
// Host-side scratch buffer; not referenced by the other code visible in this file.
std::vector<uint8_t> staging_buf;
// Kernel argument block; not referenced by the other code visible in this file.
kernel_arg_t kernel_arg = {};
// Prints the program banner and the accepted command-line options to stdout.
static void show_usage() {
  const auto emit = [](const char* text) { std::cout << text << std::endl; };
  emit("Vortex Test.");
  emit("Usage: [-k: kernel] [-n words] [-h: help]");
}
// Parses the command line with getopt():
//   -n <value>  stored in `count` (converted with atoi)
//   -k <path>   stored in `kernel_file`
//   -h or '?'   prints usage and exits with status 0
// Any other value returned by getopt prints usage and exits with status -1.
static void parse_args(int argc, char **argv) {
  for (int opt = getopt(argc, argv, "n:k:h?"); opt != -1;
       opt = getopt(argc, argv, "n:k:h?")) {
    if (opt == 'n') {
      count = atoi(optarg);
      continue;
    }
    if (opt == 'k') {
      kernel_file = optarg;
      continue;
    }
    const int status = (opt == 'h' || opt == '?') ? 0 : -1;
    show_usage();
    exit(status);
  }
}
// Closes the device connection if one is open; does nothing otherwise.
void cleanup() {
  if (!device) {
    return;
  }
  vx_dev_close(device);
}
// Host driver: opens the device, uploads the kernel binary, runs it to
// completion and closes the device. The result is not verified here; any
// device API failure exits through RT_CHECK.
int main(int argc, char *argv[]) {
  const auto announce = [](const char* step) { std::cout << step << std::endl; };

  // parse command arguments
  parse_args(argc, argv);
  if (0 == count) {
    count = 1;
  }

  std::srand(50);

  announce("open device connection");
  RT_CHECK(vx_dev_open(&device));

  announce("upload program");
  RT_CHECK(vx_upload_kernel_file(device, kernel_file));

  announce("start device");
  RT_CHECK(vx_start(device));

  announce("wait for completion");
  RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));

  announce("cleanup");
  cleanup();

  return 0;
}

Binary file not shown.

View File

@@ -1,9 +1,7 @@
PROJECT = vecaddx
SRCS = main.cpp
VX_SRCS = kernel.cpp
OPTS ?= -n64
include ../common.mk
include ../common.mk

Some files were not shown because too many files have changed in this diff Show More