Vortex 2.0 changes:
+ Microarchitecture optimizations + 64-bit support + Xilinx FPGA support + LLVM-16 support + Refactoring and quality control fixes minor update minor update minor update minor update minor update minor update cleanup cleanup cache bindings and memory perf refactory minor update minor update hw unit tests fixes minor update minor update minor update minor update minor update minor udpate minor update minor update minor update minor update minor update minor update minor update minor updates minor updates minor update minor update minor update minor update minor update minor update minor updates minor updates minor updates minor updates minor update minor update
This commit is contained in:
113
runtime/include/vortex.h
Normal file
113
runtime/include/vortex.h
Normal file
@@ -0,0 +1,113 @@
|
||||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef __VX_VORTEX_H__
|
||||
#define __VX_VORTEX_H__
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
typedef void* vx_device_h;
|
||||
|
||||
// device caps ids
|
||||
#define VX_CAPS_VERSION 0x0
|
||||
#define VX_CAPS_NUM_THREADS 0x1
|
||||
#define VX_CAPS_NUM_WARPS 0x2
|
||||
#define VX_CAPS_NUM_CORES 0x3
|
||||
#define VX_CAPS_CACHE_LINE_SIZE 0x4
|
||||
#define VX_CAPS_GLOBAL_MEM_SIZE 0x5
|
||||
#define VX_CAPS_LOCAL_MEM_SIZE 0x6
|
||||
#define VX_CAPS_KERNEL_BASE_ADDR 0x7
|
||||
#define VX_CAPS_ISA_FLAGS 0x8
|
||||
|
||||
// device isa flags
|
||||
#define VX_ISA_STD_A (1ull << 0)
|
||||
#define VX_ISA_STD_C (1ull << 2)
|
||||
#define VX_ISA_STD_D (1ull << 3)
|
||||
#define VX_ISA_STD_E (1ull << 4)
|
||||
#define VX_ISA_STD_F (1ull << 5)
|
||||
#define VX_ISA_STD_H (1ull << 7)
|
||||
#define VX_ISA_STD_I (1ull << 8)
|
||||
#define VX_ISA_STD_N (1ull << 13)
|
||||
#define VX_ISA_STD_Q (1ull << 16)
|
||||
#define VX_ISA_STD_S (1ull << 18)
|
||||
#define VX_ISA_STD_U (1ull << 20)
|
||||
#define VX_ISA_ARCH(flags) (1 << (((flags >> 30) & 0x3) + 4))
|
||||
#define VX_ISA_EXT_ICACHE (1ull << 32)
|
||||
#define VX_ISA_EXT_DCACHE (1ull << 33)
|
||||
#define VX_ISA_EXT_L2CACHE (1ull << 34)
|
||||
#define VX_ISA_EXT_L3CACHE (1ull << 35)
|
||||
#define VX_ISA_EXT_SMEM (1ull << 36)
|
||||
|
||||
// device memory types
|
||||
#define VX_MEM_TYPE_GLOBAL 0
|
||||
#define VX_MEM_TYPE_LOCAL 1
|
||||
|
||||
// ready wait timeout
|
||||
#define VX_MAX_TIMEOUT (24*60*60*1000) // 24 Hr
|
||||
|
||||
// open the device and connect to it
|
||||
int vx_dev_open(vx_device_h* hdevice);
|
||||
|
||||
// Close the device when all the operations are done
|
||||
int vx_dev_close(vx_device_h hdevice);
|
||||
|
||||
// return device configurations
|
||||
int vx_dev_caps(vx_device_h hdevice, uint32_t caps_id, uint64_t *value);
|
||||
|
||||
// allocate device memory and return address
|
||||
int vx_mem_alloc(vx_device_h hdevice, uint64_t size, int type, uint64_t* dev_addr);
|
||||
|
||||
// release device memory
|
||||
int vx_mem_free(vx_device_h hdevice, uint64_t dev_addr);
|
||||
|
||||
// get device memory info
|
||||
int vx_mem_info(vx_device_h hdevice, int type, uint64_t* mem_free, uint64_t* mem_used);
|
||||
|
||||
// Copy bytes from host to device memory
|
||||
int vx_copy_to_dev(vx_device_h hdevice, uint64_t dev_addr, const void* host_ptr, uint64_t size);
|
||||
|
||||
// Copy bytes from device memory to host
|
||||
int vx_copy_from_dev(vx_device_h hdevice, void* host_ptr, uint64_t dev_addr, uint64_t size);
|
||||
|
||||
// Start device execution
|
||||
int vx_start(vx_device_h hdevice);
|
||||
|
||||
// Wait for device ready with milliseconds timeout
|
||||
int vx_ready_wait(vx_device_h hdevice, uint64_t timeout);
|
||||
|
||||
// write device configuration registers
|
||||
int vx_dcr_write(vx_device_h hdevice, uint32_t addr, uint64_t value);
|
||||
|
||||
////////////////////////////// UTILITY FUNCTIONS //////////////////////////////
|
||||
|
||||
// upload kernel bytes to device
|
||||
int vx_upload_kernel_bytes(vx_device_h hdevice, const void* content, uint64_t size);
|
||||
|
||||
// upload kernel file to device
|
||||
int vx_upload_kernel_file(vx_device_h hdevice, const char* filename);
|
||||
|
||||
// performance counters
|
||||
int vx_dump_perf(vx_device_h hdevice, FILE* stream);
|
||||
int vx_perf_counter(vx_device_h hdevice, int counter, int core_id, uint64_t* value);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif // __VX_VORTEX_H__
|
||||
@@ -1,214 +0,0 @@
|
||||
#ifndef VX_INTRINSICS_H
|
||||
#define VX_INTRINSICS_H
|
||||
|
||||
#include <VX_config.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#ifdef __ASSEMBLY__
|
||||
#define __ASM_STR(x) x
|
||||
#else
|
||||
#define __ASM_STR(x) #x
|
||||
#endif
|
||||
|
||||
#define csr_read(csr) ({ \
|
||||
unsigned __r; \
|
||||
__asm__ __volatile__ ("csrr %0, %1" : "=r" (__r) : "i" (csr)); \
|
||||
__r; \
|
||||
})
|
||||
|
||||
#define csr_write(csr, val) ({ \
|
||||
unsigned __v = (unsigned)(val); \
|
||||
if (__builtin_constant_p(val) && __v < 32) \
|
||||
__asm__ __volatile__ ("csrw %0, %1" :: "i" (csr), "i" (__v)); \
|
||||
else \
|
||||
__asm__ __volatile__ ("csrw %0, %1" :: "i" (csr), "r" (__v)); \
|
||||
})
|
||||
|
||||
#define csr_swap(csr, val) ({ \
|
||||
unsigned __r; \
|
||||
unsigned __v = (unsigned)(val); \
|
||||
if (__builtin_constant_p(val) && __v < 32) \
|
||||
__asm__ __volatile__ ("csrrw %0, %1, %2" : "=r" (__r) : "i" (csr), "i" (__v)); \
|
||||
else \
|
||||
__asm__ __volatile__ ("csrrw %0, %1, %2" : "=r" (__r) : "i" (csr), "r" (__v)); \
|
||||
__r; \
|
||||
})
|
||||
|
||||
#define csr_read_set(csr, val) ({ \
|
||||
unsigned __r; \
|
||||
unsigned __v = (unsigned)(val); \
|
||||
if (__builtin_constant_p(val) && __v < 32) \
|
||||
__asm__ __volatile__ ("csrrs %0, %1, %2" : "=r" (__r) : "i" (csr), "i" (__v)); \
|
||||
else \
|
||||
__asm__ __volatile__ ("csrrs %0, %1, %2" : "=r" (__r) : "i" (csr), "r" (__v)); \
|
||||
__r; \
|
||||
})
|
||||
|
||||
#define csr_set(csr, val) ({ \
|
||||
unsigned __v = (unsigned)(val); \
|
||||
if (__builtin_constant_p(val) && __v < 32) \
|
||||
__asm__ __volatile__ ("csrs %0, %1" :: "i" (csr), "i" (__v)); \
|
||||
else \
|
||||
__asm__ __volatile__ ("csrs %0, %1" :: "i" (csr), "r" (__v)); \
|
||||
})
|
||||
|
||||
#define csr_read_clear(csr, val) ({ \
|
||||
unsigned __r; \
|
||||
unsigned __v = (unsigned)(val); \
|
||||
if (__builtin_constant_p(val) && __v < 32) \
|
||||
__asm__ __volatile__ ("csrrc %0, %1, %2" : "=r" (__r) : "i" (csr), "i" (__v)); \
|
||||
else \
|
||||
__asm__ __volatile__ ("csrrc %0, %1, %2" : "=r" (__r) : "i" (csr), "r" (__v)); \
|
||||
__r; \
|
||||
})
|
||||
|
||||
#define csr_clear(csr, val) ({ \
|
||||
unsigned __v = (unsigned)(val); \
|
||||
if (__builtin_constant_p(val) && __v < 32) \
|
||||
__asm__ __volatile__ ("csrc %0, %1" :: "i" (csr), "i" (__v)); \
|
||||
else \
|
||||
__asm__ __volatile__ ("csrc %0, %1" :: "i" (csr), "r" (__v)); \
|
||||
})
|
||||
|
||||
// Texture load
|
||||
#define vx_tex(unit, u, v, lod) ({ \
|
||||
unsigned __r; \
|
||||
__asm__ __volatile__ (".insn r4 0x5b, 0, %1, %0, %2, %3, %4" : "=r"(__r) : "i"(unit), "r"(u), "r"(v), "r"(lod)); \
|
||||
__r; \
|
||||
})
|
||||
|
||||
// Conditional move
|
||||
#define vx_cmov(c, t, f) ({ \
|
||||
unsigned __r; \
|
||||
__asm__ __volatile__ (".insn r4 0x5b, 1, 0, %0, %1, %2, %3" : "=r"(__r : "r"(c), "r"(t), "r"(f)); \
|
||||
__r; \
|
||||
})
|
||||
|
||||
// Set thread mask
|
||||
inline void vx_tmc(unsigned thread_mask) {
|
||||
asm volatile (".insn s 0x6b, 0, x0, 0(%0)" :: "r"(thread_mask));
|
||||
}
|
||||
|
||||
// Set thread predicate
|
||||
inline void vx_pred(unsigned condition) {
|
||||
asm volatile (".insn s 0x6b, 0, x1, 0(%0)" :: "r"(condition));
|
||||
}
|
||||
|
||||
typedef void (*vx_wspawn_pfn)();
|
||||
|
||||
// Spawn warps
|
||||
inline void vx_wspawn(unsigned num_warps, vx_wspawn_pfn func_ptr) {
|
||||
asm volatile (".insn s 0x6b, 1, %1, 0(%0)" :: "r"(num_warps), "r"(func_ptr));
|
||||
}
|
||||
|
||||
// Split on a predicate
|
||||
inline void vx_split(int predicate) {
|
||||
asm volatile (".insn s 0x6b, 2, x0, 0(%0)" :: "r"(predicate));
|
||||
}
|
||||
|
||||
// Join
|
||||
inline void vx_join() {
|
||||
asm volatile (".insn s 0x6b, 3, x0, 0(x0)");
|
||||
}
|
||||
|
||||
// Warp Barrier
|
||||
inline void vx_barrier(unsigned barried_id, unsigned num_warps) {
|
||||
asm volatile (".insn s 0x6b, 4, %1, 0(%0)" :: "r"(barried_id), "r"(num_warps));
|
||||
}
|
||||
|
||||
// Prefetch
|
||||
inline void vx_prefetch(unsigned addr) {
|
||||
asm volatile (".insn s 0x6b, 5, x0, 0(%0)" :: "r"(addr) );
|
||||
}
|
||||
|
||||
// Return active warp's thread id
|
||||
inline int vx_thread_id() {
|
||||
int result;
|
||||
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(CSR_WTID));
|
||||
return result;
|
||||
}
|
||||
|
||||
// Return active core's local thread id
|
||||
inline int vx_thread_lid() {
|
||||
int result;
|
||||
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(CSR_LTID));
|
||||
return result;
|
||||
}
|
||||
|
||||
// Return processsor global thread id
|
||||
inline int vx_thread_gid() {
|
||||
int result;
|
||||
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(CSR_GTID));
|
||||
return result;
|
||||
}
|
||||
|
||||
// Return active core's local warp id
|
||||
inline int vx_warp_id() {
|
||||
int result;
|
||||
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(CSR_LWID));
|
||||
return result;
|
||||
}
|
||||
|
||||
// Return processsor's global warp id
|
||||
inline int vx_warp_gid() {
|
||||
int result;
|
||||
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(CSR_GWID));
|
||||
return result;
|
||||
}
|
||||
|
||||
// Return processsor core id
|
||||
inline int vx_core_id() {
|
||||
int result;
|
||||
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(CSR_GCID));
|
||||
return result;
|
||||
}
|
||||
|
||||
// Return current threadk mask
|
||||
inline int vx_thread_mask() {
|
||||
int result;
|
||||
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(CSR_TMASK));
|
||||
return result;
|
||||
}
|
||||
|
||||
// Return the number of threads in a warp
|
||||
inline int vx_num_threads() {
|
||||
int result;
|
||||
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(CSR_NT));
|
||||
return result;
|
||||
}
|
||||
|
||||
// Return the number of warps in a core
|
||||
inline int vx_num_warps() {
|
||||
int result;
|
||||
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(CSR_NW));
|
||||
return result;
|
||||
}
|
||||
|
||||
// Return the number of cores in the processsor
|
||||
inline int vx_num_cores() {
|
||||
int result;
|
||||
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(CSR_NC));
|
||||
return result;
|
||||
}
|
||||
|
||||
inline void vx_fence() {
|
||||
asm volatile ("fence iorw, iorw");
|
||||
}
|
||||
|
||||
#define __if(b) vx_split(b); \
|
||||
if (b)
|
||||
|
||||
#define __else else
|
||||
|
||||
#define __endif vx_join();
|
||||
|
||||
#define __DIVERGENT__ __attribute__((annotate("divergent")))
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
@@ -1,21 +0,0 @@
|
||||
#ifndef VX_PRINT_H
|
||||
#define VX_PRINT_H
|
||||
|
||||
#include <stdarg.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
int vx_vprintf(const char* format, va_list va);
|
||||
int vx_printf(const char * format, ...);
|
||||
|
||||
void vx_putchar(int c);
|
||||
void vx_putint(int value, int base);
|
||||
void vx_putfloat(float value, int precision);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
@@ -1,43 +0,0 @@
|
||||
#ifndef VX_API_H
|
||||
#define VX_API_H
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
typedef struct {
|
||||
uint32_t num_groups[3];
|
||||
uint32_t global_offset[3];
|
||||
uint32_t local_size[3];
|
||||
char * printf_buffer;
|
||||
uint32_t *printf_buffer_position;
|
||||
uint32_t printf_buffer_capacity;
|
||||
uint32_t work_dim;
|
||||
} context_t;
|
||||
|
||||
typedef void (*vx_spawn_kernel_cb) (
|
||||
const void * /* arg */,
|
||||
const context_t * /* context */,
|
||||
uint32_t /* group_x */,
|
||||
uint32_t /* group_y */,
|
||||
uint32_t /* group_z */
|
||||
);
|
||||
|
||||
typedef void (*vx_spawn_tasks_cb)(int task_id, void *arg);
|
||||
|
||||
typedef void (*vx_serial_cb)(void *arg);
|
||||
|
||||
void vx_spawn_kernel(context_t * ctx, vx_spawn_kernel_cb callback, void * arg);
|
||||
|
||||
void vx_spawn_tasks(int num_tasks, vx_spawn_tasks_cb callback, void * arg);
|
||||
|
||||
void vx_serial(vx_serial_cb callback, void * arg);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
Reference in New Issue
Block a user