Vortex 2.0 changes:

+ Microarchitecture optimizations
+ 64-bit support
+ Xilinx FPGA support
+ LLVM-16 support
+ Refactoring and quality control fixes

minor update

minor update

minor update

minor update

minor update

minor update

cleanup

cleanup

cache bindings and memory perf refactory

minor update

minor update

hw unit tests fixes

minor update

minor update

minor update

minor update

minor update

minor udpate

minor update

minor update

minor update

minor update

minor update

minor update

minor update

minor updates

minor updates

minor update

minor update

minor update

minor update

minor update

minor update

minor updates

minor updates

minor updates

minor updates

minor update

minor update
This commit is contained in:
Blaise Tine
2023-10-19 20:51:22 -07:00
parent d69a64c32c
commit c1e168fdbe
1309 changed files with 247412 additions and 311463 deletions

113
runtime/include/vortex.h Normal file
View File

@@ -0,0 +1,113 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef __VX_VORTEX_H__
#define __VX_VORTEX_H__
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#ifdef __cplusplus
extern "C" {
#endif
typedef void* vx_device_h;
// device caps ids
#define VX_CAPS_VERSION 0x0
#define VX_CAPS_NUM_THREADS 0x1
#define VX_CAPS_NUM_WARPS 0x2
#define VX_CAPS_NUM_CORES 0x3
#define VX_CAPS_CACHE_LINE_SIZE 0x4
#define VX_CAPS_GLOBAL_MEM_SIZE 0x5
#define VX_CAPS_LOCAL_MEM_SIZE 0x6
#define VX_CAPS_KERNEL_BASE_ADDR 0x7
#define VX_CAPS_ISA_FLAGS 0x8
// device isa flags
#define VX_ISA_STD_A (1ull << 0)
#define VX_ISA_STD_C (1ull << 2)
#define VX_ISA_STD_D (1ull << 3)
#define VX_ISA_STD_E (1ull << 4)
#define VX_ISA_STD_F (1ull << 5)
#define VX_ISA_STD_H (1ull << 7)
#define VX_ISA_STD_I (1ull << 8)
#define VX_ISA_STD_N (1ull << 13)
#define VX_ISA_STD_Q (1ull << 16)
#define VX_ISA_STD_S (1ull << 18)
#define VX_ISA_STD_U (1ull << 20)
#define VX_ISA_ARCH(flags) (1 << (((flags >> 30) & 0x3) + 4))
#define VX_ISA_EXT_ICACHE (1ull << 32)
#define VX_ISA_EXT_DCACHE (1ull << 33)
#define VX_ISA_EXT_L2CACHE (1ull << 34)
#define VX_ISA_EXT_L3CACHE (1ull << 35)
#define VX_ISA_EXT_SMEM (1ull << 36)
// device memory types
#define VX_MEM_TYPE_GLOBAL 0
#define VX_MEM_TYPE_LOCAL 1
// ready wait timeout
#define VX_MAX_TIMEOUT (24*60*60*1000) // 24 Hr
// open the device and connect to it
int vx_dev_open(vx_device_h* hdevice);
// Close the device when all the operations are done
int vx_dev_close(vx_device_h hdevice);
// return device configurations
int vx_dev_caps(vx_device_h hdevice, uint32_t caps_id, uint64_t *value);
// allocate device memory and return address
int vx_mem_alloc(vx_device_h hdevice, uint64_t size, int type, uint64_t* dev_addr);
// release device memory
int vx_mem_free(vx_device_h hdevice, uint64_t dev_addr);
// get device memory info
int vx_mem_info(vx_device_h hdevice, int type, uint64_t* mem_free, uint64_t* mem_used);
// Copy bytes from host to device memory
int vx_copy_to_dev(vx_device_h hdevice, uint64_t dev_addr, const void* host_ptr, uint64_t size);
// Copy bytes from device memory to host
int vx_copy_from_dev(vx_device_h hdevice, void* host_ptr, uint64_t dev_addr, uint64_t size);
// Start device execution
int vx_start(vx_device_h hdevice);
// Wait for device ready with milliseconds timeout
int vx_ready_wait(vx_device_h hdevice, uint64_t timeout);
// write device configuration registers
int vx_dcr_write(vx_device_h hdevice, uint32_t addr, uint64_t value);
////////////////////////////// UTILITY FUNCTIONS //////////////////////////////
// upload kernel bytes to device
int vx_upload_kernel_bytes(vx_device_h hdevice, const void* content, uint64_t size);
// upload kernel file to device
int vx_upload_kernel_file(vx_device_h hdevice, const char* filename);
// performance counters
int vx_dump_perf(vx_device_h hdevice, FILE* stream);
int vx_perf_counter(vx_device_h hdevice, int counter, int core_id, uint64_t* value);
#ifdef __cplusplus
}
#endif
#endif // __VX_VORTEX_H__

View File

@@ -1,214 +0,0 @@
#ifndef VX_INTRINSICS_H
#define VX_INTRINSICS_H
#include <VX_config.h>
#ifdef __cplusplus
extern "C" {
#endif
#ifdef __ASSEMBLY__
#define __ASM_STR(x) x
#else
#define __ASM_STR(x) #x
#endif
#define csr_read(csr) ({ \
unsigned __r; \
__asm__ __volatile__ ("csrr %0, %1" : "=r" (__r) : "i" (csr)); \
__r; \
})
#define csr_write(csr, val) ({ \
unsigned __v = (unsigned)(val); \
if (__builtin_constant_p(val) && __v < 32) \
__asm__ __volatile__ ("csrw %0, %1" :: "i" (csr), "i" (__v)); \
else \
__asm__ __volatile__ ("csrw %0, %1" :: "i" (csr), "r" (__v)); \
})
#define csr_swap(csr, val) ({ \
unsigned __r; \
unsigned __v = (unsigned)(val); \
if (__builtin_constant_p(val) && __v < 32) \
__asm__ __volatile__ ("csrrw %0, %1, %2" : "=r" (__r) : "i" (csr), "i" (__v)); \
else \
__asm__ __volatile__ ("csrrw %0, %1, %2" : "=r" (__r) : "i" (csr), "r" (__v)); \
__r; \
})
#define csr_read_set(csr, val) ({ \
unsigned __r; \
unsigned __v = (unsigned)(val); \
if (__builtin_constant_p(val) && __v < 32) \
__asm__ __volatile__ ("csrrs %0, %1, %2" : "=r" (__r) : "i" (csr), "i" (__v)); \
else \
__asm__ __volatile__ ("csrrs %0, %1, %2" : "=r" (__r) : "i" (csr), "r" (__v)); \
__r; \
})
#define csr_set(csr, val) ({ \
unsigned __v = (unsigned)(val); \
if (__builtin_constant_p(val) && __v < 32) \
__asm__ __volatile__ ("csrs %0, %1" :: "i" (csr), "i" (__v)); \
else \
__asm__ __volatile__ ("csrs %0, %1" :: "i" (csr), "r" (__v)); \
})
#define csr_read_clear(csr, val) ({ \
unsigned __r; \
unsigned __v = (unsigned)(val); \
if (__builtin_constant_p(val) && __v < 32) \
__asm__ __volatile__ ("csrrc %0, %1, %2" : "=r" (__r) : "i" (csr), "i" (__v)); \
else \
__asm__ __volatile__ ("csrrc %0, %1, %2" : "=r" (__r) : "i" (csr), "r" (__v)); \
__r; \
})
#define csr_clear(csr, val) ({ \
unsigned __v = (unsigned)(val); \
if (__builtin_constant_p(val) && __v < 32) \
__asm__ __volatile__ ("csrc %0, %1" :: "i" (csr), "i" (__v)); \
else \
__asm__ __volatile__ ("csrc %0, %1" :: "i" (csr), "r" (__v)); \
})
// Texture load
#define vx_tex(unit, u, v, lod) ({ \
unsigned __r; \
__asm__ __volatile__ (".insn r4 0x5b, 0, %1, %0, %2, %3, %4" : "=r"(__r) : "i"(unit), "r"(u), "r"(v), "r"(lod)); \
__r; \
})
// Conditional move
#define vx_cmov(c, t, f) ({ \
unsigned __r; \
__asm__ __volatile__ (".insn r4 0x5b, 1, 0, %0, %1, %2, %3" : "=r"(__r : "r"(c), "r"(t), "r"(f)); \
__r; \
})
// Set thread mask
inline void vx_tmc(unsigned thread_mask) {
asm volatile (".insn s 0x6b, 0, x0, 0(%0)" :: "r"(thread_mask));
}
// Set thread predicate
inline void vx_pred(unsigned condition) {
asm volatile (".insn s 0x6b, 0, x1, 0(%0)" :: "r"(condition));
}
typedef void (*vx_wspawn_pfn)();
// Spawn warps
inline void vx_wspawn(unsigned num_warps, vx_wspawn_pfn func_ptr) {
asm volatile (".insn s 0x6b, 1, %1, 0(%0)" :: "r"(num_warps), "r"(func_ptr));
}
// Split on a predicate
inline void vx_split(int predicate) {
asm volatile (".insn s 0x6b, 2, x0, 0(%0)" :: "r"(predicate));
}
// Join
inline void vx_join() {
asm volatile (".insn s 0x6b, 3, x0, 0(x0)");
}
// Warp Barrier
inline void vx_barrier(unsigned barried_id, unsigned num_warps) {
asm volatile (".insn s 0x6b, 4, %1, 0(%0)" :: "r"(barried_id), "r"(num_warps));
}
// Prefetch
inline void vx_prefetch(unsigned addr) {
asm volatile (".insn s 0x6b, 5, x0, 0(%0)" :: "r"(addr) );
}
// Return active warp's thread id
inline int vx_thread_id() {
int result;
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(CSR_WTID));
return result;
}
// Return active core's local thread id
inline int vx_thread_lid() {
int result;
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(CSR_LTID));
return result;
}
// Return processsor global thread id
inline int vx_thread_gid() {
int result;
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(CSR_GTID));
return result;
}
// Return active core's local warp id
inline int vx_warp_id() {
int result;
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(CSR_LWID));
return result;
}
// Return processsor's global warp id
inline int vx_warp_gid() {
int result;
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(CSR_GWID));
return result;
}
// Return processsor core id
inline int vx_core_id() {
int result;
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(CSR_GCID));
return result;
}
// Return current threadk mask
inline int vx_thread_mask() {
int result;
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(CSR_TMASK));
return result;
}
// Return the number of threads in a warp
inline int vx_num_threads() {
int result;
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(CSR_NT));
return result;
}
// Return the number of warps in a core
inline int vx_num_warps() {
int result;
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(CSR_NW));
return result;
}
// Return the number of cores in the processsor
inline int vx_num_cores() {
int result;
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(CSR_NC));
return result;
}
inline void vx_fence() {
asm volatile ("fence iorw, iorw");
}
#define __if(b) vx_split(b); \
if (b)
#define __else else
#define __endif vx_join();
#define __DIVERGENT__ __attribute__((annotate("divergent")))
#ifdef __cplusplus
}
#endif
#endif

View File

@@ -1,21 +0,0 @@
#ifndef VX_PRINT_H
#define VX_PRINT_H
#include <stdarg.h>
#ifdef __cplusplus
extern "C" {
#endif
int vx_vprintf(const char* format, va_list va);
int vx_printf(const char * format, ...);
void vx_putchar(int c);
void vx_putint(int value, int base);
void vx_putfloat(float value, int precision);
#ifdef __cplusplus
}
#endif
#endif

View File

@@ -1,43 +0,0 @@
#ifndef VX_API_H
#define VX_API_H
#include <stdint.h>
#include <stdio.h>
#ifdef __cplusplus
extern "C" {
#endif
typedef struct {
uint32_t num_groups[3];
uint32_t global_offset[3];
uint32_t local_size[3];
char * printf_buffer;
uint32_t *printf_buffer_position;
uint32_t printf_buffer_capacity;
uint32_t work_dim;
} context_t;
typedef void (*vx_spawn_kernel_cb) (
const void * /* arg */,
const context_t * /* context */,
uint32_t /* group_x */,
uint32_t /* group_y */,
uint32_t /* group_z */
);
typedef void (*vx_spawn_tasks_cb)(int task_id, void *arg);
typedef void (*vx_serial_cb)(void *arg);
void vx_spawn_kernel(context_t * ctx, vx_spawn_kernel_cb callback, void * arg);
void vx_spawn_tasks(int num_tasks, vx_spawn_tasks_cb callback, void * arg);
void vx_serial(vx_serial_cb callback, void * arg);
#ifdef __cplusplus
}
#endif
#endif