Vortex 2.0 changes:

+ Microarchitecture optimizations
+ 64-bit support
+ Xilinx FPGA support
+ LLVM-16 support
+ Refactoring and quality control fixes
This commit is contained in:
Blaise Tine
2023-10-19 20:51:22 -07:00
parent d69a64c32c
commit d47cccc157
1300 changed files with 247321 additions and 311189 deletions

890
kernel/src/tinyprintf.c Normal file
View File

@@ -0,0 +1,890 @@
///////////////////////////////////////////////////////////////////////////////
// \author (c) Marco Paland (info@paland.com)
// 2014-2019, PALANDesign Hannover, Germany
//
// \license The MIT License (MIT)
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
//
// \brief Tiny printf, sprintf and (v)snprintf implementation, optimized for speed on
// embedded systems with a very limited resources. These routines are thread
// safe and reentrant!
// Use this instead of the bloated standard/newlib printf cause these use
// malloc for printf (and may not be thread safe).
//
///////////////////////////////////////////////////////////////////////////////
#include <stdbool.h>
#include <stdint.h>
#include "tinyprintf.h"
#include "vx_print.h"
// define this globally (e.g. gcc -DPRINTF_INCLUDE_CONFIG_H ...) to include the
// printf_config.h header file
// default: undefined
#ifdef PRINTF_INCLUDE_CONFIG_H
#include "printf_config.h"
#endif
// 'ntoa' conversion buffer size, this must be big enough to hold one converted
// numeric number including padded zeros (dynamically created on stack)
// default: 32 byte
#ifndef PRINTF_NTOA_BUFFER_SIZE
#define PRINTF_NTOA_BUFFER_SIZE 32U
#endif
// 'ftoa' conversion buffer size, this must be big enough to hold one converted
// float number including padded zeros (dynamically created on stack)
// default: 32 byte
#ifndef PRINTF_FTOA_BUFFER_SIZE
#define PRINTF_FTOA_BUFFER_SIZE 32U
#endif
// support for the floating point type (%f)
// default: activated
#ifndef PRINTF_DISABLE_SUPPORT_FLOAT
#define PRINTF_SUPPORT_FLOAT
#endif
// support for exponential floating point notation (%e/%g)
// default: activated
#ifndef PRINTF_DISABLE_SUPPORT_EXPONENTIAL
#define PRINTF_SUPPORT_EXPONENTIAL
#endif
// define the default floating point precision
// default: 6 digits
#ifndef PRINTF_DEFAULT_FLOAT_PRECISION
#define PRINTF_DEFAULT_FLOAT_PRECISION 6U
#endif
// define the largest float suitable to print with %f
// default: 1e9
#ifndef PRINTF_MAX_FLOAT
#define PRINTF_MAX_FLOAT 1e9
#endif
// support for the long long types (%llu or %p)
// default: activated
#ifndef PRINTF_DISABLE_SUPPORT_LONG_LONG
#define PRINTF_SUPPORT_LONG_LONG
#endif
// support for the ptrdiff_t type (%t)
// ptrdiff_t is normally defined in <stddef.h> as long or long long type
// default: activated
#ifndef PRINTF_DISABLE_SUPPORT_PTRDIFF_T
#define PRINTF_SUPPORT_PTRDIFF_T
#endif
///////////////////////////////////////////////////////////////////////////////
// internal flag definitions
#define FLAGS_ZEROPAD (1U << 0U)
#define FLAGS_LEFT (1U << 1U)
#define FLAGS_PLUS (1U << 2U)
#define FLAGS_SPACE (1U << 3U)
#define FLAGS_HASH (1U << 4U)
#define FLAGS_UPPERCASE (1U << 5U)
#define FLAGS_CHAR (1U << 6U)
#define FLAGS_SHORT (1U << 7U)
#define FLAGS_LONG (1U << 8U)
#define FLAGS_LONG_LONG (1U << 9U)
#define FLAGS_PRECISION (1U << 10U)
#define FLAGS_ADAPT_EXP (1U << 11U)
// import float.h for DBL_MAX
#if defined(PRINTF_SUPPORT_FLOAT)
#include <float.h>
#endif
// output function type
typedef void (*out_fct_type)(char character, void* buffer, size_t idx, size_t maxlen);
// wrapper (used as buffer) for output function type
typedef struct {
void (*fct)(char character, void* arg);
void* arg;
} out_fct_wrap_type;
// internal buffer output
static inline void _out_buffer(char character, void* buffer, size_t idx, size_t maxlen)
{
if (idx < maxlen) {
((char*)buffer)[idx] = character;
}
}
// internal null output
static inline void _out_null(char character, void* buffer, size_t idx, size_t maxlen)
{
(void)character; (void)buffer; (void)idx; (void)maxlen;
}
// internal _putchar wrapper
static inline void _out_char(char character, void* buffer, size_t idx, size_t maxlen)
{
(void)buffer; (void)idx; (void)maxlen;
if (character) {
vx_putchar(character);
}
}
// internal output function wrapper
static inline void _out_fct(char character, void* buffer, size_t idx, size_t maxlen)
{
(void)idx; (void)maxlen;
if (character) {
// buffer is the output fct pointer
((out_fct_wrap_type*)buffer)->fct(character, ((out_fct_wrap_type*)buffer)->arg);
}
}
// internal secure strlen
// \return The length of the string (excluding the terminating 0) limited by 'maxsize'
static inline unsigned int _strnlen_s(const char* str, size_t maxsize)
{
const char* s;
for (s = str; *s && maxsize--; ++s);
return (unsigned int)(s - str);
}
// internal test if char is a digit (0-9)
// \return true if char is a digit
static inline bool _is_digit(char ch)
{
return (ch >= '0') && (ch <= '9');
}
// internal ASCII string to unsigned int conversion
static unsigned int _atoi(const char** str)
{
unsigned int i = 0U;
while (_is_digit(**str)) {
i = i * 10U + (unsigned int)(*((*str)++) - '0');
}
return i;
}
// output the specified string in reverse, taking care of any zero-padding
static size_t _out_rev(out_fct_type out, char* buffer, size_t idx, size_t maxlen, const char* buf, size_t len, unsigned int width, unsigned int flags)
{
const size_t start_idx = idx;
// pad spaces up to given width
if (!(flags & FLAGS_LEFT) && !(flags & FLAGS_ZEROPAD)) {
for (size_t i = len; i < width; i++) {
out(' ', buffer, idx++, maxlen);
}
}
// reverse string
while (len) {
out(buf[--len], buffer, idx++, maxlen);
}
// append pad spaces up to given width
if (flags & FLAGS_LEFT) {
while (idx - start_idx < width) {
out(' ', buffer, idx++, maxlen);
}
}
return idx;
}
// internal itoa format
static size_t _ntoa_format(out_fct_type out, char* buffer, size_t idx, size_t maxlen, char* buf, size_t len, bool negative, unsigned int base, unsigned int prec, unsigned int width, unsigned int flags)
{
// pad leading zeros
if (!(flags & FLAGS_LEFT)) {
if (width && (flags & FLAGS_ZEROPAD) && (negative || (flags & (FLAGS_PLUS | FLAGS_SPACE)))) {
width--;
}
while ((len < prec) && (len < PRINTF_NTOA_BUFFER_SIZE)) {
buf[len++] = '0';
}
while ((flags & FLAGS_ZEROPAD) && (len < width) && (len < PRINTF_NTOA_BUFFER_SIZE)) {
buf[len++] = '0';
}
}
// handle hash
if (flags & FLAGS_HASH) {
if (!(flags & FLAGS_PRECISION) && len && ((len == prec) || (len == width))) {
len--;
if (len && (base == 16U)) {
len--;
}
}
if ((base == 16U) && !(flags & FLAGS_UPPERCASE) && (len < PRINTF_NTOA_BUFFER_SIZE)) {
buf[len++] = 'x';
}
else if ((base == 16U) && (flags & FLAGS_UPPERCASE) && (len < PRINTF_NTOA_BUFFER_SIZE)) {
buf[len++] = 'X';
}
else if ((base == 2U) && (len < PRINTF_NTOA_BUFFER_SIZE)) {
buf[len++] = 'b';
}
if (len < PRINTF_NTOA_BUFFER_SIZE) {
buf[len++] = '0';
}
}
if (len < PRINTF_NTOA_BUFFER_SIZE) {
if (negative) {
buf[len++] = '-';
}
else if (flags & FLAGS_PLUS) {
buf[len++] = '+'; // ignore the space if the '+' exists
}
else if (flags & FLAGS_SPACE) {
buf[len++] = ' ';
}
}
return _out_rev(out, buffer, idx, maxlen, buf, len, width, flags);
}
// internal itoa for 'long' type
static size_t _ntoa_long(out_fct_type out, char* buffer, size_t idx, size_t maxlen, unsigned long value, bool negative, unsigned long base, unsigned int prec, unsigned int width, unsigned int flags)
{
char buf[PRINTF_NTOA_BUFFER_SIZE];
size_t len = 0U;
// no hash for 0 values
if (!value) {
flags &= ~FLAGS_HASH;
}
// write if precision != 0 and value is != 0
if (!(flags & FLAGS_PRECISION) || value) {
do {
const char digit = (char)(value % base);
buf[len++] = digit < 10 ? '0' + digit : (flags & FLAGS_UPPERCASE ? 'A' : 'a') + digit - 10;
value /= base;
} while (value && (len < PRINTF_NTOA_BUFFER_SIZE));
}
return _ntoa_format(out, buffer, idx, maxlen, buf, len, negative, (unsigned int)base, prec, width, flags);
}
// internal itoa for 'long long' type
#if defined(PRINTF_SUPPORT_LONG_LONG)
static size_t _ntoa_long_long(out_fct_type out, char* buffer, size_t idx, size_t maxlen, unsigned long long value, bool negative, unsigned long long base, unsigned int prec, unsigned int width, unsigned int flags)
{
char buf[PRINTF_NTOA_BUFFER_SIZE];
size_t len = 0U;
// no hash for 0 values
if (!value) {
flags &= ~FLAGS_HASH;
}
// write if precision != 0 and value is != 0
if (!(flags & FLAGS_PRECISION) || value) {
do {
const char digit = (char)(value % base);
buf[len++] = digit < 10 ? '0' + digit : (flags & FLAGS_UPPERCASE ? 'A' : 'a') + digit - 10;
value /= base;
} while (value && (len < PRINTF_NTOA_BUFFER_SIZE));
}
return _ntoa_format(out, buffer, idx, maxlen, buf, len, negative, (unsigned int)base, prec, width, flags);
}
#endif // PRINTF_SUPPORT_LONG_LONG
#if defined(PRINTF_SUPPORT_FLOAT)
#if defined(PRINTF_SUPPORT_EXPONENTIAL)
// forward declaration so that _ftoa can switch to exp notation for values > PRINTF_MAX_FLOAT
static size_t _etoa(out_fct_type out, char* buffer, size_t idx, size_t maxlen, double value, unsigned int prec, unsigned int width, unsigned int flags);
#endif
// internal ftoa for fixed decimal floating point
static size_t _ftoa(out_fct_type out, char* buffer, size_t idx, size_t maxlen, double value, unsigned int prec, unsigned int width, unsigned int flags)
{
char buf[PRINTF_FTOA_BUFFER_SIZE];
size_t len = 0U;
double diff = 0.0;
// powers of 10
static const double pow10[] = { 1, 10, 100, 1000, 10000, 100000, 1000000, 10000000, 100000000, 1000000000 };
// test for special values
if (value != value)
return _out_rev(out, buffer, idx, maxlen, "nan", 3, width, flags);
if (value < -DBL_MAX)
return _out_rev(out, buffer, idx, maxlen, "fni-", 4, width, flags);
if (value > DBL_MAX)
return _out_rev(out, buffer, idx, maxlen, (flags & FLAGS_PLUS) ? "fni+" : "fni", (flags & FLAGS_PLUS) ? 4U : 3U, width, flags);
// test for very large values
// standard printf behavior is to print EVERY whole number digit -- which could be 100s of characters overflowing your buffers == bad
if ((value > PRINTF_MAX_FLOAT) || (value < -PRINTF_MAX_FLOAT)) {
#if defined(PRINTF_SUPPORT_EXPONENTIAL)
return _etoa(out, buffer, idx, maxlen, value, prec, width, flags);
#else
return 0U;
#endif
}
// test for negative
bool negative = false;
if (value < 0) {
negative = true;
value = 0 - value;
}
// set default precision, if not set explicitly
if (!(flags & FLAGS_PRECISION)) {
prec = PRINTF_DEFAULT_FLOAT_PRECISION;
}
// limit precision to 9, cause a prec >= 10 can lead to overflow errors
while ((len < PRINTF_FTOA_BUFFER_SIZE) && (prec > 9U)) {
buf[len++] = '0';
prec--;
}
int whole = (int)value;
double tmp = (value - whole) * pow10[prec];
unsigned long frac = (unsigned long)tmp;
diff = tmp - frac;
if (diff > 0.5) {
++frac;
// handle rollover, e.g. case 0.99 with prec 1 is 1.0
if (frac >= pow10[prec]) {
frac = 0;
++whole;
}
}
else if (diff < 0.5) {
}
else if ((frac == 0U) || (frac & 1U)) {
// if halfway, round up if odd OR if last digit is 0
++frac;
}
if (prec == 0U) {
diff = value - (double)whole;
if ((!(diff < 0.5) || (diff > 0.5)) && (whole & 1)) {
// exactly 0.5 and ODD, then round up
// 1.5 -> 2, but 2.5 -> 2
++whole;
}
}
else {
unsigned int count = prec;
// now do fractional part, as an unsigned number
while (len < PRINTF_FTOA_BUFFER_SIZE) {
--count;
buf[len++] = (char)(48U + (frac % 10U));
if (!(frac /= 10U)) {
break;
}
}
// add extra 0s
while ((len < PRINTF_FTOA_BUFFER_SIZE) && (count-- > 0U)) {
buf[len++] = '0';
}
if (len < PRINTF_FTOA_BUFFER_SIZE) {
// add decimal
buf[len++] = '.';
}
}
// do whole part, number is reversed
while (len < PRINTF_FTOA_BUFFER_SIZE) {
buf[len++] = (char)(48 + (whole % 10));
if (!(whole /= 10)) {
break;
}
}
// pad leading zeros
if (!(flags & FLAGS_LEFT) && (flags & FLAGS_ZEROPAD)) {
if (width && (negative || (flags & (FLAGS_PLUS | FLAGS_SPACE)))) {
width--;
}
while ((len < width) && (len < PRINTF_FTOA_BUFFER_SIZE)) {
buf[len++] = '0';
}
}
if (len < PRINTF_FTOA_BUFFER_SIZE) {
if (negative) {
buf[len++] = '-';
}
else if (flags & FLAGS_PLUS) {
buf[len++] = '+'; // ignore the space if the '+' exists
}
else if (flags & FLAGS_SPACE) {
buf[len++] = ' ';
}
}
return _out_rev(out, buffer, idx, maxlen, buf, len, width, flags);
}
#if defined(PRINTF_SUPPORT_EXPONENTIAL)
// internal ftoa variant for exponential floating-point type, contributed by Martijn Jasperse <m.jasperse@gmail.com>
static size_t _etoa(out_fct_type out, char* buffer, size_t idx, size_t maxlen, double value, unsigned int prec, unsigned int width, unsigned int flags)
{
// check for NaN and special values
if ((value != value) || (value > DBL_MAX) || (value < -DBL_MAX)) {
return _ftoa(out, buffer, idx, maxlen, value, prec, width, flags);
}
// determine the sign
const bool negative = value < 0;
if (negative) {
value = -value;
}
// default precision
if (!(flags & FLAGS_PRECISION)) {
prec = PRINTF_DEFAULT_FLOAT_PRECISION;
}
// determine the decimal exponent
// based on the algorithm by David Gay (https://www.ampl.com/netlib/fp/dtoa.c)
union {
uint64_t U;
double F;
} conv;
conv.F = value;
int exp2 = (int)((conv.U >> 52U) & 0x07FFU) - 1023; // effectively log2
conv.U = (conv.U & ((1ULL << 52U) - 1U)) | (1023ULL << 52U); // drop the exponent so conv.F is now in [1,2)
// now approximate log10 from the log2 integer part and an expansion of ln around 1.5
int expval = (int)(0.1760912590558 + exp2 * 0.301029995663981 + (conv.F - 1.5) * 0.289529654602168);
// now we want to compute 10^expval but we want to be sure it won't overflow
exp2 = (int)(expval * 3.321928094887362 + 0.5);
const double z = expval * 2.302585092994046 - exp2 * 0.6931471805599453;
const double z2 = z * z;
conv.U = (uint64_t)(exp2 + 1023) << 52U;
// compute exp(z) using continued fractions, see https://en.wikipedia.org/wiki/Exponential_function#Continued_fractions_for_ex
conv.F *= 1 + 2 * z / (2 - z + (z2 / (6 + (z2 / (10 + z2 / 14)))));
// correct for rounding errors
if (value < conv.F) {
expval--;
conv.F /= 10;
}
// the exponent format is "%+03d" and largest value is "307", so set aside 4-5 characters
unsigned int minwidth = ((expval < 100) && (expval > -100)) ? 4U : 5U;
// in "%g" mode, "prec" is the number of *significant figures* not decimals
if (flags & FLAGS_ADAPT_EXP) {
// do we want to fall-back to "%f" mode?
if ((value >= 1e-4) && (value < 1e6)) {
if ((int)prec > expval) {
prec = (unsigned)((int)prec - expval - 1);
}
else {
prec = 0;
}
flags |= FLAGS_PRECISION; // make sure _ftoa respects precision
// no characters in exponent
minwidth = 0U;
expval = 0;
}
else {
// we use one sigfig for the whole part
if ((prec > 0) && (flags & FLAGS_PRECISION)) {
--prec;
}
}
}
// will everything fit?
unsigned int fwidth = width;
if (width > minwidth) {
// we didn't fall-back so subtract the characters required for the exponent
fwidth -= minwidth;
} else {
// not enough characters, so go back to default sizing
fwidth = 0U;
}
if ((flags & FLAGS_LEFT) && minwidth) {
// if we're padding on the right, DON'T pad the floating part
fwidth = 0U;
}
// rescale the float value
if (expval) {
value /= conv.F;
}
// output the floating part
const size_t start_idx = idx;
idx = _ftoa(out, buffer, idx, maxlen, negative ? -value : value, prec, fwidth, flags & ~FLAGS_ADAPT_EXP);
// output the exponent part
if (minwidth) {
// output the exponential symbol
out((flags & FLAGS_UPPERCASE) ? 'E' : 'e', buffer, idx++, maxlen);
// output the exponent value
idx = _ntoa_long(out, buffer, idx, maxlen, (expval < 0) ? -expval : expval, expval < 0, 10, 0, minwidth-1, FLAGS_ZEROPAD | FLAGS_PLUS);
// might need to right-pad spaces
if (flags & FLAGS_LEFT) {
while (idx - start_idx < width) out(' ', buffer, idx++, maxlen);
}
}
return idx;
}
#endif // PRINTF_SUPPORT_EXPONENTIAL
#endif // PRINTF_SUPPORT_FLOAT
// internal vsnprintf
static int _vsnprintf(out_fct_type out, char* buffer, const size_t maxlen, const char* format, va_list va) {
unsigned int flags, width, precision, n;
size_t idx = 0U;
if (!buffer) {
// use null output function
out = _out_null;
}
while (*format)
{
// format specifier? %[flags][width][.precision][length]
if (*format != '%') {
// no
out(*format, buffer, idx++, maxlen);
format++;
continue;
}
else {
// yes, evaluate it
format++;
}
// evaluate flags
flags = 0U;
do {
switch (*format) {
case '0': flags |= FLAGS_ZEROPAD; format++; n = 1U; break;
case '-': flags |= FLAGS_LEFT; format++; n = 1U; break;
case '+': flags |= FLAGS_PLUS; format++; n = 1U; break;
case ' ': flags |= FLAGS_SPACE; format++; n = 1U; break;
case '#': flags |= FLAGS_HASH; format++; n = 1U; break;
default : n = 0U; break;
}
} while (n);
// evaluate width field
width = 0U;
if (_is_digit(*format)) {
width = _atoi(&format);
}
else if (*format == '*') {
const int w = va_arg(va, int);
if (w < 0) {
flags |= FLAGS_LEFT; // reverse padding
width = (unsigned int)-w;
}
else {
width = (unsigned int)w;
}
format++;
}
// evaluate precision field
precision = 0U;
if (*format == '.') {
flags |= FLAGS_PRECISION;
format++;
if (_is_digit(*format)) {
precision = _atoi(&format);
}
else if (*format == '*') {
const int prec = (int)va_arg(va, int);
precision = prec > 0 ? (unsigned int)prec : 0U;
format++;
}
}
// evaluate length field
switch (*format) {
case 'l' :
flags |= FLAGS_LONG;
format++;
if (*format == 'l') {
flags |= FLAGS_LONG_LONG;
format++;
}
break;
case 'h' :
flags |= FLAGS_SHORT;
format++;
if (*format == 'h') {
flags |= FLAGS_CHAR;
format++;
}
break;
#if defined(PRINTF_SUPPORT_PTRDIFF_T)
case 't' :
flags |= (sizeof(ptrdiff_t) == sizeof(long) ? FLAGS_LONG : FLAGS_LONG_LONG);
format++;
break;
#endif
case 'j' :
flags |= (sizeof(intmax_t) == sizeof(long) ? FLAGS_LONG : FLAGS_LONG_LONG);
format++;
break;
case 'z' :
flags |= (sizeof(size_t) == sizeof(long) ? FLAGS_LONG : FLAGS_LONG_LONG);
format++;
break;
default :
break;
}
// evaluate specifier
switch (*format) {
case 'd' :
case 'i' :
case 'u' :
case 'x' :
case 'X' :
case 'o' :
case 'b' : {
// set the base
unsigned int base;
if (*format == 'x' || *format == 'X') {
base = 16U;
}
else if (*format == 'o') {
base = 8U;
}
else if (*format == 'b') {
base = 2U;
}
else {
base = 10U;
flags &= ~FLAGS_HASH; // no hash for dec format
}
// uppercase
if (*format == 'X') {
flags |= FLAGS_UPPERCASE;
}
// no plus or space flag for u, x, X, o, b
if ((*format != 'i') && (*format != 'd')) {
flags &= ~(FLAGS_PLUS | FLAGS_SPACE);
}
// ignore '0' flag when precision is given
if (flags & FLAGS_PRECISION) {
flags &= ~FLAGS_ZEROPAD;
}
// convert the integer
if ((*format == 'i') || (*format == 'd')) {
// signed
if (flags & FLAGS_LONG_LONG) {
#if defined(PRINTF_SUPPORT_LONG_LONG)
const long long value = va_arg(va, long long);
idx = _ntoa_long_long(out, buffer, idx, maxlen, (unsigned long long)(value > 0 ? value : 0 - value), value < 0, base, precision, width, flags);
#endif
}
else if (flags & FLAGS_LONG) {
const long value = va_arg(va, long);
idx = _ntoa_long(out, buffer, idx, maxlen, (unsigned long)(value > 0 ? value : 0 - value), value < 0, base, precision, width, flags);
}
else {
const int value = (flags & FLAGS_CHAR) ? (char)va_arg(va, int) : (flags & FLAGS_SHORT) ? (short int)va_arg(va, int) : va_arg(va, int);
idx = _ntoa_long(out, buffer, idx, maxlen, (unsigned int)(value > 0 ? value : 0 - value), value < 0, base, precision, width, flags);
}
}
else {
// unsigned
if (flags & FLAGS_LONG_LONG) {
#if defined(PRINTF_SUPPORT_LONG_LONG)
idx = _ntoa_long_long(out, buffer, idx, maxlen, va_arg(va, unsigned long long), false, base, precision, width, flags);
#endif
}
else if (flags & FLAGS_LONG) {
idx = _ntoa_long(out, buffer, idx, maxlen, va_arg(va, unsigned long), false, base, precision, width, flags);
}
else {
const unsigned int value = (flags & FLAGS_CHAR) ? (unsigned char)va_arg(va, unsigned int) : (flags & FLAGS_SHORT) ? (unsigned short int)va_arg(va, unsigned int) : va_arg(va, unsigned int);
idx = _ntoa_long(out, buffer, idx, maxlen, value, false, base, precision, width, flags);
}
}
format++;
break;
}
#if defined(PRINTF_SUPPORT_FLOAT)
case 'f' :
case 'F' :
if (*format == 'F') flags |= FLAGS_UPPERCASE;
idx = _ftoa(out, buffer, idx, maxlen, va_arg(va, double), precision, width, flags);
format++;
break;
#if defined(PRINTF_SUPPORT_EXPONENTIAL)
case 'e':
case 'E':
case 'g':
case 'G':
if ((*format == 'g')||(*format == 'G')) flags |= FLAGS_ADAPT_EXP;
if ((*format == 'E')||(*format == 'G')) flags |= FLAGS_UPPERCASE;
idx = _etoa(out, buffer, idx, maxlen, va_arg(va, double), precision, width, flags);
format++;
break;
#endif // PRINTF_SUPPORT_EXPONENTIAL
#endif // PRINTF_SUPPORT_FLOAT
case 'c' : {
unsigned int l = 1U;
// pre padding
if (!(flags & FLAGS_LEFT)) {
while (l++ < width) {
out(' ', buffer, idx++, maxlen);
}
}
// char output
out((char)va_arg(va, int), buffer, idx++, maxlen);
// post padding
if (flags & FLAGS_LEFT) {
while (l++ < width) {
out(' ', buffer, idx++, maxlen);
}
}
format++;
break;
}
case 's' : {
const char* p = va_arg(va, char*);
unsigned int l = _strnlen_s(p, precision ? precision : (size_t)-1);
// pre padding
if (flags & FLAGS_PRECISION) {
l = (l < precision ? l : precision);
}
if (!(flags & FLAGS_LEFT)) {
while (l++ < width) {
out(' ', buffer, idx++, maxlen);
}
}
// string output
while ((*p != 0) && (!(flags & FLAGS_PRECISION) || precision--)) {
out(*(p++), buffer, idx++, maxlen);
}
// post padding
if (flags & FLAGS_LEFT) {
while (l++ < width) {
out(' ', buffer, idx++, maxlen);
}
}
format++;
break;
}
case 'p' : {
width = sizeof(void*) * 2U;
flags |= FLAGS_ZEROPAD | FLAGS_UPPERCASE;
#if defined(PRINTF_SUPPORT_LONG_LONG)
const bool is_ll = sizeof(uintptr_t) == sizeof(long long);
if (is_ll) {
idx = _ntoa_long_long(out, buffer, idx, maxlen, (uintptr_t)va_arg(va, void*), false, 16U, precision, width, flags);
}
else {
#endif
idx = _ntoa_long(out, buffer, idx, maxlen, (unsigned long)((uintptr_t)va_arg(va, void*)), false, 16U, precision, width, flags);
#if defined(PRINTF_SUPPORT_LONG_LONG)
}
#endif
format++;
break;
}
case '%' :
out('%', buffer, idx++, maxlen);
format++;
break;
default :
out(*format, buffer, idx++, maxlen);
format++;
break;
}
}
// termination
out((char)0, buffer, idx < maxlen ? idx : maxlen - 1U, maxlen);
// return written chars without terminating \0
return (int)idx;
}
int tiny_printf(const char* format, ...) {
va_list va;
va_start(va, format);
char buffer[1];
const int ret = _vsnprintf(_out_char, buffer, (size_t)-1, format, va);
va_end(va);
return ret;
}
int tiny_sprintf(char* buffer, const char* format, ...) {
va_list va;
va_start(va, format);
const int ret = _vsnprintf(_out_buffer, buffer, (size_t)-1, format, va);
va_end(va);
return ret;
}
int tiny_snprintf(char* buffer, size_t count, const char* format, ...) {
va_list va;
va_start(va, format);
const int ret = _vsnprintf(_out_buffer, buffer, count, format, va);
va_end(va);
return ret;
}
int tiny_vprintf(const char* format, va_list va) {
char buffer[1];
return _vsnprintf(_out_char, buffer, (size_t)-1, format, va);
}
int tiny_vsnprintf(char* buffer, size_t count, const char* format, va_list va) {
return _vsnprintf(_out_buffer, buffer, count, format, va);
}

86
kernel/src/tinyprintf.h Normal file
View File

@@ -0,0 +1,86 @@
///////////////////////////////////////////////////////////////////////////////
// \author (c) Marco Paland (info@paland.com)
// 2014-2019, PALANDesign Hannover, Germany
//
// \license The MIT License (MIT)
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
//
// \brief Tiny printf, sprintf and snprintf implementation, optimized for speed on
// embedded systems with a very limited resources.
// Use this instead of bloated standard/newlib printf.
// These routines are thread safe and reentrant.
//
///////////////////////////////////////////////////////////////////////////////
#ifndef __TINYPRINTF_H__
#define __TINYPRINTF_H__
#include <stdarg.h>
#include <stddef.h>
#ifdef __cplusplus
extern "C" {
#endif
/**
* Tiny printf implementation
* You have to implement _putchar if you use printf()
* To avoid conflicts with the regular printf() API it is overridden by macro defines
* and internal underscore-appended functions like printf_() are used
* \param format A string that specifies the format of the output
* \return The number of characters that are written into the array, not counting the terminating null character
*/
int tiny_printf(const char* format, ...);
/**
* Tiny sprintf implementation
* Due to security reasons (buffer overflow) YOU SHOULD CONSIDER USING (V)SNPRINTF INSTEAD!
* \param buffer A pointer to the buffer where to store the formatted string. MUST be big enough to store the output!
* \param format A string that specifies the format of the output
* \return The number of characters that are WRITTEN into the buffer, not counting the terminating null character
*/
int tiny_sprintf(char* buffer, const char* format, ...);
/**
* Tiny snprintf/vsnprintf implementation
* \param buffer A pointer to the buffer where to store the formatted string
* \param count The maximum number of characters to store in the buffer, including a terminating null character
* \param format A string that specifies the format of the output
* \param va A value identifying a variable arguments list
* \return The number of characters that COULD have been written into the buffer, not counting the terminating
* null character. A value equal or larger than count indicates truncation. Only when the returned value
* is non-negative and less than count, the string has been completely written.
*/
int tiny_snprintf(char* buffer, size_t count, const char* format, ...);
int tiny_vsnprintf(char* buffer, size_t count, const char* format, va_list va);
/**
* Tiny vprintf implementation
* \param format A string that specifies the format of the output
* \param va A value identifying a variable arguments list
* \return The number of characters that are WRITTEN into the buffer, not counting the terminating null character
*/
int tiny_vprintf(const char* format, va_list va);
#ifdef __cplusplus
}
#endif
#endif // __TINYPRINTF_H__

49
kernel/src/vx_perf.c Normal file
View File

@@ -0,0 +1,49 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <VX_config.h>
#include <VX_types.h>
#include <vx_intrinsics.h>
#include <stdint.h>
#define DUMP_CSR_4(d, s) \
csr_mem[d + 0] = csr_read(s + 0); \
csr_mem[d + 1] = csr_read(s + 1); \
csr_mem[d + 2] = csr_read(s + 2); \
csr_mem[d + 3] = csr_read(s + 3);
#define DUMP_CSR_32(d, s) \
DUMP_CSR_4(d + 0, s + 0) \
DUMP_CSR_4(d + 4, s + 4) \
DUMP_CSR_4(d + 8, s + 8) \
DUMP_CSR_4(d + 12, s + 12) \
DUMP_CSR_4(d + 16, s + 16) \
DUMP_CSR_4(d + 20, s + 20) \
DUMP_CSR_4(d + 24, s + 24) \
DUMP_CSR_4(d + 28, s + 28)
#ifdef __cplusplus
extern "C" {
#endif
void vx_perf_dump() {
int core_id = vx_core_id();
uint32_t* const csr_mem = (uint32_t*)(IO_CSR_ADDR + 64 * sizeof(uint32_t) * core_id);
DUMP_CSR_32(0, VX_CSR_MPM_BASE)
DUMP_CSR_32(32, VX_CSR_MPM_BASE_H)
}
#ifdef __cplusplus
}
#endif

32
kernel/src/vx_print.S Normal file
View File

@@ -0,0 +1,32 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <VX_config.h>
#include <VX_types.h>
.type vx_putchar, @function
.global vx_putchar
vx_putchar:
csrr t0, VX_CSR_MHARTID
andi t0, t0, %lo(IO_COUT_SIZE-1)
#if (XLEN == 64)
li t1, (IO_COUT_ADDR >> 32)
slli t1, t1, 32
li t2, (IO_COUT_ADDR & 0xffffffff)
or t1, t1, t2
#else
li t1, IO_COUT_ADDR
#endif
add t0, t0, t1
sb a0, 0(t0)
ret

107
kernel/src/vx_print.c Normal file
View File

@@ -0,0 +1,107 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <vx_print.h>
#include <vx_spawn.h>
#include <vx_intrinsics.h>
#include <stdlib.h>
#include <stdbool.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include "tinyprintf.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct {
const char* format;
va_list* va;
int ret;
} printf_arg_t;
typedef struct {
int value;
int base;
} putint_arg_t;
typedef struct {
float value;
int precision;
} putfloat_arg_t;
static void __putint_cb(const putint_arg_t* arg) {
char tmp[33];
float value = arg->value;
int base = arg->base;
itoa(value, tmp, base);
for (int i = 0; i < 33; ++i) {
int c = tmp[i];
if (!c)
break;
vx_putchar(c);
}
}
static void __putfloat_cb(const putfloat_arg_t* arg) {
float value = arg->value;
int precision = arg->precision;
int ipart = (int)value;
vx_putint(ipart, 10);
if (precision != 0) {
vx_putchar('.');
float frac = value - (float)ipart;
float fscaled = frac * pow(10, precision);
vx_putint((int)fscaled, 10);
}
}
static void __vprintf_cb(printf_arg_t* arg) {
arg->ret = tiny_vprintf(arg->format, *arg->va);
}
void vx_putint(int value, int base) {
putint_arg_t arg;
arg.value = value;
arg.base = base;
vx_serial((vx_serial_cb)__putint_cb, &arg);
}
void vx_putfloat(float value, int precision) {
putfloat_arg_t arg;
arg.value = value;
arg.precision = precision;
vx_serial((vx_serial_cb)__putfloat_cb, &arg);
}
int vx_vprintf(const char* format, va_list va) {
printf_arg_t arg;
arg.format = format;
arg.va = &va;
vx_serial((vx_serial_cb)__vprintf_cb, &arg);
return arg.ret;
}
int vx_printf(const char * format, ...) {
int ret;
va_list va;
va_start(va, format);
ret = vx_vprintf(format, va);
va_end(va);
return ret;
}
#ifdef __cplusplus
}
#endif

77
kernel/src/vx_serial.S Normal file
View File

@@ -0,0 +1,77 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <VX_config.h>
#include <VX_types.h>
#define RISCV_CUSTOM0 0x0B
.type vx_serial, @function
.global vx_serial
vx_serial:
#if (XLEN == 64)
addi sp, sp, -56
sd ra, 48(sp)
sd s5, 40(sp)
sd s4, 32(sp)
sd s3, 24(sp)
sd s2, 16(sp)
sd s1, 8(sp)
sd s0, 0(sp)
#else
addi sp, sp, -28
sw ra, 24(sp)
sw s5, 20(sp)
sw s4, 16(sp)
sw s3, 12(sp)
sw s2, 8(sp)
sw s1, 4(sp)
sw s0, 0(sp)
#endif
mv s4, a0 # s4 <- callback
mv s3, a1 # s3 <- arg
csrr s2, VX_CSR_NUM_THREADS # s2 <- NT
csrr s1, VX_CSR_THREAD_ID # s1 <- tid
li s0, 0 # s0 <- index
label_loop:
sub t0, s0, s1
seqz t1, t0 # (index != tid)
.insn r RISCV_CUSTOM0, 2, 0, s5, t1, x0 # split s5, t0
bnez t0, label_join
mv a0, s3 # a0 <- arg
jalr s4 # callback(arg)
label_join:
.insn r RISCV_CUSTOM0, 3, 0, x0, s5, x0 # join s5
addi s0, s0, 1 # index++
blt s0, s2, label_loop # loop back
#if (XLEN == 64)
ld ra, 48(sp)
ld s5, 40(sp)
ld s4, 32(sp)
ld s3, 24(sp)
ld s2, 16(sp)
ld s1, 8(sp)
ld s0, 0(sp)
addi sp, sp, 56
#else
lw ra, 24(sp)
lw s5, 20(sp)
lw s4, 16(sp)
lw s3, 12(sp)
lw s2, 8(sp)
lw s1, 4(sp)
lw s0, 0(sp)
addi sp, sp, 28
#endif
ret

334
kernel/src/vx_spawn.c Normal file
View File

@@ -0,0 +1,334 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <vx_spawn.h>
#include <vx_intrinsics.h>
#include <inttypes.h>
#ifdef __cplusplus
extern "C" {
#endif
#define NUM_CORES_MAX 1024
#ifndef MIN
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#endif
typedef struct {
vx_spawn_tasks_cb callback;
void* arg;
int offset; // task offset
int NWs; // number of NW batches where NW=<total warps per core>.
int RWs; // number of remaining warps in the core
} wspawn_tasks_args_t;
typedef struct {
context_t * ctx;
vx_spawn_kernel_cb callback;
void* arg;
int offset; // task offset
int NWs; // number of NW batches where NW=<total warps per core>.
int RWs; // number of remaining warps in the core
char isXYpow2;
char log2XY;
char log2X;
} wspawn_kernel_args_t;
void* g_wspawn_args[NUM_CORES_MAX];
inline char is_log2(int x) {
return ((x & (x-1)) == 0);
}
inline int fast_log2(int x) {
float f = x;
return (*(int*)(&f)>>23) - 127;
}
static void __attribute__ ((noinline)) spawn_tasks_all_stub() {
int NT = vx_num_threads();
int cid = vx_core_id();
int wid = vx_warp_id();
int tid = vx_thread_id();
wspawn_tasks_args_t* p_wspawn_args = (wspawn_tasks_args_t*)g_wspawn_args[cid];
int wK = (p_wspawn_args->NWs * wid) + MIN(p_wspawn_args->RWs, wid);
int tK = p_wspawn_args->NWs + (wid < p_wspawn_args->RWs);
int offset = p_wspawn_args->offset + (wK * NT) + (tid * tK);
vx_spawn_tasks_cb callback = p_wspawn_args->callback;
void* arg = p_wspawn_args->arg;
for (int task_id = offset, N = task_id + tK; task_id < N; ++task_id) {
callback(task_id, arg);
}
}
static void __attribute__ ((noinline)) spawn_tasks_rem_stub() {
int cid = vx_core_id();
int tid = vx_thread_id();
wspawn_tasks_args_t* p_wspawn_args = (wspawn_tasks_args_t*)g_wspawn_args[cid];
int task_id = p_wspawn_args->offset + tid;
(p_wspawn_args->callback)(task_id, p_wspawn_args->arg);
}
static void __attribute__ ((noinline)) spawn_tasks_all_cb() {
// activate all threads
vx_tmc(-1);
// call stub routine
spawn_tasks_all_stub();
// disable warp
vx_tmc_zero();
}
void vx_spawn_tasks(int num_tasks, vx_spawn_tasks_cb callback , void * arg) {
// device specs
int NC = vx_num_cores();
int NW = vx_num_warps();
int NT = vx_num_threads();
// current core id
int core_id = vx_core_id();
if (core_id >= NUM_CORES_MAX)
return;
// calculate necessary active cores
int WT = NW * NT;
int nC = (num_tasks > WT) ? (num_tasks / WT) : 1;
int nc = MIN(nC, NC);
if (core_id >= nc)
return; // terminate extra cores
// number of tasks per core
int tasks_per_core = num_tasks / nc;
int tasks_per_core_n1 = tasks_per_core;
if (core_id == (nc-1)) {
int rem = num_tasks - (nc * tasks_per_core);
tasks_per_core_n1 += rem; // last core also executes remaining tasks
}
// number of tasks per warp
int TW = tasks_per_core_n1 / NT; // occupied warps
int rT = tasks_per_core_n1 - TW * NT; // remaining threads
int fW = 1, rW = 0;
if (TW >= NW) {
fW = TW / NW; // full warps iterations
rW = TW - fW * NW; // remaining warps
}
wspawn_tasks_args_t wspawn_args = { callback, arg, core_id * tasks_per_core, fW, rW };
g_wspawn_args[core_id] = &wspawn_args;
if (TW >= 1) {
// execute callback on other warps
int nw = MIN(TW, NW);
vx_wspawn(nw, spawn_tasks_all_cb);
// activate all threads
vx_tmc(-1);
// call stub routine
spawn_tasks_all_stub();
// back to single-threaded
vx_tmc_one();
// wait for spawn warps to terminate
vx_wspawn_wait();
}
if (rT != 0) {
// adjust offset
wspawn_args.offset += (tasks_per_core_n1 - rT);
// activate remaining threads
int tmask = (1 << rT) - 1;
vx_tmc(tmask);
// call stub routine
spawn_tasks_rem_stub();
// back to single-threaded
vx_tmc_one();
}
}
///////////////////////////////////////////////////////////////////////////////
static void __attribute__ ((noinline)) spawn_kernel_all_stub() {
int NT = vx_num_threads();
int cid = vx_core_id();
int wid = vx_warp_id();
int tid = vx_thread_id();
wspawn_kernel_args_t* p_wspawn_args = (wspawn_kernel_args_t*)g_wspawn_args[cid];
int wK = (p_wspawn_args->NWs * wid) + MIN(p_wspawn_args->RWs, wid);
int tK = p_wspawn_args->NWs + (wid < p_wspawn_args->RWs);
int offset = p_wspawn_args->offset + (wK * NT) + (tid * tK);
int X = p_wspawn_args->ctx->num_groups[0];
int Y = p_wspawn_args->ctx->num_groups[1];
int XY = X * Y;
if (p_wspawn_args->isXYpow2) {
for (int wg_id = offset, N = wg_id + tK; wg_id < N; ++wg_id) {
int k = wg_id >> p_wspawn_args->log2XY;
int wg_2d = wg_id - k * XY;
int j = wg_2d >> p_wspawn_args->log2X;
int i = wg_2d - j * X;
(p_wspawn_args->callback)(p_wspawn_args->arg, p_wspawn_args->ctx, i, j, k);
}
} else {
for (int wg_id = offset, N = wg_id + tK; wg_id < N; ++wg_id) {
int k = wg_id / XY;
int wg_2d = wg_id - k * XY;
int j = wg_2d / X;
int i = wg_2d - j * X;
(p_wspawn_args->callback)(p_wspawn_args->arg, p_wspawn_args->ctx, i, j, k);
}
}
}
static void __attribute__ ((noinline)) spawn_kernel_rem_stub() {
int cid = vx_core_id();
int tid = vx_thread_id();
wspawn_kernel_args_t* p_wspawn_args = (wspawn_kernel_args_t*)g_wspawn_args[cid];
int wg_id = p_wspawn_args->offset + tid;
int X = p_wspawn_args->ctx->num_groups[0];
int Y = p_wspawn_args->ctx->num_groups[1];
int XY = X * Y;
if (p_wspawn_args->isXYpow2) {
int k = wg_id >> p_wspawn_args->log2XY;
int wg_2d = wg_id - k * XY;
int j = wg_2d >> p_wspawn_args->log2X;
int i = wg_2d - j * X;
(p_wspawn_args->callback)(p_wspawn_args->arg, p_wspawn_args->ctx, i, j, k);
} else {
int k = wg_id / XY;
int wg_2d = wg_id - k * XY;
int j = wg_2d / X;
int i = wg_2d - j * X;
(p_wspawn_args->callback)(p_wspawn_args->arg, p_wspawn_args->ctx, i, j, k);
}
}
static void __attribute__ ((noinline)) spawn_kernel_all_cb() {
// activate all threads
vx_tmc(-1);
// call stub routine
spawn_kernel_all_stub();
// disable warp
vx_tmc_zero();
}
void vx_spawn_kernel(context_t * ctx, vx_spawn_kernel_cb callback, void * arg) {
// total number of WGs
int X = ctx->num_groups[0];
int Y = ctx->num_groups[1];
int Z = ctx->num_groups[2];
int XY = X * Y;
int num_tasks = XY * Z;
// device specs
int NC = vx_num_cores();
int NW = vx_num_warps();
int NT = vx_num_threads();
// current core id
int core_id = vx_core_id();
if (core_id >= NUM_CORES_MAX)
return;
// calculate necessary active cores
int WT = NW * NT;
int nC = (num_tasks > WT) ? (num_tasks / WT) : 1;
int nc = MIN(nC, NC);
if (core_id >= nc)
return; // terminate extra cores
// number of tasks per core
int tasks_per_core = num_tasks / nc;
int tasks_per_core_n1 = tasks_per_core;
if (core_id == (nc-1)) {
int rem = num_tasks - (nc * tasks_per_core);
tasks_per_core_n1 += rem; // last core also executes remaining WGs
}
// number of tasks per warp
int TW = tasks_per_core_n1 / NT; // occupied warps
int rT = tasks_per_core_n1 - TW * NT; // remaining threads
int fW = 1, rW = 0;
if (TW >= NW) {
fW = TW / NW; // full warps iterations
rW = TW - fW * NW; // remaining warps
}
// fast path handling
char isXYpow2 = is_log2(XY);
char log2XY = fast_log2(XY);
char log2X = fast_log2(X);
wspawn_kernel_args_t wspawn_args = {
ctx, callback, arg, core_id * tasks_per_core, fW, rW, isXYpow2, log2XY, log2X
};
g_wspawn_args[core_id] = &wspawn_args;
if (TW >= 1) {
// execute callback on other warps
int nw = MIN(TW, NW);
vx_wspawn(nw, spawn_kernel_all_cb);
// activate all threads
vx_tmc(-1);
// call stub routine
asm volatile("" ::: "memory");
spawn_kernel_all_stub();
// back to single-threaded
vx_tmc_one();
// wait for spawn warps to terminate
vx_wspawn_wait();
}
if (rT != 0) {
// adjust offset
wspawn_args.offset += (tasks_per_core_n1 - rT);
// activate remaining threads
int tmask = (1 << rT) - 1;
vx_tmc(tmask);
// call stub routine
spawn_kernel_rem_stub();
// back to single-threaded
vx_tmc_one();
}
}
#ifdef __cplusplus
}
#endif

150
kernel/src/vx_start.S Normal file
View File

@@ -0,0 +1,150 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <VX_config.h>
#include <VX_types.h>
#define RISCV_CUSTOM0 0x0B
.section .init, "ax"
.global _start
.type _start, @function
_start:
# initialize per-thread registers
csrr t0, VX_CSR_NUM_WARPS # get num warps
la t1, init_regs_all
.insn r RISCV_CUSTOM0, 1, 0, x0, t0, t1 # wspawn t0, t1
li t0, -1
.insn r RISCV_CUSTOM0, 0, 0, x0, t0, x0 # tmc t0
jal init_regs
li t0, 1
.insn r RISCV_CUSTOM0, 0, 0, x0, t0, x0 # tmc t0
# wait for spawn warps to terminate
jal vx_wspawn_wait
# initialize TLS for all warps
csrr t0, VX_CSR_NUM_WARPS # get num warps
la t1, init_tls_all
.insn r RISCV_CUSTOM0, 1, 0, x0, t0, t1 # wspawn t0, t1
li t0, -1
.insn r RISCV_CUSTOM0, 0, 0, x0, t0, x0 # tmc t0
call __init_tls
li t0, 1
.insn r RISCV_CUSTOM0, 0, 0, x0, t0, x0 # tmc t0
# wait for spawn warps to terminate
jal vx_wspawn_wait
# clear BSS segment
la a0, _edata
la a2, _end
sub a2, a2, a0
li a1, 0
call memset
# initialize trap vector
# la t0, trap_entry
# csrw mtvec, t0
# register global termination functions
la a0, __libc_fini_array
call atexit
# run global initialization functions
call __libc_init_array
# call main program routine
call main
# call exit routine
tail exit
.size _start, .-_start
.section .text
.type _exit, @function
.global _exit
_exit:
mv s0, a0
call vx_perf_dump
mv gp, s0
.insn r RISCV_CUSTOM0, 0, 0, x0, x0, x0 # tmc x0
.section .text
.type init_regs, @function
.local init_regs
init_regs:
# set global pointer register
.option push
.option norelax
la gp, __global_pointer
.option pop
# set stack pointer register
#if (XLEN == 64)
li t0, (STACK_BASE_ADDR >> 32)
slli t0, t0, 32
li sp, (STACK_BASE_ADDR & 0xffffffff)
or sp, sp, t0
#else
li sp, STACK_BASE_ADDR # load stack base address
#endif
csrr t0, VX_CSR_MHARTID
sll t1, t0, STACK_LOG2_SIZE
sub sp, sp, t1
# set thread pointer register
# use address space after BSS region
# ensure cache line alignment
la t1, __tcb_aligned_size
mul t0, t0, t1
la tp, _end + 63
add tp, tp, t0
and tp, tp, -64
ret
.section .text
.type init_regs_all, @function
.local init_regs_all
init_regs_all:
li t0, -1
.insn r RISCV_CUSTOM0, 0, 0, x0, t0, x0 # tmc t0
jal init_regs
.insn r RISCV_CUSTOM0, 0, 0, x0, x0, x0 # tmc x0
ret
.section .text
.type init_tls_all, @function
.local init_tls_all
init_tls_all:
li t0, -1
.insn r RISCV_CUSTOM0, 0, 0, x0, t0, x0 # tmc t0
call __init_tls
.insn r RISCV_CUSTOM0, 0, 0, x0, x0, x0 # tmc x0
ret
.section .text
.type vx_wspawn_wait, @function
.global vx_wspawn_wait
vx_wspawn_wait:
csrr t0, VX_CSR_WARP_MASK
li t1, 1
bne t0, t1, vx_wspawn_wait
ret
.section .data
.global __dso_handle
.weak __dso_handle
__dso_handle:
.long 0

124
kernel/src/vx_syscalls.c Normal file
View File

@@ -0,0 +1,124 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <sys/stat.h>
#include <newlib.h>
#include <unistd.h>
#include <vx_intrinsics.h>
#include <vx_print.h>
#include <string.h>
#ifdef __cplusplus
extern "C" {
#endif
int _close(int file) { return -1; }
int _fstat(int file, struct stat *st) { return -1; }
int _isatty(int file) { return 0; }
int _lseek(int file, int ptr, int dir) { return 0; }
int _open(const char *name, int flags, int mode) { return -1; }
int _read(int file, char *ptr, int len) { return -1; }
caddr_t _sbrk(int incr) {
__asm__ __volatile__("ebreak");
return 0;
}
int _write(int file, char *ptr, int len) {
int i;
for (i = 0; i < len; ++i) {
vx_putchar(*ptr++);
}
return len;
}
int _kill(int pid, int sig) { return -1; }
int _getpid() {
return vx_hart_id();
}
void __init_tls(void) {
extern char __tdata_start[];
extern char __tbss_offset[];
extern char __tdata_size[];
extern char __tbss_size[];
// TLS memory initialization
register char *__thread_self __asm__ ("tp");
memcpy(__thread_self, __tdata_start, (size_t)__tdata_size);
memset(__thread_self + (size_t)__tbss_offset, 0, (size_t)__tbss_size);
}
#ifdef HAVE_INITFINI_ARRAY
/* These magic symbols are provided by the linker. */
extern void (*__preinit_array_start []) (void) __attribute__((weak));
extern void (*__preinit_array_end []) (void) __attribute__((weak));
extern void (*__init_array_start []) (void) __attribute__((weak));
extern void (*__init_array_end []) (void) __attribute__((weak));
#ifdef HAVE_INIT_FINI
extern void _init (void);
#endif
/* Iterate over all the init routines. */
void __libc_init_array (void) {
size_t count;
size_t i;
count = __preinit_array_end - __preinit_array_start;
for (i = 0; i < count; i++)
__preinit_array_start[i] ();
#ifdef HAVE_INIT_FINI
_init ();
#endif
count = __init_array_end - __init_array_start;
for (i = 0; i < count; i++)
__init_array_start[i] ();
}
#endif
#ifdef HAVE_INITFINI_ARRAY
extern void (*__fini_array_start []) (void) __attribute__((weak));
extern void (*__fini_array_end []) (void) __attribute__((weak));
#ifdef HAVE_INIT_FINI
extern void _fini (void);
#endif
/* Run all the cleanup routines. */
void __libc_fini_array (void) {
size_t count;
size_t i;
count = __fini_array_end - __fini_array_start;
for (i = count; i > 0; i--)
__fini_array_start[i-1] ();
#ifdef HAVE_INIT_FINI
_fini ();
#endif
}
#endif
#ifdef __cplusplus
}
#endif