more renaming and cleanup

This commit is contained in:
Richard Yan
2025-01-29 21:22:41 -08:00
parent f98cd9bc22
commit 0d842a5930
348 changed files with 6 additions and 136287 deletions

0
lib/.gitignore vendored Normal file
View File

67
lib/Makefile Normal file
View File

@@ -0,0 +1,67 @@
# Builds the libvortexrt static runtime library and its disassembly dump.
#
# Overridable variables:
#   XLEN     RISC-V register width, 32 (default) or 64.
#   TOOLDIR  directory under which the toolchains are looked up.
XLEN ?= 32
TOOLDIR ?= /opt

# Toolchain directory and -march/-mabi pair that match XLEN.
ifeq ($(XLEN),64)
RISCV_TOOLCHAIN_PATH ?= $(TOOLDIR)/riscv64-gnu-toolchain
CFLAGS += -march=rv64imafd -mabi=lp64d
else
RISCV_TOOLCHAIN_PATH ?= $(TOOLDIR)/riscv-gnu-toolchain
CFLAGS += -march=rv32imaf -mabi=ilp32f
endif

RISCV_PREFIX ?= riscv$(XLEN)-unknown-elf
RISCV_SYSROOT ?= $(RISCV_TOOLCHAIN_PATH)/$(RISCV_PREFIX)

# LLVM settings. They are only consumed by the commented-out CC/CXX/DP/CP
# definitions below; the active build uses the GNU toolchain.
LLVM_VORTEX ?= $(TOOLDIR)/llvm-vortex
LLVM_CFLAGS += --sysroot=$(RISCV_SYSROOT)
LLVM_CFLAGS += --gcc-toolchain=$(RISCV_TOOLCHAIN_PATH)
LLVM_CFLAGS += -Xclang -target-feature -Xclang +vortex -mllvm -vortex-branch-divergence=0
#LLVM_CFLAGS += -I$(RISCV_SYSROOT)/include/c++/9.2.0/$(RISCV_PREFIX)
#LLVM_CFLAGS += -I$(RISCV_SYSROOT)/include/c++/9.2.0
#LLVM_CFLAGS += -Wl,-L$(RISCV_TOOLCHAIN_PATH)/lib/gcc/$(RISCV_PREFIX)/9.2.0
#LLVM_CFLAGS += --rtlib=libgcc
#CC = $(LLVM_VORTEX)/bin/clang $(LLVM_CFLAGS)
#CXX = $(LLVM_VORTEX)/bin/clang++ $(LLVM_CFLAGS)
#DP = $(LLVM_VORTEX)/bin/llvm-objdump
#CP = $(LLVM_VORTEX)/bin/llvm-objcopy

# GNU cross tools.
CC = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-gcc
CXX = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-g++
AR = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-gcc-ar
DP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objdump
CP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objcopy

CFLAGS += -O3 -mcmodel=medany -fno-exceptions -nostartfiles -fdata-sections -ffunction-sections
CFLAGS += -I./include
# Exposes the register width to the sources as XLEN_32 / XLEN_64.
CFLAGS += -DXLEN_$(XLEN)

PROJECT = libvortexrt

SRCS = ./src/vx_start.S ./src/vx_syscalls.c ./src/vx_print.S ./src/tinyprintf.c ./src/vx_print.c ./src/vx_spawn.c ./src/vx_serial.S ./src/vx_perf.c

# Objects land in this directory and keep the source extension in their
# name (foo.c -> foo.c.o), which is what the pattern rules below match.
OBJS := $(addsuffix .o, $(notdir $(SRCS)))

# `all` and `clean` name actions, not files: without .PHONY a stray file
# called "all" or "clean" would silently disable the target.
.PHONY: all clean

all: $(PROJECT).a $(PROJECT).dump

$(PROJECT).dump: $(PROJECT).a
	$(DP) -D $(PROJECT).a > $(PROJECT).dump

%.S.o: src/%.S
	$(CC) $(CFLAGS) -c $< -o $@

%.cpp.o: src/%.cpp include/vx_spawn.h
	$(CXX) $(CFLAGS) -c $< -o $@

%.c.o: src/%.c include/vx_spawn.h
	$(CC) $(CFLAGS) -c $< -o $@

$(PROJECT).a: $(OBJS)
	$(AR) rcs $@ $^

# Dependency listing; nothing in this Makefile includes the generated file.
.depend: $(SRCS)
	$(CC) $(CFLAGS) -MM $^ > .depend;

clean:
	rm -rf *.a *.o *.dump .depend

View File

@@ -0,0 +1,24 @@
// See LICENSE for license details.
// Accumulator command macros layered on ROCC_INSTRUCTION (from xcustom.h).
#ifndef SRC_MAIN_C_ACCUMULATOR_H
#define SRC_MAIN_C_ACCUMULATOR_H
#include "rocc-software/src/xcustom.h"
// Operation codes, passed as the last argument of ROCC_INSTRUCTION by the
// wrapper macros below.
#define k_DO_WRITE 0
#define k_DO_READ 1
#define k_DO_LOAD 2
#define k_DO_ACCUM 3
// First argument handed to ROCC_INSTRUCTION by every macro in this header.
// NOTE(review): gemmini_params.h also defines XCUSTOM_ACC (as 3); including
// both headers in one translation unit would redefine it - confirm they are
// never combined.
#define XCUSTOM_ACC 0
// Each wrapper forwards (XCUSTOM_ACC, y, <operand>, rocc_rd, <op code>) to
// ROCC_INSTRUCTION. The expansion already ends in ';', so `doWrite(...);`
// yields an extra empty statement - avoid using these as the unbraced body
// of an if/else.
#define doWrite(y, rocc_rd, data) \
ROCC_INSTRUCTION(XCUSTOM_ACC, y, data, rocc_rd, k_DO_WRITE);
// Same as doWrite but with a literal 0 in the operand position.
#define doRead(y, rocc_rd) \
ROCC_INSTRUCTION(XCUSTOM_ACC, y, 0, rocc_rd, k_DO_READ);
// Passes mem_addr in the operand position with the k_DO_LOAD code.
#define doLoad(y, rocc_rd, mem_addr) \
ROCC_INSTRUCTION(XCUSTOM_ACC, y, mem_addr, rocc_rd, k_DO_LOAD);
// Passes data in the operand position with the k_DO_ACCUM code.
#define doAccum(y, rocc_rd, data) \
ROCC_INSTRUCTION(XCUSTOM_ACC, y, data, rocc_rd, k_DO_ACCUM);
#endif // SRC_MAIN_C_ACCUMULATOR_H

View File

@@ -0,0 +1,10 @@
// See LICENSE for license details.
// Pulls in the RoCC helper macros and defines the XCUSTOM_CHAR constant.
#ifndef SRC_MAIN_C_CHARACTER_H
#define SRC_MAIN_C_CHARACTER_H
#include "rocc-software/src/xcustom.h"
// Presumably the custom-opcode slot used by a "character" accelerator, by
// analogy with XCUSTOM_ACC / XCUSTOM_TRANS in the sibling headers - nothing
// in this header uses it, so confirm against the callers.
#define XCUSTOM_CHAR 2
#endif // SRC_MAIN_C_CHARACTER_H

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,79 @@
// See LICENSE for license details.
// Numeric identifiers for performance counters.
#ifndef COUNTER_H_
#define COUNTER_H_
// Identifier 0 is reserved for "no counter".
#define DISABLE 0
// Highest identifier of the first group below (1..44); the identifiers of
// the second group are expressed as offsets from this value.
#define INCREMENTAL_COUNTERS 44
// All existing Gemmini performance counters
// First group: identifiers 1..INCREMENTAL_COUNTERS.
#define MAIN_LD_CYCLES 1
#define MAIN_ST_CYCLES 2
#define MAIN_EX_CYCLES 3
#define MAIN_LD_ST_CYCLES 4
#define MAIN_LD_EX_CYCLES 5
#define MAIN_ST_EX_CYCLES 6
#define MAIN_LD_ST_EX_CYCLES 7
#define LOAD_DMA_WAIT_CYCLE 8
#define LOAD_ACTIVE_CYCLE 9
#define LOAD_SCRATCHPAD_WAIT_CYCLE 10
#define STORE_DMA_WAIT_CYCLE 11
#define STORE_ACTIVE_CYCLE 12
#define STORE_POOLING_CYCLE 13
#define STORE_SCRATCHPAD_WAIT_CYCLE 14
#define DMA_TLB_MISS_CYCLE 15
#define DMA_TLB_HIT_REQ 16
#define DMA_TLB_TOTAL_REQ 17
#define RDMA_ACTIVE_CYCLE 18
#define RDMA_TLB_WAIT_CYCLES 19
#define RDMA_TL_WAIT_CYCLES 20
#define WDMA_ACTIVE_CYCLE 21
#define WDMA_TLB_WAIT_CYCLES 22
#define WDMA_TL_WAIT_CYCLES 23
#define EXE_ACTIVE_CYCLE 24
#define EXE_FLUSH_CYCLE 25
#define EXE_CONTROL_Q_BLOCK_CYCLE 26
#define EXE_PRELOAD_HAZ_CYCLE 27
#define EXE_OVERLAP_HAZ_CYCLE 28
#define SCRATCHPAD_A_WAIT_CYCLE 29
#define SCRATCHPAD_B_WAIT_CYCLE 30
#define SCRATCHPAD_D_WAIT_CYCLE 31
#define ACC_A_WAIT_CYCLE 32
#define ACC_B_WAIT_CYCLE 33
#define ACC_D_WAIT_CYCLE 34
#define A_GARBAGE_CYCLES 35
#define B_GARBAGE_CYCLES 36
#define D_GARBAGE_CYCLES 37
#define IM2COL_MEM_CYCLES 38
#define IM2COL_ACTIVE_CYCLES 39
#define IM2COL_TRANSPOSER_WAIT_CYCLE 40
#define RESERVATION_STATION_FULL_CYCLES 41
#define RESERVATION_STATION_ACTIVE_CYCLES 42
#define LOOP_MATMUL_ACTIVE_CYCLES 43
#define TRANSPOSE_PRELOAD_UNROLLER_ACTIVE_CYCLES 44
// Second group: identifiers INCREMENTAL_COUNTERS + 1 .. + 7 (45..51).
// NOTE(review): presumably these are read differently from the first group
// (counts/bytes/latencies rather than per-cycle increments) - confirm
// against the hardware documentation.
#define RESERVATION_STATION_LD_COUNT (INCREMENTAL_COUNTERS + 1)
#define RESERVATION_STATION_ST_COUNT (INCREMENTAL_COUNTERS + 2)
#define RESERVATION_STATION_EX_COUNT (INCREMENTAL_COUNTERS + 3)
#define RDMA_BYTES_REC (INCREMENTAL_COUNTERS + 4)
#define WDMA_BYTES_SENT (INCREMENTAL_COUNTERS + 5)
#define RDMA_TOTAL_LATENCY (INCREMENTAL_COUNTERS + 6)
#define WDMA_TOTAL_LATENCY (INCREMENTAL_COUNTERS + 7)
#endif

View File

@@ -0,0 +1,576 @@
#ifndef GEMMINI_NN_H
#define GEMMINI_NN_H
#include <stdio.h>
#include <string.h>
#include <stdbool.h>
#ifndef BAREMETAL
#include <sys/mman.h>
#endif
#include "include/gemmini.h"
#include "include/gemmini_testutils.h"
// Per-layer convolution description consumed by the helpers in this header.
struct ConvParams {
// Batch loop bound in im2col*/resadd*.
int batch_size;
// Input image extent; used for the in-bounds checks and for flattening
// (batch, row, col) into a matrix row.
int in_row_dim;
int in_col_dim;
// Output image extent; used to flatten (batch, row, col) into an output
// matrix row, and as the pooling input extent in pool_with_col2im.
int out_row_dim;
int out_col_dim;
// Square kernel edge length (loop bound for both kernel axes).
int kernel_size;
// Channel loop bounds: in_channels in im2col*, out_channels in resadd*.
int in_channels;
int out_channels;
// Not referenced by the helpers in this header; presumably matrix row
// strides for the input/weight/output buffers - confirm with callers.
int in_stride;
int weight_stride;
int out_stride;
// Convolution stride and zero-padding, in pixels.
int stride;
int padding;
// When true, conv_dw* start each accumulation from bias[channel].
bool bias;
// Not referenced by the helpers in this header.
bool depthwise;
// Not referenced by the helpers in this header.
int n_patches;
int patch_size;
// Scale passed to ACC_SCALE on the accumulated result in conv_dw*.
acc_scale_t output_scale;
// Scale passed to MVIN_SCALE on operand A in resadd*.
scale_t res_scale;
// Pooling window size/stride/padding read by pool*; out_dim_pooled is the
// row and column loop bound in resadd*.
int pool_size, pool_stride, pool_padding, out_dim_pooled;
// Not referenced by the helpers in this header; presumably the matmul
// dimensions of the layer once lowered with im2col - confirm.
int I, J, K;
};
// Fully-connected layer description. None of the helpers in this header
// read it, so the field notes below are inferred from the names only.
struct FcParams {
int batch_size;
// Presumably the input / output vector lengths - confirm with callers.
int in_features;
int out_features;
// Same type as ConvParams::output_scale.
acc_scale_t output_scale;
bool bias;
// Presumably the matmul dimensions of the layer - confirm with callers.
int I, J, K;
};
// Prints a histogram of a 4-D array: for every value in [-128, 127] that
// occurs at least once, one line "<value>: <count> times".
// IMAGES must be a real 4-D array (not a pointer): the loop bounds come from
// sizeof, which is only meaningful before array-to-pointer decay.
// The expansion is a bare `for` statement that declares num, count, i, j, k
// and l, so the argument must not depend on variables with those names, and
// it makes one full pass over the array per candidate value (256 passes).
#define HIST_IMAGES(IMAGES) \
for (int num = -128; num <= 127; num++) { \
int count = 0; \
for (int i = 0; i < sizeof(IMAGES)/sizeof(IMAGES[0]); i++) { \
for (int j = 0; j < sizeof(IMAGES[0])/sizeof(IMAGES[0][0]); j++) { \
for (int k = 0; k < sizeof(IMAGES[0][0])/sizeof(IMAGES[0][0][0]); k++) { \
for (int l = 0; l < sizeof(IMAGES[0][0][0])/sizeof(IMAGES[0][0][0][0]); l++) { \
if (IMAGES[i][j][k][l] == num) { \
count++; \
} \
} \
} \
} \
} \
if (count > 0) \
printf("%d: %d times\n", num, count); \
}
// Prints a histogram of a 2-D array: for every value in [-128, 127] that
// occurs at least once, one line "<value>: <count> times".
// MATRIX must be a real 2-D array (not a pointer) because the loop bounds
// are derived with sizeof. The expansion declares num, count, i and j, so
// the argument must not depend on variables with those names.
#define HIST_MATRIX(MATRIX) \
for (int num = -128; num <= 127; num++) { \
int count = 0; \
for (int i = 0; i < sizeof(MATRIX)/sizeof(MATRIX[0]); i++) { \
for (int j = 0; j < sizeof(MATRIX[0])/sizeof(MATRIX[0][0]); j++) { \
if (MATRIX[i][j] == num) { \
count++; \
} \
} \
} \
if (count > 0) \
printf("%d: %d times\n", num, count); \
}
// Tiled matrix multiplication with caller-supplied tiling factors.
//
// Forwards to tiled_matmul() using row strides derived from the matrix
// shapes (dim_K for A, dim_J for B, D and C) and identity input scaling.
// When `check` is set, the product is recomputed with tiled_matmul_auto()
// in CPU mode into a stack-allocated dim_I x dim_J buffer and compared with
// C element by element; on a mismatch the layer name is printed and the
// program exits with status 1.
static void tiled_matmul_nn(size_t dim_I, size_t dim_J, size_t dim_K,
        const elem_t A[dim_I][dim_K], const elem_t B[dim_K][dim_J],
        const void * D, elem_t C[dim_I][dim_J],
        int act, acc_scale_t scale, bool repeating_bias,
        size_t tile_I, size_t tile_J, size_t tile_K,
        enum tiled_matmul_type_t tiled_matmul_type,
        bool check, char * layer_name)
{
    if (check) {
        printf("%s: gemmini\n", layer_name);
    }

    tiled_matmul(dim_I, dim_J, dim_K,
        (elem_t*)A, (elem_t*)B, D, (elem_t*)C,
        dim_K, dim_J, dim_J, dim_J,
        MVIN_SCALE_IDENTITY, MVIN_SCALE_IDENTITY, MVIN_SCALE_IDENTITY,
        act, scale, 0, repeating_bias,
        tile_I, tile_J, tile_K,
        false, false,
        false, false,
        0,
        tiled_matmul_type);

    // Nothing else to do unless the caller asked for verification.
    if (!check) {
        return;
    }

    printf("%s: CPU\n", layer_name);

    // Reference result, computed with automatic tiling on the CPU.
    elem_t gold[dim_I][dim_J];
    tiled_matmul_auto(dim_I, dim_J, dim_K,
        (elem_t*)A, (elem_t*)B, D, (elem_t*)gold,
        dim_K, dim_J, dim_J, dim_J,
        MVIN_SCALE_IDENTITY, MVIN_SCALE_IDENTITY, MVIN_SCALE_IDENTITY,
        act, scale, 0, repeating_bias,
        false, false,
        false, false,
        0,
        CPU);

    if (MAT_IS_EQUAL(dim_I, dim_J, C, gold)) {
        return;
    }

    printf("Layer calculated incorrectly: %s\n", layer_name);
    exit(1);
}
// Tiled matrix multiplication with automatically chosen tiling factors.
//
// Row strides are derived from the shapes: A_stride = dim_K and
// B_stride = C_stride = D_stride = dim_J. When `check` is set, the result
// is recomputed in CPU mode into a stack-allocated dim_I x dim_J buffer and
// compared with C; a mismatch prints the layer name and exits with status 1.
static void tiled_matmul_nn_auto(size_t dim_I, size_t dim_J, size_t dim_K,
        const elem_t A[dim_I][dim_K], const elem_t B[dim_K][dim_J],
        const void * D, elem_t C[dim_I][dim_J],
        int act, acc_scale_t scale, bool repeating_bias,
        enum tiled_matmul_type_t tiled_matmul_type,
        bool check, char * layer_name)
{
    if (check) {
        printf("%s: gemmini\n", layer_name);
    }

    tiled_matmul_auto(dim_I, dim_J, dim_K,
        (elem_t*)A, (elem_t*)B, D, (elem_t*)C,
        dim_K, dim_J, dim_J, dim_J,
        MVIN_SCALE_IDENTITY, MVIN_SCALE_IDENTITY, MVIN_SCALE_IDENTITY,
        act, scale, 0, repeating_bias,
        false, false,
        false, false,
        0,
        tiled_matmul_type);

    // Nothing else to do unless the caller asked for verification.
    if (!check) {
        return;
    }

    printf("%s: CPU\n", layer_name);

    // Reference result, computed on the CPU.
    elem_t gold[dim_I][dim_J];
    tiled_matmul_auto(dim_I, dim_J, dim_K,
        (elem_t*)A, (elem_t*)B, D, (elem_t*)gold,
        dim_K, dim_J, dim_J, dim_J,
        MVIN_SCALE_IDENTITY, MVIN_SCALE_IDENTITY, MVIN_SCALE_IDENTITY,
        act, scale, 0, repeating_bias,
        false, false,
        false, false,
        0,
        CPU);

    if (MAT_IS_EQUAL(dim_I, dim_J, C, gold)) {
        return;
    }

    printf("Layer calculated incorrectly: %s\n", layer_name);
    exit(1);
}
// need to specify stride
// auto tiling calc
// Thin wrapper around tiled_matmul_auto() for matrices whose row strides
// differ from their logical widths. No verification pass is performed.
// C_stride is passed twice in a row; going by the stride comment on
// tiled_matmul_nn_auto the second slot is presumably D's stride - confirm
// against the tiled_matmul_auto prototype.
// NOTE(review): C is declared `const elem_t *` but the const is cast away
// before it is handed to tiled_matmul_auto() in the output position.
static void tiled_matmul_nn_stride_auto(size_t dim_I, size_t dim_J, size_t dim_K,
const size_t A_stride, const size_t B_stride, const size_t C_stride,
const elem_t * A, const elem_t * B, const void * D, const elem_t * C,
int act, acc_scale_t scale, bool repeating_bias,
enum tiled_matmul_type_t tiled_matmul_type)
{
tiled_matmul_auto(dim_I, dim_J, dim_K,
(elem_t*)A, (elem_t*)B, D, (elem_t*)C,
A_stride, B_stride, C_stride, C_stride,
MVIN_SCALE_IDENTITY, MVIN_SCALE_IDENTITY, MVIN_SCALE_IDENTITY,
act, scale, 0, repeating_bias,
false, false,
false, false,
0,
tiled_matmul_type);
}
// Depthwise convolution on the CPU, reading a 4-D image tensor and writing
// a 2-D matrix.
//
// input  is indexed [batch][row][col][channel].
// weight is indexed [channel][kernel_row][kernel_col].
// output is indexed [flattened (batch, out_row, out_col)][channel].
//
// The size_t arguments shape the array parameters and bound the outer
// loops; stride, padding, kernel size, the in-bounds limits and the output
// flattening all come from `params`. Taps that fall outside the image are
// skipped. The accumulated value is clamped at zero from below, passed
// through ACC_SCALE with params->output_scale, and saturated to
// [elem_t_min, elem_t_max] before it is stored.
static void conv_dw(size_t I, size_t J,
        const size_t batch_size, const size_t channels,
        const size_t in_row_dim, const size_t in_col_dim,
        const size_t out_row_dim, const size_t out_col_dim,
        const size_t kernel_size,
        const elem_t input[batch_size][in_row_dim][in_col_dim][channels],
        const elem_t weight[channels][kernel_size][kernel_size],
        const acc_t * bias,
        // elem_t output [batch_size][out_row_dim][out_col_dim][channels],
        elem_t output [I][J],
        const struct ConvParams * params)
{
    for (int b = 0; b < batch_size; b++) {
        for (int ch = 0; ch < channels; ch++) {
            for (int orow = 0; orow < out_row_dim; orow++) {
                for (int ocol = 0; ocol < out_col_dim; ocol++) {
                    // Top-left input coordinate covered by this output pixel.
                    const int row_origin = orow * params->stride - params->padding;
                    const int col_origin = ocol * params->stride - params->padding;

                    acc_t sum = 0;
                    if (params->bias) {
                        sum = bias[ch];
                    }

                    for (int krow = 0; krow < params->kernel_size; krow++) {
                        const int in_row = row_origin + krow;

                        for (int kcol = 0; kcol < params->kernel_size; kcol++) {
                            const int in_col = col_origin + kcol;

                            const bool inside =
                                in_row >= 0 && in_row < params->in_row_dim &&
                                in_col >= 0 && in_col < params->in_col_dim;
                            if (!inside) {
                                continue;   // padding contributes nothing
                            }

                            sum += input[b][in_row][in_col][ch] * weight[ch][krow][kcol];
                        }
                    }

                    // Negative accumulations are clamped to zero.
                    if (sum < 0) {
                        sum = 0;
                    }

                    acc_t scaled = ACC_SCALE(sum, params->output_scale);
                    if (scaled > elem_t_max) {
                        scaled = elem_t_max;
                    } else if (scaled < elem_t_min) {
                        scaled = elem_t_min;
                    }

                    // Flatten (batch, out_row, out_col) into a matrix row.
                    const size_t out_flat = b * params->out_row_dim * params->out_col_dim + orow * params->out_col_dim + ocol;
                    output[out_flat][ch] = scaled;
                }
            }
        }
    }
}
// Depthwise convolution on the CPU where both the input and the output are
// 2-D matrices indexed [flattened (batch, row, col)][channel].
//
// weight is indexed [channel][kernel_row][kernel_col]. The size_t arguments
// shape the array parameters and bound the outer loops; stride, padding,
// kernel size, the in-bounds limits and both flattenings come from
// `params`. Taps outside the image are skipped. The accumulated value is
// clamped at zero from below, passed through ACC_SCALE with
// params->output_scale, and saturated to [elem_t_min, elem_t_max].
static void conv_dw_with_col2im(size_t prev_I, size_t prev_J, size_t I, size_t J,
        const size_t batch_size, const size_t channels,
        const size_t out_row_dim, const size_t out_col_dim, const size_t kernel_size,
        const elem_t input[prev_I][prev_J],
        const elem_t weight[channels][kernel_size][kernel_size],
        const acc_t * bias,
        // elem_t output [batch_size][out_dim][out_dim][channels],
        elem_t output [I][J],
        const struct ConvParams * params)
{
    for (int b = 0; b < batch_size; b++) {
        for (int ch = 0; ch < channels; ch++) {
            for (int orow = 0; orow < out_row_dim; orow++) {
                for (int ocol = 0; ocol < out_col_dim; ocol++) {
                    // Top-left input coordinate covered by this output pixel.
                    const int row_origin = orow * params->stride - params->padding;
                    const int col_origin = ocol * params->stride - params->padding;

                    acc_t sum = 0;
                    if (params->bias) {
                        sum = bias[ch];
                    }

                    for (int krow = 0; krow < params->kernel_size; krow++) {
                        const int in_row = row_origin + krow;

                        for (int kcol = 0; kcol < params->kernel_size; kcol++) {
                            const int in_col = col_origin + kcol;

                            const bool inside =
                                in_row >= 0 && in_row < params->in_row_dim &&
                                in_col >= 0 && in_col < params->in_col_dim;
                            if (!inside) {
                                continue;   // padding contributes nothing
                            }

                            // Flatten (batch, in_row, in_col) into an input matrix row.
                            const size_t in_flat = b * params->in_row_dim * params->in_col_dim + in_row * params->in_col_dim + in_col;
                            sum += input[in_flat][ch] * weight[ch][krow][kcol];
                        }
                    }

                    // Negative accumulations are clamped to zero.
                    if (sum < 0) {
                        sum = 0;
                    }

                    acc_t scaled = ACC_SCALE(sum, params->output_scale);
                    if (scaled > elem_t_max) {
                        scaled = elem_t_max;
                    } else if (scaled < elem_t_min) {
                        scaled = elem_t_min;
                    }

                    // Flatten (batch, out_row, out_col) into an output matrix row.
                    const size_t out_flat = b * params->out_row_dim * params->out_col_dim + orow * params->out_col_dim + ocol;
                    output[out_flat][ch] = scaled;
                }
            }
        }
    }
}
// Unrolls a 4-D image tensor (indexed [batch][row][col][channel]) into the
// 2-D patch matrix `output`: one row per (batch, window position) and one
// column per (filter_row, filter_col, channel), in that nesting order.
// The size_t arguments only shape the array parameters; every loop bound
// (batch size, image extent, kernel size, padding, stride, channel count)
// is read from `params`.
// Window elements that fall outside the image are NOT written (the zeroing
// store is commented out), so `output` presumably has to be zero-filled by
// the caller beforehand - confirm.
static void im2col(size_t batch_size, size_t channels, size_t im_row_dim, size_t im_col_dim,
size_t I, size_t K,
const elem_t input[batch_size][im_row_dim][im_col_dim][channels],
elem_t output[I][K],
const struct ConvParams * params)
{
// Output row index; advances once per window position across all batches.
int patch_row = 0;
for (int n_batch = 0; n_batch < params->batch_size; n_batch++) {
// im_row/im_col are the top-left corner of the window and start at
// -padding so that the padded border is covered.
for (int im_row = -params->padding; im_row < params->in_row_dim - params->kernel_size + params->padding + 1; im_row += params->stride) {
for (int im_col = -params->padding; im_col < params->in_col_dim - params->kernel_size + params->padding + 1; im_col += params->stride) {
// Output column index; advances for every window element, including
// the out-of-image ones that are skipped.
int patch_col = 0;
for (int filter_row = 0; filter_row < params->kernel_size; filter_row++) {
for (int filter_col = 0; filter_col < params->kernel_size; filter_col++) {
for (int im_channel = 0; im_channel < params->in_channels; im_channel++) {
int pixel_row = im_row + filter_row;
int pixel_col = im_col + filter_col;
if (pixel_row < 0 || pixel_row >= params->in_row_dim
|| pixel_col < 0 || pixel_col >= params->in_col_dim) {
// output[patch_row][patch_col] = 0;
} else {
output[patch_row][patch_col] = input[n_batch][pixel_row][pixel_col][im_channel];
}
patch_col++;
}
}
}
patch_row++;
}
}
}
}
// Same unrolling as im2col(), but the source image is itself stored as a
// 2-D matrix indexed [flattened (batch, row, col)][channel].
// Produces one output row per (batch, window position) and one output
// column per (filter_row, filter_col, channel). All loop bounds come from
// `params`; the size_t arguments only shape the array parameters.
// Window elements outside the image are NOT written (the zeroing store is
// commented out), so `output` presumably has to be zero-filled by the
// caller beforehand - confirm.
static void im2col_with_col2im(size_t prev_I, size_t prev_J,
size_t next_I, size_t next_K,
const elem_t input[prev_I][prev_J],
elem_t output[next_I][next_K],
const struct ConvParams * params)
{
// Output row index; advances once per window position across all batches.
int out_row = 0;
for (int n_batch = 0; n_batch < params->batch_size; n_batch++) {
// im_row/im_col are the top-left corner of the window and start at
// -padding so that the padded border is covered.
for (int im_row = -params->padding; im_row < params->in_row_dim - params->kernel_size + params->padding + 1; im_row += params->stride) {
for (int im_col = -params->padding; im_col < params->in_col_dim - params->kernel_size + params->padding + 1; im_col += params->stride) {
// Output column index; advances for every window element, including
// the out-of-image ones that are skipped.
int out_col = 0;
for (int filter_row = 0; filter_row < params->kernel_size; filter_row++) {
for (int filter_col = 0; filter_col < params->kernel_size; filter_col++) {
for (int im_channel = 0; im_channel < params->in_channels; im_channel++) {
int pixel_row = im_row + filter_row;
int pixel_col = im_col + filter_col;
if (pixel_row < 0 || pixel_row >= params->in_row_dim
|| pixel_col < 0 || pixel_col >= params->in_col_dim) {
// output[out_row][out_col] = 0;
} else {
// Flatten (batch, pixel_row, pixel_col) into the input matrix row.
int in_row = n_batch * params->in_row_dim * params->in_col_dim + pixel_row * params->in_col_dim + pixel_col;
int in_col = im_channel;
output[out_row][out_col] = input[in_row][in_col];
}
out_col++;
}
}
}
out_row++;
}
}
}
}
// Saturating element-wise vector add: C[i] = MVIN_SCALE(A[i], A_shift) + B[i],
// with each sum limited to the range [elem_t_min, elem_t_max] before it is
// stored. `len` is the number of elements in each of A, B and C.
void vecadd(size_t len, const elem_t * A, const elem_t * B, elem_t * C, scale_t A_shift) {
    size_t idx = 0;

    while (idx < len) {
        acc_t sum = MVIN_SCALE(A[idx], A_shift) + B[idx];

        if (sum > elem_t_max) {
            sum = elem_t_max;
        } else if (sum < elem_t_min) {
            sum = elem_t_min;
        }

        C[idx] = sum;
        idx++;
    }
}
// Residual add over 4-D tensors indexed [batch][row][col][channel]:
//   C = saturate(MVIN_SCALE(A, params->res_scale) + B)
//
// The upper bound is elem_t_max; the lower bound is 0 when `relu` is set and
// elem_t_min otherwise. The size_t arguments only shape the array
// parameters; the loop bounds are params->batch_size,
// params->out_dim_pooled (rows and columns) and params->out_channels.
//
// The lower bound is compared in the operands' own types rather than being
// stored in an `int` first: with a floating-point elem_t_min (for example
// -3.4e38) the conversion to int is out of range, which is undefined
// behaviour and clamps at the wrong value.
void resadd1(const size_t batch_size, const size_t channels, const size_t im_dim,
        const elem_t A[batch_size][im_dim][im_dim][channels],
        const elem_t B[batch_size][im_dim][im_dim][channels],
        elem_t C[batch_size][im_dim][im_dim][channels],
        bool relu,
        const struct ConvParams * params) {
    for (size_t batch = 0; batch < params->batch_size; batch++) {
        for (size_t row = 0; row < params->out_dim_pooled; row++) {
            for (size_t col = 0; col < params->out_dim_pooled; col++) {
                for (size_t channel = 0; channel < params->out_channels; channel++) {
                    acc_t result = MVIN_SCALE(A[batch][row][col][channel], params->res_scale) + B[batch][row][col][channel];

                    if (result > elem_t_max) {
                        result = elem_t_max;
                    } else if (relu) {
                        // ReLU: the floor is zero.
                        if (result < 0) {
                            result = 0;
                        }
                    } else if (result < elem_t_min) {
                        result = elem_t_min;
                    }

                    C[batch][row][col][channel] = result;
                }
            }
        }
    }
}
// Residual add where A is a 2-D matrix indexed
// [flattened (batch, row, col)][channel] and B, C are 4-D tensors indexed
// [batch][row][col][channel]:
//   C = saturate(MVIN_SCALE(A, params->res_scale) + B)
//
// The upper bound is elem_t_max; the lower bound is 0 when `relu` is set and
// elem_t_min otherwise. The size_t arguments only shape the array
// parameters; the loop bounds are params->batch_size,
// params->out_dim_pooled (rows and columns) and params->out_channels.
//
// The lower bound is compared in the operands' own types rather than being
// stored in an `int` first: with a floating-point elem_t_min the conversion
// to int is out of range, which is undefined behaviour and clamps at the
// wrong value.
void resadd2(const size_t I, const size_t J,
        const size_t batch_size, const size_t channels, const size_t im_dim,
        const elem_t A[I][J],
        const elem_t B[batch_size][im_dim][im_dim][channels],
        elem_t C[batch_size][im_dim][im_dim][channels],
        bool relu,
        const struct ConvParams * params) {
    for (size_t batch = 0; batch < params->batch_size; batch++) {
        for (size_t row = 0; row < params->out_dim_pooled; row++) {
            for (size_t col = 0; col < params->out_dim_pooled; col++) {
                // Row of A that holds this (batch, row, col) position.
                size_t r = batch * params->out_dim_pooled * params->out_dim_pooled + row * params->out_dim_pooled + col;

                for (size_t channel = 0; channel < params->out_channels; channel++) {
                    acc_t result = MVIN_SCALE(A[r][channel], params->res_scale) + B[batch][row][col][channel];

                    if (result > elem_t_max) {
                        result = elem_t_max;
                    } else if (relu) {
                        // ReLU: the floor is zero.
                        if (result < 0) {
                            result = 0;
                        }
                    } else if (result < elem_t_min) {
                        result = elem_t_min;
                    }

                    C[batch][row][col][channel] = result;
                }
            }
        }
    }
}
// Residual add where A, B and C are all 2-D matrices indexed
// [flattened (batch, row, col)][channel]:
//   C = saturate(MVIN_SCALE(A, params->res_scale) + B)
//
// The upper bound is elem_t_max; the lower bound is 0 when `relu` is set and
// elem_t_min otherwise. I and J only shape the array parameters; the loop
// bounds are params->batch_size, params->out_dim_pooled (rows and columns)
// and params->out_channels.
//
// The lower bound is compared in the operands' own types rather than being
// stored in an `int` first: with a floating-point elem_t_min the conversion
// to int is out of range, which is undefined behaviour and clamps at the
// wrong value.
void resadd3(const size_t I, const size_t J,
        const elem_t A[I][J],
        const elem_t B[I][J],
        elem_t C[I][J],
        bool relu,
        const struct ConvParams * params) {
    for (size_t batch = 0; batch < params->batch_size; batch++) {
        for (size_t row = 0; row < params->out_dim_pooled; row++) {
            for (size_t col = 0; col < params->out_dim_pooled; col++) {
                // Matrix row that holds this (batch, row, col) position.
                size_t r = batch * params->out_dim_pooled * params->out_dim_pooled + row * params->out_dim_pooled + col;

                for (size_t channel = 0; channel < params->out_channels; channel++) {
                    acc_t result = MVIN_SCALE(A[r][channel], params->res_scale) + B[r][channel];

                    if (result > elem_t_max) {
                        result = elem_t_max;
                    } else if (relu) {
                        // ReLU: the floor is zero.
                        if (result < 0) {
                            result = 0;
                        }
                    } else if (result < elem_t_min) {
                        result = elem_t_min;
                    }

                    C[r][channel] = result;
                }
            }
        }
    }
}
// Max pooling over 4-D tensors indexed [batch][row][col][channel].
//
// Window size, stride and padding are read from params->pool_size,
// params->pool_stride and params->pool_padding; the tensor extents are the
// size_t arguments. Each output starts at elem_t_min and takes the maximum
// over its window; a window position that lies outside the input counts as
// the value 0.
void pool(size_t batch_size, size_t channels, size_t in_row_dim, size_t in_col_dim,
        size_t out_row_dim, size_t out_col_dim,
        elem_t input[batch_size][in_row_dim][in_col_dim][channels],
        elem_t output[batch_size][out_row_dim][out_col_dim][channels],
        const struct ConvParams * params)
{
    size_t window = params->pool_size;
    size_t step = params->pool_stride;
    size_t pad = params->pool_padding;

    for (int b = 0; b < batch_size; b++) {
        for (int ch = 0; ch < channels; ch++) {
            for (int orow = 0; orow < out_row_dim; orow++) {
                for (int ocol = 0; ocol < out_col_dim; ocol++) {
                    // Top-left input coordinate of this pooling window.
                    const int row_origin = orow * step - pad;
                    const int col_origin = ocol * step - pad;

                    elem_t best = elem_t_min;

                    for (int wrow = 0; wrow < window; wrow++) {
                        const int in_row = row_origin + wrow;

                        for (int wcol = 0; wcol < window; wcol++) {
                            const int in_col = col_origin + wcol;

                            if (in_row >= 0 && in_row < in_row_dim && in_col >= 0 && in_col < in_col_dim) {
                                if (input[b][in_row][in_col][ch] > best) {
                                    best = input[b][in_row][in_col][ch];
                                }
                            } else if (0 > best) {
                                // Padding behaves like a zero-valued pixel.
                                best = 0;
                            }
                        }
                    }

                    output[b][orow][ocol][ch] = best;
                }
            }
        }
    }
}
// Max pooling whose input is a 2-D matrix indexed
// [flattened (batch, row, col)][channel] and whose output is a 4-D tensor
// indexed [batch][row][col][channel].
//
// Window size, stride and padding are read from params->pool_size,
// params->pool_stride and params->pool_padding; the input image extent is
// params->out_row_dim x params->out_col_dim. Each output starts at
// elem_t_min and takes the maximum over its window; a window position that
// lies outside the input counts as the value 0.
void pool_with_col2im(size_t I, size_t J,
        size_t batch_size, size_t channels, size_t out_row_dim, size_t out_col_dim,
        elem_t input[I][J],
        elem_t output[batch_size][out_row_dim][out_col_dim][channels],
        const struct ConvParams * params)
{
    size_t window = params->pool_size;
    size_t step = params->pool_stride;
    size_t in_row_dim = params->out_row_dim;
    size_t in_col_dim = params->out_col_dim;
    size_t pad = params->pool_padding;

    for (int b = 0; b < batch_size; b++) {
        for (int ch = 0; ch < channels; ch++) {
            for (int orow = 0; orow < out_row_dim; orow++) {
                for (int ocol = 0; ocol < out_col_dim; ocol++) {
                    // Top-left input coordinate of this pooling window.
                    const int row_origin = orow * step - pad;
                    const int col_origin = ocol * step - pad;

                    elem_t best = elem_t_min;

                    for (int wrow = 0; wrow < window; wrow++) {
                        const int in_row = row_origin + wrow;

                        for (int wcol = 0; wcol < window; wcol++) {
                            const int in_col = col_origin + wcol;

                            if (in_row >= 0 && in_row < in_row_dim && in_col >= 0 && in_col < in_col_dim) {
                                // Flatten (batch, in_row, in_col) into the input matrix row.
                                const size_t flat = b * in_row_dim * in_col_dim + in_row * in_col_dim + in_col;
                                if (input[flat][ch] > best) {
                                    best = input[flat][ch];
                                }
                            } else if (0 > best) {
                                // Padding behaves like a zero-valued pixel.
                                best = 0;
                            }
                        }
                    }

                    output[b][orow][ocol][ch] = best;
                }
            }
        }
    }
}
#endif // GEMMINI_NN_H

View File

@@ -0,0 +1,90 @@
// Accelerator configuration: DIM 16 with 16-bit element, accumulator and
// scale types.
#ifndef GEMMINI_PARAMS_H
#define GEMMINI_PARAMS_H
#include <stdint.h>
#include <limits.h>
#define XCUSTOM_ACC 3
// Edge length used for the [DIM][DIM] tiles in gemmini_testutils.h.
#define DIM 16
#define ADDR_LEN 32
#define BANK_NUM 4
#define BANK_ROWS 1024
#define ACC_ROWS 1024
#define MAX_BYTES 64
// MAX_BYTES divided by the size of one DIM-element row; the factor 2 equals
// sizeof(elem_t) / sizeof(acc_t) in this configuration.
#define MAX_BLOCK_LEN (MAX_BYTES/(DIM*2))
#define MAX_BLOCK_LEN_ACC (MAX_BYTES/(DIM*2))
// Elements are stored in a 16-bit unsigned integer; together with the
// *_IS_FLOAT flags and the 5/11 exponent/significand widths below this
// presumably means elem_t carries raw IEEE binary16 bit patterns - confirm.
typedef uint16_t elem_t;
#define ELEM_T_IS_LOWPREC_FLOAT
// 65504 is the largest finite IEEE binary16 value. Note that the limits are
// `float` while elem_t itself is an integer type.
static const float elem_t_max = 65504.0;
static const float elem_t_min = -65504.0;
typedef uint16_t acc_t;
typedef double full_t;
#define ELEM_T_IS_FLOAT
#define ELEM_T_EXP_BITS 5
#define ELEM_T_SIG_BITS 11
#define ACC_T_EXP_BITS 5
#define ACC_T_SIG_BITS 11
// Same-width integer types for the element/accumulator/scale values.
typedef uint16_t elem_t_bits;
typedef uint16_t acc_t_bits;
#define HAS_MVIN_SCALE
typedef uint16_t scale_t;
typedef uint16_t scale_t_bits;
typedef int32_t scale_acc_t;
typedef uint32_t scale_acc_t_bits;
typedef uint16_t acc_scale_t;
typedef uint16_t acc_scale_t_bits;
// Alignment attributes: `blocks` rows of DIM elements / accumulators.
#define row_align(blocks) __attribute__((aligned(blocks*DIM*sizeof(elem_t))))
#define row_align_acc(blocks) __attribute__((aligned(blocks*DIM*sizeof(acc_t))))
// 0x3c00 is the IEEE binary16 bit pattern of 1.0.
#define MVIN_SCALE_IDENTITY 0x3c00
#define ACC_SCALE_IDENTITY 1.0
// Despite the name this is a plain division by 2^shift; no rounding step is
// applied beyond what the division itself does.
#define ROUNDING_RIGHT_SHIFT(x, shift) \
((x) / (1 << (shift)))
// SAME_TYPE(x) names the type of expression x in both C and C++.
#ifdef __cplusplus
#define SAME_TYPE(x) decltype(x)
#else
#define SAME_TYPE(x) typeof(x)
#endif
// Rounds x to the nearest integer, ties to the even neighbour: `i` is x
// truncated toward zero, `next` is one step further from zero, and `rem` is
// the magnitude of the discarded fraction. Uses a GNU statement expression.
#define ROUND_NEAR_EVEN(x) \
({ const SAME_TYPE(x) x_ = (x); \
const long long i = x_; \
const long long next = x_ < 0 ? x_ - 1 : x_ + 1; \
SAME_TYPE(x) rem = x_ - i; \
rem = rem < 0 ? -rem : rem; \
SAME_TYPE(x) result = rem < 0.5 ? i : (rem > 0.5 ? next : ( \
i % 2 == 0 ? i : next)); \
result; })
// Rounding right shift equation: https://riscv.github.io/documents/riscv-v-spec/#_vector_fixed_point_rounding_mode_register_vxrm
// For shift > 0: x >> shift, plus 1 when the highest discarded bit is set
// and either a lower discarded bit is set or the kept result is odd.
// For shift <= 0: a left shift by -shift.
#define ROUNDING_RIGHT_SHIFT_BITS(x, shift) \
((shift) > 0 ? (((x) >> (shift)) + \
(((shift) == 0 ? 0 : (((x) >> ((shift)-1)) & 1)) & \
((((shift) <= 1 ? 0 : ((x) & ((1 << ((shift)-1)) - 1))) != 0) | (((x) >> (shift)) & 1)))) : ((x) << (-(shift))))
// In this configuration ACC_SCALE ignores `scale` and yields x unchanged.
#define ACC_SCALE(x, scale) \
((x))
// NOTE(review): with the 16-bit integer typedefs above this is an integer
// multiplication of the stored values, not a half-precision multiply -
// confirm this is intended for CPU-side reference code.
#define MVIN_SCALE(x, scale) \
((x) * (scale))
// Ignores `scale` and yields x unchanged.
#define MVIN_SCALE_ACC(x, scale) (x)
#define ACC_SCALE_T_IS_FLOAT
#define ACC_SCALE_EXP_BITS 5
#define ACC_SCALE_SIG_BITS 11
#define ACC_READ_SMALL_WIDTH
#define HAS_FIRST_LAYER_OPTIMIZATIONS
#endif // GEMMINI_PARAMS_H

View File

@@ -0,0 +1,92 @@
// Accelerator configuration: DIM 8 with 32-bit float element, accumulator
// and scale types.
#ifndef GEMMINI_PARAMS_H
#define GEMMINI_PARAMS_H
#include <stdint.h>
#include <limits.h>
#define XCUSTOM_ACC 3
// Edge length used for the [DIM][DIM] tiles in gemmini_testutils.h.
#define DIM 8
#define ADDR_LEN 32
#define BANK_NUM 8
#define BANK_ROWS 1024
#define ACC_ROWS 512
#define MAX_BYTES 64
// MAX_BYTES divided by the size of one DIM-element row; the factor 4 equals
// sizeof(elem_t) / sizeof(acc_t) in this configuration.
#define MAX_BLOCK_LEN (MAX_BYTES/(DIM*4))
#define MAX_BLOCK_LEN_ACC (MAX_BYTES/(DIM*4))
typedef float elem_t;
// Largest finite single-precision magnitude, used as the saturation bounds.
static const elem_t elem_t_max = 3.4028235E38;
static const elem_t elem_t_min = -3.4028235E38;
typedef float acc_t;
typedef double full_t;
#define ELEM_T_IS_FLOAT
#define ELEM_T_EXP_BITS 8
#define ELEM_T_SIG_BITS 24
#define ACC_T_EXP_BITS 8
#define ACC_T_SIG_BITS 24
// Same-width unsigned integer types for the float-valued types.
typedef uint32_t elem_t_bits;
typedef uint32_t acc_t_bits;
#define HAS_MVIN_SCALE
typedef float scale_t;
typedef uint32_t scale_t_bits;
#define HAS_MVIN_ACC_SCALE
typedef float scale_acc_t;
typedef uint32_t scale_acc_t_bits;
typedef float acc_scale_t;
typedef uint32_t acc_scale_t_bits;
// Alignment attributes: `blocks` rows of DIM elements / accumulators.
#define row_align(blocks) __attribute__((aligned(blocks*DIM*sizeof(elem_t))))
#define row_align_acc(blocks) __attribute__((aligned(blocks*DIM*sizeof(acc_t))))
#define MVIN_SCALE_IDENTITY 1.0
#define ACC_SCALE_IDENTITY 1.0
// Despite the name this is a plain division by 2^shift; no rounding step is
// applied beyond what the division itself does.
#define ROUNDING_RIGHT_SHIFT(x, shift) \
((x) / (1 << (shift)))
// SAME_TYPE(x) names the type of expression x in both C and C++.
#ifdef __cplusplus
#define SAME_TYPE(x) decltype(x)
#else
#define SAME_TYPE(x) typeof(x)
#endif
// Rounds x to the nearest integer, ties to the even neighbour: `i` is x
// truncated toward zero, `next` is one step further from zero, and `rem` is
// the magnitude of the discarded fraction. Uses a GNU statement expression.
#define ROUND_NEAR_EVEN(x) \
({ const SAME_TYPE(x) x_ = (x); \
const long long i = x_; \
const long long next = x_ < 0 ? x_ - 1 : x_ + 1; \
SAME_TYPE(x) rem = x_ - i; \
rem = rem < 0 ? -rem : rem; \
SAME_TYPE(x) result = rem < 0.5 ? i : (rem > 0.5 ? next : ( \
i % 2 == 0 ? i : next)); \
result; })
// Rounding right shift equation: https://riscv.github.io/documents/riscv-v-spec/#_vector_fixed_point_rounding_mode_register_vxrm
// For shift > 0: x >> shift, plus 1 when the highest discarded bit is set
// and either a lower discarded bit is set or the kept result is odd.
// For shift <= 0: a left shift by -shift. Integer operands only.
#define ROUNDING_RIGHT_SHIFT_BITS(x, shift) \
((shift) > 0 ? (((x) >> (shift)) + \
(((shift) == 0 ? 0 : (((x) >> ((shift)-1)) & 1)) & \
((((shift) <= 1 ? 0 : ((x) & ((1 << ((shift)-1)) - 1))) != 0) | (((x) >> (shift)) & 1)))) : ((x) << (-(shift))))
// All three scaling macros are a plain multiplication in this configuration.
#define ACC_SCALE(x, scale) \
((x) * (scale))
#define MVIN_SCALE(x, scale) \
((x) * (scale))
#define MVIN_SCALE_ACC(x, scale) \
((x) * (scale))
#define ACC_SCALE_T_IS_FLOAT
#define ACC_SCALE_EXP_BITS 8
#define ACC_SCALE_SIG_BITS 24
#define ACC_READ_SMALL_WIDTH
#define ACC_READ_FULL_WIDTH
#define HAS_FIRST_LAYER_OPTIMIZATIONS
#endif // GEMMINI_PARAMS_H

View File

@@ -0,0 +1 @@
gemmini_params.dim16fp16.h

View File

@@ -0,0 +1,285 @@
// See LICENSE for license details.
#ifndef SRC_MAIN_C_GEMMINI_TESTUTILS_H
#define SRC_MAIN_C_GEMMINI_TESTUTILS_H
#undef abs
#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <limits.h>
#include <stdbool.h>
#include "include/gemmini_params.h"
#include "include/gemmini.h"
#ifdef BAREMETAL
#undef assert
// Bare-metal replacement for <assert.h>: on failure, prints the failed
// expression with its file and line, then terminates with exit(1).
//
// The expansion is a single void expression, so `assert(x);` is safe as the
// unbraced body of an if/else. The stringified expression is passed as a
// "%s" argument rather than pasted into the format string, so a '%' inside
// the asserted expression (for example `a % b == 0`) cannot be mistaken for
// a conversion specifier.
#define assert(expr) \
    ((expr) ? (void)0 \
            : (void)(printf("Failed assertion: %s\n %s:%u\n", \
                            #expr, __FILE__, (unsigned)__LINE__), \
                     exit(1)))
#endif
// #define GEMMINI_ASSERTIONS
// Matmul utility functions
// Reference DIM x DIM multiply-accumulate: C_full = D + A * B.
// Each output cell is seeded with the bias D and accumulated in place.
static void matmul(elem_t A[DIM][DIM], elem_t B[DIM][DIM], elem_t D[DIM][DIM], full_t C_full[DIM][DIM]) {
    for (size_t row = 0; row < DIM; row++) {
        for (size_t col = 0; col < DIM; col++) {
            full_t *cell = &C_full[row][col];

            *cell = D[row][col];
            for (size_t inner = 0; inner < DIM; inner++) {
                *cell += A[row][inner]*B[inner][col];
            }
        }
    }
}
// Reference DIM x DIM multiply-accumulate, C = D + A * B, with the result
// accumulated directly in elem_t cells of C.
static void matmul_short(elem_t A[DIM][DIM], elem_t B[DIM][DIM], elem_t D[DIM][DIM], elem_t C[DIM][DIM]) {
    for (size_t row = 0; row < DIM; row++) {
        for (size_t col = 0; col < DIM; col++) {
            elem_t *cell = &C[row][col];

            *cell = D[row][col];
            for (size_t inner = 0; inner < DIM; inner++) {
                *cell += A[row][inner]*B[inner][col];
            }
        }
    }
}
// Reference DIM x DIM multiply-accumulate, C_full = D + A * B, where the
// bias D is already supplied in the wide full_t type.
static void matmul_full(elem_t A[DIM][DIM], elem_t B[DIM][DIM], full_t D[DIM][DIM], full_t C_full[DIM][DIM]) {
    for (size_t row = 0; row < DIM; row++) {
        for (size_t col = 0; col < DIM; col++) {
            full_t *cell = &C_full[row][col];

            *cell = D[row][col];
            for (size_t inner = 0; inner < DIM; inner++) {
                *cell += A[row][inner]*B[inner][col];
            }
        }
    }
}
// Reference multiply-accumulate with A read transposed:
// C_full = D + transpose(A) * B.
static void matmul_A_transposed(elem_t A[DIM][DIM], elem_t B[DIM][DIM], elem_t D[DIM][DIM], full_t C_full[DIM][DIM]) {
    for (size_t row = 0; row < DIM; row++) {
        for (size_t col = 0; col < DIM; col++) {
            full_t *cell = &C_full[row][col];

            *cell = D[row][col];
            for (size_t inner = 0; inner < DIM; inner++) {
                *cell += A[inner][row]*B[inner][col];
            }
        }
    }
}
// Reference multiply-accumulate with A read transposed,
// C = D + transpose(A) * B, accumulated directly in elem_t cells of C.
static void matmul_short_A_transposed(elem_t A[DIM][DIM], elem_t B[DIM][DIM], elem_t D[DIM][DIM], elem_t C[DIM][DIM]) {
    for (size_t row = 0; row < DIM; row++) {
        for (size_t col = 0; col < DIM; col++) {
            elem_t *cell = &C[row][col];

            *cell = D[row][col];
            for (size_t inner = 0; inner < DIM; inner++) {
                *cell += A[inner][row]*B[inner][col];
            }
        }
    }
}
// Reference multiply-accumulate with A read transposed and a full_t bias:
// C_full = D + transpose(A) * B.
static void matmul_full_A_transposed(elem_t A[DIM][DIM], elem_t B[DIM][DIM], full_t D[DIM][DIM], full_t C_full[DIM][DIM]) {
    for (size_t row = 0; row < DIM; row++) {
        for (size_t col = 0; col < DIM; col++) {
            full_t *cell = &C_full[row][col];

            *cell = D[row][col];
            for (size_t inner = 0; inner < DIM; inner++) {
                *cell += A[inner][row]*B[inner][col];
            }
        }
    }
}
// Reference multiply-accumulate with B read transposed:
// C_full = D + A * transpose(B).
static void matmul_B_transposed(elem_t A[DIM][DIM], elem_t B[DIM][DIM], elem_t D[DIM][DIM], full_t C_full[DIM][DIM]) {
    for (size_t row = 0; row < DIM; row++) {
        for (size_t col = 0; col < DIM; col++) {
            full_t *cell = &C_full[row][col];

            *cell = D[row][col];
            for (size_t inner = 0; inner < DIM; inner++) {
                *cell += A[row][inner]*B[col][inner];
            }
        }
    }
}
// Reference multiply-accumulate with B read transposed,
// C = D + A * transpose(B), accumulated directly in elem_t cells of C.
static void matmul_short_B_transposed(elem_t A[DIM][DIM], elem_t B[DIM][DIM], elem_t D[DIM][DIM], elem_t C[DIM][DIM]) {
    for (size_t row = 0; row < DIM; row++) {
        for (size_t col = 0; col < DIM; col++) {
            elem_t *cell = &C[row][col];

            *cell = D[row][col];
            for (size_t inner = 0; inner < DIM; inner++) {
                *cell += A[row][inner]*B[col][inner];
            }
        }
    }
}
// Reference multiply-accumulate with B read transposed and a full_t bias:
// C_full = D + A * transpose(B).
static void matmul_full_B_transposed(elem_t A[DIM][DIM], elem_t B[DIM][DIM], full_t D[DIM][DIM], full_t C_full[DIM][DIM]) {
    for (size_t row = 0; row < DIM; row++) {
        for (size_t col = 0; col < DIM; col++) {
            full_t *cell = &C_full[row][col];

            *cell = D[row][col];
            for (size_t inner = 0; inner < DIM; inner++) {
                *cell += A[row][inner]*B[col][inner];
            }
        }
    }
}
// Reference multiply-accumulate with both operands read transposed:
// C_full = D + transpose(A) * transpose(B).
static void matmul_AB_transposed(elem_t A[DIM][DIM], elem_t B[DIM][DIM], elem_t D[DIM][DIM], full_t C_full[DIM][DIM]) {
    for (size_t row = 0; row < DIM; row++) {
        for (size_t col = 0; col < DIM; col++) {
            full_t *cell = &C_full[row][col];

            *cell = D[row][col];
            for (size_t inner = 0; inner < DIM; inner++) {
                *cell += A[inner][row]*B[col][inner];
            }
        }
    }
}
// Reference multiply-accumulate with both operands read transposed,
// C = D + transpose(A) * transpose(B), accumulated directly in elem_t
// cells of C.
static void matmul_short_AB_transposed(elem_t A[DIM][DIM], elem_t B[DIM][DIM], elem_t D[DIM][DIM], elem_t C[DIM][DIM]) {
    for (size_t row = 0; row < DIM; row++) {
        for (size_t col = 0; col < DIM; col++) {
            elem_t *cell = &C[row][col];

            *cell = D[row][col];
            for (size_t inner = 0; inner < DIM; inner++) {
                *cell += A[inner][row]*B[col][inner];
            }
        }
    }
}
// Reference multiply-accumulate with both operands read transposed and a
// full_t bias: C_full = D + transpose(A) * transpose(B).
static void matmul_full_AB_transposed(elem_t A[DIM][DIM], elem_t B[DIM][DIM], full_t D[DIM][DIM], full_t C_full[DIM][DIM]) {
    for (size_t row = 0; row < DIM; row++) {
        for (size_t col = 0; col < DIM; col++) {
            full_t *cell = &C_full[row][col];

            *cell = D[row][col];
            for (size_t inner = 0; inner < DIM; inner++) {
                *cell += A[inner][row]*B[col][inner];
            }
        }
    }
}
// Element-wise DIM x DIM addition: sum = m1 + m2.
static void matadd(full_t sum[DIM][DIM], full_t m1[DIM][DIM], full_t m2[DIM][DIM]) {
    for (size_t row = 0; row < DIM; row++) {
        for (size_t col = 0; col < DIM; col++) {
            sum[row][col] = m1[row][col] + m2[row][col];
        }
    }
}
// Applies ROUNDING_RIGHT_SHIFT to every element of `full` and narrows the
// result into `out`. In non-float builds the value is first saturated to
// [elem_t_min, elem_t_max]; in float builds it is stored as-is.
static void matshift(full_t full[DIM][DIM], elem_t out[DIM][DIM], int shift) {
    for (size_t row = 0; row < DIM; row++) {
        for (size_t col = 0; col < DIM; col++) {
            // Shift (and round, as defined by the macro) one element.
            full_t shifted = ROUNDING_RIGHT_SHIFT(full[row][col], shift);

#ifdef ELEM_T_IS_FLOAT
            out[row][col] = shifted; // TODO should we also saturate when using floats?
#else
            // Saturate, then narrow to elem_t.
            full_t clamped = shifted > elem_t_max ? elem_t_max : (shifted < elem_t_min ? elem_t_min : shifted);
            out[row][col] = clamped;
#endif
        }
    }
}
// Applies ACC_SCALE with `scale` to every element of `full` and narrows the
// result into `out`. In non-float builds the value is first saturated to
// [elem_t_min, elem_t_max]; in float builds it is stored as-is.
static void matscale(full_t full[DIM][DIM], elem_t out[DIM][DIM], acc_scale_t scale) {
    for (size_t row = 0; row < DIM; row++) {
        for (size_t col = 0; col < DIM; col++) {
            // Scale one element.
            full_t scaled = ACC_SCALE(full[row][col], scale);

#ifdef ELEM_T_IS_FLOAT
            out[row][col] = scaled; // TODO should we also saturate when using floats?
#else
            // Saturate, then narrow to elem_t.
            full_t clamped = scaled > elem_t_max ? elem_t_max : (scaled < elem_t_min ? elem_t_min : scaled);
            out[row][col] = clamped;
#endif
        }
    }
}
// Element-wise ReLU: out = in where in > 0, otherwise 0.
static void matrelu(elem_t in[DIM][DIM], elem_t out[DIM][DIM]) {
    for (size_t row = 0; row < DIM; row++) {
        for (size_t col = 0; col < DIM; col++) {
            out[row][col] = in[row][col] > 0 ? in[row][col] : 0;
        }
    }
}
// Writes the transpose of `in` into `out` (out[c][r] = in[r][c]).
static void transpose(elem_t in[DIM][DIM], elem_t out[DIM][DIM]) {
    for (size_t src_row = 0; src_row < DIM; src_row++) {
        for (size_t src_col = 0; src_col < DIM; src_col++) {
            out[src_col][src_row] = in[src_row][src_col];
        }
    }
}
// Deterministic pseudo-random generator: a 32-bit linear congruential
// generator with a fixed seed of 777. Each call advances the state once and
// returns its top 8 bits, so results lie in [0, 255] and the sequence is
// identical on every run.
int rand() {
    static uint32_t state = 777;

    state = state * 1664525 + 1013904223;

    return state >> 24;
}
#ifdef ELEM_T_IS_FLOAT
// Returns the difference of two pseudo-random ratios. Each ratio divides a
// value in [0, 127] by a value in [1, 64], so the result lies in
// [-127, 127]. Four rand() calls are consumed per invocation.
// NOTE(review): within each ratio the two rand() calls are unsequenced
// relative to each other, so which draw becomes the numerator and which the
// denominator is up to the compiler; the exact sequence may differ between
// toolchains.
double rand_double() {
double a = (double)(rand() % 128) / (double)(1 + (rand() % 64));
double b = (double)(rand() % 128) / (double)(1 + (rand() % 64));
return a - b;
}
#endif
// Prints a DIM x DIM elem_t matrix, one row per line with a space after
// every element. Non-float builds print each element with "%d"; float
// builds print the value returned by elem_t_to_elem_t_bits() in hex.
static void printMatrix(elem_t m[DIM][DIM]) {
    for (size_t row = 0; row < DIM; ++row) {
        for (size_t col = 0; col < DIM; ++col) {
#ifdef ELEM_T_IS_FLOAT
            printf("%x ", elem_t_to_elem_t_bits(m[row][col]));
#else
            printf("%d ", m[row][col]);
#endif
        }
        printf("\n");
    }
}
// Prints a DIM x DIM acc_t matrix, one row per line with a space after
// every element. Non-float builds print each element with "%d"; float
// builds print the value returned by acc_t_to_acc_t_bits() in hex.
static void printMatrixAcc(acc_t m[DIM][DIM]) {
    for (size_t row = 0; row < DIM; ++row) {
        for (size_t col = 0; col < DIM; ++col) {
#ifdef ELEM_T_IS_FLOAT
            printf("%x ", acc_t_to_acc_t_bits(m[row][col]));
#else
            printf("%d ", m[row][col]);
#endif
        }
        printf("\n");
    }
}
// Returns 1 when x and y agree at every position and 0 at the first mismatch.
// In float builds a position where both values are NaN is treated as a match.
static int is_equal(elem_t x[DIM][DIM], elem_t y[DIM][DIM]) {
  for (size_t row = 0; row < DIM; ++row) {
    for (size_t col = 0; col < DIM; ++col) {
#ifdef ELEM_T_IS_FLOAT
      bool x_is_nan = elem_t_isnan(x[row][col]);
      bool y_is_nan = elem_t_isnan(y[row][col]);
      int mismatch = x[row][col] != y[row][col] && !(x_is_nan && y_is_nan);
#else
      int mismatch = x[row][col] != y[row][col];
#endif
      if (mismatch) {
        return 0;
      }
    }
  }
  return 1;
}
// Returns 1 when x equals the transpose of y (x[i][j] matches y[j][i] at every
// position) and 0 at the first mismatch. In float builds a position where
// both values are NaN is treated as a match.
static int is_equal_transposed(elem_t x[DIM][DIM], elem_t y[DIM][DIM]) {
  for (size_t row = 0; row < DIM; ++row) {
    for (size_t col = 0; col < DIM; ++col) {
#ifdef ELEM_T_IS_FLOAT
      bool x_is_nan = elem_t_isnan(x[row][col]);
      bool y_is_nan = elem_t_isnan(y[col][row]);
      int mismatch = x[row][col] != y[col][row] && !(x_is_nan && y_is_nan);
#else
      int mismatch = x[row][col] != y[col][row];
#endif
      if (mismatch) {
        return 0;
      }
    }
  }
  return 1;
}
// Compares the leading dim_i x dim_j elements of two 2-D arrays and evaluates
// to 1 if they are all equal, 0 otherwise.
//
// Implemented with a GNU extension known as statement expressions, so it can
// be used wherever an int expression is expected.
//  - Every macro parameter is parenthesized, so arguments such as `n + 1` or
//    `cond ? a : b` expand as intended.
//  - The locals carry a mat_eq_ prefix so they cannot capture caller
//    identifiers named i, j or result.
//  - The outer loop stops as soon as a mismatch is recorded (the inner `break`
//    alone only leaves the inner loop).
// dim_i and dim_j are re-evaluated on every loop test; pass side-effect-free
// expressions.
#define MAT_IS_EQUAL(dim_i, dim_j, x, y) \
    ({int mat_eq_result = 1; \
      for (size_t mat_eq_i = 0; mat_eq_result && mat_eq_i < (dim_i); mat_eq_i++) \
        for (size_t mat_eq_j = 0; mat_eq_j < (dim_j); ++mat_eq_j) { \
          if ((x)[mat_eq_i][mat_eq_j] != (y)[mat_eq_i][mat_eq_j]) { \
            mat_eq_result = 0; \
            break; \
          } \
        } \
      mat_eq_result;})
// Returns the current value of the RISC-V cycle counter as a 64-bit number.
//
// On RV64 a single rdcycle yields the whole counter. On RV32 rdcycle only
// provides the low 32 bits, so the high half is read with rdcycleh before and
// after the low half and the read is retried if the high half changed in
// between (i.e. the low half wrapped while we were reading).
static uint64_t read_cycles() {
#if defined(__riscv_xlen) && (__riscv_xlen == 32)
    uint32_t hi, lo, hi_again;
    do {
        asm volatile ("rdcycleh %0" : "=r" (hi));
        asm volatile ("rdcycle %0" : "=r" (lo));
        asm volatile ("rdcycleh %0" : "=r" (hi_again));
    } while (hi != hi_again);
    return ((uint64_t)hi << 32) | lo;
#else
    uint64_t cycles;
    asm volatile ("rdcycle %0" : "=r" (cycles));
    return cycles;
#endif

    // Disabled alternative kept from the original: read a memory-mapped
    // "mtime" word instead of the cycle CSR.
    // const uint32_t * mtime = (uint32_t *)(33554432 + 0xbff8);
    // const uint32_t * mtime = (uint32_t *)(33554432 + 0xbffc);
    // return *mtime;
}
#undef abs
#endif // SRC_MAIN_C_GEMMINI_TESTUTILS_H

View File

@@ -0,0 +1,13 @@
// See LICENSE for license details.
#ifndef SRC_MAIN_C_TRANSLATOR_H
#define SRC_MAIN_C_TRANSLATOR_H
#include "rocc-software/src/xcustom.h"
// Custom-instruction number passed as the first argument of ROCC_INSTRUCTION
// by doTranslate.
#define XCUSTOM_TRANS 1
// doTranslate(y, vaddr): expands to ROCC_INSTRUCTION with custom-instruction
// number XCUSTOM_TRANS, destination y, first source vaddr, second source 0 and
// func7 0.
// NOTE(review): the expansion already ends in ';', so writing
// `doTranslate(y, v);` produces an extra empty statement -- keep that in mind
// when using it as the body of an unbraced if/else.
#define doTranslate(y, vaddr) \
ROCC_INSTRUCTION(XCUSTOM_TRANS, y, vaddr, 0, 0);
#endif // SRC_MAIN_C_TRANSLATOR_H

3
lib/gemmini/rocc-software/.gitignore vendored Normal file
View File

@@ -0,0 +1,3 @@
*~
*#
*.#*

View File

@@ -0,0 +1,46 @@
All contributors must agree to the Developer Certificate of Origin Version 1.1. (DCO 1.1) by signing their commits with:
```
DCO 1.1 Signed-off-by: [NAME] <[EMAIL]>
```
The full text of the DCO 1.1 is as follows:
```
Developer Certificate of Origin
Version 1.1
Copyright (C) 2004, 2006 The Linux Foundation and its contributors.
660 York Street, Suite 102,
San Francisco, CA 94110 USA
Everyone is permitted to copy and distribute verbatim copies of this
license document, but changing it is not allowed.
Developer's Certificate of Origin 1.1
By making a contribution to this project, I certify that:
(a) The contribution was created in whole or in part by me and I
have the right to submit it under the open source license
indicated in the file; or
(b) The contribution is based upon previous work that, to the best
of my knowledge, is covered under an appropriate open source
license and I have the right under that license to submit that
work with modifications, whether created in whole or in part
by me, under the same open source license (unless I am
permitted to submit under a different license), as indicated
in the file; or
(c) The contribution was provided directly to me by some other
person who certified (a), (b) or (c) and I have not modified
it.
(d) I understand and agree that this project and the contribution
are public and that a record of the contribution (including all
personal information I submit with it, including my sign-off) is
maintained indefinitely and may be redistributed consistent with
this project or the open source license(s) involved.
```

View File

@@ -0,0 +1,201 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "{}"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright {yyyy} {name of copyright owner}
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

View File

@@ -0,0 +1,4 @@
Rocket Custom Coprocessor (RoCC) Software
========================================
This is a set of C and RISC-V Assembly macros that help with emitting custom RISC-V instructions for talking with Rocket Custom Coprocessors (RoCCs).

View File

@@ -0,0 +1,28 @@
// Copyright 2018 IBM
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef ROCC_SOFTWARE_SRC_RISCV_TEST_ROCC_H_
#define ROCC_SOFTWARE_SRC_RISCV_TEST_ROCC_H_
// Assembly fragment that sets bits in mstatus: it loads a constant into a0 and
// uses csrs to OR it into the CSR. The constant keeps only those bits of
// MSTATUS_XS whose next-higher bit is also set in MSTATUS_XS.
// NOTE(review): presumably MSTATUS_XS is the two-bit XS field mask, in which
// case this selects the field's low bit -- confirm against the header that
// defines MSTATUS_XS. Overwrites a0.
#define RVTEST_XS_ENABLE \
li a0, MSTATUS_XS & (MSTATUS_XS >> 1); \
csrs mstatus, a0;
// Emits an assembler macro named "init" whose body is RVTEST_XS_ENABLE.
#define RVTEST_WITH_ROCC \
.macro init; \
RVTEST_XS_ENABLE \
.endm
#endif // ROCC_SOFTWARE_SRC_RISCV_TEST_ROCC_H_

View File

@@ -0,0 +1,170 @@
// Copyright 2018--2020 IBM
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef ROCC_SOFTWARE_SRC_XCUSTOM_H_
#define ROCC_SOFTWARE_SRC_XCUSTOM_H_
// Two-level stringification: STR1(x) turns x into a string literal exactly as
// written, while STR(x) lets x be macro-expanded first. STR is only defined
// here if the includer has not already defined it.
#define STR1(x) #x
#ifndef STR
#define STR(x) STR1(x)
#endif
// Two-level token pasting: CAT_(A, B) joins A and B exactly as written, while
// CAT(A, B) lets both be macro-expanded before they are joined.
#define CAT_(A, B) A##B
#define CAT(A, B) CAT_(A, B)
/** Assembly macro for creating "raw" Rocket Custom Coprocessor (RoCC)
* assembly language instructions that will return data in rd. These
* are to be used only in assembly language programs (not C/C++).
*
* Example:
*
* Consider the following macro consisting of a CUSTOM_0 instruction
* with func7 "42" that is doing some operation of "a0 = op(a1, a2)":
*
* ROCC_INSTRUCTION_RAW_R_R_R(0, a0, a1, a2, 42)
*
* This will produce the following pseudo assembly language
* instruction:
*
* .insn r CUSTOM_0, 7, 42, a0, a1, a2
*
* @param x the custom instruction number: 0, 1, 2, or 3
* @param rd the destination register, e.g., a0 or x10
* @param rs1 the first source register, e.g., a0 or x10
* @param rs2 the second source register, e.g., a0 or x10
* @param func7 the value of the func7 field
* @return a raw .insn RoCC instruction
*/
#define ROCC_INSTRUCTION_RAW_R_R_R(x, rd, rs1, rs2, func7) \
.insn r CAT(CUSTOM_, x), 7, func7, rd, rs1, rs2
/** Assembly macro for creating "raw" Rocket Custom Coprocessor (RoCC)
* assembly language instructions that will *NOT* return data in rd
* (the destination is hard-coded to x0). These are to be used only in
* assembly language programs (not C/C++).
*
* Example:
*
* Consider the following macro consisting of a CUSTOM_1 instruction
* with func7 "42" that is doing some operation of "op(a1, a2)". *NO*
* data is returned:
*
* ROCC_INSTRUCTION_RAW_0_R_R(1, a1, a2, 42)
*
* This will produce the following pseudo assembly language
* instruction:
*
* .insn r CUSTOM_1, 3, 42, x0, a1, a2
*
* @param x the custom instruction number: 0, 1, 2, or 3
* @param rs1 the first source register, e.g., a0 or x10
* @param rs2 the second source register, e.g., a0 or x10
* @param func7 the value of the func7 field
* @return a raw .insn RoCC instruction
*/
#define ROCC_INSTRUCTION_RAW_0_R_R(x, rs1, rs2, func7) \
.insn r CAT(CUSTOM_, x), 3, func7, x0, rs1, rs2
/** C/C++ inline assembly macro for creating Rocket Custom Coprocessor
* (RoCC) instructions that return data in rd. These are to be used
* only in C/C++ programs (not bare assembly).
*
* This is an alias for ROCC_INSTRUCTION_R_R_R (it forwards all five
* arguments unchanged). See its documentation.
*/
#define ROCC_INSTRUCTION(x, rd, rs1, rs2, func7) \
ROCC_INSTRUCTION_R_R_R(x, rd, rs1, rs2, func7)
/** C/C++ inline assembly macro for creating Rocket Custom Coprocessor
* (RoCC) instructions that return data in C variable rd.
* These are to be used only in C/C++ programs (not bare assembly).
*
* Example:
*
* Consider the following macro consisting of a CUSTOM_2 instruction
* with func7 "42" that is doing some operation of "a0 = op(a1, a2)"
* (where a0, a1, and a2 are variables defined in C):
*
* ROCC_INSTRUCTION_R_R_R(2, a0, a1, a2, 42)
*
* This will produce the following inline assembly, wrapped in a
* braced block:
*
* asm volatile(
* ".insn r CUSTOM_2, 0x7, 42, %0, %1, %2"
* : "=r"(rd)
* : "r"(rs1), "r"(rs2));
*
* @param x the custom instruction number: 0, 1, 2, or 3
* @param rd the C variable to capture as destination operand
* @param rs1 the C variable to capture for first source register
* @param rs2 the C variable to capture for second source register
* @param func7 the value of the func7 field
* @return an inline assembly RoCC instruction
*/
#define ROCC_INSTRUCTION_R_R_R(x, rd, rs1, rs2, func7) \
{ \
asm volatile( \
".insn r " STR(CAT(CUSTOM_, x)) ", " STR(0x7) ", " STR(func7) ", %0, %1, %2" \
: "=r"(rd) \
: "r"(rs1), "r"(rs2)); \
}
/** C/C++ inline assembly macro for creating Rocket Custom Coprocessor
* (RoCC) instructions that do *NOT* return data: the destination
* register is hard-coded to x0 and there is no output operand.
* These are to be used only in C/C++ programs (not bare assembly).
*
* Example:
*
* Consider the following macro consisting of a CUSTOM_3 instruction
* with func7 "42" that is doing some operation of "op(a1, a2)"
* (where a1 and a2 are variables defined in C):
*
* ROCC_INSTRUCTION_0_R_R(3, a1, a2, 42)
*
* This will produce the following inline assembly, wrapped in a
* braced block:
*
* asm volatile(
* ".insn r CUSTOM_3, 0x3, 42, x0, %0, %1"
* :: "r"(rs1), "r"(rs2));
*
* @param x the custom instruction number: 0, 1, 2, or 3
* @param rs1 the C variable to capture for first source register
* @param rs2 the C variable to capture for second source register
* @param func7 the value of the func7 field
* @return an inline assembly RoCC instruction
*/
#define ROCC_INSTRUCTION_0_R_R(x, rs1, rs2, func7) \
{ \
asm volatile( \
".insn r " STR(CAT(CUSTOM_, x)) ", " STR(0x3) ", " STR(func7) ", x0, %0, %1" \
: \
: "r"(rs1), "r"(rs2)); \
}
// [TODO] fix these to align with the above approach
// Macro to pass rs2_ as an immediate
/*
#define ROCC_INSTRUCTION_R_R_I(XCUSTOM_, rd_, rs1_, rs2_, funct_) \
asm volatile (XCUSTOM_" %[rd], %[rs1], %[rs2], %[funct]" \
: [rd] "=r" (rd_) \
: [rs1] "r" (rs1_), [rs2] "i" (rs2_), [funct] "i" (funct_))
// Macro to pass rs1_ and rs2_ as immediates
#define ROCC_INSTRUCTION_R_I_I(XCUSTOM_, rd_, rs1_, rs2_, funct_) \
asm volatile (XCUSTOM_" %[rd], %[rs1], %[rs2], %[funct]" \
: [rd] "=r" (rd_) \
: [rs1] "i" (rs1_), [rs2] "i" (rs2_), [funct] "i" (funct_))
*/
#endif // ROCC_SOFTWARE_SRC_XCUSTOM_H_

685
lib/include/VX_config.h Normal file
View File

@@ -0,0 +1,685 @@
// auto-generated by gen_config.py. DO NOT EDIT
// Generated at 2024-05-07 13:55:58.398687
// Translated from ./rtl/VX_config.vh:
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef VX_CONFIG_VH
#define VX_CONFIG_VH
// Generic helper macros. Each one is defined only if the includer has not
// already provided a macro of the same name. They are plain function-like
// macros, so an argument may be evaluated more than once -- do not pass
// expressions with side effects.

// Smaller of x and y.
#ifndef MIN
#define MIN(x, y) (((x) < (y)) ? (x) : (y))
#endif
// Larger of x and y.
#ifndef MAX
#define MAX(x, y) (((x) > (y)) ? (x) : (y))
#endif
// x limited to the range [lo, hi]; the upper bound is checked first.
#ifndef CLAMP
#define CLAMP(x, lo, hi) (((x) > (hi)) ? (hi) : (((x) < (lo)) ? (lo) : (x)))
#endif
// x, or 1 when x is zero.
#ifndef UP
#define UP(x) (((x) != 0) ? (x) : 1)
#endif
///////////////////////////////////////////////////////////////////////////////
#ifndef EXT_M_DISABLE
#define EXT_M_ENABLE
#endif
#ifndef EXT_F_DISABLE
#define EXT_F_ENABLE
#endif
#ifndef XLEN_32
#ifndef XLEN_64
#define XLEN_32
#endif
#endif
#ifdef XLEN_64
#define XLEN 64
#endif
#ifdef XLEN_32
#define XLEN 32
#endif
#ifdef EXT_D_ENABLE
#define FLEN_64
#else
#define FLEN_32
#endif
#ifdef FLEN_64
#define FLEN 64
#endif
#ifdef FLEN_32
#define FLEN 32
#endif
#ifdef XLEN_64
#ifdef FLEN_32
#define FPU_RV64F
#endif
#endif
#ifndef NUM_CLUSTERS
#define NUM_CLUSTERS 1
#endif
#ifndef NUM_CORES
#define NUM_CORES 8
#endif
#ifndef NUM_WARPS
#define NUM_WARPS 8
#endif
#ifndef NUM_THREADS
#define NUM_THREADS 8
#endif
#ifndef NUM_BARRIERS
#define NUM_BARRIERS 8
#endif
#ifndef SOCKET_SIZE
#define SOCKET_SIZE MIN(4, NUM_CORES)
#endif
#define NUM_SOCKETS UP(NUM_CORES / SOCKET_SIZE)
#ifdef L2_ENABLE
#define L2_ENABLED 1
#else
#define L2_ENABLED 0
#endif
#ifdef L3_ENABLE
#define L3_ENABLED 1
#else
#define L3_ENABLED 0
#endif
#ifdef L1_DISABLE
#define ICACHE_DISABLE
#define DCACHE_DISABLE
#endif
#ifndef MEM_BLOCK_SIZE
#define MEM_BLOCK_SIZE 64
#endif
#ifndef MEM_ADDR_WIDTH
#ifdef XLEN_64
#define MEM_ADDR_WIDTH 48
#else
#define MEM_ADDR_WIDTH 32
#endif
#endif
#ifndef L1_LINE_SIZE
#ifdef L1_DISABLE
#define L1_LINE_SIZE ((L2_ENABLED || L3_ENABLED) ? 4 : MEM_BLOCK_SIZE)
#else
#define L1_LINE_SIZE ((L2_ENABLED || L3_ENABLED) ? 16 : MEM_BLOCK_SIZE)
#endif
#endif
#ifdef L2_ENABLE
#define L2_LINE_SIZE MEM_BLOCK_SIZE
#else
#define L2_LINE_SIZE L1_LINE_SIZE
#endif
#ifdef L3_ENABLE
#define L3_LINE_SIZE MEM_BLOCK_SIZE
#else
#define L3_LINE_SIZE L2_LINE_SIZE
#endif
#ifdef XLEN_64
#ifndef STARTUP_ADDR
#define STARTUP_ADDR 0x180000000
#endif
#ifndef STACK_BASE_ADDR
#define STACK_BASE_ADDR 0x1FF000000
#endif
#else
#ifndef STARTUP_ADDR
#define STARTUP_ADDR 0x80000000
#endif
#ifndef STACK_BASE_ADDR
#define STACK_BASE_ADDR 0xFF000000
#endif
#endif
#ifndef SMEM_BASE_ADDR
#define SMEM_BASE_ADDR STACK_BASE_ADDR
#endif
#ifndef SMEM_LOG_SIZE
#define SMEM_LOG_SIZE 19
#endif
#ifndef IO_BASE_ADDR
#define IO_BASE_ADDR (SMEM_BASE_ADDR + (1 << SMEM_LOG_SIZE))
#endif
#ifndef IO_COUT_ADDR
#define IO_COUT_ADDR IO_BASE_ADDR
#endif
#define IO_COUT_SIZE MEM_BLOCK_SIZE
#ifndef IO_CSR_ADDR
#define IO_CSR_ADDR (IO_COUT_ADDR + IO_COUT_SIZE)
#endif
#define IO_CSR_SIZE (4 * 64 * NUM_CORES * NUM_CLUSTERS)
#ifndef STACK_LOG2_SIZE
#define STACK_LOG2_SIZE 13
#endif
#define STACK_SIZE (1 << STACK_LOG2_SIZE)
#define RESET_DELAY 8
// Default stall timeout (overridable by defining STALL_TIMEOUT beforehand).
//
// The previous definition was
//   (100000 * (1 ** (L2_ENABLED + L3_ENABLED)))
// which is not valid C: there is no "**" operator, so "1 ** (e)" parses as
// "1 * *(e)" -- a dereference of an integer -- and any use of the macro fails
// to compile. Presumably "**" is the power operator carried over from the
// Verilog header this file was translated from; 1 raised to any power is 1,
// so the value is written out directly.
// NOTE(review): this header is generated by gen_config.py, so the generator
// (or the .vh source) needs the same fix or this edit will be overwritten.
#ifndef STALL_TIMEOUT
#define STALL_TIMEOUT (100000 * 1)
#endif
#ifndef SV_DPI
#define DPI_DISABLE
#endif
#ifndef FPU_FPNEW
#ifndef FPU_DSP
#ifndef FPU_DPI
#ifndef SYNTHESIS
#ifndef DPI_DISABLE
#define FPU_DPI
#else
#define FPU_DSP
#endif
#else
#define FPU_DSP
#endif
#endif
#endif
#endif
#ifndef SYNTHESIS
#ifndef DPI_DISABLE
#define IMUL_DPI
#define IDIV_DPI
#endif
#endif
#ifndef DEBUG_LEVEL
#define DEBUG_LEVEL 3
#endif
// Pipeline Configuration /////////////////////////////////////////////////////
// Issue width
#ifndef ISSUE_WIDTH
#define ISSUE_WIDTH NUM_WARPS
#endif
// Number of ALU units
#ifndef NUM_ALU_LANES
#define NUM_ALU_LANES NUM_THREADS
#endif
#ifndef NUM_ALU_BLOCKS
#define NUM_ALU_BLOCKS 4
#endif
// Number of FPU units
#ifndef NUM_FPU_LANES
#define NUM_FPU_LANES NUM_THREADS
#endif
#ifndef NUM_FPU_BLOCKS
#define NUM_FPU_BLOCKS 2
#endif
// Number of LSU units
#ifndef NUM_LSU_LANES
#define NUM_LSU_LANES NUM_THREADS
#endif
// Number of SFU units
#ifndef NUM_SFU_LANES
#define NUM_SFU_LANES MIN(NUM_THREADS, 4)
#endif
// Size of Instruction Buffer
#ifndef IBUF_SIZE
#define IBUF_SIZE (4 * ISSUE_WIDTH)
#endif
// Size of LSU Request Queue
#ifndef LSUQ_SIZE
#define LSUQ_SIZE (4 * NUM_WARPS * (NUM_THREADS / NUM_LSU_LANES))
#endif
// LSU Duplicate Address Check
#ifndef LSU_DUP_DISABLE
#define LSU_DUP_ENABLE
#endif
#ifdef LSU_DUP_ENABLE
#define LSU_DUP_ENABLED 1
#else
#define LSU_DUP_ENABLED 0
#endif
#ifdef GBAR_ENABLE
#define GBAR_ENABLED 1
#else
#define GBAR_ENABLED 0
#endif
#ifndef LATENCY_IMUL
#ifdef VIVADO
#define LATENCY_IMUL 4
#endif
#ifdef QUARTUS
#define LATENCY_IMUL 3
#endif
#ifndef LATENCY_IMUL
#define LATENCY_IMUL 4
#endif
#endif
// Floating-Point Units ///////////////////////////////////////////////////////
// Size of FPU Request Queue
#ifndef FPUQ_SIZE
#define FPUQ_SIZE (2 * (NUM_THREADS / NUM_FPU_LANES))
#endif
// FNCP Latency
#ifndef LATENCY_FNCP
#define LATENCY_FNCP 2
#endif
// FMA Latency
#ifndef LATENCY_FMA
#ifdef FPU_DPI
#define LATENCY_FMA 4
#endif
#ifdef FPU_FPNEW
#define LATENCY_FMA 4
#endif
#ifdef FPU_DSP
#ifdef QUARTUS
#define LATENCY_FMA 4
#endif
#ifdef VIVADO
#define LATENCY_FMA 16
#endif
#ifndef LATENCY_FMA
#define LATENCY_FMA 4
#endif
#endif
#endif
// FDIV Latency
#ifndef LATENCY_FDIV
#ifdef FPU_DPI
#define LATENCY_FDIV 15
#endif
#ifdef FPU_FPNEW
#define LATENCY_FDIV 16
#endif
#ifdef FPU_DSP
#ifdef QUARTUS
#define LATENCY_FDIV 15
#endif
#ifdef VIVADO
#define LATENCY_FDIV 28
#endif
#ifndef LATENCY_FDIV
#define LATENCY_FDIV 16
#endif
#endif
#endif
// FSQRT Latency
#ifndef LATENCY_FSQRT
#ifdef FPU_DPI
#define LATENCY_FSQRT 10
#endif
#ifdef FPU_FPNEW
#define LATENCY_FSQRT 16
#endif
#ifdef FPU_DSP
#ifdef QUARTUS
#define LATENCY_FSQRT 10
#endif
#ifdef VIVADO
#define LATENCY_FSQRT 28
#endif
#ifndef LATENCY_FSQRT
#define LATENCY_FSQRT 16
#endif
#endif
#endif
// FCVT Latency
#ifndef LATENCY_FCVT
#define LATENCY_FCVT 5
#endif
// Icache Configurable Knobs //////////////////////////////////////////////////
// Cache Enable
#ifndef ICACHE_DISABLE
#define ICACHE_ENABLE
#endif
#ifdef ICACHE_ENABLE
#define ICACHE_ENABLED 1
#else
#define ICACHE_ENABLED 0
#define NUM_ICACHES 0
#endif
// Number of Cache Units
#ifndef NUM_ICACHES
#define NUM_ICACHES UP(SOCKET_SIZE / 4)
#endif
// Cache Size
#ifndef ICACHE_SIZE
#define ICACHE_SIZE 16384
#endif
// Core Response Queue Size
#ifndef ICACHE_CRSQ_SIZE
#define ICACHE_CRSQ_SIZE 2
#endif
// Miss Handling Register Size
#ifndef ICACHE_MSHR_SIZE
#define ICACHE_MSHR_SIZE 16
#endif
// Memory Request Queue Size
#ifndef ICACHE_MREQ_SIZE
#define ICACHE_MREQ_SIZE 4
#endif
// Memory Response Queue Size
#ifndef ICACHE_MRSQ_SIZE
#define ICACHE_MRSQ_SIZE 0
#endif
// Number of Associative Ways
#ifndef ICACHE_NUM_WAYS
#define ICACHE_NUM_WAYS 1
#endif
// Dcache Configurable Knobs //////////////////////////////////////////////////
// Cache Enable
#ifndef DCACHE_DISABLE
#define DCACHE_ENABLE
#endif
#ifdef DCACHE_ENABLE
#define DCACHE_ENABLED 1
#else
#define DCACHE_ENABLED 0
#define NUM_DCACHES 0
#define DCACHE_NUM_BANKS 1
#endif
// Number of Cache Units
#ifndef NUM_DCACHES
#define NUM_DCACHES UP(SOCKET_SIZE / 4)
#endif
// Cache Size
#ifndef DCACHE_SIZE
#define DCACHE_SIZE 16384
#endif
// Number of Banks
#ifndef DCACHE_NUM_BANKS
#define DCACHE_NUM_BANKS NUM_LSU_LANES
#endif
// Core Response Queue Size
#ifndef DCACHE_CRSQ_SIZE
#define DCACHE_CRSQ_SIZE 2
#endif
// Miss Handling Register Size
#ifndef DCACHE_MSHR_SIZE
#define DCACHE_MSHR_SIZE 8
#endif
// Memory Request Queue Size
#ifndef DCACHE_MREQ_SIZE
#define DCACHE_MREQ_SIZE 4
#endif
// Memory Response Queue Size
#ifndef DCACHE_MRSQ_SIZE
#define DCACHE_MRSQ_SIZE 0
#endif
// Number of Associative Ways
#ifndef DCACHE_NUM_WAYS
#define DCACHE_NUM_WAYS 1
#endif
// SM Configurable Knobs //////////////////////////////////////////////////////
#ifndef SM_DISABLE
#define SM_ENABLE
#endif
#ifdef SM_ENABLE
#define SM_ENABLED 1
#else
#define SM_ENABLED 0
#define SMEM_NUM_BANKS 1
#endif
// Number of Banks
#ifndef SMEM_NUM_BANKS
#define SMEM_NUM_BANKS (NUM_LSU_LANES)
#endif
// L2cache Configurable Knobs /////////////////////////////////////////////////
// Cache Size
#ifndef L2_CACHE_SIZE
#ifdef ALTERA_S10
#define L2_CACHE_SIZE 2097152
#else
#define L2_CACHE_SIZE 1048576
#endif
#endif
// Number of Banks
#ifndef L2_NUM_BANKS
#define L2_NUM_BANKS MIN(4, NUM_SOCKETS)
#endif
// Core Response Queue Size
#ifndef L2_CRSQ_SIZE
#define L2_CRSQ_SIZE 2
#endif
// Miss Handling Register Size
#ifndef L2_MSHR_SIZE
#define L2_MSHR_SIZE 16
#endif
// Memory Request Queue Size
#ifndef L2_MREQ_SIZE
#define L2_MREQ_SIZE 4
#endif
// Memory Response Queue Size
#ifndef L2_MRSQ_SIZE
#define L2_MRSQ_SIZE 0
#endif
// Number of Associative Ways
#ifndef L2_NUM_WAYS
#define L2_NUM_WAYS 2
#endif
// L3cache Configurable Knobs /////////////////////////////////////////////////
// Cache Size
#ifndef L3_CACHE_SIZE
#ifdef ALTERA_S10
#define L3_CACHE_SIZE 2097152
#else
#define L3_CACHE_SIZE 1048576
#endif
#endif
// Number of Banks
#ifndef L3_NUM_BANKS
#define L3_NUM_BANKS MIN(4, NUM_CLUSTERS)
#endif
// Core Response Queue Size
#ifndef L3_CRSQ_SIZE
#define L3_CRSQ_SIZE 2
#endif
// Miss Handling Register Size
#ifndef L3_MSHR_SIZE
#define L3_MSHR_SIZE 16
#endif
// Memory Request Queue Size
#ifndef L3_MREQ_SIZE
#define L3_MREQ_SIZE 4
#endif
// Memory Response Queue Size
#ifndef L3_MRSQ_SIZE
#define L3_MRSQ_SIZE 0
#endif
// Number of Associative Ways
#ifndef L3_NUM_WAYS
#define L3_NUM_WAYS 4
#endif
// ISA Extensions /////////////////////////////////////////////////////////////
#ifdef EXT_A_ENABLE
#define EXT_A_ENABLED 1
#else
#define EXT_A_ENABLED 0
#endif
#ifdef EXT_C_ENABLE
#define EXT_C_ENABLED 1
#else
#define EXT_C_ENABLED 0
#endif
#ifdef EXT_D_ENABLE
#define EXT_D_ENABLED 1
#else
#define EXT_D_ENABLED 0
#endif
#ifdef EXT_F_ENABLE
#define EXT_F_ENABLED 1
#else
#define EXT_F_ENABLED 0
#endif
#ifdef EXT_M_ENABLE
#define EXT_M_ENABLED 1
#else
#define EXT_M_ENABLED 0
#endif
#define ISA_STD_A 0
#define ISA_STD_C 2
#define ISA_STD_D 3
#define ISA_STD_E 4
#define ISA_STD_F 5
#define ISA_STD_H 7
#define ISA_STD_I 8
#define ISA_STD_N 13
#define ISA_STD_Q 16
#define ISA_STD_S 18
#define ISA_STD_U 20
// Bit positions used by MISA_EXT, one per optional memory-hierarchy feature.
#define ISA_EXT_ICACHE 0
#define ISA_EXT_DCACHE 1
#define ISA_EXT_L2CACHE 2
#define ISA_EXT_L3CACHE 3
#define ISA_EXT_SMEM 4
// Bit mask with one bit per feature above, set when the matching *_ENABLED
// macro is 1. The whole expansion is wrapped in parentheses so that uses such
// as `MISA_EXT & mask` or `MISA_EXT << n` apply to the complete value rather
// than only to the last OR-ed term.
#define MISA_EXT ((ICACHE_ENABLED << ISA_EXT_ICACHE) \
 | (DCACHE_ENABLED << ISA_EXT_DCACHE) \
 | (L2_ENABLED << ISA_EXT_L2CACHE) \
 | (L3_ENABLED << ISA_EXT_L3CACHE) \
 | (SM_ENABLED << ISA_EXT_SMEM))
// Standard-extension bitmask: bit n corresponds to the letter 'A' + n.
// A, C, D, F and M follow the EXT_x_ENABLED flags; I, U and X are always set.
// The whole expansion is wrapped in parentheses so the macro behaves as a
// single value inside larger expressions (e.g. `MISA_STD & (1 << 12)`).
#define MISA_STD ((EXT_A_ENABLED << 0) /* A - Atomic Instructions extension */ \
| (0 << 1) /* B - Tentatively reserved for Bit operations extension */ \
| (EXT_C_ENABLED << 2) /* C - Compressed extension */ \
| (EXT_D_ENABLED << 3) /* D - Double precision floating-point extension */ \
| (0 << 4) /* E - RV32E base ISA */ \
| (EXT_F_ENABLED << 5) /* F - Single precision floating-point extension */ \
| (0 << 6) /* G - Additional standard extensions present */ \
| (0 << 7) /* H - Hypervisor mode implemented */ \
| (1 << 8) /* I - RV32I/64I/128I base ISA */ \
| (0 << 9) /* J - Reserved */ \
| (0 << 10) /* K - Reserved */ \
| (0 << 11) /* L - Tentatively reserved for Decimal Floating-Point extension */ \
| (EXT_M_ENABLED << 12) /* M - Integer Multiply/Divide extension */ \
| (0 << 13) /* N - User level interrupts supported */ \
| (0 << 14) /* O - Reserved */ \
| (0 << 15) /* P - Tentatively reserved for Packed-SIMD extension */ \
| (0 << 16) /* Q - Quad-precision floating-point extension */ \
| (0 << 17) /* R - Reserved */ \
| (0 << 18) /* S - Supervisor mode implemented */ \
| (0 << 19) /* T - Tentatively reserved for Transactional Memory extension */ \
| (1 << 20) /* U - User mode implemented */ \
| (0 << 21) /* V - Tentatively reserved for Vector extension */ \
| (0 << 22) /* W - Reserved */ \
| (1 << 23) /* X - Non-standard extensions present */ \
| (0 << 24) /* Y - Reserved */ \
| (0 << 25) /* Z - Reserved */ )
// Device identification //////////////////////////////////////////////////////
// All three identifiers are zero in this configuration.
// NOTE(review): presumably these are the values reported through the
// mvendorid / marchid / mimpid CSRs — confirm against the RTL.
#define VENDOR_ID 0
#define ARCHITECTURE_ID 0
#define IMPLEMENTATION_ID 0
#endif // VX_CONFIG_VH

193
lib/include/VX_types.h Normal file
View File

@@ -0,0 +1,193 @@
// auto-generated by gen_config.py. DO NOT EDIT
// Generated at 2024-06-15 00:25:12.935689
// Translated from ./rtl/VX_types.vh:
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef VX_TYPES_VH
#define VX_TYPES_VH
// Device configuration registers
// Width, in bits, of CSR and DCR addresses
#define VX_CSR_ADDR_BITS 12
#define VX_DCR_ADDR_BITS 12
// Base DCR state block: valid addresses are [STATE_BEGIN, STATE_END),
// i.e. 0x001..0x003
#define VX_DCR_BASE_STATE_BEGIN 0x001
#define VX_DCR_BASE_STARTUP_ADDR0 0x001
#define VX_DCR_BASE_STARTUP_ADDR1 0x002
#define VX_DCR_BASE_MPM_CLASS 0x003
#define VX_DCR_BASE_STATE_END 0x004
// Zero-based index of a base-state DCR address within the block
#define VX_DCR_BASE_STATE(addr) ((addr) - VX_DCR_BASE_STATE_BEGIN)
// Number of registers in the block (END - BEGIN = 3)
#define VX_DCR_BASE_STATE_COUNT (VX_DCR_BASE_STATE_END-VX_DCR_BASE_STATE_BEGIN)
// Machine Performance-monitoring counters classes
// NOTE(review): presumably the values written to VX_DCR_BASE_MPM_CLASS to
// choose which counter set the VX_CSR_MPM_* addresses expose — confirm.
#define VX_DCR_MPM_CLASS_NONE 0
#define VX_DCR_MPM_CLASS_CORE 1
#define VX_DCR_MPM_CLASS_MEM 2
// User Floating-Point CSRs (fflags / frm / fcsr)
#define VX_CSR_FFLAGS 0x001
#define VX_CSR_FRM 0x002
#define VX_CSR_FCSR 0x003
// Address translation and physical memory protection CSRs
#define VX_CSR_SATP 0x180
#define VX_CSR_PMPCFG0 0x3A0
#define VX_CSR_PMPADDR0 0x3B0
// Machine-mode status, trap setup and trap handling CSRs
#define VX_CSR_MSTATUS 0x300
#define VX_CSR_MISA 0x301
#define VX_CSR_MEDELEG 0x302
#define VX_CSR_MIDELEG 0x303
#define VX_CSR_MIE 0x304
#define VX_CSR_MTVEC 0x305
#define VX_CSR_MEPC 0x341
#define VX_CSR_MNSTATUS 0x744
// Machine performance-monitoring (MPM) counter window. The "_H" address of
// every counter is the base address + 0x80.
// NOTE(review): presumably "_H" holds the upper 32 bits of a 64-bit counter,
// as with the standard mcycle/mcycleh pair — confirm.
#define VX_CSR_MPM_BASE 0xB00
#define VX_CSR_MPM_BASE_H 0xB80
// First counter after the three standard slots (0xB00..0xB02)
#define VX_CSR_MPM_USER 0xB03
#define VX_CSR_MPM_USER_H 0xB83
// Machine Performance-monitoring core counters
// These occupy 0xB00..0xB11 (high halves at +0x80). The addresses from 0xB03
// upward are also used by the memory-class counters defined further down.
// NOTE(review): presumably VX_DCR_BASE_MPM_CLASS selects which set is
// visible — confirm.
// PERF: Standard
#define VX_CSR_MCYCLE 0xB00
#define VX_CSR_MCYCLE_H 0xB80
#define VX_CSR_MPM_RESERVED 0xB01
#define VX_CSR_MPM_RESERVED_H 0xB81
#define VX_CSR_MINSTRET 0xB02
#define VX_CSR_MINSTRET_H 0xB82
// PERF: pipeline
#define VX_CSR_MPM_SCHED_ID 0xB03
#define VX_CSR_MPM_SCHED_ID_H 0xB83
#define VX_CSR_MPM_SCHED_ST 0xB04
#define VX_CSR_MPM_SCHED_ST_H 0xB84
#define VX_CSR_MPM_IBUF_ST 0xB05
#define VX_CSR_MPM_IBUF_ST_H 0xB85
#define VX_CSR_MPM_SCRB_ST 0xB06
#define VX_CSR_MPM_SCRB_ST_H 0xB86
#define VX_CSR_MPM_SCRB_ALU 0xB07
#define VX_CSR_MPM_SCRB_ALU_H 0xB87
#define VX_CSR_MPM_SCRB_FPU 0xB08
#define VX_CSR_MPM_SCRB_FPU_H 0xB88
#define VX_CSR_MPM_SCRB_LSU 0xB09
#define VX_CSR_MPM_SCRB_LSU_H 0xB89
#define VX_CSR_MPM_SCRB_SFU 0xB0A
#define VX_CSR_MPM_SCRB_SFU_H 0xB8A
// PERF: memory
#define VX_CSR_MPM_IFETCHES 0xB0B
#define VX_CSR_MPM_IFETCHES_H 0xB8B
#define VX_CSR_MPM_LOADS 0xB0C
#define VX_CSR_MPM_LOADS_H 0xB8C
#define VX_CSR_MPM_STORES 0xB0D
#define VX_CSR_MPM_STORES_H 0xB8D
#define VX_CSR_MPM_IFETCH_LT 0xB0E
#define VX_CSR_MPM_IFETCH_LT_H 0xB8E
#define VX_CSR_MPM_LOAD_LT 0xB0F
#define VX_CSR_MPM_LOAD_LT_H 0xB8F
// SFU: scoreboard
#define VX_CSR_MPM_SCRB_WCTL 0xB10
#define VX_CSR_MPM_SCRB_WCTL_H 0xB90
#define VX_CSR_MPM_SCRB_CSRS 0xB11
#define VX_CSR_MPM_SCRB_CSRS_H 0xB91
// Machine Performance-monitoring memory counters
// These occupy 0xB03..0xB1D (high halves at +0x80) and therefore share
// addresses with the core-class counters defined above in this header.
// PERF: icache
#define VX_CSR_MPM_ICACHE_READS 0xB03 // total reads
#define VX_CSR_MPM_ICACHE_READS_H 0xB83
#define VX_CSR_MPM_ICACHE_MISS_R 0xB04 // read misses
#define VX_CSR_MPM_ICACHE_MISS_R_H 0xB84
#define VX_CSR_MPM_ICACHE_MSHR_ST 0xB05 // MSHR stalls
#define VX_CSR_MPM_ICACHE_MSHR_ST_H 0xB85
// PERF: dcache
#define VX_CSR_MPM_DCACHE_READS 0xB06 // total reads
#define VX_CSR_MPM_DCACHE_READS_H 0xB86
#define VX_CSR_MPM_DCACHE_WRITES 0xB07 // total writes
#define VX_CSR_MPM_DCACHE_WRITES_H 0xB87
#define VX_CSR_MPM_DCACHE_MISS_R 0xB08 // read misses
#define VX_CSR_MPM_DCACHE_MISS_R_H 0xB88
#define VX_CSR_MPM_DCACHE_MISS_W 0xB09 // write misses
#define VX_CSR_MPM_DCACHE_MISS_W_H 0xB89
#define VX_CSR_MPM_DCACHE_BANK_ST 0xB0A // bank conflicts
#define VX_CSR_MPM_DCACHE_BANK_ST_H 0xB8A
#define VX_CSR_MPM_DCACHE_MSHR_ST 0xB0B // MSHR stalls
#define VX_CSR_MPM_DCACHE_MSHR_ST_H 0xB8B
// PERF: l2cache
#define VX_CSR_MPM_L2CACHE_READS 0xB0C // total reads
#define VX_CSR_MPM_L2CACHE_READS_H 0xB8C
#define VX_CSR_MPM_L2CACHE_WRITES 0xB0D // total writes
#define VX_CSR_MPM_L2CACHE_WRITES_H 0xB8D
#define VX_CSR_MPM_L2CACHE_MISS_R 0xB0E // read misses
#define VX_CSR_MPM_L2CACHE_MISS_R_H 0xB8E
#define VX_CSR_MPM_L2CACHE_MISS_W 0xB0F // write misses
#define VX_CSR_MPM_L2CACHE_MISS_W_H 0xB8F
#define VX_CSR_MPM_L2CACHE_BANK_ST 0xB10 // bank conflicts
#define VX_CSR_MPM_L2CACHE_BANK_ST_H 0xB90
#define VX_CSR_MPM_L2CACHE_MSHR_ST 0xB11 // MSHR stalls
#define VX_CSR_MPM_L2CACHE_MSHR_ST_H 0xB91
// PERF: l3cache
#define VX_CSR_MPM_L3CACHE_READS 0xB12 // total reads
#define VX_CSR_MPM_L3CACHE_READS_H 0xB92
#define VX_CSR_MPM_L3CACHE_WRITES 0xB13 // total writes
#define VX_CSR_MPM_L3CACHE_WRITES_H 0xB93
#define VX_CSR_MPM_L3CACHE_MISS_R 0xB14 // read misses
#define VX_CSR_MPM_L3CACHE_MISS_R_H 0xB94
#define VX_CSR_MPM_L3CACHE_MISS_W 0xB15 // write misses
#define VX_CSR_MPM_L3CACHE_MISS_W_H 0xB95
#define VX_CSR_MPM_L3CACHE_BANK_ST 0xB16 // bank conflicts
#define VX_CSR_MPM_L3CACHE_BANK_ST_H 0xB96
#define VX_CSR_MPM_L3CACHE_MSHR_ST 0xB17 // MSHR stalls
#define VX_CSR_MPM_L3CACHE_MSHR_ST_H 0xB97
// PERF: memory
#define VX_CSR_MPM_MEM_READS 0xB18 // total reads
#define VX_CSR_MPM_MEM_READS_H 0xB98
#define VX_CSR_MPM_MEM_WRITES 0xB19 // total writes
#define VX_CSR_MPM_MEM_WRITES_H 0xB99
#define VX_CSR_MPM_MEM_LT 0xB1A // memory latency
#define VX_CSR_MPM_MEM_LT_H 0xB9A
// PERF: smem
#define VX_CSR_MPM_SMEM_READS 0xB1B // memory reads
#define VX_CSR_MPM_SMEM_READS_H 0xB9B
#define VX_CSR_MPM_SMEM_WRITES 0xB1C // memory writes
#define VX_CSR_MPM_SMEM_WRITES_H 0xB9C
#define VX_CSR_MPM_SMEM_BANK_ST 0xB1D // bank conflicts
#define VX_CSR_MPM_SMEM_BANK_ST_H 0xB9D
// Machine Information Registers
#define VX_CSR_MVENDORID 0xF11
#define VX_CSR_MARCHID 0xF12
#define VX_CSR_MIMPID 0xF13
#define VX_CSR_MHARTID 0xF14
// GPGPU CSRs
// Per-thread / per-warp identity and activity masks (0xCC0..0xCC4)
#define VX_CSR_THREAD_ID 0xCC0
#define VX_CSR_WARP_ID 0xCC1
#define VX_CSR_CORE_ID 0xCC2
#define VX_CSR_WARP_MASK 0xCC3
#define VX_CSR_THREAD_MASK 0xCC4 // warning! this value is also used in LLVM
// Machine dimensions (0xFC0..0xFC2)
#define VX_CSR_NUM_THREADS 0xFC0
#define VX_CSR_NUM_WARPS 0xFC1
#define VX_CSR_NUM_CORES 0xFC2
#endif // VX_TYPES_VH

258
lib/include/gemmini_mmio.h Normal file
View File

@@ -0,0 +1,258 @@
#ifndef GEMMINI_MMIO_H
#define GEMMINI_MMIO_H
#ifndef GEMMINI_PARAMS_H
#error INCLUDE GEMMINI.H FIRST
#endif
/* shared memory constants and helpers */
/* =================================== */
// Base byte address of the shared-memory window
#define SMEM_BASE 0xff000000
// Shared-memory size in bytes; exactly one of the alternatives below is active
// 16KB
// #define SMEM_SIZE 0x4000
// 64KB
// #define SMEM_SIZE 0x10000
// 128KB (FP16 GEMM)
#define SMEM_SIZE 0x20000
// 256KB (FlashAttention)
// #define SMEM_SIZE 0x40000
// Mask extracting the byte offset within shared memory (valid because every
// SMEM_SIZE option above is a power of two)
#define SMEM_MASK (SMEM_SIZE - 1)
// First byte address past the end of shared memory
#define SMEM_ADDR_END (SMEM_BASE + SMEM_SIZE)
// Scratchpad addresses are row indices: one row holds DIM elements of elem_t
#define SPAD_BASE 0x0
#define SPAD_ROW_SIZE (DIM * sizeof(elem_t))
#define SPAD_NUM_ROWS (SMEM_SIZE / SPAD_ROW_SIZE)
// NOTE(review): only a valid mask if SPAD_NUM_ROWS is a power of two, which
// depends on DIM and sizeof(elem_t) — confirm
#define SPAD_MASK (SPAD_NUM_ROWS - 1)
// Character buffer located immediately after the shared-memory window
#define PRINT_BUF ((char *) (SMEM_ADDR_END))
// Reads the mhartid CSR of the calling hardware thread
#define HW_TID() ({uint32_t gtid; asm volatile ("csrr %0, mhartid" : "=r" (gtid)); gtid;})
// Shared-memory byte address -> scratchpad row index (rounds down to a row)
#define SMEM_TO_SPAD(smem_addr) (SPAD_BASE + ((smem_addr) & SMEM_MASK) / SPAD_ROW_SIZE)
// Scratchpad row index -> shared-memory byte address of the start of that row
#define SPAD_TO_SMEM(spad_addr) (SMEM_BASE + ((spad_addr) & SPAD_MASK) * SPAD_ROW_SIZE)
// Convert element (row i, column j) of a matrix with J columns into its
// element offset in the tiled shared-memory layout, where each DIM x DIM tile
// is stored contiguously and tiles are ordered row by row:
// top_in_tiles = i / DIM
// left_in_tiles = j / DIM
// num_tiles_before_current = top_in_tiles * (J / DIM) + left_in_tiles
// smem_addr = num_tiles_before_current * DIM * DIM + (i % DIM) * DIM + (j % DIM)
// Every quotient is parenthesized so the expansion computes exactly the
// formula above: `i / DIM * J / DIM` would evaluate as `((i / DIM) * J) / DIM`,
// which differs whenever J is not a multiple of DIM. DIM is parenthesized too
// so that it may safely be defined as an expression.
#define SMEM_MAT_OFFSET(i, j, J) \
((((i) / (DIM)) * ((J) / (DIM)) + (j) / (DIM)) * (DIM) * (DIM) + ((i) % (DIM)) * (DIM) + ((j) % (DIM)))
/* gemmini mmio interface */
/* ====================== */
// Per-hardware-thread selection of the gemmini tile to talk to, indexed by
// HW_TID().
// NOTE(review): this is a `static` array defined in a header, so every
// translation unit that includes it gets its own private copy — confirm that
// use_gemmini() and the code issuing commands always live in the same TU.
static size_t gemmini_tile_idx[NUM_THREADS * NUM_WARPS * NUM_CORES * NUM_CLUSTERS] = {0};
// Select tile `i` for the calling hardware thread
#define use_gemmini(i) {gemmini_tile_idx[HW_TID()] = (i);}
// Tile currently selected by the calling hardware thread
#define GEMMINI_TILE_IDX() (gemmini_tile_idx[HW_TID()])
// Encode a CISC opcode `x` together with target id `i` (id occupies the bits
// above the low five: x + 32 * i)
#define GEMMINI_CISC_IMM(x, i) ((x) + 32 * (i))
// Control register block of the selected tile: 0x3000 bytes past the end of
// shared memory, one 0x100-byte block per tile
#define GEMMINI_CTRL (SMEM_BASE + SMEM_SIZE + 0x3000 + 0x100 * GEMMINI_TILE_IDX())
// Register offsets within the control block
#define GEMMINI_RS1_ADDR (GEMMINI_CTRL + 0x10)
#define GEMMINI_RS2_ADDR (GEMMINI_CTRL + 0x18)
#define GEMMINI_INST_ADDR (GEMMINI_CTRL + 0x0)
#define GEMMINI_BUSY_ADDR (GEMMINI_CTRL + 0x20)
#define GEMMINI_OCCUPANCY_ADDR (GEMMINI_CTRL + 0x28)
// MMIO replacement for the RoCC custom-instruction macro: writes the two
// 64-bit source operands, then writes a 32-bit instruction word to the
// instruction register. The instruction word is written last, after both
// operands are in place.
//   x     - accepted for source compatibility with the original macro; unused
//   rs1   - value stored to GEMMINI_RS1_ADDR
//   rs2   - value stored to GEMMINI_RS2_ADDR
//   funct - 7-bit function code placed in bits [31:25]
// The word is assembled in uint32_t so that funct values >= 64, which reach
// bit 31, do not overflow a signed int.
// Expands to a brace block (not do/while) because existing call sites omit
// the trailing semicolon.
#undef ROCC_INSTRUCTION_RS1_RS2
#define ROCC_INSTRUCTION_RS1_RS2(x, rs1, rs2, funct) { \
*((volatile uint64_t *) GEMMINI_RS1_ADDR) = (rs1); \
*((volatile uint64_t *) GEMMINI_RS2_ADDR) = (rs2); \
*((volatile uint32_t*) GEMMINI_INST_ADDR) = (0x7Bu) | (0u << 7) | (3u << 12) | (1u << 15) | (2u << 20) | ((uint32_t) (funct) << 25); \
}
/* additional intrinsics */
/* ===================== */
// Pack five skip flags into one word: lda = bit 3, ldb = bit 4, ldd = bit 5,
// ex = bit 6, stc = bit 7 (flags are OR-ed at bits 0..4, then shifted left 3).
// Each flag is expected to be 0 or 1.
#define loop_matmul_skips(skip_lda, skip_ldb, skip_ldd, skip_ex, skip_stc) \
(((skip_lda) | ((skip_ldb) << 1) | ((skip_ldd) << 2) | ((skip_ex) << 3) | ((skip_stc) << 4)) << 3)
// Thin wrapper over gemmini_loop_ws_spad(). B is addressed at
// B_sp_addr_start + K * J * DIM, the D address is passed as NULL
// (D_sp_addr_start is accepted but not used), and `skips` is forwarded as the
// final argument.
#define sp_tiled_matmul_full_spad_ws(A_sp_addr_start, B_sp_addr_start, D_sp_addr_start, C_dst_sp_addr_start,\
I, J, K, pad_I, pad_J, pad_K, a_transpose, b_transpose, full_C, low_D, acc, act, skips) \
gemmini_loop_ws_spad(I, J, K, pad_I, pad_J, pad_K, A_sp_addr_start, (B_sp_addr_start) + (K) * (J) * DIM, NULL, \
C_dst_sp_addr_start, a_transpose, b_transpose, full_C, low_D, acc, act, 0, 0, false, skips)
// Read CSR 0xacc and return its value
#define gemmini_status() ({uint32_t status; asm volatile ("csrr %0, 0xacc" : "=r" (status)); status;})
#undef gemmini_fence
//#define gemmini_fence() { while (gemmini_status()); }
// Spin until the selected tile's busy register reads zero
#define gemmini_fence() { while (*((volatile uint32_t *) GEMMINI_BUSY_ADDR)) asm volatile ("nop"); }
// Spin until the selected tile's occupancy register is at most `n`.
// `n` is parenthesized so that expression arguments (e.g. `a & 1`, `c ? x : y`)
// are compared as a whole instead of being split by operator precedence.
#define virgo_fence(n) { while (*((volatile uint32_t *) GEMMINI_OCCUPANCY_ADDR) > (n)) asm volatile ("nop"); }
/* cisc instructions */
/* ================= */
// Commands are issued by writing a word to CSR 0xacc.
// bits [4:0] is the opcode
// bits [7:5] is the target gemmini id, zero-indexed
// #define GEMMINI_CISC_CMD_I(x) asm("csrwi 0xacc, %0" :: "i" (x))
// Both forms currently pass the command word in a register.
#define GEMMINI_CISC_CMD_I(x) asm("csrw 0xacc, %0" :: "r" (x)) // use registers even for immediate calls for now
#define GEMMINI_CISC_CMD_R(x) asm("csrw 0xacc, %0" :: "r" (x))
// Opcode values placed in the low bits of the command word
#define GEMMINI_CISC_COMPUTE_HEXADECILES 0
#define GEMMINI_CISC_COMPUTE_AND_STORE_TO_SPAD 1
#define GEMMINI_CISC_MANUAL 2
#define GEMMINI_CISC_SET_AB_STRIDE 8
#define GEMMINI_CISC_STORE_TO_SPAD 9
#define GEMMINI_CISC_LOAD_TO_HEXADECILES 10
#define GEMMINI_CISC_SET_DC_STRIDE 11
#define GEMMINI_CISC_STORE_TO_GMEM 12
/* high level virgo routines */
/* ========================= */
inline void gemmini_tile_load_ab(const elem_t * const a_addr, const elem_t * const b_addr,
const uint32_t a_hexadecile, const uint32_t b_hexadecile,
const uint32_t tile_idx_i, const uint32_t tile_idx_j, const uint32_t tile_idx_k,
const uint32_t mat_size_m, const uint32_t mat_size_n, const uint32_t mat_size_k,
const uint32_t tile_size_m, const uint32_t tile_size_n, const uint32_t tile_size_k) {
ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC,
(uint64_t) (a_addr + tile_idx_i * tile_size_m * mat_size_k + tile_idx_k * tile_size_k),
(uint64_t) (b_addr + tile_idx_k * tile_size_k * mat_size_n + tile_idx_j * tile_size_n), k_LOOP_WS_CONFIG_ADDRS_AB)
GEMMINI_CISC_CMD_R((mat_size_n << 20) | (mat_size_k << 8) | GEMMINI_CISC_SET_AB_STRIDE);
GEMMINI_CISC_CMD_R((b_hexadecile << 16) | (a_hexadecile << 8) | GEMMINI_CISC_LOAD_TO_HEXADECILES);
}
// Issue a single compute command on the A and B hexadeciles.
//   store_to_spad == false: COMPUTE_HEXADECILES, with `accumulate` at bit 24
//                           (d_hexadecile is ignored)
//   store_to_spad == true : COMPUTE_AND_STORE_TO_SPAD, with d_hexadecile from
//                           bit 24 up (accumulate is ignored)
template <bool store_to_spad = false>
inline void gemmini_tile_compute(const uint32_t a_hexadecile,
const uint32_t b_hexadecile,
const uint32_t d_hexadecile,
const bool accumulate) {
  // Operand fields common to both variants: B from bit 16, A from bit 8
  const uint32_t operands = (b_hexadecile << 16) | (a_hexadecile << 8);
  if constexpr (store_to_spad) {
    GEMMINI_CISC_CMD_R((d_hexadecile << 24) | operands | GEMMINI_CISC_COMPUTE_AND_STORE_TO_SPAD);
  } else {
    GEMMINI_CISC_CMD_R((static_cast<uint32_t>(accumulate) << 24) | operands | GEMMINI_CISC_COMPUTE_HEXADECILES);
  }
}
// Store the result tile to C at tile position (tile_idx_i, tile_idx_j).
// C is indexed with a row stride of mat_size_n elements. Three commands are
// issued in order: destination address (RS2; RS1 is written as 0), D/C
// stride, then the store-to-global-memory command.
// mat_size_m is accepted for interface symmetry and is not used.
inline void gemmini_tile_store_c_gmem(elem_t * const c_addr,
const uint32_t tile_idx_i, const uint32_t tile_idx_j,
const uint32_t mat_size_m, const uint32_t mat_size_n,
const uint32_t tile_size_m, const uint32_t tile_size_n) {
  const uint32_t row_offset = tile_idx_i * tile_size_m * mat_size_n;
  const uint32_t col_offset = tile_idx_j * tile_size_n;
  elem_t * const dram_c_tile_start = c_addr + row_offset + col_offset;
  ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, 0, (uint64_t) dram_c_tile_start, k_LOOP_WS_CONFIG_ADDRS_DC)

  const uint32_t stride_cmd = (mat_size_n << 20) | GEMMINI_CISC_SET_DC_STRIDE;
  GEMMINI_CISC_CMD_R(stride_cmd);
  GEMMINI_CISC_CMD_I(GEMMINI_CISC_STORE_TO_GMEM);
  // Alternative RoCC loop sequence, kept disabled for reference:
  // ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, 0, BOUND_INST, k_LOOP_WS_CONFIG_BOUNDS)
  // ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, 0, mat_size_n, k_LOOP_WS_CONFIG_STRIDES_DC)
  // ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, 0, loop_matmul_skips(1, 1, 1, 1, 0), k_LOOP_WS)
}
// Issue the STORE_TO_SPAD command with the destination hexadecile placed
// from bit 8 upward.
inline void gemmini_tile_store_c_spad(const uint32_t c_hexadecile) {
  const uint32_t store_cmd = (c_hexadecile << 8) | GEMMINI_CISC_STORE_TO_SPAD;
  GEMMINI_CISC_CMD_R(store_cmd);
}
// Issue the MANUAL command; the command word carries only the opcode.
inline void gemmini_manual_job() {
  const int manual_cmd = GEMMINI_CISC_MANUAL;
  GEMMINI_CISC_CMD_I(manual_cmd);
}
/* inline static void sp_tiled_matmul_full_spad_ws(const uint32_t A_sp_addr_start, const uint32_t B_sp_addr_start,
const uint32_t D_sp_addr_start, const uint32_t C_dst_sp_addr_start,
size_t I, size_t J, size_t K, size_t pad_I, size_t pad_J, size_t pad_K,
bool a_transpose, bool b_transpose,
bool full_C, bool low_D, bool acc,
int act, int skip_mvout) {
gemmini_loop_ws_spad(I, J, K, pad_I, pad_J, pad_K,
A_sp_addr_start, B_sp_addr_start + K * J * DIM, NULL, C_dst_sp_addr_start,
a_transpose, b_transpose,
full_C, low_D, acc,
act, 0, 0, false, skip_mvout); */
/*
return;
// const uint32_t A_sp_addr_start = 0;
// const uint32_t B_sp_addr_start = BANK_NUM * BANK_ROWS - K * J * DIM;
// const uint32_t D_sp_addr_start = 1 << (ADDR_LEN-1);
const uint32_t C_sp_addr_start = 2 << (ADDR_LEN-2) | (full_C << (ADDR_LEN-3));
// const int D_blocks = low_D ? (J <= MAX_BLOCK_LEN ? J : MAX_BLOCK_LEN) :
// (J <= MAX_BLOCK_LEN_ACC ? J : MAX_BLOCK_LEN_ACC);
const int C_blocks = 1; //full_C ? 1 : (J <= MAX_BLOCK_LEN ? J : MAX_BLOCK_LEN);
// const size_t sizeof_D = low_D ? sizeof(elem_t) : sizeof(acc_t);
const size_t sizeof_C = full_C ? sizeof(acc_t) : sizeof(elem_t);
gemmini_fence();
if (a_transpose || b_transpose || (I < 4)) {
for (size_t k = 0; k < K; k++) {
for (size_t j = 0; j < J; j++) {
for (size_t i = 0; i < I; i++) {
const uint32_t A_sp_addr = a_transpose ? (A_sp_addr_start + (k*I + i)*DIM) :
(A_sp_addr_start + (i*K + k)*DIM);
const uint32_t B_sp_addr = b_transpose ? (B_sp_addr_start + (j*K + k)*DIM) :
(B_sp_addr_start + (k*J + j)*DIM);
const uint32_t C_sp_addr = C_sp_addr_start + (i*J + j)*DIM;
// Compute
uint32_t pre_sp_addr = i == 0 ? B_sp_addr : GARBAGE_ADDR;
uint32_t out_sp_addr = C_sp_addr | ((k == 0 ? 0 : 1) << (ADDR_LEN-2));
gemmini_extended_preload(pre_sp_addr, out_sp_addr, DIM, DIM, DIM, DIM);
if (i == 0) { // First iteration
gemmini_extended_compute_preloaded(A_sp_addr, GARBAGE_ADDR, DIM, DIM, DIM, DIM);
} else { // All other iterations
gemmini_extended_compute_accumulated(A_sp_addr, GARBAGE_ADDR, DIM, DIM, DIM, DIM);
}
if (k == K - 1) {
// Move-out C (if not normalizing)
// if (((act != LAYERNORM) && (act != SOFTMAX)) && (j == J-1 || j % C_blocks == C_blocks-1)) {
const size_t rounded_j = j; // (j / C_blocks) * C_blocks;
const uint32_t rounded_C_sp_addr = C_sp_addr; // C_sp_addr_start + (i*J + rounded_j)*DIM;
const uint32_t C_dst_sp_addr = ((uint32_t) C_dst_sp_addr_start) + (i * J + rounded_j) * DIM; // * DIM * sizeof_C;
// const size_t blocks = rounded_j + C_blocks <= J ? C_blocks : J-rounded_j;
constexpr size_t cols = DIM; // blocks * DIM - (rounded_j + blocks >= J ? pad_J : 0);
constexpr size_t rows = DIM; // DIM - (i == I - 1 ? pad_I : 0);
gemmini_extended_mvout_spad(C_dst_sp_addr, 1, rounded_C_sp_addr, cols, rows);
// }
}
}
}
}
} else {
for (size_t k = 0; k < K; k++) {
for (size_t j = 0; j < J; j++) {
uint32_t A_sp_addr = A_sp_addr_start + k * DIM; // (i*K + k)*DIM;
const uint32_t B_sp_addr = B_sp_addr_start + (k*J + j)*DIM;
uint32_t C_sp_addr = C_sp_addr_start + j * DIM; // (i*J + j)*DIM;
for (size_t i = 0; i < I; i += 4) {
// Compute
// constexpr uint32_t pre_sp_addr = i == 0 ? B_sp_addr : GARBAGE_ADDR;
const uint32_t out_sp_addr = C_sp_addr | ((k == 0 ? 0 : 1) << (ADDR_LEN-2));
if (i == 0) { // First iteration
gemmini_extended_preload(B_sp_addr, out_sp_addr, DIM, DIM, DIM, DIM);
gemmini_extended_compute_preloaded(A_sp_addr, GARBAGE_ADDR, DIM, DIM, DIM, DIM);
gemmini_extended_preload(GARBAGE_ADDR, out_sp_addr + J * DIM, DIM, DIM, DIM, DIM);
gemmini_extended_compute_accumulated(A_sp_addr + K * DIM, GARBAGE_ADDR, DIM, DIM, DIM, DIM);
gemmini_extended_preload(GARBAGE_ADDR, out_sp_addr + 2 * J * DIM, DIM, DIM, DIM, DIM);
gemmini_extended_compute_accumulated(A_sp_addr + 2 * K * DIM, GARBAGE_ADDR, DIM, DIM, DIM, DIM);
gemmini_extended_preload(GARBAGE_ADDR, out_sp_addr + 3 * J * DIM, DIM, DIM, DIM, DIM);
gemmini_extended_compute_accumulated(A_sp_addr + 3 * K * DIM, GARBAGE_ADDR, DIM, DIM, DIM, DIM);
} else { // All other iterations
gemmini_extended_preload(GARBAGE_ADDR, out_sp_addr, DIM, DIM, DIM, DIM);
gemmini_extended_compute_accumulated(A_sp_addr, GARBAGE_ADDR, DIM, DIM, DIM, DIM);
gemmini_extended_preload(GARBAGE_ADDR, out_sp_addr + J * DIM, DIM, DIM, DIM, DIM);
gemmini_extended_compute_accumulated(A_sp_addr + K * DIM, GARBAGE_ADDR, DIM, DIM, DIM, DIM);
gemmini_extended_preload(GARBAGE_ADDR, out_sp_addr + 2 * J * DIM, DIM, DIM, DIM, DIM);
gemmini_extended_compute_accumulated(A_sp_addr + 2 * K * DIM, GARBAGE_ADDR, DIM, DIM, DIM, DIM);
gemmini_extended_preload(GARBAGE_ADDR, out_sp_addr + 3 * J * DIM, DIM, DIM, DIM, DIM);
gemmini_extended_compute_accumulated(A_sp_addr + 3 * K * DIM, GARBAGE_ADDR, DIM, DIM, DIM, DIM);
}
if (k == K - 1) {
for (int x = 0; x < 3; x++) gemmini_fence();
gemmini_extended_mvout_spad((uint32_t) C_dst_sp_addr_start + (i * J + j) * DIM, 1, C_sp_addr, DIM, DIM);
gemmini_extended_mvout_spad((uint32_t) C_dst_sp_addr_start + ((i + 1) * J + j) * DIM, 1, C_sp_addr + J * DIM, DIM, DIM);
gemmini_extended_mvout_spad((uint32_t) C_dst_sp_addr_start + ((i + 2) * J + j) * DIM, 1, C_sp_addr + 2 * J * DIM, DIM, DIM);
gemmini_extended_mvout_spad((uint32_t) C_dst_sp_addr_start + ((i + 3) * J + j) * DIM, 1, C_sp_addr + 3 * J * DIM, DIM, DIM);
}
A_sp_addr += 4 * K * DIM;
C_sp_addr += 4 * J * DIM;
}
}
}
}
gemmini_fence();
}*/
#endif

228
lib/include/vx_intrinsics.h Normal file
View File

@@ -0,0 +1,228 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef __VX_INTRINSICS_H__
#define __VX_INTRINSICS_H__
#include <VX_config.h>
#include <VX_types.h>
// __UNIFORM__ attaches the "vortex.uniform" annotation when building with
// clang and expands to nothing on other compilers.
#if defined(__clang__)
#define __UNIFORM__ __attribute__((annotate("vortex.uniform")))
#else
#define __UNIFORM__
#endif
#ifdef __cplusplus
extern "C" {
#endif
// __ASM_STR(x): passes x through unchanged in assembly sources and turns it
// into a string literal in C/C++ sources.
#ifdef __ASSEMBLY__
#define __ASM_STR(x) x
#else
#define __ASM_STR(x) #x
#endif
// Major opcodes used by the `.insn` directives below (the RISC-V custom-0 ..
// custom-3 opcode slots)
#define RISCV_CUSTOM0 0x0B
#define RISCV_CUSTOM1 0x2B
#define RISCV_CUSTOM2 0x5B
#define RISCV_CUSTOM3 0x7B
// CSR access helpers (GNU statement expressions).
// `csr` must be a compile-time constant: it is passed with the "i" constraint.
// For the writing variants, a `val` that is a compile-time constant below 32
// is emitted as an immediate operand; any other value goes through a register.
// NOTE(review): values are held in `unsigned`, so on an XLEN=64 build only
// the low 32 bits of a CSR are transferred — confirm this is intended.

// Read `csr` and yield its value.
#define csr_read(csr) ({ \
unsigned __r; \
__asm__ __volatile__ ("csrr %0, %1" : "=r" (__r) : "i" (csr)); \
__r; \
})
// Write `val` to `csr`.
#define csr_write(csr, val) ({ \
unsigned __v = (unsigned)(val); \
if (__builtin_constant_p(val) && __v < 32) \
__asm__ __volatile__ ("csrw %0, %1" :: "i" (csr), "i" (__v)); \
else \
__asm__ __volatile__ ("csrw %0, %1" :: "i" (csr), "r" (__v)); \
})
// Write `val` to `csr` and yield the previous value (csrrw).
#define csr_swap(csr, val) ({ \
unsigned __r; \
unsigned __v = (unsigned)(val); \
if (__builtin_constant_p(val) && __v < 32) \
__asm__ __volatile__ ("csrrw %0, %1, %2" : "=r" (__r) : "i" (csr), "i" (__v)); \
else \
__asm__ __volatile__ ("csrrw %0, %1, %2" : "=r" (__r) : "i" (csr), "r" (__v)); \
__r; \
})
// Set the bits of `val` in `csr` and yield the previous value (csrrs).
#define csr_read_set(csr, val) ({ \
unsigned __r; \
unsigned __v = (unsigned)(val); \
if (__builtin_constant_p(val) && __v < 32) \
__asm__ __volatile__ ("csrrs %0, %1, %2" : "=r" (__r) : "i" (csr), "i" (__v)); \
else \
__asm__ __volatile__ ("csrrs %0, %1, %2" : "=r" (__r) : "i" (csr), "r" (__v)); \
__r; \
})
// Set the bits of `val` in `csr` (csrs).
#define csr_set(csr, val) ({ \
unsigned __v = (unsigned)(val); \
if (__builtin_constant_p(val) && __v < 32) \
__asm__ __volatile__ ("csrs %0, %1" :: "i" (csr), "i" (__v)); \
else \
__asm__ __volatile__ ("csrs %0, %1" :: "i" (csr), "r" (__v)); \
})
// Clear the bits of `val` in `csr` and yield the previous value (csrrc).
#define csr_read_clear(csr, val) ({ \
unsigned __r; \
unsigned __v = (unsigned)(val); \
if (__builtin_constant_p(val) && __v < 32) \
__asm__ __volatile__ ("csrrc %0, %1, %2" : "=r" (__r) : "i" (csr), "i" (__v)); \
else \
__asm__ __volatile__ ("csrrc %0, %1, %2" : "=r" (__r) : "i" (csr), "r" (__v)); \
__r; \
})
// Clear the bits of `val` in `csr` (csrc).
#define csr_clear(csr, val) ({ \
unsigned __v = (unsigned)(val); \
if (__builtin_constant_p(val) && __v < 32) \
__asm__ __volatile__ ("csrc %0, %1" :: "i" (csr), "i" (__v)); \
else \
__asm__ __volatile__ ("csrc %0, %1" :: "i" (csr), "r" (__v)); \
})
// The functions below emit custom instructions with the assembler's `.insn`
// directive. For the R format the operands are: opcode, func3, func7, rd,
// rs1, rs2. All of them use RISCV_CUSTOM0 except vx_cmov, and are told apart
// by func3.

// Conditional move
// R4-format instruction on RISCV_CUSTOM1 (func3 = 1) taking c, t and f as
// rs1/rs2/rs3 and returning rd.
// NOTE(review): presumably yields t when c is non-zero and f otherwise — confirm.
inline unsigned vx_cmov(unsigned c, unsigned t, unsigned f) {
unsigned ret;
asm volatile (".insn r4 %1, 1, 0, %0, %2, %3, %4" : "=r"(ret) : "i"(RISCV_CUSTOM1), "r"(c), "r"(t), "r"(f));
return ret;
}
// Set thread mask (func3 = 0, rs1 = thread_mask)
inline void vx_tmc(unsigned thread_mask) {
asm volatile (".insn r %0, 0, 0, x0, %1, x0" :: "i"(RISCV_CUSTOM0), "r"(thread_mask));
}
// disable all threads in the current warp (thread-mask instruction with rs1 = x0)
inline void vx_tmc_zero() {
asm volatile (".insn r %0, 0, 0, x0, x0, x0" :: "i"(RISCV_CUSTOM0));
}
// switch execution to single thread zero (thread mask = 1, loaded through a0)
inline void vx_tmc_one() {
asm volatile (
"li a0, 1\n\t" // Load immediate value 1 into a0 (x10) register
".insn r %0, 0, 0, x0, a0, x0" :: "i"(RISCV_CUSTOM0)
: "a0" // Indicate that a0 (x10) is clobbered
);
}
// Set thread predicate (func3 = 5, rs1 = condition, rs2 = thread_mask)
inline void vx_pred(unsigned condition, unsigned thread_mask) {
asm volatile (".insn r %0, 5, 0, x0, %1, %2" :: "i"(RISCV_CUSTOM0), "r"(condition), "r"(thread_mask));
}
// Entry point type for spawned warps
typedef void (*vx_wspawn_pfn)();
// Spawn warps (func3 = 1, rs1 = num_warps, rs2 = entry point address)
inline void vx_wspawn(unsigned num_warps, vx_wspawn_pfn func_ptr) {
asm volatile (".insn r %0, 1, 0, x0, %1, %2" :: "i"(RISCV_CUSTOM0), "r"(num_warps), "r"(func_ptr));
}
// Split on a predicate (func3 = 2); the value produced in rd is returned.
// NOTE(review): presumably the returned value is what vx_join() expects as
// `stack_ptr` — confirm.
inline unsigned vx_split(unsigned predicate) {
unsigned ret;
asm volatile (".insn r %1, 2, 0, %0, %2, x0" : "=r"(ret) : "i"(RISCV_CUSTOM0), "r"(predicate));
return ret;
}
// Join (func3 = 3, rs1 = stack_ptr)
inline void vx_join(unsigned stack_ptr) {
asm volatile (".insn r %0, 3, 0, x0, %1, x0" :: "i"(RISCV_CUSTOM0), "r"(stack_ptr));
}
// Warp Barrier (func3 = 4, rs1 = barrier id, rs2 = num_warps)
// The first parameter is spelled `barried_id` (sic).
__attribute__((convergent))
inline void vx_barrier(unsigned barried_id, unsigned num_warps) {
asm volatile (".insn r %0, 4, 0, x0, %1, %2" :: "i"(RISCV_CUSTOM0), "r"(barried_id), "r"(num_warps));
}
// Each getter below performs one `csrr` from the named CSR and returns the
// value as an int.

// Return current thread identifier
inline int vx_thread_id() {
int ret;
asm volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_THREAD_ID));
return ret;
}
// Return current warp identifier
inline int vx_warp_id() {
int ret;
asm volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_WARP_ID));
return ret;
}
// Return current core identifier
inline int vx_core_id() {
int ret;
asm volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_CORE_ID));
return ret;
}
// Return current thread mask
inline int vx_thread_mask() {
int ret;
asm volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_THREAD_MASK));
return ret;
}
// Return the value of the warp mask CSR (VX_CSR_WARP_MASK).
// NOTE(review): this is a mask rather than a count, presumably one bit per
// active warp — confirm before treating the result as a number of warps.
inline int vx_active_warps() {
int ret;
asm volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_WARP_MASK));
return ret;
}
// Return the number of threads per warp
inline int vx_num_threads() {
int ret;
asm volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_NUM_THREADS));
return ret;
}
// Return the number of warps per core
inline int vx_num_warps() {
int ret;
asm volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_NUM_WARPS));
return ret;
}
// Return the number of cores per cluster
inline int vx_num_cores() {
int ret;
asm volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_NUM_CORES));
return ret;
}
// Return the hart identifier (thread id across the processor)
inline int vx_hart_id() {
int ret;
asm volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_MHARTID));
return ret;
}
// Full fence: `fence iorw, iorw` orders all earlier device I/O and memory
// accesses before all later ones. The "memory" clobber also makes the
// statement a compiler barrier, so the compiler cannot cache values in
// registers across the fence or move ordinary loads and stores past it.
inline void vx_fence() {
  asm volatile ("fence iorw, iorw" ::: "memory");
}
#ifdef __cplusplus
}
#endif
#endif // __VX_INTRINSICS_H__

34
lib/include/vx_print.h Normal file
View File

@@ -0,0 +1,34 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef __VX_PRINT_H__
#define __VX_PRINT_H__
#include <stdarg.h>
#ifdef __cplusplus
extern "C" {
#endif
// Console output routines; the definitions live outside this header.
// NOTE(review): the descriptions below are inferred from the signatures
// only — confirm against the implementations.
// Formatted output taking an already-started va_list
int vx_vprintf(const char* format, va_list va);
// Formatted output taking variadic arguments
int vx_printf(const char * format, ...);
// Output a single character
void vx_putchar(int c);
// Output an integer in the given base
void vx_putint(int value, int base);
// Output a float with the given precision
void vx_putfloat(float value, int precision);
#ifdef __cplusplus
}
#endif
#endif // __VX_PRINT_H__

64
lib/include/vx_spawn.h Normal file
View File

@@ -0,0 +1,64 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef __VX_SPAWN_H__
#define __VX_SPAWN_H__
#include <stdint.h>
#include <stdio.h>
// Number of cores per cluster: a value defined earlier wins, otherwise 8
#if !defined(CORES_PER_CLUSTER)
#define CORES_PER_CLUSTER 8
#endif
#ifdef __cplusplus
extern "C" {
#endif
// Launch context handed to kernel callbacks.
// NOTE(review): the field meanings below are inferred from the field names
// (the layout resembles an OpenCL NDRange description) — confirm against
// the code that fills this structure.
typedef struct {
  uint32_t num_groups[3];           // number of work-groups per dimension
  uint32_t global_offset[3];        // global id offset per dimension
  uint32_t local_size[3];           // work-group size per dimension
  char * printf_buffer;             // device-side printf buffer
  uint32_t *printf_buffer_position; // current write position in printf_buffer
  uint32_t printf_buffer_capacity;  // capacity of printf_buffer
  uint32_t work_dim;                // number of dimensions in use
} context_t;

// Kernel callback: receives the user argument, the launch context and the
// (x, y, z) index of the group to execute.
typedef void (*vx_spawn_kernel_cb) (
  const void * /* arg */,
  const context_t * /* context */,
  uint32_t /* group_x */,
  uint32_t /* group_y */,
  uint32_t /* group_z */
);

// Task callback: receives the task index and the user argument.
typedef void (*vx_spawn_tasks_cb)(int task_id, void *arg);

// Serial callback: receives the user argument.
typedef void (*vx_serial_cb)(void *arg);

// Declared with an explicit (void) parameter list so that C callers get a
// real prototype; in C an empty list `()` leaves the arguments unchecked.
void vx_wspawn_wait(void);

// Run `callback` for the groups described by `ctx`, passing `arg` through.
void vx_spawn_kernel(context_t * ctx, vx_spawn_kernel_cb callback, void * arg);

// Run `callback` for task ids in [0, num_tasks), passing `arg` through.
// NOTE(review): how tasks are distributed by the _cluster and _contiguous
// variants is not visible from this header — see the implementation.
void vx_spawn_tasks(int num_tasks, vx_spawn_tasks_cb callback, void * arg);
void vx_spawn_tasks_cluster(int num_tasks, vx_spawn_tasks_cb callback, void * arg);
void vx_spawn_tasks_contiguous(int num_tasks, vx_spawn_tasks_cb callback , void * arg);

// Run `callback(arg)`.
// NOTE(review): presumably executed by one thread at a time — confirm.
void vx_serial(vx_serial_cb callback, void * arg);
#ifdef __cplusplus
}
#endif
#endif // __VX_SPAWN_H__

296
lib/linker/vx_link32.ld Normal file
View File

@@ -0,0 +1,296 @@
/* Default linker script, for normal executables */
/* Copyright (C) 2014-2020 Free Software Foundation, Inc.
Copying and distribution of this script, with or without modification,
are permitted in any medium without royalty provided the copyright
notice and this notice are preserved. */
OUTPUT_FORMAT("elf32-littleriscv", "elf32-littleriscv",
"elf32-littleriscv")
OUTPUT_ARCH(riscv)
ENTRY(_start)
/* Memory regions of the 32-bit link. Within this script only the .args and
   .operand.a/.b/.c output sections at the end of SECTIONS name a region
   explicitly (DRAMARG and DRAM1..DRAM3); no output section names DRAM0.
   Note that DRAMARG (0x9fff0000, 8K) lies inside the address range covered by
   DRAM0 (0x80000000 + 512M ends at 0xa0000000).
   NOTE(review): confirm that this overlap is intended. */
MEMORY {
DRAM0 (rwx): ORIGIN = 0x80000000, LENGTH = 512M
DRAMARG (rwx): ORIGIN = 0x9fff0000, LENGTH = 8K
DRAM1 (rwx): ORIGIN = 0xa0000000, LENGTH = 16M
DRAM2 (rwx): ORIGIN = 0xa1000000, LENGTH = 16M
DRAM3 (rwx): ORIGIN = 0xa2000000, LENGTH = 16M
}
SECTIONS
{
/* Start address of the first output section. STARTUP_ADDR is not defined
   anywhere in this script, so it has to be supplied at link time
   (NOTE(review): presumably through --defsym in the build flags - confirm). */
. = STARTUP_ADDR;
.interp : { *(.interp) }
.note.gnu.build-id : { *(.note.gnu.build-id) }
.hash : { *(.hash) }
.gnu.hash : { *(.gnu.hash) }
.dynsym : { *(.dynsym) }
.dynstr : { *(.dynstr) }
.gnu.version : { *(.gnu.version) }
.gnu.version_d : { *(.gnu.version_d) }
.gnu.version_r : { *(.gnu.version_r) }
.rela.init : { *(.rela.init) }
.rela.text : { *(.rela.text .rela.text.* .rela.gnu.linkonce.t.*) }
.rela.fini : { *(.rela.fini) }
.rela.rodata : { *(.rela.rodata .rela.rodata.* .rela.gnu.linkonce.r.*) }
.rela.data.rel.ro : { *(.rela.data.rel.ro .rela.data.rel.ro.* .rela.gnu.linkonce.d.rel.ro.*) }
.rela.data : { *(.rela.data .rela.data.* .rela.gnu.linkonce.d.*) }
.rela.tdata : { *(.rela.tdata .rela.tdata.* .rela.gnu.linkonce.td.*) }
.rela.tbss : { *(.rela.tbss .rela.tbss.* .rela.gnu.linkonce.tb.*) }
.rela.ctors : { *(.rela.ctors) }
.rela.dtors : { *(.rela.dtors) }
.rela.got : { *(.rela.got) }
.rela.sdata : { *(.rela.sdata .rela.sdata.* .rela.gnu.linkonce.s.*) }
.rela.sbss : { *(.rela.sbss .rela.sbss.* .rela.gnu.linkonce.sb.*) }
.rela.sdata2 : { *(.rela.sdata2 .rela.sdata2.* .rela.gnu.linkonce.s2.*) }
.rela.sbss2 : { *(.rela.sbss2 .rela.sbss2.* .rela.gnu.linkonce.sb2.*) }
.rela.bss : { *(.rela.bss .rela.bss.* .rela.gnu.linkonce.b.*) }
.rela.iplt :
{
PROVIDE_HIDDEN (__rela_iplt_start = .);
*(.rela.iplt)
PROVIDE_HIDDEN (__rela_iplt_end = .);
}
.rela.plt :
{
*(.rela.plt)
}
.init :
{
KEEP (*(SORT_NONE(.init)))
}
.plt : { *(.plt) }
.iplt : { *(.iplt) }
.text :
{
*(.text.unlikely .text.*_unlikely .text.unlikely.*)
*(.text.exit .text.exit.*)
*(.text.startup .text.startup.*)
*(.text.hot .text.hot.*)
*(SORT(.text.sorted.*))
*(.text .stub .text.* .gnu.linkonce.t.*)
/* .gnu.warning sections are handled specially by elf.em. */
*(.gnu.warning)
}
.fini :
{
KEEP (*(SORT_NONE(.fini)))
}
PROVIDE (__etext = .);
PROVIDE (_etext = .);
PROVIDE (etext = .);
.rodata : { *(.rodata .rodata.* .gnu.linkonce.r.*) }
.rodata1 : { *(.rodata1) }
.sdata2 :
{
*(.sdata2 .sdata2.* .gnu.linkonce.s2.*)
}
.sbss2 : { *(.sbss2 .sbss2.* .gnu.linkonce.sb2.*) }
.eh_frame_hdr : { *(.eh_frame_hdr) *(.eh_frame_entry .eh_frame_entry.*) }
.eh_frame : ONLY_IF_RO { KEEP (*(.eh_frame)) *(.eh_frame.*) }
.gcc_except_table : ONLY_IF_RO { *(.gcc_except_table .gcc_except_table.*) }
.gnu_extab : ONLY_IF_RO { *(.gnu_extab*) }
/* These sections are generated by the Sun/Oracle C++ compiler. */
.exception_ranges : ONLY_IF_RO { *(.exception_ranges*) }
/* Adjust the address for the data segment. We want to adjust up to
the same address within the page on the next page up. */
. = DATA_SEGMENT_ALIGN (CONSTANT (MAXPAGESIZE), CONSTANT (COMMONPAGESIZE));
/* Exception handling */
.eh_frame : ONLY_IF_RW { KEEP (*(.eh_frame)) *(.eh_frame.*) }
.gnu_extab : ONLY_IF_RW { *(.gnu_extab) }
.gcc_except_table : ONLY_IF_RW { *(.gcc_except_table .gcc_except_table.*) }
.exception_ranges : ONLY_IF_RW { *(.exception_ranges*) }
/* Thread Local Storage sections */
.tdata :
{
PROVIDE_HIDDEN (__tdata_start = .);
*(.tdata .tdata.* .gnu.linkonce.td.*)
PROVIDE_HIDDEN (__tdata_end = .);
}
PROVIDE (__tdata_size = SIZEOF (.tdata));
.tbss :
{
PROVIDE_HIDDEN (__tbss_start = .);
PROVIDE_HIDDEN (__tbss_offset = ABSOLUTE (__tbss_start - __tdata_start));
*(.tbss .tbss.* .gnu.linkonce.tb.*) *(.tcommon)
PROVIDE_HIDDEN (__tbss_end = .);
}
PROVIDE (__tbss_size = SIZEOF (.tbss));
PROVIDE (__tcb_aligned_size = ALIGN(__tbss_end - __tdata_start, 64));
.preinit_array :
{
PROVIDE_HIDDEN (__preinit_array_start = .);
KEEP (*(.preinit_array))
PROVIDE_HIDDEN (__preinit_array_end = .);
}
.init_array :
{
PROVIDE_HIDDEN (__init_array_start = .);
KEEP (*(SORT_BY_INIT_PRIORITY(.init_array.*) SORT_BY_INIT_PRIORITY(.ctors.*)))
KEEP (*(.init_array EXCLUDE_FILE (*crtbegin.o *crtbegin?.o *crtend.o *crtend?.o ) .ctors))
PROVIDE_HIDDEN (__init_array_end = .);
}
.fini_array :
{
PROVIDE_HIDDEN (__fini_array_start = .);
KEEP (*(SORT_BY_INIT_PRIORITY(.fini_array.*) SORT_BY_INIT_PRIORITY(.dtors.*)))
KEEP (*(.fini_array EXCLUDE_FILE (*crtbegin.o *crtbegin?.o *crtend.o *crtend?.o ) .dtors))
PROVIDE_HIDDEN (__fini_array_end = .);
}
/* Padding-only output section: it has no input sections and just advances the
   location counter to the next 0x1000 boundary. */
.htif_pad : {
. = ALIGN(0x1000);
}
. = ALIGN(0x1000);
/* Collects the .tohost input sections, starting on a 0x1000 boundary and
   followed by alignment to the next one.
   NOTE(review): presumably the HTIF host-communication area - confirm. */
.tohost : {
*(.tohost)
/* . += 0x100; */
}
. = ALIGN(0x1000);
.ctors :
{
/* gcc uses crtbegin.o to find the start of
the constructors, so we make sure it is
first. Because this is a wildcard, it
doesn't matter if the user does not
actually link against crtbegin.o; the
linker won't look for a file to match a
wildcard. The wildcard also means that it
doesn't matter which directory crtbegin.o
is in. */
KEEP (*crtbegin.o(.ctors))
KEEP (*crtbegin?.o(.ctors))
/* We don't want to include the .ctor section from
the crtend.o file until after the sorted ctors.
The .ctor section from the crtend file contains the
end of ctors marker and it must be last */
KEEP (*(EXCLUDE_FILE (*crtend.o *crtend?.o ) .ctors))
KEEP (*(SORT(.ctors.*)))
KEEP (*(.ctors))
}
.dtors :
{
KEEP (*crtbegin.o(.dtors))
KEEP (*crtbegin?.o(.dtors))
KEEP (*(EXCLUDE_FILE (*crtend.o *crtend?.o ) .dtors))
KEEP (*(SORT(.dtors.*)))
KEEP (*(.dtors))
}
.jcr : { KEEP (*(.jcr)) }
.data.rel.ro : { *(.data.rel.ro.local* .gnu.linkonce.d.rel.ro.local.*) *(.data.rel.ro .data.rel.ro.* .gnu.linkonce.d.rel.ro.*) }
.dynamic : { *(.dynamic) }
. = DATA_SEGMENT_RELRO_END (0, .);
.data :
{
__DATA_BEGIN__ = .;
*(.data .data.* .gnu.linkonce.d.*)
SORT(CONSTRUCTORS)
}
.data1 : { *(.data1) }
.got : { *(.got.plt) *(.igot.plt) *(.got) *(.igot) }
/* We want the small data sections together, so single-instruction offsets
can access them all, and initialized data all before uninitialized, so
we can shorten the on-disk segment size. */
.sdata :
{
__SDATA_BEGIN__ = .;
*(.srodata.cst16) *(.srodata.cst8) *(.srodata.cst4) *(.srodata.cst2) *(.srodata .srodata.*)
*(.sdata .sdata.* .gnu.linkonce.s.*)
}
_edata = .; PROVIDE (edata = .);
. = .;
__bss_start = .;
.sbss :
{
*(.dynsbss)
*(.sbss .sbss.* .gnu.linkonce.sb.*)
*(.scommon)
}
.bss :
{
*(.dynbss)
*(.bss .bss.* .gnu.linkonce.b.*)
*(COMMON)
/* Align here to ensure that the .bss section occupies space up to
_end. Align after .bss to ensure correct alignment even if the
.bss section disappears because there are no input sections.
FIXME: Why do we need it? When there is no .bss section, we do not
pad the .data section. */
. = ALIGN(. != 0 ? 32 / 8 : 1);
}
. = ALIGN(32 / 8);
. = SEGMENT_START("ldata-segment", .);
. = ALIGN(32 / 8);
__BSS_END__ = .;
/* __global_pointer is placed 0x800 past the start of .sdata, but never higher
   than MAX(__DATA_BEGIN__ + 0x800, __BSS_END__ - 0x800).
   NOTE(review): presumably loaded into gp by the startup code so that
   gp-relative accesses reach the small-data area - confirm in vx_start.S. */
__global_pointer = MIN(__SDATA_BEGIN__ + 0x800,
MAX(__DATA_BEGIN__ + 0x800, __BSS_END__ - 0x800));
_end = .; PROVIDE (end = .);
. = DATA_SEGMENT_END (.);
/* Stabs debugging sections. */
.stab 0 : { *(.stab) }
.stabstr 0 : { *(.stabstr) }
.stab.excl 0 : { *(.stab.excl) }
.stab.exclstr 0 : { *(.stab.exclstr) }
.stab.index 0 : { *(.stab.index) }
.stab.indexstr 0 : { *(.stab.indexstr) }
.comment 0 : { *(.comment) }
.gnu.build.attributes : { *(.gnu.build.attributes .gnu.build.attributes.*) }
/* DWARF debug sections.
Symbols in the DWARF debugging sections are relative to the beginning
of the section so we begin them at 0. */
/* DWARF 1 */
.debug 0 : { *(.debug) }
.line 0 : { *(.line) }
/* GNU DWARF 1 extensions */
.debug_srcinfo 0 : { *(.debug_srcinfo) }
.debug_sfnames 0 : { *(.debug_sfnames) }
/* DWARF 1.1 and DWARF 2 */
.debug_aranges 0 : { *(.debug_aranges) }
.debug_pubnames 0 : { *(.debug_pubnames) }
/* DWARF 2 */
.debug_info 0 : { *(.debug_info .gnu.linkonce.wi.*) }
.debug_abbrev 0 : { *(.debug_abbrev) }
.debug_line 0 : { *(.debug_line .debug_line.* .debug_line_end) }
.debug_frame 0 : { *(.debug_frame) }
.debug_str 0 : { *(.debug_str) }
.debug_loc 0 : { *(.debug_loc) }
.debug_macinfo 0 : { *(.debug_macinfo) }
/* SGI/MIPS DWARF 2 extensions */
.debug_weaknames 0 : { *(.debug_weaknames) }
.debug_funcnames 0 : { *(.debug_funcnames) }
.debug_typenames 0 : { *(.debug_typenames) }
.debug_varnames 0 : { *(.debug_varnames) }
/* DWARF 3 */
.debug_pubtypes 0 : { *(.debug_pubtypes) }
.debug_ranges 0 : { *(.debug_ranges) }
/* DWARF Extension. */
.debug_macro 0 : { *(.debug_macro) }
.debug_addr 0 : { *(.debug_addr) }
.gnu.attributes 0 : { KEEP (*(.gnu.attributes)) }
/DISCARD/ : { *(.note.GNU-stack) *(.gnu_debuglink) *(.gnu.lto_*) }
/* Argument area: any .args input sections followed by 8K of reserved space,
   placed in the DRAMARG region.
   NOTE(review): DRAMARG is declared with LENGTH = 8K, which the `. += 8K`
   reservation alone already fills; any non-empty .args input would push the
   section past the end of the region. Confirm that .args inputs are expected
   to be empty, or shrink the reservation. */
.args : {
*(.args)
. += 8K;
}> DRAMARG
/* Operand areas a, b and c: each collects its own input sections plus 32K of
   reserved space and is placed in its own 16M region (DRAM1, DRAM2, DRAM3). */
.operand.a : {
*(.operand.a)
. += 32K;
}> DRAM1
.operand.b : {
*(.operand.b)
. += 32K;
}> DRAM2
.operand.c : {
*(.operand.c)
. += 32K;
}> DRAM3
}

252
lib/linker/vx_link64.ld Normal file
View File

@@ -0,0 +1,252 @@
/* Default linker script, for normal executables */
/* Copyright (C) 2014-2020 Free Software Foundation, Inc.
Copying and distribution of this script, with or without modification,
are permitted in any medium without royalty provided the copyright
notice and this notice are preserved. */
OUTPUT_FORMAT("elf64-littleriscv", "elf64-littleriscv",
"elf64-littleriscv")
OUTPUT_ARCH(riscv)
ENTRY(_start)
SECTIONS
{
/* Start address of the first output section. STARTUP_ADDR is not defined
   anywhere in this script, so it has to be supplied at link time
   (NOTE(review): presumably through --defsym in the build flags - confirm). */
. = STARTUP_ADDR;
.interp : { *(.interp) }
.note.gnu.build-id : { *(.note.gnu.build-id) }
.hash : { *(.hash) }
.gnu.hash : { *(.gnu.hash) }
.dynsym : { *(.dynsym) }
.dynstr : { *(.dynstr) }
.gnu.version : { *(.gnu.version) }
.gnu.version_d : { *(.gnu.version_d) }
.gnu.version_r : { *(.gnu.version_r) }
.rela.init : { *(.rela.init) }
.rela.text : { *(.rela.text .rela.text.* .rela.gnu.linkonce.t.*) }
.rela.fini : { *(.rela.fini) }
.rela.rodata : { *(.rela.rodata .rela.rodata.* .rela.gnu.linkonce.r.*) }
.rela.data.rel.ro : { *(.rela.data.rel.ro .rela.data.rel.ro.* .rela.gnu.linkonce.d.rel.ro.*) }
.rela.data : { *(.rela.data .rela.data.* .rela.gnu.linkonce.d.*) }
.rela.tdata : { *(.rela.tdata .rela.tdata.* .rela.gnu.linkonce.td.*) }
.rela.tbss : { *(.rela.tbss .rela.tbss.* .rela.gnu.linkonce.tb.*) }
.rela.ctors : { *(.rela.ctors) }
.rela.dtors : { *(.rela.dtors) }
.rela.got : { *(.rela.got) }
.rela.sdata : { *(.rela.sdata .rela.sdata.* .rela.gnu.linkonce.s.*) }
.rela.sbss : { *(.rela.sbss .rela.sbss.* .rela.gnu.linkonce.sb.*) }
.rela.sdata2 : { *(.rela.sdata2 .rela.sdata2.* .rela.gnu.linkonce.s2.*) }
.rela.sbss2 : { *(.rela.sbss2 .rela.sbss2.* .rela.gnu.linkonce.sb2.*) }
.rela.bss : { *(.rela.bss .rela.bss.* .rela.gnu.linkonce.b.*) }
.rela.iplt :
{
PROVIDE_HIDDEN (__rela_iplt_start = .);
*(.rela.iplt)
PROVIDE_HIDDEN (__rela_iplt_end = .);
}
.rela.plt :
{
*(.rela.plt)
}
.init :
{
KEEP (*(SORT_NONE(.init)))
}
.plt : { *(.plt) }
.iplt : { *(.iplt) }
.text :
{
*(.text.unlikely .text.*_unlikely .text.unlikely.*)
*(.text.exit .text.exit.*)
*(.text.startup .text.startup.*)
*(.text.hot .text.hot.*)
*(SORT(.text.sorted.*))
*(.text .stub .text.* .gnu.linkonce.t.*)
/* .gnu.warning sections are handled specially by elf.em. */
*(.gnu.warning)
}
.fini :
{
KEEP (*(SORT_NONE(.fini)))
}
PROVIDE (__etext = .);
PROVIDE (_etext = .);
PROVIDE (etext = .);
.rodata : { *(.rodata .rodata.* .gnu.linkonce.r.*) }
.rodata1 : { *(.rodata1) }
.sdata2 :
{
*(.sdata2 .sdata2.* .gnu.linkonce.s2.*)
}
.sbss2 : { *(.sbss2 .sbss2.* .gnu.linkonce.sb2.*) }
.eh_frame_hdr : { *(.eh_frame_hdr) *(.eh_frame_entry .eh_frame_entry.*) }
.eh_frame : ONLY_IF_RO { KEEP (*(.eh_frame)) *(.eh_frame.*) }
.gcc_except_table : ONLY_IF_RO { *(.gcc_except_table .gcc_except_table.*) }
.gnu_extab : ONLY_IF_RO { *(.gnu_extab*) }
/* These sections are generated by the Sun/Oracle C++ compiler. */
.exception_ranges : ONLY_IF_RO { *(.exception_ranges*) }
/* Adjust the address for the data segment. We want to adjust up to
the same address within the page on the next page up. */
. = DATA_SEGMENT_ALIGN (CONSTANT (MAXPAGESIZE), CONSTANT (COMMONPAGESIZE));
/* Exception handling */
.eh_frame : ONLY_IF_RW { KEEP (*(.eh_frame)) *(.eh_frame.*) }
.gnu_extab : ONLY_IF_RW { *(.gnu_extab) }
.gcc_except_table : ONLY_IF_RW { *(.gcc_except_table .gcc_except_table.*) }
.exception_ranges : ONLY_IF_RW { *(.exception_ranges*) }
/* Thread Local Storage sections */
.tdata :
{
PROVIDE_HIDDEN (__tdata_start = .);
*(.tdata .tdata.* .gnu.linkonce.td.*)
PROVIDE_HIDDEN (__tdata_end = .);
}
PROVIDE (__tdata_size = SIZEOF (.tdata));
.tbss :
{
PROVIDE_HIDDEN (__tbss_start = .);
PROVIDE_HIDDEN (__tbss_offset = ABSOLUTE (__tbss_start - __tdata_start));
*(.tbss .tbss.* .gnu.linkonce.tb.*) *(.tcommon)
PROVIDE_HIDDEN (__tbss_end = .);
}
PROVIDE (__tbss_size = SIZEOF (.tbss));
PROVIDE (__tcb_aligned_size = ALIGN(__tbss_end - __tdata_start, 64));
.preinit_array :
{
PROVIDE_HIDDEN (__preinit_array_start = .);
KEEP (*(.preinit_array))
PROVIDE_HIDDEN (__preinit_array_end = .);
}
.init_array :
{
PROVIDE_HIDDEN (__init_array_start = .);
KEEP (*(SORT_BY_INIT_PRIORITY(.init_array.*) SORT_BY_INIT_PRIORITY(.ctors.*)))
KEEP (*(.init_array EXCLUDE_FILE (*crtbegin.o *crtbegin?.o *crtend.o *crtend?.o ) .ctors))
PROVIDE_HIDDEN (__init_array_end = .);
}
.fini_array :
{
PROVIDE_HIDDEN (__fini_array_start = .);
KEEP (*(SORT_BY_INIT_PRIORITY(.fini_array.*) SORT_BY_INIT_PRIORITY(.dtors.*)))
KEEP (*(.fini_array EXCLUDE_FILE (*crtbegin.o *crtbegin?.o *crtend.o *crtend?.o ) .dtors))
PROVIDE_HIDDEN (__fini_array_end = .);
}
.ctors :
{
/* gcc uses crtbegin.o to find the start of
the constructors, so we make sure it is
first. Because this is a wildcard, it
doesn't matter if the user does not
actually link against crtbegin.o; the
linker won't look for a file to match a
wildcard. The wildcard also means that it
doesn't matter which directory crtbegin.o
is in. */
KEEP (*crtbegin.o(.ctors))
KEEP (*crtbegin?.o(.ctors))
/* We don't want to include the .ctor section from
the crtend.o file until after the sorted ctors.
The .ctor section from the crtend file contains the
end of ctors marker and it must be last */
KEEP (*(EXCLUDE_FILE (*crtend.o *crtend?.o ) .ctors))
KEEP (*(SORT(.ctors.*)))
KEEP (*(.ctors))
}
.dtors :
{
KEEP (*crtbegin.o(.dtors))
KEEP (*crtbegin?.o(.dtors))
KEEP (*(EXCLUDE_FILE (*crtend.o *crtend?.o ) .dtors))
KEEP (*(SORT(.dtors.*)))
KEEP (*(.dtors))
}
.jcr : { KEEP (*(.jcr)) }
.data.rel.ro : { *(.data.rel.ro.local* .gnu.linkonce.d.rel.ro.local.*) *(.data.rel.ro .data.rel.ro.* .gnu.linkonce.d.rel.ro.*) }
.dynamic : { *(.dynamic) }
. = DATA_SEGMENT_RELRO_END (0, .);
.data :
{
__DATA_BEGIN__ = .;
*(.data .data.* .gnu.linkonce.d.*)
SORT(CONSTRUCTORS)
}
.data1 : { *(.data1) }
.got : { *(.got.plt) *(.igot.plt) *(.got) *(.igot) }
/* We want the small data sections together, so single-instruction offsets
can access them all, and initialized data all before uninitialized, so
we can shorten the on-disk segment size. */
.sdata :
{
__SDATA_BEGIN__ = .;
*(.srodata.cst16) *(.srodata.cst8) *(.srodata.cst4) *(.srodata.cst2) *(.srodata .srodata.*)
*(.sdata .sdata.* .gnu.linkonce.s.*)
}
_edata = .; PROVIDE (edata = .);
. = .;
__bss_start = .;
.sbss :
{
*(.dynsbss)
*(.sbss .sbss.* .gnu.linkonce.sb.*)
*(.scommon)
}
.bss :
{
*(.dynbss)
*(.bss .bss.* .gnu.linkonce.b.*)
*(COMMON)
/* Align here to ensure that the .bss section occupies space up to
_end. Align after .bss to ensure correct alignment even if the
.bss section disappears because there are no input sections.
FIXME: Why do we need it? When there is no .bss section, we do not
pad the .data section. */
. = ALIGN(. != 0 ? 64 / 8 : 1);
}
. = ALIGN(64 / 8);
. = SEGMENT_START("ldata-segment", .);
. = ALIGN(64 / 8);
__BSS_END__ = .;
/* __global_pointer is placed 0x800 past the start of .sdata, but never higher
   than MAX(__DATA_BEGIN__ + 0x800, __BSS_END__ - 0x800).
   NOTE(review): presumably loaded into gp by the startup code so that
   gp-relative accesses reach the small-data area - confirm in vx_start.S. */
__global_pointer = MIN(__SDATA_BEGIN__ + 0x800,
MAX(__DATA_BEGIN__ + 0x800, __BSS_END__ - 0x800));
_end = .; PROVIDE (end = .);
. = DATA_SEGMENT_END (.);
/* Stabs debugging sections. */
.stab 0 : { *(.stab) }
.stabstr 0 : { *(.stabstr) }
.stab.excl 0 : { *(.stab.excl) }
.stab.exclstr 0 : { *(.stab.exclstr) }
.stab.index 0 : { *(.stab.index) }
.stab.indexstr 0 : { *(.stab.indexstr) }
.comment 0 : { *(.comment) }
.gnu.build.attributes : { *(.gnu.build.attributes .gnu.build.attributes.*) }
/* DWARF debug sections.
Symbols in the DWARF debugging sections are relative to the beginning
of the section so we begin them at 0. */
/* DWARF 1 */
.debug 0 : { *(.debug) }
.line 0 : { *(.line) }
/* GNU DWARF 1 extensions */
.debug_srcinfo 0 : { *(.debug_srcinfo) }
.debug_sfnames 0 : { *(.debug_sfnames) }
/* DWARF 1.1 and DWARF 2 */
.debug_aranges 0 : { *(.debug_aranges) }
.debug_pubnames 0 : { *(.debug_pubnames) }
/* DWARF 2 */
.debug_info 0 : { *(.debug_info .gnu.linkonce.wi.*) }
.debug_abbrev 0 : { *(.debug_abbrev) }
.debug_line 0 : { *(.debug_line .debug_line.* .debug_line_end) }
.debug_frame 0 : { *(.debug_frame) }
.debug_str 0 : { *(.debug_str) }
.debug_loc 0 : { *(.debug_loc) }
.debug_macinfo 0 : { *(.debug_macinfo) }
/* SGI/MIPS DWARF 2 extensions */
.debug_weaknames 0 : { *(.debug_weaknames) }
.debug_funcnames 0 : { *(.debug_funcnames) }
.debug_typenames 0 : { *(.debug_typenames) }
.debug_varnames 0 : { *(.debug_varnames) }
/* DWARF 3 */
.debug_pubtypes 0 : { *(.debug_pubtypes) }
.debug_ranges 0 : { *(.debug_ranges) }
/* DWARF Extension. */
.debug_macro 0 : { *(.debug_macro) }
.debug_addr 0 : { *(.debug_addr) }
.gnu.attributes 0 : { KEEP (*(.gnu.attributes)) }
/DISCARD/ : { *(.note.GNU-stack) *(.gnu_debuglink) *(.gnu.lto_*) }
}

890
lib/src/tinyprintf.c Normal file
View File

@@ -0,0 +1,890 @@
///////////////////////////////////////////////////////////////////////////////
// \author (c) Marco Paland (info@paland.com)
// 2014-2019, PALANDesign Hannover, Germany
//
// \license The MIT License (MIT)
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
//
// \brief Tiny printf, sprintf and (v)snprintf implementation, optimized for speed on
// embedded systems with very limited resources. These routines are thread
// safe and reentrant!
// Use this instead of the bloated standard/newlib printf because those use
// malloc for printf (and may not be thread safe).
//
///////////////////////////////////////////////////////////////////////////////
#include <stdbool.h>
#include <stdint.h>
#include "tinyprintf.h"
#include "vx_print.h"
// define this globally (e.g. gcc -DPRINTF_INCLUDE_CONFIG_H ...) to include the
// printf_config.h header file
// default: undefined
#ifdef PRINTF_INCLUDE_CONFIG_H
#include "printf_config.h"
#endif
// 'ntoa' conversion buffer size, this must be big enough to hold one converted
// numeric number including padded zeros (dynamically created on stack)
// default: 32 byte
#ifndef PRINTF_NTOA_BUFFER_SIZE
#define PRINTF_NTOA_BUFFER_SIZE 32U
#endif
// 'ftoa' conversion buffer size, this must be big enough to hold one converted
// float number including padded zeros (dynamically created on stack)
// default: 32 byte
#ifndef PRINTF_FTOA_BUFFER_SIZE
#define PRINTF_FTOA_BUFFER_SIZE 32U
#endif
// support for the floating point type (%f)
// default: activated
#ifndef PRINTF_DISABLE_SUPPORT_FLOAT
#define PRINTF_SUPPORT_FLOAT
#endif
// support for exponential floating point notation (%e/%g)
// default: activated
#ifndef PRINTF_DISABLE_SUPPORT_EXPONENTIAL
#define PRINTF_SUPPORT_EXPONENTIAL
#endif
// define the default floating point precision
// default: 6 digits
#ifndef PRINTF_DEFAULT_FLOAT_PRECISION
#define PRINTF_DEFAULT_FLOAT_PRECISION 6U
#endif
// define the largest float suitable to print with %f
// default: 1e9
#ifndef PRINTF_MAX_FLOAT
#define PRINTF_MAX_FLOAT 1e9
#endif
// support for the long long types (%llu or %p)
// default: activated
#ifndef PRINTF_DISABLE_SUPPORT_LONG_LONG
#define PRINTF_SUPPORT_LONG_LONG
#endif
// support for the ptrdiff_t type (%t)
// ptrdiff_t is normally defined in <stddef.h> as long or long long type
// default: activated
#ifndef PRINTF_DISABLE_SUPPORT_PTRDIFF_T
#define PRINTF_SUPPORT_PTRDIFF_T
#endif
///////////////////////////////////////////////////////////////////////////////
// internal flag definitions
#define FLAGS_ZEROPAD (1U << 0U)
#define FLAGS_LEFT (1U << 1U)
#define FLAGS_PLUS (1U << 2U)
#define FLAGS_SPACE (1U << 3U)
#define FLAGS_HASH (1U << 4U)
#define FLAGS_UPPERCASE (1U << 5U)
#define FLAGS_CHAR (1U << 6U)
#define FLAGS_SHORT (1U << 7U)
#define FLAGS_LONG (1U << 8U)
#define FLAGS_LONG_LONG (1U << 9U)
#define FLAGS_PRECISION (1U << 10U)
#define FLAGS_ADAPT_EXP (1U << 11U)
// import float.h for DBL_MAX
#if defined(PRINTF_SUPPORT_FLOAT)
#include <float.h>
#endif
// output function type: every output backend in this file has this signature.
// It receives the character to emit, an opaque buffer pointer, the running
// output index and the output limit; each backend uses only the arguments it
// needs and ignores the rest.
typedef void (*out_fct_type)(char character, void* buffer, size_t idx, size_t maxlen);
// wrapper (used as buffer) for output function type: _out_fct() casts its
// buffer argument to this struct and forwards each non-NUL character to
// fct(character, arg)
typedef struct {
void (*fct)(char character, void* arg);
void* arg;
} out_fct_wrap_type;
// buffer backend: stores the character at position idx of the destination
// array; positions at or beyond maxlen are silently dropped
static inline void _out_buffer(char character, void* buffer, size_t idx, size_t maxlen)
{
  if (idx >= maxlen) {
    return;   // outside the destination: nothing is written
  }
  char* const dest = (char*)buffer;
  dest[idx] = character;
}
// null backend: has the common backend signature but discards every character
static inline void _out_null(char character, void* buffer, size_t idx, size_t maxlen)
{
  // nothing is written; the casts only mark the parameters as deliberately unused
  (void)maxlen;
  (void)idx;
  (void)buffer;
  (void)character;
}
// character backend: hands every non-NUL character to vx_putchar(); the
// buffer, index and limit arguments are not used
static inline void _out_char(char character, void* buffer, size_t idx, size_t maxlen)
{
  (void)maxlen;
  (void)idx;
  (void)buffer;
  if (character == '\0') {
    return;   // the terminating NUL is never emitted
  }
  vx_putchar(character);
}
// internal output function wrapper
static inline void _out_fct(char character, void* buffer, size_t idx, size_t maxlen)
{
(void)idx; (void)maxlen;
if (character) {
// buffer is the output fct pointer
((out_fct_wrap_type*)buffer)->fct(character, ((out_fct_wrap_type*)buffer)->arg);
}
}
// internal secure strlen
// \param str     string to measure; at most 'maxsize' bytes of it are examined
// \param maxsize upper bound for the returned length
// \return The length of the string (excluding the terminating 0) limited by 'maxsize'
static inline unsigned int _strnlen_s(const char* str, size_t maxsize)
{
  size_t len = 0U;
  // test the bound BEFORE dereferencing, so that str[maxsize] is never read
  // when the string has no terminator within the first 'maxsize' bytes
  while ((len < maxsize) && (str[len] != '\0')) {
    ++len;
  }
  return (unsigned int)len;
}
// internal decimal-digit test
// \return true exactly for the characters '0' through '9'
static inline bool _is_digit(char ch)
{
  // a single unsigned comparison: anything below '0' wraps around to a large value
  const unsigned int offset = (unsigned int)ch - (unsigned int)'0';
  return offset <= 9U;
}
// internal ASCII string to unsigned int conversion
// Consumes the run of decimal digits at *str and leaves *str pointing at the
// first character that is not a digit. Returns 0 when there is no digit.
static unsigned int _atoi(const char** str)
{
  const char* cursor = *str;
  unsigned int result = 0U;
  for (; _is_digit(*cursor); ++cursor) {
    result = result * 10U + (unsigned int)(*cursor - '0');
  }
  *str = cursor;
  return result;
}
// prints the 'len' characters of 'buf' last-to-first through 'out', blank-padded
// to 'width': in front for right alignment without zero padding, behind for
// FLAGS_LEFT
// \return the output index after the last character written
static size_t _out_rev(out_fct_type out, char* buffer, size_t idx, size_t maxlen, const char* buf, size_t len, unsigned int width, unsigned int flags)
{
  const size_t first = idx;
  const bool left_align = (flags & FLAGS_LEFT) != 0U;
  const bool zero_pad = (flags & FLAGS_ZEROPAD) != 0U;

  // right alignment without zero padding: the blanks go in front
  if (!left_align && !zero_pad) {
    size_t column = len;
    while (column < width) {
      out(' ', buffer, idx++, maxlen);
      ++column;
    }
  }

  // the characters were stored in reverse, so walk the buffer backwards
  for (size_t remaining = len; remaining > 0U; ) {
    --remaining;
    out(buf[remaining], buffer, idx, maxlen);
    ++idx;
  }

  // left alignment: fill up behind the text until 'width' characters are out
  if (left_align) {
    for (; idx - first < width; ++idx) {
      out(' ', buffer, idx, maxlen);
    }
  }

  return idx;
}
// internal itoa format
// Takes the 'len' characters already stored in 'buf' (the digits of a number in
// reverse order, least significant first), appends precision / zero padding,
// the base prefix and the sign - all in reverse as well - and prints the result
// through _out_rev().
// Every append is guarded by PRINTF_NTOA_BUFFER_SIZE, so characters that do not
// fit into the buffer are dropped rather than written.
// \return the output index after the last character written
static size_t _ntoa_format(out_fct_type out, char* buffer, size_t idx, size_t maxlen, char* buf, size_t len, bool negative, unsigned int base, unsigned int prec, unsigned int width, unsigned int flags)
{
// pad leading zeros
if (!(flags & FLAGS_LEFT)) {
// with zero padding, one column of the width is kept free for the sign character
if (width && (flags & FLAGS_ZEROPAD) && (negative || (flags & (FLAGS_PLUS | FLAGS_SPACE)))) {
width--;
}
// precision: at least 'prec' digits
while ((len < prec) && (len < PRINTF_NTOA_BUFFER_SIZE)) {
buf[len++] = '0';
}
// zero padding: fill up to the (possibly reduced) width
while ((flags & FLAGS_ZEROPAD) && (len < width) && (len < PRINTF_NTOA_BUFFER_SIZE)) {
buf[len++] = '0';
}
}
// handle hash
if (flags & FLAGS_HASH) {
// when len has reached exactly prec or width, drop the last character written
// so far (two for base 16) to make room for the prefix
if (!(flags & FLAGS_PRECISION) && len && ((len == prec) || (len == width))) {
len--;
if (len && (base == 16U)) {
len--;
}
}
// the prefix is appended reversed: first the letter, then the '0'
if ((base == 16U) && !(flags & FLAGS_UPPERCASE) && (len < PRINTF_NTOA_BUFFER_SIZE)) {
buf[len++] = 'x';
}
else if ((base == 16U) && (flags & FLAGS_UPPERCASE) && (len < PRINTF_NTOA_BUFFER_SIZE)) {
buf[len++] = 'X';
}
else if ((base == 2U) && (len < PRINTF_NTOA_BUFFER_SIZE)) {
buf[len++] = 'b';
}
if (len < PRINTF_NTOA_BUFFER_SIZE) {
buf[len++] = '0';
}
}
// sign goes last in the buffer, i.e. first in the output
if (len < PRINTF_NTOA_BUFFER_SIZE) {
if (negative) {
buf[len++] = '-';
}
else if (flags & FLAGS_PLUS) {
buf[len++] = '+'; // ignore the space if the '+' exists
}
else if (flags & FLAGS_SPACE) {
buf[len++] = ' ';
}
}
return _out_rev(out, buffer, idx, maxlen, buf, len, width, flags);
}
// internal itoa for 'long' type
// Converts the magnitude 'value' to digits of the given base (stored least
// significant digit first) and lets _ntoa_format() add padding, prefix and sign.
// \return the output index after the last character written
static size_t _ntoa_long(out_fct_type out, char* buffer, size_t idx, size_t maxlen, unsigned long value, bool negative, unsigned long base, unsigned int prec, unsigned int width, unsigned int flags)
{
  char digits[PRINTF_NTOA_BUFFER_SIZE];
  size_t count = 0U;
  // first letter used for digit values of 10 and above
  const char alpha = (flags & FLAGS_UPPERCASE) ? 'A' : 'a';

  // a zero value never gets a base prefix
  if (value == 0UL) {
    flags &= ~FLAGS_HASH;
  }

  // an explicit precision together with a zero value produces no digit at all
  if (value || !(flags & FLAGS_PRECISION)) {
    do {
      const char d = (char)(value % base);
      value /= base;
      digits[count++] = (d < 10) ? (char)('0' + d) : (char)(alpha + d - 10);
    } while (value && (count < PRINTF_NTOA_BUFFER_SIZE));
  }

  return _ntoa_format(out, buffer, idx, maxlen, digits, count, negative, (unsigned int)base, prec, width, flags);
}
// internal itoa for 'long long' type
#if defined(PRINTF_SUPPORT_LONG_LONG)
// Same conversion as _ntoa_long(), for 'unsigned long long' magnitudes: digits
// are stored least significant first and _ntoa_format() adds padding, prefix
// and sign.
// \return the output index after the last character written
static size_t _ntoa_long_long(out_fct_type out, char* buffer, size_t idx, size_t maxlen, unsigned long long value, bool negative, unsigned long long base, unsigned int prec, unsigned int width, unsigned int flags)
{
  char digits[PRINTF_NTOA_BUFFER_SIZE];
  size_t count = 0U;
  // first letter used for digit values of 10 and above
  const char alpha = (flags & FLAGS_UPPERCASE) ? 'A' : 'a';

  // a zero value never gets a base prefix
  if (value == 0ULL) {
    flags &= ~FLAGS_HASH;
  }

  // an explicit precision together with a zero value produces no digit at all
  if (value || !(flags & FLAGS_PRECISION)) {
    do {
      const char d = (char)(value % base);
      value /= base;
      digits[count++] = (d < 10) ? (char)('0' + d) : (char)(alpha + d - 10);
    } while (value && (count < PRINTF_NTOA_BUFFER_SIZE));
  }

  return _ntoa_format(out, buffer, idx, maxlen, digits, count, negative, (unsigned int)base, prec, width, flags);
}
#endif // PRINTF_SUPPORT_LONG_LONG
#if defined(PRINTF_SUPPORT_FLOAT)
#if defined(PRINTF_SUPPORT_EXPONENTIAL)
// forward declaration so that _ftoa can switch to exp notation for values > PRINTF_MAX_FLOAT
static size_t _etoa(out_fct_type out, char* buffer, size_t idx, size_t maxlen, double value, unsigned int prec, unsigned int width, unsigned int flags);
#endif
// internal ftoa for fixed decimal floating point
// Formats 'value' in "%f" style through 'out'. The characters are assembled in
// reverse order in a local buffer and printed by _out_rev().
//   prec  - digits after the decimal point; used only when FLAGS_PRECISION is
//           set, otherwise PRINTF_DEFAULT_FLOAT_PRECISION applies. Only 9
//           digits are computed, any requested beyond that are emitted as '0'.
//   width - minimum field width, applied together with FLAGS_LEFT / FLAGS_ZEROPAD
// NaN and the infinities print as "nan", "inf" / "+inf" and "-inf". Magnitudes
// above PRINTF_MAX_FLOAT are handed to _etoa(), or produce no output (return 0)
// when exponential support is compiled out.
// \return the output index after the last character written
static size_t _ftoa(out_fct_type out, char* buffer, size_t idx, size_t maxlen, double value, unsigned int prec, unsigned int width, unsigned int flags)
{
  char buf[PRINTF_FTOA_BUFFER_SIZE];
  size_t len = 0U;
  double diff = 0.0;

  // powers of 10
  static const double pow10[] = { 1, 10, 100, 1000, 10000, 100000, 1000000, 10000000, 100000000, 1000000000 };

  // test for special values (the literals are reversed because _out_rev prints backwards)
  if (value != value)
    return _out_rev(out, buffer, idx, maxlen, "nan", 3, width, flags);
  if (value < -DBL_MAX)
    return _out_rev(out, buffer, idx, maxlen, "fni-", 4, width, flags);
  if (value > DBL_MAX)
    return _out_rev(out, buffer, idx, maxlen, (flags & FLAGS_PLUS) ? "fni+" : "fni", (flags & FLAGS_PLUS) ? 4U : 3U, width, flags);

  // test for very large values
  // standard printf behavior is to print EVERY whole number digit -- which could be 100s of characters overflowing your buffers == bad
  if ((value > PRINTF_MAX_FLOAT) || (value < -PRINTF_MAX_FLOAT)) {
#if defined(PRINTF_SUPPORT_EXPONENTIAL)
    return _etoa(out, buffer, idx, maxlen, value, prec, width, flags);
#else
    return 0U;
#endif
  }

  // test for negative
  bool negative = false;
  if (value < 0) {
    negative = true;
    value = 0 - value;
  }

  // set default precision, if not set explicitly
  if (!(flags & FLAGS_PRECISION)) {
    prec = PRINTF_DEFAULT_FLOAT_PRECISION;
  }
  // limit precision to 9, because a prec >= 10 can lead to overflow errors
  while ((len < PRINTF_FTOA_BUFFER_SIZE) && (prec > 9U)) {
    buf[len++] = '0';
    prec--;
  }

  int whole = (int)value;
  double tmp = (value - whole) * pow10[prec];
  unsigned long frac = (unsigned long)tmp;
  diff = tmp - frac;

  if (diff > 0.5) {
    ++frac;
    // handle rollover, e.g. case 0.99 with prec 1 is 1.0
    if (frac >= pow10[prec]) {
      frac = 0;
      ++whole;
    }
  }
  else if (diff < 0.5) {
    // round down: frac already holds the result
  }
  else if ((frac == 0U) || (frac & 1U)) {
    // if halfway, round up if odd OR if last digit is 0
    ++frac;
    // the tie case can carry out of the fraction too (e.g. 0.95 with prec 1,
    // where tmp evaluates to exactly 9.5): without this check the fraction
    // would need prec + 1 digits and the digit counter below would wrap.
    // prec == 0 is excluded because its rounding is decided below from
    // 'whole' alone and frac is not printed at all.
    if ((prec > 0U) && (frac >= pow10[prec])) {
      frac = 0;
      ++whole;
    }
  }

  if (prec == 0U) {
    diff = value - (double)whole;
    if ((!(diff < 0.5) || (diff > 0.5)) && (whole & 1)) {
      // exactly 0.5 and ODD, then round up
      // 1.5 -> 2, but 2.5 -> 2
      ++whole;
    }
  }
  else {
    unsigned int count = prec;
    // now do fractional part, as an unsigned number
    while (len < PRINTF_FTOA_BUFFER_SIZE) {
      --count;
      buf[len++] = (char)(48U + (frac % 10U));
      if (!(frac /= 10U)) {
        break;
      }
    }
    // add extra 0s
    while ((len < PRINTF_FTOA_BUFFER_SIZE) && (count-- > 0U)) {
      buf[len++] = '0';
    }
    if (len < PRINTF_FTOA_BUFFER_SIZE) {
      // add decimal
      buf[len++] = '.';
    }
  }

  // do whole part, number is reversed
  while (len < PRINTF_FTOA_BUFFER_SIZE) {
    buf[len++] = (char)(48 + (whole % 10));
    if (!(whole /= 10)) {
      break;
    }
  }

  // pad leading zeros
  if (!(flags & FLAGS_LEFT) && (flags & FLAGS_ZEROPAD)) {
    // keep one column of the width free for the sign character
    if (width && (negative || (flags & (FLAGS_PLUS | FLAGS_SPACE)))) {
      width--;
    }
    while ((len < width) && (len < PRINTF_FTOA_BUFFER_SIZE)) {
      buf[len++] = '0';
    }
  }

  // sign goes last in the buffer, i.e. first in the output
  if (len < PRINTF_FTOA_BUFFER_SIZE) {
    if (negative) {
      buf[len++] = '-';
    }
    else if (flags & FLAGS_PLUS) {
      buf[len++] = '+';  // ignore the space if the '+' exists
    }
    else if (flags & FLAGS_SPACE) {
      buf[len++] = ' ';
    }
  }

  return _out_rev(out, buffer, idx, maxlen, buf, len, width, flags);
}
#if defined(PRINTF_SUPPORT_EXPONENTIAL)
// internal ftoa variant for exponential floating-point type, contributed by Martijn Jasperse <m.jasperse@gmail.com>
// Formats 'value' as mantissa plus "e+XX" / "e-XX" exponent through 'out'. With
// FLAGS_ADAPT_EXP ("%g" mode) 'prec' counts significant figures and values in
// [1e-4, 1e6) - as well as zero - are printed in fixed notation instead.
// NaN and the infinities are delegated to _ftoa().
// \return the output index after the last character written
static size_t _etoa(out_fct_type out, char* buffer, size_t idx, size_t maxlen, double value, unsigned int prec, unsigned int width, unsigned int flags)
{
  // check for NaN and special values
  if ((value != value) || (value > DBL_MAX) || (value < -DBL_MAX)) {
    return _ftoa(out, buffer, idx, maxlen, value, prec, width, flags);
  }

  // determine the sign
  const bool negative = value < 0;
  if (negative) {
    value = -value;
  }

  // default precision
  if (!(flags & FLAGS_PRECISION)) {
    prec = PRINTF_DEFAULT_FLOAT_PRECISION;
  }

  // zero has no usable binary exponent: feeding it to the log10 estimate below
  // yields a decimal exponent of -308. Its exponent is 0 by definition and no
  // rescaling is needed, so the estimate is skipped for it.
  const bool is_zero = (value == 0.0);

  union {
    uint64_t U;
    double   F;
  } conv;
  int expval = 0;
  conv.F = 1.0;

  if (!is_zero) {
    // determine the decimal exponent
    // based on the algorithm by David Gay (https://www.ampl.com/netlib/fp/dtoa.c)
    conv.F = value;
    int exp2 = (int)((conv.U >> 52U) & 0x07FFU) - 1023;           // effectively log2
    conv.U = (conv.U & ((1ULL << 52U) - 1U)) | (1023ULL << 52U);  // drop the exponent so conv.F is now in [1,2)
    // now approximate log10 from the log2 integer part and an expansion of ln around 1.5
    expval = (int)(0.1760912590558 + exp2 * 0.301029995663981 + (conv.F - 1.5) * 0.289529654602168);
    // now we want to compute 10^expval but we want to be sure it won't overflow
    exp2 = (int)(expval * 3.321928094887362 + 0.5);
    const double z  = expval * 2.302585092994046 - exp2 * 0.6931471805599453;
    const double z2 = z * z;
    conv.U = (uint64_t)(exp2 + 1023) << 52U;
    // compute exp(z) using continued fractions, see https://en.wikipedia.org/wiki/Exponential_function#Continued_fractions_for_ex
    conv.F *= 1 + 2 * z / (2 - z + (z2 / (6 + (z2 / (10 + z2 / 14)))));
    // correct for rounding errors
    if (value < conv.F) {
      expval--;
      conv.F /= 10;
    }
  }

  // the exponent format is "%+03d" and largest value is "307", so set aside 4-5 characters
  unsigned int minwidth = ((expval < 100) && (expval > -100)) ? 4U : 5U;

  // in "%g" mode, "prec" is the number of *significant figures* not decimals
  if (flags & FLAGS_ADAPT_EXP) {
    // do we want to fall-back to "%f" mode? (zero has exponent 0, so it qualifies)
    if (is_zero || ((value >= 1e-4) && (value < 1e6))) {
      if ((int)prec > expval) {
        prec = (unsigned)((int)prec - expval - 1);
      }
      else {
        prec = 0;
      }
      flags |= FLAGS_PRECISION;   // make sure _ftoa respects precision
      // no characters in exponent
      minwidth = 0U;
      expval   = 0;
    }
    else {
      // we use one sigfig for the whole part
      if ((prec > 0) && (flags & FLAGS_PRECISION)) {
        --prec;
      }
    }
  }

  // will everything fit?
  unsigned int fwidth = width;
  if (width > minwidth) {
    // we didn't fall-back so subtract the characters required for the exponent
    fwidth -= minwidth;
  } else {
    // not enough characters, so go back to default sizing
    fwidth = 0U;
  }
  if ((flags & FLAGS_LEFT) && minwidth) {
    // if we're padding on the right, DON'T pad the floating part
    fwidth = 0U;
  }

  // rescale the float value
  if (expval) {
    value /= conv.F;
  }

  // output the floating part
  const size_t start_idx = idx;
  idx = _ftoa(out, buffer, idx, maxlen, negative ? -value : value, prec, fwidth, flags & ~FLAGS_ADAPT_EXP);

  // output the exponent part
  if (minwidth) {
    // output the exponential symbol
    out((flags & FLAGS_UPPERCASE) ? 'E' : 'e', buffer, idx++, maxlen);
    // output the exponent value
    idx = _ntoa_long(out, buffer, idx, maxlen, (expval < 0) ? -expval : expval, expval < 0, 10, 0, minwidth-1, FLAGS_ZEROPAD | FLAGS_PLUS);
    // might need to right-pad spaces
    if (flags & FLAGS_LEFT) {
      while (idx - start_idx < width) out(' ', buffer, idx++, maxlen);
    }
  }
  return idx;
}
#endif // PRINTF_SUPPORT_EXPONENTIAL
#endif // PRINTF_SUPPORT_FLOAT
// internal vsnprintf
//
// Core formatter shared by all tiny_* entry points. Walks `format`, copies literal characters
// through `out` and expands every "%[flags][width][.precision][length]specifier" sequence using
// the next argument(s) from `va`.
// \param out     output function called as out(character, buffer, index, maxlen)
// \param buffer  destination handed through to `out`; when NULL, `out` is replaced by _out_null
// \param maxlen  size limit handed through to `out`
// \param format  format string
// \param va      argument list consumed while formatting
// \return        number of characters produced, not counting the terminating \0
static int _vsnprintf(out_fct_type out, char* buffer, const size_t maxlen, const char* format, va_list va) {
unsigned int flags, width, precision, n;
size_t idx = 0U;
if (!buffer) {
// use null output function
out = _out_null;
}
while (*format)
{
// format specifier? %[flags][width][.precision][length]
if (*format != '%') {
// no
out(*format, buffer, idx++, maxlen);
format++;
continue;
}
else {
// yes, evaluate it
format++;
}
// evaluate flags
// (n is used as a "saw a flag character" marker so the loop consumes any number of flags)
flags = 0U;
do {
switch (*format) {
case '0': flags |= FLAGS_ZEROPAD; format++; n = 1U; break;
case '-': flags |= FLAGS_LEFT; format++; n = 1U; break;
case '+': flags |= FLAGS_PLUS; format++; n = 1U; break;
case ' ': flags |= FLAGS_SPACE; format++; n = 1U; break;
case '#': flags |= FLAGS_HASH; format++; n = 1U; break;
default : n = 0U; break;
}
} while (n);
// evaluate width field
width = 0U;
if (_is_digit(*format)) {
width = _atoi(&format);
}
else if (*format == '*') {
// width supplied as an int argument; a negative value means left-justify
const int w = va_arg(va, int);
if (w < 0) {
flags |= FLAGS_LEFT; // reverse padding
width = (unsigned int)-w;
}
else {
width = (unsigned int)w;
}
format++;
}
// evaluate precision field
precision = 0U;
if (*format == '.') {
flags |= FLAGS_PRECISION;
format++;
if (_is_digit(*format)) {
precision = _atoi(&format);
}
else if (*format == '*') {
// precision supplied as an int argument; negative values are clamped to 0
const int prec = (int)va_arg(va, int);
precision = prec > 0 ? (unsigned int)prec : 0U;
format++;
}
}
// evaluate length field
switch (*format) {
case 'l' :
flags |= FLAGS_LONG;
format++;
if (*format == 'l') {
flags |= FLAGS_LONG_LONG;
format++;
}
break;
case 'h' :
flags |= FLAGS_SHORT;
format++;
if (*format == 'h') {
flags |= FLAGS_CHAR;
format++;
}
break;
#if defined(PRINTF_SUPPORT_PTRDIFF_T)
case 't' :
flags |= (sizeof(ptrdiff_t) == sizeof(long) ? FLAGS_LONG : FLAGS_LONG_LONG);
format++;
break;
#endif
case 'j' :
flags |= (sizeof(intmax_t) == sizeof(long) ? FLAGS_LONG : FLAGS_LONG_LONG);
format++;
break;
case 'z' :
flags |= (sizeof(size_t) == sizeof(long) ? FLAGS_LONG : FLAGS_LONG_LONG);
format++;
break;
default :
break;
}
// evaluate specifier
switch (*format) {
case 'd' :
case 'i' :
case 'u' :
case 'x' :
case 'X' :
case 'o' :
case 'b' : {
// set the base
unsigned int base;
if (*format == 'x' || *format == 'X') {
base = 16U;
}
else if (*format == 'o') {
base = 8U;
}
else if (*format == 'b') {
base = 2U;
}
else {
base = 10U;
flags &= ~FLAGS_HASH; // no hash for dec format
}
// uppercase
if (*format == 'X') {
flags |= FLAGS_UPPERCASE;
}
// no plus or space flag for u, x, X, o, b
if ((*format != 'i') && (*format != 'd')) {
flags &= ~(FLAGS_PLUS | FLAGS_SPACE);
}
// ignore '0' flag when precision is given
if (flags & FLAGS_PRECISION) {
flags &= ~FLAGS_ZEROPAD;
}
// convert the integer
if ((*format == 'i') || (*format == 'd')) {
// signed
// NOTE(review): "0 - value" is evaluated in the signed type, so the most negative value
// overflows before the cast to unsigned -- confirm whether this matters for the targets used.
if (flags & FLAGS_LONG_LONG) {
#if defined(PRINTF_SUPPORT_LONG_LONG)
const long long value = va_arg(va, long long);
idx = _ntoa_long_long(out, buffer, idx, maxlen, (unsigned long long)(value > 0 ? value : 0 - value), value < 0, base, precision, width, flags);
#endif
}
else if (flags & FLAGS_LONG) {
const long value = va_arg(va, long);
idx = _ntoa_long(out, buffer, idx, maxlen, (unsigned long)(value > 0 ? value : 0 - value), value < 0, base, precision, width, flags);
}
else {
// char and short arguments arrive promoted to int and are narrowed back here
const int value = (flags & FLAGS_CHAR) ? (char)va_arg(va, int) : (flags & FLAGS_SHORT) ? (short int)va_arg(va, int) : va_arg(va, int);
idx = _ntoa_long(out, buffer, idx, maxlen, (unsigned int)(value > 0 ? value : 0 - value), value < 0, base, precision, width, flags);
}
}
else {
// unsigned
if (flags & FLAGS_LONG_LONG) {
#if defined(PRINTF_SUPPORT_LONG_LONG)
idx = _ntoa_long_long(out, buffer, idx, maxlen, va_arg(va, unsigned long long), false, base, precision, width, flags);
#endif
}
else if (flags & FLAGS_LONG) {
idx = _ntoa_long(out, buffer, idx, maxlen, va_arg(va, unsigned long), false, base, precision, width, flags);
}
else {
const unsigned int value = (flags & FLAGS_CHAR) ? (unsigned char)va_arg(va, unsigned int) : (flags & FLAGS_SHORT) ? (unsigned short int)va_arg(va, unsigned int) : va_arg(va, unsigned int);
idx = _ntoa_long(out, buffer, idx, maxlen, value, false, base, precision, width, flags);
}
}
format++;
break;
}
#if defined(PRINTF_SUPPORT_FLOAT)
case 'f' :
case 'F' :
if (*format == 'F') flags |= FLAGS_UPPERCASE;
idx = _ftoa(out, buffer, idx, maxlen, va_arg(va, double), precision, width, flags);
format++;
break;
#if defined(PRINTF_SUPPORT_EXPONENTIAL)
case 'e':
case 'E':
case 'g':
case 'G':
if ((*format == 'g')||(*format == 'G')) flags |= FLAGS_ADAPT_EXP;
if ((*format == 'E')||(*format == 'G')) flags |= FLAGS_UPPERCASE;
idx = _etoa(out, buffer, idx, maxlen, va_arg(va, double), precision, width, flags);
format++;
break;
#endif // PRINTF_SUPPORT_EXPONENTIAL
#endif // PRINTF_SUPPORT_FLOAT
case 'c' : {
// l counts the columns used so far; it starts at 1 for the character itself
unsigned int l = 1U;
// pre padding
if (!(flags & FLAGS_LEFT)) {
while (l++ < width) {
out(' ', buffer, idx++, maxlen);
}
}
// char output
out((char)va_arg(va, int), buffer, idx++, maxlen);
// post padding
if (flags & FLAGS_LEFT) {
while (l++ < width) {
out(' ', buffer, idx++, maxlen);
}
}
format++;
break;
}
case 's' : {
const char* p = va_arg(va, char*);
// string length, capped at the precision when a non-zero precision was given
unsigned int l = _strnlen_s(p, precision ? precision : (size_t)-1);
// pre padding
if (flags & FLAGS_PRECISION) {
l = (l < precision ? l : precision);
}
if (!(flags & FLAGS_LEFT)) {
while (l++ < width) {
out(' ', buffer, idx++, maxlen);
}
}
// string output
while ((*p != 0) && (!(flags & FLAGS_PRECISION) || precision--)) {
out(*(p++), buffer, idx++, maxlen);
}
// post padding
if (flags & FLAGS_LEFT) {
while (l++ < width) {
out(' ', buffer, idx++, maxlen);
}
}
format++;
break;
}
case 'p' : {
// pointers are always printed as zero-padded uppercase hex, two digits per byte
width = sizeof(void*) * 2U;
flags |= FLAGS_ZEROPAD | FLAGS_UPPERCASE;
#if defined(PRINTF_SUPPORT_LONG_LONG)
const bool is_ll = sizeof(uintptr_t) == sizeof(long long);
if (is_ll) {
idx = _ntoa_long_long(out, buffer, idx, maxlen, (uintptr_t)va_arg(va, void*), false, 16U, precision, width, flags);
}
else {
#endif
idx = _ntoa_long(out, buffer, idx, maxlen, (unsigned long)((uintptr_t)va_arg(va, void*)), false, 16U, precision, width, flags);
#if defined(PRINTF_SUPPORT_LONG_LONG)
}
#endif
format++;
break;
}
case '%' :
out('%', buffer, idx++, maxlen);
format++;
break;
default :
// unknown specifier: emit the character itself
out(*format, buffer, idx++, maxlen);
format++;
break;
}
}
// termination
// (when the output was truncated the terminator is placed at the last position, maxlen - 1)
out((char)0, buffer, idx < maxlen ? idx : maxlen - 1U, maxlen);
// return written chars without terminating \0
return (int)idx;
}
// printf-style entry point: formats `format` with the trailing arguments and pushes every
// character through _out_char. Returns the value reported by _vsnprintf.
int tiny_printf(const char* format, ...) {
  // dummy non-NULL buffer: _vsnprintf switches to the null output function when given NULL
  char scratch[1];
  va_list args;
  va_start(args, format);
  const int produced = _vsnprintf(_out_char, scratch, (size_t)-1, format, args);
  va_end(args);
  return produced;
}
// sprintf-style entry point: formats into `buffer` through _out_buffer with no size limit
// ((size_t)-1), so the caller must provide a buffer large enough for the whole result.
int tiny_sprintf(char* buffer, const char* format, ...) {
  va_list args;
  va_start(args, format);
  const int produced = _vsnprintf(_out_buffer, buffer, (size_t)-1, format, args);
  va_end(args);
  return produced;
}
// snprintf-style entry point: formats into `buffer` through _out_buffer, passing `count`
// as the size limit. Returns the value reported by _vsnprintf.
int tiny_snprintf(char* buffer, size_t count, const char* format, ...) {
  va_list args;
  va_start(args, format);
  const int produced = _vsnprintf(_out_buffer, buffer, count, format, args);
  va_end(args);
  return produced;
}
// va_list flavour of tiny_printf: the caller owns `va` (no va_start/va_end here).
int tiny_vprintf(const char* format, va_list va) {
  // dummy non-NULL buffer: _vsnprintf switches to the null output function when given NULL
  char scratch[1];
  const int produced = _vsnprintf(_out_char, scratch, (size_t)-1, format, va);
  return produced;
}
// va_list flavour of tiny_snprintf: the caller owns `va` (no va_start/va_end here).
int tiny_vsnprintf(char* buffer, size_t count, const char* format, va_list va) {
  const int produced = _vsnprintf(_out_buffer, buffer, count, format, va);
  return produced;
}

86
lib/src/tinyprintf.h Normal file
View File

@@ -0,0 +1,86 @@
///////////////////////////////////////////////////////////////////////////////
// \author (c) Marco Paland (info@paland.com)
// 2014-2019, PALANDesign Hannover, Germany
//
// \license The MIT License (MIT)
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
//
// \brief Tiny printf, sprintf and snprintf implementation, optimized for speed on
// embedded systems with very limited resources.
// Use this instead of bloated standard/newlib printf.
// These routines are thread safe and reentrant.
//
///////////////////////////////////////////////////////////////////////////////
#ifndef __TINYPRINTF_H__
#define __TINYPRINTF_H__
#include <stdarg.h>
#include <stddef.h>
#ifdef __cplusplus
extern "C" {
#endif
/**
* Tiny printf implementation
* Output is emitted one character at a time through the `_out_char` routine of tinyprintf.c
* (NOTE(review): upstream required the user to implement _putchar; confirm what `_out_char` calls in this port).
* To avoid conflicts with the regular printf() API, every function of this library carries a `tiny_` prefix.
* \param format A string that specifies the format of the output
* \return The number of characters that are written, not counting the terminating null character
*/
int tiny_printf(const char* format, ...);
/**
* Tiny sprintf implementation
* Due to security reasons (buffer overflow) YOU SHOULD CONSIDER USING (V)SNPRINTF INSTEAD!
* \param buffer A pointer to the buffer where to store the formatted string. MUST be big enough to store the output!
* \param format A string that specifies the format of the output
* \return The number of characters that are WRITTEN into the buffer, not counting the terminating null character
*/
int tiny_sprintf(char* buffer, const char* format, ...);
/**
* Tiny snprintf/vsnprintf implementation
* \param buffer A pointer to the buffer where to store the formatted string
* \param count The maximum number of characters to store in the buffer, including a terminating null character
* \param format A string that specifies the format of the output
* \param va A value identifying a variable arguments list
* \return The number of characters that COULD have been written into the buffer, not counting the terminating
* null character. A value equal or larger than count indicates truncation. Only when the returned value
* is non-negative and less than count, the string has been completely written.
*/
int tiny_snprintf(char* buffer, size_t count, const char* format, ...);
int tiny_vsnprintf(char* buffer, size_t count, const char* format, va_list va);
/**
* Tiny vprintf implementation
* \param format A string that specifies the format of the output
* \param va A value identifying a variable arguments list
* \return The number of characters that are WRITTEN into the buffer, not counting the terminating null character
*/
int tiny_vprintf(const char* format, va_list va);
#ifdef __cplusplus
}
#endif
#endif // __TINYPRINTF_H__

49
lib/src/vx_perf.c Normal file
View File

@@ -0,0 +1,49 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <VX_config.h>
#include <VX_types.h>
#include <vx_intrinsics.h>
#include <stdint.h>
// DUMP_CSR_4(d, s): copies the four consecutive CSRs s..s+3 (read with csr_read) into
// csr_mem[d]..csr_mem[d+3]. A variable named `csr_mem` must be in scope where the macro is used.
// The expansion is four separate statements (not wrapped in do { } while (0)), so it must not be
// used as the body of an unbraced if/else/loop.
// NOTE(review): presumably `s` has to be a compile-time constant for csr_read -- confirm in vx_intrinsics.h.
#define DUMP_CSR_4(d, s) \
csr_mem[d + 0] = csr_read(s + 0); \
csr_mem[d + 1] = csr_read(s + 1); \
csr_mem[d + 2] = csr_read(s + 2); \
csr_mem[d + 3] = csr_read(s + 3);
// DUMP_CSR_32(d, s): eight DUMP_CSR_4 expansions covering the 32 consecutive CSRs s..s+31,
// stored to csr_mem[d]..csr_mem[d+31]. Invoked without a trailing semicolon.
#define DUMP_CSR_32(d, s) \
DUMP_CSR_4(d + 0, s + 0) \
DUMP_CSR_4(d + 4, s + 4) \
DUMP_CSR_4(d + 8, s + 8) \
DUMP_CSR_4(d + 12, s + 12) \
DUMP_CSR_4(d + 16, s + 16) \
DUMP_CSR_4(d + 20, s + 20) \
DUMP_CSR_4(d + 24, s + 24) \
DUMP_CSR_4(d + 28, s + 28)
#ifdef __cplusplus
extern "C" {
#endif
// Copies two banks of 32 CSRs into the calling core's slot of the memory-mapped area at
// IO_CSR_ADDR. Every core owns 64 uint32_t words: words 0..31 receive the CSRs starting at
// VX_CSR_MPM_BASE, words 32..63 the CSRs starting at VX_CSR_MPM_BASE_H.
void vx_perf_dump() {
  const int core_id = vx_core_id();
  // the DUMP_CSR_* macros store through a local that has to be called `csr_mem`
  uint32_t* const csr_mem = (uint32_t*)(IO_CSR_ADDR + 64 * sizeof(uint32_t) * core_id);
  DUMP_CSR_32(0, VX_CSR_MPM_BASE)
  DUMP_CSR_32(32, VX_CSR_MPM_BASE_H)
}
#ifdef __cplusplus
}
#endif

32
lib/src/vx_print.S Normal file
View File

@@ -0,0 +1,32 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <VX_config.h>
#include <VX_types.h>
// vx_putchar: stores the low byte of a0 to the console output window.
// The target address is IO_COUT_ADDR + (hart id & (IO_COUT_SIZE-1)), so each hart writes to its
// own byte of the window. Clobbers t0, t1 (and t2 when XLEN == 64).
// NOTE(review): the mask only selects a distinct in-range slot if IO_COUT_SIZE is a power of two -- confirm.
.type vx_putchar, @function
.global vx_putchar
vx_putchar:
// t0 <- hart id, reduced to an offset inside the output window
csrr t0, VX_CSR_MHARTID
andi t0, t0, %lo(IO_COUT_SIZE-1)
#if (XLEN == 64)
// build the 64-bit base address from its upper and lower 32-bit halves
li t1, (IO_COUT_ADDR >> 32)
slli t1, t1, 32
li t2, (IO_COUT_ADDR & 0xffffffff)
or t1, t1, t2
#else
li t1, IO_COUT_ADDR
#endif
// t0 <- base + offset, then write the character byte
add t0, t0, t1
sb a0, 0(t0)
ret

107
lib/src/vx_print.c Normal file
View File

@@ -0,0 +1,107 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <vx_print.h>
#include <vx_spawn.h>
#include <vx_intrinsics.h>
#include <stdlib.h>
#include <stdbool.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include "tinyprintf.h"
#ifdef __cplusplus
extern "C" {
#endif
// Argument bundle handed to __vprintf_cb through vx_serial.
typedef struct {
const char* format; // format string forwarded to tiny_vprintf
va_list* va; // pointer to the caller's argument list
int ret; // result of tiny_vprintf, written by the callback
} printf_arg_t;
// Argument bundle handed to __putint_cb through vx_serial.
typedef struct {
int value; // integer to print
int base; // numeric base passed to itoa
} putint_arg_t;
// Argument bundle handed to __putfloat_cb through vx_serial.
typedef struct {
float value; // number to print
int precision; // requested number of fractional digits; 0 prints the integer part only
} putfloat_arg_t;
// vx_serial callback: converts arg->value to text in base arg->base with itoa and writes the
// resulting characters with vx_putchar.
static void __putint_cb(const putint_arg_t* arg) {
  // 32 digits plus the terminator: worst case is a 32-bit value printed in base 2
  char tmp[33];
  // keep the value as an int: routing it through a float (as the code used to do) silently
  // rounds every integer whose magnitude exceeds 2^24
  int value = arg->value;
  int base = arg->base;
  itoa(value, tmp, base);
  for (int i = 0; i < 33; ++i) {
    int c = tmp[i];
    if (!c)
      break;
    vx_putchar(c);
  }
}
// vx_serial callback: prints arg->value as "[-]<integer part>[.<precision fractional digits>]".
// The fraction is truncated (not rounded) after `precision` digits; a precision <= 0 prints the
// integer part only.
//
// Fixes over the previous implementation:
//  - negative values printed the sign in the wrong place (e.g. -0.5 came out as "0.-5");
//  - leading zeros of the fraction were dropped (1.05 with precision 2 came out as "1.5");
//  - the fraction is emitted digit by digit with vx_putchar instead of re-entering vx_serial
//    through vx_putint from inside a serial callback.
static void __putfloat_cb(const putfloat_arg_t* arg) {
  float value = arg->value;
  int precision = arg->precision;
  // emit the sign once and continue with the magnitude
  if (value < 0.0f) {
    vx_putchar('-');
    value = -value;
  }
  int ipart = (int)value;
  vx_putint(ipart, 10);
  if (precision > 0) {
    vx_putchar('.');
    float frac = value - (float)ipart;
    for (int i = 0; i < precision; ++i) {
      frac *= 10.0f;
      int digit = (int)frac;
      // guard against rounding pushing the scaled fraction up to 10
      if (digit > 9)
        digit = 9;
      vx_putchar('0' + digit);
      frac -= (float)digit;
    }
  }
}
// vx_serial callback: runs tiny_vprintf on the bundled format/argument list and stores the
// result back into the bundle.
static void __vprintf_cb(printf_arg_t* arg) {
  const char* fmt = arg->format;
  va_list* args = arg->va;
  arg->ret = tiny_vprintf(fmt, *args);
}
// Prints `value` in the given `base`, running the actual output through vx_serial.
void vx_putint(int value, int base) {
  putint_arg_t request = { value, base };
  vx_serial((vx_serial_cb)__putint_cb, &request);
}
// Prints `value` with `precision` fractional digits, running the actual output through vx_serial.
void vx_putfloat(float value, int precision) {
  putfloat_arg_t request = { value, precision };
  vx_serial((vx_serial_cb)__putfloat_cb, &request);
}
// va_list printf: bundles the format and a pointer to `va`, lets vx_serial invoke
// __vprintf_cb on the bundle, and returns the result the callback stored.
int vx_vprintf(const char* format, va_list va) {
  printf_arg_t request;
  request.va = &va;
  request.format = format;
  vx_serial((vx_serial_cb)__vprintf_cb, &request);
  return request.ret;
}
// Variadic front end of vx_vprintf.
int vx_printf(const char * format, ...) {
  va_list args;
  va_start(args, format);
  const int produced = vx_vprintf(format, args);
  va_end(args);
  return produced;
}
#ifdef __cplusplus
}
#endif

77
lib/src/vx_serial.S Normal file
View File

@@ -0,0 +1,77 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <VX_config.h>
#include <VX_types.h>
#define RISCV_CUSTOM0 0x0B
// vx_serial(callback, arg): a0 = callback, a1 = arg.
// Loops index = 0 .. NT-1 (NT read from VX_CSR_NUM_THREADS); on each iteration only the thread
// whose VX_CSR_THREAD_ID equals index calls callback(arg), the others branch straight to the join.
// Saves and restores ra and s0-s5.
//
// The frame is 32 bytes (RV32) / 64 bytes (RV64): the seven saved registers need 28 / 56 bytes,
// rounded up so that sp stays 16-byte aligned across the call to the C callback, as the RISC-V
// calling convention requires. The previous 28 / 56 byte frames left sp misaligned.
.type vx_serial, @function
.global vx_serial
vx_serial:
#if (XLEN == 64)
addi sp, sp, -64
sd ra, 48(sp)
sd s5, 40(sp)
sd s4, 32(sp)
sd s3, 24(sp)
sd s2, 16(sp)
sd s1, 8(sp)
sd s0, 0(sp)
#else
addi sp, sp, -32
sw ra, 24(sp)
sw s5, 20(sp)
sw s4, 16(sp)
sw s3, 12(sp)
sw s2, 8(sp)
sw s1, 4(sp)
sw s0, 0(sp)
#endif
mv s4, a0 # s4 <- callback
mv s3, a1 # s3 <- arg
csrr s2, VX_CSR_NUM_THREADS # s2 <- NT
csrr s1, VX_CSR_THREAD_ID # s1 <- tid
li s0, 0 # s0 <- index
label_loop:
sub t0, s0, s1 # t0 <- index - tid
seqz t1, t0 # t1 <- (index == tid)
// NOTE(review): s5 receives the value the join below consumes; exact split/join semantics are
// defined by the Vortex ISA extension -- confirm there.
.insn r RISCV_CUSTOM0, 2, 0, s5, t1, x0 # split s5, t1
bnez t0, label_join # threads with index != tid skip the call
mv a0, s3 # a0 <- arg
jalr s4 # callback(arg)
label_join:
.insn r RISCV_CUSTOM0, 3, 0, x0, s5, x0 # join s5
addi s0, s0, 1 # index++
blt s0, s2, label_loop # loop back
#if (XLEN == 64)
ld ra, 48(sp)
ld s5, 40(sp)
ld s4, 32(sp)
ld s3, 24(sp)
ld s2, 16(sp)
ld s1, 8(sp)
ld s0, 0(sp)
addi sp, sp, 64
#else
lw ra, 24(sp)
lw s5, 20(sp)
lw s4, 16(sp)
lw s3, 12(sp)
lw s2, 8(sp)
lw s1, 4(sp)
lw s0, 0(sp)
addi sp, sp, 32
#endif
ret

597
lib/src/vx_spawn.c Normal file
View File

@@ -0,0 +1,597 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <vx_spawn.h>
#include <vx_intrinsics.h>
#include <inttypes.h>
#ifdef __cplusplus
extern "C" {
#endif
#define NUM_CORES_MAX 1024
#ifndef MIN
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#endif
// Per-core work description for the vx_spawn_tasks* family. The launching thread fills it in and
// publishes its address in g_wspawn_args[core_id]; the *_stub routines read it back.
typedef struct {
vx_spawn_tasks_cb callback; // user function invoked as callback(task_id, arg)
void* arg; // opaque pointer forwarded to the callback
int offset; // task offset
int NWs; // number of NW batches where NW=<total warps per core>.
int RWs; // number of remaining warps in the core
} wspawn_tasks_args_t;
// Per-core work description for vx_spawn_kernel, read back by the spawn_kernel_*_stub routines
// through g_wspawn_args[core_id].
typedef struct {
context_t * ctx; // kernel context; the stubs read ctx->num_groups[0] and [1] from it
vx_spawn_kernel_cb callback; // user function invoked as callback(arg, ctx, i, j, k)
void* arg; // opaque pointer forwarded to the callback
int offset; // task offset
int NWs; // number of NW batches where NW=<total warps per core>.
int RWs; // number of remaining warps in the core
char isXYpow2; // non-zero selects the shift-based index decomposition in the stubs
char log2XY; // shift amount used in place of dividing by X*Y
char log2X; // shift amount used in place of dividing by X
} wspawn_kernel_args_t;
// One slot per core, indexed by vx_core_id(): holds the address of that core's current
// wspawn_*_args_t. The spawn functions store a pointer to a stack-local structure here, so a
// slot is only meaningful while the spawn call that set it is still running.
void* g_wspawn_args[NUM_CORES_MAX];
// Returns non-zero when at most one bit of x is set, i.e. x is a power of two (x == 0 also
// yields non-zero).
// Declared `static inline`: a plain C99 `inline` definition provides no external definition, so
// any call the compiler chose not to inline (e.g. at -O0) would fail to link.
static inline char is_log2(int x) {
  return ((x & (x-1)) == 0);
}
// Returns floor(log2(x)) for x > 0, computed from the count of leading zero bits.
// The result is undefined for x == 0 (__builtin_clz(0) is undefined).
// Declared `static inline`: a plain C99 `inline` definition provides no external definition, so
// any call the compiler chose not to inline (e.g. at -O0) would fail to link.
static inline int log2_fast(int x) {
  return 31 - __builtin_clz (x);
}
// Runs this thread's share of the full-warp tasks published in g_wspawn_args for the current
// core. Every thread handles a contiguous run of task ids; warps with an id below RWs take one
// task per thread more than the others.
static void __attribute__ ((noinline)) spawn_tasks_all_stub() {
  const int threads_per_warp = vx_num_threads();
  const int core = vx_core_id();
  const int warp = vx_warp_id();
  const int thread = vx_thread_id();

  wspawn_tasks_args_t* args = (wspawn_tasks_args_t*)g_wspawn_args[core];

  // tasks-per-thread consumed by all warps ahead of this one
  const int batches_before = (args->NWs * warp) + MIN(args->RWs, warp);
  // tasks-per-thread for this warp
  const int my_batches = args->NWs + (warp < args->RWs);
  const int first = args->offset + (batches_before * threads_per_warp) + (thread * my_batches);

  vx_spawn_tasks_cb callback = args->callback;
  void* arg = args->arg;

  const int end = first + my_batches;
  int task_id = first;
  while (task_id < end) {
    callback(task_id, arg);
    ++task_id;
  }
}
// Runs this thread's share of the full-warp tasks so that, within one wave, neighbouring threads
// and warps receive neighbouring task ids. Successive waves are NT * NW task ids apart; warps
// with an id below RWs run one extra wave.
static void __attribute__ ((noinline)) spawn_tasks_contiguous_all_stub() {
  const int threads_per_warp = vx_num_threads();
  const int warps_per_core = vx_num_warps();
  const int core = vx_core_id();
  const int warp = vx_warp_id();
  const int thread = vx_thread_id();

  wspawn_tasks_args_t* args = (wspawn_tasks_args_t*)g_wspawn_args[core];

  const int wave_count = args->NWs + (warp < args->RWs);
  const int first = args->offset + (threads_per_warp * warp + thread);

  vx_spawn_tasks_cb callback = args->callback;
  void* arg = args->arg;

  int wave = 0;
  while (wave < wave_count) {
    const int task_id = first + (wave * threads_per_warp * warps_per_core);
    callback(task_id, arg);
    ++wave;
  }
}
// Cluster variant of the full-warp stub: warp ids are interleaved round-robin across the cores
// of a cluster, and successive waves are NT * NW * CORES_PER_CLUSTER task ids apart. Warps with
// an id below RWs run one extra wave.
static void __attribute__ ((noinline)) spawn_tasks_cluster_all_stub() {
  const int threads_per_warp = vx_num_threads();
  const int warps_per_core = vx_num_warps();
  const int core = vx_core_id();
  const int warp = vx_warp_id();
  const int thread = vx_thread_id();

  // position of this warp inside the cluster: warp-major, core-minor
  const int core_in_cluster = core % CORES_PER_CLUSTER;
  const int warp_in_cluster = CORES_PER_CLUSTER * warp + core_in_cluster;

  wspawn_tasks_args_t* args = (wspawn_tasks_args_t*)g_wspawn_args[core];

  const int wave_count = args->NWs + (warp < args->RWs);
  const int first = args->offset + (threads_per_warp * warp_in_cluster + thread);

  vx_spawn_tasks_cb callback = args->callback;
  void* arg = args->arg;

  // waves run one after another on the same thread
  int wave = 0;
  while (wave < wave_count) {
    const int task_id = first + (wave * threads_per_warp * warps_per_core * CORES_PER_CLUSTER);
    callback(task_id, arg);
    ++wave;
  }
}
// Runs one leftover task per active thread: task id = published offset + thread id.
static void __attribute__ ((noinline)) spawn_tasks_rem_stub() {
  const int core = vx_core_id();
  const int thread = vx_thread_id();

  wspawn_tasks_args_t* args = (wspawn_tasks_args_t*)g_wspawn_args[core];

  const int task_id = args->offset + thread;
  (args->callback)(task_id, args->arg);
}
// Cluster variant of the leftover stub: runs one task per active thread, with the task id derived
// from the published offset, the warp's round-robin position inside the cluster and the thread id.
static void __attribute__ ((noinline)) spawn_tasks_cluster_rem_stub() {
  const int threads_per_warp = vx_num_threads();
  const int core = vx_core_id();
  const int thread = vx_thread_id();
  const int warp = vx_warp_id();

  // position of this warp inside the cluster: warp-major, core-minor
  const int core_in_cluster = core % CORES_PER_CLUSTER;
  const int warp_in_cluster = CORES_PER_CLUSTER * warp + core_in_cluster;

  wspawn_tasks_args_t* args = (wspawn_tasks_args_t*)g_wspawn_args[core];

  // FIXME: This assumes that all cores but the last one are working with full
  // warps, and only the last core has a partially-filled warp.
  const int task_id = args->offset + (threads_per_warp * warp_in_cluster + thread);
  (args->callback)(task_id, args->arg);
}
// Entry point handed to vx_wspawn by vx_spawn_tasks_contiguous.
static void __attribute__ ((noinline)) spawn_tasks_contiguous_all_cb() {
  vx_tmc(-1);                        // thread mask: all bits set
  spawn_tasks_contiguous_all_stub(); // do this warp's share of the work
  vx_tmc_zero();                     // thread mask: cleared, the warp stops here
}
// Entry point handed to vx_wspawn by vx_spawn_tasks_cluster.
static void __attribute__ ((noinline)) spawn_tasks_cluster_all_cb() {
  vx_tmc(-1);                     // thread mask: all bits set
  spawn_tasks_cluster_all_stub(); // do this warp's share of the work
  vx_tmc_zero();                  // thread mask: cleared, the warp stops here
}
// Entry point handed to vx_wspawn by vx_spawn_tasks.
static void __attribute__ ((noinline)) spawn_tasks_all_cb() {
  vx_tmc(-1);             // thread mask: all bits set
  spawn_tasks_all_stub(); // do this warp's share of the work
  vx_tmc_zero();          // thread mask: cleared, the warp stops here
}
// This function runs in every core, but with only 1 warp and 1 thread enabled.
// The logic in this function figures out how many warps/threads this particular
// core has to enable to fulfill an entire grid of computation.
//
// Tasks are handed out cluster by cluster (CORES_PER_CLUSTER cores each): full clusters receive
// CORES_PER_CLUSTER * NW * NT tasks, the last used cluster receives the remainder, and within a
// cluster warps are distributed round-robin across its cores.
//
// \param num_tasks  total number of tasks; callback receives task ids 0 .. num_tasks-1
// \param callback   function invoked as callback(task_id, arg)
// \param arg        opaque pointer forwarded to the callback
//
// Fix: the task offset of a cluster is now cluster_id * <tasks of a FULL cluster>. It used to be
// cluster_id * <tasks of THIS cluster>, which placed the last, partially filled cluster at the
// wrong task ids (e.g. 64 threads per cluster and 96 tasks: cluster 1 started at 32, not 64).
void vx_spawn_tasks_cluster(int num_tasks, vx_spawn_tasks_cb callback, void *arg) {
  // device specs
  const int NW = vx_num_warps();
  const int NT = vx_num_threads();

  // current core id
  int core_id = vx_core_id();
  if (core_id >= NUM_CORES_MAX)
    return; // no slot in g_wspawn_args for this core

  const int cluster_id = core_id / CORES_PER_CLUSTER;
  const int core_id_in_cluster = core_id % CORES_PER_CLUSTER;

  // try to fill up full clusters first
  const int num_threads_in_cluster = CORES_PER_CLUSTER * NW * NT;
  const int num_used_clusters =
      (num_tasks + (num_threads_in_cluster - 1)) / num_threads_in_cluster;
  if (cluster_id >= num_used_clusters) {
    return; // terminate extra clusters
  }

  // fill up the last cluster with remaining tasks
  const int num_full_clusters = num_tasks / num_threads_in_cluster;
  int num_tasks_this_cluster = num_threads_in_cluster;
  if (cluster_id >= num_full_clusters) {
    num_tasks_this_cluster = num_tasks % num_threads_in_cluster;
  }

  // Distribute threads equally across as many cores as possible, even if they
  // don't fill up NW*NT in a single core. This makes sure the warps get evenly
  // distributed in a single cluster
  //
  // TODO: Try to contain in a single cluster if possible?
  const int num_active_cores = (num_tasks + (NT - 1)) / NT;
  if (core_id >= num_active_cores)
    return; // terminate extra cores

  const int num_full_warps_this_cluster = num_tasks_this_cluster / NT;
  const int rem_threads_in_last_warp = num_tasks_this_cluster % NT;

  int num_warps_this_core = num_full_warps_this_cluster / CORES_PER_CLUSTER;
  const int num_warps_in_last_row = num_full_warps_this_cluster % CORES_PER_CLUSTER;
  if (core_id_in_cluster < num_warps_in_last_row) {
    num_warps_this_core++;
  }

  // if 0, last warp is full-threads enabled
  int rem_threads_in_last_warp_this_core = 0;
  if (rem_threads_in_last_warp != 0) {
    if (core_id_in_cluster == num_warps_in_last_row - 1) {
      rem_threads_in_last_warp_this_core = rem_threads_in_last_warp;
    }
  }
  // not consumed yet, see the FIXME in the remainder handling below
  (void)rem_threads_in_last_warp_this_core;

  // sequential iterations
  const int num_full_waves = num_warps_this_core / NW;
  const int rem_full_warps_in_last_wave = num_warps_this_core % NW;

  // every cluster ahead of this one is full, so this cluster starts after cluster_id full clusters
  const int offset = cluster_id * num_threads_in_cluster;

  wspawn_tasks_args_t wspawn_args = {callback, arg, offset, num_full_waves,
                                     rem_full_warps_in_last_wave};
  g_wspawn_args[core_id] = &wspawn_args;

  if (num_warps_this_core > 0) {
    // execute callback on other warps
    const int nw = MIN(num_warps_this_core, NW);
    vx_wspawn(nw, spawn_tasks_cluster_all_cb);

    // activate all threads
    vx_tmc(-1);

    // call stub routine
    spawn_tasks_cluster_all_stub();

    // back to single-threaded
    vx_tmc_one();

    // wait for spawn warps to terminate
    vx_wspawn_wait();
  }

  // TODO: this is incomplete
  // TODO: Instead of launching an additional wave just to work on remaining
  // threads, handle this in the last wave amongst other full warps.
  if (rem_threads_in_last_warp != 0 && core_id_in_cluster == 0) {
    // adjust offset
    // FIXME: use rem_threads_in_last_warp_this_core
    wspawn_args.offset += (num_tasks_this_cluster - rem_threads_in_last_warp);

    // activate remaining threads
    const int tmask = (1 << rem_threads_in_last_warp) - 1;
    vx_tmc(tmask);

    // call stub routine
    spawn_tasks_cluster_rem_stub();

    // back to single-threaded
    vx_tmc_one();
  }
}
// Distributes num_tasks invocations of callback(task_id, arg) over cores, warps and threads,
// using the "contiguous" stub so that neighbouring threads receive neighbouring task ids.
// Called on every core; cores that are not needed return immediately.
void vx_spawn_tasks_contiguous(int num_tasks, vx_spawn_tasks_cb callback , void * arg) {
  // hardware configuration
  const int total_cores = vx_num_cores();
  const int warps_per_core = vx_num_warps();
  const int threads_per_warp = vx_num_threads();

  const int core_id = vx_core_id();
  if (core_id >= NUM_CORES_MAX) {
    return; // no slot in g_wspawn_args for this core
  }

  // one core per full (warps x threads) batch of tasks: at least one, at most all cores
  const int threads_per_core = warps_per_core * threads_per_warp;
  int wanted_cores = 1;
  if (num_tasks > threads_per_core) {
    wanted_cores = num_tasks / threads_per_core;
  }
  const int active_cores = MIN(wanted_cores, total_cores);
  if (core_id >= active_cores) {
    return; // this core is not needed
  }

  // even split; the last active core additionally takes whatever the division left over
  const int base_share = num_tasks / active_cores;
  int my_share = base_share;
  if (core_id == (active_cores - 1)) {
    my_share += num_tasks - (active_cores * base_share);
  }

  // break this core's share into fully occupied warps plus a tail of single threads
  const int full_warps = my_share / threads_per_warp;
  const int tail_threads = my_share - full_warps * threads_per_warp;
  int waves = 1;
  int extra_warps = 0;
  if (full_warps >= warps_per_core) {
    waves = full_warps / warps_per_core;
    extra_warps = full_warps - waves * warps_per_core;
  }

  wspawn_tasks_args_t wspawn_args = { callback, arg, core_id * base_share, waves, extra_warps };
  g_wspawn_args[core_id] = &wspawn_args;

  if (full_warps >= 1) {
    // start the other warps, then run this warp's part with every thread enabled
    const int spawned = MIN(full_warps, warps_per_core);
    vx_wspawn(spawned, spawn_tasks_contiguous_all_cb);
    vx_tmc(-1);
    spawn_tasks_contiguous_all_stub();
    // drop back to a single thread and wait for the spawned warps
    vx_tmc_one();
    vx_wspawn_wait();
  }

  if (tail_threads != 0) {
    // the tail starts right after the tasks covered by full warps
    wspawn_args.offset += (my_share - tail_threads);
    // enable exactly tail_threads threads
    const int mask = (1 << tail_threads) - 1;
    vx_tmc(mask);
    spawn_tasks_rem_stub();
    vx_tmc_one();
  }
}
// Distributes num_tasks invocations of callback(task_id, arg) over cores, warps and threads;
// every thread works through a contiguous run of task ids (see spawn_tasks_all_stub).
// Called on every core; cores that are not needed return immediately.
void vx_spawn_tasks(int num_tasks, vx_spawn_tasks_cb callback , void * arg) {
  // hardware configuration
  const int total_cores = vx_num_cores();
  const int warps_per_core = vx_num_warps();
  const int threads_per_warp = vx_num_threads();

  const int core_id = vx_core_id();
  if (core_id >= NUM_CORES_MAX) {
    return; // no slot in g_wspawn_args for this core
  }

  // one core per full (warps x threads) batch of tasks: at least one, at most all cores
  const int threads_per_core = warps_per_core * threads_per_warp;
  int wanted_cores = 1;
  if (num_tasks > threads_per_core) {
    wanted_cores = num_tasks / threads_per_core;
  }
  const int active_cores = MIN(wanted_cores, total_cores);
  if (core_id >= active_cores) {
    return; // this core is not needed
  }

  // even split; the last active core additionally takes whatever the division left over
  const int base_share = num_tasks / active_cores;
  int my_share = base_share;
  if (core_id == (active_cores - 1)) {
    my_share += num_tasks - (active_cores * base_share);
  }

  // break this core's share into fully occupied warps plus a tail of single threads
  const int full_warps = my_share / threads_per_warp;
  const int tail_threads = my_share - full_warps * threads_per_warp;
  int batches = 1;
  int extra_warps = 0;
  if (full_warps >= warps_per_core) {
    batches = full_warps / warps_per_core;
    extra_warps = full_warps - batches * warps_per_core;
  }

  wspawn_tasks_args_t wspawn_args = { callback, arg, core_id * base_share, batches, extra_warps };
  g_wspawn_args[core_id] = &wspawn_args;

  if (full_warps >= 1) {
    // start the other warps, then run this warp's part with every thread enabled
    const int spawned = MIN(full_warps, warps_per_core);
    vx_wspawn(spawned, spawn_tasks_all_cb);
    vx_tmc(-1);
    spawn_tasks_all_stub();
    // drop back to a single thread and wait for the spawned warps
    vx_tmc_one();
    vx_wspawn_wait();
  }

  if (tail_threads != 0) {
    // the tail starts right after the tasks covered by full warps
    wspawn_args.offset += (my_share - tail_threads);
    // enable exactly tail_threads threads
    const int mask = (1 << tail_threads) - 1;
    vx_tmc(mask);
    spawn_tasks_rem_stub();
    vx_tmc_one();
  }
}
///////////////////////////////////////////////////////////////////////////////
// Runs this thread's contiguous run of work-group ids for the kernel published in g_wspawn_args.
// Each linear id is split into (i, j, k) using X = num_groups[0] and Y = num_groups[1]:
// k = id / (X*Y), j = (id mod X*Y) / X, i = the rest. When isXYpow2 is set the two divisions are
// replaced by shifts with log2XY and log2X.
static void __attribute__ ((noinline)) spawn_kernel_all_stub() {
  const int threads_per_warp = vx_num_threads();
  const int core = vx_core_id();
  const int warp = vx_warp_id();
  const int thread = vx_thread_id();

  wspawn_kernel_args_t* args = (wspawn_kernel_args_t*)g_wspawn_args[core];

  // work-groups-per-thread consumed by the warps ahead of this one, and for this warp
  const int batches_before = (args->NWs * warp) + MIN(args->RWs, warp);
  const int my_batches = args->NWs + (warp < args->RWs);
  const int first = args->offset + (batches_before * threads_per_warp) + (thread * my_batches);
  const int end = first + my_batches;

  const int X = args->ctx->num_groups[0];
  const int Y = args->ctx->num_groups[1];
  const int XY = X * Y;

  const int use_shifts = args->isXYpow2;
  for (int wg_id = first; wg_id < end; ++wg_id) {
    int k;
    int j;
    int wg_2d;
    if (use_shifts) {
      k = wg_id >> args->log2XY;
      wg_2d = wg_id - k * XY;
      j = wg_2d >> args->log2X;
    } else {
      k = wg_id / XY;
      wg_2d = wg_id - k * XY;
      j = wg_2d / X;
    }
    const int i = wg_2d - j * X;
    (args->callback)(args->arg, args->ctx, i, j, k);
  }
}
// Per-thread worker for the leftover workgroups that do not fill a whole warp.
// Each calling thread handles exactly one workgroup id (offset + thread id),
// decomposes it into (i, j, k) and invokes the kernel callback once.
static void __attribute__ ((noinline)) spawn_kernel_rem_stub() {
  int core_idx = vx_core_id();
  int lane_idx = vx_thread_id();

  wspawn_kernel_args_t* args = (wspawn_kernel_args_t*)g_wspawn_args[core_idx];

  int id = args->offset + lane_idx;

  int dim_x = args->ctx->num_groups[0];
  int dim_y = args->ctx->num_groups[1];
  int plane = dim_x * dim_y;

  int k;
  int in_plane;
  int j;
  if (args->isXYpow2) {
    // fast path: shifts by the precomputed log2 values
    k = id >> args->log2XY;
    in_plane = id - k * plane;
    j = in_plane >> args->log2X;
  } else {
    // generic path: integer divisions
    k = id / plane;
    in_plane = id - k * plane;
    j = in_plane / dim_x;
  }
  int i = in_plane - j * dim_x;

  (args->callback)(args->arg, args->ctx, i, j, k);
}
// Entry routine passed to vx_wspawn() by vx_spawn_kernel for the warps it spawns.
// Sets the thread mask to all ones, runs the per-thread worker, then clears the
// thread mask. The three steps must stay in this order.
static void __attribute__ ((noinline)) spawn_kernel_all_cb() {
// activate all threads
vx_tmc(-1);
// call stub routine
spawn_kernel_all_stub();
// disable warp
vx_tmc_zero();
}
// Launches `callback` over the 3D grid of workgroups described by ctx->num_groups.
// The X*Y*Z workgroup ids are divided evenly among the participating cores (the
// last participating core also takes the division remainder). Within a core the
// ids are processed by spawn_kernel_all_stub (whole warps) and then by
// spawn_kernel_rem_stub (leftover threads).
//   ctx      - grid description; only num_groups[0..2] is read here
//   callback - invoked by the stubs as callback(arg, ctx, i, j, k)
//   arg      - opaque pointer forwarded to callback
// Returns early, without running anything, on cores that are not needed.
void vx_spawn_kernel(context_t * ctx, vx_spawn_kernel_cb callback, void * arg) {
// total number of WGs
int X = ctx->num_groups[0];
int Y = ctx->num_groups[1];
int Z = ctx->num_groups[2];
int XY = X * Y;
int num_tasks = XY * Z;
// device specs
int NC = vx_num_cores();
int NW = vx_num_warps();
int NT = vx_num_threads();
// current core id
int core_id = vx_core_id();
// g_wspawn_args is indexed by core id below; ids beyond NUM_CORES_MAX are rejected first
if (core_id >= NUM_CORES_MAX)
return;
// calculate necessary active cores
int WT = NW * NT; // threads available on one core
// one core per WT tasks (rounded down), but never fewer than one
int nC = (num_tasks > WT) ? (num_tasks / WT) : 1;
int nc = MIN(nC, NC);
if (core_id >= nc)
return; // terminate extra cores
// number of tasks per core
int tasks_per_core = num_tasks / nc;
int tasks_per_core_n1 = tasks_per_core; // tasks this particular core will run
if (core_id == (nc-1)) {
int rem = num_tasks - (nc * tasks_per_core);
tasks_per_core_n1 += rem; // last core also executes remaining WGs
}
// number of tasks per warp
int TW = tasks_per_core_n1 / NT; // occupied warps
int rT = tasks_per_core_n1 - TW * NT; // remaining threads
// defaults cover TW < NW: every spawned warp runs a single iteration
int fW = 1, rW = 0;
if (TW >= NW) {
fW = TW / NW; // full warps iterations
rW = TW - fW * NW; // remaining warps
}
// fast path handling
// the stubs use shifts instead of divisions when isXYpow2 is set
char isXYpow2 = is_log2(XY);
char log2XY = log2_fast(XY);
char log2X = log2_fast(X);
// the argument block is a local of this function; it is published through
// g_wspawn_args so the stubs can look it up by core id
wspawn_kernel_args_t wspawn_args = {
ctx, callback, arg, core_id * tasks_per_core, fW, rW, isXYpow2, log2XY, log2X
};
g_wspawn_args[core_id] = &wspawn_args;
if (TW >= 1) {
// execute callback on other warps
int nw = MIN(TW, NW);
vx_wspawn(nw, spawn_kernel_all_cb);
// activate all threads
vx_tmc(-1);
// call stub routine
// compiler-only barrier: no memory access may be reordered across this point
// NOTE(review): presumably guarantees the wspawn_args stores are emitted before the stub reads them - confirm
asm volatile("" ::: "memory");
spawn_kernel_all_stub();
// back to single-threaded
vx_tmc_one();
// wait for spawn warps to terminate
vx_wspawn_wait();
}
if (rT != 0) {
// adjust offset
// skip the ids already handled by the full-warp phase above
wspawn_args.offset += (tasks_per_core_n1 - rT);
// activate remaining threads
// mask with the low rT bits set
// NOTE(review): rT < NT; if NT can be 32 then rT == 31 makes 1 << rT overflow int - confirm the NT range
int tmask = (1 << rT) - 1;
vx_tmc(tmask);
// call stub routine
spawn_kernel_rem_stub();
// back to single-threaded
vx_tmc_one();
}
}
#ifdef __cplusplus
}
#endif

153
lib/src/vx_start.S Normal file
View File

@@ -0,0 +1,153 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <VX_config.h>
#include <VX_types.h>
#define RISCV_CUSTOM0 0x0B
# Program entry point, placed in the .init section.
# Steps, in order:
#   - run init_regs (gp/sp/tp setup) on the spawned warps and on this warp
#   - run __init_tls the same way
#   - zero the memory between _edata and _end with memset
#   - register __libc_fini_array with atexit, then run __libc_init_array
#   - call main and tail-call exit (a0 is not touched between the two, so exit
#     receives whatever main left in a0)
# NOTE(review): presumably the wspawn instruction starts the other warps at the
# address in t1 while this warp falls through - confirm against the ISA extension.
.section .init, "ax"
.global _start
.type _start, @function
_start:
# initialize per-thread registers
csrr t0, VX_CSR_NUM_WARPS # get num warps
la t1, init_regs_all # routine for the spawned warps
.insn r RISCV_CUSTOM0, 1, 0, x0, t0, t1 # wspawn t0, t1
li t0, -1 # all-ones thread mask
.insn r RISCV_CUSTOM0, 0, 0, x0, t0, x0 # tmc t0
jal init_regs
li t0, 1 # thread mask with only bit 0 set
.insn r RISCV_CUSTOM0, 0, 0, x0, t0, x0 # tmc t0
# wait for spawn warps to terminate
jal vx_wspawn_wait
# initialize TLS for all warps
csrr t0, VX_CSR_NUM_WARPS # get num warps
la t1, init_tls_all # routine for the spawned warps
.insn r RISCV_CUSTOM0, 1, 0, x0, t0, t1 # wspawn t0, t1
li t0, -1 # all-ones thread mask
.insn r RISCV_CUSTOM0, 0, 0, x0, t0, x0 # tmc t0
call __init_tls
li t0, 1 # thread mask with only bit 0 set
.insn r RISCV_CUSTOM0, 0, 0, x0, t0, x0 # tmc t0
# wait for spawn warps to terminate
jal vx_wspawn_wait
# clear BSS segment
la a0, _edata # memset destination
la a2, _end
sub a2, a2, a0 # memset length = _end - _edata
li a1, 0 # fill value
call memset
# initialize trap vector (currently disabled)
# la t0, trap_entry
# csrw mtvec, t0
# register global termination functions
la a0, __libc_fini_array
call atexit
# run global initialization functions
call __libc_init_array
# call main program routine
call main
# call exit routine
tail exit
.size _start, .-_start
# _exit: takes the exit status in a0.
# Keeps the status in s0 across the call to vx_perf_dump, copies it into gp,
# then issues "tmc x0". There is no ret or loop after that instruction.
# NOTE(review): presumably gp is where the host/simulator reads the exit code and
# tmc x0 stops the warp so execution never falls through - confirm.
.section .text
.type _exit, @function
.global _exit
_exit:
mv s0, a0 # save status; a0 may be overwritten by the call below
call vx_perf_dump
mv gp, s0 # status -> gp
.insn r RISCV_CUSTOM0, 0, 0, x0, x0, x0 # tmc x0
# init_regs: sets up gp, sp and tp for the calling hardware thread.
#   gp = __global_pointer
#   sp = STACK_BASE_ADDR - hartid * (2^STACK_LOG2_SIZE + 16)
#   tp = (_end + 63 + hartid * __tcb_aligned_size) rounded down to a multiple of 64
# Clobbers t0, t1, t2. Returns with ret.
.section .text
.type init_regs, @function
.local init_regs
init_regs:
# set global pointer register
# relaxation is switched off around this load so it is not rewritten into a
# gp-relative form, which would need gp to be valid already
.option push
.option norelax
la gp, __global_pointer
.option pop
# set stack pointer register
#if (XLEN == 64)
# build the 64-bit constant from its upper and lower 32-bit halves
li t0, (STACK_BASE_ADDR >> 32)
slli t0, t0, 32
li sp, (STACK_BASE_ADDR & 0xffffffff)
or sp, sp, t0
#else
li sp, STACK_BASE_ADDR # load stack base address
#endif
csrr t0, VX_CSR_MHARTID
sll t1, t0, STACK_LOG2_SIZE # t1 = hartid << STACK_LOG2_SIZE
sll t2, t0, 4 # t2 = hartid * 16
add t1, t1, t2
sub sp, sp, t1 # stacks are laid out downwards from the base, one slot per hart
# set thread pointer register
# use address space after BSS region
# ensure cache line alignment
la t1, __tcb_aligned_size # the symbol's address is used as the size value
mul t0, t0, t1 # t0 = hartid * __tcb_aligned_size
la tp, _end + 63 # adding 63 before masking makes the rounding go upwards from _end
add tp, tp, t0
and tp, tp, -64 # clear the low 6 bits
ret
# init_regs_all: routine whose address _start passes to wspawn.
# Sets the thread mask to all ones, runs init_regs, then issues "tmc x0".
# NOTE(review): presumably tmc x0 stops this warp, which would make the final
# ret unreachable (ra was overwritten by the jal anyway) - confirm.
.section .text
.type init_regs_all, @function
.local init_regs_all
init_regs_all:
li t0, -1 # all-ones thread mask
.insn r RISCV_CUSTOM0, 0, 0, x0, t0, x0 # tmc t0
jal init_regs
.insn r RISCV_CUSTOM0, 0, 0, x0, x0, x0 # tmc x0
ret
# init_tls_all: routine whose address _start passes to wspawn for TLS setup.
# Sets the thread mask to all ones, calls __init_tls, then issues "tmc x0".
# NOTE(review): presumably tmc x0 stops this warp, which would make the final
# ret unreachable (ra was overwritten by the call anyway) - confirm.
.section .text
.type init_tls_all, @function
.local init_tls_all
init_tls_all:
li t0, -1 # all-ones thread mask
.insn r RISCV_CUSTOM0, 0, 0, x0, t0, x0 # tmc t0
call __init_tls
.insn r RISCV_CUSTOM0, 0, 0, x0, x0, x0 # tmc x0
ret
# vx_wspawn_wait: busy-waits until the VX_CSR_WARP_MASK CSR reads exactly 1,
# then returns. Clobbers t0 and t1.
# NOTE(review): a value of 1 presumably means only warp 0 is still active - confirm.
.section .text
.type vx_wspawn_wait, @function
.global vx_wspawn_wait
vx_wspawn_wait:
csrr t0, VX_CSR_WARP_MASK # re-read the mask on every pass
li t1, 1
bne t0, t1, vx_wspawn_wait # loop back while the mask differs from 1
ret
# Weak, global definition of __dso_handle in .data, holding one zero .long.
# Being weak, another definition of the symbol takes precedence at link time.
# NOTE(review): presumably provided for C++ runtime code that references
# __dso_handle (e.g. destructor registration) - confirm.
.section .data
.global __dso_handle
.weak __dso_handle
__dso_handle:
.long 0

124
lib/src/vx_syscalls.c Normal file
View File

@@ -0,0 +1,124 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <sys/stat.h>
#include <newlib.h>
#include <unistd.h>
#include <vx_intrinsics.h>
#include <vx_print.h>
#include <string.h>
#ifdef __cplusplus
extern "C" {
#endif
/* Stub: the descriptor is ignored and failure (-1) is always reported. */
int _close(int file) {
  (void)file;
  return -1;
}
/* Stub: never fills in *st and always reports failure (-1). */
int _fstat(int file, struct stat *st) {
  (void)file;
  (void)st;
  return -1;
}
/* Stub: every descriptor is reported as not being a terminal (0). */
int _isatty(int file) {
  (void)file;
  return 0;
}
/* Stub: all arguments are ignored and offset 0 is always returned. */
int _lseek(int file, int ptr, int dir) {
  (void)file;
  (void)ptr;
  (void)dir;
  return 0;
}
/* Stub: nothing is opened; failure (-1) is always reported. */
int _open(const char *name, int flags, int mode) {
  (void)name;
  (void)flags;
  (void)mode;
  return -1;
}
/* Stub: the buffer is left untouched and failure (-1) is always reported. */
int _read(int file, char *ptr, int len) {
  (void)file;
  (void)ptr;
  (void)len;
  return -1;
}
caddr_t _sbrk(int incr) {
__asm__ __volatile__("ebreak");
return 0;
}
/*
 * Emits len bytes starting at ptr, one at a time, through vx_putchar().
 * The descriptor is ignored. Returns len unchanged; a non-positive len
 * emits nothing.
 */
int _write(int file, char *ptr, int len) {
  (void)file;
  int sent = 0;
  while (sent < len) {
    vx_putchar(ptr[sent]);
    ++sent;
  }
  return len;
}
/* Stub: no signal is delivered; failure (-1) is always reported. */
int _kill(int pid, int sig) {
  (void)pid;
  (void)sig;
  return -1;
}
/* Reports the value of vx_hart_id() as the process id. */
int _getpid() {
  int hart = vx_hart_id();
  return hart;
}
/*
 * Initializes the thread-local storage block of the calling thread.
 * The block starts at the address held in the tp register. The first
 * __tdata_size bytes are copied from __tdata_start, and __tbss_size bytes
 * starting at offset __tbss_offset are zeroed.
 */
void __init_tls(void) {
// These symbols are declared as arrays so that their addresses can be used as
// plain integer values (see the size_t casts below).
// NOTE(review): presumably all four are defined by the linker script - confirm.
extern char __tdata_start[];
extern char __tbss_offset[];
extern char __tdata_size[];
extern char __tbss_size[];
// TLS memory initialization
// bind a local to the tp register so its current value can be read from C
register char *__thread_self __asm__ ("tp");
memcpy(__thread_self, __tdata_start, (size_t)__tdata_size);
memset(__thread_self + (size_t)__tbss_offset, 0, (size_t)__tbss_size);
}
#ifdef HAVE_INITFINI_ARRAY
/* Start/end markers of the pre-init and init function tables (linker-provided). */
extern void (*__preinit_array_start []) (void) __attribute__((weak));
extern void (*__preinit_array_end []) (void) __attribute__((weak));
extern void (*__init_array_start []) (void) __attribute__((weak));
extern void (*__init_array_end []) (void) __attribute__((weak));
#ifdef HAVE_INIT_FINI
extern void _init (void);
#endif
/*
 * Runs the startup routines in order: every entry of the pre-init table,
 * then _init() when HAVE_INIT_FINI is defined, then every entry of the init
 * table. Each table is walked from its first entry to its last.
 */
void __libc_init_array (void) {
  const size_t preinit_total = __preinit_array_end - __preinit_array_start;
  for (size_t k = 0; k < preinit_total; ++k) {
    (*__preinit_array_start[k]) ();
  }
#ifdef HAVE_INIT_FINI
  _init ();
#endif
  const size_t init_total = __init_array_end - __init_array_start;
  for (size_t k = 0; k < init_total; ++k) {
    (*__init_array_start[k]) ();
  }
}
#endif
#ifdef HAVE_INITFINI_ARRAY
/* Start/end markers of the fini function table (linker-provided). */
extern void (*__fini_array_start []) (void) __attribute__((weak));
extern void (*__fini_array_end []) (void) __attribute__((weak));
#ifdef HAVE_INIT_FINI
extern void _fini (void);
#endif
/*
 * Runs the shutdown routines: every entry of the fini table, walked from the
 * last entry down to the first, then _fini() when HAVE_INIT_FINI is defined.
 */
void __libc_fini_array (void) {
  size_t remaining = __fini_array_end - __fini_array_start;
  while (remaining != 0) {
    --remaining;
    (*__fini_array_start[remaining]) ();
  }
#ifdef HAVE_INIT_FINI
  _fini ();
#endif
}
#endif
#ifdef __cplusplus
}
#endif

7
lib/tohost.S Normal file
View File

@@ -0,0 +1,7 @@
# Two global 64-bit words, tohost and fromhost, both initialized to zero and
# placed in the allocatable, writable section ".tohost". Each is preceded by
# ".align 6" (a 2^6 = 64-byte boundary, as .align takes an exponent on RISC-V).
# NOTE(review): presumably the mailbox pair a host or simulator uses to talk to
# the program - confirm.
.section ".tohost","aw",@progbits
.align 6
.globl tohost
tohost: .dword 0
.align 6
.globl fromhost
fromhost: .dword 0