more renaming and cleanup

This commit is contained in:
Richard Yan
2025-01-29 21:22:41 -08:00
parent f98cd9bc22
commit 0d842a5930
348 changed files with 6 additions and 136287 deletions

View File

@@ -0,0 +1,24 @@
// See LICENSE for license details.
#ifndef SRC_MAIN_C_ACCUMULATOR_H
#define SRC_MAIN_C_ACCUMULATOR_H
#include "rocc-software/src/xcustom.h"
// Function codes passed as the final ROCC_INSTRUCTION argument by the do*
// macros below.
#define k_DO_WRITE 0
#define k_DO_READ 1
#define k_DO_LOAD 2
#define k_DO_ACCUM 3
// Value passed as the first ROCC_INSTRUCTION argument by every macro below.
// NOTE(review): presumably selects the custom opcode slot of the accumulator;
// gemmini_params.h also defines XCUSTOM_ACC with a different value - confirm
// the two headers are never included together.
#define XCUSTOM_ACC 0
// Each macro expands to one ROCC_INSTRUCTION statement that already ends in
// ';', so `doWrite(...);` at a call site yields an extra empty statement.
// doWrite: forwards `data` with function code k_DO_WRITE.
#define doWrite(y, rocc_rd, data) \
ROCC_INSTRUCTION(XCUSTOM_ACC, y, data, rocc_rd, k_DO_WRITE);
// doRead: passes a literal 0 in the slot where doWrite passes `data`.
#define doRead(y, rocc_rd) \
ROCC_INSTRUCTION(XCUSTOM_ACC, y, 0, rocc_rd, k_DO_READ);
// doLoad: forwards `mem_addr` with function code k_DO_LOAD.
#define doLoad(y, rocc_rd, mem_addr) \
ROCC_INSTRUCTION(XCUSTOM_ACC, y, mem_addr, rocc_rd, k_DO_LOAD);
// doAccum: forwards `data` with function code k_DO_ACCUM.
#define doAccum(y, rocc_rd, data) \
ROCC_INSTRUCTION(XCUSTOM_ACC, y, data, rocc_rd, k_DO_ACCUM);
#endif // SRC_MAIN_C_ACCUMULATOR_H

View File

@@ -0,0 +1,10 @@
// See LICENSE for license details.
#ifndef SRC_MAIN_C_CHARACTER_H
#define SRC_MAIN_C_CHARACTER_H
#include "rocc-software/src/xcustom.h"
// NOTE(review): presumably the custom opcode slot used for the character
// accelerator (cf. XCUSTOM_ACC / XCUSTOM_TRANS in the sibling headers) -
// confirm against the hardware configuration. Not referenced in this header.
#define XCUSTOM_CHAR 2
#endif // SRC_MAIN_C_CHARACTER_H

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,79 @@
// See LICENSE for license details.
#ifndef COUNTER_H_
#define COUNTER_H_
// Counter ID 0 is named DISABLE.
#define DISABLE 0
// Highest ID in the plain numbered list below (44); the IDs after that list
// are expressed as offsets from this value.
#define INCREMENTAL_COUNTERS 44
// All existing Gemmini performance counters
#define MAIN_LD_CYCLES 1
#define MAIN_ST_CYCLES 2
#define MAIN_EX_CYCLES 3
#define MAIN_LD_ST_CYCLES 4
#define MAIN_LD_EX_CYCLES 5
#define MAIN_ST_EX_CYCLES 6
#define MAIN_LD_ST_EX_CYCLES 7
#define LOAD_DMA_WAIT_CYCLE 8
#define LOAD_ACTIVE_CYCLE 9
#define LOAD_SCRATCHPAD_WAIT_CYCLE 10
#define STORE_DMA_WAIT_CYCLE 11
#define STORE_ACTIVE_CYCLE 12
#define STORE_POOLING_CYCLE 13
#define STORE_SCRATCHPAD_WAIT_CYCLE 14
#define DMA_TLB_MISS_CYCLE 15
#define DMA_TLB_HIT_REQ 16
#define DMA_TLB_TOTAL_REQ 17
#define RDMA_ACTIVE_CYCLE 18
#define RDMA_TLB_WAIT_CYCLES 19
#define RDMA_TL_WAIT_CYCLES 20
#define WDMA_ACTIVE_CYCLE 21
#define WDMA_TLB_WAIT_CYCLES 22
#define WDMA_TL_WAIT_CYCLES 23
#define EXE_ACTIVE_CYCLE 24
#define EXE_FLUSH_CYCLE 25
#define EXE_CONTROL_Q_BLOCK_CYCLE 26
#define EXE_PRELOAD_HAZ_CYCLE 27
#define EXE_OVERLAP_HAZ_CYCLE 28
#define SCRATCHPAD_A_WAIT_CYCLE 29
#define SCRATCHPAD_B_WAIT_CYCLE 30
#define SCRATCHPAD_D_WAIT_CYCLE 31
#define ACC_A_WAIT_CYCLE 32
#define ACC_B_WAIT_CYCLE 33
#define ACC_D_WAIT_CYCLE 34
#define A_GARBAGE_CYCLES 35
#define B_GARBAGE_CYCLES 36
#define D_GARBAGE_CYCLES 37
#define IM2COL_MEM_CYCLES 38
#define IM2COL_ACTIVE_CYCLES 39
#define IM2COL_TRANSPOSER_WAIT_CYCLE 40
#define RESERVATION_STATION_FULL_CYCLES 41
#define RESERVATION_STATION_ACTIVE_CYCLES 42
#define LOOP_MATMUL_ACTIVE_CYCLES 43
#define TRANSPOSE_PRELOAD_UNROLLER_ACTIVE_CYCLES 44
// IDs 45..51, written relative to INCREMENTAL_COUNTERS so they move with it.
// NOTE(review): presumably these are the non-incremental (externally
// accumulated) counters - confirm against the hardware counter file.
#define RESERVATION_STATION_LD_COUNT (INCREMENTAL_COUNTERS + 1)
#define RESERVATION_STATION_ST_COUNT (INCREMENTAL_COUNTERS + 2)
#define RESERVATION_STATION_EX_COUNT (INCREMENTAL_COUNTERS + 3)
#define RDMA_BYTES_REC (INCREMENTAL_COUNTERS + 4)
#define WDMA_BYTES_SENT (INCREMENTAL_COUNTERS + 5)
#define RDMA_TOTAL_LATENCY (INCREMENTAL_COUNTERS + 6)
#define WDMA_TOTAL_LATENCY (INCREMENTAL_COUNTERS + 7)
#endif

View File

@@ -0,0 +1,576 @@
#ifndef GEMMINI_NN_H
#define GEMMINI_NN_H
#include <stdio.h>
#include <string.h>
#include <stdbool.h>
#ifndef BAREMETAL
#include <sys/mman.h>
#endif
#include "include/gemmini.h"
#include "include/gemmini_testutils.h"
// Per-layer description of a convolution (plus optional pooling / residual
// add) consumed by the helper routines in this header.
struct ConvParams {
    int batch_size;     // loop bound over batches in im2col / resadd
    int in_row_dim;     // input height used for bounds checks
    int in_col_dim;     // input width used for bounds checks
    int out_row_dim;    // output height used when flattening output rows
    int out_col_dim;    // output width used when flattening output rows
    int kernel_size;    // side length of the square filter window
    int in_channels;    // channel loop bound in im2col
    int out_channels;   // channel loop bound in resadd
    int in_stride;
    int weight_stride;
    int out_stride;
    int stride;         // step between filter window origins
    int padding;        // subtracted from the window origin
    bool bias;          // when true, accumulation starts from bias[channel]
    bool depthwise;

    int n_patches;
    int patch_size;

    acc_scale_t output_scale;   // passed to ACC_SCALE before saturation
    scale_t res_scale;          // passed to MVIN_SCALE in the resadd helpers

    // Pooling window size, step and padding read by pool()/pool_with_col2im().
    int pool_size;
    int pool_stride;
    int pool_padding;
    int out_dim_pooled;         // row/column loop bound in the resadd helpers

    int I;
    int J;
    int K;
};
// Per-layer description of a fully connected layer.
struct FcParams {
    int batch_size;
    int in_features;
    int out_features;

    acc_scale_t output_scale;
    bool bias;

    int I;
    int J;
    int K;
};
// Prints a histogram of a 4-D array: for every value num in [-128, 127] it
// counts the elements equal to num and prints "<num>: <count> times" when the
// count is non-zero. All four extents come from sizeof ratios, so IMAGES must
// be a real array in scope, not a pointer or an array function parameter.
// Expands to a bare `for` statement.
#define HIST_IMAGES(IMAGES) \
for (int num = -128; num <= 127; num++) { \
int count = 0; \
for (int i = 0; i < sizeof(IMAGES)/sizeof(IMAGES[0]); i++) { \
for (int j = 0; j < sizeof(IMAGES[0])/sizeof(IMAGES[0][0]); j++) { \
for (int k = 0; k < sizeof(IMAGES[0][0])/sizeof(IMAGES[0][0][0]); k++) { \
for (int l = 0; l < sizeof(IMAGES[0][0][0])/sizeof(IMAGES[0][0][0][0]); l++) { \
if (IMAGES[i][j][k][l] == num) { \
count++; \
} \
} \
} \
} \
} \
if (count > 0) \
printf("%d: %d times\n", num, count); \
}
// Prints a histogram of a 2-D array: for every value num in [-128, 127] it
// counts the elements equal to num and prints "<num>: <count> times" when the
// count is non-zero. Both extents come from sizeof ratios, so MATRIX must be a
// real array in scope, not a pointer or an array function parameter.
// Expands to a bare `for` statement.
#define HIST_MATRIX(MATRIX) \
for (int num = -128; num <= 127; num++) { \
int count = 0; \
for (int i = 0; i < sizeof(MATRIX)/sizeof(MATRIX[0]); i++) { \
for (int j = 0; j < sizeof(MATRIX[0])/sizeof(MATRIX[0][0]); j++) { \
if (MATRIX[i][j] == num) { \
count++; \
} \
} \
} \
if (count > 0) \
printf("%d: %d times\n", num, count); \
}
// Tiled matrix multiplication with caller-supplied tiling factors.
// Forwards to tiled_matmul() with A_stride = dim_K and B/C/D strides = dim_J.
// When `check` is set, the same product is recomputed through
// tiled_matmul_auto(..., CPU) into a stack buffer and compared against C;
// a mismatch prints the layer name and terminates the program with exit(1).
static void tiled_matmul_nn(size_t dim_I, size_t dim_J, size_t dim_K,
        const elem_t A[dim_I][dim_K], const elem_t B[dim_K][dim_J],
        const void * D, elem_t C[dim_I][dim_J],
        int act, acc_scale_t scale, bool repeating_bias,
        size_t tile_I, size_t tile_J, size_t tile_K,
        enum tiled_matmul_type_t tiled_matmul_type,
        bool check, char * layer_name)
{
    if (check) {
        printf("%s: gemmini\n", layer_name);
    }

    tiled_matmul(dim_I, dim_J, dim_K,
            (elem_t*)A, (elem_t*)B, D, (elem_t*)C,
            dim_K, dim_J, dim_J, dim_J,
            MVIN_SCALE_IDENTITY, MVIN_SCALE_IDENTITY, MVIN_SCALE_IDENTITY,
            act, scale, 0, repeating_bias,
            tile_I, tile_J, tile_K,
            false, false,
            false, false,
            0,
            tiled_matmul_type);

    if (!check) {
        return;
    }

    // Reference run: same operands, written to a scratch matrix.
    printf("%s: CPU\n", layer_name);
    elem_t gold[dim_I][dim_J];
    tiled_matmul_auto(dim_I, dim_J, dim_K,
            (elem_t*)A, (elem_t*)B, D, (elem_t*)gold,
            dim_K, dim_J, dim_J, dim_J,
            MVIN_SCALE_IDENTITY, MVIN_SCALE_IDENTITY, MVIN_SCALE_IDENTITY,
            act, scale, 0, repeating_bias,
            false, false,
            false, false,
            0,
            CPU);

    if (MAT_IS_EQUAL(dim_I, dim_J, C, gold)) {
        return;
    }

    printf("Layer calculated incorrectly: %s\n", layer_name);
    exit(1);
}
// Tiled matrix multiplication with automatically calculated tiling factors.
// Uses the default strides: A_stride = dim_K, B/C/D strides = dim_J.
// When `check` is set, the product is recomputed with the CPU backend into a
// stack buffer and compared against C; a mismatch prints the layer name and
// terminates the program with exit(1).
static void tiled_matmul_nn_auto(size_t dim_I, size_t dim_J, size_t dim_K,
        const elem_t A[dim_I][dim_K], const elem_t B[dim_K][dim_J],
        const void * D, elem_t C[dim_I][dim_J],
        int act, acc_scale_t scale, bool repeating_bias,
        enum tiled_matmul_type_t tiled_matmul_type,
        bool check, char * layer_name)
{
    if (check) {
        printf("%s: gemmini\n", layer_name);
    }

    tiled_matmul_auto(dim_I, dim_J, dim_K,
            (elem_t*)A, (elem_t*)B, D, (elem_t*)C,
            dim_K, dim_J, dim_J, dim_J,
            MVIN_SCALE_IDENTITY, MVIN_SCALE_IDENTITY, MVIN_SCALE_IDENTITY,
            act, scale, 0, repeating_bias,
            false, false,
            false, false,
            0,
            tiled_matmul_type);

    if (!check) {
        return;
    }

    // Reference run: same operands, written to a scratch matrix.
    printf("%s: CPU\n", layer_name);
    elem_t gold[dim_I][dim_J];
    tiled_matmul_auto(dim_I, dim_J, dim_K,
            (elem_t*)A, (elem_t*)B, D, (elem_t*)gold,
            dim_K, dim_J, dim_J, dim_J,
            MVIN_SCALE_IDENTITY, MVIN_SCALE_IDENTITY, MVIN_SCALE_IDENTITY,
            act, scale, 0, repeating_bias,
            false, false,
            false, false,
            0,
            CPU);

    if (MAT_IS_EQUAL(dim_I, dim_J, C, gold)) {
        return;
    }

    printf("Layer calculated incorrectly: %s\n", layer_name);
    exit(1);
}
// Tiled matrix multiplication with explicit row strides and automatically
// calculated tiling factors. C_stride is passed for both the C and D stride
// arguments of tiled_matmul_auto().
// Note: C is declared const but is cast to a mutable pointer and handed to
// tiled_matmul_auto() in the output position.
static void tiled_matmul_nn_stride_auto(size_t dim_I, size_t dim_J, size_t dim_K,
        const size_t A_stride, const size_t B_stride, const size_t C_stride,
        const elem_t * A, const elem_t * B, const void * D, const elem_t * C,
        int act, acc_scale_t scale, bool repeating_bias,
        enum tiled_matmul_type_t tiled_matmul_type)
{
    elem_t * const lhs = (elem_t*)A;
    elem_t * const rhs = (elem_t*)B;
    elem_t * const dst = (elem_t*)C;

    tiled_matmul_auto(dim_I, dim_J, dim_K,
            lhs, rhs, D, dst,
            A_stride, B_stride, C_stride, C_stride,
            MVIN_SCALE_IDENTITY, MVIN_SCALE_IDENTITY, MVIN_SCALE_IDENTITY,
            act, scale, 0, repeating_bias,
            false, false,
            false, false,
            0,
            tiled_matmul_type);
}
// Depthwise convolution computed on the CPU.
// `input` is indexed as [batch][row][col][channel]; `output` is a 2-D matrix
// whose row index is the flattened (batch, out_row, out_col) position and
// whose column index is the channel.
// Each output starts from bias[channel] when params->bias is set, accumulates
// input * weight over the kernel window (positions outside the input are
// skipped), is clamped below at 0, scaled with ACC_SCALE and saturated to
// [elem_t_min, elem_t_max].
// Window geometry and bounds checks read params->stride/padding/kernel_size/
// in_row_dim/in_col_dim, not the size_t parameters of the same names.
static void conv_dw(size_t I, size_t J,
        const size_t batch_size, const size_t channels,
        const size_t in_row_dim, const size_t in_col_dim,
        const size_t out_row_dim, const size_t out_col_dim,
        const size_t kernel_size,
        const elem_t input[batch_size][in_row_dim][in_col_dim][channels],
        const elem_t weight[channels][kernel_size][kernel_size],
        const acc_t * bias,
        elem_t output [I][J],
        const struct ConvParams * params)
{
    for (int b = 0; b < batch_size; b++) {
        for (int ch = 0; ch < channels; ch++) {
            for (int orow = 0; orow < out_row_dim; orow++) {
                for (int ocol = 0; ocol < out_col_dim; ocol++) {
                    // Top-left corner of the window in input coordinates.
                    const int row0 = orow * params->stride - params->padding;
                    const int col0 = ocol * params->stride - params->padding;

                    acc_t sum = 0;
                    if (params->bias) {
                        sum = bias[ch];
                    }

                    for (int krow = 0; krow < params->kernel_size; krow++) {
                        const int irow = row0 + krow;
                        for (int kcol = 0; kcol < params->kernel_size; kcol++) {
                            const int icol = col0 + kcol;
                            if (irow >= 0 && irow < params->in_row_dim && icol >= 0 && icol < params->in_col_dim) {
                                sum += input[b][irow][icol][ch] * weight[ch][krow][kcol];
                            }
                        }
                    }

                    // Negative accumulations are always clamped to zero here.
                    if (sum < 0) {
                        sum = 0;
                    }

                    acc_t scaled = ACC_SCALE(sum, params->output_scale);
                    if (scaled > elem_t_max) {
                        scaled = elem_t_max;
                    } else if (scaled < elem_t_min) {
                        scaled = elem_t_min;
                    }

                    // Flattened (batch, out_row, out_col) -> output row.
                    size_t flat = b * params->out_row_dim * params->out_col_dim + orow * params->out_col_dim + ocol;
                    output[flat][ch] = scaled;
                }
            }
        }
    }
}
// Depthwise convolution computed on the CPU, reading its input from a 2-D
// matrix whose row index is the flattened (batch, in_row, in_col) position and
// whose column index is the channel. The output uses the same flattened layout
// over (batch, out_row, out_col).
// Each output starts from bias[channel] when params->bias is set, accumulates
// input * weight over the kernel window (positions outside the input are
// skipped), is clamped below at 0, scaled with ACC_SCALE and saturated to
// [elem_t_min, elem_t_max].
static void conv_dw_with_col2im(size_t prev_I, size_t prev_J, size_t I, size_t J,
        const size_t batch_size, const size_t channels,
        const size_t out_row_dim, const size_t out_col_dim, const size_t kernel_size,
        const elem_t input[prev_I][prev_J],
        const elem_t weight[channels][kernel_size][kernel_size],
        const acc_t * bias,
        elem_t output [I][J],
        const struct ConvParams * params)
{
    for (int b = 0; b < batch_size; b++) {
        for (int ch = 0; ch < channels; ch++) {
            for (int orow = 0; orow < out_row_dim; orow++) {
                for (int ocol = 0; ocol < out_col_dim; ocol++) {
                    // Top-left corner of the window in input coordinates.
                    const int row0 = orow * params->stride - params->padding;
                    const int col0 = ocol * params->stride - params->padding;

                    acc_t sum = 0;
                    if (params->bias) {
                        sum = bias[ch];
                    }

                    for (int krow = 0; krow < params->kernel_size; krow++) {
                        const int irow = row0 + krow;
                        for (int kcol = 0; kcol < params->kernel_size; kcol++) {
                            const int icol = col0 + kcol;
                            if (irow >= 0 && irow < params->in_row_dim && icol >= 0 && icol < params->in_col_dim) {
                                // Flattened (batch, in_row, in_col) -> input row.
                                size_t src = b * params->in_row_dim * params->in_col_dim + irow * params->in_col_dim + icol;
                                sum += input[src][ch] * weight[ch][krow][kcol];
                            }
                        }
                    }

                    // Negative accumulations are always clamped to zero here.
                    if (sum < 0) {
                        sum = 0;
                    }

                    acc_t scaled = ACC_SCALE(sum, params->output_scale);
                    if (scaled > elem_t_max) {
                        scaled = elem_t_max;
                    } else if (scaled < elem_t_min) {
                        scaled = elem_t_min;
                    }

                    // Flattened (batch, out_row, out_col) -> output row.
                    size_t dst = b * params->out_row_dim * params->out_col_dim + orow * params->out_col_dim + ocol;
                    output[dst][ch] = scaled;
                }
            }
        }
    }
}
// Unrolls convolution windows of a [batch][row][col][channel] input into rows
// of `output`: one output row per window position, one output column per
// (filter_row, filter_col, channel) triple.
// Window positions that fall outside the input are NOT written, so the caller
// must provide an `output` that already holds the desired padding value there.
// All loop bounds come from `params`, not from the size_t parameters.
static void im2col(size_t batch_size, size_t channels, size_t im_row_dim, size_t im_col_dim,
        size_t I, size_t K,
        const elem_t input[batch_size][im_row_dim][im_col_dim][channels],
        elem_t output[I][K],
        const struct ConvParams * params)
{
    // One past the last valid window origin along each axis.
    const int row_end = params->in_row_dim - params->kernel_size + params->padding + 1;
    const int col_end = params->in_col_dim - params->kernel_size + params->padding + 1;

    int dst_row = 0;

    for (int b = 0; b < params->batch_size; b++) {
        for (int win_row = -params->padding; win_row < row_end; win_row += params->stride) {
            for (int win_col = -params->padding; win_col < col_end; win_col += params->stride) {
                int dst_col = 0;

                for (int frow = 0; frow < params->kernel_size; frow++) {
                    const int src_row = win_row + frow;
                    for (int fcol = 0; fcol < params->kernel_size; fcol++) {
                        const int src_col = win_col + fcol;
                        const bool inside = src_row >= 0 && src_row < params->in_row_dim
                            && src_col >= 0 && src_col < params->in_col_dim;

                        for (int ch = 0; ch < params->in_channels; ch++) {
                            if (inside) {
                                output[dst_row][dst_col] = input[b][src_row][src_col][ch];
                            }
                            dst_col++;
                        }
                    }
                }

                dst_row++;
            }
        }
    }
}
// Unrolls convolution windows into rows of `output`, reading the input from a
// 2-D matrix whose row index is the flattened (batch, row, col) position and
// whose column index is the channel.
// One output row per window position, one output column per
// (filter_row, filter_col, channel) triple.
// Window positions that fall outside the input are NOT written, so the caller
// must provide an `output` that already holds the desired padding value there.
static void im2col_with_col2im(size_t prev_I, size_t prev_J,
        size_t next_I, size_t next_K,
        const elem_t input[prev_I][prev_J],
        elem_t output[next_I][next_K],
        const struct ConvParams * params)
{
    // One past the last valid window origin along each axis.
    const int row_end = params->in_row_dim - params->kernel_size + params->padding + 1;
    const int col_end = params->in_col_dim - params->kernel_size + params->padding + 1;

    int dst_row = 0;

    for (int b = 0; b < params->batch_size; b++) {
        for (int win_row = -params->padding; win_row < row_end; win_row += params->stride) {
            for (int win_col = -params->padding; win_col < col_end; win_col += params->stride) {
                int dst_col = 0;

                for (int frow = 0; frow < params->kernel_size; frow++) {
                    const int pix_row = win_row + frow;
                    for (int fcol = 0; fcol < params->kernel_size; fcol++) {
                        const int pix_col = win_col + fcol;
                        const bool inside = pix_row >= 0 && pix_row < params->in_row_dim
                            && pix_col >= 0 && pix_col < params->in_col_dim;

                        for (int ch = 0; ch < params->in_channels; ch++) {
                            if (inside) {
                                // Flattened (batch, row, col) -> input row.
                                int src_row = b * params->in_row_dim * params->in_col_dim + pix_row * params->in_col_dim + pix_col;
                                int src_col = ch;
                                output[dst_row][dst_col] = input[src_row][src_col];
                            }
                            dst_col++;
                        }
                    }
                }

                dst_row++;
            }
        }
    }
}
// Element-wise saturating add: C[i] = clamp(MVIN_SCALE(A[i], A_shift) + B[i])
// with the result limited to [elem_t_min, elem_t_max].
void vecadd(size_t len, const elem_t * A, const elem_t * B, elem_t * C, scale_t A_shift) {
    size_t idx = 0;
    while (idx < len) {
        acc_t sum = MVIN_SCALE(A[idx], A_shift) + B[idx];

        if (sum > elem_t_max) {
            sum = elem_t_max;
        } else if (sum < elem_t_min) {
            sum = elem_t_min;
        }

        C[idx] = sum;
        idx++;
    }
}
// Residual add over 4-D tensors:
//   C = clamp(MVIN_SCALE(A, params->res_scale) + B)
// The upper bound is elem_t_max; the lower bound is 0 when `relu` is set and
// elem_t_min otherwise.
// The loops run over params->batch_size, params->out_dim_pooled (rows and
// columns) and params->out_channels; the size_t parameters only give the
// array parameters their shape.
void resadd1(const size_t batch_size, const size_t channels, const size_t im_dim,
        const elem_t A[batch_size][im_dim][im_dim][channels],
        const elem_t B[batch_size][im_dim][im_dim][channels],
        elem_t C[batch_size][im_dim][im_dim][channels],
        bool relu,
        const struct ConvParams * params) {
    // Kept as double, not int: with a floating-point elem_t the value of
    // elem_t_min is far outside the range of int, and converting it to int is
    // undefined behaviour. double represents 0 and elem_t_min exactly.
    const double minimum = relu ? 0 : elem_t_min;

    for (size_t batch = 0; batch < params->batch_size; batch++) {
        for (size_t row = 0; row < params->out_dim_pooled; row++) {
            for (size_t col = 0; col < params->out_dim_pooled; col++) {
                for (size_t channel = 0; channel < params->out_channels; channel++) {
                    acc_t result = MVIN_SCALE(A[batch][row][col][channel], params->res_scale) + B[batch][row][col][channel];

                    if (result > elem_t_max) {
                        result = elem_t_max;
                    } else if (result < minimum) {
                        result = (acc_t)minimum;
                    }

                    C[batch][row][col][channel] = result;
                }
            }
        }
    }
}
// Residual add where A is a 2-D matrix and B/C are 4-D tensors:
//   C[b][r][c][ch] = clamp(MVIN_SCALE(A[flat][ch], params->res_scale) + B[b][r][c][ch])
// with flat = b * out_dim_pooled^2 + r * out_dim_pooled + c.
// The upper bound is elem_t_max; the lower bound is 0 when `relu` is set and
// elem_t_min otherwise.
// The loops run over params->batch_size, params->out_dim_pooled (rows and
// columns) and params->out_channels; the size_t parameters only give the
// array parameters their shape.
void resadd2(const size_t I, const size_t J,
        const size_t batch_size, const size_t channels, const size_t im_dim,
        const elem_t A[I][J],
        const elem_t B[batch_size][im_dim][im_dim][channels],
        elem_t C[batch_size][im_dim][im_dim][channels],
        bool relu,
        const struct ConvParams * params) {
    // Kept as double, not int: with a floating-point elem_t the value of
    // elem_t_min is far outside the range of int, and converting it to int is
    // undefined behaviour. double represents 0 and elem_t_min exactly.
    const double minimum = relu ? 0 : elem_t_min;

    for (size_t batch = 0; batch < params->batch_size; batch++) {
        for (size_t row = 0; row < params->out_dim_pooled; row++) {
            for (size_t col = 0; col < params->out_dim_pooled; col++) {
                for (size_t channel = 0; channel < params->out_channels; channel++) {
                    // Row of A holding this (batch, row, col) position.
                    size_t r = batch * params->out_dim_pooled * params->out_dim_pooled + row * params->out_dim_pooled + col;
                    acc_t result = MVIN_SCALE(A[r][channel], params->res_scale) + B[batch][row][col][channel];

                    if (result > elem_t_max) {
                        result = elem_t_max;
                    } else if (result < minimum) {
                        result = (acc_t)minimum;
                    }

                    C[batch][row][col][channel] = result;
                }
            }
        }
    }
}
// Residual add over 2-D matrices:
//   C[flat][ch] = clamp(MVIN_SCALE(A[flat][ch], params->res_scale) + B[flat][ch])
// with flat = b * out_dim_pooled^2 + r * out_dim_pooled + c.
// The upper bound is elem_t_max; the lower bound is 0 when `relu` is set and
// elem_t_min otherwise.
// The loops run over params->batch_size, params->out_dim_pooled (rows and
// columns) and params->out_channels; I and J only give the array parameters
// their shape.
void resadd3(const size_t I, const size_t J,
        const elem_t A[I][J],
        const elem_t B[I][J],
        elem_t C[I][J],
        bool relu,
        const struct ConvParams * params) {
    // Kept as double, not int: with a floating-point elem_t the value of
    // elem_t_min is far outside the range of int, and converting it to int is
    // undefined behaviour. double represents 0 and elem_t_min exactly.
    const double minimum = relu ? 0 : elem_t_min;

    for (size_t batch = 0; batch < params->batch_size; batch++) {
        for (size_t row = 0; row < params->out_dim_pooled; row++) {
            for (size_t col = 0; col < params->out_dim_pooled; col++) {
                for (size_t channel = 0; channel < params->out_channels; channel++) {
                    // Matrix row holding this (batch, row, col) position.
                    size_t r = batch * params->out_dim_pooled * params->out_dim_pooled + row * params->out_dim_pooled + col;
                    acc_t result = MVIN_SCALE(A[r][channel], params->res_scale) + B[r][channel];

                    if (result > elem_t_max) {
                        result = elem_t_max;
                    } else if (result < minimum) {
                        result = (acc_t)minimum;
                    }

                    C[r][channel] = result;
                }
            }
        }
    }
}
// Max pooling over a [batch][row][col][channel] tensor.
// Window size, step and padding come from params->pool_size, pool_stride and
// pool_padding. The running maximum starts at elem_t_min; a window position
// that lies outside the input raises the maximum to at least 0.
void pool(size_t batch_size, size_t channels, size_t in_row_dim, size_t in_col_dim,
        size_t out_row_dim, size_t out_col_dim,
        elem_t input[batch_size][in_row_dim][in_col_dim][channels],
        elem_t output[batch_size][out_row_dim][out_col_dim][channels],
        const struct ConvParams * params)
{
    size_t window = params->pool_size;
    size_t step = params->pool_stride;
    size_t pad = params->pool_padding;

    for (int b = 0; b < batch_size; b++) {
        for (int ch = 0; ch < channels; ch++) {
            for (int orow = 0; orow < out_row_dim; orow++) {
                for (int ocol = 0; ocol < out_col_dim; ocol++) {
                    // Top-left corner of the window in input coordinates.
                    const int row0 = orow * step - pad;
                    const int col0 = ocol * step - pad;

                    elem_t best = elem_t_min;

                    for (int krow = 0; krow < window; krow++) {
                        const int irow = row0 + krow;
                        for (int kcol = 0; kcol < window; kcol++) {
                            const int icol = col0 + kcol;

                            if (irow >= 0 && irow < in_row_dim && icol >= 0 && icol < in_col_dim) {
                                if (input[b][irow][icol][ch] > best) {
                                    best = input[b][irow][icol][ch];
                                }
                            } else if (0 > best) {
                                best = 0;
                            }
                        }
                    }

                    output[b][orow][ocol][ch] = best;
                }
            }
        }
    }
}
// Max pooling that reads its input from a 2-D matrix whose row index is the
// flattened (batch, row, col) position and whose column index is the channel,
// and writes a [batch][row][col][channel] tensor.
// The input extent is taken from params->out_row_dim / out_col_dim; window
// size, step and padding come from params->pool_size, pool_stride and
// pool_padding. The running maximum starts at elem_t_min; a window position
// that lies outside the input raises the maximum to at least 0.
void pool_with_col2im(size_t I, size_t J,
        size_t batch_size, size_t channels, size_t out_row_dim, size_t out_col_dim,
        elem_t input[I][J],
        elem_t output[batch_size][out_row_dim][out_col_dim][channels],
        const struct ConvParams * params)
{
    size_t window = params->pool_size;
    size_t step = params->pool_stride;
    size_t in_row_dim = params->out_row_dim;
    size_t in_col_dim = params->out_col_dim;
    size_t pad = params->pool_padding;

    for (int b = 0; b < batch_size; b++) {
        for (int ch = 0; ch < channels; ch++) {
            for (int orow = 0; orow < out_row_dim; orow++) {
                for (int ocol = 0; ocol < out_col_dim; ocol++) {
                    // Top-left corner of the window in input coordinates.
                    const int row0 = orow * step - pad;
                    const int col0 = ocol * step - pad;

                    elem_t best = elem_t_min;

                    for (int krow = 0; krow < window; krow++) {
                        const int irow = row0 + krow;
                        for (int kcol = 0; kcol < window; kcol++) {
                            const int icol = col0 + kcol;

                            if (irow >= 0 && irow < in_row_dim && icol >= 0 && icol < in_col_dim) {
                                if (input[b * in_row_dim * in_col_dim + irow * in_col_dim + icol][ch] > best) {
                                    best = input[b * in_row_dim * in_col_dim + irow * in_col_dim + icol][ch];
                                }
                            } else if (0 > best) {
                                best = 0;
                            }
                        }
                    }

                    output[b][orow][ocol][ch] = best;
                }
            }
        }
    }
}
#endif // GEMMINI_NN_H

View File

@@ -0,0 +1,90 @@
#ifndef GEMMINI_PARAMS_H
#define GEMMINI_PARAMS_H
#include <stdint.h>
#include <limits.h>
// Configuration constants and element types for a DIM=16 build whose elements
// are stored as 16-bit words (ELEM_T_IS_LOWPREC_FLOAT is defined).
// NOTE(review): accumulator.h also defines XCUSTOM_ACC with a different value;
// confirm the two headers are never included together.
#define XCUSTOM_ACC 3
#define DIM 16
#define ADDR_LEN 32
#define BANK_NUM 4
#define BANK_ROWS 1024
#define ACC_ROWS 1024
#define MAX_BYTES 64
// With the values above both block lengths evaluate to 64 / (16 * 2) = 2.
#define MAX_BLOCK_LEN (MAX_BYTES/(DIM*2))
#define MAX_BLOCK_LEN_ACC (MAX_BYTES/(DIM*2))
// elem_t holds a raw 16-bit pattern, while the limits below are plain floats.
typedef uint16_t elem_t;
#define ELEM_T_IS_LOWPREC_FLOAT
// 65504 is the largest finite IEEE 754 half-precision value.
static const float elem_t_max = 65504.0;
static const float elem_t_min = -65504.0;
typedef uint16_t acc_t;
typedef double full_t;
#define ELEM_T_IS_FLOAT
#define ELEM_T_EXP_BITS 5
#define ELEM_T_SIG_BITS 11
#define ACC_T_EXP_BITS 5
#define ACC_T_SIG_BITS 11
typedef uint16_t elem_t_bits;
typedef uint16_t acc_t_bits;
#define HAS_MVIN_SCALE
typedef uint16_t scale_t;
typedef uint16_t scale_t_bits;
typedef int32_t scale_acc_t;
typedef uint32_t scale_acc_t_bits;
typedef uint16_t acc_scale_t;
typedef uint16_t acc_scale_t_bits;
// Alignment attributes for buffers holding `blocks` DIM-wide rows.
#define row_align(blocks) __attribute__((aligned(blocks*DIM*sizeof(elem_t))))
#define row_align_acc(blocks) __attribute__((aligned(blocks*DIM*sizeof(acc_t))))
// 0x3c00 is the bit pattern of 1.0 in IEEE 754 half precision.
#define MVIN_SCALE_IDENTITY 0x3c00
// NOTE(review): a double literal although acc_scale_t is uint16_t here -
// confirm how callers convert it.
#define ACC_SCALE_IDENTITY 1.0
// In this configuration the "shift" is a plain division by 2^shift.
#define ROUNDING_RIGHT_SHIFT(x, shift) \
((x) / (1 << (shift)))
// SAME_TYPE(x) names the type of expression x (decltype in C++, typeof in C).
#ifdef __cplusplus
#define SAME_TYPE(x) decltype(x)
#else
#define SAME_TYPE(x) typeof(x)
#endif
// Rounds x to the nearest integer, resolving exact .5 ties towards the even
// neighbour. Implemented as a GNU statement expression; the locals i, next,
// rem and result are visible to the argument expression, so avoid passing
// expressions that use those names.
#define ROUND_NEAR_EVEN(x) \
({ const SAME_TYPE(x) x_ = (x); \
const long long i = x_; \
const long long next = x_ < 0 ? x_ - 1 : x_ + 1; \
SAME_TYPE(x) rem = x_ - i; \
rem = rem < 0 ? -rem : rem; \
SAME_TYPE(x) result = rem < 0.5 ? i : (rem > 0.5 ? next : ( \
i % 2 == 0 ? i : next)); \
result; })
// Rounding right shift equation: https://riscv.github.io/documents/riscv-v-spec/#_vector_fixed_point_rounding_mode_register_vxrm
// For shift > 0 the result is (x >> shift) plus a rounding increment derived
// from the bits shifted out; for shift <= 0 it is x << (-shift).
#define ROUNDING_RIGHT_SHIFT_BITS(x, shift) \
((shift) > 0 ? (((x) >> (shift)) + \
(((shift) == 0 ? 0 : (((x) >> ((shift)-1)) & 1)) & \
((((shift) <= 1 ? 0 : ((x) & ((1 << ((shift)-1)) - 1))) != 0) | (((x) >> (shift)) & 1)))) : ((x) << (-(shift))))
// ACC_SCALE ignores `scale` in this configuration and yields x unchanged.
#define ACC_SCALE(x, scale) \
((x))
// MVIN_SCALE multiplies the two operands as ordinary C values.
// NOTE(review): scale_t is a uint16_t bit pattern here (identity 0x3c00), so
// this is an integer multiply on the raw bits - confirm this is intended for
// CPU-side reference code.
#define MVIN_SCALE(x, scale) \
((x) * (scale))
// MVIN_SCALE_ACC ignores `scale` and yields x unchanged.
#define MVIN_SCALE_ACC(x, scale) (x)
// Feature flags describing this configuration to the rest of the library.
#define ACC_SCALE_T_IS_FLOAT
#define ACC_SCALE_EXP_BITS 5
#define ACC_SCALE_SIG_BITS 11
#define ACC_READ_SMALL_WIDTH
#define HAS_FIRST_LAYER_OPTIMIZATIONS
#endif // GEMMINI_PARAMS_H

View File

@@ -0,0 +1,92 @@
#ifndef GEMMINI_PARAMS_H
#define GEMMINI_PARAMS_H
#include <stdint.h>
#include <limits.h>
// Configuration constants and element types for a DIM=8 build whose elements,
// accumulators and scale factors are all `float`.
// NOTE(review): accumulator.h also defines XCUSTOM_ACC with a different value;
// confirm the two headers are never included together.
#define XCUSTOM_ACC 3
#define DIM 8
#define ADDR_LEN 32
#define BANK_NUM 8
#define BANK_ROWS 1024
#define ACC_ROWS 512
#define MAX_BYTES 64
// With the values above both block lengths evaluate to 64 / (8 * 4) = 2.
#define MAX_BLOCK_LEN (MAX_BYTES/(DIM*4))
#define MAX_BLOCK_LEN_ACC (MAX_BYTES/(DIM*4))
typedef float elem_t;
// Saturation limits used by the reference code: the largest finite float
// magnitude, positive and negative.
static const elem_t elem_t_max = 3.4028235E38;
static const elem_t elem_t_min = -3.4028235E38;
typedef float acc_t;
typedef double full_t;
#define ELEM_T_IS_FLOAT
#define ELEM_T_EXP_BITS 8
#define ELEM_T_SIG_BITS 24
#define ACC_T_EXP_BITS 8
#define ACC_T_SIG_BITS 24
// Unsigned integer types wide enough to hold the raw bits of the float types.
typedef uint32_t elem_t_bits;
typedef uint32_t acc_t_bits;
#define HAS_MVIN_SCALE
typedef float scale_t;
typedef uint32_t scale_t_bits;
#define HAS_MVIN_ACC_SCALE
typedef float scale_acc_t;
typedef uint32_t scale_acc_t_bits;
typedef float acc_scale_t;
typedef uint32_t acc_scale_t_bits;
// Alignment attributes for buffers holding `blocks` DIM-wide rows.
#define row_align(blocks) __attribute__((aligned(blocks*DIM*sizeof(elem_t))))
#define row_align_acc(blocks) __attribute__((aligned(blocks*DIM*sizeof(acc_t))))
#define MVIN_SCALE_IDENTITY 1.0
#define ACC_SCALE_IDENTITY 1.0
// In this configuration the "shift" is a plain division by 2^shift.
#define ROUNDING_RIGHT_SHIFT(x, shift) \
((x) / (1 << (shift)))
// SAME_TYPE(x) names the type of expression x (decltype in C++, typeof in C).
#ifdef __cplusplus
#define SAME_TYPE(x) decltype(x)
#else
#define SAME_TYPE(x) typeof(x)
#endif
// Rounds x to the nearest integer, resolving exact .5 ties towards the even
// neighbour. Implemented as a GNU statement expression; the locals i, next,
// rem and result are visible to the argument expression, so avoid passing
// expressions that use those names.
#define ROUND_NEAR_EVEN(x) \
({ const SAME_TYPE(x) x_ = (x); \
const long long i = x_; \
const long long next = x_ < 0 ? x_ - 1 : x_ + 1; \
SAME_TYPE(x) rem = x_ - i; \
rem = rem < 0 ? -rem : rem; \
SAME_TYPE(x) result = rem < 0.5 ? i : (rem > 0.5 ? next : ( \
i % 2 == 0 ? i : next)); \
result; })
// Rounding right shift equation: https://riscv.github.io/documents/riscv-v-spec/#_vector_fixed_point_rounding_mode_register_vxrm
// For shift > 0 the result is (x >> shift) plus a rounding increment derived
// from the bits shifted out; for shift <= 0 it is x << (-shift).
#define ROUNDING_RIGHT_SHIFT_BITS(x, shift) \
((shift) > 0 ? (((x) >> (shift)) + \
(((shift) == 0 ? 0 : (((x) >> ((shift)-1)) & 1)) & \
((((shift) <= 1 ? 0 : ((x) & ((1 << ((shift)-1)) - 1))) != 0) | (((x) >> (shift)) & 1)))) : ((x) << (-(shift))))
// All three scaling macros are a plain multiplication in this configuration.
#define ACC_SCALE(x, scale) \
((x) * (scale))
#define MVIN_SCALE(x, scale) \
((x) * (scale))
#define MVIN_SCALE_ACC(x, scale) \
((x) * (scale))
// Feature flags describing this configuration to the rest of the library.
#define ACC_SCALE_T_IS_FLOAT
#define ACC_SCALE_EXP_BITS 8
#define ACC_SCALE_SIG_BITS 24
#define ACC_READ_SMALL_WIDTH
#define ACC_READ_FULL_WIDTH
#define HAS_FIRST_LAYER_OPTIMIZATIONS
#endif // GEMMINI_PARAMS_H

View File

@@ -0,0 +1 @@
gemmini_params.dim16fp16.h

View File

@@ -0,0 +1,285 @@
// See LICENSE for license details.
#ifndef SRC_MAIN_C_GEMMINI_TESTUTILS_H
#define SRC_MAIN_C_GEMMINI_TESTUTILS_H
#undef abs
#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <limits.h>
#include <stdbool.h>
#include "include/gemmini_params.h"
#include "include/gemmini.h"
#ifdef BAREMETAL
#undef assert
// Replacement for assert(): when `expr` is false, prints the failed expression
// text with the file name and line number, then terminates with exit(1).
// Wrapped in do { } while (0) so that it behaves as a single statement and can
// be used safely as the body of an if/else without braces.
#define assert(expr) \
    do { \
      if (!(expr)) { \
        printf("Failed assertion: " #expr "\n " __FILE__ ":%u\n", __LINE__); \
        exit(1); \
      } \
    } while (0)
#endif
// #define GEMMINI_ASSERTIONS
// Matmul utility functions
// Reference DIM x DIM product: C_full = D + A * B, accumulated in full_t.
static void matmul(elem_t A[DIM][DIM], elem_t B[DIM][DIM], elem_t D[DIM][DIM], full_t C_full[DIM][DIM]) {
    for (size_t row = 0; row < DIM; row++) {
        for (size_t col = 0; col < DIM; col++) {
            full_t *cell = &C_full[row][col];
            *cell = D[row][col];
            for (size_t inner = 0; inner < DIM; inner++) {
                *cell += A[row][inner]*B[inner][col];
            }
        }
    }
}
// Reference DIM x DIM product: C = D + A * B, accumulated directly in elem_t.
static void matmul_short(elem_t A[DIM][DIM], elem_t B[DIM][DIM], elem_t D[DIM][DIM], elem_t C[DIM][DIM]) {
    for (size_t row = 0; row < DIM; row++) {
        for (size_t col = 0; col < DIM; col++) {
            elem_t *cell = &C[row][col];
            *cell = D[row][col];
            for (size_t inner = 0; inner < DIM; inner++) {
                *cell += A[row][inner]*B[inner][col];
            }
        }
    }
}
// Reference DIM x DIM product: C_full = D + A * B, where the bias D is
// already of type full_t.
static void matmul_full(elem_t A[DIM][DIM], elem_t B[DIM][DIM], full_t D[DIM][DIM], full_t C_full[DIM][DIM]) {
    for (size_t row = 0; row < DIM; row++) {
        for (size_t col = 0; col < DIM; col++) {
            full_t *cell = &C_full[row][col];
            *cell = D[row][col];
            for (size_t inner = 0; inner < DIM; inner++) {
                *cell += A[row][inner]*B[inner][col];
            }
        }
    }
}
// Reference DIM x DIM product with A read transposed:
// C_full = D + A^T * B, accumulated in full_t.
static void matmul_A_transposed(elem_t A[DIM][DIM], elem_t B[DIM][DIM], elem_t D[DIM][DIM], full_t C_full[DIM][DIM]) {
    for (size_t row = 0; row < DIM; row++) {
        for (size_t col = 0; col < DIM; col++) {
            full_t *cell = &C_full[row][col];
            *cell = D[row][col];
            for (size_t inner = 0; inner < DIM; inner++) {
                *cell += A[inner][row]*B[inner][col];
            }
        }
    }
}
// Reference DIM x DIM product with A read transposed:
// C = D + A^T * B, accumulated directly in elem_t.
static void matmul_short_A_transposed(elem_t A[DIM][DIM], elem_t B[DIM][DIM], elem_t D[DIM][DIM], elem_t C[DIM][DIM]) {
    for (size_t row = 0; row < DIM; row++) {
        for (size_t col = 0; col < DIM; col++) {
            elem_t *cell = &C[row][col];
            *cell = D[row][col];
            for (size_t inner = 0; inner < DIM; inner++) {
                *cell += A[inner][row]*B[inner][col];
            }
        }
    }
}
// Reference DIM x DIM product with A read transposed and a full_t bias:
// C_full = D + A^T * B.
static void matmul_full_A_transposed(elem_t A[DIM][DIM], elem_t B[DIM][DIM], full_t D[DIM][DIM], full_t C_full[DIM][DIM]) {
    for (size_t row = 0; row < DIM; row++) {
        for (size_t col = 0; col < DIM; col++) {
            full_t *cell = &C_full[row][col];
            *cell = D[row][col];
            for (size_t inner = 0; inner < DIM; inner++) {
                *cell += A[inner][row]*B[inner][col];
            }
        }
    }
}
// Reference DIM x DIM product with B read transposed:
// C_full = D + A * B^T, accumulated in full_t.
static void matmul_B_transposed(elem_t A[DIM][DIM], elem_t B[DIM][DIM], elem_t D[DIM][DIM], full_t C_full[DIM][DIM]) {
    for (size_t row = 0; row < DIM; row++) {
        for (size_t col = 0; col < DIM; col++) {
            full_t *cell = &C_full[row][col];
            *cell = D[row][col];
            for (size_t inner = 0; inner < DIM; inner++) {
                *cell += A[row][inner]*B[col][inner];
            }
        }
    }
}
// Reference DIM x DIM product with B read transposed:
// C = D + A * B^T, accumulated directly in elem_t.
static void matmul_short_B_transposed(elem_t A[DIM][DIM], elem_t B[DIM][DIM], elem_t D[DIM][DIM], elem_t C[DIM][DIM]) {
    for (size_t row = 0; row < DIM; row++) {
        for (size_t col = 0; col < DIM; col++) {
            elem_t *cell = &C[row][col];
            *cell = D[row][col];
            for (size_t inner = 0; inner < DIM; inner++) {
                *cell += A[row][inner]*B[col][inner];
            }
        }
    }
}
// Reference DIM x DIM product with B read transposed and a full_t bias:
// C_full = D + A * B^T.
static void matmul_full_B_transposed(elem_t A[DIM][DIM], elem_t B[DIM][DIM], full_t D[DIM][DIM], full_t C_full[DIM][DIM]) {
    for (size_t row = 0; row < DIM; row++) {
        for (size_t col = 0; col < DIM; col++) {
            full_t *cell = &C_full[row][col];
            *cell = D[row][col];
            for (size_t inner = 0; inner < DIM; inner++) {
                *cell += A[row][inner]*B[col][inner];
            }
        }
    }
}
// Reference DIM x DIM product with both operands read transposed:
// C_full = D + A^T * B^T, accumulated in full_t.
static void matmul_AB_transposed(elem_t A[DIM][DIM], elem_t B[DIM][DIM], elem_t D[DIM][DIM], full_t C_full[DIM][DIM]) {
    for (size_t row = 0; row < DIM; row++) {
        for (size_t col = 0; col < DIM; col++) {
            full_t *cell = &C_full[row][col];
            *cell = D[row][col];
            for (size_t inner = 0; inner < DIM; inner++) {
                *cell += A[inner][row]*B[col][inner];
            }
        }
    }
}
// Reference DIM x DIM product with both operands read transposed:
// C = D + A^T * B^T, accumulated directly in elem_t.
static void matmul_short_AB_transposed(elem_t A[DIM][DIM], elem_t B[DIM][DIM], elem_t D[DIM][DIM], elem_t C[DIM][DIM]) {
    for (size_t row = 0; row < DIM; row++) {
        for (size_t col = 0; col < DIM; col++) {
            elem_t *cell = &C[row][col];
            *cell = D[row][col];
            for (size_t inner = 0; inner < DIM; inner++) {
                *cell += A[inner][row]*B[col][inner];
            }
        }
    }
}
// Reference DIM x DIM product with both operands read transposed and a
// full_t bias: C_full = D + A^T * B^T.
static void matmul_full_AB_transposed(elem_t A[DIM][DIM], elem_t B[DIM][DIM], full_t D[DIM][DIM], full_t C_full[DIM][DIM]) {
    for (size_t row = 0; row < DIM; row++) {
        for (size_t col = 0; col < DIM; col++) {
            full_t *cell = &C_full[row][col];
            *cell = D[row][col];
            for (size_t inner = 0; inner < DIM; inner++) {
                *cell += A[inner][row]*B[col][inner];
            }
        }
    }
}
// Element-wise sum of two DIM x DIM matrices: sum = m1 + m2.
static void matadd(full_t sum[DIM][DIM], full_t m1[DIM][DIM], full_t m2[DIM][DIM]) {
    for (size_t row = 0; row < DIM; row++) {
        for (size_t col = 0; col < DIM; col++) {
            sum[row][col] = m1[row][col] + m2[row][col];
        }
    }
}
// Applies ROUNDING_RIGHT_SHIFT to every element of `full` and stores the
// result in `out`. For non-float element types the value is saturated to
// [elem_t_min, elem_t_max] before the store; float builds store it unclamped.
static void matshift(full_t full[DIM][DIM], elem_t out[DIM][DIM], int shift) {
    for (size_t row = 0; row < DIM; row++) {
        for (size_t col = 0; col < DIM; col++) {
            // Shift and round the element
            full_t shifted = ROUNDING_RIGHT_SHIFT(full[row][col], shift);

            // Saturate and cast the element
#ifndef ELEM_T_IS_FLOAT
            full_t elem = shifted;
            if (shifted > elem_t_max) {
                elem = elem_t_max;
            } else if (shifted < elem_t_min) {
                elem = elem_t_min;
            }
            out[row][col] = elem;
#else
            out[row][col] = shifted; // TODO should we also saturate when using floats?
#endif
        }
    }
}
// Applies ACC_SCALE with `scale` to every element of `full` and stores the
// result in `out`. For non-float element types the value is saturated to
// [elem_t_min, elem_t_max] before the store; float builds store it unclamped.
static void matscale(full_t full[DIM][DIM], elem_t out[DIM][DIM], acc_scale_t scale) {
    for (size_t row = 0; row < DIM; row++) {
        for (size_t col = 0; col < DIM; col++) {
            // Scale the element
            full_t scaled = ACC_SCALE(full[row][col], scale);

            // Saturate and cast the element
#ifndef ELEM_T_IS_FLOAT
            full_t elem = scaled;
            if (scaled > elem_t_max) {
                elem = elem_t_max;
            } else if (scaled < elem_t_min) {
                elem = elem_t_min;
            }
            out[row][col] = elem;
#else
            out[row][col] = scaled; // TODO should we also saturate when using floats?
#endif
        }
    }
}
// Element-wise ReLU: out = in where in > 0, otherwise 0.
static void matrelu(elem_t in[DIM][DIM], elem_t out[DIM][DIM]) {
    for (size_t row = 0; row < DIM; row++) {
        for (size_t col = 0; col < DIM; col++) {
            if (in[row][col] > 0) {
                out[row][col] = in[row][col];
            } else {
                out[row][col] = 0;
            }
        }
    }
}
// Writes the transpose of `in` into `out`: out[c][r] = in[r][c].
static void transpose(elem_t in[DIM][DIM], elem_t out[DIM][DIM]) {
    for (size_t src_row = 0; src_row < DIM; src_row++) {
        for (size_t src_col = 0; src_col < DIM; src_col++) {
            out[src_col][src_row] = in[src_row][src_col];
        }
    }
}
// Deterministic pseudo-random generator: a 32-bit linear congruential
// generator seeded with 777 whose top 8 bits are returned, so every result is
// in [0, 255]. The state is a function-local static, shared by all callers.
int rand() {
    static uint32_t state = 777;
    const uint32_t multiplier = 1664525;
    const uint32_t increment = 1013904223;

    state = state * multiplier + increment;
    return state >> 24;
}
#ifdef ELEM_T_IS_FLOAT
// Returns the difference of two random ratios. Each ratio is
// (rand() % 128) / (1 + rand() % 64), so the result lies in [-127, 127].
// The four rand() draws are taken in separate statements: inside a single
// division the order in which the two operands are evaluated is unspecified,
// which would make the generated sequence depend on the compiler.
double rand_double() {
    const int num_a = rand() % 128;
    const int den_a = 1 + (rand() % 64);
    const int num_b = rand() % 128;
    const int den_b = 1 + (rand() % 64);

    double a = (double)num_a / (double)den_a;
    double b = (double)num_b / (double)den_b;
    return a - b;
}
#endif
// Prints a DIM x DIM matrix, one row per line with space-separated elements.
// Non-float builds print each element with %d; float builds print the value
// returned by elem_t_to_elem_t_bits() with %x.
static void printMatrix(elem_t m[DIM][DIM]) {
    for (size_t row = 0; row < DIM; ++row) {
        for (size_t col = 0; col < DIM; ++col) {
#ifndef ELEM_T_IS_FLOAT
            printf("%d ", m[row][col]);
#else
            printf("%x ", elem_t_to_elem_t_bits(m[row][col]));
#endif
        }
        printf("\n");
    }
}
// Prints a DIM x DIM accumulator matrix, one row per line with
// space-separated elements. Non-float builds print each element with %d;
// float builds print the value returned by acc_t_to_acc_t_bits() with %x.
// The switch is keyed on ELEM_T_IS_FLOAT, the same flag printMatrix uses.
static void printMatrixAcc(acc_t m[DIM][DIM]) {
    for (size_t row = 0; row < DIM; ++row) {
        for (size_t col = 0; col < DIM; ++col) {
#ifndef ELEM_T_IS_FLOAT
            printf("%d ", m[row][col]);
#else
            printf("%x ", acc_t_to_acc_t_bits(m[row][col]));
#endif
        }
        printf("\n");
    }
}
// Returns 1 when the two DIM x DIM matrices match element by element, else 0.
// In float builds two elements also match when elem_t_isnan() reports both
// as NaN.
static int is_equal(elem_t x[DIM][DIM], elem_t y[DIM][DIM]) {
    for (size_t row = 0; row < DIM; ++row) {
        for (size_t col = 0; col < DIM; ++col) {
#ifndef ELEM_T_IS_FLOAT
            const bool mismatch = x[row][col] != y[row][col];
#else
            const bool x_is_nan = elem_t_isnan(x[row][col]);
            const bool y_is_nan = elem_t_isnan(y[row][col]);
            const bool mismatch = x[row][col] != y[row][col] && !(x_is_nan && y_is_nan);
#endif
            if (mismatch) {
                return 0;
            }
        }
    }
    return 1;
}
// Returns 1 when x equals the transpose of y (x[i][j] == y[j][i] for every
// element), else 0. In float builds two elements also match when
// elem_t_isnan() reports both as NaN.
static int is_equal_transposed(elem_t x[DIM][DIM], elem_t y[DIM][DIM]) {
    for (size_t row = 0; row < DIM; ++row) {
        for (size_t col = 0; col < DIM; ++col) {
#ifndef ELEM_T_IS_FLOAT
            const bool mismatch = x[row][col] != y[col][row];
#else
            const bool x_is_nan = elem_t_isnan(x[row][col]);
            const bool y_is_nan = elem_t_isnan(y[col][row]);
            const bool mismatch = x[row][col] != y[col][row] && !(x_is_nan && y_is_nan);
#endif
            if (mismatch) {
                return 0;
            }
        }
    }
    return 1;
}
// Evaluates to 1 when the dim_i x dim_j matrices x and y are equal element by
// element (compared with !=), otherwise 0. Scanning stops at the first
// mismatch.
// This is a GNU extension known as statement expressions.
// The internal identifiers carry a mat_is_equal_ prefix and every argument is
// parenthesised, so arguments named i, j or result (and arguments that are
// compound expressions) are evaluated as the caller wrote them.
#define MAT_IS_EQUAL(dim_i, dim_j, x, y) \
    ({int mat_is_equal_result_ = 1; \
      for (size_t mat_is_equal_i_ = 0; \
           mat_is_equal_result_ && mat_is_equal_i_ < (dim_i); \
           mat_is_equal_i_++) \
        for (size_t mat_is_equal_j_ = 0; mat_is_equal_j_ < (dim_j); ++mat_is_equal_j_) { \
          if ((x)[mat_is_equal_i_][mat_is_equal_j_] != (y)[mat_is_equal_i_][mat_is_equal_j_]) { \
            mat_is_equal_result_ = 0; \
            break; \
          } \
        } \
      mat_is_equal_result_;})
// Returns the value produced by the RISC-V `rdcycle` instruction.
// The inline assembly makes this function RISC-V specific.
static uint64_t read_cycles() {
uint64_t cycles;
// volatile keeps the compiler from dropping or merging repeated reads.
asm volatile ("rdcycle %0" : "=r" (cycles));
return cycles;
// Disabled alternative kept for reference: read a memory-mapped 32-bit word
// at a fixed address instead of using rdcycle.
// const uint32_t * mtime = (uint32_t *)(33554432 + 0xbff8);
// const uint32_t * mtime = (uint32_t *)(33554432 + 0xbffc);
// return *mtime;
}
#undef abs
#endif // SRC_MAIN_C_GEMMINI_TESTUTILS_H

View File

@@ -0,0 +1,13 @@
// See LICENSE for license details.
#ifndef SRC_MAIN_C_TRANSLATOR_H
#define SRC_MAIN_C_TRANSLATOR_H
#include "rocc-software/src/xcustom.h"
// Value passed as the first ROCC_INSTRUCTION argument by doTranslate.
// NOTE(review): presumably the custom opcode slot of the translator
// accelerator - confirm against the hardware configuration.
#define XCUSTOM_TRANS 1
// Expands to one ROCC_INSTRUCTION statement (already terminated by ';') that
// forwards `y` and `vaddr` and passes 0 for the two remaining arguments.
#define doTranslate(y, vaddr) \
ROCC_INSTRUCTION(XCUSTOM_TRANS, y, vaddr, 0, 0);
#endif // SRC_MAIN_C_TRANSLATOR_H