#ifndef GEMMINI_NN_H #define GEMMINI_NN_H #include #include #include #ifndef BAREMETAL #include #endif #include "include/gemmini.h" #include "include/gemmini_testutils.h" struct ConvParams { int batch_size; int in_row_dim; int in_col_dim; int out_row_dim; int out_col_dim; int kernel_size; int in_channels; int out_channels; int in_stride; int weight_stride; int out_stride; int stride; int padding; bool bias; bool depthwise; int n_patches; int patch_size; acc_scale_t output_scale; scale_t res_scale; int pool_size, pool_stride, pool_padding, out_dim_pooled; int I, J, K; }; struct FcParams { int batch_size; int in_features; int out_features; acc_scale_t output_scale; bool bias; int I, J, K; }; #define HIST_IMAGES(IMAGES) \ for (int num = -128; num <= 127; num++) { \ int count = 0; \ for (int i = 0; i < sizeof(IMAGES)/sizeof(IMAGES[0]); i++) { \ for (int j = 0; j < sizeof(IMAGES[0])/sizeof(IMAGES[0][0]); j++) { \ for (int k = 0; k < sizeof(IMAGES[0][0])/sizeof(IMAGES[0][0][0]); k++) { \ for (int l = 0; l < sizeof(IMAGES[0][0][0])/sizeof(IMAGES[0][0][0][0]); l++) { \ if (IMAGES[i][j][k][l] == num) { \ count++; \ } \ } \ } \ } \ } \ if (count > 0) \ printf("%d: %d times\n", num, count); \ } #define HIST_MATRIX(MATRIX) \ for (int num = -128; num <= 127; num++) { \ int count = 0; \ for (int i = 0; i < sizeof(MATRIX)/sizeof(MATRIX[0]); i++) { \ for (int j = 0; j < sizeof(MATRIX[0])/sizeof(MATRIX[0][0]); j++) { \ if (MATRIX[i][j] == num) { \ count++; \ } \ } \ } \ if (count > 0) \ printf("%d: %d times\n", num, count); \ } // This function runs a tiled matrix multiplication, with explicit tiling // factors static void tiled_matmul_nn(size_t dim_I, size_t dim_J, size_t dim_K, const elem_t A[dim_I][dim_K], const elem_t B[dim_K][dim_J], const void * D, elem_t C[dim_I][dim_J], int act, acc_scale_t scale, bool repeating_bias, size_t tile_I, size_t tile_J, size_t tile_K, enum tiled_matmul_type_t tiled_matmul_type, bool check, char * layer_name) { if (check) printf("%s: gemmini\n", layer_name); tiled_matmul(dim_I, dim_J, dim_K, (elem_t*)A, (elem_t*)B, D, (elem_t*)C, dim_K, dim_J, dim_J, dim_J, MVIN_SCALE_IDENTITY, MVIN_SCALE_IDENTITY, MVIN_SCALE_IDENTITY, act, scale, 0, repeating_bias, tile_I, tile_J, tile_K, false, false, false, false, 0, tiled_matmul_type); if (check) { printf("%s: CPU\n", layer_name); elem_t gold[dim_I][dim_J]; tiled_matmul_auto(dim_I, dim_J, dim_K, (elem_t*)A, (elem_t*)B, D, (elem_t*)gold, dim_K, dim_J, dim_J, dim_J, MVIN_SCALE_IDENTITY, MVIN_SCALE_IDENTITY, MVIN_SCALE_IDENTITY, act, scale, 0, repeating_bias, false, false, false, false, 0, CPU); if (!MAT_IS_EQUAL(dim_I, dim_J, C, gold)) { printf("Layer calculated incorrectly: %s\n", layer_name); exit(1); } } } // This function runs a tiled matrix multiplication, with automatically // calculated tiling factors // With default auto-stride calc (A_stride = dim_K, B_stride/C_stride/D_stride = dim_J) static void tiled_matmul_nn_auto(size_t dim_I, size_t dim_J, size_t dim_K, const elem_t A[dim_I][dim_K], const elem_t B[dim_K][dim_J], const void * D, elem_t C[dim_I][dim_J], int act, acc_scale_t scale, bool repeating_bias, enum tiled_matmul_type_t tiled_matmul_type, bool check, char * layer_name) { if (check) printf("%s: gemmini\n", layer_name); tiled_matmul_auto(dim_I, dim_J, dim_K, (elem_t*)A, (elem_t*)B, D, (elem_t*)C, dim_K, dim_J, dim_J, dim_J, MVIN_SCALE_IDENTITY, MVIN_SCALE_IDENTITY, MVIN_SCALE_IDENTITY, act, scale, 0, repeating_bias, false, false, false, false, 0, tiled_matmul_type); if (check) { printf("%s: CPU\n", layer_name); elem_t gold[dim_I][dim_J]; tiled_matmul_auto(dim_I, dim_J, dim_K, (elem_t*)A, (elem_t*)B, D, (elem_t*)gold, dim_K, dim_J, dim_J, dim_J, MVIN_SCALE_IDENTITY, MVIN_SCALE_IDENTITY, MVIN_SCALE_IDENTITY, act, scale, 0, repeating_bias, false, false, false, false, 0, CPU); if (!MAT_IS_EQUAL(dim_I, dim_J, C, gold)) { printf("Layer calculated incorrectly: %s\n", layer_name); exit(1); } } } // need to specify stride // auto tiling calc static void tiled_matmul_nn_stride_auto(size_t dim_I, size_t dim_J, size_t dim_K, const size_t A_stride, const size_t B_stride, const size_t C_stride, const elem_t * A, const elem_t * B, const void * D, const elem_t * C, int act, acc_scale_t scale, bool repeating_bias, enum tiled_matmul_type_t tiled_matmul_type) { tiled_matmul_auto(dim_I, dim_J, dim_K, (elem_t*)A, (elem_t*)B, D, (elem_t*)C, A_stride, B_stride, C_stride, C_stride, MVIN_SCALE_IDENTITY, MVIN_SCALE_IDENTITY, MVIN_SCALE_IDENTITY, act, scale, 0, repeating_bias, false, false, false, false, 0, tiled_matmul_type); } static void conv_dw(size_t I, size_t J, const size_t batch_size, const size_t channels, const size_t in_row_dim, const size_t in_col_dim, const size_t out_row_dim, const size_t out_col_dim, const size_t kernel_size, const elem_t input[batch_size][in_row_dim][in_col_dim][channels], const elem_t weight[channels][kernel_size][kernel_size], const acc_t * bias, // elem_t output [batch_size][out_row_dim][out_col_dim][channels], elem_t output [I][J], const struct ConvParams * params) { for (int batch = 0; batch < batch_size; batch++) { for (int channel = 0; channel < channels; channel++) { for (int out_row = 0; out_row < out_row_dim; out_row++) { for (int out_col = 0; out_col < out_col_dim; out_col++) { int in_row = out_row * params->stride - params->padding; acc_t result = 0; if (params->bias) { result = bias[channel]; } for (int kernel_row = 0; kernel_row < params->kernel_size; kernel_row++) { int in_col = out_col * params->stride - params->padding; for (int kernel_col = 0; kernel_col < params->kernel_size; kernel_col++) { if (in_row >= 0 && in_row < params->in_row_dim && in_col >= 0 && in_col < params->in_col_dim) { result += input[batch][in_row][in_col][channel] * weight[channel][kernel_row][kernel_col]; } in_col++; } in_row++; } if (result < 0) { result = 0; } acc_t scaled = ACC_SCALE(result, params->output_scale); if (scaled > elem_t_max) { scaled = elem_t_max; } else if (scaled < elem_t_min) { scaled = elem_t_min; } size_t r = batch * params->out_row_dim * params->out_col_dim + out_row * params->out_col_dim + out_col; output[r][channel] = scaled; // output[batch][out_row][out_col][channel] = scaled; } } } } } static void conv_dw_with_col2im(size_t prev_I, size_t prev_J, size_t I, size_t J, const size_t batch_size, const size_t channels, const size_t out_row_dim, const size_t out_col_dim, const size_t kernel_size, const elem_t input[prev_I][prev_J], const elem_t weight[channels][kernel_size][kernel_size], const acc_t * bias, // elem_t output [batch_size][out_dim][out_dim][channels], elem_t output [I][J], const struct ConvParams * params) { for (int batch = 0; batch < batch_size; batch++) { for (int channel = 0; channel < channels; channel++) { for (int out_row = 0; out_row < out_row_dim; out_row++) { for (int out_col = 0; out_col < out_col_dim; out_col++) { int in_row = out_row * params->stride - params->padding; acc_t result = 0; if (params->bias) { result = bias[channel]; } for (int kernel_row = 0; kernel_row < params->kernel_size; kernel_row++) { int in_col = out_col * params->stride - params->padding; for (int kernel_col = 0; kernel_col < params->kernel_size; kernel_col++) { if (in_row >= 0 && in_row < params->in_row_dim && in_col >= 0 && in_col < params->in_col_dim) { // result += input[batch][in_row][in_col][channel] * weight[channel][kernel_row][kernel_col]; size_t r = batch * params->in_row_dim * params->in_col_dim + in_row * params->in_col_dim + in_col; result += input[r][channel] * weight[channel][kernel_row][kernel_col]; } in_col++; } in_row++; } if (result < 0) { result = 0; } acc_t scaled = ACC_SCALE(result, params->output_scale); if (scaled > elem_t_max) { scaled = elem_t_max; } else if (scaled < elem_t_min) { scaled = elem_t_min; } size_t r = batch * params->out_row_dim * params->out_col_dim + out_row * params->out_col_dim + out_col; output[r][channel] = scaled; // output[batch][out_row][out_col][channel] = scaled; } } } } } static void im2col(size_t batch_size, size_t channels, size_t im_row_dim, size_t im_col_dim, size_t I, size_t K, const elem_t input[batch_size][im_row_dim][im_col_dim][channels], elem_t output[I][K], const struct ConvParams * params) { int patch_row = 0; for (int n_batch = 0; n_batch < params->batch_size; n_batch++) { for (int im_row = -params->padding; im_row < params->in_row_dim - params->kernel_size + params->padding + 1; im_row += params->stride) { for (int im_col = -params->padding; im_col < params->in_col_dim - params->kernel_size + params->padding + 1; im_col += params->stride) { int patch_col = 0; for (int filter_row = 0; filter_row < params->kernel_size; filter_row++) { for (int filter_col = 0; filter_col < params->kernel_size; filter_col++) { for (int im_channel = 0; im_channel < params->in_channels; im_channel++) { int pixel_row = im_row + filter_row; int pixel_col = im_col + filter_col; if (pixel_row < 0 || pixel_row >= params->in_row_dim || pixel_col < 0 || pixel_col >= params->in_col_dim) { // output[patch_row][patch_col] = 0; } else { output[patch_row][patch_col] = input[n_batch][pixel_row][pixel_col][im_channel]; } patch_col++; } } } patch_row++; } } } } static void im2col_with_col2im(size_t prev_I, size_t prev_J, size_t next_I, size_t next_K, const elem_t input[prev_I][prev_J], elem_t output[next_I][next_K], const struct ConvParams * params) { int out_row = 0; for (int n_batch = 0; n_batch < params->batch_size; n_batch++) { for (int im_row = -params->padding; im_row < params->in_row_dim - params->kernel_size + params->padding + 1; im_row += params->stride) { for (int im_col = -params->padding; im_col < params->in_col_dim - params->kernel_size + params->padding + 1; im_col += params->stride) { int out_col = 0; for (int filter_row = 0; filter_row < params->kernel_size; filter_row++) { for (int filter_col = 0; filter_col < params->kernel_size; filter_col++) { for (int im_channel = 0; im_channel < params->in_channels; im_channel++) { int pixel_row = im_row + filter_row; int pixel_col = im_col + filter_col; if (pixel_row < 0 || pixel_row >= params->in_row_dim || pixel_col < 0 || pixel_col >= params->in_col_dim) { // output[out_row][out_col] = 0; } else { int in_row = n_batch * params->in_row_dim * params->in_col_dim + pixel_row * params->in_col_dim + pixel_col; int in_col = im_channel; output[out_row][out_col] = input[in_row][in_col]; } out_col++; } } } out_row++; } } } } // Compute C = A + B with saturating add void vecadd(size_t len, const elem_t * A, const elem_t * B, elem_t * C, scale_t A_shift) { for (size_t i = 0; i < len; i++) { acc_t result = MVIN_SCALE(A[i], A_shift) + B[i]; if (result > elem_t_max) { result = elem_t_max; } else if (result < elem_t_min) { result = elem_t_min; } C[i] = result; } } void resadd1(const size_t batch_size, const size_t channels, const size_t im_dim, const elem_t A[batch_size][im_dim][im_dim][channels], const elem_t B[batch_size][im_dim][im_dim][channels], elem_t C[batch_size][im_dim][im_dim][channels], bool relu, const struct ConvParams * params) { const int minimum = relu ? 0 : elem_t_min; for (size_t batch = 0; batch < params->batch_size; batch++) { for (size_t row = 0; row < params->out_dim_pooled; row++) { for (size_t col = 0; col < params->out_dim_pooled; col++) { for (size_t channel = 0; channel < params->out_channels; channel++) { acc_t result = MVIN_SCALE(A[batch][row][col][channel], params->res_scale) + B[batch][row][col][channel]; if (result > elem_t_max) { result = elem_t_max; } else if (result < minimum) { result = minimum; } C[batch][row][col][channel] = result; } } } } } void resadd2(const size_t I, const size_t J, const size_t batch_size, const size_t channels, const size_t im_dim, const elem_t A[I][J], const elem_t B[batch_size][im_dim][im_dim][channels], elem_t C[batch_size][im_dim][im_dim][channels], bool relu, const struct ConvParams * params) { const int minimum = relu ? 0 : elem_t_min; for (size_t batch = 0; batch < params->batch_size; batch++) { for (size_t row = 0; row < params->out_dim_pooled; row++) { for (size_t col = 0; col < params->out_dim_pooled; col++) { for (size_t channel = 0; channel < params->out_channels; channel++) { size_t r = batch * params->out_dim_pooled * params->out_dim_pooled + row * params->out_dim_pooled + col; acc_t result = MVIN_SCALE(A[r][channel], params->res_scale) + B[batch][row][col][channel]; if (result > elem_t_max) { result = elem_t_max; } else if (result < minimum) { result = minimum; } C[batch][row][col][channel] = result; } } } } } void resadd3(const size_t I, const size_t J, const elem_t A[I][J], const elem_t B[I][J], elem_t C[I][J], bool relu, const struct ConvParams * params) { const int minimum = relu ? 0 : elem_t_min; for (size_t batch = 0; batch < params->batch_size; batch++) { for (size_t row = 0; row < params->out_dim_pooled; row++) { for (size_t col = 0; col < params->out_dim_pooled; col++) { for (size_t channel = 0; channel < params->out_channels; channel++) { size_t r = batch * params->out_dim_pooled * params->out_dim_pooled + row * params->out_dim_pooled + col; acc_t result = MVIN_SCALE(A[r][channel], params->res_scale) + B[r][channel]; if (result > elem_t_max) { result = elem_t_max; } else if (result < minimum) { result = minimum; } C[r][channel] = result; } } } } } // Pooling void pool(size_t batch_size, size_t channels, size_t in_row_dim, size_t in_col_dim, size_t out_row_dim, size_t out_col_dim, elem_t input[batch_size][in_row_dim][in_col_dim][channels], elem_t output[batch_size][out_row_dim][out_col_dim][channels], const struct ConvParams * params) { size_t kernel_size = params->pool_size; size_t stride = params->pool_stride; // size_t in_dim = params->out_dim; size_t padding = params->pool_padding; for (int batch = 0; batch < batch_size; batch++) { for (int channel = 0; channel < channels; channel++) { for (int out_row = 0; out_row < out_row_dim; out_row++) { for (int out_col = 0; out_col < out_col_dim; out_col++) { int in_row = out_row * stride - padding; elem_t result = elem_t_min; for (int kernel_row = 0; kernel_row < kernel_size; kernel_row++) { int in_col = out_col * stride - padding; for (int kernel_col = 0; kernel_col < kernel_size; kernel_col++) { if (in_row >= 0 && in_row < in_row_dim && in_col >= 0 && in_col < in_col_dim) { if (input[batch][in_row][in_col][channel] > result) { result = input[batch][in_row][in_col][channel]; } } else if (0 > result) { result = 0; } in_col++; } in_row++; } output[batch][out_row][out_col][channel] = result; } } } } } void pool_with_col2im(size_t I, size_t J, size_t batch_size, size_t channels, size_t out_row_dim, size_t out_col_dim, elem_t input[I][J], elem_t output[batch_size][out_row_dim][out_col_dim][channels], const struct ConvParams * params) { size_t kernel_size = params->pool_size; size_t stride = params->pool_stride; size_t in_row_dim = params->out_row_dim; size_t in_col_dim = params->out_col_dim; size_t padding = params->pool_padding; for (int batch = 0; batch < batch_size; batch++) { for (int channel = 0; channel < channels; channel++) { for (int out_row = 0; out_row < out_row_dim; out_row++) { for (int out_col = 0; out_col < out_col_dim; out_col++) { int in_row = out_row * stride - padding; elem_t result = elem_t_min; for (int kernel_row = 0; kernel_row < kernel_size; kernel_row++) { int in_col = out_col * stride - padding; for (int kernel_col = 0; kernel_col < kernel_size; kernel_col++) { if (in_row >= 0 && in_row < in_row_dim && in_col >= 0 && in_col < in_col_dim) { if (input[batch * in_row_dim * in_col_dim + in_row * in_col_dim + in_col][channel] > result) { result = input[batch * in_row_dim * in_col_dim + in_row * in_col_dim + in_col][channel]; } } else if (0 > result) { result = 0; } in_col++; } in_row++; } output[batch][out_row][out_col][channel] = result; } } } } } #endif // GEMMINI_NN_H