diff --git a/tests/regression/sgemm_gemmini_dma/generate_operands.py b/tests/regression/sgemm_gemmini_dma/generate_operands.py
new file mode 100644
index 00000000..a0c0e68a
--- /dev/null
+++ b/tests/regression/sgemm_gemmini_dma/generate_operands.py
@@ -0,0 +1,35 @@
+import numpy as np
+
+# Return a size x size matrix of samples drawn uniformly from [0, 1) and then cast to float16
+def generate_fp16_matrix(size):
+    return np.random.rand(size, size).astype(np.float16)
+
+# Write the matrix's raw element bytes to file_name via ndarray.tofile (no shape or dtype header is stored)
+def save_matrix_to_bin(file_name, matrix):
+    matrix.tofile(file_name)
+
+# Multiply the first size*size elements of each operand's flattened (row-major) data, viewed as size x size; returns float16
+def truncated_matrix_multiplication(matrix_a, matrix_b, size):
+    truncated_a = matrix_a.flatten()[:size * size].reshape(size, size)  # flat prefix; not the top-left sub-block when the input is wider than size
+    truncated_b = matrix_b.flatten()[:size * size].reshape(size, size)  # same flat-prefix view for the second operand
+    result = np.matmul(truncated_a, truncated_b)
+    return result.astype(np.float16)
+
+# Generate two random 512x512 float16 operand matrices (no seed is set, so values differ between runs)
+size = 512
+matrix_a = generate_fp16_matrix(size)
+matrix_b = generate_fp16_matrix(size)
+
+# Operand matrices would be written to binary files here; the writes are currently commented out
+# save_matrix_to_bin("input.a.bin", matrix_a)
+# save_matrix_to_bin("input.b.bin", matrix_b)
+
+# Compute and print the reference products for the 128x128, 256x256, and 512x512 sizes (saving is commented out)
+sizes = [128, 256, 512]
+for s in sizes:
+    ref_matrix = truncated_matrix_multiplication(matrix_a, matrix_b, s)
+    print(ref_matrix)
+    # save_matrix_to_bin(f"ref{s}.bin", ref_matrix)
+
+print("All files generated successfully.")  # NOTE(review): no file is written while the save calls above stay commented out
+
diff --git a/tests/regression/sgemm_gemmini_dma/kernel.cpp b/tests/regression/sgemm_gemmini_dma/kernel.cpp
index c9e38ab0..89e3600c 100644
--- a/tests/regression/sgemm_gemmini_dma/kernel.cpp
+++ b/tests/regression/sgemm_gemmini_dma/kernel.cpp
@@ -42,7 +42,10 @@
 #define PRINTF(...) sprintf(PRINT_BUF, __VA_ARGS__)
 // #define PRINTF(...) vx_printf(__VA_ARGS__)
 #define SWISH(beta, x) ((x) / (1 + exp(-(beta) * (x))))
-#define POWER
+// #define POWER
+
+typedef uint16_t smem_elem_t;  // 16-bit element type used for the A/B/C operand pointers in thread_block_matmul_gemmini
+// typedef float smem_elem_t;
 
 inline void threadblock_barrier(unsigned int barrier_id, unsigned int count) {
   vx_fence();
@@ -53,9 +56,9 @@ void thread_block_matmul_gemmini(kernel_arg_t *__UNIFORM__ arg,
                                  const uint32_t threadblock_id,
                                  const uint32_t tid_in_threadblock) {
   asm volatile ("matmul_start_%=:" :: );
-  const float * const A = (const float * const) arg->addr_a;
-  const float * const B = (const float * const) arg->addr_b;
-  float * const C = (float * const) arg->addr_c;
+  const smem_elem_t * const A = (const smem_elem_t * const) arg->addr_a;
+  const smem_elem_t * const B = (const smem_elem_t * const) arg->addr_b;
+  smem_elem_t * const C = (smem_elem_t * const) arg->addr_c;
 
   if (HW_TID() == 0) {
     gemmini_extended_config_ex(WEIGHT_STATIONARY, 0, 0, 1, 0, 0);
@@ -80,11 +83,13 @@ void thread_block_matmul_gemmini(kernel_arg_t *__UNIFORM__ arg,
 
   const uint32_t num_tile_rows_per_tb = num_tiles_m / NUM_CLUSTERS;
 
+  constexpr scale_t MVIN_SCALE_IDENTITY_HEX = 0x3c00;
+
   if (HW_TID() == 0) {
-    gemmini_extended3_config_ld(dim_k * sizeof(elem_t), MVIN_SCALE_IDENTITY, false, 0);
-    gemmini_extended3_config_ld(dim_n * sizeof(elem_t), MVIN_SCALE_IDENTITY, false, 1);
+    gemmini_extended3_config_ld(dim_k * sizeof(elem_t), MVIN_SCALE_IDENTITY_HEX, false, 0);
+    gemmini_extended3_config_ld(dim_n * sizeof(elem_t), MVIN_SCALE_IDENTITY_HEX, false, 1);
     // gemmini_extended3_config_ld(repeating_bias ? 0 : (stride_D * sizeof_D), D_scale_factor, low_D, 2);
-    gemmini_extended_config_st(dim_n * sizeof(elem_t), 0, MVIN_SCALE_IDENTITY);
+    gemmini_extended_config_st(dim_n * sizeof(elem_t), 0, MVIN_SCALE_IDENTITY_HEX);
     // gemmini_extended_config_st(stride_C * sizeof_C, act & 3, scale);
   }
 
@@ -130,7 +135,7 @@ void thread_block_matmul_gemmini(kernel_arg_t *__UNIFORM__ arg,
 
       // // move out to dram
       // if (HW_TID() == 0) {
-        float * const dram_c_tile_start = C + tile_i * TILE_M * dim_n + tile_j * TILE_N;
+        smem_elem_t * const dram_c_tile_start = C + tile_i * TILE_M * dim_n + tile_j * TILE_N;
         ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, 0, BOUND_INST, k_LOOP_WS_CONFIG_BOUNDS)
         ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, 0, (uint64_t) dram_c_tile_start, k_LOOP_WS_CONFIG_ADDRS_DC)
         ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, 0, dim_n, k_LOOP_WS_CONFIG_STRIDES_DC)
@@ -150,7 +155,8 @@ void thread_block_matmul_gemmini(kernel_arg_t *__UNIFORM__ arg,
         PRINTF("total cycles:         %d\n", marker1 - marker0);
         for (int i = 0; i < dim_m; i += 8) {
           for (int j = 0; j < dim_n; j += 8) {
-            PRINTF("%d %d ", (int) (C[i * dim_n + j]), (int) (C[i * dim_n + j + 4]));
+            // PRINTF("%d %d ", (int) (C[i * dim_n + j]), (int) (C[i * dim_n + j + 4]));
+            PRINTF("%04x %04x ", (int) (C[i * dim_n + j]), (int) (C[i * dim_n + j + 4]));
           }
           PRINTF("\n");
         }
@@ -181,4 +187,4 @@ int main() {
   vx_spawn_tasks_contiguous(grid_size, (vx_spawn_tasks_cb)kernel_body, arg);
 #endif
   return 0;
-}
\ No newline at end of file
+}