From 377466ed1ce7da0bb92ba996380b6541cb5bca30 Mon Sep 17 00:00:00 2001
From: Blaise Tine <tinebp@yahoo.com>
Date: Sun, 5 Sep 2021 21:01:52 -0700
Subject: [PATCH] fpu area optimization

---
 hw/rtl/VX_types.vh            |  2 +-
 hw/rtl/fp_cores/VX_fp_class.v | 28 +++++++++++
 hw/rtl/fp_cores/VX_fp_cvt.v   | 86 ++++++++++++++++---------------
 hw/rtl/fp_cores/VX_fp_ncomp.v | 59 +++++++++++++---------
 hw/rtl/fp_cores/VX_fp_type.v  | 27 ----------
 hw/rtl/fp_cores/VX_fpu_fpga.v |  2 +-
 hw/rtl/libs/VX_find_first.v   | 61 ++++++++++++++++++++++
 hw/rtl/libs/VX_lzc.v          | 95 +++++++----------------------------
 8 files changed, 186 insertions(+), 174 deletions(-)
 create mode 100644 hw/rtl/fp_cores/VX_fp_class.v
 delete mode 100644 hw/rtl/fp_cores/VX_fp_type.v
 create mode 100644 hw/rtl/libs/VX_find_first.v

diff --git a/hw/rtl/VX_types.vh b/hw/rtl/VX_types.vh
index 785ce444..4654daae 100644
--- a/hw/rtl/VX_types.vh
+++ b/hw/rtl/VX_types.vh
@@ -11,7 +11,7 @@ typedef struct packed {
     logic is_nan;
     logic is_quiet;
     logic is_signaling;    
-} fp_type_t;
+} fp_class_t;
 
 typedef struct packed {
     logic NV; // 4-Invalid
diff --git a/hw/rtl/fp_cores/VX_fp_class.v b/hw/rtl/fp_cores/VX_fp_class.v
new file mode 100644
index 00000000..d30247e5
--- /dev/null
+++ b/hw/rtl/fp_cores/VX_fp_class.v
@@ -0,0 +1,28 @@
+
+`include "VX_define.vh"
+
+// Decodes the raw exponent/mantissa fields of a floating-point operand into fp_class_t flags.
+module VX_fp_class #(
+    parameter MAN_BITS = 23,
+    parameter EXP_BITS = 8
+) (
+    input  [EXP_BITS-1:0] exp_i,
+    input  [MAN_BITS-1:0] man_i,
+    output fp_class_t     clss_o
+);
+    // Field-level predicates shared by the class flags below.
+    wire exp_is_zeros = (exp_i == '0);
+    wire exp_is_ones  = (exp_i == '1);
+    wire man_is_zeros = (man_i == '0);
+    wire nan          = exp_is_ones && !man_is_zeros;
+    wire snan         = nan && ~man_i[MAN_BITS-1]; // mantissa MSB clear => signaling
+
+    assign clss_o.is_normal    = !exp_is_zeros && !exp_is_ones;
+    assign clss_o.is_zero      = exp_is_zeros && man_is_zeros;
+    assign clss_o.is_subnormal = exp_is_zeros && !man_is_zeros;
+    assign clss_o.is_inf       = exp_is_ones && man_is_zeros;
+    assign clss_o.is_nan       = nan;
+    assign clss_o.is_quiet     = nan && !snan;
+    assign clss_o.is_signaling = snan;
+
+endmodule
\ No newline at end of file
diff --git a/hw/rtl/fp_cores/VX_fp_cvt.v b/hw/rtl/fp_cores/VX_fp_cvt.v
index 25e178c3..98cdded2 100644
--- a/hw/rtl/fp_cores/VX_fp_cvt.v
+++ b/hw/rtl/fp_cores/VX_fp_cvt.v
@@ -59,13 +59,16 @@ module VX_fp_cvt #(
     
     // Input processing
     
-    fp_type_t [LANES-1:0] in_a_type;
+    fp_class_t [LANES-1:0] fp_clss;
       
     for (genvar i = 0; i < LANES; ++i) begin
-        VX_fp_type fp_type (
+        VX_fp_class #( 
+            .EXP_BITS (EXP_BITS),
+            .MAN_BITS (MAN_BITS)
+        ) fp_class (
             .exp_i  (dataa[i][30:23]),
             .man_i  (dataa[i][22:0]),
-            .type_o (in_a_type[i])
+            .clss_o (fp_clss[i])
         );
     end
 
@@ -74,16 +77,19 @@ module VX_fp_cvt #(
     wire [LANES-1:0]                    input_sign;
     
     for (genvar i = 0; i < LANES; ++i) begin
+    `IGNORE_WARNINGS_BEGIN
         wire [INT_MAN_WIDTH-1:0] int_mantissa;
         wire [INT_MAN_WIDTH-1:0] fmt_mantissa;
         wire fmt_sign       = dataa[i][31];
         wire int_sign       = dataa[i][31] & is_signed;
         assign int_mantissa = int_sign ? (-dataa[i]) : dataa[i];
-        assign fmt_mantissa = INT_MAN_WIDTH'({in_a_type[i].is_normal, dataa[i][MAN_BITS-1:0]});            
-
-        assign fmt_exponent[i] = {1'b0, dataa[i][MAN_BITS+EXP_BITS-1:MAN_BITS]};
+        assign fmt_mantissa = INT_MAN_WIDTH'({fp_clss[i].is_normal, dataa[i][MAN_BITS-1:0]});
+        assign fmt_exponent[i] = {1'b0, dataa[i][MAN_BITS +: EXP_BITS]} +
+                                 {1'b0, fp_clss[i].is_subnormal} +
+                                 (FMT_SHIFT_COMPENSATION - EXP_BIAS);
         assign encoded_mant[i] = is_itof ? int_mantissa : fmt_mantissa;
         assign input_sign[i]   = is_itof ? int_sign : fmt_sign;
+    `IGNORE_WARNINGS_END
     end
 
     // Pipeline stage0
@@ -93,7 +99,7 @@ module VX_fp_cvt #(
     wire                    is_itof_s0;
     wire                    unsigned_s0;
     wire [2:0]              rnd_mode_s0;
-    fp_type_t [LANES-1:0]   in_a_type_s0;
+    fp_class_t [LANES-1:0]  fp_clss_s0;
     wire [LANES-1:0]        input_sign_s0;
     wire [LANES-1:0][INT_EXP_WIDTH-1:0] fmt_exponent_s0;
     wire [LANES-1:0][INT_MAN_WIDTH-1:0] encoded_mant_s0;
@@ -101,14 +107,14 @@ module VX_fp_cvt #(
     wire stall;
 
     VX_pipe_register #(
-        .DATAW  (1 + TAGW + 1 + `INST_FRM_BITS + 1 + LANES * ($bits(fp_type_t) + 1 + INT_EXP_WIDTH + INT_MAN_WIDTH)),
+        .DATAW  (1 + TAGW + 1 + `INST_FRM_BITS + 1 + LANES * ($bits(fp_class_t) + 1 + INT_EXP_WIDTH + INT_MAN_WIDTH)),
         .RESETW (1)
     ) pipe_reg0 (
         .clk      (clk),
         .reset    (reset),
         .enable   (~stall),
-        .data_in  ({valid_in,    tag_in,    is_itof,    !is_signed,  frm,         in_a_type,    input_sign,    fmt_exponent,    encoded_mant}),
-        .data_out ({valid_in_s0, tag_in_s0, is_itof_s0, unsigned_s0, rnd_mode_s0, in_a_type_s0, input_sign_s0, fmt_exponent_s0, encoded_mant_s0})
+        .data_in  ({valid_in,    tag_in,    is_itof,    !is_signed,  frm,         fp_clss,    input_sign,    fmt_exponent,    encoded_mant}),
+        .data_out ({valid_in_s0, tag_in_s0, is_itof_s0, unsigned_s0, rnd_mode_s0, fp_clss_s0, input_sign_s0, fmt_exponent_s0, encoded_mant_s0})
     );
     
     // Normalization
@@ -119,8 +125,8 @@ module VX_fp_cvt #(
     for (genvar i = 0; i < LANES; ++i) begin
         wire mant_is_nonzero;
         VX_lzc #(
-            .WIDTH (INT_MAN_WIDTH),
-            .MODE  (1)
+            .N    (INT_MAN_WIDTH),
+            .MODE (1)
         ) lzc (
             .in_i    (encoded_mant_s0[i]),
             .cnt_o   (renorm_shamt_s0[i]),
@@ -134,20 +140,12 @@ module VX_fp_cvt #(
     
     for (genvar i = 0; i < LANES; ++i) begin
     `IGNORE_WARNINGS_BEGIN
-        // Input mantissa needs to be normalized
-        wire [INT_EXP_WIDTH-1:0] fp_input_exp;
-        wire [INT_EXP_WIDTH-1:0] int_input_exp;
-
-        // Realign input mantissa, append zeroes if destination is wider
+        // Realign input mantissa, append zeroes if destination is wider
         assign input_mant_s0[i] = encoded_mant_s0[i] << renorm_shamt_s0[i];
 
         // Unbias exponent and compensate for shift
-        assign fp_input_exp = fmt_exponent_s0[i] + 
-                                {1'b0, in_a_type_s0[i].is_subnormal} + 
-                                    (FMT_SHIFT_COMPENSATION - EXP_BIAS) - 
-                                        {1'b0, renorm_shamt_s0[i]};
-                                 
-        assign int_input_exp = (INT_MAN_WIDTH-1) - {1'b0, renorm_shamt_s0[i]};
+        wire [INT_EXP_WIDTH-1:0] fp_input_exp = fmt_exponent_s0[i] - {1'b0, renorm_shamt_s0[i]};                                 
+        wire [INT_EXP_WIDTH-1:0] int_input_exp = (INT_MAN_WIDTH-1) - {1'b0, renorm_shamt_s0[i]};
 
         assign input_exp_s0[i] = is_itof_s0 ? int_input_exp : fp_input_exp;
     `IGNORE_WARNINGS_END
@@ -160,21 +158,21 @@ module VX_fp_cvt #(
     wire                    is_itof_s1;
     wire                    unsigned_s1;
     wire [2:0]              rnd_mode_s1;
-    fp_type_t [LANES-1:0]   in_a_type_s1;
+    fp_class_t [LANES-1:0]  fp_clss_s1;
     wire [LANES-1:0]        input_sign_s1;
     wire [LANES-1:0]        mant_is_zero_s1;
     wire [LANES-1:0][INT_MAN_WIDTH-1:0] input_mant_s1;
     wire [LANES-1:0][INT_EXP_WIDTH-1:0] input_exp_s1;
 
     VX_pipe_register #(
-        .DATAW  (1 + TAGW + 1 + `INST_FRM_BITS + 1 + LANES * ($bits(fp_type_t) + 1 + 1 + INT_MAN_WIDTH + INT_EXP_WIDTH)),
+        .DATAW  (1 + TAGW + 1 + `INST_FRM_BITS + 1 + LANES * ($bits(fp_class_t) + 1 + 1 + INT_MAN_WIDTH + INT_EXP_WIDTH)),
         .RESETW (1)
     ) pipe_reg1 (
         .clk      (clk),
         .reset    (reset),
         .enable   (~stall),
-        .data_in  ({valid_in_s0, tag_in_s0, is_itof_s0, unsigned_s0, rnd_mode_s0, in_a_type_s0, input_sign_s0, mant_is_zero_s0, input_mant_s0, input_exp_s0}),
-        .data_out ({valid_in_s1, tag_in_s1, is_itof_s1, unsigned_s1, rnd_mode_s1, in_a_type_s1, input_sign_s1, mant_is_zero_s1, input_mant_s1, input_exp_s1})
+        .data_in  ({valid_in_s0, tag_in_s0, is_itof_s0, unsigned_s0, rnd_mode_s0, fp_clss_s0, input_sign_s0, mant_is_zero_s0, input_mant_s0, input_exp_s0}),
+        .data_out ({valid_in_s1, tag_in_s1, is_itof_s1, unsigned_s1, rnd_mode_s1, fp_clss_s1, input_sign_s1, mant_is_zero_s1, input_mant_s1, input_exp_s1})
     );
 
     // Perform adjustments to mantissa and exponent
@@ -245,7 +243,7 @@ module VX_fp_cvt #(
     wire                    is_itof_s2;
     wire                    unsigned_s2;
     wire [2:0]              rnd_mode_s2;
-    fp_type_t [LANES-1:0]   in_a_type_s2;   
+    fp_class_t [LANES-1:0]  fp_clss_s2;   
     wire [LANES-1:0]        mant_is_zero_s2;
     wire [LANES-1:0]        input_sign_s2;
     wire [LANES-1:0][2*INT_MAN_WIDTH:0] destination_mant_s2;
@@ -253,14 +251,14 @@ module VX_fp_cvt #(
     wire [LANES-1:0]        of_before_round_s2;
     
     VX_pipe_register #(
-        .DATAW  (1 + TAGW + 1 + 1 + `INST_FRM_BITS + LANES * ($bits(fp_type_t) + 1 + 1 + (2*INT_MAN_WIDTH+1) + INT_EXP_WIDTH + 1)),
+        .DATAW  (1 + TAGW + 1 + 1 + `INST_FRM_BITS + LANES * ($bits(fp_class_t) + 1 + 1 + (2*INT_MAN_WIDTH+1) + INT_EXP_WIDTH + 1)),
         .RESETW (1)
     ) pipe_reg2 (
         .clk      (clk),
         .reset    (reset),
         .enable   (~stall),
-        .data_in  ({valid_in_s1, tag_in_s1, is_itof_s1, unsigned_s1, rnd_mode_s1, in_a_type_s1, mant_is_zero_s1, input_sign_s1, destination_mant_s1, final_exp_s1, of_before_round_s1}),
-        .data_out ({valid_in_s2, tag_in_s2, is_itof_s2, unsigned_s2, rnd_mode_s2, in_a_type_s2, mant_is_zero_s2, input_sign_s2, destination_mant_s2, final_exp_s2, of_before_round_s2})
+        .data_in  ({valid_in_s1, tag_in_s1, is_itof_s1, unsigned_s1, rnd_mode_s1, fp_clss_s1, mant_is_zero_s1, input_sign_s1, destination_mant_s1, final_exp_s1, of_before_round_s1}),
+        .data_out ({valid_in_s2, tag_in_s2, is_itof_s2, unsigned_s2, rnd_mode_s2, fp_clss_s2, mant_is_zero_s2, input_sign_s2, destination_mant_s2, final_exp_s2, of_before_round_s2})
     );
 
     wire [LANES-1:0]       rounded_sign;
@@ -314,7 +312,7 @@ module VX_fp_cvt #(
     wire [TAGW-1:0]         tag_in_s3;
     wire                    is_itof_s3;
     wire                    unsigned_s3;
-    fp_type_t [LANES-1:0]   in_a_type_s3;   
+    fp_class_t [LANES-1:0]  fp_clss_s3;   
     wire [LANES-1:0]        mant_is_zero_s3;
     wire [LANES-1:0]        input_sign_s3;
     wire [LANES-1:0]        rounded_sign_s3;
@@ -322,14 +320,14 @@ module VX_fp_cvt #(
     wire [LANES-1:0]        of_before_round_s3;
 
     VX_pipe_register #(
-        .DATAW  (1 + TAGW + 1 + 1 + LANES * ($bits(fp_type_t) + 1 + 1 + 32 + 1 + 1)),
+        .DATAW  (1 + TAGW + 1 + 1 + LANES * ($bits(fp_class_t) + 1 + 1 + 32 + 1 + 1)),
         .RESETW (1)
     ) pipe_reg3 (
         .clk      (clk),
         .reset    (reset),
         .enable   (~stall),
-        .data_in  ({valid_in_s2, tag_in_s2, is_itof_s2, unsigned_s2, in_a_type_s2, mant_is_zero_s2, input_sign_s2, rounded_abs,    rounded_sign,    of_before_round_s2}),
-        .data_out ({valid_in_s3, tag_in_s3, is_itof_s3, unsigned_s3, in_a_type_s3, mant_is_zero_s3, input_sign_s3, rounded_abs_s3, rounded_sign_s3, of_before_round_s3})
+        .data_in  ({valid_in_s2, tag_in_s2, is_itof_s2, unsigned_s2, fp_clss_s2, mant_is_zero_s2, input_sign_s2, rounded_abs,    rounded_sign,    of_before_round_s2}),
+        .data_out ({valid_in_s3, tag_in_s3, is_itof_s3, unsigned_s3, fp_clss_s3, mant_is_zero_s3, input_sign_s3, rounded_abs_s3, rounded_sign_s3, of_before_round_s3})
     );
      
     wire [LANES-1:0] of_after_round;
@@ -362,14 +360,14 @@ module VX_fp_cvt #(
 
     for (genvar i = 0; i < LANES; ++i) begin
         // Detect special case from source format, I2F casts don't produce a special result
-        assign fp_result_is_special[i] = ~is_itof_s3 & (in_a_type_s3[i].is_zero | in_a_type_s3[i].is_nan);
+        assign fp_result_is_special[i] = ~is_itof_s3 & (fp_clss_s3[i].is_zero | fp_clss_s3[i].is_nan);
 
         // Signalling input NaNs raise invalid flag, otherwise no flags set
-        assign fp_special_status[i] = in_a_type_s3[i].is_signaling ? {1'b1, 4'h0} : 5'h0;   // invalid operation
+        assign fp_special_status[i] = fp_clss_s3[i].is_signaling ? {1'b1, 4'h0} : 5'h0;   // invalid operation
 
         // Assemble result according to destination format
-        assign fp_special_result[i] = in_a_type_s3[i].is_zero ? (32'(input_sign_s3) << 31) // signed zero
-                                                              : {1'b0, QNAN_EXPONENT, QNAN_MANTISSA}; // qNaN
+        assign fp_special_result[i] = fp_clss_s3[i].is_zero ? (32'(input_sign_s3[i]) << 31) // signed zero, using this lane's own sign bit
+                                                            : {1'b0, QNAN_EXPONENT, QNAN_MANTISSA}; // qNaN
     end
 
     // INT Special case handling
@@ -381,7 +379,7 @@ module VX_fp_cvt #(
     for (genvar i = 0; i < LANES; ++i) begin
          // Assemble result according to destination format
         always @(*) begin
-            if (input_sign_s3[i] && !in_a_type_s3[i].is_nan) begin
+            if (input_sign_s3[i] && !fp_clss_s3[i].is_nan) begin
                 int_special_result[i][30:0] = 0;               // alone yields 2**(31)-1
                 int_special_result[i][31]   = ~unsigned_s3;    // for unsigned casts yields 2**31
             end else begin
@@ -391,8 +389,8 @@ module VX_fp_cvt #(
         end            
 
         // Detect special case from source format (inf, nan, overflow, nan-boxing or negative unsigned)
-        assign int_result_is_special[i] = in_a_type_s3[i].is_nan 
-                                        | in_a_type_s3[i].is_inf 
+        assign int_result_is_special[i] = fp_clss_s3[i].is_nan 
+                                        | fp_clss_s3[i].is_inf 
                                         | of_before_round_s3[i] 
                                         | (input_sign_s3[i] & unsigned_s3 & ~rounded_int_res_zero[i]);
                                         
@@ -411,11 +409,11 @@ module VX_fp_cvt #(
         wire [31:0] fp_result, int_result;
 
         wire inexact = is_itof_s3 ? (| fp_round_sticky_bits[i]) // overflow is invalid in i2f;        
-                                  : (| fp_round_sticky_bits[i]) | (~in_a_type_s3[i].is_inf & (of_before_round_s3[i] | of_after_round[i]));
+                                  : (| fp_round_sticky_bits[i]) | (~fp_clss_s3[i].is_inf & (of_before_round_s3[i] | of_after_round[i]));
                                   
         assign fp_regular_status.NV = is_itof_s3 & (of_before_round_s3[i] | of_after_round[i]); // overflow is invalid for I2F casts
         assign fp_regular_status.DZ = 1'b0; // no divisions
-        assign fp_regular_status.OF = ~is_itof_s3 & (~in_a_type_s3[i].is_inf & (of_before_round_s3[i] | of_after_round[i])); // inf casts no OF
+        assign fp_regular_status.OF = ~is_itof_s3 & (~fp_clss_s3[i].is_inf & (of_before_round_s3[i] | of_after_round[i])); // inf casts no OF
         assign fp_regular_status.UF = uf_after_round[i] & inexact;
         assign fp_regular_status.NX = inexact;
 
diff --git a/hw/rtl/fp_cores/VX_fp_ncomp.v b/hw/rtl/fp_cores/VX_fp_ncomp.v
index 11300a87..a96c05eb 100644
--- a/hw/rtl/fp_cores/VX_fp_ncomp.v
+++ b/hw/rtl/fp_cores/VX_fp_ncomp.v
@@ -30,6 +30,9 @@ module VX_fp_ncomp #(
     input wire  ready_out,
     output wire valid_out
 );  
+    localparam  EXP_BITS = 8;
+    localparam  MAN_BITS = 23;
+        
     localparam  NEG_INF     = 32'h00000001,
                 NEG_NORM    = 32'h00000002,
                 NEG_SUBNORM = 32'h00000004,
@@ -44,7 +47,7 @@ module VX_fp_ncomp #(
     wire [LANES-1:0]        tmp_a_sign, tmp_b_sign;
     wire [LANES-1:0][7:0]   tmp_a_exponent, tmp_b_exponent;
     wire [LANES-1:0][22:0]  tmp_a_mantissa, tmp_b_mantissa;
-    fp_type_t [LANES-1:0]   tmp_a_type, tmp_b_type;
+    fp_class_t [LANES-1:0]  tmp_a_clss, tmp_b_clss;
     wire [LANES-1:0]        tmp_a_smaller, tmp_ab_equal;
 
     // Setup
@@ -57,20 +60,26 @@ module VX_fp_ncomp #(
         assign tmp_b_exponent[i] = datab[i][30:23];
         assign tmp_b_mantissa[i] = datab[i][22:0];
 
-        VX_fp_type fp_type_a (
+        VX_fp_class #( 
+            .EXP_BITS (EXP_BITS),
+            .MAN_BITS (MAN_BITS)
+        ) fp_class_a (
             .exp_i  (tmp_a_exponent[i]),
             .man_i  (tmp_a_mantissa[i]),
-            .type_o (tmp_a_type[i])
+            .clss_o (tmp_a_clss[i])
         );
 
-        VX_fp_type fp_type_b (
+        VX_fp_class #( 
+            .EXP_BITS (EXP_BITS),
+            .MAN_BITS (MAN_BITS)
+        ) fp_class_b (
             .exp_i  (tmp_b_exponent[i]),
             .man_i  (tmp_b_mantissa[i]),
-            .type_o (tmp_b_type[i])
+            .clss_o (tmp_b_clss[i])
         );
 
         assign tmp_a_smaller[i] = $signed(dataa[i]) < $signed(datab[i]);
-        assign tmp_ab_equal[i]  = (dataa[i] == datab[i]) | (tmp_a_type[i].is_zero & tmp_b_type[i].is_zero);
+        assign tmp_ab_equal[i]  = (dataa[i] == datab[i]) | (tmp_a_clss[i].is_zero & tmp_b_clss[i].is_zero);
     end  
 
     // Pipeline stage0
@@ -83,41 +92,41 @@ module VX_fp_ncomp #(
     wire [LANES-1:0]        a_sign_s0, b_sign_s0;
     wire [LANES-1:0][7:0]   a_exponent_s0;
     wire [LANES-1:0][22:0]  a_mantissa_s0;
-    fp_type_t [LANES-1:0]   a_type_s0, b_type_s0;
+    fp_class_t [LANES-1:0]  a_clss_s0, b_clss_s0;
     wire [LANES-1:0]        a_smaller_s0, ab_equal_s0;
 
     wire stall;
 
     VX_pipe_register #(
-        .DATAW  (1 + TAGW + `INST_FPU_BITS + `INST_FRM_BITS + LANES * (2 * 32 + 1 + 1 + 8 + 23 + 2 * $bits(fp_type_t) + 1 + 1)),
+        .DATAW  (1 + TAGW + `INST_FPU_BITS + `INST_FRM_BITS + LANES * (2 * 32 + 1 + 1 + 8 + 23 + 2 * $bits(fp_class_t) + 1 + 1)),
         .RESETW (1),
         .DEPTH  (0)
     ) pipe_reg0 (
         .clk      (clk),
         .reset    (reset),
         .enable   (!stall),
-        .data_in  ({valid_in,    tag_in,    op_type,    frm,    dataa,    datab,    tmp_a_sign, tmp_b_sign, tmp_a_exponent, tmp_a_mantissa, tmp_a_type, tmp_b_type, tmp_a_smaller, tmp_ab_equal}),
-        .data_out ({valid_in_s0, tag_in_s0, op_type_s0, frm_s0, dataa_s0, datab_s0, a_sign_s0,  b_sign_s0,  a_exponent_s0,  a_mantissa_s0,  a_type_s0,  b_type_s0,  a_smaller_s0,  ab_equal_s0})
+        .data_in  ({valid_in,    tag_in,    op_type,    frm,    dataa,    datab,    tmp_a_sign, tmp_b_sign, tmp_a_exponent, tmp_a_mantissa, tmp_a_clss, tmp_b_clss, tmp_a_smaller, tmp_ab_equal}),
+        .data_out ({valid_in_s0, tag_in_s0, op_type_s0, frm_s0, dataa_s0, datab_s0, a_sign_s0,  b_sign_s0,  a_exponent_s0,  a_mantissa_s0,  a_clss_s0,  b_clss_s0,  a_smaller_s0,  ab_equal_s0})
     ); 
 
     // FCLASS
     reg [LANES-1:0][31:0] fclass_mask;  // generate a 10-bit mask for integer reg
     for (genvar i = 0; i < LANES; i++) begin
         always @(*) begin 
-            if (a_type_s0[i].is_normal) begin
+            if (a_clss_s0[i].is_normal) begin
                 fclass_mask[i] = a_sign_s0[i] ? NEG_NORM : POS_NORM;
             end 
-            else if (a_type_s0[i].is_inf) begin
+            else if (a_clss_s0[i].is_inf) begin
                 fclass_mask[i] = a_sign_s0[i] ? NEG_INF : POS_INF;
             end 
-            else if (a_type_s0[i].is_zero) begin
+            else if (a_clss_s0[i].is_zero) begin
                 fclass_mask[i] = a_sign_s0[i] ? NEG_ZERO : POS_ZERO;
             end 
-            else if (a_type_s0[i].is_subnormal) begin
+            else if (a_clss_s0[i].is_subnormal) begin
                 fclass_mask[i] = a_sign_s0[i] ? NEG_SUBNORM : POS_SUBNORM;
             end 
-            else if (a_type_s0[i].is_nan) begin
-                fclass_mask[i] = {22'h0, a_type_s0[i].is_quiet, a_type_s0[i].is_signaling, 8'h0};
+            else if (a_clss_s0[i].is_nan) begin
+                fclass_mask[i] = {22'h0, a_clss_s0[i].is_quiet, a_clss_s0[i].is_signaling, 8'h0};
             end 
             else begin                     
                 fclass_mask[i] = QUT_NAN;
@@ -129,11 +138,11 @@ module VX_fp_ncomp #(
     reg [LANES-1:0][31:0] fminmax_res;  // result of fmin/fmax
     for (genvar i = 0; i < LANES; i++) begin
         always @(*) begin
-            if (a_type_s0[i].is_nan && b_type_s0[i].is_nan)
+            if (a_clss_s0[i].is_nan && b_clss_s0[i].is_nan)
                 fminmax_res[i] = {1'b0, 8'hff, 1'b1, 22'd0}; // canonical qNaN
-            else if (a_type_s0[i].is_nan) 
+            else if (a_clss_s0[i].is_nan) 
                 fminmax_res[i] = datab_s0[i];
-            else if (b_type_s0[i].is_nan) 
+            else if (b_clss_s0[i].is_nan) 
                 fminmax_res[i] = dataa_s0[i];
             else begin 
                 case (frm_s0) // use LSB to distinguish MIN and MAX
@@ -166,7 +175,7 @@ module VX_fp_ncomp #(
             case (frm_s0)
                 `INST_FRM_RNE: begin // LE
                     fcmp_fflags[i] = 5'h0;
-                    if (a_type_s0[i].is_nan || b_type_s0[i].is_nan) begin
+                    if (a_clss_s0[i].is_nan || b_clss_s0[i].is_nan) begin
                         fcmp_res[i]       = 32'h0;
                         fcmp_fflags[i].NV = 1'b1;
                     end else begin
@@ -175,7 +184,7 @@ module VX_fp_ncomp #(
                 end
                 `INST_FRM_RTZ: begin // LS
                     fcmp_fflags[i] = 5'h0;
-                    if (a_type_s0[i].is_nan || b_type_s0[i].is_nan) begin
+                    if (a_clss_s0[i].is_nan || b_clss_s0[i].is_nan) begin
                         fcmp_res[i]       = 32'h0;
                         fcmp_fflags[i].NV = 1'b1;
                     end else begin
@@ -184,9 +193,9 @@ module VX_fp_ncomp #(
                 end
                 `INST_FRM_RDN: begin // EQ
                     fcmp_fflags[i] = 5'h0;
-                    if (a_type_s0[i].is_nan || b_type_s0[i].is_nan) begin
+                    if (a_clss_s0[i].is_nan || b_clss_s0[i].is_nan) begin
                         fcmp_res[i]       = 32'h0;
-                        fcmp_fflags[i].NV = a_type_s0[i].is_signaling | b_type_s0[i].is_signaling; 
+                        fcmp_fflags[i].NV = a_clss_s0[i].is_signaling | b_clss_s0[i].is_signaling; 
                     end else begin
                         fcmp_res[i] = {31'h0, ab_equal_s0[i]};
                     end
@@ -225,11 +234,11 @@ module VX_fp_ncomp #(
                         3,4: begin
                             tmp_result[i] = fminmax_res[i];
                             tmp_fflags[i] = 0;
-                            tmp_fflags[i].NV = a_type_s0[i].is_signaling | b_type_s0[i].is_signaling;
+                            tmp_fflags[i].NV = a_clss_s0[i].is_signaling | b_clss_s0[i].is_signaling;
                         end
                         //5,6,7: MOVE
                         default: begin
-                            tmp_result[i] = dataa[i];
+                            tmp_result[i] = dataa_s0[i];
                             tmp_fflags[i] = 'x;
                         end
                     endcase
diff --git a/hw/rtl/fp_cores/VX_fp_type.v b/hw/rtl/fp_cores/VX_fp_type.v
deleted file mode 100644
index bdc41b86..00000000
--- a/hw/rtl/fp_cores/VX_fp_type.v
+++ /dev/null
@@ -1,27 +0,0 @@
-
-`include "VX_define.vh"
-
-module VX_fp_type (
-    // inputs
-    input  [7:0]  exp_i,
-    input  [22:0] man_i,
-    // outputs
-    output fp_type_t type_o
-);
-    wire is_normal    = (exp_i != 8'd0) && (exp_i != 8'hff);
-    wire is_zero      = (exp_i == 8'd0) && (man_i == 23'd0);
-    wire is_subnormal = (exp_i == 8'd0) && (man_i != 23'd0);
-    wire is_inf       = (exp_i == 8'hff) && (man_i == 23'd0); 
-    wire is_nan       = (exp_i == 8'hff) && (man_i != 23'd0);
-    wire is_signaling = is_nan && (man_i[22] == 1'b0);
-    wire is_quiet     = is_nan && !is_signaling;
-
-    assign type_o.is_normal    = is_normal;
-    assign type_o.is_zero      = is_zero;
-    assign type_o.is_subnormal = is_subnormal;
-    assign type_o.is_inf       = is_inf;
-    assign type_o.is_nan       = is_nan;
-    assign type_o.is_quiet     = is_quiet;
-    assign type_o.is_signaling = is_signaling;
-
-endmodule
\ No newline at end of file
diff --git a/hw/rtl/fp_cores/VX_fpu_fpga.v b/hw/rtl/fp_cores/VX_fpu_fpga.v
index f07afb0d..029f8976 100644
--- a/hw/rtl/fp_cores/VX_fpu_fpga.v
+++ b/hw/rtl/fp_cores/VX_fpu_fpga.v
@@ -1,7 +1,7 @@
 `include "VX_define.vh"
 
 module VX_fpu_fpga #( 
-    parameter TAGW = 1
+    parameter TAGW = 4
 ) (
     input wire clk,
     input wire reset,
diff --git a/hw/rtl/libs/VX_find_first.v b/hw/rtl/libs/VX_find_first.v
new file mode 100644
index 00000000..ab724c7e
--- /dev/null
+++ b/hw/rtl/libs/VX_find_first.v
@@ -0,0 +1,61 @@
+`include "VX_platform.vh"
+// Priority selector: outputs the data_i entry paired with the first asserted valid_i bit.
+`TRACING_OFF
+module VX_find_first #(
+    parameter N       = 1, // number of entries
+    parameter DATAW   = 1, // payload width in bits
+    parameter REVERSE = 0, // 0: lowest valid index wins, nonzero: highest valid index wins
+    localparam LOGN   = $clog2(N) // number of levels in the selection tree
+) (
+    input  wire [N-1:0][DATAW-1:0] data_i,  // one DATAW-bit payload per entry
+    input  wire [N-1:0]            valid_i, // per-entry valid flags
+    output wire [DATAW-1:0]        data_o,  // payload of the selected entry (may be 'x padding when nothing is valid)
+    output wire                    valid_o  // OR of all valid_i bits
+);
+    if (N > 1) begin    
+        wire [N-1:0] valid_r;
+        wire [N-1:0][DATAW-1:0] data_r;
+        // Optionally mirror the inputs so the tree below can always favor the lower index.
+        for (genvar i = 0; i < N; ++i) begin
+            assign valid_r[i] = REVERSE ? valid_i[N-1-i] : valid_i[i];
+            assign data_r[i]  = REVERSE ? data_i[N-1-i] : data_i[i];
+        end
+        // Binary tree stored as flat arrays: level i occupies nodes [2**i-1 +: 2**i], node 0 is the root.
+    `IGNORE_WARNINGS_BEGIN
+        wire [2**LOGN-1:0]            s_n;
+        wire [2**LOGN-1:0][DATAW-1:0] d_n;       
+    `IGNORE_WARNINGS_END
+        // Last level (i == LOGN-1) pairs up the inputs; upper levels merge two child nodes, first child preferred.
+        for (genvar i = 0; i < LOGN; ++i) begin
+            if (i == (LOGN-1)) begin
+                for (genvar j = 0; j < 2**i; ++j) begin
+                    if ((j*2) < (N-1)) begin // both inputs of the pair exist
+                        assign s_n[2**i-1+j] = valid_r[j*2] | valid_r[j*2+1];
+                        assign d_n[2**i-1+j] = valid_r[j*2] ? data_r[j*2] : data_r[j*2+1];
+                    end
+                    if ((j*2) == (N-1)) begin // only the first input of the pair exists
+                        assign s_n[2**i-1+j] = valid_r[j*2];
+                        assign d_n[2**i-1+j] = data_r[j*2];
+                    end
+                    if ((j*2) > (N-1)) begin // pair lies entirely beyond input N-1
+                        assign s_n[2**i-1+j] = 0;
+                        assign d_n[2**i-1+j] = 'x; // s_n is 0 here, so the payload is left as don't-care
+                    end
+                end
+            end else begin
+                for (genvar j = 0; j < 2**i; ++j) begin
+                    assign s_n[2**i-1+j] = s_n[2**(i+1)-1+j*2] | s_n[2**(i+1)-1+j*2+1];
+                    assign d_n[2**i-1+j] = s_n[2**(i+1)-1+j*2] ? d_n[2**(i+1)-1+j*2] : d_n[2**(i+1)-1+j*2+1];
+                end
+            end
+        end     
+        // The root node carries the overall result.
+        assign valid_o = s_n[0];
+        assign data_o  = d_n[0];  
+    end else begin // single entry: pass it straight through
+        assign valid_o = valid_i;
+        assign data_o  = data_i[0];  
+    end    
+  
+endmodule
+`TRACING_ON
\ No newline at end of file
diff --git a/hw/rtl/libs/VX_lzc.v b/hw/rtl/libs/VX_lzc.v
index 816d65b6..5b97028b 100644
--- a/hw/rtl/libs/VX_lzc.v
+++ b/hw/rtl/libs/VX_lzc.v
@@ -1,88 +1,31 @@
 `include "VX_platform.vh"
 
-/// Modified port of lzc module from fpnew Libray
-/// reference: https://github.com/pulp-platform/fpnew
-/// A trailing zero counter / leading zero counter.
-/// Set MODE to 0 for trailing zero counter => cnt_o is the number of trailing zeros (from the LSB)
-/// Set MODE to 1 for leading zero counter  => cnt_o is the number of leading zeros  (from the MSB)
-/// If the input does not contain a zero, `empty_o` is asserted. Additionally `cnt_o` contains
-/// the maximum number of zeros - 1. For example:
-///   in_i = 000_0000, empty_o = 1, cnt_o = 6 (mode = 0)
-///   in_i = 000_0001, empty_o = 0, cnt_o = 0 (mode = 0)
-///   in_i = 000_1000, empty_o = 0, cnt_o = 3 (mode = 0)
-/// Furthermore, this unit contains a more efficient implementation for Verilator (simulation only).
-/// This speeds up simulation significantly.
-
 `TRACING_OFF
 module VX_lzc #(
-    /// The width of the input vector.
-    parameter int unsigned WIDTH = 2,
-    parameter bit          MODE  = 1'b0 // 0 -> trailing zero, 1 -> leading zero
+    parameter N     = 2, // width of in_i in bits
+    parameter MODE  = 0, // 0 -> trailing zero, 1 -> leading zero
+    localparam LOGN = $clog2(N) // width of cnt_o in bits
 ) (
-    input  logic [WIDTH-1:0]         in_i,
-    output logic [$clog2(WIDTH)-1:0] cnt_o,
-    output logic                     valid_o
+    input  wire [N-1:0]    in_i,    // vector to scan
+    output wire [LOGN-1:0] cnt_o,   // zeros before the first set bit, counted from the LSB (MODE=0) or MSB (MODE=1)
+    output wire            valid_o  // set when in_i has at least one bit set
 );
-`IGNORE_WARNINGS_BEGIN
+    wire [N-1:0][LOGN-1:0] indices;
 
-    localparam int unsigned NUM_LEVELS = $clog2(WIDTH);
-
-    // pragma translate_off
-    initial begin
-        assert(WIDTH > 0) else $fatal("input must be at least one bit wide");
-    end
-    // pragma translate_on
-
-    logic [WIDTH-1:0][NUM_LEVELS-1:0]          index_lut;
-    logic [2**NUM_LEVELS-1:0]                  sel_nodes;
-    logic [2**NUM_LEVELS-1:0][NUM_LEVELS-1:0]  index_nodes;
-
-    logic [WIDTH-1:0] in_tmp;
-
-    // reverse vector if required
-    always_comb begin : flip_vector
-        for (int unsigned i = 0; i < WIDTH; i++) begin
-            in_tmp[i] = (MODE) ? in_i[WIDTH-1-i] : in_i[i];
-        end
+    for (genvar i = 0; i < N; ++i) begin
+        assign indices[i] = MODE ? LOGN'(N-1-i) : LOGN'(i); // payload for bit i: its distance from the MSB (MODE=1) or LSB (MODE=0)
     end
 
-    for (genvar j = 0; unsigned'(j) < WIDTH; j++) begin : g_index_lut
-        assign index_lut[j] = NUM_LEVELS'(unsigned'(j));
-    end
-
-    for (genvar level = 0; unsigned'(level) < NUM_LEVELS; level++) begin : g_levels
-        if (unsigned'(level) == NUM_LEVELS-1) begin : g_last_level
-            for (genvar k = 0; k < 2**level; k++) begin : g_level
-                // if two successive indices are still in the vector...
-                if (unsigned'(k) * 2 < WIDTH-1) begin
-                    assign sel_nodes[2**level-1+k]   = in_tmp[k*2] | in_tmp[k*2+1];
-                    assign index_nodes[2**level-1+k] = (in_tmp[k*2] == 1'b1) ? index_lut[k*2] :
-                                                                               index_lut[k*2+1];
-                end
-                // if only the first index is still in the vector...
-                if (unsigned'(k) * 2 == WIDTH-1) begin
-                    assign sel_nodes[2**level-1+k]   = in_tmp[k*2];
-                    assign index_nodes[2**level-1+k] = index_lut[k*2];
-                end
-                // if index is out of range
-                if (unsigned'(k) * 2 > WIDTH-1) begin
-                    assign sel_nodes[2**level-1+k]   = 1'b0;
-                    assign index_nodes[2**level-1+k] = '0;
-                end
-            end
-        end else begin
-            for (genvar l = 0; l < 2**level; l++) begin : g_level
-                assign sel_nodes[2**level-1+l]   = sel_nodes[2**(level+1)-1+l*2] | sel_nodes[2**(level+1)-1+l*2+1];
-                assign index_nodes[2**level-1+l] = (sel_nodes[2**(level+1)-1+l*2] == 1'b1) ? index_nodes[2**(level+1)-1+l*2] :
-                                                                                             index_nodes[2**(level+1)-1+l*2+1];
-            end
-        end
-    end
-
-    assign cnt_o   = NUM_LEVELS > unsigned'(0) ? index_nodes[0] : $clog2(WIDTH)'(0);
-    assign valid_o = NUM_LEVELS > unsigned'(0) ? sel_nodes[0]  : (|in_i);
-
-`IGNORE_WARNINGS_END
+    VX_find_first #(
+        .N       (N),
+        .DATAW   (LOGN),
+        .REVERSE (MODE)
+    ) find_first (        
+        .data_i  (indices),
+        .valid_i (in_i),
+        .data_o  (cnt_o),
+        .valid_o (valid_o)
+    );
   
 endmodule
 `TRACING_ON
\ No newline at end of file