Merge branch 'master' into graphics

2021-10-15 19:32:11 -07:00
parent 668cfb5da4 e2b5799a01
commit e380ded5e1
542 changed files with 124552 additions and 18682 deletions
--- a/hw/rtl/fp_cores/VX_fp_class.sv
+++ b/hw/rtl/fp_cores/VX_fp_class.sv
@@ -0,0 +1,28 @@
+
+`include "VX_fpu_define.vh"
+
+module VX_fp_class # (    // Combinational classifier: decodes exponent/mantissa fields into fp_class_t flags.
+    parameter MAN_BITS = 23, // width of the mantissa field (default matches the [22:0] slice callers pass)
+    parameter EXP_BITS = 8   // width of the exponent field (default matches the [30:23] slice callers pass)
+) (
+    input  [EXP_BITS-1:0] exp_i,  // raw (still biased) exponent field
+    input  [MAN_BITS-1:0] man_i,  // raw mantissa field, without any implicit leading bit
+    output fp_class_t     clss_o  // one flag per category; fp_class_t is declared outside this file (presumably VX_fpu_define.vh - confirm)
+);
+    wire is_normal    = (exp_i != '0) && (exp_i != '1); // exponent neither all-zeros nor all-ones ('0/'1 fill to operand width)
+    wire is_zero      = (exp_i == '0) && (man_i == '0); // all-zero exponent and mantissa; sign is not an input here
+    wire is_subnormal = (exp_i == '0) && (man_i != '0); // all-zero exponent with a non-zero mantissa
+    wire is_inf       = (exp_i == '1) && (man_i == '0); // all-ones exponent with a zero mantissa
+    wire is_nan       = (exp_i == '1) && (man_i != '0); // all-ones exponent with a non-zero mantissa
+    wire is_signaling = is_nan && ~man_i[MAN_BITS-1];   // NaN whose mantissa MSB is clear
+    wire is_quiet     = is_nan && ~is_signaling;        // NaN whose mantissa MSB is set
+
+    assign clss_o.is_normal    = is_normal;    // the assigns below copy each local flag to the same-named struct field
+    assign clss_o.is_zero      = is_zero;
+    assign clss_o.is_subnormal = is_subnormal;
+    assign clss_o.is_inf       = is_inf;
+    assign clss_o.is_nan       = is_nan;       // asserted for both quiet and signaling NaNs
+    assign clss_o.is_quiet     = is_quiet;     // is_quiet and is_signaling are mutually exclusive by construction
+    assign clss_o.is_signaling = is_signaling;
+
+endmodule
--- a/hw/rtl/fp_cores/VX_fp_cvt.sv
+++ b/hw/rtl/fp_cores/VX_fp_cvt.sv
@@ -1,4 +1,4 @@
-`include "VX_define.vh"
+`include "VX_fpu_define.vh"

 /// Modified port of cast module from fpnew Libray 
 /// reference: https://github.com/pulp-platform/fpnew
@@ -15,7 +15,7 @@ module VX_fp_cvt #(

    input wire [TAGW-1:0] tag_in,

-    input wire [`FRM_BITS-1:0] frm,
+    input wire [`INST_FRM_BITS-1:0] frm,

    input wire is_itof,
    input wire is_signed,
@@ -59,13 +59,16 @@ module VX_fp_cvt #(
    
    // Input processing
    
-    fp_type_t [LANES-1:0] in_a_type;
+    fp_class_t [LANES-1:0] fp_clss;
      
    for (genvar i = 0; i < LANES; ++i) begin
-        VX_fp_type fp_type (
+        VX_fp_class #( 
+            .EXP_BITS (EXP_BITS),
+            .MAN_BITS (MAN_BITS)
+        ) fp_class (
            .exp_i  (dataa[i][30:23]),
            .man_i  (dataa[i][22:0]),
-            .type_o (in_a_type[i])
+            .clss_o (fp_clss[i])
        );
    end

@@ -74,16 +77,18 @@ module VX_fp_cvt #(
    wire [LANES-1:0]                    input_sign;
    
    for (genvar i = 0; i < LANES; ++i) begin
+    `IGNORE_WARNINGS_BEGIN
        wire [INT_MAN_WIDTH-1:0] int_mantissa;
        wire [INT_MAN_WIDTH-1:0] fmt_mantissa;
        wire fmt_sign       = dataa[i][31];
        wire int_sign       = dataa[i][31] & is_signed;
        assign int_mantissa = int_sign ? (-dataa[i]) : dataa[i];
-        assign fmt_mantissa = INT_MAN_WIDTH'({in_a_type[i].is_normal, dataa[i][MAN_BITS-1:0]});            
-
-        assign fmt_exponent[i] = {1'b0, dataa[i][MAN_BITS+EXP_BITS-1:MAN_BITS]};
+        assign fmt_mantissa = INT_MAN_WIDTH'({fp_clss[i].is_normal, dataa[i][MAN_BITS-1:0]});
+        assign fmt_exponent[i] = {1'b0, dataa[i][MAN_BITS +: EXP_BITS]} +
+                                 {1'b0, fp_clss[i].is_subnormal};
        assign encoded_mant[i] = is_itof ? int_mantissa : fmt_mantissa;
        assign input_sign[i]   = is_itof ? int_sign : fmt_sign;
+    `IGNORE_WARNINGS_END
    end

    // Pipeline stage0
@@ -93,7 +98,7 @@ module VX_fp_cvt #(
    wire                    is_itof_s0;
    wire                    unsigned_s0;
    wire [2:0]              rnd_mode_s0;
-    fp_type_t [LANES-1:0]   in_a_type_s0;
+    fp_class_t [LANES-1:0]  fp_clss_s0;
    wire [LANES-1:0]        input_sign_s0;
    wire [LANES-1:0][INT_EXP_WIDTH-1:0] fmt_exponent_s0;
    wire [LANES-1:0][INT_MAN_WIDTH-1:0] encoded_mant_s0;
@@ -101,14 +106,14 @@ module VX_fp_cvt #(
    wire stall;

    VX_pipe_register #(
-        .DATAW  (1 + TAGW + 1 + `FRM_BITS + 1 + LANES * ($bits(fp_type_t) + 1 + INT_EXP_WIDTH + INT_MAN_WIDTH)),
+        .DATAW  (1 + TAGW + 1 + `INST_FRM_BITS + 1 + LANES * ($bits(fp_class_t) + 1 + INT_EXP_WIDTH + INT_MAN_WIDTH)),
        .RESETW (1)
    ) pipe_reg0 (
        .clk      (clk),
        .reset    (reset),
        .enable   (~stall),
-        .data_in  ({valid_in,    tag_in,    is_itof,    !is_signed,  frm,         in_a_type,    input_sign,    fmt_exponent,    encoded_mant}),
-        .data_out ({valid_in_s0, tag_in_s0, is_itof_s0, unsigned_s0, rnd_mode_s0, in_a_type_s0, input_sign_s0, fmt_exponent_s0, encoded_mant_s0})
+        .data_in  ({valid_in,    tag_in,    is_itof,    !is_signed,  frm,         fp_clss,    input_sign,    fmt_exponent,    encoded_mant}),
+        .data_out ({valid_in_s0, tag_in_s0, is_itof_s0, unsigned_s0, rnd_mode_s0, fp_clss_s0, input_sign_s0, fmt_exponent_s0, encoded_mant_s0})
    );
    
    // Normalization
@@ -119,8 +124,8 @@ module VX_fp_cvt #(
    for (genvar i = 0; i < LANES; ++i) begin
        wire mant_is_nonzero;
        VX_lzc #(
-            .WIDTH (INT_MAN_WIDTH),
-            .MODE  (1)
+            .N    (INT_MAN_WIDTH),
+            .MODE (1)
        ) lzc (
            .in_i    (encoded_mant_s0[i]),
            .cnt_o   (renorm_shamt_s0[i]),
@@ -134,20 +139,12 @@ module VX_fp_cvt #(
    
    for (genvar i = 0; i < LANES; ++i) begin
    `IGNORE_WARNINGS_BEGIN
-        // Input mantissa needs to be normalized
-        wire [INT_EXP_WIDTH-1:0] fp_input_exp;
-        wire [INT_EXP_WIDTH-1:0] int_input_exp;
-
-        // Realign input mantissa, append zeroes if destination is wider
+       // Realign input mantissa, append zeroes if destination is wider
        assign input_mant_s0[i] = encoded_mant_s0[i] << renorm_shamt_s0[i];

        // Unbias exponent and compensate for shift
-        assign fp_input_exp = fmt_exponent_s0[i] + 
-                                {1'b0, in_a_type_s0[i].is_subnormal} + 
-                                    (FMT_SHIFT_COMPENSATION - EXP_BIAS) - 
-                                        {1'b0, renorm_shamt_s0[i]};
-                                 
-        assign int_input_exp = (INT_MAN_WIDTH-1) - {1'b0, renorm_shamt_s0[i]};
+        wire [INT_EXP_WIDTH-1:0] fp_input_exp = fmt_exponent_s0[i] + (FMT_SHIFT_COMPENSATION - EXP_BIAS) - {1'b0, renorm_shamt_s0[i]};                                 
+        wire [INT_EXP_WIDTH-1:0] int_input_exp = (INT_MAN_WIDTH-1) - {1'b0, renorm_shamt_s0[i]};

        assign input_exp_s0[i] = is_itof_s0 ? int_input_exp : fp_input_exp;
    `IGNORE_WARNINGS_END
@@ -160,21 +157,21 @@ module VX_fp_cvt #(
    wire                    is_itof_s1;
    wire                    unsigned_s1;
    wire [2:0]              rnd_mode_s1;
-    fp_type_t [LANES-1:0]   in_a_type_s1;
+    fp_class_t [LANES-1:0]  fp_clss_s1;
    wire [LANES-1:0]        input_sign_s1;
    wire [LANES-1:0]        mant_is_zero_s1;
    wire [LANES-1:0][INT_MAN_WIDTH-1:0] input_mant_s1;
    wire [LANES-1:0][INT_EXP_WIDTH-1:0] input_exp_s1;

    VX_pipe_register #(
-        .DATAW  (1 + TAGW + 1 + `FRM_BITS + 1 + LANES * ($bits(fp_type_t) + 1 + 1 + INT_MAN_WIDTH + INT_EXP_WIDTH)),
+        .DATAW  (1 + TAGW + 1 + `INST_FRM_BITS + 1 + LANES * ($bits(fp_class_t) + 1 + 1 + INT_MAN_WIDTH + INT_EXP_WIDTH)),
        .RESETW (1)
    ) pipe_reg1 (
        .clk      (clk),
        .reset    (reset),
        .enable   (~stall),
-        .data_in  ({valid_in_s0, tag_in_s0, is_itof_s0, unsigned_s0, rnd_mode_s0, in_a_type_s0, input_sign_s0, mant_is_zero_s0, input_mant_s0, input_exp_s0}),
-        .data_out ({valid_in_s1, tag_in_s1, is_itof_s1, unsigned_s1, rnd_mode_s1, in_a_type_s1, input_sign_s1, mant_is_zero_s1, input_mant_s1, input_exp_s1})
+        .data_in  ({valid_in_s0, tag_in_s0, is_itof_s0, unsigned_s0, rnd_mode_s0, fp_clss_s0, input_sign_s0, mant_is_zero_s0, input_mant_s0, input_exp_s0}),
+        .data_out ({valid_in_s1, tag_in_s1, is_itof_s1, unsigned_s1, rnd_mode_s1, fp_clss_s1, input_sign_s1, mant_is_zero_s1, input_mant_s1, input_exp_s1})
    );

    // Perform adjustments to mantissa and exponent
@@ -183,39 +180,35 @@ module VX_fp_cvt #(
    wire [LANES-1:0][INT_EXP_WIDTH-1:0] final_exp_s1;
    wire [LANES-1:0]                    of_before_round_s1;

-    for (genvar i = 0; i < LANES; ++i) begin            
-        wire [INT_EXP_WIDTH-1:0] destination_exp;   // re-biased exponent for destination
+    for (genvar i = 0; i < LANES; ++i) begin
        reg [2*INT_MAN_WIDTH:0] preshift_mant;      // mantissa before final shift                
        reg [SHAMT_BITS-1:0]    denorm_shamt;       // shift amount for denormalization
        reg [INT_EXP_WIDTH-1:0] final_exp;          // after eventual adjustments
        reg                     of_before_round;

-        // Rebias the exponent
-        assign destination_exp = input_exp_s1[i] + EXP_BIAS;
-        
        always @(*) begin           
        `IGNORE_WARNINGS_BEGIN     
            // Default assignment
-            final_exp       = destination_exp; // take exponent as is, only look at lower bits
-            preshift_mant   = {input_mant_s1[i], 33'b0}; // Place mantissa to the left of the shifter
+            final_exp       = input_exp_s1[i] + EXP_BIAS; // take exponent as is, only look at lower bits
+            preshift_mant   = {input_mant_s1[i], 33'b0};  // Place mantissa to the left of the shifter
            denorm_shamt    = 0;      // right of mantissa
            of_before_round = 1'b0;

            // Handle INT casts
            if (is_itof_s1) begin                   
-                if ($signed(destination_exp) >= $signed(2**EXP_BITS-1)) begin
+                if ($signed(input_exp_s1[i]) >= $signed(2**EXP_BITS-1-EXP_BIAS)) begin
                    // Overflow or infinities (for proper rounding)
                    final_exp     = (2**EXP_BITS-2); // largest normal value
                    preshift_mant = ~0;  // largest normal value and RS bits set
                    of_before_round = 1'b1;
-                end else if ($signed(destination_exp) < $signed(-MAN_BITS)) begin
+                end else if ($signed(input_exp_s1[i]) < $signed(-MAN_BITS-EXP_BIAS)) begin
                    // Limit the shift to retain sticky bits
                    final_exp     = 0; // denormal result
-                    denorm_shamt  = denorm_shamt + (2 + MAN_BITS); // to sticky                
-                end else if ($signed(destination_exp) < $signed(1)) begin
+                    denorm_shamt  = (2 + MAN_BITS); // to sticky                
+                end else if ($signed(input_exp_s1[i]) < $signed(1-EXP_BIAS)) begin
                    // Denormalize underflowing values
                    final_exp     = 0; // denormal result
-                    denorm_shamt  = denorm_shamt + 1 - destination_exp; // adjust right shifting               
+                    denorm_shamt  = (1-EXP_BIAS) - input_exp_s1[i]; // adjust right shifting               
                end
            end else begin                                
                if ($signed(input_exp_s1[i]) >= $signed((MAX_INT_WIDTH-1) + unsigned_s1)) begin
@@ -224,7 +217,7 @@ module VX_fp_cvt #(
                    of_before_round = 1'b1;                
                end else if ($signed(input_exp_s1[i]) < $signed(-1)) begin
                    // underflow
-                    denorm_shamt = MAX_INT_WIDTH + 1; // all bits go to the sticky
+                    denorm_shamt = MAX_INT_WIDTH+1; // all bits go to the sticky
                end else begin
                    // By default right shift mantissa to be an integer
                    denorm_shamt = (MAX_INT_WIDTH-1) - input_exp_s1[i];
@@ -245,7 +238,7 @@ module VX_fp_cvt #(
    wire                    is_itof_s2;
    wire                    unsigned_s2;
    wire [2:0]              rnd_mode_s2;
-    fp_type_t [LANES-1:0]   in_a_type_s2;   
+    fp_class_t [LANES-1:0]  fp_clss_s2;   
    wire [LANES-1:0]        mant_is_zero_s2;
    wire [LANES-1:0]        input_sign_s2;
    wire [LANES-1:0][2*INT_MAN_WIDTH:0] destination_mant_s2;
@@ -253,14 +246,14 @@ module VX_fp_cvt #(
    wire [LANES-1:0]        of_before_round_s2;
    
    VX_pipe_register #(
-        .DATAW  (1 + TAGW + 1 + 1 + `FRM_BITS + LANES * ($bits(fp_type_t) + 1 + 1 + (2*INT_MAN_WIDTH+1) + INT_EXP_WIDTH + 1)),
+        .DATAW  (1 + TAGW + 1 + 1 + `INST_FRM_BITS + LANES * ($bits(fp_class_t) + 1 + 1 + (2*INT_MAN_WIDTH+1) + INT_EXP_WIDTH + 1)),
        .RESETW (1)
    ) pipe_reg2 (
        .clk      (clk),
        .reset    (reset),
        .enable   (~stall),
-        .data_in  ({valid_in_s1, tag_in_s1, is_itof_s1, unsigned_s1, rnd_mode_s1, in_a_type_s1, mant_is_zero_s1, input_sign_s1, destination_mant_s1, final_exp_s1, of_before_round_s1}),
-        .data_out ({valid_in_s2, tag_in_s2, is_itof_s2, unsigned_s2, rnd_mode_s2, in_a_type_s2, mant_is_zero_s2, input_sign_s2, destination_mant_s2, final_exp_s2, of_before_round_s2})
+        .data_in  ({valid_in_s1, tag_in_s1, is_itof_s1, unsigned_s1, rnd_mode_s1, fp_clss_s1, mant_is_zero_s1, input_sign_s1, destination_mant_s1, final_exp_s1, of_before_round_s1}),
+        .data_out ({valid_in_s2, tag_in_s2, is_itof_s2, unsigned_s2, rnd_mode_s2, fp_clss_s2, mant_is_zero_s2, input_sign_s2, destination_mant_s2, final_exp_s2, of_before_round_s2})
    );

    wire [LANES-1:0]       rounded_sign;
@@ -314,7 +307,7 @@ module VX_fp_cvt #(
    wire [TAGW-1:0]         tag_in_s3;
    wire                    is_itof_s3;
    wire                    unsigned_s3;
-    fp_type_t [LANES-1:0]   in_a_type_s3;   
+    fp_class_t [LANES-1:0]  fp_clss_s3;   
    wire [LANES-1:0]        mant_is_zero_s3;
    wire [LANES-1:0]        input_sign_s3;
    wire [LANES-1:0]        rounded_sign_s3;
@@ -322,14 +315,14 @@ module VX_fp_cvt #(
    wire [LANES-1:0]        of_before_round_s3;

    VX_pipe_register #(
-        .DATAW  (1 + TAGW + 1 + 1 + LANES * ($bits(fp_type_t) + 1 + 1 + 32 + 1 + 1)),
+        .DATAW  (1 + TAGW + 1 + 1 + LANES * ($bits(fp_class_t) + 1 + 1 + 32 + 1 + 1)),
        .RESETW (1)
    ) pipe_reg3 (
        .clk      (clk),
        .reset    (reset),
        .enable   (~stall),
-        .data_in  ({valid_in_s2, tag_in_s2, is_itof_s2, unsigned_s2, in_a_type_s2, mant_is_zero_s2, input_sign_s2, rounded_abs,    rounded_sign,    of_before_round_s2}),
-        .data_out ({valid_in_s3, tag_in_s3, is_itof_s3, unsigned_s3, in_a_type_s3, mant_is_zero_s3, input_sign_s3, rounded_abs_s3, rounded_sign_s3, of_before_round_s3})
+        .data_in  ({valid_in_s2, tag_in_s2, is_itof_s2, unsigned_s2, fp_clss_s2, mant_is_zero_s2, input_sign_s2, rounded_abs,    rounded_sign,    of_before_round_s2}),
+        .data_out ({valid_in_s3, tag_in_s3, is_itof_s3, unsigned_s3, fp_clss_s3, mant_is_zero_s3, input_sign_s3, rounded_abs_s3, rounded_sign_s3, of_before_round_s3})
    );
     
    wire [LANES-1:0] of_after_round;
@@ -362,14 +355,14 @@ module VX_fp_cvt #(

    for (genvar i = 0; i < LANES; ++i) begin
        // Detect special case from source format, I2F casts don't produce a special result
-        assign fp_result_is_special[i] = ~is_itof_s3 & (in_a_type_s3[i].is_zero | in_a_type_s3[i].is_nan);
+        assign fp_result_is_special[i] = ~is_itof_s3 & (fp_clss_s3[i].is_zero | fp_clss_s3[i].is_nan);

        // Signalling input NaNs raise invalid flag, otherwise no flags set
-        assign fp_special_status[i] = in_a_type_s3[i].is_signaling ? {1'b1, 4'h0} : 5'h0;   // invalid operation
+        assign fp_special_status[i] = fp_clss_s3[i].is_signaling ? {1'b1, 4'h0} : 5'h0;   // invalid operation

        // Assemble result according to destination format
-        assign fp_special_result[i] = in_a_type_s3[i].is_zero ? (32'(input_sign_s3) << 31) // signed zero
-                                                              : {1'b0, QNAN_EXPONENT, QNAN_MANTISSA}; // qNaN
+        assign fp_special_result[i] = fp_clss_s3[i].is_zero ? (32'(input_sign_s3) << 31) // signed zero
+                                                            : {1'b0, QNAN_EXPONENT, QNAN_MANTISSA}; // qNaN
    end

    // INT Special case handling
@@ -381,7 +374,7 @@ module VX_fp_cvt #(
    for (genvar i = 0; i < LANES; ++i) begin
         // Assemble result according to destination format
        always @(*) begin
-            if (input_sign_s3[i] && !in_a_type_s3[i].is_nan) begin
+            if (input_sign_s3[i] && !fp_clss_s3[i].is_nan) begin
                int_special_result[i][30:0] = 0;               // alone yields 2**(31)-1
                int_special_result[i][31]   = ~unsigned_s3;    // for unsigned casts yields 2**31
            end else begin
@@ -391,8 +384,8 @@ module VX_fp_cvt #(
        end            

        // Detect special case from source format (inf, nan, overflow, nan-boxing or negative unsigned)
-        assign int_result_is_special[i] = in_a_type_s3[i].is_nan 
-                                        | in_a_type_s3[i].is_inf 
+        assign int_result_is_special[i] = fp_clss_s3[i].is_nan 
+                                        | fp_clss_s3[i].is_inf 
                                        | of_before_round_s3[i] 
                                        | (input_sign_s3[i] & unsigned_s3 & ~rounded_int_res_zero[i]);
                                        
@@ -411,11 +404,11 @@ module VX_fp_cvt #(
        wire [31:0] fp_result, int_result;

        wire inexact = is_itof_s3 ? (| fp_round_sticky_bits[i]) // overflow is invalid in i2f;        
-                                  : (| fp_round_sticky_bits[i]) | (~in_a_type_s3[i].is_inf & (of_before_round_s3[i] | of_after_round[i]));
+                                  : (| fp_round_sticky_bits[i]) | (~fp_clss_s3[i].is_inf & (of_before_round_s3[i] | of_after_round[i]));
                                  
        assign fp_regular_status.NV = is_itof_s3 & (of_before_round_s3[i] | of_after_round[i]); // overflow is invalid for I2F casts
        assign fp_regular_status.DZ = 1'b0; // no divisions
-        assign fp_regular_status.OF = ~is_itof_s3 & (~in_a_type_s3[i].is_inf & (of_before_round_s3[i] | of_after_round[i])); // inf casts no OF
+        assign fp_regular_status.OF = ~is_itof_s3 & (~fp_clss_s3[i].is_inf & (of_before_round_s3[i] | of_after_round[i])); // inf casts no OF
        assign fp_regular_status.UF = uf_after_round[i] & inexact;
        assign fp_regular_status.NX = inexact;

@@ -435,7 +428,7 @@ module VX_fp_cvt #(
    assign stall = ~ready_out && valid_out;

    VX_pipe_register #(
-        .DATAW  (1 + TAGW + (LANES * 32) + (LANES * `FFG_BITS)),
+        .DATAW  (1 + TAGW + (LANES * 32) + (LANES * `FFLAGS_BITS)),
        .RESETW (1)
    ) pipe_reg4 (
        .clk      (clk),
--- a/hw/rtl/fp_cores/VX_fp_div.sv
+++ b/hw/rtl/fp_cores/VX_fp_div.sv
@@ -1,8 +1,4 @@
-`include "VX_define.vh"
-
-`ifndef SYNTHESIS
-`include "float_dpi.vh"
-`endif
+`include "VX_fpu_define.vh"

 module VX_fp_div #( 
    parameter TAGW = 1,
@@ -16,7 +12,7 @@ module VX_fp_div #(

    input wire [TAGW-1:0] tag_in,

-    input wire [`FRM_BITS-1:0] frm,
+    input wire [`INST_FRM_BITS-1:0] frm,
    
    input wire [LANES-1:0][31:0]  dataa,
    input wire [LANES-1:0][31:0]  datab,
@@ -39,7 +35,7 @@ module VX_fp_div #(
        fflags_t f;

        always @(*) begin        
-            dpi_fdiv (dataa[i], datab[i], frm, r, f);
+            dpi_fdiv (enable && valid_in, dataa[i], datab[i], frm, r, f);
        end
        `UNUSED_VAR (f)

--- a/hw/rtl/fp_cores/VX_fp_fma.sv
+++ b/hw/rtl/fp_cores/VX_fp_fma.sv
@@ -1,8 +1,4 @@
-`include "VX_define.vh"
-
-`ifndef SYNTHESIS
-`include "float_dpi.vh"
-`endif
+`include "VX_fpu_define.vh"

 module VX_fp_fma #( 
    parameter TAGW = 1,
@@ -16,7 +12,7 @@ module VX_fp_fma #(

    input wire [TAGW-1:0] tag_in,
    
-    input wire [`FRM_BITS-1:0] frm,
+    input wire [`INST_FRM_BITS-1:0] frm,

    input wire  do_madd,
    input wire  do_sub,
@@ -68,7 +64,7 @@ module VX_fp_fma #(
        fflags_t f;

        always @(*) begin        
-            dpi_fmadd (a, b, c, frm, r, f);
+            dpi_fmadd (enable && valid_in, a, b, c, frm, r, f);
        end
        `UNUSED_VAR (f)

--- a/hw/rtl/fp_cores/VX_fp_ncomp.sv
+++ b/hw/rtl/fp_cores/VX_fp_ncomp.sv
@@ -1,4 +1,4 @@
-`include "VX_define.vh"
+`include "VX_fpu_define.vh"

 /// Modified port of noncomp module from fpnew Libray 
 /// reference: https://github.com/pulp-platform/fpnew
@@ -15,8 +15,8 @@ module VX_fp_ncomp #(

    input wire [TAGW-1:0] tag_in,
    
-    input wire [`FPU_BITS-1:0] op_type,
-    input wire [`FRM_BITS-1:0] frm,
+    input wire [`INST_FPU_BITS-1:0] op_type,
+    input wire [`INST_FRM_BITS-1:0] frm,

    input wire [LANES-1:0][31:0]  dataa,
    input wire [LANES-1:0][31:0]  datab,
@@ -30,6 +30,9 @@ module VX_fp_ncomp #(
    input wire  ready_out,
    output wire valid_out
 );  
+    localparam  EXP_BITS = 8;
+    localparam  MAN_BITS = 23;
+        
    localparam  NEG_INF     = 32'h00000001,
                NEG_NORM    = 32'h00000002,
                NEG_SUBNORM = 32'h00000004,
@@ -38,86 +41,92 @@ module VX_fp_ncomp #(
                POS_SUBNORM = 32'h00000020,
                POS_NORM    = 32'h00000040,
                POS_INF     = 32'h00000080,
-                SIG_NAN     = 32'h00000100,
+                //SIG_NAN   = 32'h00000100,
                QUT_NAN     = 32'h00000200;

-    wire [LANES-1:0]        tmp_a_sign, tmp_b_sign;
-    wire [LANES-1:0][7:0]   tmp_a_exponent, tmp_b_exponent;
-    wire [LANES-1:0][22:0]  tmp_a_mantissa, tmp_b_mantissa;
-    fp_type_t [LANES-1:0]   tmp_a_type, tmp_b_type;
-    wire [LANES-1:0]        tmp_a_smaller, tmp_ab_equal;
+    wire [LANES-1:0]        a_sign, b_sign;
+    wire [LANES-1:0][7:0]   a_exponent, b_exponent;
+    wire [LANES-1:0][22:0]  a_mantissa, b_mantissa;
+    fp_class_t [LANES-1:0]  a_clss, b_clss;
+    wire [LANES-1:0]        a_smaller, ab_equal;

    // Setup
    for (genvar i = 0; i < LANES; i++) begin
-        assign     tmp_a_sign[i] = dataa[i][31]; 
-        assign tmp_a_exponent[i] = dataa[i][30:23];
-        assign tmp_a_mantissa[i] = dataa[i][22:0];
+        assign     a_sign[i] = dataa[i][31]; 
+        assign a_exponent[i] = dataa[i][30:23];
+        assign a_mantissa[i] = dataa[i][22:0];

-        assign     tmp_b_sign[i] = datab[i][31]; 
-        assign tmp_b_exponent[i] = datab[i][30:23];
-        assign tmp_b_mantissa[i] = datab[i][22:0];
+        assign     b_sign[i] = datab[i][31]; 
+        assign b_exponent[i] = datab[i][30:23];
+        assign b_mantissa[i] = datab[i][22:0];

-        VX_fp_type fp_type_a (
-            .exp_i  (tmp_a_exponent[i]),
-            .man_i  (tmp_a_mantissa[i]),
-            .type_o (tmp_a_type[i])
+        VX_fp_class #( 
+            .EXP_BITS (EXP_BITS),
+            .MAN_BITS (MAN_BITS)
+        ) fp_class_a (
+            .exp_i  (a_exponent[i]),
+            .man_i  (a_mantissa[i]),
+            .clss_o (a_clss[i])
        );

-        VX_fp_type fp_type_b (
-            .exp_i  (tmp_b_exponent[i]),
-            .man_i  (tmp_b_mantissa[i]),
-            .type_o (tmp_b_type[i])
+        VX_fp_class #( 
+            .EXP_BITS (EXP_BITS),
+            .MAN_BITS (MAN_BITS)
+        ) fp_class_b (
+            .exp_i  (b_exponent[i]),
+            .man_i  (b_mantissa[i]),
+            .clss_o (b_clss[i])
        );

-        assign tmp_a_smaller[i] = $signed(dataa[i]) < $signed(datab[i]);
-        assign tmp_ab_equal[i]  = (dataa[i] == datab[i]) | (tmp_a_type[i].is_zero & tmp_b_type[i].is_zero);
+        assign a_smaller[i] = $signed(dataa[i]) < $signed(datab[i]);
+        assign ab_equal[i]  = (dataa[i] == datab[i]) | (a_clss[i].is_zero & b_clss[i].is_zero);
    end  

    // Pipeline stage0

    wire                    valid_in_s0;
    wire [TAGW-1:0]         tag_in_s0;
-    wire [`FPU_BITS-1:0]    op_type_s0;
-    wire [`FRM_BITS-1:0]    frm_s0;
+    wire [`INST_FPU_BITS-1:0] op_type_s0;
+    wire [`INST_FRM_BITS-1:0] frm_s0;
    wire [LANES-1:0][31:0]  dataa_s0, datab_s0;
    wire [LANES-1:0]        a_sign_s0, b_sign_s0;
    wire [LANES-1:0][7:0]   a_exponent_s0;
    wire [LANES-1:0][22:0]  a_mantissa_s0;
-    fp_type_t [LANES-1:0]   a_type_s0, b_type_s0;
+    fp_class_t [LANES-1:0]  a_clss_s0, b_clss_s0;
    wire [LANES-1:0]        a_smaller_s0, ab_equal_s0;

    wire stall;

    VX_pipe_register #(
-        .DATAW  (1 + TAGW + `FPU_BITS + `FRM_BITS + LANES * (2 * 32 + 1 + 1 + 8 + 23 + 2 * $bits(fp_type_t) + 1 + 1)),
+        .DATAW  (1 + TAGW + `INST_FPU_BITS + `INST_FRM_BITS + LANES * (2 * 32 + 1 + 1 + 8 + 23 + 2 * $bits(fp_class_t) + 1 + 1)),
        .RESETW (1),
        .DEPTH  (0)
    ) pipe_reg0 (
        .clk      (clk),
        .reset    (reset),
        .enable   (!stall),
-        .data_in  ({valid_in,    tag_in,    op_type,    frm,    dataa,    datab,    tmp_a_sign, tmp_b_sign, tmp_a_exponent, tmp_a_mantissa, tmp_a_type, tmp_b_type, tmp_a_smaller, tmp_ab_equal}),
-        .data_out ({valid_in_s0, tag_in_s0, op_type_s0, frm_s0, dataa_s0, datab_s0, a_sign_s0,  b_sign_s0,  a_exponent_s0,  a_mantissa_s0,  a_type_s0,  b_type_s0,  a_smaller_s0,  ab_equal_s0})
+        .data_in  ({valid_in,    tag_in,    op_type,    frm,    dataa,    datab,    a_sign,    b_sign,    a_exponent,    a_mantissa,    a_clss,    b_clss,    a_smaller,    ab_equal}),
+        .data_out ({valid_in_s0, tag_in_s0, op_type_s0, frm_s0, dataa_s0, datab_s0, a_sign_s0, b_sign_s0, a_exponent_s0, a_mantissa_s0, a_clss_s0, b_clss_s0, a_smaller_s0, ab_equal_s0})
    ); 

    // FCLASS
    reg [LANES-1:0][31:0] fclass_mask;  // generate a 10-bit mask for integer reg
    for (genvar i = 0; i < LANES; i++) begin
        always @(*) begin 
-            if (a_type_s0[i].is_normal) begin
+            if (a_clss_s0[i].is_normal) begin
                fclass_mask[i] = a_sign_s0[i] ? NEG_NORM : POS_NORM;
            end 
-            else if (a_type_s0[i].is_inf) begin
+            else if (a_clss_s0[i].is_inf) begin
                fclass_mask[i] = a_sign_s0[i] ? NEG_INF : POS_INF;
            end 
-            else if (a_type_s0[i].is_zero) begin
+            else if (a_clss_s0[i].is_zero) begin
                fclass_mask[i] = a_sign_s0[i] ? NEG_ZERO : POS_ZERO;
            end 
-            else if (a_type_s0[i].is_subnormal) begin
+            else if (a_clss_s0[i].is_subnormal) begin
                fclass_mask[i] = a_sign_s0[i] ? NEG_SUBNORM : POS_SUBNORM;
            end 
-            else if (a_type_s0[i].is_nan) begin
-                fclass_mask[i] = {22'h0, a_type_s0[i].is_quiet, a_type_s0[i].is_signaling, 8'h0};
+            else if (a_clss_s0[i].is_nan) begin
+                fclass_mask[i] = {22'h0, a_clss_s0[i].is_quiet, a_clss_s0[i].is_signaling, 8'h0};
            end 
            else begin                     
                fclass_mask[i] = QUT_NAN;
@@ -129,11 +138,11 @@ module VX_fp_ncomp #(
    reg [LANES-1:0][31:0] fminmax_res;  // result of fmin/fmax
    for (genvar i = 0; i < LANES; i++) begin
        always @(*) begin
-            if (a_type_s0[i].is_nan && b_type_s0[i].is_nan)
+            if (a_clss_s0[i].is_nan && b_clss_s0[i].is_nan)
                fminmax_res[i] = {1'b0, 8'hff, 1'b1, 22'd0}; // canonical qNaN
-            else if (a_type_s0[i].is_nan) 
+            else if (a_clss_s0[i].is_nan) 
                fminmax_res[i] = datab_s0[i];
-            else if (b_type_s0[i].is_nan) 
+            else if (b_clss_s0[i].is_nan) 
                fminmax_res[i] = dataa_s0[i];
            else begin 
                case (frm_s0) // use LSB to distinguish MIN and MAX
@@ -160,33 +169,33 @@ module VX_fp_ncomp #(

    // Comparison    
    reg [LANES-1:0][31:0] fcmp_res;     // result of comparison
-    fflags_t [LANES-1:0]   fcmp_fflags;  // comparison fflags
+    fflags_t [LANES-1:0]  fcmp_fflags;  // comparison fflags
    for (genvar i = 0; i < LANES; i++) begin
        always @(*) begin
            case (frm_s0)
-                `FRM_RNE: begin // LE
+                `INST_FRM_RNE: begin // LE
                    fcmp_fflags[i] = 5'h0;
-                    if (a_type_s0[i].is_nan || b_type_s0[i].is_nan) begin
+                    if (a_clss_s0[i].is_nan || b_clss_s0[i].is_nan) begin
                        fcmp_res[i]       = 32'h0;
                        fcmp_fflags[i].NV = 1'b1;
                    end else begin
                        fcmp_res[i] = {31'h0, (a_smaller_s0[i] | ab_equal_s0[i])};
                    end
                end
-                `FRM_RTZ: begin // LS
+                `INST_FRM_RTZ: begin // LS
                    fcmp_fflags[i] = 5'h0;
-                    if (a_type_s0[i].is_nan || b_type_s0[i].is_nan) begin
+                    if (a_clss_s0[i].is_nan || b_clss_s0[i].is_nan) begin
                        fcmp_res[i]       = 32'h0;
                        fcmp_fflags[i].NV = 1'b1;
                    end else begin
                        fcmp_res[i] = {31'h0, (a_smaller_s0[i] & ~ab_equal_s0[i])};
                    end                    
                end
-                `FRM_RDN: begin // EQ
+                `INST_FRM_RDN: begin // EQ
                    fcmp_fflags[i] = 5'h0;
-                    if (a_type_s0[i].is_nan || b_type_s0[i].is_nan) begin
+                    if (a_clss_s0[i].is_nan || b_clss_s0[i].is_nan) begin
                        fcmp_res[i]       = 32'h0;
-                        fcmp_fflags[i].NV = a_type_s0[i].is_signaling | b_type_s0[i].is_signaling; 
+                        fcmp_fflags[i].NV = a_clss_s0[i].is_signaling | b_clss_s0[i].is_signaling; 
                    end else begin
                        fcmp_res[i] = {31'h0, ab_equal_s0[i]};
                    end
@@ -207,11 +216,11 @@ module VX_fp_ncomp #(
    for (genvar i = 0; i < LANES; i++) begin
        always @(*) begin
            case (op_type_s0)
-                `FPU_CLASS: begin
+                `INST_FPU_CLASS: begin
                    tmp_result[i] = fclass_mask[i];
                    tmp_fflags[i] = 'x;
                end   
-                `FPU_CMP: begin 
+                `INST_FPU_CMP: begin 
                    tmp_result[i] = fcmp_res[i];
                    tmp_fflags[i] = fcmp_fflags[i];
                end      
@@ -225,11 +234,11 @@ module VX_fp_ncomp #(
                        3,4: begin
                            tmp_result[i] = fminmax_res[i];
                            tmp_fflags[i] = 0;
-                            tmp_fflags[i].NV = a_type_s0[i].is_signaling | b_type_s0[i].is_signaling;
+                            tmp_fflags[i].NV = a_clss_s0[i].is_signaling | b_clss_s0[i].is_signaling;
                        end
                        //5,6,7: MOVE
                        default: begin
-                            tmp_result[i] = dataa[i];
+                            tmp_result[i] = dataa_s0[i];
                            tmp_fflags[i] = 'x;
                        end
                    endcase
@@ -238,15 +247,15 @@ module VX_fp_ncomp #(
        end
    end

-    wire has_fflags_s0 = ((op_type_s0 == `FPU_MISC) 
-                       && (frm_s0 == 3             // MIN
-                        || frm_s0 == 4))           // MAX 
-                      || (op_type_s0 == `FPU_CMP); // CMP
+    wire has_fflags_s0 = ((op_type_s0 == `INST_FPU_MISC) 
+                       && (frm_s0 == 3                  // MIN
+                        || frm_s0 == 4))                // MAX 
+                      || (op_type_s0 == `INST_FPU_CMP); // CMP

    assign stall = ~ready_out && valid_out;

    VX_pipe_register #(
-        .DATAW  (1 + TAGW + (LANES * 32) + 1 + (LANES * `FFG_BITS)),
+        .DATAW  (1 + TAGW + (LANES * 32) + 1 + (LANES * `FFLAGS_BITS)),
        .RESETW (1)
    ) pipe_reg1 (
        .clk      (clk),
--- a/hw/rtl/fp_cores/VX_fp_rounding.sv
+++ b/hw/rtl/fp_cores/VX_fp_rounding.sv
@@ -1,5 +1,4 @@
-
-`include "VX_define.vh"
+`include "VX_fpu_define.vh"

 /// Modified port of rouding module from fpnew Libray
 /// reference: https://github.com/pulp-platform/fpnew
@@ -34,7 +33,7 @@ module VX_fp_rounding #(

    always @(*) begin
        case (rnd_mode_i)
-            `FRM_RNE: // Decide accoring to round/sticky bits
+            `INST_FRM_RNE: // Decide accoring to round/sticky bits
                case (round_sticky_bits_i)
                      2'b00, 
                      2'b01: round_up = 1'b0;            // < ulp/2 away, round down
@@ -42,10 +41,10 @@ module VX_fp_rounding #(
                      2'b11: round_up = 1'b1;            // > ulp/2 away, round up
                    default: round_up = 1'bx;
                endcase
-            `FRM_RTZ: round_up = 1'b0; // always round down
-            `FRM_RDN: round_up = (| round_sticky_bits_i) & sign_i;  // to 0 if +, away if -
-            `FRM_RUP: round_up = (| round_sticky_bits_i) & ~sign_i; // to 0 if -, away if +
-            `FRM_RMM: round_up = round_sticky_bits_i[1]; // round down if < ulp/2 away, else up
+            `INST_FRM_RTZ: round_up = 1'b0; // always round down
+            `INST_FRM_RDN: round_up = (| round_sticky_bits_i) & sign_i;  // to 0 if +, away if -
+            `INST_FRM_RUP: round_up = (| round_sticky_bits_i) & ~sign_i; // to 0 if -, away if +
+            `INST_FRM_RMM: round_up = round_sticky_bits_i[1]; // round down if < ulp/2 away, else up
            default:  round_up = 1'bx; // propagate x
        endcase
    end
@@ -58,7 +57,7 @@ module VX_fp_rounding #(

    // In case of effective subtraction (thus signs of addition operands must have differed) and a
    // true zero result, the result sign is '-' in case of RDN and '+' for other modes.
-    assign sign_o = (exact_zero_o && effective_subtraction_i) ? (rnd_mode_i == `FRM_RDN)
+    assign sign_o = (exact_zero_o && effective_subtraction_i) ? (rnd_mode_i == `INST_FRM_RDN)
                                                              : sign_i;

 endmodule
--- a/hw/rtl/fp_cores/VX_fp_sqrt.sv
+++ b/hw/rtl/fp_cores/VX_fp_sqrt.sv
@@ -1,8 +1,4 @@
-`include "VX_define.vh"
-
-`ifndef SYNTHESIS
-`include "float_dpi.vh"
-`endif
+`include "VX_fpu_define.vh"

 module VX_fp_sqrt #( 
    parameter TAGW = 1,
@@ -16,7 +12,7 @@ module VX_fp_sqrt #(

    input wire [TAGW-1:0] tag_in,
    
-    input wire [`FRM_BITS-1:0] frm,
+    input wire [`INST_FRM_BITS-1:0] frm,

    input wire [LANES-1:0][31:0]  dataa,
    output wire [LANES-1:0][31:0] result,  
@@ -38,7 +34,7 @@ module VX_fp_sqrt #(
        fflags_t f;

        always @(*) begin        
-            dpi_fsqrt (dataa[i], frm, r, f);
+            dpi_fsqrt (enable && valid_in, dataa[i], frm, r, f);
        end
        `UNUSED_VAR (f)

--- a/hw/rtl/fp_cores/VX_fp_type.v
+++ b/hw/rtl/fp_cores/VX_fp_type.v
@@ -1,27 +0,0 @@
-
-`include "VX_define.vh"
-
-module VX_fp_type (
-    // inputs
-    input  [7:0]  exp_i,
-    input  [22:0] man_i,
-    // outputs
-    output fp_type_t type_o
-);
-    wire is_normal    = (exp_i != 8'd0) && (exp_i != 8'hff);
-    wire is_zero      = (exp_i == 8'd0) && (man_i == 23'd0);
-    wire is_subnormal = (exp_i == 8'd0) && (man_i != 23'd0);
-    wire is_inf       = (exp_i == 8'hff) && (man_i == 23'd0); 
-    wire is_nan       = (exp_i == 8'hff) && (man_i != 23'd0);
-    wire is_signaling = is_nan && (man_i[22] == 1'b0);
-    wire is_quiet     = is_nan && !is_signaling;
-
-    assign type_o.is_normal    = is_normal;
-    assign type_o.is_zero      = is_zero;
-    assign type_o.is_subnormal = is_subnormal;
-    assign type_o.is_inf       = is_inf;
-    assign type_o.is_nan       = is_nan;
-    assign type_o.is_quiet     = is_quiet;
-    assign type_o.is_signaling = is_signaling;
-
-endmodule
--- a/hw/rtl/fp_cores/VX_fpu_define.vh
+++ b/hw/rtl/fp_cores/VX_fpu_define.vh
@@ -0,0 +1,14 @@
+`ifndef VX_FPU_DEFINE
+`define VX_FPU_DEFINE
+// Shared FPU include: pulls in VX_define.vh, float_dpi.vh (only when SYNTHESIS is undefined), and imports fpu_types.
+`include "VX_define.vh"
+
+`ifndef SYNTHESIS // float_dpi.vh is included only when SYNTHESIS is not defined
+`include "float_dpi.vh"
+`endif // SYNTHESIS
+
+`IGNORE_WARNINGS_BEGIN // NOTE(review): macro defined elsewhere; presumably silences tool warnings for the import - confirm
+import fpu_types::*; // wildcard import of every name declared in package fpu_types
+`IGNORE_WARNINGS_END
+
+`endif // VX_FPU_DEFINE
--- a/hw/rtl/fp_cores/VX_fpu_dpi.sv
+++ b/hw/rtl/fp_cores/VX_fpu_dpi.sv
@@ -1,7 +1,4 @@
-`ifndef SYNTHESIS
-
-`include "VX_define.vh"
-`include "float_dpi.vh"
+`include "VX_fpu_define.vh"

 module VX_fpu_dpi #( 
    parameter TAGW = 1
@@ -14,8 +11,8 @@ module VX_fpu_dpi #(

    input wire [TAGW-1:0] tag_in,
    
-    input wire [`FPU_BITS-1:0] op_type,
-    input wire [`MOD_BITS-1:0] frm,
+    input wire [`INST_FPU_BITS-1:0] op_type,
+    input wire [`INST_MOD_BITS-1:0] frm,

    input wire [`NUM_THREADS-1:0][31:0]  dataa,
    input wire [`NUM_THREADS-1:0][31:0]  datab,
@@ -76,21 +73,21 @@ module VX_fpu_dpi #(
        is_fsgnjx = 0;

        case (op_type)
-            `FPU_ADD:   begin core_select = FPU_FMA; is_fadd = 1; end
-            `FPU_SUB:   begin core_select = FPU_FMA; is_fsub = 1; end
-            `FPU_MUL:   begin core_select = FPU_FMA; is_fmul = 1; end
-            `FPU_MADD:  begin core_select = FPU_FMA; is_fmadd = 1; end
-            `FPU_MSUB:  begin core_select = FPU_FMA; is_fmsub = 1; end
-            `FPU_NMADD: begin core_select = FPU_FMA; is_fnmadd = 1; end
-            `FPU_NMSUB: begin core_select = FPU_FMA; is_fnmsub = 1; end
-            `FPU_DIV:   begin core_select = FPU_DIV; end
-            `FPU_SQRT:  begin core_select = FPU_SQRT; end
-            `FPU_CVTWS: begin core_select = FPU_CVT; is_ftoi = 1; end
-            `FPU_CVTWUS:begin core_select = FPU_CVT; is_ftou = 1; end
-            `FPU_CVTSW: begin core_select = FPU_CVT; is_itof = 1; end
-            `FPU_CVTSWU:begin core_select = FPU_CVT; is_utof = 1; end
-            `FPU_CLASS: begin core_select = FPU_NCP; is_fclss = 1; end  
-            `FPU_CMP:   begin core_select = FPU_NCP; 
+            `INST_FPU_ADD:   begin core_select = FPU_FMA; is_fadd = 1; end
+            `INST_FPU_SUB:   begin core_select = FPU_FMA; is_fsub = 1; end
+            `INST_FPU_MUL:   begin core_select = FPU_FMA; is_fmul = 1; end
+            `INST_FPU_MADD:  begin core_select = FPU_FMA; is_fmadd = 1; end
+            `INST_FPU_MSUB:  begin core_select = FPU_FMA; is_fmsub = 1; end
+            `INST_FPU_NMADD: begin core_select = FPU_FMA; is_fnmadd = 1; end
+            `INST_FPU_NMSUB: begin core_select = FPU_FMA; is_fnmsub = 1; end
+            `INST_FPU_DIV:   begin core_select = FPU_DIV; end
+            `INST_FPU_SQRT:  begin core_select = FPU_SQRT; end
+            `INST_FPU_CVTWS: begin core_select = FPU_CVT; is_ftoi = 1; end
+            `INST_FPU_CVTWUS:begin core_select = FPU_CVT; is_ftou = 1; end
+            `INST_FPU_CVTSW: begin core_select = FPU_CVT; is_itof = 1; end
+            `INST_FPU_CVTSWU:begin core_select = FPU_CVT; is_utof = 1; end
+            `INST_FPU_CLASS: begin core_select = FPU_NCP; is_fclss = 1; end  
+            `INST_FPU_CMP:   begin core_select = FPU_NCP; 
                            is_fle = (frm == 0); 
                            is_flt = (frm == 1); 
                            is_feq = (frm == 2); 
@@ -126,15 +123,20 @@ module VX_fpu_dpi #(
        fflags_t [`NUM_THREADS-1:0] fflags_fnmadd;
        fflags_t [`NUM_THREADS-1:0] fflags_fnmsub;

+        wire fma_valid = (valid_in && core_select == FPU_FMA);
+        wire fma_ready = per_core_ready_out[FPU_FMA] || ~per_core_valid_out[FPU_FMA];
+
+        wire fma_fire = fma_valid && fma_ready;
+
        always @(*) begin        
            for (integer i = 0; i < `NUM_THREADS; i++) begin
-                dpi_fadd   (dataa[i], datab[i], frm, result_fadd[i], fflags_fadd[i]);
-                dpi_fsub   (dataa[i], datab[i], frm, result_fsub[i], fflags_fsub[i]);
-                dpi_fmul   (dataa[i], datab[i], frm, result_fmul[i], fflags_fmul[i]);
-                dpi_fmadd  (dataa[i], datab[i], datac[i], frm, result_fmadd[i], fflags_fmadd[i]);
-                dpi_fmsub  (dataa[i], datab[i], datac[i], frm, result_fmsub[i], fflags_fmsub[i]);
-                dpi_fnmadd (dataa[i], datab[i], datac[i], frm, result_fnmadd[i], fflags_fnmadd[i]);
-                dpi_fnmsub (dataa[i], datab[i], datac[i], frm, result_fnmsub[i], fflags_fnmsub[i]);
+                dpi_fadd   (fma_fire, dataa[i], datab[i], frm, result_fadd[i], fflags_fadd[i]);
+                dpi_fsub   (fma_fire, dataa[i], datab[i], frm, result_fsub[i], fflags_fsub[i]);
+                dpi_fmul   (fma_fire, dataa[i], datab[i], frm, result_fmul[i], fflags_fmul[i]);
+                dpi_fmadd  (fma_fire, dataa[i], datab[i], datac[i], frm, result_fmadd[i], fflags_fmadd[i]);
+                dpi_fmsub  (fma_fire, dataa[i], datab[i], datac[i], frm, result_fmsub[i], fflags_fmsub[i]);
+                dpi_fnmadd (fma_fire, dataa[i], datab[i], datac[i], frm, result_fnmadd[i], fflags_fnmadd[i]);
+                dpi_fnmsub (fma_fire, dataa[i], datab[i], datac[i], frm, result_fnmsub[i], fflags_fnmsub[i]);
            end
        end

@@ -154,10 +156,7 @@ module VX_fpu_dpi #(
                            is_fmsub  ? fflags_fmsub :
                            is_fnmadd ? fflags_fnmadd :               
                            is_fnmsub ? fflags_fnmsub : 
-                                        0;
-
-        wire enable = per_core_ready_out[FPU_FMA] || ~per_core_valid_out[FPU_FMA];
-        wire valid  = (valid_in && core_select == FPU_FMA);
+                                        0;                

        VX_shift_register #(
            .DATAW  (1 + TAGW + `NUM_THREADS * (32 + $bits(fflags_t))),
@@ -166,13 +165,13 @@ module VX_fpu_dpi #(
        ) shift_reg (
            .clk      (clk),
            .reset    (reset),
-            .enable   (enable),
-            .data_in  ({valid,                       tag_in,                    result_fma,               fflags_fma}),
+            .enable   (fma_ready),
+            .data_in  ({fma_valid,                   tag_in,                    result_fma,               fflags_fma}),
            .data_out ({per_core_valid_out[FPU_FMA], per_core_tag_out[FPU_FMA], per_core_result[FPU_FMA], per_core_fflags[FPU_FMA]})
        );

        assign per_core_has_fflags[FPU_FMA] = 1;
-        assign per_core_ready_in[FPU_FMA] = enable;
+        assign per_core_ready_in[FPU_FMA] = fma_ready;

    end
    endgenerate
@@ -182,16 +181,18 @@ module VX_fpu_dpi #(

        wire [`NUM_THREADS-1:0][31:0] result_fdiv;
        fflags_t [`NUM_THREADS-1:0] fflags_fdiv;
+
+        wire fdiv_valid = (valid_in && core_select == FPU_DIV);
+        wire fdiv_ready = per_core_ready_out[FPU_DIV] || ~per_core_valid_out[FPU_DIV];
+
+        wire fdiv_fire = fdiv_valid && fdiv_ready;
        
        always @(*) begin        
            for (integer i = 0; i < `NUM_THREADS; i++) begin
-                dpi_fdiv (dataa[i], datab[i], frm, result_fdiv[i], fflags_fdiv[i]);
+                dpi_fdiv (fdiv_fire, dataa[i], datab[i], frm, result_fdiv[i], fflags_fdiv[i]);
            end
        end

-        wire enable = per_core_ready_out[FPU_DIV] || ~per_core_valid_out[FPU_DIV];
-        wire valid  = (valid_in && core_select == FPU_DIV);
-
        VX_shift_register #(
            .DATAW  (1 + TAGW + `NUM_THREADS * (32 + $bits(fflags_t))),
            .DEPTH  (`LATENCY_FDIV),
@@ -199,13 +200,13 @@ module VX_fpu_dpi #(
        ) shift_reg (
            .clk      (clk),
            .reset    (reset),
-            .enable   (enable),
-            .data_in  ({valid,                       tag_in,                    result_fdiv,               fflags_fdiv}),
+            .enable   (fdiv_ready),
+            .data_in  ({fdiv_valid,                  tag_in,                    result_fdiv,              fflags_fdiv}),
            .data_out ({per_core_valid_out[FPU_DIV], per_core_tag_out[FPU_DIV], per_core_result[FPU_DIV], per_core_fflags[FPU_DIV]})
        );

        assign per_core_has_fflags[FPU_DIV] = 1;
-        assign per_core_ready_in[FPU_DIV] = enable;
+        assign per_core_ready_in[FPU_DIV] = fdiv_ready;

    end
    endgenerate
@@ -215,16 +216,18 @@ module VX_fpu_dpi #(

        wire [`NUM_THREADS-1:0][31:0] result_fsqrt;
        fflags_t [`NUM_THREADS-1:0] fflags_fsqrt;
+
+        wire fsqrt_valid = (valid_in && core_select == FPU_SQRT);
+        wire fsqrt_ready = per_core_ready_out[FPU_SQRT] || ~per_core_valid_out[FPU_SQRT];
+                
+        wire fsqrt_fire = fsqrt_valid && fsqrt_ready;
        
        always @(*) begin        
            for (integer i = 0; i < `NUM_THREADS; i++) begin
-                dpi_fsqrt (dataa[i], frm, result_fsqrt[i], fflags_fsqrt[i]);
+                dpi_fsqrt (fsqrt_fire, dataa[i], frm, result_fsqrt[i], fflags_fsqrt[i]);
            end
        end

-        wire enable = per_core_ready_out[FPU_SQRT] || ~per_core_valid_out[FPU_SQRT];
-        wire valid  = (valid_in && core_select == FPU_SQRT);
-
        VX_shift_register #(
            .DATAW  (1 + TAGW + `NUM_THREADS * (32 + $bits(fflags_t))),
            .DEPTH  (`LATENCY_FSQRT),
@@ -232,13 +235,13 @@ module VX_fpu_dpi #(
        ) shift_reg (
            .clk      (clk),
            .reset    (reset),
-            .enable   (enable),
-            .data_in  ({valid,                        tag_in,                     result_fsqrt,              fflags_fsqrt}),
+            .enable   (fsqrt_ready),
+            .data_in  ({fsqrt_valid,                  tag_in,                     result_fsqrt,              fflags_fsqrt}),
            .data_out ({per_core_valid_out[FPU_SQRT], per_core_tag_out[FPU_SQRT], per_core_result[FPU_SQRT], per_core_fflags[FPU_SQRT]})
        );

        assign per_core_has_fflags[FPU_SQRT] = 1;
-        assign per_core_ready_in[FPU_SQRT] = enable;
+        assign per_core_ready_in[FPU_SQRT] = fsqrt_ready;

    end
    endgenerate
@@ -257,13 +260,18 @@ module VX_fpu_dpi #(
        fflags_t [`NUM_THREADS-1:0] fflags_utof;
        fflags_t [`NUM_THREADS-1:0] fflags_ftoi;
        fflags_t [`NUM_THREADS-1:0] fflags_ftou;
-        
+
+        wire fcvt_valid = (valid_in && core_select == FPU_CVT);
+        wire fcvt_ready = per_core_ready_out[FPU_CVT] || ~per_core_valid_out[FPU_CVT];
+
+        wire fcvt_fire = fcvt_valid && fcvt_ready;
+                
        always @(*) begin        
            for (integer i = 0; i < `NUM_THREADS; i++) begin
-                dpi_itof (dataa[i], frm, result_itof[i], fflags_itof[i]);
-                dpi_utof (dataa[i], frm, result_utof[i], fflags_utof[i]);
-                dpi_ftoi (dataa[i], frm, result_ftoi[i], fflags_ftoi[i]);
-                dpi_ftou (dataa[i], frm, result_ftou[i], fflags_ftou[i]);
+                dpi_itof (fcvt_fire, dataa[i], frm, result_itof[i], fflags_itof[i]);
+                dpi_utof (fcvt_fire, dataa[i], frm, result_utof[i], fflags_utof[i]);
+                dpi_ftoi (fcvt_fire, dataa[i], frm, result_ftoi[i], fflags_ftoi[i]);
+                dpi_ftou (fcvt_fire, dataa[i], frm, result_ftou[i], fflags_ftou[i]);
            end
        end

@@ -279,9 +287,6 @@ module VX_fpu_dpi #(
                             is_ftou ? fflags_ftou : 
                                       0;

-        wire enable = per_core_ready_out[FPU_CVT] || ~per_core_valid_out[FPU_CVT];
-        wire valid  = (valid_in && core_select == FPU_CVT);
-
        VX_shift_register #(
            .DATAW  (1 + TAGW + `NUM_THREADS * (32 + $bits(fflags_t))),
            .DEPTH  (`LATENCY_FCVT),
@@ -289,13 +294,13 @@ module VX_fpu_dpi #(
        ) shift_reg (
            .clk      (clk),
            .reset    (reset),
-            .enable   (enable),
-            .data_in  ({valid,                       tag_in,                    result_fcvt,              fflags_fcvt}),
+            .enable   (fcvt_ready),
+            .data_in  ({fcvt_valid,                  tag_in,                    result_fcvt,              fflags_fcvt}),
            .data_out ({per_core_valid_out[FPU_CVT], per_core_tag_out[FPU_CVT], per_core_result[FPU_CVT], per_core_fflags[FPU_CVT]})
        );

        assign per_core_has_fflags[FPU_CVT] = 1;
-        assign per_core_ready_in[FPU_CVT] = enable;
+        assign per_core_ready_in[FPU_CVT] = fcvt_ready;

    end
    endgenerate
@@ -321,18 +326,23 @@ module VX_fpu_dpi #(
        fflags_t [`NUM_THREADS-1:0] fflags_feq;
        fflags_t [`NUM_THREADS-1:0] fflags_fmin;
        fflags_t [`NUM_THREADS-1:0] fflags_fmax;
-        
+
+        wire fncp_valid = (valid_in && core_select == FPU_NCP);
+        wire fncp_ready = per_core_ready_out[FPU_NCP] || ~per_core_valid_out[FPU_NCP];
+
+        wire fncp_fire = fncp_valid && fncp_ready;
+                
        always @(*) begin        
            for (integer i = 0; i < `NUM_THREADS; i++) begin
-                dpi_fclss  (dataa[i], result_fclss[i]);
-                dpi_flt    (dataa[i], datab[i], result_flt[i], fflags_flt[i]);
-                dpi_fle    (dataa[i], datab[i], result_fle[i], fflags_fle[i]);
-                dpi_feq    (dataa[i], datab[i], result_feq[i], fflags_feq[i]);
-                dpi_fmin   (dataa[i], datab[i], result_fmin[i], fflags_fmin[i]);
-                dpi_fmax   (dataa[i], datab[i], result_fmax[i], fflags_fmax[i]);            
-                dpi_fsgnj  (dataa[i], datab[i], result_fsgnj[i]);
-                dpi_fsgnjn (dataa[i], datab[i], result_fsgnjn[i]);
-                dpi_fsgnjx (dataa[i], datab[i], result_fsgnjx[i]);
+                dpi_fclss  (fncp_fire, dataa[i], result_fclss[i]);
+                dpi_flt    (fncp_fire, dataa[i], datab[i], result_flt[i], fflags_flt[i]);
+                dpi_fle    (fncp_fire, dataa[i], datab[i], result_fle[i], fflags_fle[i]);
+                dpi_feq    (fncp_fire, dataa[i], datab[i], result_feq[i], fflags_feq[i]);
+                dpi_fmin   (fncp_fire, dataa[i], datab[i], result_fmin[i], fflags_fmin[i]);
+                dpi_fmax   (fncp_fire, dataa[i], datab[i], result_fmax[i], fflags_fmax[i]);            
+                dpi_fsgnj  (fncp_fire, dataa[i], datab[i], result_fsgnj[i]);
+                dpi_fsgnjn (fncp_fire, dataa[i], datab[i], result_fsgnjn[i]);
+                dpi_fsgnjx (fncp_fire, dataa[i], datab[i], result_fsgnjx[i]);
                result_fmv[i] = dataa[i];
            end
        end
@@ -357,9 +367,6 @@ module VX_fpu_dpi #(
                             is_fmax ? fflags_fmax : 
                                       0;

-        wire enable = per_core_ready_out[FPU_NCP] || ~per_core_valid_out[FPU_NCP];
-        wire valid  = (valid_in && core_select == FPU_NCP);
-
        VX_shift_register #(
            .DATAW  (1 + TAGW + 1 + `NUM_THREADS * (32 + $bits(fflags_t))),
            .DEPTH  (`LATENCY_FNCP),
@@ -367,12 +374,12 @@ module VX_fpu_dpi #(
        ) shift_reg (
            .clk      (clk),
            .reset    (reset),
-            .enable   (enable),
-            .data_in  ({valid,                       tag_in,                    has_fflags_fncp,              result_fncp,              fflags_fncp}),
+            .enable   (fncp_ready),
+            .data_in  ({fncp_valid,                  tag_in,                    has_fflags_fncp,              result_fncp,              fflags_fncp}),
            .data_out ({per_core_valid_out[FPU_NCP], per_core_tag_out[FPU_NCP], per_core_has_fflags[FPU_NCP], per_core_result[FPU_NCP], per_core_fflags[FPU_NCP]})
        );
        
-        assign per_core_ready_in[FPU_NCP] = enable;
+        assign per_core_ready_in[FPU_NCP] = fncp_ready;

    end
    endgenerate
@@ -410,6 +417,4 @@ module VX_fpu_dpi #(

    assign ready_in = per_core_ready_in[core_select];

-endmodule
-
-`endif
+endmodule
--- a/hw/rtl/fp_cores/VX_fpu_fpga.sv
+++ b/hw/rtl/fp_cores/VX_fpu_fpga.sv
@@ -1,7 +1,7 @@
-`include "VX_define.vh"
+`include "VX_fpu_define.vh"

 module VX_fpu_fpga #( 
-    parameter TAGW = 1
+    parameter TAGW = 4
 ) (
    input wire clk,
    input wire reset,
@@ -11,8 +11,8 @@ module VX_fpu_fpga #(

    input wire [TAGW-1:0] tag_in,
    
-    input wire [`FPU_BITS-1:0] op_type,
-    input wire [`MOD_BITS-1:0] frm,
+    input wire [`INST_FPU_BITS-1:0] op_type,
+    input wire [`INST_MOD_BITS-1:0] frm,

    input wire [`NUM_THREADS-1:0][31:0]  dataa,
    input wire [`NUM_THREADS-1:0][31:0]  datab,
@@ -54,19 +54,19 @@ module VX_fpu_fpga #(
        is_itof   = 0;
        is_signed = 0;
        case (op_type)
-            `FPU_ADD:    begin core_select = FPU_FMA; end
-            `FPU_SUB:    begin core_select = FPU_FMA; do_sub = 1; end
-            `FPU_MUL:    begin core_select = FPU_FMA; do_neg = 1; end
-            `FPU_MADD:   begin core_select = FPU_FMA; do_madd = 1; end
-            `FPU_MSUB:   begin core_select = FPU_FMA; do_madd = 1; do_sub = 1; end
-            `FPU_NMADD:  begin core_select = FPU_FMA; do_madd = 1; do_neg = 1; end
-            `FPU_NMSUB:  begin core_select = FPU_FMA; do_madd = 1; do_sub = 1; do_neg = 1; end
-            `FPU_DIV:    begin core_select = FPU_DIV; end
-            `FPU_SQRT:   begin core_select = FPU_SQRT; end
-            `FPU_CVTWS:  begin core_select = FPU_CVT; is_signed = 1; end
-            `FPU_CVTWUS: begin core_select = FPU_CVT; end
-            `FPU_CVTSW:  begin core_select = FPU_CVT; is_itof = 1; is_signed = 1; end
-            `FPU_CVTSWU: begin core_select = FPU_CVT; is_itof = 1; end
+            `INST_FPU_ADD:    begin core_select = FPU_FMA; end
+            `INST_FPU_SUB:    begin core_select = FPU_FMA; do_sub = 1; end
+            `INST_FPU_MUL:    begin core_select = FPU_FMA; do_neg = 1; end
+            `INST_FPU_MADD:   begin core_select = FPU_FMA; do_madd = 1; end
+            `INST_FPU_MSUB:   begin core_select = FPU_FMA; do_madd = 1; do_sub = 1; end
+            `INST_FPU_NMADD:  begin core_select = FPU_FMA; do_madd = 1; do_neg = 1; end
+            `INST_FPU_NMSUB:  begin core_select = FPU_FMA; do_madd = 1; do_sub = 1; do_neg = 1; end
+            `INST_FPU_DIV:    begin core_select = FPU_DIV; end
+            `INST_FPU_SQRT:   begin core_select = FPU_SQRT; end
+            `INST_FPU_CVTWS:  begin core_select = FPU_CVT; is_signed = 1; end
+            `INST_FPU_CVTWUS: begin core_select = FPU_CVT; end
+            `INST_FPU_CVTSW:  begin core_select = FPU_CVT; is_itof = 1; is_signed = 1; end
+            `INST_FPU_CVTSWU: begin core_select = FPU_CVT; is_itof = 1; end
            default:     begin core_select = FPU_NCP; end
        endcase
    end
--- a/hw/rtl/fp_cores/VX_fpu_fpnew.sv
+++ b/hw/rtl/fp_cores/VX_fpu_fpnew.sv
@@ -1,4 +1,4 @@
-`include "VX_define.vh"
+`include "VX_fpu_define.vh"
 `include "fpnew_pkg.sv"
 `include "defs_div_sqrt_mvp.sv"

@@ -18,8 +18,8 @@ module VX_fpu_fpnew #(

    input wire [TAGW-1:0] tag_in,
    
-    input wire [`FPU_BITS-1:0] op_type,
-    input wire [`MOD_BITS-1:0] frm,
+    input wire [`INST_FPU_BITS-1:0] op_type,
+    input wire [`INST_MOD_BITS-1:0] frm,

    input wire [`NUM_THREADS-1:0][31:0]  dataa,
    input wire [`NUM_THREADS-1:0][31:0]  datab,
@@ -80,7 +80,7 @@ module VX_fpu_fpnew #(
    fpnew_pkg::status_t [`NUM_THREADS-1:0] fpu_status;

    reg [FOP_BITS-1:0] fpu_op;
-    reg [`FRM_BITS-1:0] fpu_rnd;
+    reg [`INST_FRM_BITS-1:0] fpu_rnd;
    reg fpu_op_mod;
    reg fpu_has_fflags, fpu_has_fflags_out;

@@ -94,38 +94,38 @@ module VX_fpu_fpnew #(
        fpu_operands[2] = datac;

        case (op_type)
-            `FPU_ADD: begin
+            `INST_FPU_ADD: begin
                    fpu_op = fpnew_pkg::ADD;
                    fpu_operands[1] = dataa;
                    fpu_operands[2] = datab;
                end
-            `FPU_SUB: begin 
+            `INST_FPU_SUB: begin 
                    fpu_op = fpnew_pkg::ADD; 
                    fpu_operands[1] = dataa;
                    fpu_operands[2] = datab;
                    fpu_op_mod = 1; 
                end
-            `FPU_MUL:   begin fpu_op = fpnew_pkg::MUL; end
-            `FPU_DIV:   begin fpu_op = fpnew_pkg::DIV; end
-            `FPU_SQRT:  begin fpu_op = fpnew_pkg::SQRT; end
-            `FPU_MADD:  begin fpu_op = fpnew_pkg::FMADD; end
-            `FPU_MSUB:  begin fpu_op = fpnew_pkg::FMADD;  fpu_op_mod = 1; end            
-            `FPU_NMADD: begin fpu_op = fpnew_pkg::FNMSUB; fpu_op_mod = 1; end
-            `FPU_NMSUB: begin fpu_op = fpnew_pkg::FNMSUB; end
-            `FPU_CVTWS: begin fpu_op = fpnew_pkg::F2I; end
-            `FPU_CVTWUS:begin fpu_op = fpnew_pkg::F2I; fpu_op_mod = 1; end
-            `FPU_CVTSW: begin fpu_op = fpnew_pkg::I2F; end
-            `FPU_CVTSWU:begin fpu_op = fpnew_pkg::I2F; fpu_op_mod = 1; end
-            `FPU_CLASS: begin fpu_op = fpnew_pkg::CLASSIFY; fpu_has_fflags = 0; end
-            `FPU_CMP:   begin fpu_op = fpnew_pkg::CMP; end
-            `FPU_MISC:  begin
+            `INST_FPU_MUL:   begin fpu_op = fpnew_pkg::MUL; end
+            `INST_FPU_DIV:   begin fpu_op = fpnew_pkg::DIV; end
+            `INST_FPU_SQRT:  begin fpu_op = fpnew_pkg::SQRT; end
+            `INST_FPU_MADD:  begin fpu_op = fpnew_pkg::FMADD; end
+            `INST_FPU_MSUB:  begin fpu_op = fpnew_pkg::FMADD;  fpu_op_mod = 1; end            
+            `INST_FPU_NMADD: begin fpu_op = fpnew_pkg::FNMSUB; fpu_op_mod = 1; end
+            `INST_FPU_NMSUB: begin fpu_op = fpnew_pkg::FNMSUB; end
+            `INST_FPU_CVTWS: begin fpu_op = fpnew_pkg::F2I; end
+            `INST_FPU_CVTWUS:begin fpu_op = fpnew_pkg::F2I; fpu_op_mod = 1; end
+            `INST_FPU_CVTSW: begin fpu_op = fpnew_pkg::I2F; end
+            `INST_FPU_CVTSWU:begin fpu_op = fpnew_pkg::I2F; fpu_op_mod = 1; end
+            `INST_FPU_CLASS: begin fpu_op = fpnew_pkg::CLASSIFY; fpu_has_fflags = 0; end
+            `INST_FPU_CMP:   begin fpu_op = fpnew_pkg::CMP; end
+            `INST_FPU_MISC:  begin
                case (frm)
-                      0: begin fpu_op = fpnew_pkg::SGNJ;   fpu_rnd = `FRM_RNE; fpu_has_fflags = 0; end
-                      1: begin fpu_op = fpnew_pkg::SGNJ;   fpu_rnd = `FRM_RTZ; fpu_has_fflags = 0; end
-                      2: begin fpu_op = fpnew_pkg::SGNJ;   fpu_rnd = `FRM_RDN; fpu_has_fflags = 0; end
-                      3: begin fpu_op = fpnew_pkg::MINMAX; fpu_rnd = `FRM_RNE; end
-                      4: begin fpu_op = fpnew_pkg::MINMAX; fpu_rnd = `FRM_RTZ; end    
-                default: begin fpu_op = fpnew_pkg::SGNJ;   fpu_rnd = `FRM_RUP; fpu_has_fflags = 0; end
+                      0: begin fpu_op = fpnew_pkg::SGNJ;   fpu_rnd = `INST_FRM_RNE; fpu_has_fflags = 0; end
+                      1: begin fpu_op = fpnew_pkg::SGNJ;   fpu_rnd = `INST_FRM_RTZ; fpu_has_fflags = 0; end
+                      2: begin fpu_op = fpnew_pkg::SGNJ;   fpu_rnd = `INST_FRM_RDN; fpu_has_fflags = 0; end
+                      3: begin fpu_op = fpnew_pkg::MINMAX; fpu_rnd = `INST_FRM_RNE; end
+                      4: begin fpu_op = fpnew_pkg::MINMAX; fpu_rnd = `INST_FRM_RTZ; end    
+                default: begin fpu_op = fpnew_pkg::SGNJ;   fpu_rnd = `INST_FRM_RUP; fpu_has_fflags = 0; end
                endcase    
            end
            default:;
--- a/hw/rtl/fp_cores/VX_fpu_types.vh
+++ b/hw/rtl/fp_cores/VX_fpu_types.vh
@@ -0,0 +1,32 @@
+`ifndef VX_FPU_TYPES
+`define VX_FPU_TYPES
+// Declares package fpu_types: the fp_class_t and fflags_t structs plus their bit-width macros.
+`include "VX_define.vh"
+
+package fpu_types;
+// Classification flags for one floating-point operand (the output type of VX_fp_class).
+typedef struct packed {
+    logic is_normal;    // exponent is neither all-zeros nor all-ones
+    logic is_zero;      // exponent all-zeros and mantissa all-zeros
+    logic is_subnormal; // exponent all-zeros and mantissa non-zero
+    logic is_inf;       // exponent all-ones and mantissa all-zeros
+    logic is_nan;       // exponent all-ones and mantissa non-zero
+    logic is_quiet;     // NaN that is not signaling
+    logic is_signaling; // NaN whose mantissa MSB is 0
+} fp_class_t;
+// Width in bits of fp_class_t.
+`define FP_CLASS_BITS  $bits(fpu_types::fp_class_t)
+// Exception flag bundle; packed so NV is the most significant bit (4) and NX the least (0).
+typedef struct packed {
+    logic NV; // 4-Invalid
+    logic DZ; // 3-Divide by zero
+    logic OF; // 2-Overflow
+    logic UF; // 1-Underflow
+    logic NX; // 0-Inexact
+} fflags_t;
+// Width in bits of fflags_t.
+`define FFLAGS_BITS  $bits(fpu_types::fflags_t)
+
+endpackage // fpu_types
+
+`endif // VX_FPU_TYPES