Vortex 2.0 changes:

+ Microarchitecture optimizations + 64-bit support + Xilinx FPGA support + LLVM-16 support + Refactoring and quality control fixes
2023-10-19 20:51:22 -07:00
parent d69a64c32c
commit d47cccc157
1300 changed files with 247321 additions and 311189 deletions
--- a/hw/rtl/fpu/VX_fpu_class.sv
+++ b/hw/rtl/fpu/VX_fpu_class.sv
@@ -0,0 +1,45 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+
+`include "VX_fpu_define.vh"
+
+`ifdef FPU_DSP
+// Classifies a floating-point operand from its exponent (exp_i) and mantissa (man_i) fields.
+module VX_fpu_class import VX_fpu_pkg::*; #(
+    parameter MAN_BITS = 23,
+    parameter EXP_BITS = 8
+) (
+    input  [EXP_BITS-1:0] exp_i,
+    input  [MAN_BITS-1:0] man_i,
+    output fclass_t       clss_o
+);
+    // Field-level predicates shared by all class flags
+    wire exp_all_zeros = (exp_i == '0);
+    wire exp_all_ones  = (exp_i == '1);
+    wire man_all_zeros = (man_i == '0);
+    // NaN: exponent all ones with a non-zero mantissa; a clear mantissa MSB marks it signaling
+    wire nan_w  = exp_all_ones & ~man_all_zeros;
+    wire snan_w = nan_w & ~man_i[MAN_BITS-1];
+
+    assign clss_o.is_normal    = ~exp_all_zeros & ~exp_all_ones;
+    assign clss_o.is_zero      = exp_all_zeros & man_all_zeros;
+    assign clss_o.is_subnormal = exp_all_zeros & ~man_all_zeros;
+    assign clss_o.is_inf       = exp_all_ones & man_all_zeros;
+    assign clss_o.is_nan       = nan_w;
+    assign clss_o.is_quiet     = nan_w & ~snan_w;
+    assign clss_o.is_signaling = snan_w;
+
+endmodule
+`endif
+
--- a/hw/rtl/fpu/VX_fpu_cvt.sv
+++ b/hw/rtl/fpu/VX_fpu_cvt.sv
@@ -0,0 +1,464 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+`include "VX_fpu_define.vh"
+
+`ifdef FPU_DSP
+
+/// FP32 <-> INT32 conversion unit; modified port of the cast module from the fpnew library.
+/// reference: https://github.com/pulp-platform/fpnew
+
+module VX_fpu_cvt import VX_fpu_pkg::*; #(
+    parameter NUM_LANES = 1,    // number of 32-bit lanes processed in parallel
+    parameter TAGW = 1          // width of the tag carried through the pipeline
+) (
+    input wire clk,
+    input wire reset,
+
+    output wire ready_in,       // driven as ~stall
+    input wire  valid_in,
+
+    input wire [NUM_LANES-1:0] lane_mask,   // lanes whose flags are merged into fflags
+
+    input wire [TAGW-1:0] tag_in,
+
+    input wire [`INST_FRM_BITS-1:0] frm,    // rounding mode, forwarded to VX_fpu_rounding
+
+    input wire is_itof,         // 1: integer-to-float, 0: float-to-integer
+    input wire is_signed,       // integer side is treated as signed
+
+    input wire [NUM_LANES-1:0][31:0]  dataa,
+    output wire [NUM_LANES-1:0][31:0] result,
+
+    output wire has_fflags,     // tied to 1
+    output wire [`FP_FLAGS_BITS-1:0] fflags,
+
+    output wire [TAGW-1:0] tag_out,
+
+    input wire  ready_out,
+    output wire valid_out
+);
+    // Constants
+
+    localparam MAN_BITS = 23;
+    localparam EXP_BITS = 8;
+    localparam EXP_BIAS = 2**(EXP_BITS-1)-1;
+
+    localparam logic [EXP_BITS-1:0] QNAN_EXPONENT = 2**EXP_BITS-1;
+    localparam logic [MAN_BITS-1:0] QNAN_MANTISSA = 2**(MAN_BITS-1);
+
+    // Use 32-bit integer
+    localparam MAX_INT_WIDTH = 32;
+
+    // The internal mantissa includes normal bit or an entire integer
+    localparam INT_MAN_WIDTH = `MAX(MAN_BITS + 1, MAX_INT_WIDTH);
+
+    // Width of the leading-zero count computed over the internal mantissa
+    localparam LZC_RESULT_WIDTH = `CLOG2(INT_MAN_WIDTH);
+
+    // The internal exponent must be able to represent the smallest denormal input value as signed
+    // or the number of bits in an integer
+    localparam INT_EXP_WIDTH = `MAX(`CLOG2(MAX_INT_WIDTH), `MAX(EXP_BITS, `CLOG2(EXP_BIAS + MAN_BITS))) + 1;
+
+    // shift amount for denormalization
+    localparam SHAMT_BITS = `CLOG2(INT_MAN_WIDTH+1);
+
+    localparam FMT_SHIFT_COMPENSATION = INT_MAN_WIDTH - 1 - MAN_BITS;
+    localparam NUM_FP_STICKY  = 2 * INT_MAN_WIDTH - MAN_BITS - 1;   // removed mantissa, 1. and R
+    localparam NUM_INT_STICKY = 2 * INT_MAN_WIDTH - MAX_INT_WIDTH;  // removed int and R
+
+    // Input processing
+
+    fclass_t [NUM_LANES-1:0] fclass;
+
+    for (genvar i = 0; i < NUM_LANES; ++i) begin
+        VX_fpu_class #(
+            .EXP_BITS (EXP_BITS),
+            .MAN_BITS (MAN_BITS)
+        ) fp_class (
+            .exp_i  (dataa[i][30:23]),
+            .man_i  (dataa[i][22:0]),
+            .clss_o (fclass[i])
+        );
+    end
+
+    wire [NUM_LANES-1:0][INT_MAN_WIDTH-1:0] input_mant;
+    wire [NUM_LANES-1:0][INT_EXP_WIDTH-1:0] input_exp;
+    wire [NUM_LANES-1:0]                    input_sign;
+
+    for (genvar i = 0; i < NUM_LANES; ++i) begin
+        wire [INT_MAN_WIDTH-1:0] int_mantissa;
+        wire [INT_MAN_WIDTH-1:0] fmt_mantissa;
+        wire fmt_sign        = dataa[i][31];
+        wire int_sign        = dataa[i][31] && is_signed;
+        assign int_mantissa  = int_sign ? (-dataa[i]) : dataa[i];   // magnitude of the integer operand
+        assign fmt_mantissa  = INT_MAN_WIDTH'({fclass[i].is_normal, dataa[i][MAN_BITS-1:0]});
+        assign input_exp[i]  = {1'b0, dataa[i][MAN_BITS +: EXP_BITS]} + INT_EXP_WIDTH'({1'b0, fclass[i].is_subnormal});
+        assign input_mant[i] = is_itof ? int_mantissa : fmt_mantissa;
+        assign input_sign[i] = is_itof ? int_sign : fmt_sign;
+    end
+
+    // Pipeline stage0
+
+    wire                    valid_in_s0;
+    wire [NUM_LANES-1:0]    lane_mask_s0;
+    wire [TAGW-1:0]         tag_in_s0;
+    wire                    is_itof_s0;
+    wire                    unsigned_s0;
+    wire [`INST_FRM_BITS-1:0] rnd_mode_s0;  // same width as frm and as counted in DATAW
+    fclass_t [NUM_LANES-1:0] fclass_s0;
+    wire [NUM_LANES-1:0]    input_sign_s0;
+    wire [NUM_LANES-1:0][INT_EXP_WIDTH-1:0] fmt_exponent_s0;
+    wire [NUM_LANES-1:0][INT_MAN_WIDTH-1:0] encoded_mant_s0;
+
+    wire stall;
+
+    VX_pipe_register #(
+        .DATAW  (1 + NUM_LANES + TAGW + 1 + `INST_FRM_BITS + 1 + NUM_LANES * ($bits(fclass_t) + 1 + INT_EXP_WIDTH + INT_MAN_WIDTH)),
+        .RESETW (1)
+    ) pipe_reg0 (
+        .clk      (clk),
+        .reset    (reset),
+        .enable   (~stall),
+        .data_in  ({valid_in, lane_mask, tag_in, is_itof, !is_signed, frm, fclass, input_sign, input_exp, input_mant}),
+        .data_out ({valid_in_s0, lane_mask_s0, tag_in_s0, is_itof_s0, unsigned_s0, rnd_mode_s0, fclass_s0, input_sign_s0, fmt_exponent_s0, encoded_mant_s0})
+    );
+
+    // Normalization
+
+    wire [NUM_LANES-1:0][LZC_RESULT_WIDTH-1:0] renorm_shamt_s0; // renormalization shift amount
+    wire [NUM_LANES-1:0] mant_is_zero_s0;                       // for integer zeroes
+
+    for (genvar i = 0; i < NUM_LANES; ++i) begin
+        wire mant_is_nonzero_s0;
+        VX_lzc #(
+            .N (INT_MAN_WIDTH)
+        ) lzc (
+            .data_in   (encoded_mant_s0[i]),
+            .data_out  (renorm_shamt_s0[i]),
+            .valid_out (mant_is_nonzero_s0)
+        );
+        assign mant_is_zero_s0[i] = ~mant_is_nonzero_s0;
+    end
+
+    wire [NUM_LANES-1:0][INT_MAN_WIDTH-1:0] input_mant_n_s0;    // normalized input mantissa
+    wire [NUM_LANES-1:0][INT_EXP_WIDTH-1:0] input_exp_n_s0;     // unbiased true exponent
+
+    for (genvar i = 0; i < NUM_LANES; ++i) begin
+       // Realign input mantissa, append zeroes if destination is wider
+        assign input_mant_n_s0[i] = encoded_mant_s0[i] << renorm_shamt_s0[i];
+
+        // Unbias exponent and compensate for shift
+        wire [INT_EXP_WIDTH-1:0] fp_input_exp_s0 = fmt_exponent_s0[i] + INT_EXP_WIDTH'(FMT_SHIFT_COMPENSATION - EXP_BIAS) - INT_EXP_WIDTH'({1'b0, renorm_shamt_s0[i]});
+        wire [INT_EXP_WIDTH-1:0] int_input_exp_s0 = INT_EXP_WIDTH'(INT_MAN_WIDTH-1) - INT_EXP_WIDTH'({1'b0, renorm_shamt_s0[i]});
+        assign input_exp_n_s0[i] = is_itof_s0 ? int_input_exp_s0 : fp_input_exp_s0;
+    end
+
+    // Pipeline stage1
+
+    wire                    valid_in_s1;
+    wire [NUM_LANES-1:0]    lane_mask_s1;
+    wire [TAGW-1:0]         tag_in_s1;
+    wire                    is_itof_s1;
+    wire                    unsigned_s1;
+    wire [`INST_FRM_BITS-1:0] rnd_mode_s1;
+    fclass_t [NUM_LANES-1:0] fclass_s1;
+    wire [NUM_LANES-1:0]    input_sign_s1;
+    wire [NUM_LANES-1:0]    mant_is_zero_s1;
+    wire [NUM_LANES-1:0][INT_MAN_WIDTH-1:0] input_mant_s1;
+    wire [NUM_LANES-1:0][INT_EXP_WIDTH-1:0] input_exp_s1;
+
+    VX_pipe_register #(
+        .DATAW  (1 + NUM_LANES + TAGW + 1 + `INST_FRM_BITS + 1 + NUM_LANES * ($bits(fclass_t) + 1 + 1 + INT_MAN_WIDTH + INT_EXP_WIDTH)),
+        .RESETW (1)
+    ) pipe_reg1 (
+        .clk      (clk),
+        .reset    (reset),
+        .enable   (~stall),
+        .data_in  ({valid_in_s0, lane_mask_s0, tag_in_s0, is_itof_s0, unsigned_s0, rnd_mode_s0, fclass_s0, input_sign_s0, mant_is_zero_s0, input_mant_n_s0, input_exp_n_s0}),
+        .data_out ({valid_in_s1, lane_mask_s1, tag_in_s1, is_itof_s1, unsigned_s1, rnd_mode_s1, fclass_s1, input_sign_s1, mant_is_zero_s1, input_mant_s1, input_exp_s1})
+    );
+
+    // Perform adjustments to mantissa and exponent
+
+    wire [NUM_LANES-1:0][2*INT_MAN_WIDTH:0] destination_mant_s1;
+    wire [NUM_LANES-1:0][INT_EXP_WIDTH-1:0] final_exp_s1;
+    wire [NUM_LANES-1:0]                    of_before_round_s1;
+
+    for (genvar i = 0; i < NUM_LANES; ++i) begin
+        reg [2*INT_MAN_WIDTH:0] preshift_mant_s1;   // mantissa before final shift
+        reg [SHAMT_BITS-1:0]    denorm_shamt_s1;    // shift amount for denormalization
+        reg [INT_EXP_WIDTH-1:0] final_exp_tmp_s1;   // after eventual adjustments
+        reg                     of_before_round_tmp_s1;
+
+        always @(*) begin
+            final_exp_tmp_s1 = input_exp_s1[i] + INT_EXP_WIDTH'(EXP_BIAS); // take exponent as is, only look at lower bits
+            preshift_mant_s1 = {input_mant_s1[i], {(INT_MAN_WIDTH+1){1'b0}}}; // pad to the full 2*INT_MAN_WIDTH+1 bits
+            denorm_shamt_s1  = '0;
+            of_before_round_tmp_s1 = 1'b0;
+
+            if (is_itof_s1) begin
+                if ($signed(input_exp_s1[i]) >= INT_EXP_WIDTH'($signed(2**EXP_BITS-1-EXP_BIAS))) begin
+                    // Overflow or infinities (for proper rounding)
+                    final_exp_tmp_s1 = (2**EXP_BITS-2); // largest normal value
+                    preshift_mant_s1 = '1;  // largest normal value and RS bits set
+                    of_before_round_tmp_s1 = 1'b1;
+                end else if ($signed(input_exp_s1[i]) < INT_EXP_WIDTH'($signed(-MAN_BITS-EXP_BIAS))) begin
+                    // Limit the shift to retain sticky bits
+                    final_exp_tmp_s1 = '0; // denormal result
+                    denorm_shamt_s1  = (2 + MAN_BITS); // to sticky
+                end else if ($signed(input_exp_s1[i]) < INT_EXP_WIDTH'($signed(1-EXP_BIAS))) begin
+                    // Denormalize underflowing values
+                    final_exp_tmp_s1 = '0; // denormal result
+                    denorm_shamt_s1  = SHAMT_BITS'(1-EXP_BIAS) - SHAMT_BITS'(input_exp_s1[i]); // adjust right shifting
+                end
+            end else begin
+                if ($signed(input_exp_s1[i]) >= $signed(INT_EXP_WIDTH'(MAX_INT_WIDTH-1) + INT_EXP_WIDTH'(unsigned_s1))) begin
+                    // overflow: when converting to unsigned the range is larger by one
+                    of_before_round_tmp_s1 = 1'b1;
+                end else if ($signed(input_exp_s1[i]) < INT_EXP_WIDTH'($signed(-1))) begin
+                    // underflow
+                    denorm_shamt_s1 = MAX_INT_WIDTH+1; // all bits go to the sticky
+                end else begin
+                    // By default right shift mantissa to be an integer
+                    denorm_shamt_s1 = SHAMT_BITS'(MAX_INT_WIDTH-1) - SHAMT_BITS'(input_exp_s1[i]);
+                end
+            end
+        end
+
+        assign destination_mant_s1[i] = preshift_mant_s1 >> denorm_shamt_s1;
+        assign final_exp_s1[i]        = final_exp_tmp_s1;
+        assign of_before_round_s1[i]  = of_before_round_tmp_s1;
+    end
+
+    // Pipeline stage2
+
+    wire                    valid_in_s2;
+    wire [NUM_LANES-1:0]    lane_mask_s2;
+    wire [TAGW-1:0]         tag_in_s2;
+    wire                    is_itof_s2;
+    wire                    unsigned_s2;
+    wire [`INST_FRM_BITS-1:0] rnd_mode_s2;
+    fclass_t [NUM_LANES-1:0] fclass_s2;
+    wire [NUM_LANES-1:0]    mant_is_zero_s2;
+    wire [NUM_LANES-1:0]    input_sign_s2;
+    wire [NUM_LANES-1:0][2*INT_MAN_WIDTH:0] destination_mant_s2;
+    wire [NUM_LANES-1:0][INT_EXP_WIDTH-1:0] final_exp_s2;
+    wire [NUM_LANES-1:0]    of_before_round_s2;
+
+    VX_pipe_register #(
+        .DATAW  (1 + NUM_LANES + TAGW + 1 + 1 + `INST_FRM_BITS + NUM_LANES * ($bits(fclass_t) + 1 + 1 + (2*INT_MAN_WIDTH+1) + INT_EXP_WIDTH + 1)),
+        .RESETW (1)
+    ) pipe_reg2 (
+        .clk      (clk),
+        .reset    (reset),
+        .enable   (~stall),
+        .data_in  ({valid_in_s1, lane_mask_s1, tag_in_s1, is_itof_s1, unsigned_s1, rnd_mode_s1, fclass_s1, mant_is_zero_s1, input_sign_s1, destination_mant_s1, final_exp_s1, of_before_round_s1}),
+        .data_out ({valid_in_s2, lane_mask_s2, tag_in_s2, is_itof_s2, unsigned_s2, rnd_mode_s2, fclass_s2, mant_is_zero_s2, input_sign_s2, destination_mant_s2, final_exp_s2, of_before_round_s2})
+    );
+
+    wire [NUM_LANES-1:0]       rounded_sign_s2;
+    wire [NUM_LANES-1:0][31:0] rounded_abs_s2;      // absolute value of result after rounding
+    wire [NUM_LANES-1:0]       int_round_has_sticky_s2;
+    wire [NUM_LANES-1:0]       fp_round_has_sticky_s2;
+
+    // Rounding and classification
+
+    for (genvar i = 0; i < NUM_LANES; ++i) begin
+        wire [MAN_BITS-1:0]      final_mant_s2;        // mantissa after adjustments
+        wire [MAX_INT_WIDTH-1:0] final_int_s2;         // integer shifted in position
+        wire [1:0]               round_sticky_bits_s2;
+        wire [31:0]              fmt_pre_round_abs_s2;
+        wire [31:0]              pre_round_abs_s2;
+        wire [1:0]               int_round_sticky_bits_s2, fp_round_sticky_bits_s2;
+
+        // Extract final mantissa and round bit, discard the normal bit (for FP)
+        assign {final_mant_s2, fp_round_sticky_bits_s2[1]} = destination_mant_s2[i][2*INT_MAN_WIDTH-1 : 2*INT_MAN_WIDTH-1 - (MAN_BITS+1) + 1];
+        assign {final_int_s2, int_round_sticky_bits_s2[1]} = destination_mant_s2[i][2*INT_MAN_WIDTH   : 2*INT_MAN_WIDTH   - (MAX_INT_WIDTH+1) + 1];
+
+        // Collapse sticky bits
+        assign fp_round_sticky_bits_s2[0]  = (| destination_mant_s2[i][NUM_FP_STICKY-1:0]);
+        assign int_round_sticky_bits_s2[0] = (| destination_mant_s2[i][NUM_INT_STICKY-1:0]);
+        assign fp_round_has_sticky_s2[i]   = (| fp_round_sticky_bits_s2);
+        assign int_round_has_sticky_s2[i]  = (| int_round_sticky_bits_s2);
+
+        // select RS bits for destination operation
+        assign round_sticky_bits_s2 = is_itof_s2 ? fp_round_sticky_bits_s2 : int_round_sticky_bits_s2;
+
+        // Pack exponent and mantissa into proper rounding form
+        assign fmt_pre_round_abs_s2 = {1'b0, final_exp_s2[i][EXP_BITS-1:0], final_mant_s2[MAN_BITS-1:0]};
+
+        // Select output with destination format and operation
+        assign pre_round_abs_s2 = is_itof_s2 ? fmt_pre_round_abs_s2 : final_int_s2;
+
+        // Perform the rounding
+        VX_fpu_rounding #(
+            .DAT_WIDTH (32)
+        ) fp_rounding (
+            .abs_value_i (pre_round_abs_s2),
+            .sign_i      (input_sign_s2[i]),
+            .round_sticky_bits_i (round_sticky_bits_s2),
+            .rnd_mode_i  (rnd_mode_s2),
+            .effective_subtraction_i (1'b0),
+            .abs_rounded_o (rounded_abs_s2[i]),
+            .sign_o      (rounded_sign_s2[i]),
+            `UNUSED_PIN  (exact_zero_o)
+        );
+    end
+
+    // Pipeline stage3
+
+    wire                 valid_in_s3;
+    wire [NUM_LANES-1:0] lane_mask_s3;
+    wire [TAGW-1:0]      tag_in_s3;
+    wire                 is_itof_s3;
+    wire                 unsigned_s3;
+    fclass_t [NUM_LANES-1:0] fclass_s3;
+    wire [NUM_LANES-1:0] mant_is_zero_s3;
+    wire [NUM_LANES-1:0] input_sign_s3;
+    wire [NUM_LANES-1:0] rounded_sign_s3;
+    wire [NUM_LANES-1:0][31:0] rounded_abs_s3;
+    wire [NUM_LANES-1:0] of_before_round_s3;
+    wire [NUM_LANES-1:0] int_round_has_sticky_s3;
+    wire [NUM_LANES-1:0] fp_round_has_sticky_s3;
+
+    VX_pipe_register #(
+        .DATAW  (1 + NUM_LANES + TAGW + 1 + 1 + NUM_LANES * ($bits(fclass_t) + 1 + 1 + 32 + 1 + 1 + 1 + 1)),
+        .RESETW (1)
+    ) pipe_reg3 (
+        .clk      (clk),
+        .reset    (reset),
+        .enable   (~stall),
+        .data_in  ({valid_in_s2, lane_mask_s2, tag_in_s2, is_itof_s2, unsigned_s2, fclass_s2, mant_is_zero_s2, input_sign_s2, rounded_abs_s2, rounded_sign_s2, of_before_round_s2, int_round_has_sticky_s2, fp_round_has_sticky_s2}),
+        .data_out ({valid_in_s3, lane_mask_s3, tag_in_s3, is_itof_s3, unsigned_s3, fclass_s3, mant_is_zero_s3, input_sign_s3, rounded_abs_s3, rounded_sign_s3, of_before_round_s3, int_round_has_sticky_s3, fp_round_has_sticky_s3})
+    );
+
+    wire [NUM_LANES-1:0] of_after_round_s3;
+    wire [NUM_LANES-1:0] uf_after_round_s3;
+    wire [NUM_LANES-1:0][31:0] fmt_result_s3;
+    wire [NUM_LANES-1:0][31:0] rounded_int_res_s3; // after possible inversion
+    wire [NUM_LANES-1:0] rounded_int_res_zero_s3;  // after rounding
+
+    for (genvar i = 0; i < NUM_LANES; ++i) begin
+        // Assemble regular FP result; an integer zero operand is forced to all-zero bits
+        assign fmt_result_s3[i] = (is_itof_s3 & mant_is_zero_s3[i]) ? 0 : {rounded_sign_s3[i], rounded_abs_s3[i][EXP_BITS+MAN_BITS-1:0]};
+
+        // Classification after rounding select by destination format
+        assign uf_after_round_s3[i] = (rounded_abs_s3[i][EXP_BITS+MAN_BITS-1:MAN_BITS] == 0);  // denormal
+        assign of_after_round_s3[i] = (rounded_abs_s3[i][EXP_BITS+MAN_BITS-1:MAN_BITS] == '1); // inf exp. ('1 sizes to the slice; a 32-bit ~0 never matches)
+
+        // Negative integer result needs to be brought into two's complement
+        assign rounded_int_res_s3[i] = rounded_sign_s3[i] ? (-rounded_abs_s3[i]) : rounded_abs_s3[i];
+        assign rounded_int_res_zero_s3[i] = (rounded_int_res_s3[i] == 0);
+    end
+
+    // FP Special case handling
+
+    wire [NUM_LANES-1:0][31:0] fp_special_result_s3;
+    fflags_t [NUM_LANES-1:0]   fp_special_status_s3;
+    wire [NUM_LANES-1:0]       fp_result_is_special_s3;
+
+    for (genvar i = 0; i < NUM_LANES; ++i) begin
+        // Detect special case from source format, I2F casts don't produce a special result
+        assign fp_result_is_special_s3[i] = ~is_itof_s3 & (fclass_s3[i].is_zero | fclass_s3[i].is_nan);
+
+        // Signalling input NaNs raise invalid flag, otherwise no flags set
+        assign fp_special_status_s3[i] = fclass_s3[i].is_signaling ? {1'b1, 4'h0} : 5'h0; // invalid operation
+
+        // Assemble result according to destination format, using this lane's own sign bit
+        assign fp_special_result_s3[i] = fclass_s3[i].is_zero ? (32'(input_sign_s3[i]) << 31) // signed zero
+                                                              : {1'b0, QNAN_EXPONENT, QNAN_MANTISSA}; // qNaN
+    end
+
+    // INT Special case handling
+
+    reg [NUM_LANES-1:0][31:0] int_special_result_s3;
+    fflags_t [NUM_LANES-1:0]  int_special_status_s3;
+    wire [NUM_LANES-1:0]      int_result_is_special_s3;
+
+    for (genvar i = 0; i < NUM_LANES; ++i) begin
+         // Assemble saturated result according to destination format
+        always @(*) begin
+            if (input_sign_s3[i] && !fclass_s3[i].is_nan) begin
+                int_special_result_s3[i][30:0] = '0;            // negative: 0 (unsigned) or -2**31 (signed)
+                int_special_result_s3[i][31]   = ~unsigned_s3;  // sign bit set only for signed casts
+            end else begin
+                int_special_result_s3[i][30:0] = '1;            // alone yields 2**31-1 (signed maximum)
+                int_special_result_s3[i][31]   = unsigned_s3;   // for unsigned casts yields 2**32-1
+            end
+        end
+
+        // Detect special case from source format (inf, nan, overflow, nan-boxing or negative unsigned)
+        assign int_result_is_special_s3[i] = fclass_s3[i].is_nan
+                                           | fclass_s3[i].is_inf
+                                           | of_before_round_s3[i]
+                                           | (input_sign_s3[i] & unsigned_s3 & ~rounded_int_res_zero_s3[i]);
+
+        // All integer special cases are invalid
+        assign int_special_status_s3[i] = {1'b1, 4'h0};
+    end
+
+    // Result selection and Output handshake
+
+    fflags_t [NUM_LANES-1:0] tmp_fflags_s3;
+    wire [NUM_LANES-1:0][31:0] tmp_result_s3;
+
+    for (genvar i = 0; i < NUM_LANES; ++i) begin
+        fflags_t fp_regular_status_s3, int_regular_status_s3;
+        fflags_t fp_status_s3, int_status_s3;
+        wire [31:0] fp_result_s3, int_result_s3;
+
+        wire inexact_s3 = is_itof_s3 ? fp_round_has_sticky_s3[i] // overflow is invalid in i2f;
+                                     : (fp_round_has_sticky_s3[i] || (~fclass_s3[i].is_inf && (of_before_round_s3[i] || of_after_round_s3[i])));
+
+        assign fp_regular_status_s3.NV = is_itof_s3 & (of_before_round_s3[i] | of_after_round_s3[i]); // overflow is invalid for I2F casts
+        assign fp_regular_status_s3.DZ = 1'b0; // no divisions
+        assign fp_regular_status_s3.OF = ~is_itof_s3 & (~fclass_s3[i].is_inf & (of_before_round_s3[i] | of_after_round_s3[i])); // inf casts no OF
+        assign fp_regular_status_s3.UF = uf_after_round_s3[i] & inexact_s3;
+        assign fp_regular_status_s3.NX = inexact_s3;
+
+        assign int_regular_status_s3 = int_round_has_sticky_s3[i] ? {4'h0, 1'b1} : 5'h0; // inexact only
+
+        assign fp_result_s3  = fp_result_is_special_s3[i]  ? fp_special_result_s3[i]  : fmt_result_s3[i];
+        assign int_result_s3 = int_result_is_special_s3[i] ? int_special_result_s3[i] : rounded_int_res_s3[i];
+
+        assign fp_status_s3  = fp_result_is_special_s3[i]  ? fp_special_status_s3[i]  : fp_regular_status_s3;
+        assign int_status_s3 = int_result_is_special_s3[i] ? int_special_status_s3[i] : int_regular_status_s3;
+
+        // Select output by conversion direction: FP path for I2F, integer path for F2I
+        assign tmp_result_s3[i] = is_itof_s3 ? fp_result_s3 : int_result_s3;
+        assign tmp_fflags_s3[i] = is_itof_s3 ? fp_status_s3 : int_status_s3;
+    end
+
+    assign stall = ~ready_out && valid_out; // hold every stage while a valid output is not accepted
+
+    fflags_t fflags_merged;
+    `FPU_MERGE_FFLAGS(fflags_merged, tmp_fflags_s3, lane_mask_s3, NUM_LANES);
+
+    VX_pipe_register #(
+        .DATAW  (1 + TAGW + (NUM_LANES * 32) + `FP_FLAGS_BITS),
+        .RESETW (1)
+    ) pipe_reg4 (
+        .clk      (clk),
+        .reset    (reset),
+        .enable   (!stall),
+        .data_in  ({valid_in_s3, tag_in_s3, tmp_result_s3, fflags_merged}),
+        .data_out ({valid_out, tag_out, result, fflags})
+    );
+
+    assign ready_in = ~stall;
+
+    assign has_fflags = 1'b1;
+
+endmodule
+`endif
--- a/hw/rtl/fpu/VX_fpu_define.vh
+++ b/hw/rtl/fpu/VX_fpu_define.vh
@@ -0,0 +1,42 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+`ifndef VX_FPU_DEFINE_VH
+`define VX_FPU_DEFINE_VH
+
+`include "VX_define.vh"
+
+`ifndef SYNTHESIS
+`include "float_dpi.vh"
+`endif
+
+// Declares __<out>, ORs the NX/UF/OF/DZ/NV fields of in[i] for every lane i < lanes with mask[i] set, then assigns it to out.
+`define FPU_MERGE_FFLAGS(out, in, mask, lanes) \
+    fflags_t __``out; \
+    always @(*) begin \
+        __``out = '0; \
+        for (integer __i = 0; __i < lanes; ++__i) begin \
+            if (mask[__i]) begin \
+                __``out.NX |= in[__i].NX; \
+                __``out.UF |= in[__i].UF; \
+                __``out.OF |= in[__i].OF; \
+                __``out.DZ |= in[__i].DZ; \
+                __``out.NV |= in[__i].NV; \
+            end \
+        end \
+    end \
+    assign out = __``out
+`define FP_CLASS_BITS   $bits(VX_fpu_pkg::fclass_t)
+`define FP_FLAGS_BITS   $bits(VX_fpu_pkg::fflags_t)
+
+`endif // VX_FPU_DEFINE_VH
--- a/hw/rtl/fpu/VX_fpu_div.sv
+++ b/hw/rtl/fpu/VX_fpu_div.sv
@@ -0,0 +1,137 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+`include "VX_fpu_define.vh"
+
+`ifdef FPU_DSP
+// FP32 divider: one divide per lane via acl_fdiv (QUARTUS), xil_fdiv (VIVADO) or the dpi_fdiv model otherwise.
+module VX_fpu_div import VX_fpu_pkg::*; #(
+    parameter NUM_LANES = 1,
+    parameter TAGW = 1
+) (
+    input wire clk,
+    input wire reset, 
+
+    output wire ready_in,
+    input wire  valid_in,
+
+    input wire [NUM_LANES-1:0] lane_mask,
+
+    input wire [TAGW-1:0] tag_in,
+
+    input wire [`INST_FRM_BITS-1:0] frm,
+    
+    input wire [NUM_LANES-1:0][31:0]  dataa,
+    input wire [NUM_LANES-1:0][31:0]  datab,
+    output wire [NUM_LANES-1:0][31:0] result, 
+
+    output wire has_fflags,
+    output wire [`FP_FLAGS_BITS-1:0] fflags,
+
+    output wire [TAGW-1:0] tag_out,
+
+    input wire  ready_out,
+    output wire valid_out
+);
+    `UNUSED_VAR (frm) // frm is only referenced by the dpi_fdiv call in the final branch
+
+    wire stall  = ~ready_out && valid_out; // a valid output is pending and not accepted
+    wire enable = ~stall;                  // gates the shift registers and the divider cores
+
+    fflags_t [NUM_LANES-1:0] per_lane_fflags;
+    wire [NUM_LANES-1:0] lane_mask_out;
+
+    VX_shift_register #( // carries valid, lane mask and tag alongside the divide (DEPTH = LATENCY_FDIV)
+        .DATAW  (1 + NUM_LANES + TAGW),
+        .DEPTH  (`LATENCY_FDIV),
+        .RESETW (1)
+    ) shift_reg (
+        .clk      (clk),
+        .reset    (reset),
+        .enable   (enable),
+        .data_in  ({valid_in, lane_mask, tag_in}),
+        .data_out ({valid_out, lane_mask_out, tag_out})
+    );
+
+    assign ready_in = enable;
+
+`ifdef QUARTUS
+    // QUARTUS build: one acl_fdiv instance per lane
+    for (genvar i = 0; i < NUM_LANES; ++i) begin
+        acl_fdiv fdiv (
+            .clk    (clk),
+            .areset (1'b0),
+            .en     (enable),
+            .a      (dataa[i]),
+            .b      (datab[i]),
+            .q      (result[i])
+        );
+    end    
+    
+    assign has_fflags = 0;       // this branch reports no exception flags
+    assign per_lane_fflags = 'x; // left undefined since has_fflags is 0
+
+`elsif VIVADO
+    // VIVADO build: one xil_fdiv instance per lane, flags taken from m_axis_result_tuser
+    for (genvar i = 0; i < NUM_LANES; ++i) begin
+        wire [3:0] tuser;
+
+        xil_fdiv fdiv (
+            .aclk                (clk),
+            .aclken              (enable),
+            .s_axis_a_tvalid     (1'b1),
+            .s_axis_a_tdata      (dataa[i]),
+            .s_axis_b_tvalid     (1'b1),
+            .s_axis_b_tdata      (datab[i]),
+            `UNUSED_PIN (m_axis_result_tvalid),
+            .m_axis_result_tdata (result[i]),
+            .m_axis_result_tuser (tuser)
+        );
+                                    // NV, DZ, OF, UF, NX (NX is tied to 0 in this branch)
+        assign per_lane_fflags[i] = {tuser[2], tuser[3], tuser[1], tuser[0], 1'b0};
+    end
+
+     assign has_fflags = 1;
+
+`else    
+    // Default build: dpi_fdiv model per lane, result and flags delayed by a shift register
+    for (genvar i = 0; i < NUM_LANES; ++i) begin       
+        reg [63:0] r;
+        `UNUSED_VAR (r)
+        // only r[31:0] is forwarded below
+        fflags_t f;
+
+        always @(*) begin        // operands are widened to 64 bits with all-ones upper halves (presumably NaN-boxing - confirm)
+            dpi_fdiv (enable && valid_in, int'(0), {32'hffffffff, dataa[i]}, {32'hffffffff,  datab[i]}, frm, r, f);
+        end
+
+        VX_shift_register #(
+            .DATAW  (32 + $bits(fflags_t)),
+            .DEPTH  (`LATENCY_FDIV)
+        ) shift_req_dpi (
+            .clk      (clk),
+            `UNUSED_PIN (reset),
+            .enable   (enable),
+            .data_in  ({r[31:0], f}),
+            .data_out ({result[i], per_lane_fflags[i]})
+        );
+    end
+
+    assign has_fflags = 1;
+
+`endif
+// Merge per-lane flags of the lanes enabled in the delayed lane mask
+`FPU_MERGE_FFLAGS(fflags, per_lane_fflags, lane_mask_out, NUM_LANES);
+
+endmodule
+`endif
--- a/hw/rtl/fpu/VX_fpu_dpi.sv
+++ b/hw/rtl/fpu/VX_fpu_dpi.sv
@@ -0,0 +1,490 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+`include "VX_fpu_define.vh"
+
+`ifdef FPU_DPI
+
+module VX_fpu_dpi import VX_fpu_pkg::*; #( 
+    parameter NUM_LANES = 1,
+    parameter TAGW      = 1,
+    parameter OUT_REG   = 0
+) (
+    input wire clk,
+    input wire reset,
+
+    input wire  valid_in,
+    output wire ready_in,
+
+    input wire [NUM_LANES-1:0] lane_mask,
+
+    input wire [TAGW-1:0] tag_in,
+    
+    input wire [`INST_FPU_BITS-1:0] op_type,
+    input wire [`INST_FMT_BITS-1:0] fmt,
+    input wire [`INST_FRM_BITS-1:0] frm,
+
+    input wire [NUM_LANES-1:0][`XLEN-1:0]  dataa,
+    input wire [NUM_LANES-1:0][`XLEN-1:0]  datab,
+    input wire [NUM_LANES-1:0][`XLEN-1:0]  datac,
+    output wire [NUM_LANES-1:0][`XLEN-1:0] result, 
+
+    output wire has_fflags,
+    output wire [`FP_FLAGS_BITS-1:0] fflags,
+
+    output wire [TAGW-1:0] tag_out,
+
+    input wire  ready_out,
+    output wire valid_out
+);
+    localparam FPU_FMA     = 0;
+    localparam FPU_DIVSQRT = 1;
+    localparam FPU_CVT     = 2;
+    localparam FPU_NCP     = 3;
+    localparam NUM_FPC     = 4;
+    localparam FPC_BITS    = `LOG2UP(NUM_FPC);
+
+    localparam RSP_DATAW = (NUM_LANES * `XLEN) + 1 + $bits(fflags_t) + TAGW;
+    
+    wire [NUM_FPC-1:0] per_core_ready_in;
+    wire [NUM_FPC-1:0][NUM_LANES-1:0][`XLEN-1:0] per_core_result;
+    wire [NUM_FPC-1:0][TAGW-1:0] per_core_tag_out;
+    reg  [NUM_FPC-1:0] per_core_ready_out;
+    wire [NUM_FPC-1:0] per_core_valid_out;    
+    wire [NUM_FPC-1:0] per_core_has_fflags;  
+    fflags_t [NUM_FPC-1:0] per_core_fflags;  
+
+    wire div_ready_in, sqrt_ready_in;
+    wire [NUM_LANES-1:0][`XLEN-1:0] div_result, sqrt_result;
+    wire [TAGW-1:0] div_tag_out, sqrt_tag_out;
+    wire div_ready_out, sqrt_ready_out;
+    wire div_valid_out, sqrt_valid_out;    
+    wire div_has_fflags, sqrt_has_fflags;  
+    fflags_t div_fflags, sqrt_fflags;
+
+    reg [FPC_BITS-1:0] core_select;
+
+    reg is_fadd, is_fsub, is_fmul, is_fmadd, is_fmsub, is_fnmadd, is_fnmsub;
+    reg is_div, is_fcmp, is_itof, is_utof, is_ftoi, is_ftou, is_f2f;    
+    reg dst_fmt, int_fmt;
+
+    // Per-lane copies of the three source operands, cast to 64 bits:
+    // operands[0] <- dataa, operands[1] <- datab, operands[2] <- datac.
+    // These are the values handed to the dpi_* calls in the unit blocks.
+    reg [NUM_LANES-1:0][63:0] operands [3];
+
+    always @(*) begin
+        for (integer lane = 0; lane < NUM_LANES; ++lane) begin
+            operands[2][lane] = 64'(datac[lane]);
+            operands[1][lane] = 64'(datab[lane]);
+            operands[0][lane] = 64'(dataa[lane]);
+        end
+    end
+
+    `UNUSED_VAR (fmt)
+
+    // Operation decode: picks the target unit (core_select), raises the strobe
+    // of the requested operation and derives the two format selectors.
+    always @(*) begin
+        // dst_fmt follows fmt[0] only when FLEN_64 is defined, otherwise it is 0
+    `ifdef FLEN_64
+        dst_fmt = fmt[0];
+    `else
+        dst_fmt = 1'b0;
+    `endif
+
+        // int_fmt follows fmt[1] only when XLEN_64 is defined, otherwise it is 0
+    `ifdef XLEN_64
+        int_fmt = fmt[1];
+    `else
+        int_fmt = 1'b0;
+    `endif
+
+        // every strobe is idle unless the matching arm below raises it
+        is_fadd   = 1'b0;
+        is_fsub   = 1'b0;
+        is_fmul   = 1'b0;
+        is_fmadd  = 1'b0;
+        is_fmsub  = 1'b0;
+        is_fnmadd = 1'b0;
+        is_fnmsub = 1'b0;
+        is_div    = 1'b0;
+        is_fcmp   = 1'b0;
+        is_itof   = 1'b0;
+        is_utof   = 1'b0;
+        is_ftoi   = 1'b0;
+        is_ftou   = 1'b0;
+        is_f2f    = 1'b0;
+
+        case (op_type)
+            `INST_FPU_ADD: begin
+                core_select = FPU_FMA;
+                is_fadd     = 1'b1;
+            end
+            `INST_FPU_SUB: begin
+                core_select = FPU_FMA;
+                is_fsub     = 1'b1;
+            end
+            `INST_FPU_MUL: begin
+                core_select = FPU_FMA;
+                is_fmul     = 1'b1;
+            end
+            `INST_FPU_MADD: begin
+                core_select = FPU_FMA;
+                is_fmadd    = 1'b1;
+            end
+            `INST_FPU_MSUB: begin
+                core_select = FPU_FMA;
+                is_fmsub    = 1'b1;
+            end
+            `INST_FPU_NMADD: begin
+                core_select = FPU_FMA;
+                is_fnmadd   = 1'b1;
+            end
+            `INST_FPU_NMSUB: begin
+                core_select = FPU_FMA;
+                is_fnmsub   = 1'b1;
+            end
+            `INST_FPU_DIV: begin
+                core_select = FPU_DIVSQRT;
+                is_div      = 1'b1;
+            end
+            `INST_FPU_SQRT: begin
+                // square root shares the DIVSQRT slot; it is told apart by is_div == 0
+                core_select = FPU_DIVSQRT;
+            end
+            `INST_FPU_CMP: begin
+                core_select = FPU_NCP;
+                is_fcmp     = 1'b1;
+            end
+            `INST_FPU_F2I: begin
+                core_select = FPU_CVT;
+                is_ftoi     = 1'b1;
+            end
+            `INST_FPU_F2U: begin
+                core_select = FPU_CVT;
+                is_ftou     = 1'b1;
+            end
+            `INST_FPU_I2F: begin
+                core_select = FPU_CVT;
+                is_itof     = 1'b1;
+            end
+            `INST_FPU_U2F: begin
+                core_select = FPU_CVT;
+                is_utof     = 1'b1;
+            end
+            `INST_FPU_F2F: begin
+                core_select = FPU_CVT;
+                is_f2f      = 1'b1;
+            end
+            default: begin
+                // any other op_type is routed to the NCP unit with no strobe raised
+                core_select = FPU_NCP;
+            end
+        endcase
+    end
+
+    // FMA unit model: for every lane all seven add/sub/mul/fused operations are
+    // evaluated through DPI calls, the one matching the decoded strobe is
+    // selected, and the selection is pushed through a VX_shift_register of
+    // depth `LATENCY_FMA together with the valid bit and the request tag.
+    generate 
+    begin : fma
+
+        reg [NUM_LANES-1:0][`XLEN-1:0] result_fma;
+        // DPI output arguments are assigned procedurally inside always @(*),
+        // so their targets are declared as variables (reg), not nets.
+        reg [NUM_LANES-1:0][63:0] result_fadd;
+        reg [NUM_LANES-1:0][63:0] result_fsub;
+        reg [NUM_LANES-1:0][63:0] result_fmul;
+        reg [NUM_LANES-1:0][63:0] result_fmadd;
+        reg [NUM_LANES-1:0][63:0] result_fmsub;
+        reg [NUM_LANES-1:0][63:0] result_fnmadd;
+        reg [NUM_LANES-1:0][63:0] result_fnmsub;
+
+        fflags_t [NUM_LANES-1:0] fflags_fma;
+        fflags_t [NUM_LANES-1:0] fflags_fadd;
+        fflags_t [NUM_LANES-1:0] fflags_fsub;
+        fflags_t [NUM_LANES-1:0] fflags_fmul;
+        fflags_t [NUM_LANES-1:0] fflags_fmadd;
+        fflags_t [NUM_LANES-1:0] fflags_fmsub;
+        fflags_t [NUM_LANES-1:0] fflags_fnmadd;
+        fflags_t [NUM_LANES-1:0] fflags_fnmsub;
+
+        // request handshake: the pipeline advances when its output is either
+        // being consumed or currently empty
+        wire fma_valid = (valid_in && core_select == FPU_FMA);
+        wire fma_ready = per_core_ready_out[FPU_FMA] || ~per_core_valid_out[FPU_FMA];
+        wire fma_fire  = fma_valid && fma_ready;
+
+        always @(*) begin
+            for (integer i = 0; i < NUM_LANES; ++i) begin
+                // first argument is the enable passed to the DPI routine
+                dpi_fadd   (fma_fire, int'(dst_fmt), operands[0][i], operands[1][i], frm, result_fadd[i], fflags_fadd[i]);
+                dpi_fsub   (fma_fire, int'(dst_fmt), operands[0][i], operands[1][i], frm, result_fsub[i], fflags_fsub[i]);
+                dpi_fmul   (fma_fire, int'(dst_fmt), operands[0][i], operands[1][i], frm, result_fmul[i], fflags_fmul[i]);
+                dpi_fmadd  (fma_fire, int'(dst_fmt), operands[0][i], operands[1][i], operands[2][i], frm, result_fmadd[i], fflags_fmadd[i]);
+                dpi_fmsub  (fma_fire, int'(dst_fmt), operands[0][i], operands[1][i], operands[2][i], frm, result_fmsub[i], fflags_fmsub[i]);
+                dpi_fnmadd (fma_fire, int'(dst_fmt), operands[0][i], operands[1][i], operands[2][i], frm, result_fnmadd[i], fflags_fnmadd[i]);
+                dpi_fnmsub (fma_fire, int'(dst_fmt), operands[0][i], operands[1][i], operands[2][i], frm, result_fnmsub[i], fflags_fnmsub[i]);
+
+                // keep the low XLEN bits of the result matching the decoded strobe
+                result_fma[i] = is_fadd   ? result_fadd[i][`XLEN-1:0] :
+                                is_fsub   ? result_fsub[i][`XLEN-1:0] :
+                                is_fmul   ? result_fmul[i][`XLEN-1:0] :
+                                is_fmadd  ? result_fmadd[i][`XLEN-1:0] :
+                                is_fmsub  ? result_fmsub[i][`XLEN-1:0] :
+                                is_fnmadd ? result_fnmadd[i][`XLEN-1:0] :
+                                is_fnmsub ? result_fnmsub[i][`XLEN-1:0] :
+                                            '0;
+
+                fflags_fma[i] = is_fadd   ? fflags_fadd[i] :
+                                is_fsub   ? fflags_fsub[i] :
+                                is_fmul   ? fflags_fmul[i] :
+                                is_fmadd  ? fflags_fmadd[i] :
+                                is_fmsub  ? fflags_fmsub[i] :
+                                is_fnmadd ? fflags_fnmadd[i] :
+                                is_fnmsub ? fflags_fnmsub[i] :
+                                            '0;
+            end
+        end
+
+        // per-lane flags are combined under lane_mask before entering the pipeline
+        fflags_t fflags_merged;
+        `FPU_MERGE_FFLAGS(fflags_merged, fflags_fma, lane_mask, NUM_LANES);
+
+        VX_shift_register #(
+            .DATAW  (1 + TAGW + NUM_LANES * `XLEN + $bits(fflags_t)),
+            .DEPTH  (`LATENCY_FMA),
+            .RESETW (1)
+        ) shift_reg (
+            .clk      (clk),
+            .reset    (reset),
+            .enable   (fma_ready),
+            .data_in  ({fma_valid, tag_in, result_fma, fflags_merged}),
+            .data_out ({per_core_valid_out[FPU_FMA], per_core_tag_out[FPU_FMA], per_core_result[FPU_FMA], per_core_fflags[FPU_FMA]})
+        );
+
+        assign per_core_has_fflags[FPU_FMA] = 1;
+        assign per_core_ready_in[FPU_FMA] = fma_ready;
+
+    end
+    endgenerate
+
+    // Divider model: one dpi_fdiv call per lane, with the low XLEN bits of the
+    // result pushed through a VX_shift_register of depth `LATENCY_FDIV along
+    // with the valid bit, the request tag and the merged exception flags.
+    generate 
+    begin : fdiv
+
+        reg [NUM_LANES-1:0][`XLEN-1:0] result_fdiv_r;
+        // DPI output argument, assigned procedurally: must be a variable, not a net
+        reg [NUM_LANES-1:0][63:0] result_fdiv;
+        fflags_t [NUM_LANES-1:0] fflags_fdiv;
+
+        // divide requests are DIVSQRT requests with is_div raised by the decoder
+        wire fdiv_valid = (valid_in && core_select == FPU_DIVSQRT) && is_div;
+        wire fdiv_ready = div_ready_out || ~div_valid_out;
+        wire fdiv_fire  = fdiv_valid && fdiv_ready;
+
+        always @(*) begin
+            for (integer i = 0; i < NUM_LANES; ++i) begin
+                dpi_fdiv (fdiv_fire, int'(dst_fmt), operands[0][i], operands[1][i], frm, result_fdiv[i], fflags_fdiv[i]);
+                result_fdiv_r[i] = result_fdiv[i][`XLEN-1:0];
+            end
+        end
+
+        // per-lane flags are combined under lane_mask before entering the pipeline
+        fflags_t fflags_merged;
+        `FPU_MERGE_FFLAGS(fflags_merged, fflags_fdiv, lane_mask, NUM_LANES);
+
+        VX_shift_register #(
+            .DATAW  (1 + TAGW + NUM_LANES * `XLEN + $bits(fflags_t)),
+            .DEPTH  (`LATENCY_FDIV),
+            .RESETW (1)
+        ) shift_reg (
+            .clk      (clk),
+            .reset    (reset),
+            .enable   (fdiv_ready),
+            .data_in  ({fdiv_valid, tag_in, result_fdiv_r, fflags_merged}),
+            .data_out ({div_valid_out, div_tag_out, div_result, div_fflags})
+        );
+
+        assign div_has_fflags = 1;
+        assign div_ready_in = fdiv_ready;
+
+    end
+    endgenerate
+
+    // Square-root model: one dpi_fsqrt call per lane, with the low XLEN bits of
+    // the result pushed through a VX_shift_register of depth `LATENCY_FSQRT
+    // along with the valid bit, the request tag and the merged exception flags.
+    generate 
+    begin : fsqrt
+
+        reg [NUM_LANES-1:0][`XLEN-1:0] result_fsqrt_r;
+        // DPI output argument, assigned procedurally: must be a variable, not a net
+        reg [NUM_LANES-1:0][63:0] result_fsqrt;
+        fflags_t [NUM_LANES-1:0] fflags_fsqrt;
+
+        // square-root requests are DIVSQRT requests with is_div low
+        wire fsqrt_valid = (valid_in && core_select == FPU_DIVSQRT) && ~is_div;
+        wire fsqrt_ready = sqrt_ready_out || ~sqrt_valid_out;
+        wire fsqrt_fire  = fsqrt_valid && fsqrt_ready;
+
+        always @(*) begin
+            for (integer i = 0; i < NUM_LANES; ++i) begin
+                dpi_fsqrt (fsqrt_fire, int'(dst_fmt), operands[0][i], frm, result_fsqrt[i], fflags_fsqrt[i]);
+                result_fsqrt_r[i] = result_fsqrt[i][`XLEN-1:0];
+            end
+        end
+
+        // per-lane flags are combined under lane_mask before entering the pipeline
+        fflags_t fflags_merged;
+        `FPU_MERGE_FFLAGS(fflags_merged, fflags_fsqrt, lane_mask, NUM_LANES);
+
+        VX_shift_register #(
+            .DATAW  (1 + TAGW + NUM_LANES * `XLEN + $bits(fflags_t)),
+            .DEPTH  (`LATENCY_FSQRT),
+            .RESETW (1)
+        ) shift_reg (
+            .clk      (clk),
+            .reset    (reset),
+            .enable   (fsqrt_ready),
+            .data_in  ({fsqrt_valid, tag_in, result_fsqrt_r, fflags_merged}),
+            .data_out ({sqrt_valid_out, sqrt_tag_out, sqrt_result, sqrt_fflags})
+        );
+
+        assign sqrt_has_fflags = 1;
+        assign sqrt_ready_in = fsqrt_ready;
+
+    end
+    endgenerate
+
+    // Conversion unit model: int->float, uint->float, float->int, float->uint
+    // and float->float are all evaluated per lane through DPI calls; the one
+    // matching the decoded strobe is selected and pushed through a
+    // VX_shift_register of depth `LATENCY_FCVT with the valid bit and tag.
+    generate
+    begin : fcvt
+
+        reg [NUM_LANES-1:0][`XLEN-1:0] result_fcvt;
+        // DPI output arguments are assigned procedurally inside always @(*),
+        // so their targets are declared as variables (reg), not nets.
+        reg [NUM_LANES-1:0][63:0] result_itof;
+        reg [NUM_LANES-1:0][63:0] result_utof;
+        reg [NUM_LANES-1:0][63:0] result_ftoi;
+        reg [NUM_LANES-1:0][63:0] result_ftou;
+        reg [NUM_LANES-1:0][63:0] result_f2f;
+
+        fflags_t [NUM_LANES-1:0] fflags_fcvt;
+        fflags_t [NUM_LANES-1:0] fflags_itof;
+        fflags_t [NUM_LANES-1:0] fflags_utof;
+        fflags_t [NUM_LANES-1:0] fflags_ftoi;
+        fflags_t [NUM_LANES-1:0] fflags_ftou;
+
+        wire fcvt_valid = (valid_in && core_select == FPU_CVT);
+        wire fcvt_ready = per_core_ready_out[FPU_CVT] || ~per_core_valid_out[FPU_CVT];
+        wire fcvt_fire  = fcvt_valid && fcvt_ready;
+
+        always @(*) begin
+            for (integer i = 0; i < NUM_LANES; ++i) begin
+                // note the swapped format arguments between the *tof and fto* calls
+                dpi_itof (fcvt_fire, int'(dst_fmt), int'(int_fmt), operands[0][i], frm, result_itof[i], fflags_itof[i]);
+                dpi_utof (fcvt_fire, int'(dst_fmt), int'(int_fmt), operands[0][i], frm, result_utof[i], fflags_utof[i]);
+                dpi_ftoi (fcvt_fire, int'(int_fmt), int'(dst_fmt), operands[0][i], frm, result_ftoi[i], fflags_ftoi[i]);
+                dpi_ftou (fcvt_fire, int'(int_fmt), int'(dst_fmt), operands[0][i], frm, result_ftou[i], fflags_ftou[i]);
+                dpi_f2f  (fcvt_fire, int'(dst_fmt), operands[0][i], result_f2f[i]);
+
+                result_fcvt[i] = is_itof ? result_itof[i][`XLEN-1:0] :
+                                is_utof ? result_utof[i][`XLEN-1:0] :
+                                is_ftoi ? result_ftoi[i][`XLEN-1:0] :
+                                is_ftou ? result_ftou[i][`XLEN-1:0] :
+                                is_f2f  ? result_f2f[i][`XLEN-1:0] :
+                                        '0;
+
+                // dpi_f2f returns no flags, so float->float falls through to '0
+                fflags_fcvt[i] = is_itof ? fflags_itof[i] :
+                                is_utof ? fflags_utof[i] :
+                                is_ftoi ? fflags_ftoi[i] :
+                                is_ftou ? fflags_ftou[i] :
+                                       '0;
+            end
+        end
+
+        // per-lane flags are combined under lane_mask before entering the pipeline
+        fflags_t fflags_merged;
+        `FPU_MERGE_FFLAGS(fflags_merged, fflags_fcvt, lane_mask, NUM_LANES);
+
+        VX_shift_register #(
+            .DATAW  (1 + TAGW + NUM_LANES * `XLEN + $bits(fflags_t)),
+            .DEPTH  (`LATENCY_FCVT),
+            .RESETW (1)
+        ) shift_reg (
+            .clk      (clk),
+            .reset    (reset),
+            .enable   (fcvt_ready),
+            .data_in  ({fcvt_valid, tag_in, result_fcvt, fflags_merged}),
+            .data_out ({per_core_valid_out[FPU_CVT], per_core_tag_out[FPU_CVT], per_core_result[FPU_CVT], per_core_fflags[FPU_CVT]})
+        );
+
+        assign per_core_has_fflags[FPU_CVT] = 1;
+        assign per_core_ready_in[FPU_CVT] = fcvt_ready;
+
+    end
+    endgenerate
+
+    // NCP unit model: classify, compare, min/max, sign-injection and move
+    // operations. Every candidate is evaluated per lane (DPI calls or inline
+    // expressions); frm then acts as the sub-operation selector and is_fcmp
+    // chooses between compare and sign-injection for selector values 0..2.
+    // The selection goes through a VX_shift_register of depth `LATENCY_FNCP.
+    generate 
+    begin : fncp
+
+        reg [NUM_LANES-1:0][`XLEN-1:0]  result_fncp;
+        // DPI output arguments are assigned procedurally inside always @(*),
+        // so their targets are declared as variables (reg), not nets.
+        reg [NUM_LANES-1:0][63:0] result_fclss;
+        reg [NUM_LANES-1:0][63:0] result_flt;
+        reg [NUM_LANES-1:0][63:0] result_fle;
+        reg [NUM_LANES-1:0][63:0] result_feq;
+        reg [NUM_LANES-1:0][63:0] result_fmin;
+        reg [NUM_LANES-1:0][63:0] result_fmax;
+        reg [NUM_LANES-1:0][63:0] result_fsgnj;
+        reg [NUM_LANES-1:0][63:0] result_fsgnjn;
+        reg [NUM_LANES-1:0][63:0] result_fsgnjx;
+        reg [NUM_LANES-1:0][63:0] result_fmvx;
+        reg [NUM_LANES-1:0][63:0] result_fmvf;
+
+        fflags_t [NUM_LANES-1:0] fflags_fncp;
+        fflags_t [NUM_LANES-1:0] fflags_flt;
+        fflags_t [NUM_LANES-1:0] fflags_fle;
+        fflags_t [NUM_LANES-1:0] fflags_feq;
+        fflags_t [NUM_LANES-1:0] fflags_fmin;
+        fflags_t [NUM_LANES-1:0] fflags_fmax;
+
+        wire fncp_valid = (valid_in && core_select == FPU_NCP);
+        wire fncp_ready = per_core_ready_out[FPU_NCP] || ~per_core_valid_out[FPU_NCP];
+        wire fncp_fire  = fncp_valid && fncp_ready;
+
+        always @(*) begin
+            for (integer i = 0; i < NUM_LANES; ++i) begin
+                dpi_fclss  (fncp_fire, int'(dst_fmt), operands[0][i], result_fclss[i]);
+                dpi_fle    (fncp_fire, int'(dst_fmt), operands[0][i], operands[1][i], result_fle[i], fflags_fle[i]);
+                dpi_flt    (fncp_fire, int'(dst_fmt), operands[0][i], operands[1][i], result_flt[i], fflags_flt[i]);
+                dpi_feq    (fncp_fire, int'(dst_fmt), operands[0][i], operands[1][i], result_feq[i], fflags_feq[i]);
+                dpi_fmin   (fncp_fire, int'(dst_fmt), operands[0][i], operands[1][i], result_fmin[i], fflags_fmin[i]);
+                dpi_fmax   (fncp_fire, int'(dst_fmt), operands[0][i], operands[1][i], result_fmax[i], fflags_fmax[i]);
+                dpi_fsgnj  (fncp_fire, int'(dst_fmt), operands[0][i], operands[1][i], result_fsgnj[i]);
+                dpi_fsgnjn (fncp_fire, int'(dst_fmt), operands[0][i], operands[1][i], result_fsgnjn[i]);
+                dpi_fsgnjx (fncp_fire, int'(dst_fmt), operands[0][i], operands[1][i], result_fsgnjx[i]);
+                result_fmvx[i] = dst_fmt ? operands[0][i] : 64'($signed(operands[0][i][31:0]));      // sign-extension
+                result_fmvf[i] = dst_fmt ? operands[0][i] : (operands[0][i] | 64'hffffffff00000000); // nan-boxing
+            end
+        end
+
+        always @(*) begin
+            // selectors 3..5 never drive fflags_fncp, so it keeps the 'x default there;
+            // has_fflags_fncp is low for exactly those cases
+            result_fncp = 'x;
+            fflags_fncp = 'x;
+            for (integer i = 0; i < NUM_LANES; ++i) begin
+                case (frm)
+                0:  begin result_fncp[i] = is_fcmp ? result_fle[i][`XLEN-1:0] : result_fsgnj[i][`XLEN-1:0];  fflags_fncp[i] = fflags_fle[i]; end
+                1:  begin result_fncp[i] = is_fcmp ? result_flt[i][`XLEN-1:0] : result_fsgnjn[i][`XLEN-1:0]; fflags_fncp[i] = fflags_flt[i]; end
+                2:  begin result_fncp[i] = is_fcmp ? result_feq[i][`XLEN-1:0] : result_fsgnjx[i][`XLEN-1:0]; fflags_fncp[i] = fflags_feq[i]; end
+                3:  begin result_fncp[i] = result_fclss[i][`XLEN-1:0]; end
+                4:  begin result_fncp[i] = result_fmvx[i][`XLEN-1:0]; end
+                5:  begin result_fncp[i] = result_fmvf[i][`XLEN-1:0]; end
+                6:  begin result_fncp[i] = result_fmin[i][`XLEN-1:0]; fflags_fncp[i] = fflags_fmin[i]; end
+                7:  begin result_fncp[i] = result_fmax[i][`XLEN-1:0]; fflags_fncp[i] = fflags_fmax[i]; end
+                endcase
+            end
+        end
+
+        // per-lane flags are combined under lane_mask before entering the pipeline
+        fflags_t fflags_merged;
+        `FPU_MERGE_FFLAGS(fflags_merged, fflags_fncp, lane_mask, NUM_LANES);
+
+        // flags are reported only for compares and for selectors 6 and 7 (min/max)
+        wire has_fflags_fncp = (frm >= 6) || is_fcmp;
+
+        VX_shift_register #(
+            .DATAW  (1 + TAGW + 1 + NUM_LANES * `XLEN + $bits(fflags_t)),
+            .DEPTH  (`LATENCY_FNCP),
+            .RESETW (1)
+        ) shift_reg (
+            .clk      (clk),
+            .reset    (reset),
+            .enable   (fncp_ready),
+            .data_in  ({fncp_valid, tag_in, has_fflags_fncp, result_fncp, fflags_merged}),
+            .data_out ({per_core_valid_out[FPU_NCP], per_core_tag_out[FPU_NCP], per_core_has_fflags[FPU_NCP], per_core_result[FPU_NCP], per_core_fflags[FPU_NCP]})
+        );
+
+        assign per_core_ready_in[FPU_NCP] = fncp_ready;
+
+    end
+    endgenerate
+
+    ///////////////////////////////////////////////////////////////////////////
+
+    // The DIVSQRT slot reports the readiness of whichever sub-unit the current
+    // request decodes to (is_div selects the divider, otherwise square root).
+    assign per_core_ready_in[FPU_DIVSQRT] = is_div ? div_ready_in : sqrt_ready_in;
+
+    // Merges the divider and square-root response streams into the single
+    // FPU_DIVSQRT entry of the per_core_* arrays. In the concatenations below
+    // the divider occupies input 0 and the square root input 1. Each payload
+    // is {result, has_fflags, fflags, tag}, i.e. RSP_DATAW bits.
+    // NOTE(review): ARBITER "R" is presumably round-robin - confirm in VX_stream_arb.
+    VX_stream_arb #(
+        .NUM_INPUTS (2),
+        .DATAW      (RSP_DATAW), 
+        .ARBITER    ("R"),
+        .OUT_REG    (0)
+    ) div_sqrt_arb (
+        .clk       (clk),
+        .reset     (reset),
+        .valid_in  ({sqrt_valid_out, div_valid_out}), 
+        .ready_in  ({sqrt_ready_out, div_ready_out}),
+        .data_in   ({{sqrt_result, sqrt_has_fflags, sqrt_fflags, sqrt_tag_out}, 
+                     {div_result, div_has_fflags, div_fflags, div_tag_out}}),
+        .data_out  ({per_core_result[FPU_DIVSQRT], per_core_has_fflags[FPU_DIVSQRT], per_core_fflags[FPU_DIVSQRT], per_core_tag_out[FPU_DIVSQRT]}),
+        .valid_out (per_core_valid_out[FPU_DIVSQRT]),
+        .ready_out (per_core_ready_out[FPU_DIVSQRT]),
+        `UNUSED_PIN (sel_out)
+    );
+
+    ///////////////////////////////////////////////////////////////////////////
+
+    // Response payload of each unit, packed as {result, has_fflags, fflags, tag}
+    // so that it lines up with the data_out concatenation of rsp_arb below.
+    wire [NUM_FPC-1:0][RSP_DATAW-1:0] per_core_data_out;
+
+    for (genvar i = 0; i < NUM_FPC; ++i) begin
+        assign per_core_data_out[i] = {per_core_result[i], per_core_has_fflags[i], per_core_fflags[i], per_core_tag_out[i]};
+    end
+
+    // Selects one of the NUM_FPC unit responses and drives the module outputs.
+    // Its ready_in port produces per_core_ready_out, the back-pressure seen by
+    // each unit. OUT_REG is forwarded from the module parameter.
+    VX_stream_arb #(
+        .NUM_INPUTS (NUM_FPC),
+        .DATAW      (RSP_DATAW), 
+        .ARBITER    ("R"),
+        .OUT_REG    (OUT_REG)
+    ) rsp_arb (
+        .clk       (clk),
+        .reset     (reset),
+        .valid_in  (per_core_valid_out), 
+        .ready_in  (per_core_ready_out),
+        .data_in   (per_core_data_out),
+        .data_out  ({result, has_fflags, fflags, tag_out}),
+        .valid_out (valid_out),
+        .ready_out (ready_out),
+        `UNUSED_PIN (sel_out)
+    );
+
+    // a request is accepted when the unit it decodes to is ready
+    assign ready_in = per_core_ready_in[core_select];
+
+endmodule
+`endif
--- a/hw/rtl/fpu/VX_fpu_dsp.sv
+++ b/hw/rtl/fpu/VX_fpu_dsp.sv
@@ -0,0 +1,325 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+`include "VX_fpu_define.vh"
+
+`ifdef FPU_DSP
+
+module VX_fpu_dsp import VX_fpu_pkg::*; #(
+    parameter NUM_LANES = 4, 
+    parameter TAGW      = 4,
+    parameter OUT_REG   = 0
+) (
+    input wire clk,
+    input wire reset,
+
+    input wire  valid_in,
+    output wire ready_in,
+
+    input wire [NUM_LANES-1:0] lane_mask,
+
+    input wire [TAGW-1:0] tag_in,
+    
+    input wire [`INST_FPU_BITS-1:0] op_type,
+    input wire [`INST_FMT_BITS-1:0] fmt,
+    input wire [`INST_FRM_BITS-1:0] frm,
+
+    input wire [NUM_LANES-1:0][`XLEN-1:0]  dataa,
+    input wire [NUM_LANES-1:0][`XLEN-1:0]  datab,
+    input wire [NUM_LANES-1:0][`XLEN-1:0]  datac,
+    output wire [NUM_LANES-1:0][`XLEN-1:0] result, 
+
+    output wire has_fflags,
+    output wire [`FP_FLAGS_BITS-1:0] fflags,
+
+    output wire [TAGW-1:0] tag_out,
+
+    input wire  ready_out,
+    output wire valid_out
+);
+    localparam FPU_FMA     = 0;
+    localparam FPU_DIVSQRT = 1;
+    localparam FPU_CVT     = 2;
+    localparam FPU_NCP     = 3;
+    localparam NUM_FPC     = 4;
+    localparam FPC_BITS    = `LOG2UP(NUM_FPC);
+
+    localparam RSP_DATAW = (NUM_LANES * 32) + 1 + $bits(fflags_t) + TAGW;
+
+    `UNUSED_VAR (fmt)    
+
+    wire [NUM_FPC-1:0] per_core_ready_in;
+    wire [NUM_FPC-1:0][NUM_LANES-1:0][31:0] per_core_result;
+    wire [NUM_FPC-1:0][TAGW-1:0] per_core_tag_out;
+    wire [NUM_FPC-1:0] per_core_ready_out;
+    wire [NUM_FPC-1:0] per_core_valid_out;    
+    wire [NUM_FPC-1:0] per_core_has_fflags;  
+    fflags_t [NUM_FPC-1:0] per_core_fflags;
+
+    wire div_ready_in, sqrt_ready_in;
+    wire [NUM_LANES-1:0][31:0] div_result, sqrt_result;
+    wire [TAGW-1:0] div_tag_out, sqrt_tag_out;
+    wire div_ready_out, sqrt_ready_out;
+    wire div_valid_out, sqrt_valid_out;    
+    wire div_has_fflags, sqrt_has_fflags;  
+    fflags_t div_fflags, sqrt_fflags;
+
+    reg [FPC_BITS-1:0] core_select;
+    reg is_madd, is_sub, is_neg, is_div, is_itof, is_signed;
+
+    // Operation decode: picks the target unit (core_select) and the modifier
+    // flags forwarded to the FMA, DIV/SQRT and CVT units.
+    always @(*) begin
+        // all modifiers are low unless the matching arm below raises them
+        is_madd   = 1'b0;
+        is_sub    = 1'b0;
+        is_neg    = 1'b0;
+        is_div    = 1'b0;
+        is_itof   = 1'b0;
+        is_signed = 1'b0;
+
+        case (op_type)
+            `INST_FPU_ADD: begin
+                core_select = FPU_FMA;
+            end
+            `INST_FPU_SUB: begin
+                core_select = FPU_FMA;
+                is_sub      = 1'b1;
+            end
+            `INST_FPU_MUL: begin
+                // NOTE(review): MUL raises is_neg with is_madd low; presumably
+                // VX_fpu_fma decodes that pair as a plain multiply - confirm there
+                core_select = FPU_FMA;
+                is_neg      = 1'b1;
+            end
+            `INST_FPU_MADD: begin
+                core_select = FPU_FMA;
+                is_madd     = 1'b1;
+            end
+            `INST_FPU_MSUB: begin
+                core_select = FPU_FMA;
+                is_madd     = 1'b1;
+                is_sub      = 1'b1;
+            end
+            `INST_FPU_NMADD: begin
+                core_select = FPU_FMA;
+                is_madd     = 1'b1;
+                is_neg      = 1'b1;
+            end
+            `INST_FPU_NMSUB: begin
+                core_select = FPU_FMA;
+                is_madd     = 1'b1;
+                is_sub      = 1'b1;
+                is_neg      = 1'b1;
+            end
+            `INST_FPU_DIV: begin
+                core_select = FPU_DIVSQRT;
+                is_div      = 1'b1;
+            end
+            `INST_FPU_SQRT: begin
+                // square root shares the DIVSQRT slot; it is told apart by is_div == 0
+                core_select = FPU_DIVSQRT;
+            end
+            `INST_FPU_F2I: begin
+                core_select = FPU_CVT;
+                is_signed   = 1'b1;
+            end
+            `INST_FPU_F2U: begin
+                core_select = FPU_CVT;
+            end
+            `INST_FPU_I2F: begin
+                core_select = FPU_CVT;
+                is_itof     = 1'b1;
+                is_signed   = 1'b1;
+            end
+            `INST_FPU_U2F: begin
+                core_select = FPU_CVT;
+                is_itof     = 1'b1;
+            end
+            default: begin
+                // every remaining op_type is handled by the NCP unit
+                core_select = FPU_NCP;
+            end
+        endcase
+    end
+
+    // One reset signal per functional unit, each derived from the module reset.
+    // NOTE(review): RESET_RELAY is defined outside this file; presumably it
+    // declares the first argument as a relayed copy of the second - confirm.
+    `RESET_RELAY (fma_reset, reset);
+    `RESET_RELAY (div_reset, reset);
+    `RESET_RELAY (sqrt_reset, reset);
+    `RESET_RELAY (cvt_reset, reset);
+    `RESET_RELAY (ncp_reset, reset);
+
+    // 32-bit views of the XLEN-wide source operands: every unit instantiated in
+    // this module is fed the low word of each lane only.
+    wire [NUM_LANES-1:0][31:0] dataa_s;
+    wire [NUM_LANES-1:0][31:0] datab_s;
+    wire [NUM_LANES-1:0][31:0] datac_s;
+
+    for (genvar lane = 0; lane < NUM_LANES; ++lane) begin
+        assign datac_s[lane] = datac[lane][31:0];
+        assign datab_s[lane] = datab[lane][31:0];
+        assign dataa_s[lane] = dataa[lane][31:0];
+    end
+
+    // any operand bits above bit 31 are intentionally left unread
+    `UNUSED_VAR (dataa)
+    `UNUSED_VAR (datab)
+    `UNUSED_VAR (datac)
+
+    // Fused multiply-add unit. It sees a valid request only when the decoder
+    // selected FPU_FMA; is_madd/is_sub/is_neg carry the decoded variant and the
+    // response lands in the FPU_FMA entry of the per_core_* arrays.
+    VX_fpu_fma #(
+        .NUM_LANES (NUM_LANES),
+        .TAGW      (TAGW)
+    ) fpu_fma (
+        .clk        (clk), 
+        .reset      (fma_reset), 
+        .valid_in   (valid_in && (core_select == FPU_FMA)),
+        .ready_in   (per_core_ready_in[FPU_FMA]),
+        .lane_mask  (lane_mask),
+        .tag_in     (tag_in), 
+        .frm        (frm),
+        .is_madd    (is_madd),
+        .is_sub     (is_sub),
+        .is_neg     (is_neg),
+        .dataa      (dataa_s), 
+        .datab      (datab_s), 
+        .datac      (datac_s), 
+        .has_fflags (per_core_has_fflags[FPU_FMA]),
+        .fflags     (per_core_fflags[FPU_FMA]),
+        .result     (per_core_result[FPU_FMA]),
+        .tag_out    (per_core_tag_out[FPU_FMA]),
+        .ready_out  (per_core_ready_out[FPU_FMA]),
+        .valid_out  (per_core_valid_out[FPU_FMA])
+    );
+
+    // Divider. It sees a valid request only for FPU_DIVSQRT operations with
+    // is_div raised; its response stream (div_*) is merged with the square
+    // root's by div_sqrt_arb.
+    VX_fpu_div #(
+        .NUM_LANES (NUM_LANES),
+        .TAGW      (TAGW)
+    ) fpu_div (
+        .clk        (clk), 
+        .reset      (div_reset), 
+        .valid_in   (valid_in && (core_select == FPU_DIVSQRT) && is_div),
+        .ready_in   (div_ready_in),
+        .lane_mask  (lane_mask),
+        .tag_in     (tag_in),
+        .frm        (frm), 
+        .dataa      (dataa_s), 
+        .datab      (datab_s), 
+        .has_fflags (div_has_fflags),
+        .fflags     (div_fflags), 
+        .result     (div_result),
+        .tag_out    (div_tag_out),
+        .valid_out  (div_valid_out),
+        .ready_out  (div_ready_out)
+    );
+
+    // Square-root unit. It sees a valid request only for FPU_DIVSQRT operations
+    // with is_div low; only dataa is used. Its response stream (sqrt_*) is
+    // merged with the divider's by div_sqrt_arb.
+    VX_fpu_sqrt #(
+        .NUM_LANES (NUM_LANES),
+        .TAGW      (TAGW)
+    ) fpu_sqrt (
+        .clk        (clk), 
+        .reset      (sqrt_reset), 
+        .valid_in   (valid_in && (core_select == FPU_DIVSQRT) && ~is_div),
+        .ready_in   (sqrt_ready_in),
+        .lane_mask  (lane_mask),
+        .tag_in     (tag_in),
+        .frm        (frm), 
+        .dataa      (dataa_s), 
+        .has_fflags (sqrt_has_fflags),
+        .fflags     (sqrt_fflags),
+        .result     (sqrt_result),
+        .tag_out    (sqrt_tag_out),
+        .valid_out  (sqrt_valid_out),
+        .ready_out  (sqrt_ready_out)
+    );
+
+    // cvt_rt_int_in is high for conversions other than I2F/U2F (the decoder
+    // raises is_itof only for those two). It is prepended to the request tag,
+    // hence TAGW+1, and comes back as cvt_rt_int_out with the response.
+    // NOTE(review): assumes VX_fpu_cvt returns the tag bits unmodified.
+    wire cvt_rt_int_in = ~is_itof;
+    wire cvt_rt_int_out;
+
+    // Conversion unit; it sees a valid request only when the decoder selected FPU_CVT.
+    VX_fpu_cvt #(
+        .NUM_LANES (NUM_LANES),
+        .TAGW      (TAGW+1)
+    ) fpu_cvt (
+        .clk        (clk), 
+        .reset      (cvt_reset), 
+        .valid_in   (valid_in && (core_select == FPU_CVT)),
+        .ready_in   (per_core_ready_in[FPU_CVT]),
+        .lane_mask  (lane_mask),
+        .tag_in     ({cvt_rt_int_in, tag_in}), 
+        .frm        (frm),
+        .is_itof    (is_itof), 
+        .is_signed  (is_signed), 
+        .dataa      (dataa_s), 
+        .has_fflags (per_core_has_fflags[FPU_CVT]),
+        .fflags     (per_core_fflags[FPU_CVT]),
+        .result     (per_core_result[FPU_CVT]),
+        .tag_out    ({cvt_rt_int_out, per_core_tag_out[FPU_CVT]}),
+        .valid_out  (per_core_valid_out[FPU_CVT]),
+        .ready_out  (per_core_ready_out[FPU_CVT])
+    );
+
+    // Two side-band bits are prepended to the request tag (hence TAGW+2) and
+    // come back with the response:
+    //   ncp_rt_int_in  - set for compare, class and MVXW operations
+    //   ncp_rt_sext_in - set for MVXW only
+    // They are consumed by the result-extension logic after rsp_arb.
+    // NOTE(review): assumes VX_fpu_ncomp returns the tag bits unmodified.
+    wire ncp_rt_int_in = (op_type == `INST_FPU_CMP)
+                      || `INST_FPU_IS_CLASS(op_type, frm) 
+                      || `INST_FPU_IS_MVXW(op_type, frm);
+    wire ncp_rt_int_out;
+
+    wire ncp_rt_sext_in = `INST_FPU_IS_MVXW(op_type, frm);
+    wire ncp_rt_sext_out;
+    
+    // NCP unit; it sees a valid request only when the decoder selected FPU_NCP
+    // (the decoder's default arm).
+    VX_fpu_ncomp #(
+        .NUM_LANES (NUM_LANES),
+        .TAGW      (TAGW+2)
+    ) fpu_ncomp (
+        .clk        (clk),
+        .reset      (ncp_reset), 
+        .valid_in   (valid_in && (core_select == FPU_NCP)),
+        .ready_in   (per_core_ready_in[FPU_NCP]),
+        .lane_mask  (lane_mask),
+        .tag_in     ({ncp_rt_sext_in, ncp_rt_int_in, tag_in}),
+        .op_type    (op_type),
+        .frm        (frm),
+        .dataa      (dataa_s),
+        .datab      (datab_s), 
+        .result     (per_core_result[FPU_NCP]), 
+        .has_fflags (per_core_has_fflags[FPU_NCP]),
+        .fflags     (per_core_fflags[FPU_NCP]),
+        .tag_out    ({ncp_rt_sext_out, ncp_rt_int_out, per_core_tag_out[FPU_NCP]}),
+        .valid_out  (per_core_valid_out[FPU_NCP]),
+        .ready_out  (per_core_ready_out[FPU_NCP])
+    );
+
+    ///////////////////////////////////////////////////////////////////////////
+
+    // The DIVSQRT slot reports the readiness of whichever sub-unit the current
+    // request decodes to (is_div selects the divider, otherwise square root).
+    assign per_core_ready_in[FPU_DIVSQRT] = is_div ? div_ready_in : sqrt_ready_in;
+
+    // Merges the divider and square-root response streams into the single
+    // FPU_DIVSQRT entry of the per_core_* arrays. In the concatenations below
+    // the divider occupies input 0 and the square root input 1. Each payload
+    // is {result, has_fflags, fflags, tag}, i.e. RSP_DATAW bits.
+    // This instance is driven by the module reset rather than a relayed copy.
+    // NOTE(review): ARBITER "R" is presumably round-robin - confirm in VX_stream_arb.
+    VX_stream_arb #(
+        .NUM_INPUTS (2),
+        .DATAW      (RSP_DATAW), 
+        .ARBITER    ("R"),
+        .OUT_REG    (0)
+    ) div_sqrt_arb (
+        .clk       (clk),
+        .reset     (reset),
+        .valid_in  ({sqrt_valid_out, div_valid_out}), 
+        .ready_in  ({sqrt_ready_out, div_ready_out}),
+        .data_in   ({{sqrt_result, sqrt_has_fflags, sqrt_fflags, sqrt_tag_out}, 
+                     {div_result, div_has_fflags, div_fflags, div_tag_out}}),
+        .data_out  ({per_core_result[FPU_DIVSQRT], per_core_has_fflags[FPU_DIVSQRT], per_core_fflags[FPU_DIVSQRT], per_core_tag_out[FPU_DIVSQRT]}),
+        .valid_out (per_core_valid_out[FPU_DIVSQRT]),
+        .ready_out (per_core_ready_out[FPU_DIVSQRT]),
+        `UNUSED_PIN (sel_out)
+    );
+
+    ///////////////////////////////////////////////////////////////////////////
+
+    // Response payload of each unit: {result, has_fflags, fflags, tag} followed
+    // by a 2-bit result-extension code in bits [1:0]. The code is zero for all
+    // units except CVT and NCP, which overwrite it after the loop.
+    reg [NUM_FPC-1:0][RSP_DATAW+2-1:0] per_core_data_out;
+
+    always @(*) begin
+        for (integer fpc = 0; fpc < NUM_FPC; ++fpc) begin
+            per_core_data_out[fpc] = {per_core_result[fpc], per_core_has_fflags[fpc], per_core_fflags[fpc], per_core_tag_out[fpc], 2'b00};
+        end
+        per_core_data_out[FPU_NCP][1:0] = {ncp_rt_sext_out, ncp_rt_int_out};
+        per_core_data_out[FPU_CVT][1:0] = {1'b1, cvt_rt_int_out};
+    end
+
+    // Outputs of the response arbiter that still need post-processing:
+    // result_s is the 32-bit-per-lane result, op_rt_int_out the 2-bit
+    // extension code carried in the two LSBs of the payload.
+    wire [NUM_LANES-1:0][31:0] result_s;
+    wire [1:0] op_rt_int_out;
+
+    // Selects one of the NUM_FPC unit responses. Its ready_in port produces
+    // per_core_ready_out, the back-pressure seen by each unit. The payload is
+    // two bits wider than RSP_DATAW to carry the extension code.
+    VX_stream_arb #(
+        .NUM_INPUTS (NUM_FPC),
+        .DATAW      (RSP_DATAW + 2), 
+        .ARBITER    ("R"),
+        .OUT_REG    (OUT_REG)
+    ) rsp_arb (
+        .clk       (clk),
+        .reset     (reset),
+        .valid_in  (per_core_valid_out), 
+        .ready_in  (per_core_ready_out),
+        .data_in   (per_core_data_out),
+        .data_out  ({result_s, has_fflags, fflags, tag_out, op_rt_int_out}),
+        .valid_out (valid_out),
+        .ready_out (ready_out),
+        `UNUSED_PIN (sel_out)
+    );
+
+// the extension code is only consumed in FPU_RV64F builds
+`ifndef FPU_RV64F
+    `UNUSED_VAR (op_rt_int_out)
+`endif
+
+    // Widens each 32-bit lane result to XLEN. In FPU_RV64F builds the upper
+    // half depends on op_rt_int_out:
+    //   2'b11   - sign-extend the 32-bit result
+    //   2'b01   - zero the upper 32 bits
+    //   default - set the upper 32 bits to all ones
+    // In other builds the 32-bit result is assigned to the output as-is.
+    for (genvar i = 0; i < NUM_LANES; ++i) begin        
+    `ifdef FPU_RV64F
+        reg [`XLEN-1:0] result_r;
+        always @(*) begin
+            case (op_rt_int_out)
+            2'b11:   result_r = `XLEN'($signed(result_s[i]));
+            2'b01:   result_r = {32'h00000000, result_s[i]};
+            default: result_r = {32'hffffffff, result_s[i]};
+            endcase
+        end
+        assign result[i] = result_r;
+    `else
+        assign result[i] = result_s[i];
+    `endif
+    end
+
+    // can accept new request?
+    // yes when the unit the current op_type decodes to is ready
+    assign ready_in = per_core_ready_in[core_select];
+
+endmodule
+`endif 
--- a/hw/rtl/fpu/VX_fpu_fma.sv
+++ b/hw/rtl/fpu/VX_fpu_fma.sv
@@ -0,0 +1,170 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+`include "VX_fpu_define.vh"
+
+`ifdef FPU_DSP
+
+// Single-precision fused multiply-add unit for the FPU_DSP backend.
+//
+// Every ADD/SUB/MUL/MADD/MSUB/NMADD/NMSUB request is reduced to one a*b+c
+// evaluation per lane. The datapath is an acl_fmadd core under QUARTUS, an
+// xil_fma core under VIVADO, and otherwise the dpi_fmadd model followed by a
+// shift register of depth `LATENCY_FMA.
+//
+// Operation select:
+//   is_madd=1            : MADD/MSUB/NMADD/NMSUB
+//                          (is_neg negates a, is_neg^is_sub negates c)
+//   is_madd=0, is_neg=1  : MUL     (a*b + zero carrying the product sign)
+//   is_madd=0, is_neg=0  : ADD/SUB (1.0*dataa + datab, is_sub negates datab)
+//
+// Handshake: the whole pipeline advances only while 'enable' is high, i.e.
+// unless a valid result is waiting on a consumer that is not ready.
+module VX_fpu_fma import VX_fpu_pkg::*; #(
+    parameter NUM_LANES = 1, // number of parallel lanes
+    parameter TAGW = 1       // width of the tag carried with each request
+) (
+    input wire clk,
+    input wire reset,
+
+    // request handshake
+    output wire ready_in,
+    input wire  valid_in,
+
+    // per-lane enable; only used to select which lanes contribute to fflags
+    input wire [NUM_LANES-1:0] lane_mask,
+
+    input wire [TAGW-1:0] tag_in,
+
+    // rounding mode; only consumed by the DPI simulation model
+    input wire [`INST_FRM_BITS-1:0] frm,
+
+    // operation select (see module header)
+    input wire  is_madd,
+    input wire  is_sub,
+    input wire  is_neg,
+
+    input wire [NUM_LANES-1:0][31:0]  dataa,
+    input wire [NUM_LANES-1:0][31:0]  datab,
+    input wire [NUM_LANES-1:0][31:0]  datac,
+    output wire [NUM_LANES-1:0][31:0] result,
+
+    // has_fflags tells the consumer whether 'fflags' is meaningful
+    output wire has_fflags,
+    output wire [`FP_FLAGS_BITS-1:0] fflags,
+
+    output wire [TAGW-1:0] tag_out,
+
+    // response handshake
+    input wire  ready_out,
+    output wire valid_out
+);
+    // frm is unused by the vendor-core branches below
+    `UNUSED_VAR (frm)
+
+    // freeze the pipeline while a valid result is not being accepted
+    wire stall = ~ready_out && valid_out;
+    wire enable = ~stall;
+
+    fflags_t [NUM_LANES-1:0] per_lane_fflags;
+    wire [NUM_LANES-1:0] lane_mask_out;
+
+    // control path: valid, lane mask and tag travel alongside the datapath
+    // NOTE(review): RESETW=1 presumably resets only the leading valid bit -
+    // confirm against VX_shift_register
+    VX_shift_register #(
+        .DATAW  (1 + NUM_LANES + TAGW),
+        .DEPTH  (`LATENCY_FMA),
+        .RESETW (1)
+    ) shift_reg (
+        .clk      (clk),
+        .reset    (reset),
+        .enable   (enable),
+        .data_in  ({valid_in, lane_mask, tag_in}),
+        .data_out ({valid_out, lane_mask_out, tag_out})
+    );
+
+    assign ready_in = enable;
+
+    // operands actually fed to the a*b+c primitive
+    reg [NUM_LANES-1:0][31:0] a, b, c;
+
+    for (genvar i = 0; i < NUM_LANES; ++i) begin
+        always @(*) begin
+            if (is_madd) begin
+                // MADD / MSUB / NMADD / NMSUB
+                // negation is done by flipping the IEEE sign bit
+                a[i] = is_neg ? {~dataa[i][31], dataa[i][30:0]} : dataa[i];
+                b[i] = datab[i];
+                c[i] = (is_neg ^ is_sub) ? {~datac[i][31], datac[i][30:0]} : datac[i];
+            end else begin
+                if (is_neg) begin
+                    // MUL: a*b + 0
+                    // The zero addend carries the sign of the product so that an
+                    // exact-zero product keeps its sign in every rounding mode
+                    // (e.g. -1.0 * 0.0 must give -0.0; adding +0.0 would turn it
+                    // into +0.0 under round-to-nearest). Non-zero, infinite and
+                    // NaN products are unaffected by the sign of a zero addend.
+                    a[i] = dataa[i];
+                    b[i] = datab[i];
+                    c[i] = {dataa[i][31] ^ datab[i][31], 31'b0};
+                end else begin
+                    // ADD / SUB: 1.0 * dataa +/- datab
+                    a[i] = 32'h3f800000; // 1.0f
+                    b[i] = dataa[i];
+                    c[i] = is_sub ? {~datab[i][31], datab[i][30:0]} : datab[i];
+                end
+            end
+        end
+    end
+
+`ifdef QUARTUS
+
+    for (genvar i = 0; i < NUM_LANES; ++i) begin
+        acl_fmadd fmadd (
+            .clk    (clk),
+            .areset (1'b0),
+            .en     (enable),
+            .a      (a[i]),
+            .b      (b[i]),
+            .c      (c[i]),
+            .q      (result[i])
+        );
+    end
+
+    // this core exposes no exception flags
+    assign has_fflags = 0;
+    assign per_lane_fflags = 'x;
+
+`elsif VIVADO
+
+    for (genvar i = 0; i < NUM_LANES; ++i) begin
+        wire [2:0] tuser;
+
+        xil_fma fma (
+            .aclk                (clk),
+            .aclken              (enable),
+            .s_axis_a_tvalid     (1'b1),
+            .s_axis_a_tdata      (a[i]),
+            .s_axis_b_tvalid     (1'b1),
+            .s_axis_b_tdata      (b[i]),
+            .s_axis_c_tvalid     (1'b1),
+            .s_axis_c_tdata      (c[i]),
+            `UNUSED_PIN (m_axis_result_tvalid),
+            .m_axis_result_tdata (result[i]),
+            .m_axis_result_tuser (tuser)
+        );
+                                    // NV, DZ, OF, UF, NX
+        assign per_lane_fflags[i] = {tuser[2], 1'b0, tuser[1], tuser[0], 1'b0};
+    end
+
+    assign has_fflags = 1;
+
+`else
+
+    for (genvar i = 0; i < NUM_LANES; ++i) begin
+        reg [63:0] r;
+        `UNUSED_VAR (r)
+
+        fflags_t f;
+
+        // simulation model: operands are passed as 64-bit values whose upper
+        // word is all ones; only the low 32 bits of the result are kept
+        always @(*) begin
+            dpi_fmadd (enable && valid_in, int'(0), {32'hffffffff, a[i]}, {32'hffffffff, b[i]}, {32'hffffffff, c[i]}, frm, r, f);
+        end
+
+        // delay the combinational DPI result to match the control path latency
+        VX_shift_register #(
+            .DATAW  (32 + $bits(fflags_t)),
+            .DEPTH  (`LATENCY_FMA)
+        ) shift_req_dpi (
+            .clk      (clk),
+            `UNUSED_PIN (reset),
+            .enable   (enable),
+            .data_in  ({r[31:0], f}),
+            .data_out ({result[i], per_lane_fflags[i]})
+        );
+    end
+
+    assign has_fflags = 1;
+
+`endif
+
+// combine the per-lane flags into 'fflags' using the delayed lane mask
+// (macro defined outside this file)
+`FPU_MERGE_FFLAGS(fflags, per_lane_fflags, lane_mask_out, NUM_LANES);
+
+endmodule
+`endif
--- a/hw/rtl/fpu/VX_fpu_fpnew.sv
+++ b/hw/rtl/fpu/VX_fpu_fpnew.sv
@@ -0,0 +1,286 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+`include "VX_fpu_define.vh"
+
+`ifdef FPU_FPNEW
+
+// FPU backend built on the fpnew library (fpnew_top).
+//
+// Decodes op_type/fmt/frm into fpnew's operation, modifier, rounding mode and
+// format inputs, drives one or more fpnew_top instances and buffers the
+// response through a VX_elastic_buffer.
+//
+// Under XLEN_64 one scalar fpnew_top is instantiated per lane; otherwise a
+// single vectorial instance handles all lanes with lane_mask as its SIMD mask.
+module VX_fpu_fpnew 
+    import VX_fpu_pkg::*; 
+    import fpnew_pkg::*; 
+    import cf_math_pkg::*; 
+    import defs_div_sqrt_mvp::*;
+#(      
+    parameter NUM_LANES = 1, // number of parallel lanes
+    parameter TAGW      = 1, // width of the tag carried with each request
+    parameter OUT_REG   = 0  // selects the output buffer configuration
+) (
+    input wire clk,
+    input wire reset,
+
+    // request handshake
+    input wire  valid_in,
+    output wire ready_in,
+
+    input wire [NUM_LANES-1:0] lane_mask,
+
+    input wire [TAGW-1:0] tag_in,
+    
+    input wire [`INST_FPU_BITS-1:0] op_type, // operation selector (`INST_FPU_*)
+    input wire [`INST_FMT_BITS-1:0] fmt,     // bit0: FP64 (FLEN_64 only), bit1: INT64 (XLEN_64 only)
+    input wire [`INST_FRM_BITS-1:0] frm,     // rounding mode; sub-operation selector for INST_FPU_MISC
+
+    input wire [NUM_LANES-1:0][`XLEN-1:0]  dataa,
+    input wire [NUM_LANES-1:0][`XLEN-1:0]  datab,
+    input wire [NUM_LANES-1:0][`XLEN-1:0]  datac,
+    output wire [NUM_LANES-1:0][`XLEN-1:0] result, 
+
+    output wire has_fflags,
+    output wire [`FP_FLAGS_BITS-1:0] fflags,
+
+    output wire [TAGW-1:0] tag_out,
+
+    // response handshake
+    input wire  ready_out,
+    output wire valid_out
+);  
+    // DIV and SQRT share one fpnew operation group, so use the larger latency
+    localparam LATENCY_FDIVSQRT = `MAX(`LATENCY_FDIV, `LATENCY_FSQRT);
+    // response payload: results + has_fflags + flags + tag
+    localparam RSP_DATAW = (NUM_LANES * `XLEN) + 1 + $bits(fflags_t) + TAGW;
+
+`ifdef XLEN_64
+    // use scalar configuration for mixed formats
+    // NOTE(review): both branches of the FLEN_64 conditional below select the
+    // same FpFmtMask value (see the TODO)
+    localparam fpnew_pkg::fpu_features_t FPU_FEATURES = '{
+        Width:         unsigned'(`XLEN),
+        EnableVectors: 1'b0,
+        EnableNanBox:  1'b1,
+    `ifdef FLEN_64
+        FpFmtMask:     5'b11000,
+    `else
+        FpFmtMask:     5'b11000, // TODO: added FP64 to fix CVT bug in FpNew
+    `endif
+        IntFmtMask:    4'b0011
+    };
+`else
+    // single vectorial instance: datapath is XLEN * NUM_LANES bits wide
+    localparam fpnew_pkg::fpu_features_t FPU_FEATURES = '{
+        Width:         unsigned'(`XLEN * NUM_LANES),
+        EnableVectors: 1'b1,
+        EnableNanBox:  1'b0,
+        FpFmtMask:     5'b10000,
+        IntFmtMask:    4'b0010
+    };
+`endif
+
+    // pipeline depth and unit type for each fpnew operation group
+    // NOTE(review): for ADDMUL only the first format entry gets `LATENCY_FMA
+    // registers - presumably that entry is FP32; confirm against fpnew_pkg
+    localparam fpnew_pkg::fpu_implementation_t FPU_IMPLEMENTATION = '{
+      PipeRegs:'{'{`LATENCY_FMA, 0, 0, 0, 0}, // ADDMUL
+                 '{default: unsigned'(LATENCY_FDIVSQRT)}, // DIVSQRT
+                 '{default: `LATENCY_FNCP}, // NONCOMP
+                 '{default: `LATENCY_FCVT}}, // CONV
+      UnitTypes:'{'{default: fpnew_pkg::PARALLEL}, // ADDMUL
+                  '{default: fpnew_pkg::MERGED}, // DIVSQRT
+                  '{default: fpnew_pkg::PARALLEL}, // NONCOMP
+                  '{default: fpnew_pkg::MERGED}}, // CONV
+      PipeConfig: fpnew_pkg::DISTRIBUTED
+    };
+    
+    wire fpu_ready_in, fpu_valid_in;    
+    wire fpu_ready_out, fpu_valid_out;
+
+    reg [TAGW-1:0] fpu_tag_in, fpu_tag_out;
+    
+    // operand slots handed to fpnew: [slot][lane]
+    reg [2:0][NUM_LANES-1:0][`XLEN-1:0] fpu_operands;
+
+    wire [NUM_LANES-1:0][`XLEN-1:0] fpu_result;
+    fpnew_pkg::status_t fpu_status;
+
+    fpnew_pkg::operation_e fpu_op;
+    reg [`INST_FRM_BITS-1:0] fpu_rnd;
+    reg fpu_op_mod;
+    // has_fflags travels through fpnew as the lowest bit of the tag
+    reg fpu_has_fflags, fpu_has_fflags_out;
+    fpnew_pkg::fp_format_e fpu_src_fmt, fpu_dst_fmt;
+    fpnew_pkg::int_format_e fpu_int_fmt;
+
+    // fmt is only read under FLEN_64 / XLEN_64
+    `UNUSED_VAR (fmt)
+
+    // request decode: translate op_type/fmt/frm into fpnew control inputs
+    always @(*) begin
+        // defaults; individual operations override what they need
+        fpu_op          = 'x;
+        fpu_rnd         = frm;  
+        fpu_op_mod      = 0;        
+        fpu_has_fflags  = 1;
+        fpu_operands[0] = dataa;
+        fpu_operands[1] = datab;
+        fpu_operands[2] = datac;    
+        fpu_dst_fmt     = fpnew_pkg::FP32;
+        fpu_int_fmt     = fpnew_pkg::INT32;
+
+    `ifdef FLEN_64
+        if (fmt[0]) begin
+            fpu_dst_fmt = fpnew_pkg::FP64;
+        end
+    `endif
+
+    `ifdef XLEN_64
+        if (fmt[1]) begin
+            fpu_int_fmt = fpnew_pkg::INT64;
+        end
+    `endif
+
+        // source format follows the destination format unless overridden (F2F)
+        fpu_src_fmt = fpu_dst_fmt;
+        
+        case (op_type)
+            // ADD/SUB place their two inputs in operand slots 1 and 2
+            // (presumably where fpnew's ADD reads its addends - confirm against fpnew docs)
+            `INST_FPU_ADD: begin
+                fpu_op = fpnew_pkg::ADD;
+                fpu_operands[1] = dataa;
+                fpu_operands[2] = datab;
+            end
+            // SUB is issued as ADD with the op modifier set
+            `INST_FPU_SUB: begin 
+                fpu_op = fpnew_pkg::ADD; 
+                fpu_operands[1] = dataa;
+                fpu_operands[2] = datab;
+                fpu_op_mod = 1; 
+            end
+            `INST_FPU_MUL:   begin fpu_op = fpnew_pkg::MUL; end
+            `INST_FPU_DIV:   begin fpu_op = fpnew_pkg::DIV; end
+            `INST_FPU_SQRT:  begin fpu_op = fpnew_pkg::SQRT; end
+            // MSUB / NMADD are issued as FMADD / FNMSUB with the op modifier set
+            `INST_FPU_MADD:  begin fpu_op = fpnew_pkg::FMADD; end
+            `INST_FPU_MSUB:  begin fpu_op = fpnew_pkg::FMADD; fpu_op_mod = 1; end            
+            `INST_FPU_NMADD: begin fpu_op = fpnew_pkg::FNMSUB; fpu_op_mod = 1; end
+            `INST_FPU_NMSUB: begin fpu_op = fpnew_pkg::FNMSUB; end
+        `ifdef FLEN_64
+            // float-to-float conversion: the source format is the opposite of the destination
+            `INST_FPU_F2F: begin fpu_op = fpnew_pkg::F2F; fpu_src_fmt = fmt[0] ? fpnew_pkg::FP32 : fpnew_pkg::FP64; end
+        `endif
+            // op_type[0] is forwarded as the op modifier to tell the two variants apart
+            `INST_FPU_F2I,
+            `INST_FPU_F2U: begin fpu_op = fpnew_pkg::F2I; fpu_op_mod = op_type[0]; end
+            `INST_FPU_I2F,
+            `INST_FPU_U2F: begin fpu_op = fpnew_pkg::I2F; fpu_op_mod = op_type[0]; end
+            `INST_FPU_CMP: begin fpu_op = fpnew_pkg::CMP; end
+            // MISC: frm carries the sub-operation instead of a rounding mode
+            `INST_FPU_MISC:begin
+                case (frm)
+                    0,1,2: begin fpu_op = fpnew_pkg::SGNJ; fpu_rnd = {1'b0, frm[1:0]}; fpu_has_fflags = 0; end // FSGNJ
+                    3:     begin fpu_op = fpnew_pkg::CLASSIFY; fpu_has_fflags = 0; end // CLASS                     
+                    4,5:   begin fpu_op = fpnew_pkg::SGNJ; fpu_rnd = 3'b011; fpu_op_mod = ~frm[0]; fpu_has_fflags = 0; end // FMV.X.W, FMV.W.X
+                    6,7:   begin fpu_op = fpnew_pkg::MINMAX; fpu_rnd = {2'b00, frm[0]}; end // MIN, MAX
+                endcase    
+            end
+            default:;
+        endcase
+
+    `ifdef FPU_RV64F
+        // apply nan-boxing to floating-point operands
+        // (slot 0 is left untouched for I2F/U2F)
+        for (integer i = 0; i < NUM_LANES; ++i) begin                    
+            if (op_type != `INST_FPU_I2F && op_type != `INST_FPU_U2F) begin
+                fpu_operands[0][i] |= 64'hffffffff00000000;
+            end
+            fpu_operands[1][i] |= 64'hffffffff00000000;
+            fpu_operands[2][i] |= 64'hffffffff00000000;        
+        end
+    `endif
+    end
+
+`ifdef XLEN_64
+    // one scalar fpnew_top per lane, all driven with the same control inputs
+    // NOTE(review): lane_mask is unused here and only lane 0 supplies the
+    // handshake, tag and status; flags raised by other lanes do not reach
+    // 'fflags' - confirm this is intended
+    `UNUSED_VAR (lane_mask)
+    for (genvar i = 0; i < NUM_LANES; ++i) begin
+        wire [(TAGW+1)-1:0] fpu_tag;        
+        wire fpu_valid_out_uq;
+        wire fpu_ready_in_uq;
+        fpnew_pkg::status_t fpu_status_uq;
+        `UNUSED_VAR (fpu_tag)
+        `UNUSED_VAR (fpu_valid_out_uq)
+        `UNUSED_VAR (fpu_ready_in_uq)
+        `UNUSED_VAR (fpu_status_uq)
+
+        fpnew_top #( 
+            .Features       (FPU_FEATURES),
+            .Implementation (FPU_IMPLEMENTATION),
+            .TagType        (logic[(TAGW+1)-1:0])
+        ) fpnew_core (
+            .clk_i          (clk),
+            .rst_ni         (~reset),
+            .operands_i     ({fpu_operands[2][i], fpu_operands[1][i], fpu_operands[0][i]}),
+            .rnd_mode_i     (fpnew_pkg::roundmode_e'(fpu_rnd)),
+            .op_i           (fpu_op),
+            .op_mod_i       (fpu_op_mod),
+            .src_fmt_i      (fpu_src_fmt),
+            .dst_fmt_i      (fpu_dst_fmt),
+            .int_fmt_i      (fpu_int_fmt),
+            `UNUSED_PIN (vectorial_op_i),
+            `UNUSED_PIN (simd_mask_i),
+            .tag_i          ({fpu_tag_in, fpu_has_fflags}),            
+            .in_valid_i     (fpu_valid_in),
+            .in_ready_o     (fpu_ready_in_uq),
+            .flush_i        (reset),
+            .result_o       (fpu_result[i]),
+            .status_o       (fpu_status_uq),
+            .tag_o          (fpu_tag),
+            .out_valid_o    (fpu_valid_out_uq),
+            .out_ready_i    (fpu_ready_out),
+            `UNUSED_PIN (busy_o)
+        );
+        
+        // lane 0 provides the shared response-side signals
+        if (i == 0) begin
+            assign {fpu_tag_out, fpu_has_fflags_out} = fpu_tag;            
+            assign fpu_valid_out = fpu_valid_out_uq;
+            assign fpu_ready_in = fpu_ready_in_uq;
+            assign fpu_status = fpu_status_uq;
+        end
+    end
+`else
+    // single vectorial instance; lane_mask is passed as the SIMD mask
+    fpnew_top #( 
+        .Features       (FPU_FEATURES),
+        .Implementation (FPU_IMPLEMENTATION),
+        .TagType        (logic[(TAGW+1)-1:0]),
+        .TrueSIMDClass  (1),
+        .EnableSIMDMask (1)
+    ) fpnew_core (
+        .clk_i          (clk),
+        .rst_ni         (~reset),
+        .operands_i     (fpu_operands),
+        .rnd_mode_i     (fpnew_pkg::roundmode_e'(fpu_rnd)),
+        .op_i           (fpu_op),
+        .op_mod_i       (fpu_op_mod),
+        .src_fmt_i      (fpu_src_fmt),
+        .dst_fmt_i      (fpu_dst_fmt),
+        .int_fmt_i      (fpu_int_fmt),
+        .vectorial_op_i (1'b1),
+        .simd_mask_i    (lane_mask),
+        .tag_i          ({fpu_tag_in, fpu_has_fflags}),        
+        .in_valid_i     (fpu_valid_in),
+        .in_ready_o     (fpu_ready_in),
+        .flush_i        (reset),
+        .result_o       (fpu_result),
+        .status_o       (fpu_status),
+        .tag_o          ({fpu_tag_out, fpu_has_fflags_out}),
+        .out_valid_o    (fpu_valid_out),
+        .out_ready_i    (fpu_ready_out),
+        `UNUSED_PIN (busy_o)
+    );
+`endif
+
+    // request side is wired straight through to fpnew
+    assign fpu_valid_in = valid_in;
+    assign ready_in = fpu_ready_in;
+    assign fpu_tag_in = tag_in;
+
+    // response buffer; fpnew's status word is forwarded as 'fflags'
+    VX_elastic_buffer #(
+        .DATAW   (RSP_DATAW),
+        .SIZE    (`OUT_REG_TO_EB_SIZE(OUT_REG)),
+        .OUT_REG (`OUT_REG_TO_EB_REG(OUT_REG))
+    ) rsp_buf (
+        .clk       (clk),
+        .reset     (reset),
+        .valid_in  (fpu_valid_out),
+        .ready_in  (fpu_ready_out),
+        .data_in   ({fpu_result, fpu_has_fflags_out, fpu_status, fpu_tag_out}),
+        .data_out  ({result, has_fflags, fflags, tag_out}),
+        .valid_out (valid_out),
+        .ready_out (ready_out)
+    );
+
+endmodule
+`endif
--- a/hw/rtl/fpu/VX_fpu_ncomp.sv
+++ b/hw/rtl/fpu/VX_fpu_ncomp.sv
@@ -0,0 +1,292 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+`include "VX_fpu_define.vh"
+
+`ifdef FPU_DSP
+
+/// Modified port of noncomp module from fpnew Library
+/// reference: https://github.com/pulp-platform/fpnew
+
+// Non-computational single-precision unit: sign injection, classify, move,
+// min/max and comparisons.
+//
+// Structure: operand fields and classification are derived combinationally
+// from the inputs and captured in pipe_reg0; every candidate result is then
+// computed from the registered values, one is selected by op_mod, and the
+// selection is captured in pipe_reg1, which drives the outputs.
+module VX_fpu_ncomp import VX_fpu_pkg::*; #(
+    parameter NUM_LANES = 1, // number of parallel lanes
+    parameter TAGW = 1       // width of the tag carried with each request
+) (
+    input wire clk,
+    input wire reset,
+
+    // request handshake
+    output wire ready_in,
+    input wire  valid_in,
+
+    // per-lane enable; only used when merging exception flags
+    input wire [NUM_LANES-1:0] lane_mask,
+
+    input wire [TAGW-1:0] tag_in,
+
+    input wire [`INST_FPU_BITS-1:0] op_type, // only compared against `INST_FPU_CMP
+    input wire [`INST_FRM_BITS-1:0] frm,     // selects the sub-operation
+
+    input wire [NUM_LANES-1:0][31:0]  dataa,
+    input wire [NUM_LANES-1:0][31:0]  datab,
+    output wire [NUM_LANES-1:0][31:0] result, 
+
+    output wire has_fflags,
+    output wire [`FP_FLAGS_BITS-1:0] fflags,
+
+    output wire [TAGW-1:0] tag_out,
+
+    // response handshake
+    input wire  ready_out,
+    output wire valid_out
+);  
+    localparam  EXP_BITS = 8;
+    localparam  MAN_BITS = 23;
+        
+    // one-hot result values returned by the CLASS operation
+    localparam  NEG_INF     = 32'h00000001,
+                NEG_NORM    = 32'h00000002,
+                NEG_SUBNORM = 32'h00000004,
+                NEG_ZERO    = 32'h00000008,
+                POS_ZERO    = 32'h00000010,
+                POS_SUBNORM = 32'h00000020,
+                POS_NORM    = 32'h00000040,
+                POS_INF     = 32'h00000080,
+                //SIG_NAN   = 32'h00000100,
+                QUT_NAN     = 32'h00000200;
+
+    wire [NUM_LANES-1:0]        a_sign, b_sign;
+    wire [NUM_LANES-1:0][7:0]   a_exponent, b_exponent;
+    wire [NUM_LANES-1:0][22:0]  a_mantissa, b_mantissa;
+    fclass_t [NUM_LANES-1:0]    a_fclass, b_fclass;
+    wire [NUM_LANES-1:0]        a_smaller, ab_equal;
+
+    // Setup
+    // split each operand into sign / exponent / mantissa and classify it
+    for (genvar i = 0; i < NUM_LANES; ++i) begin
+        assign     a_sign[i] = dataa[i][31]; 
+        assign a_exponent[i] = dataa[i][30:23];
+        assign a_mantissa[i] = dataa[i][22:0];
+
+        assign     b_sign[i] = datab[i][31]; 
+        assign b_exponent[i] = datab[i][30:23];
+        assign b_mantissa[i] = datab[i][22:0];
+
+        VX_fpu_class #( 
+            .EXP_BITS (EXP_BITS),
+            .MAN_BITS (MAN_BITS)
+        ) fp_class_a (
+            .exp_i  (a_exponent[i]),
+            .man_i  (a_mantissa[i]),
+            .clss_o (a_fclass[i])
+        );
+
+        VX_fpu_class #( 
+            .EXP_BITS (EXP_BITS),
+            .MAN_BITS (MAN_BITS)
+        ) fp_class_b (
+            .exp_i  (b_exponent[i]),
+            .man_i  (b_mantissa[i]),
+            .clss_o (b_fclass[i])
+        );
+
+        // The raw bit patterns are compared as unsigned integers and the
+        // outcome is inverted when either operand has its sign bit set.
+        // For two identical negative operands this evaluates to 1; the LT
+        // comparison below masks that case with ~ab_equal.
+        assign a_smaller[i] = (dataa[i] < datab[i]) ^ (a_sign[i] || b_sign[i]);
+        assign ab_equal[i]  = (dataa[i] == datab[i]) 
+                           || (a_fclass[i].is_zero && b_fclass[i].is_zero); // +0 == -0
+    end  
+
+    // Pipeline stage0
+
+    wire                        valid_in_s0;
+    wire [NUM_LANES-1:0]        lane_mask_s0;
+    wire [TAGW-1:0]             tag_in_s0;
+    wire [3:0]                  op_mod_s0;
+    wire [NUM_LANES-1:0][31:0]  dataa_s0, datab_s0;
+    wire [NUM_LANES-1:0]        a_sign_s0, b_sign_s0;
+    wire [NUM_LANES-1:0][7:0]   a_exponent_s0;
+    wire [NUM_LANES-1:0][22:0]  a_mantissa_s0;
+    fclass_t [NUM_LANES-1:0]    a_fclass_s0, b_fclass_s0;
+    wire [NUM_LANES-1:0]        a_smaller_s0, ab_equal_s0;
+
+    wire stall;
+
+    // op_mod[3]   : 1 for a comparison (INST_FPU_CMP)
+    // op_mod[2:0] : frm, used as the sub-operation selector
+    // (the 4-bit width assumes frm is 3 bits wide)
+    wire [3:0] op_mod = {(op_type == `INST_FPU_CMP), frm};
+
+    // DATAW must match the concatenation below, field by field
+    // NOTE(review): RESETW=1 presumably resets only the leading valid bit -
+    // confirm against VX_pipe_register
+    VX_pipe_register #(
+        .DATAW  (1 + NUM_LANES + TAGW + 4 + NUM_LANES * (2 * 32 + 1 + 1 + 8 + 23 + 2 * $bits(fclass_t) + 1 + 1)),
+        .RESETW (1)
+    ) pipe_reg0 (
+        .clk      (clk),
+        .reset    (reset),
+        .enable   (!stall),
+        .data_in  ({valid_in, lane_mask, tag_in, op_mod, dataa, datab, a_sign, b_sign, a_exponent, a_mantissa, a_fclass, b_fclass, a_smaller, ab_equal}),
+        .data_out ({valid_in_s0, lane_mask_s0, tag_in_s0, op_mod_s0, dataa_s0, datab_s0, a_sign_s0, b_sign_s0, a_exponent_s0, a_mantissa_s0, a_fclass_s0, b_fclass_s0, a_smaller_s0, ab_equal_s0})
+    ); 
+
+    // FCLASS
+    reg [NUM_LANES-1:0][31:0] fclass_mask_s0;  // generate a 10-bit mask for integer reg
+    for (genvar i = 0; i < NUM_LANES; ++i) begin
+        always @(*) begin 
+            if (a_fclass_s0[i].is_normal) begin
+                fclass_mask_s0[i] = a_sign_s0[i] ? NEG_NORM : POS_NORM;
+            end 
+            else if (a_fclass_s0[i].is_inf) begin
+                fclass_mask_s0[i] = a_sign_s0[i] ? NEG_INF : POS_INF;
+            end 
+            else if (a_fclass_s0[i].is_zero) begin
+                fclass_mask_s0[i] = a_sign_s0[i] ? NEG_ZERO : POS_ZERO;
+            end 
+            else if (a_fclass_s0[i].is_subnormal) begin
+                fclass_mask_s0[i] = a_sign_s0[i] ? NEG_SUBNORM : POS_SUBNORM;
+            end 
+            else if (a_fclass_s0[i].is_nan) begin
+                // bit 9 = quiet NaN, bit 8 = signaling NaN
+                fclass_mask_s0[i] = {22'h0, a_fclass_s0[i].is_quiet, a_fclass_s0[i].is_signaling, 8'h0};
+            end 
+            else begin                     
+                // fallback; the classes tested above cover every exponent/mantissa
+                // combination produced by VX_fpu_class
+                fclass_mask_s0[i] = QUT_NAN;
+            end
+        end
+    end
+
+    // Min/Max    
+    // both NaN -> canonical qNaN; one NaN -> the other operand
+    reg [NUM_LANES-1:0][31:0] fminmax_res_s0;
+    for (genvar i = 0; i < NUM_LANES; ++i) begin
+        always @(*) begin
+            if (a_fclass_s0[i].is_nan && b_fclass_s0[i].is_nan)
+                fminmax_res_s0[i] = {1'b0, 8'hff, 1'b1, 22'd0}; // canonical qNaN
+            else if (a_fclass_s0[i].is_nan) 
+                fminmax_res_s0[i] = datab_s0[i];
+            else if (b_fclass_s0[i].is_nan) 
+                fminmax_res_s0[i] = dataa_s0[i];
+            else begin 
+                // FMIN, FMAX
+                // op_mod_s0[0] = 0 picks the smaller operand, 1 picks the other one
+                fminmax_res_s0[i] = (op_mod_s0[0] ^ a_smaller_s0[i]) ? dataa_s0[i] : datab_s0[i];
+            end
+        end
+    end
+
+    // Sign injection    
+    // 0: take b's sign, 1: take b's inverted sign, otherwise: xor of both signs
+    reg [NUM_LANES-1:0][31:0] fsgnj_res_s0;    // result of sign injection
+    for (genvar i = 0; i < NUM_LANES; ++i) begin
+        always @(*) begin
+            case (op_mod_s0[1:0])
+                0: fsgnj_res_s0[i] = { b_sign_s0[i], a_exponent_s0[i], a_mantissa_s0[i]};
+                1: fsgnj_res_s0[i] = {~b_sign_s0[i], a_exponent_s0[i], a_mantissa_s0[i]};
+          default: fsgnj_res_s0[i] = { a_sign_s0[i] ^ b_sign_s0[i], a_exponent_s0[i], a_mantissa_s0[i]};
+            endcase
+        end
+    end
+
+    // Comparison    
+    // any NaN operand makes the result 0; LE/LT then always raise NV,
+    // EQ raises NV only for a signaling NaN
+    reg [NUM_LANES-1:0] fcmp_res_s0;        // result of comparison
+    reg [NUM_LANES-1:0] fcmp_fflags_NV_s0;  // comparison fflags
+    for (genvar i = 0; i < NUM_LANES; ++i) begin
+        always @(*) begin
+            case (op_mod_s0[1:0])
+                0: begin // LE                    
+                    if (a_fclass_s0[i].is_nan || b_fclass_s0[i].is_nan) begin
+                        fcmp_res_s0[i]       = 0;
+                        fcmp_fflags_NV_s0[i] = 1;
+                    end else begin
+                        fcmp_res_s0[i]       = (a_smaller_s0[i] | ab_equal_s0[i]);
+                        fcmp_fflags_NV_s0[i] = 0;
+                    end
+                end
+                1: begin // LT
+                    if (a_fclass_s0[i].is_nan || b_fclass_s0[i].is_nan) begin
+                        fcmp_res_s0[i]       = 0;
+                        fcmp_fflags_NV_s0[i] = 1;
+                    end else begin
+                        fcmp_res_s0[i]       = (a_smaller_s0[i] & ~ab_equal_s0[i]);
+                        fcmp_fflags_NV_s0[i] = 0;
+                    end                    
+                end
+                2: begin // EQ
+                    if (a_fclass_s0[i].is_nan || b_fclass_s0[i].is_nan) begin
+                        fcmp_res_s0[i]       = 0;
+                        fcmp_fflags_NV_s0[i] = a_fclass_s0[i].is_signaling | b_fclass_s0[i].is_signaling; 
+                    end else begin
+                        fcmp_res_s0[i]       = ab_equal_s0[i];
+                        fcmp_fflags_NV_s0[i] = 0;
+                    end
+                end
+                default: begin
+                    fcmp_res_s0[i]       = 'x;
+                    fcmp_fflags_NV_s0[i] = 'x;                        
+                end
+            endcase
+        end
+    end
+
+    // outputs
+
+    reg [NUM_LANES-1:0][31:0] result_s0;
+    reg [NUM_LANES-1:0] fflags_NV_s0;
+
+    // select the result and NV flag for the requested sub-operation
+    for (genvar i = 0; i < NUM_LANES; ++i) begin
+        always @(*) begin
+            case (op_mod_s0[2:0])
+                0,1,2: begin
+                    // SGNJ, CMP
+                    // op_mod_s0[3] picks the zero-extended comparison bit over the sign-injection result
+                    result_s0[i] = op_mod_s0[3] ? 32'(fcmp_res_s0[i]) : fsgnj_res_s0[i];
+                    fflags_NV_s0[i] = fcmp_fflags_NV_s0[i];
+                end
+                3: begin
+                    // CLASS
+                    result_s0[i] = fclass_mask_s0[i];
+                    fflags_NV_s0[i] = 'x;
+                end
+                4,5: begin
+                    // FMV
+                    result_s0[i] = dataa_s0[i];
+                    fflags_NV_s0[i] = 'x;
+                end                
+                6,7: begin
+                    // MIN/MAX
+                    result_s0[i] = fminmax_res_s0[i];
+                    fflags_NV_s0[i] = a_fclass_s0[i].is_signaling | b_fclass_s0[i].is_signaling;
+                end
+            endcase
+        end
+    end
+
+    // only MIN/MAX and CMP return status flags
+    wire has_fflags_s0 = (op_mod_s0[2:0] >= 6) || op_mod_s0[3];
+
+    // hold both pipeline registers while a valid result is not being accepted
+    assign stall = ~ready_out && valid_out;
+
+    wire fflags_NV;
+    reg fflags_merged;
+
+    // OR the NV flag of every lane enabled in lane_mask_s0
+    always @(*) begin
+        fflags_merged = 0;
+        for (integer i = 0; i < NUM_LANES; ++i) begin
+            if (lane_mask_s0[i]) begin
+                fflags_merged |= fflags_NV_s0[i];
+            end
+        end
+    end
+
+    // output register
+    VX_pipe_register #(
+        .DATAW  (1 + TAGW + (NUM_LANES * 32) + 1 + 1),
+        .RESETW (1)
+    ) pipe_reg1 (
+        .clk      (clk),
+        .reset    (reset),
+        .enable   (!stall),
+        .data_in  ({valid_in_s0, tag_in_s0, result_s0, has_fflags_s0, fflags_merged}),
+        .data_out ({valid_out, tag_out, result, has_fflags, fflags_NV})
+    );
+
+    assign ready_in = ~stall;
+
+    // NV is the only flag this unit can raise
+                  // NV, DZ, OF, UF, NX
+    assign fflags = {fflags_NV, 1'b0, 1'b0, 1'b0, 1'b0};
+
+endmodule
+`endif
--- a/hw/rtl/fpu/VX_fpu_pkg.sv
+++ b/hw/rtl/fpu/VX_fpu_pkg.sv
@@ -0,0 +1,41 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+`ifndef VX_FPU_PKG_VH
+`define VX_FPU_PKG_VH
+
+`include "VX_define.vh"
+
+// Shared FPU types: operand classification and exception flags.
+package VX_fpu_pkg;
+
+// Operand class flags as produced by VX_fpu_class.
+// Packed struct: the first member occupies the most significant bit.
+typedef struct packed {
+    logic is_normal;    // exponent is neither all zeros nor all ones
+    logic is_zero;      // exponent and mantissa are both zero
+    logic is_subnormal; // exponent is zero, mantissa is non-zero
+    logic is_inf;       // exponent is all ones, mantissa is zero
+    logic is_nan;       // exponent is all ones, mantissa is non-zero
+    logic is_quiet;     // NaN that is not signaling
+    logic is_signaling; // NaN whose mantissa MSB is clear
+} fclass_t;
+
+// Floating-point exception flags; the number in each comment is the bit index.
+typedef struct packed {
+    logic NV; // 4-Invalid
+    logic DZ; // 3-Divide by zero
+    logic OF; // 2-Overflow
+    logic UF; // 1-Underflow
+    logic NX; // 0-Inexact
+} fflags_t;
+
+endpackage
+
+`endif // VX_FPU_PKG_VH
--- a/hw/rtl/fpu/VX_fpu_rounding.sv
+++ b/hw/rtl/fpu/VX_fpu_rounding.sv
@@ -0,0 +1,79 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+`include "VX_fpu_define.vh"
+
+`ifdef FPU_DSP
+
+/// Modified port of rounding module from fpnew Library
+/// reference: https://github.com/pulp-platform/fpnew
+
+// Rounds an unsigned magnitude by at most one unit in the last place,
+// according to the selected rounding mode and the {round, sticky} bits,
+// and resolves the sign of an exact-zero result.
+module VX_fpu_rounding #(
+    parameter DAT_WIDTH = 2 // magnitude width in bits (sign bit excluded)
+) (
+    // value to be rounded
+    input wire [DAT_WIDTH-1:0]  abs_value_i, // magnitude, no sign
+    input wire                  sign_i,
+    // rounding controls
+    input wire [1:0]            round_sticky_bits_i, // {round, sticky}
+    input wire [2:0]            rnd_mode_i,
+    input wire                  effective_subtraction_i, // operands had opposite effective signs
+    // rounded value
+    output wire [DAT_WIDTH-1:0] abs_rounded_o, // magnitude, no sign
+    output wire                 sign_o,
+    output wire                 exact_zero_o   // result is exactly zero
+);
+
+    // high when the magnitude has to be incremented
+    reg incr;
+
+    // at least one discarded bit is set
+    wire inexact = (| round_sticky_bits_i);
+
+    // Rounding decision per mode:
+    //   RNE - nearest, ties to even
+    //   RTZ - towards zero
+    //   RDN - towards -infinity
+    //   RUP - towards +infinity
+    //   RMM - nearest, ties to max magnitude
+    //   any other encoding is invalid and yields x
+    always @(*) begin
+        case (rnd_mode_i)
+            `INST_FRM_RNE: begin
+                case (round_sticky_bits_i)
+                    2'b11:   incr = 1'b1;           // above the halfway point
+                    2'b10:   incr = abs_value_i[0]; // exactly halfway: make the LSB even
+                    2'b01,
+                    2'b00:   incr = 1'b0;           // below the halfway point
+                    default: incr = 1'bx;
+                endcase
+            end
+            `INST_FRM_RTZ: incr = 1'b0;                   // truncate
+            `INST_FRM_RDN: incr = inexact & sign_i;       // grow magnitude only when negative
+            `INST_FRM_RUP: incr = inexact & ~sign_i;      // grow magnitude only when positive
+            `INST_FRM_RMM: incr = round_sticky_bits_i[1]; // halfway or more rounds up
+            default:       incr = 1'bx;                   // propagate x
+        endcase
+    end
+
+    // A carry out of the mantissa moves into the exponent bits of the
+    // packed magnitude, so no extra handling is needed here.
+    assign abs_rounded_o = abs_value_i + DAT_WIDTH'(incr);
+
+    // Exact zero: zero magnitude and nothing was discarded.
+    wire mag_is_zero = (abs_value_i == 0);
+    wire rs_is_zero  = (round_sticky_bits_i == 0);
+    assign exact_zero_o = mag_is_zero && rs_is_zero;
+
+    // An exact zero produced by an effective subtraction is negative in RDN
+    // mode and positive in every other mode; otherwise the input sign is kept.
+    wire cancel_to_zero = exact_zero_o && effective_subtraction_i;
+    wire rdn_selected   = (rnd_mode_i == `INST_FRM_RDN);
+    assign sign_o = cancel_to_zero ? rdn_selected : sign_i;
+
+endmodule
+`endif
--- a/hw/rtl/fpu/VX_fpu_sqrt.sv
+++ b/hw/rtl/fpu/VX_fpu_sqrt.sv
@@ -0,0 +1,134 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+`include "VX_fpu_define.vh"
+
+`ifdef FPU_DSP
+
+// Multi-lane 32-bit floating-point square root with a valid/ready handshake.
+//
+// A control shift register of depth LATENCY_FSQRT carries {valid, lane mask, tag}
+// alongside the datapath. The datapath itself is chosen at compile time:
+//   QUARTUS   - acl_fsqrt core per lane, no exception flags reported
+//   VIVADO    - xil_fsqrt core per lane, tuser mapped onto the NV flag
+//   otherwise - dpi_fsqrt call per lane followed by a delay line
+// The whole pipeline freezes while a valid output is not accepted downstream.
+module VX_fpu_sqrt import VX_fpu_pkg::*; #(
+    parameter NUM_LANES = 1,
+    parameter TAGW = 1
+) (
+    input wire clk,
+    input wire reset,
+
+    // request handshake
+    output wire ready_in,
+    input wire  valid_in,
+
+    // per-lane active mask, used when merging the per-lane flags
+    input wire [NUM_LANES-1:0] lane_mask,
+
+    // opaque tag returned unchanged with the result
+    input wire [TAGW-1:0] tag_in,
+
+    // rounding mode (only consumed by the DPI model)
+    input wire [`INST_FRM_BITS-1:0] frm,
+
+    input wire [NUM_LANES-1:0][31:0]  dataa,
+    output wire [NUM_LANES-1:0][31:0] result,
+
+    output wire has_fflags,
+    output wire [`FP_FLAGS_BITS-1:0] fflags,
+
+    output wire [TAGW-1:0] tag_out,
+
+    // response handshake
+    input wire  ready_out,
+    output wire valid_out
+);
+
+    `UNUSED_VAR (frm)
+
+    // Advance whenever the output slot is empty or is being consumed.
+    wire enable = ready_out || ~valid_out;
+
+    fflags_t [NUM_LANES-1:0] per_lane_fflags;
+    wire [NUM_LANES-1:0] lane_mask_out;
+
+    // A new request is accepted exactly when the pipeline advances.
+    assign ready_in = enable;
+
+    // Control path: only the valid bit (MSB of the payload) is reset.
+    VX_shift_register #(
+        .DATAW  (1 + NUM_LANES + TAGW),
+        .DEPTH  (`LATENCY_FSQRT),
+        .RESETW (1)
+    ) shift_reg (
+        .clk      (clk),
+        .reset    (reset),
+        .enable   (enable),
+        .data_in  ({valid_in, lane_mask, tag_in}),
+        .data_out ({valid_out, lane_mask_out, tag_out})
+    );
+
+`ifdef QUARTUS
+
+    // Intel core: result only, flags are left undefined and flagged as absent.
+    for (genvar lane = 0; lane < NUM_LANES; ++lane) begin
+        acl_fsqrt fsqrt (
+            .clk    (clk),
+            .areset (1'b0),
+            .en     (enable),
+            .a      (dataa[lane]),
+            .q      (result[lane])
+        );
+    end
+
+    assign has_fflags = 0;
+    assign per_lane_fflags = 'x;
+
+`elsif VIVADO
+
+    // Xilinx core: the single tuser bit is reported as the NV flag.
+    for (genvar lane = 0; lane < NUM_LANES; ++lane) begin
+        wire tuser;
+
+        xil_fsqrt fsqrt (
+            .aclk                (clk),
+            .aclken              (enable),
+            .s_axis_a_tvalid     (1'b1),
+            .s_axis_a_tdata      (dataa[lane][31:0]),
+            `UNUSED_PIN (m_axis_result_tvalid),
+            .m_axis_result_tdata (result[lane][31:0]),
+            .m_axis_result_tuser (tuser)
+        );
+
+        // flag order: NV, DZ, OF, UF, NX
+        assign per_lane_fflags[lane] = {tuser, 1'b0, 1'b0, 1'b0, 1'b0};
+    end
+
+    assign has_fflags = 1;
+
+`else
+
+    // Simulation model: compute combinationally through DPI, then delay the
+    // result and flags by LATENCY_FSQRT stages to match the control path.
+    for (genvar lane = 0; lane < NUM_LANES; ++lane) begin
+        reg [63:0] dpi_result;
+        `UNUSED_VAR (dpi_result)
+
+        fflags_t dpi_fflags;
+
+        // The 32-bit operand is widened to 64 bits with the upper half set to
+        // all ones (looks like NaN-boxing for the DPI argument - confirm).
+        always @(*) begin
+            dpi_fsqrt (enable && valid_in, int'(0), {32'hffffffff, dataa[lane]}, frm, dpi_result, dpi_fflags);
+        end
+
+        VX_shift_register #(
+            .DATAW  (32 + $bits(fflags_t)),
+            .DEPTH  (`LATENCY_FSQRT)
+        ) shift_req_dpi (
+            .clk      (clk),
+            `UNUSED_PIN (reset),
+            .enable   (enable),
+            .data_in  ({dpi_result[31:0], dpi_fflags}),
+            .data_out ({result[lane], per_lane_fflags[lane]})
+        );
+    end
+
+    assign has_fflags = 1;
+
+`endif
+
+    // Combine the per-lane flags into one flag set using the delayed lane mask.
+`FPU_MERGE_FFLAGS(fflags, per_lane_fflags, lane_mask_out, NUM_LANES);
+
+endmodule
+`endif
--- a/hw/rtl/fpu/VX_fpu_to_csr_if.sv
+++ b/hw/rtl/fpu/VX_fpu_to_csr_if.sv
@@ -0,0 +1,43 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+`include "VX_fpu_define.vh"
+
+// Signal bundle between an FPU block (master) and the CSR side (slave).
+// It carries two independent channels: an fflags write and a rounding-mode
+// (frm) lookup, each qualified by a warp id.
+interface VX_fpu_to_csr_if import VX_fpu_pkg::*; ();
+
+    // fflags write channel
+    wire                      write_enable;
+    wire [`NW_WIDTH-1:0]      write_wid;
+    fflags_t                  write_fflags;
+
+    // frm read channel
+    wire [`NW_WIDTH-1:0]      read_wid;
+    wire [`INST_FRM_BITS-1:0] read_frm;
+
+    // Master drives the write channel and the read warp id, and samples read_frm.
+    modport master (
+        output write_enable, write_wid, write_fflags, read_wid,
+        input  read_frm
+    );
+
+    // Slave is the mirror image: it samples everything and returns read_frm.
+    modport slave (
+        input  write_enable, write_wid, write_fflags, read_wid,
+        output read_frm
+    );
+
+endinterface
--- a/hw/rtl/fpu/VX_fpu_unit.sv
+++ b/hw/rtl/fpu/VX_fpu_unit.sv
@@ -0,0 +1,259 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+`include "VX_define.vh"
+`include "VX_fpu_define.vh"
+
+// FPU execution unit.
+//
+// Structure, as visible in this module:
+//   dispatch_if[ISSUE_WIDTH] -> VX_dispatch_unit -> execute_if[BLOCK_SIZE]
+//   per block: tag_store (request metadata) + one FPU backend + fflags handling
+//   commit_block_if[BLOCK_SIZE] -> VX_gather_unit -> commit_if[ISSUE_WIDTH]
+//
+// The FPU backend is selected at compile time by FPU_DPI, FPU_FPNEW or FPU_DSP.
+// Only a small tag travels through the backend; the remaining request fields
+// are parked in tag_store and read back when the response returns.
+//
+// Ports:
+//   clk, reset     - clock and reset
+//   dispatch_if    - incoming instructions, one interface per issue slot
+//   fpu_to_csr_if  - per-block CSR link (frm lookup and fflags write)
+//   commit_if      - outgoing results, one interface per issue slot
+module VX_fpu_unit import VX_fpu_pkg::*; #(
+    parameter CORE_ID = 0
+) (
+    input wire clk,
+    input wire reset,
+
+    VX_dispatch_if.slave    dispatch_if [`ISSUE_WIDTH],
+    VX_fpu_to_csr_if.master fpu_to_csr_if[`NUM_FPU_BLOCKS],
+
+    VX_commit_if.master     commit_if [`ISSUE_WIDTH]
+);
+    `UNUSED_PARAM (CORE_ID)
+    localparam BLOCK_SIZE = `NUM_FPU_BLOCKS;
+    localparam NUM_LANES  = `NUM_FPU_LANES;
+    // Bits needed to count NUM_THREADS / NUM_LANES
+    // (presumably the number of packets one instruction is split into - confirm).
+    localparam PID_BITS   = `CLOG2(`NUM_THREADS / NUM_LANES);
+    localparam PID_WIDTH  = `UP(PID_BITS);
+    // One tag value per tag_store entry.
+    localparam TAG_WIDTH  = `LOG2UP(`FPU_REQ_QUEUE_SIZE);
+    // Set when the unit has fewer/more blocks than issue slots or fewer/more
+    // lanes than threads; it selects the OUT_REG settings used below.
+    localparam PARTIAL_BW = (BLOCK_SIZE != `ISSUE_WIDTH) || (NUM_LANES != `NUM_THREADS);
+
+    VX_execute_if #(
+        .NUM_LANES (NUM_LANES)
+    ) execute_if[BLOCK_SIZE]();
+
+    `RESET_RELAY (dispatch_reset, reset);
+
+    VX_dispatch_unit #(
+        .BLOCK_SIZE (BLOCK_SIZE),
+        .NUM_LANES  (NUM_LANES),
+        .OUT_REG    (PARTIAL_BW ? 1 : 0)
+    ) dispatch_unit (
+        .clk        (clk),
+        .reset      (dispatch_reset),
+        .dispatch_if(dispatch_if),
+        .execute_if (execute_if)
+    );
+
+    VX_commit_if #(
+        .NUM_LANES (NUM_LANES)
+    ) commit_block_if[BLOCK_SIZE]();
+
+    // One FPU pipeline per block.
+    for (genvar block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin
+        `UNUSED_VAR (execute_if[block_idx].data.tid)
+        `UNUSED_VAR (execute_if[block_idx].data.wb)
+        `UNUSED_VAR (execute_if[block_idx].data.use_PC)
+        `UNUSED_VAR (execute_if[block_idx].data.use_imm)
+
+        // FPU request/response handshake and response payload
+        wire fpu_req_valid, fpu_req_ready;
+        wire fpu_rsp_valid, fpu_rsp_ready;    
+        wire [NUM_LANES-1:0][`XLEN-1:0] fpu_rsp_result;
+        fflags_t fpu_rsp_fflags;
+        wire fpu_rsp_has_fflags;
+
+        // Request metadata read back from tag_store for the returning response
+        wire [`UUID_WIDTH-1:0]  fpu_rsp_uuid;
+        wire [`NW_WIDTH-1:0]    fpu_rsp_wid;
+        wire [NUM_LANES-1:0]    fpu_rsp_tmask;
+        wire [`XLEN-1:0]        fpu_rsp_PC;
+        wire [`NR_BITS-1:0]     fpu_rsp_rd;
+        wire [PID_WIDTH-1:0]    fpu_rsp_pid;
+        wire                    fpu_rsp_sop;
+        wire                    fpu_rsp_eop;
+
+        // fpu_req_tag comes from tag_store (write_addr) and enters the FPU as
+        // tag_in; fpu_rsp_tag is the FPU's tag_out and addresses the read side.
+        wire [TAG_WIDTH-1:0] fpu_req_tag, fpu_rsp_tag;    
+        wire mdata_full;
+
+        // Format is taken from the low bits of imm, rounding mode from op_mod.
+        wire [`INST_FMT_BITS-1:0] fpu_fmt = execute_if[block_idx].data.imm[`INST_FMT_BITS-1:0];
+        wire [`INST_FRM_BITS-1:0] fpu_frm = execute_if[block_idx].data.op_mod[`INST_FRM_BITS-1:0];
+
+        wire execute_fire = execute_if[block_idx].valid && execute_if[block_idx].ready;
+        wire fpu_rsp_fire = fpu_rsp_valid && fpu_rsp_ready;
+
+        // Store request info: an entry is acquired when a request fires and
+        // released when its response fires.
+        // NOTE(review): tag_store, the fflags register and rsp_buf below use the
+        // raw 'reset', while the FPU backend uses the relayed 'fpu_reset' -
+        // confirm this difference is intended.
+        VX_index_buffer #(
+            .DATAW  (`UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS + PID_WIDTH + 1 + 1),
+            .SIZE   (`FPU_REQ_QUEUE_SIZE)
+        ) tag_store (
+            .clk          (clk),
+            .reset        (reset),
+            .acquire_en   (execute_fire), 
+            .write_addr   (fpu_req_tag), 
+            .write_data   ({execute_if[block_idx].data.uuid, execute_if[block_idx].data.wid, execute_if[block_idx].data.tmask, execute_if[block_idx].data.PC, execute_if[block_idx].data.rd, execute_if[block_idx].data.pid, execute_if[block_idx].data.sop, execute_if[block_idx].data.eop}),
+            .read_data    ({fpu_rsp_uuid, fpu_rsp_wid, fpu_rsp_tmask, fpu_rsp_PC, fpu_rsp_rd, fpu_rsp_pid, fpu_rsp_sop, fpu_rsp_eop}),
+            .read_addr    (fpu_rsp_tag),
+            .release_en   (fpu_rsp_fire), 
+            .full         (mdata_full),
+            `UNUSED_PIN (empty)
+        );
+
+        // Resolve dynamic FRM from CSR: for every op type other than
+        // INST_FPU_MISC, an instruction rounding mode of INST_FRM_DYN is
+        // replaced by the frm value read for the request's warp.
+        wire [`INST_FRM_BITS-1:0] fpu_req_frm; 
+        `ASSIGN_BLOCKED_WID (fpu_to_csr_if[block_idx].read_wid, execute_if[block_idx].data.wid, block_idx, `NUM_FPU_BLOCKS)
+        assign fpu_req_frm = (execute_if[block_idx].data.op_type != `INST_FPU_MISC 
+                           && fpu_frm == `INST_FRM_DYN) ? fpu_to_csr_if[block_idx].read_frm : fpu_frm;
+
+        // submit FPU request
+        // Both directions of the handshake are gated by ~mdata_full, so no
+        // request is issued while tag_store has no free entry.
+
+        assign fpu_req_valid = execute_if[block_idx].valid && ~mdata_full;
+        assign execute_if[block_idx].ready = fpu_req_ready && ~mdata_full;
+
+        `RESET_RELAY (fpu_reset, reset);   
+
+    // Exactly one backend is instantiated; all three share the same port list.
+    `ifdef FPU_DPI
+
+        VX_fpu_dpi #(
+            .NUM_LANES  (NUM_LANES),
+            .TAGW       (TAG_WIDTH),
+            .OUT_REG    (PARTIAL_BW ? 1 : 3)
+        ) fpu_dpi (
+            .clk        (clk),
+            .reset      (fpu_reset),
+
+            .valid_in   (fpu_req_valid),
+            .op_type    (execute_if[block_idx].data.op_type),
+            .lane_mask  (execute_if[block_idx].data.tmask),
+            .fmt        (fpu_fmt),
+            .frm        (fpu_req_frm),
+            .dataa      (execute_if[block_idx].data.rs1_data),
+            .datab      (execute_if[block_idx].data.rs2_data),
+            .datac      (execute_if[block_idx].data.rs3_data),
+            .tag_in     (fpu_req_tag),
+            .ready_in   (fpu_req_ready),
+
+            .valid_out  (fpu_rsp_valid),
+            .result     (fpu_rsp_result),
+            .has_fflags (fpu_rsp_has_fflags),
+            .fflags     (fpu_rsp_fflags),
+            .tag_out    (fpu_rsp_tag),
+            .ready_out  (fpu_rsp_ready)     
+        );   
+
+    `elsif FPU_FPNEW
+
+        VX_fpu_fpnew #(
+            .NUM_LANES  (NUM_LANES),
+            .TAGW       (TAG_WIDTH),
+            .OUT_REG    (PARTIAL_BW ? 1 : 3)
+        ) fpu_fpnew (
+            .clk        (clk),
+            .reset      (fpu_reset), 
+
+            .valid_in   (fpu_req_valid),
+            .op_type    (execute_if[block_idx].data.op_type),
+            .lane_mask  (execute_if[block_idx].data.tmask),
+            .fmt        (fpu_fmt),
+            .frm        (fpu_req_frm),
+            .dataa      (execute_if[block_idx].data.rs1_data),
+            .datab      (execute_if[block_idx].data.rs2_data),
+            .datac      (execute_if[block_idx].data.rs3_data), 
+            .tag_in     (fpu_req_tag),
+            .ready_in   (fpu_req_ready),
+
+            .valid_out  (fpu_rsp_valid), 
+            .result     (fpu_rsp_result),
+            .has_fflags (fpu_rsp_has_fflags),
+            .fflags     (fpu_rsp_fflags),
+            .tag_out    (fpu_rsp_tag), 
+            .ready_out  (fpu_rsp_ready)        
+        );
+
+    `elsif FPU_DSP
+
+        VX_fpu_dsp #(
+            .NUM_LANES  (NUM_LANES),
+            .TAGW       (TAG_WIDTH),
+            .OUT_REG    (PARTIAL_BW ? 1 : 3)
+        ) fpu_dsp (
+            .clk        (clk),
+            .reset      (fpu_reset), 
+
+            .valid_in   (fpu_req_valid),
+            .lane_mask  (execute_if[block_idx].data.tmask),
+            .op_type    (execute_if[block_idx].data.op_type),
+            .fmt        (fpu_fmt),
+            .frm        (fpu_req_frm),
+            .dataa      (execute_if[block_idx].data.rs1_data),
+            .datab      (execute_if[block_idx].data.rs2_data),
+            .datac      (execute_if[block_idx].data.rs3_data), 
+            .tag_in     (fpu_req_tag),
+            .ready_in   (fpu_req_ready),
+
+            .valid_out  (fpu_rsp_valid), 
+            .result     (fpu_rsp_result), 
+            .has_fflags (fpu_rsp_has_fflags),
+            .fflags     (fpu_rsp_fflags),
+            .tag_out    (fpu_rsp_tag),
+            .ready_out  (fpu_rsp_ready)
+        );
+        
+    `endif
+
+        // handle FPU response
+
+        fflags_t fpu_rsp_fflags_q;
+
+        // When PID_BITS != 0, flags are OR-accumulated across fired responses:
+        // the register collects flags from non-eop responses and is cleared by
+        // the eop response, so fpu_rsp_fflags_q on the eop response is the union
+        // of all flags seen since the previous eop. Otherwise the FPU flags are
+        // forwarded unchanged.
+        if (PID_BITS != 0) begin
+            fflags_t fpu_rsp_fflags_r;
+            always @(posedge clk) begin
+                if (reset) begin
+                    fpu_rsp_fflags_r <= '0;
+                end else if (fpu_rsp_fire) begin
+                    fpu_rsp_fflags_r <= fpu_rsp_eop ? '0 : (fpu_rsp_fflags_r | fpu_rsp_fflags);
+                end
+            end
+            assign fpu_rsp_fflags_q = fpu_rsp_fflags_r | fpu_rsp_fflags;
+        end else begin
+            assign fpu_rsp_fflags_q = fpu_rsp_fflags;
+        end
+        
+        // The CSR write happens only on a fired eop response whose operation
+        // reports flags.
+        assign fpu_to_csr_if[block_idx].write_enable = fpu_rsp_fire && fpu_rsp_eop && fpu_rsp_has_fflags;
+        `ASSIGN_BLOCKED_WID (fpu_to_csr_if[block_idx].write_wid, fpu_rsp_wid, block_idx, `NUM_FPU_BLOCKS)
+        assign fpu_to_csr_if[block_idx].write_fflags = fpu_rsp_fflags_q;
+
+        // send response: FPU result joined with the metadata read from
+        // tag_store (SIZE = 0 is presumably a pass-through configuration of
+        // VX_elastic_buffer - confirm).
+
+        VX_elastic_buffer #(
+            .DATAW (`UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS + (NUM_LANES * `XLEN) + PID_WIDTH + 1 + 1),
+            .SIZE  (0)
+        ) rsp_buf (
+            .clk       (clk),
+            .reset     (reset),
+            .valid_in  (fpu_rsp_valid),
+            .ready_in  (fpu_rsp_ready),
+            .data_in   ({fpu_rsp_uuid, fpu_rsp_wid, fpu_rsp_tmask, fpu_rsp_PC, fpu_rsp_rd, fpu_rsp_result, fpu_rsp_pid, fpu_rsp_sop, fpu_rsp_eop}),
+            .data_out  ({commit_block_if[block_idx].data.uuid, commit_block_if[block_idx].data.wid, commit_block_if[block_idx].data.tmask, commit_block_if[block_idx].data.PC, commit_block_if[block_idx].data.rd, commit_block_if[block_idx].data.data, commit_block_if[block_idx].data.pid, commit_block_if[block_idx].data.sop, commit_block_if[block_idx].data.eop}),
+            .valid_out (commit_block_if[block_idx].valid),
+            .ready_out (commit_block_if[block_idx].ready)
+        );
+        // Every FPU response is committed with the write-back bit set.
+        assign commit_block_if[block_idx].data.wb = 1'b1;
+    end
+
+    `RESET_RELAY (commit_reset, reset);
+
+    // Merge the per-block commit streams back onto the per-issue-slot commit_if.
+    VX_gather_unit #(
+        .BLOCK_SIZE (BLOCK_SIZE),
+        .NUM_LANES  (NUM_LANES),
+        .OUT_REG    (PARTIAL_BW ? 3 : 0)
+    ) gather_unit (
+        .clk           (clk),
+        .reset         (commit_reset),
+        .commit_in_if  (commit_block_if),
+        .commit_out_if (commit_if)
+    );
+
+endmodule