Vortex 2.0 changes:

+ Microarchitecture optimizations + 64-bit support + Xilinx FPGA support + LLVM-16 support + Refactoring and quality control fixes minor update minor update minor update minor update minor update minor update cleanup cleanup cache bindings and memory perf refactoring minor update minor update hw unit tests fixes minor update minor update minor update minor update minor update minor update minor update minor update minor update minor update minor update minor update minor update minor updates minor updates minor update minor update minor update minor update minor update minor update minor updates minor updates minor updates minor updates minor update minor update
2023-10-19 20:51:22 -07:00
parent d69a64c32c
commit c1e168fdbe
1309 changed files with 247412 additions and 311463 deletions
--- a/hw/rtl/fpu/VX_fpu_class.sv
+++ b/hw/rtl/fpu/VX_fpu_class.sv
@@ -0,0 +1,45 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+
+`include "VX_fpu_define.vh"
+
+`ifdef FPU_DSP
+
+// Combinational classifier for a floating-point value split into its
+// exponent (exp_i) and mantissa (man_i) fields. Each member of clss_o is
+// derived purely from whether the exponent is all-zeros / all-ones, whether
+// the mantissa is zero, and (for NaNs) the mantissa's most significant bit.
+module VX_fpu_class import VX_fpu_pkg::*; #(    
+    parameter MAN_BITS = 23,
+    parameter EXP_BITS = 8
+) (
+    input  [EXP_BITS-1:0] exp_i,
+    input  [MAN_BITS-1:0] man_i,
+    output fclass_t       clss_o
+);
+    // Elementary field tests shared by every class predicate
+    wire exp_all_zeros = (exp_i == '0);
+    wire exp_all_ones  = (exp_i == '1);
+    wire man_all_zeros = (man_i == '0);
+    wire man_msb       = man_i[MAN_BITS-1];
+
+    // NaN: exponent saturated with a non-zero mantissa
+    wire nan_w = exp_all_ones && !man_all_zeros;
+
+    // Exponent strictly between the two reserved encodings
+    assign clss_o.is_normal    = !(exp_all_zeros || exp_all_ones);
+
+    // Zero exponent: mantissa decides between zero and subnormal
+    assign clss_o.is_zero      = exp_all_zeros && man_all_zeros;
+    assign clss_o.is_subnormal = exp_all_zeros && !man_all_zeros;
+
+    // Saturated exponent: mantissa decides between infinity and NaN
+    assign clss_o.is_inf       = exp_all_ones && man_all_zeros;
+    assign clss_o.is_nan       = nan_w;
+
+    // Mantissa MSB set -> quiet NaN, clear -> signaling NaN
+    assign clss_o.is_quiet     = nan_w && man_msb;
+    assign clss_o.is_signaling = nan_w && !man_msb;
+
+endmodule
+`endif
+
--- a/hw/rtl/fpu/VX_fpu_cvt.sv
+++ b/hw/rtl/fpu/VX_fpu_cvt.sv
@@ -0,0 +1,464 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+`include "VX_fpu_define.vh"
+
+`ifdef FPU_DSP
+
+/// Modified port of the cast module from the fpnew library
+/// reference: https://github.com/pulp-platform/fpnew
+
+// VX_fpu_cvt: multi-lane, pipelined converter between 32-bit integers and
+// 32-bit floating-point values (8-bit exponent, 23-bit mantissa).
+//
+//   is_itof = 1 : integer -> float; the source is treated as signed when
+//                 is_signed = 1.
+//   is_itof = 0 : float -> integer; the destination is signed when
+//                 is_signed = 1.
+//
+// Parameters:
+//   NUM_LANES - number of values converted in parallel
+//   TAGW      - width of the tag carried alongside the data
+//
+// Handshake: valid_in/ready_in on the input side, valid_out/ready_out on the
+// output side. All five pipeline registers (pipe_reg0..pipe_reg4) share one
+// enable which is de-asserted only while a valid output is not being
+// accepted (stall = ~ready_out && valid_out); ready_in mirrors that enable.
+//
+// fflags is the OR of the per-lane exception flags of the lanes selected by
+// lane_mask; has_fflags is tied high.
+module VX_fpu_cvt import VX_fpu_pkg::*; #(
+    parameter NUM_LANES = 1,
+    parameter TAGW = 1
+) (
+    input wire clk,
+    input wire reset,
+
+    output wire ready_in,
+    input wire  valid_in,
+
+    input wire [NUM_LANES-1:0] lane_mask,
+
+    input wire [TAGW-1:0] tag_in,
+
+    input wire [`INST_FRM_BITS-1:0] frm,
+
+    input wire is_itof,
+    input wire is_signed,
+
+    input wire [NUM_LANES-1:0][31:0]  dataa,
+    output wire [NUM_LANES-1:0][31:0] result,
+
+    output wire has_fflags,
+    output wire [`FP_FLAGS_BITS-1:0] fflags,
+
+    output wire [TAGW-1:0] tag_out,
+
+    input wire  ready_out,
+    output wire valid_out
+);
+    // Constants
+
+    localparam MAN_BITS = 23;
+    localparam EXP_BITS = 8;
+    localparam EXP_BIAS = 2**(EXP_BITS-1)-1;
+
+    localparam logic [EXP_BITS-1:0] QNAN_EXPONENT = 2**EXP_BITS-1;
+    localparam logic [MAN_BITS-1:0] QNAN_MANTISSA = 2**(MAN_BITS-1);
+
+    // Use 32-bit integer
+    localparam MAX_INT_WIDTH = 32;
+
+    // The internal mantissa includes normal bit or an entire integer
+    localparam INT_MAN_WIDTH = `MAX(MAN_BITS + 1, MAX_INT_WIDTH);
+
+    // Width of the leading-zero count computed over the internal mantissa
+    localparam LZC_RESULT_WIDTH = `CLOG2(INT_MAN_WIDTH);
+
+    // The internal exponent must be able to represent the smallest denormal input value as signed
+    // or the number of bits in an integer
+    localparam INT_EXP_WIDTH = `MAX(`CLOG2(MAX_INT_WIDTH), `MAX(EXP_BITS, `CLOG2(EXP_BIAS + MAN_BITS))) + 1;
+
+    // shift amount for denormalization
+    localparam SHAMT_BITS = `CLOG2(INT_MAN_WIDTH+1);
+
+    localparam FMT_SHIFT_COMPENSATION = INT_MAN_WIDTH - 1 - MAN_BITS;
+    localparam NUM_FP_STICKY  = 2 * INT_MAN_WIDTH - MAN_BITS - 1;   // removed mantissa, 1. and R
+    localparam NUM_INT_STICKY = 2 * INT_MAN_WIDTH - MAX_INT_WIDTH;  // removed int and R
+
+    // Input processing
+
+    fclass_t [NUM_LANES-1:0] fclass;
+
+    // Classify every lane's input as if it were a floating-point value
+    for (genvar i = 0; i < NUM_LANES; ++i) begin
+        VX_fpu_class #(
+            .EXP_BITS (EXP_BITS),
+            .MAN_BITS (MAN_BITS)
+        ) fp_class (
+            .exp_i  (dataa[i][MAN_BITS +: EXP_BITS]),
+            .man_i  (dataa[i][MAN_BITS-1:0]),
+            .clss_o (fclass[i])
+        );
+    end
+
+    wire [NUM_LANES-1:0][INT_MAN_WIDTH-1:0] input_mant;
+    wire [NUM_LANES-1:0][INT_EXP_WIDTH-1:0] input_exp;
+    wire [NUM_LANES-1:0]                    input_sign;
+
+    // Build sign / exponent / mantissa for both interpretations of the input
+    // and select the one matching the requested conversion direction
+    for (genvar i = 0; i < NUM_LANES; ++i) begin
+        wire [INT_MAN_WIDTH-1:0] int_mantissa;
+        wire [INT_MAN_WIDTH-1:0] fmt_mantissa;
+        wire fmt_sign        = dataa[i][31];
+        wire int_sign        = dataa[i][31] && is_signed;
+        // Integer source: use the magnitude of a negative signed value
+        assign int_mantissa  = int_sign ? (-dataa[i]) : dataa[i];
+        // FP source: prepend the implicit bit (set only for normal numbers)
+        assign fmt_mantissa  = INT_MAN_WIDTH'({fclass[i].is_normal, dataa[i][MAN_BITS-1:0]});
+        // Subnormals get their stored exponent incremented by one
+        assign input_exp[i]  = {1'b0, dataa[i][MAN_BITS +: EXP_BITS]} + INT_EXP_WIDTH'({1'b0, fclass[i].is_subnormal});
+        assign input_mant[i] = is_itof ? int_mantissa : fmt_mantissa;
+        assign input_sign[i] = is_itof ? int_sign : fmt_sign;
+    end
+
+    // Pipeline stage0
+
+    wire                    valid_in_s0;
+    wire [NUM_LANES-1:0]    lane_mask_s0;
+    wire [TAGW-1:0]         tag_in_s0;
+    wire                    is_itof_s0;
+    wire                    unsigned_s0;
+    wire [`INST_FRM_BITS-1:0] rnd_mode_s0;  // same width as frm and as counted in DATAW
+    fclass_t [NUM_LANES-1:0] fclass_s0;
+    wire [NUM_LANES-1:0]    input_sign_s0;
+    wire [NUM_LANES-1:0][INT_EXP_WIDTH-1:0] fmt_exponent_s0;
+    wire [NUM_LANES-1:0][INT_MAN_WIDTH-1:0] encoded_mant_s0;
+
+    wire stall;
+
+    VX_pipe_register #(
+        .DATAW  (1 + NUM_LANES + TAGW + 1 + `INST_FRM_BITS + 1 + NUM_LANES * ($bits(fclass_t) + 1 + INT_EXP_WIDTH + INT_MAN_WIDTH)),
+        .RESETW (1)
+    ) pipe_reg0 (
+        .clk      (clk),
+        .reset    (reset),
+        .enable   (~stall),
+        .data_in  ({valid_in, lane_mask, tag_in, is_itof, !is_signed, frm, fclass, input_sign, input_exp, input_mant}),
+        .data_out ({valid_in_s0, lane_mask_s0, tag_in_s0, is_itof_s0, unsigned_s0, rnd_mode_s0, fclass_s0, input_sign_s0, fmt_exponent_s0, encoded_mant_s0})
+    );
+
+    // Normalization
+
+    wire [NUM_LANES-1:0][LZC_RESULT_WIDTH-1:0] renorm_shamt_s0; // renormalization shift amount
+    wire [NUM_LANES-1:0] mant_is_zero_s0;                       // for integer zeroes
+
+    for (genvar i = 0; i < NUM_LANES; ++i) begin
+        wire mant_is_nonzero_s0;
+        VX_lzc #(
+            .N (INT_MAN_WIDTH)
+        ) lzc (
+            .data_in   (encoded_mant_s0[i]),
+            .data_out  (renorm_shamt_s0[i]),
+            .valid_out (mant_is_nonzero_s0)
+        );
+        assign mant_is_zero_s0[i] = ~mant_is_nonzero_s0;
+    end
+
+    wire [NUM_LANES-1:0][INT_MAN_WIDTH-1:0] input_mant_n_s0;    // normalized input mantissa
+    wire [NUM_LANES-1:0][INT_EXP_WIDTH-1:0] input_exp_n_s0;     // unbiased true exponent
+
+    for (genvar i = 0; i < NUM_LANES; ++i) begin
+        // Realign input mantissa, append zeroes if destination is wider
+        assign input_mant_n_s0[i] = encoded_mant_s0[i] << renorm_shamt_s0[i];
+
+        // Unbias exponent and compensate for shift
+        wire [INT_EXP_WIDTH-1:0] fp_input_exp_s0 = fmt_exponent_s0[i] + INT_EXP_WIDTH'(FMT_SHIFT_COMPENSATION - EXP_BIAS) - INT_EXP_WIDTH'({1'b0, renorm_shamt_s0[i]});
+        wire [INT_EXP_WIDTH-1:0] int_input_exp_s0 = INT_EXP_WIDTH'(INT_MAN_WIDTH-1) - INT_EXP_WIDTH'({1'b0, renorm_shamt_s0[i]});
+        assign input_exp_n_s0[i] = is_itof_s0 ? int_input_exp_s0 : fp_input_exp_s0;
+    end
+
+    // Pipeline stage1
+
+    wire                    valid_in_s1;
+    wire [NUM_LANES-1:0]    lane_mask_s1;
+    wire [TAGW-1:0]         tag_in_s1;
+    wire                    is_itof_s1;
+    wire                    unsigned_s1;
+    wire [`INST_FRM_BITS-1:0] rnd_mode_s1;
+    fclass_t [NUM_LANES-1:0] fclass_s1;
+    wire [NUM_LANES-1:0]    input_sign_s1;
+    wire [NUM_LANES-1:0]    mant_is_zero_s1;
+    wire [NUM_LANES-1:0][INT_MAN_WIDTH-1:0] input_mant_s1;
+    wire [NUM_LANES-1:0][INT_EXP_WIDTH-1:0] input_exp_s1;
+
+    VX_pipe_register #(
+        .DATAW  (1 + NUM_LANES + TAGW + 1 + `INST_FRM_BITS + 1 + NUM_LANES * ($bits(fclass_t) + 1 + 1 + INT_MAN_WIDTH + INT_EXP_WIDTH)),
+        .RESETW (1)
+    ) pipe_reg1 (
+        .clk      (clk),
+        .reset    (reset),
+        .enable   (~stall),
+        .data_in  ({valid_in_s0, lane_mask_s0, tag_in_s0, is_itof_s0, unsigned_s0, rnd_mode_s0, fclass_s0, input_sign_s0, mant_is_zero_s0, input_mant_n_s0, input_exp_n_s0}),
+        .data_out ({valid_in_s1, lane_mask_s1, tag_in_s1, is_itof_s1, unsigned_s1, rnd_mode_s1, fclass_s1, input_sign_s1, mant_is_zero_s1, input_mant_s1, input_exp_s1})
+    );
+
+    // Perform adjustments to mantissa and exponent
+
+    wire [NUM_LANES-1:0][2*INT_MAN_WIDTH:0] destination_mant_s1;
+    wire [NUM_LANES-1:0][INT_EXP_WIDTH-1:0] final_exp_s1;
+    wire [NUM_LANES-1:0]                    of_before_round_s1;
+
+    for (genvar i = 0; i < NUM_LANES; ++i) begin
+        reg [2*INT_MAN_WIDTH:0] preshift_mant_s1;   // mantissa before final shift
+        reg [SHAMT_BITS-1:0]    denorm_shamt_s1;    // shift amount for denormalization
+        reg [INT_EXP_WIDTH-1:0] final_exp_tmp_s1;   // after eventual adjustments
+        reg                     of_before_round_tmp_s1;
+
+        always @(*) begin
+            // Defaults: re-bias the exponent, place the mantissa in the upper
+            // bits (INT_MAN_WIDTH+1 zero bits below it), no shift, no overflow
+            final_exp_tmp_s1 = input_exp_s1[i] + INT_EXP_WIDTH'(EXP_BIAS); // take exponent as is, only look at lower bits
+            preshift_mant_s1 = {input_mant_s1[i], {(INT_MAN_WIDTH+1){1'b0}}};
+            denorm_shamt_s1  = '0;
+            of_before_round_tmp_s1 = 1'b0;
+
+            if (is_itof_s1) begin
+                // Destination is floating point
+                if ($signed(input_exp_s1[i]) >= INT_EXP_WIDTH'($signed(2**EXP_BITS-1-EXP_BIAS))) begin
+                    // Overflow or infinities (for proper rounding)
+                    final_exp_tmp_s1 = (2**EXP_BITS-2); // largest normal value
+                    preshift_mant_s1 = '1;  // largest normal value and RS bits set
+                    of_before_round_tmp_s1 = 1'b1;
+                end else if ($signed(input_exp_s1[i]) < INT_EXP_WIDTH'($signed(-MAN_BITS-EXP_BIAS))) begin
+                    // Limit the shift to retain sticky bits
+                    final_exp_tmp_s1 = '0; // denormal result
+                    denorm_shamt_s1  = (2 + MAN_BITS); // to sticky
+                end else if ($signed(input_exp_s1[i]) < INT_EXP_WIDTH'($signed(1-EXP_BIAS))) begin
+                    // Denormalize underflowing values
+                    final_exp_tmp_s1 = '0; // denormal result
+                    denorm_shamt_s1  = SHAMT_BITS'(1-EXP_BIAS) - SHAMT_BITS'(input_exp_s1[i]); // adjust right shifting
+                end
+            end else begin
+                // Destination is integer
+                if ($signed(input_exp_s1[i]) >= $signed(INT_EXP_WIDTH'(MAX_INT_WIDTH-1) + INT_EXP_WIDTH'(unsigned_s1))) begin
+                    // overflow: when converting to unsigned the range is larger by one
+                    of_before_round_tmp_s1 = 1'b1;
+                end else if ($signed(input_exp_s1[i]) < INT_EXP_WIDTH'($signed(-1))) begin
+                    // underflow
+                    denorm_shamt_s1 = MAX_INT_WIDTH+1; // all bits go to the sticky
+                end else begin
+                    // By default right shift mantissa to be an integer
+                    denorm_shamt_s1 = SHAMT_BITS'(MAX_INT_WIDTH-1) - SHAMT_BITS'(input_exp_s1[i]);
+                end
+            end
+        end
+
+        assign destination_mant_s1[i] = preshift_mant_s1 >> denorm_shamt_s1;
+        assign final_exp_s1[i]        = final_exp_tmp_s1;
+        assign of_before_round_s1[i]  = of_before_round_tmp_s1;
+    end
+
+    // Pipeline stage2
+
+    wire                    valid_in_s2;
+    wire [NUM_LANES-1:0]    lane_mask_s2;
+    wire [TAGW-1:0]         tag_in_s2;
+    wire                    is_itof_s2;
+    wire                    unsigned_s2;
+    wire [`INST_FRM_BITS-1:0] rnd_mode_s2;
+    fclass_t [NUM_LANES-1:0] fclass_s2;
+    wire [NUM_LANES-1:0]    mant_is_zero_s2;
+    wire [NUM_LANES-1:0]    input_sign_s2;
+    wire [NUM_LANES-1:0][2*INT_MAN_WIDTH:0] destination_mant_s2;
+    wire [NUM_LANES-1:0][INT_EXP_WIDTH-1:0] final_exp_s2;
+    wire [NUM_LANES-1:0]    of_before_round_s2;
+
+    VX_pipe_register #(
+        .DATAW  (1 + NUM_LANES + TAGW + 1 + 1 + `INST_FRM_BITS + NUM_LANES * ($bits(fclass_t) + 1 + 1 + (2*INT_MAN_WIDTH+1) + INT_EXP_WIDTH + 1)),
+        .RESETW (1)
+    ) pipe_reg2 (
+        .clk      (clk),
+        .reset    (reset),
+        .enable   (~stall),
+        .data_in  ({valid_in_s1, lane_mask_s1, tag_in_s1, is_itof_s1, unsigned_s1, rnd_mode_s1, fclass_s1, mant_is_zero_s1, input_sign_s1, destination_mant_s1, final_exp_s1, of_before_round_s1}),
+        .data_out ({valid_in_s2, lane_mask_s2, tag_in_s2, is_itof_s2, unsigned_s2, rnd_mode_s2, fclass_s2, mant_is_zero_s2, input_sign_s2, destination_mant_s2, final_exp_s2, of_before_round_s2})
+    );
+
+    wire [NUM_LANES-1:0]       rounded_sign_s2;
+    wire [NUM_LANES-1:0][31:0] rounded_abs_s2;      // absolute value of result after rounding
+    wire [NUM_LANES-1:0]       int_round_has_sticky_s2;
+    wire [NUM_LANES-1:0]       fp_round_has_sticky_s2;
+
+    // Rounding and classification
+
+    for (genvar i = 0; i < NUM_LANES; ++i) begin
+        wire [MAN_BITS-1:0]      final_mant_s2;        // mantissa after adjustments
+        wire [MAX_INT_WIDTH-1:0] final_int_s2;         // integer shifted in position
+        wire [1:0]               round_sticky_bits_s2;
+        wire [31:0]              fmt_pre_round_abs_s2;
+        wire [31:0]              pre_round_abs_s2;
+        wire [1:0]               int_round_sticky_bits_s2, fp_round_sticky_bits_s2;
+
+        // Extract final mantissa and round bit, discard the normal bit (for FP)
+        assign {final_mant_s2, fp_round_sticky_bits_s2[1]} = destination_mant_s2[i][2*INT_MAN_WIDTH-1 : 2*INT_MAN_WIDTH-1 - (MAN_BITS+1) + 1];
+        assign {final_int_s2, int_round_sticky_bits_s2[1]} = destination_mant_s2[i][2*INT_MAN_WIDTH   : 2*INT_MAN_WIDTH   - (MAX_INT_WIDTH+1) + 1];
+
+        // Collapse sticky bits
+        assign fp_round_sticky_bits_s2[0]  = (| destination_mant_s2[i][NUM_FP_STICKY-1:0]);
+        assign int_round_sticky_bits_s2[0] = (| destination_mant_s2[i][NUM_INT_STICKY-1:0]);
+        assign fp_round_has_sticky_s2[i]   = (| fp_round_sticky_bits_s2);
+        assign int_round_has_sticky_s2[i]  = (| int_round_sticky_bits_s2);
+
+        // select RS bits for destination operation
+        assign round_sticky_bits_s2 = is_itof_s2 ? fp_round_sticky_bits_s2 : int_round_sticky_bits_s2;
+
+        // Pack exponent and mantissa into proper rounding form
+        assign fmt_pre_round_abs_s2 = {1'b0, final_exp_s2[i][EXP_BITS-1:0], final_mant_s2[MAN_BITS-1:0]};
+
+        // Select output with destination format and operation
+        assign pre_round_abs_s2 = is_itof_s2 ? fmt_pre_round_abs_s2 : final_int_s2;
+
+        // Perform the rounding
+        VX_fpu_rounding #(
+            .DAT_WIDTH (32)
+        ) fp_rounding (
+            .abs_value_i (pre_round_abs_s2),
+            .sign_i      (input_sign_s2[i]),
+            .round_sticky_bits_i (round_sticky_bits_s2),
+            .rnd_mode_i  (rnd_mode_s2),
+            .effective_subtraction_i (1'b0),
+            .abs_rounded_o (rounded_abs_s2[i]),
+            .sign_o      (rounded_sign_s2[i]),
+            `UNUSED_PIN  (exact_zero_o)
+        );
+    end
+
+    // Pipeline stage3
+
+    wire                 valid_in_s3;
+    wire [NUM_LANES-1:0] lane_mask_s3;
+    wire [TAGW-1:0]      tag_in_s3;
+    wire                 is_itof_s3;
+    wire                 unsigned_s3;
+    fclass_t [NUM_LANES-1:0] fclass_s3;
+    wire [NUM_LANES-1:0] mant_is_zero_s3;
+    wire [NUM_LANES-1:0] input_sign_s3;
+    wire [NUM_LANES-1:0] rounded_sign_s3;
+    wire [NUM_LANES-1:0][31:0] rounded_abs_s3;
+    wire [NUM_LANES-1:0] of_before_round_s3;
+    wire [NUM_LANES-1:0] int_round_has_sticky_s3;
+    wire [NUM_LANES-1:0] fp_round_has_sticky_s3;
+
+    VX_pipe_register #(
+        .DATAW  (1 + NUM_LANES + TAGW + 1 + 1 + NUM_LANES * ($bits(fclass_t) + 1 + 1 + 32 + 1 + 1 + 1 + 1)),
+        .RESETW (1)
+    ) pipe_reg3 (
+        .clk      (clk),
+        .reset    (reset),
+        .enable   (~stall),
+        .data_in  ({valid_in_s2, lane_mask_s2, tag_in_s2, is_itof_s2, unsigned_s2, fclass_s2, mant_is_zero_s2, input_sign_s2, rounded_abs_s2, rounded_sign_s2, of_before_round_s2, int_round_has_sticky_s2, fp_round_has_sticky_s2}),
+        .data_out ({valid_in_s3, lane_mask_s3, tag_in_s3, is_itof_s3, unsigned_s3, fclass_s3, mant_is_zero_s3, input_sign_s3, rounded_abs_s3, rounded_sign_s3, of_before_round_s3, int_round_has_sticky_s3, fp_round_has_sticky_s3})
+    );
+
+    wire [NUM_LANES-1:0] of_after_round_s3;
+    wire [NUM_LANES-1:0] uf_after_round_s3;
+    wire [NUM_LANES-1:0][31:0] fmt_result_s3;
+    wire [NUM_LANES-1:0][31:0] rounded_int_res_s3; // after possible inversion
+    wire [NUM_LANES-1:0] rounded_int_res_zero_s3;  // after rounding
+
+    for (genvar i = 0; i < NUM_LANES; ++i) begin
+        // Assemble regular result, nan box short ones. Int zeroes need to be detected
+        assign fmt_result_s3[i] = (is_itof_s3 & mant_is_zero_s3[i]) ? 0 : {rounded_sign_s3[i], rounded_abs_s3[i][EXP_BITS+MAN_BITS-1:0]};
+
+        // Classification after rounding select by destination format.
+        // The all-ones test uses '1 so the literal takes the width of the
+        // exponent slice; a 32-bit ~0 could never equal the zero-extended
+        // EXP_BITS-wide field.
+        assign uf_after_round_s3[i] = (rounded_abs_s3[i][EXP_BITS+MAN_BITS-1:MAN_BITS] == '0); // denormal
+        assign of_after_round_s3[i] = (rounded_abs_s3[i][EXP_BITS+MAN_BITS-1:MAN_BITS] == '1); // inf exp.
+
+        // Negative integer result needs to be brought into two's complement
+        assign rounded_int_res_s3[i] = rounded_sign_s3[i] ? (-rounded_abs_s3[i]) : rounded_abs_s3[i];
+        assign rounded_int_res_zero_s3[i] = (rounded_int_res_s3[i] == 0);
+    end
+
+    // FP Special case handling
+
+    wire [NUM_LANES-1:0][31:0] fp_special_result_s3;
+    fflags_t [NUM_LANES-1:0]   fp_special_status_s3;
+    wire [NUM_LANES-1:0]       fp_result_is_special_s3;
+
+    for (genvar i = 0; i < NUM_LANES; ++i) begin
+        // Detect special case from source format, I2F casts don't produce a special result.
+        // NOTE(review): this is gated by ~is_itof_s3 while the FP result below is only
+        // selected when is_itof_s3 is set, so the special FP path looks unreachable in
+        // this module - confirm that is intended.
+        assign fp_result_is_special_s3[i] = ~is_itof_s3 & (fclass_s3[i].is_zero | fclass_s3[i].is_nan);
+
+        // Signalling input NaNs raise invalid flag, otherwise no flags set
+        assign fp_special_status_s3[i] = fclass_s3[i].is_signaling ? {1'b1, 4'h0} : 5'h0; // invalid operation
+
+        // Assemble result according to destination format (per-lane sign for the signed zero)
+        assign fp_special_result_s3[i] = fclass_s3[i].is_zero ? (32'(input_sign_s3[i]) << 31) // signed zero
+                                                              : {1'b0, QNAN_EXPONENT, QNAN_MANTISSA}; // qNaN
+    end
+
+    // INT Special case handling
+
+    reg [NUM_LANES-1:0][31:0] int_special_result_s3;
+    fflags_t [NUM_LANES-1:0]  int_special_status_s3;
+    wire [NUM_LANES-1:0]      int_result_is_special_s3;
+
+    for (genvar i = 0; i < NUM_LANES; ++i) begin
+        // Assemble result according to destination format
+        always @(*) begin
+            if (input_sign_s3[i] && !fclass_s3[i].is_nan) begin
+                // Negative, non-NaN source: saturate to the smallest destination value
+                int_special_result_s3[i][30:0] = '0;            // low bits cleared
+                int_special_result_s3[i][31]   = ~unsigned_s3;  // signed: 0x80000000 (-2**31), unsigned: 0
+            end else begin
+                // Positive source or NaN: saturate to the largest destination value
+                int_special_result_s3[i][30:0] = '1;            // alone yields 2**31-1
+                int_special_result_s3[i][31]   = unsigned_s3;   // unsigned: 0xFFFFFFFF (2**32-1), signed: 0x7FFFFFFF
+            end
+        end
+
+        // Detect special case from source format (inf, nan, overflow, nan-boxing or negative unsigned)
+        assign int_result_is_special_s3[i] = fclass_s3[i].is_nan
+                                           | fclass_s3[i].is_inf
+                                           | of_before_round_s3[i]
+                                           | (input_sign_s3[i] & unsigned_s3 & ~rounded_int_res_zero_s3[i]);
+
+        // All integer special cases are invalid
+        assign int_special_status_s3[i] = {1'b1, 4'h0};
+    end
+
+    // Result selection and Output handshake
+
+    fflags_t [NUM_LANES-1:0] tmp_fflags_s3;
+    wire [NUM_LANES-1:0][31:0] tmp_result_s3;
+
+    for (genvar i = 0; i < NUM_LANES; ++i) begin
+        fflags_t fp_regular_status_s3, int_regular_status_s3;
+        fflags_t fp_status_s3, int_status_s3;
+        wire [31:0] fp_result_s3, int_result_s3;
+
+        // For I2F casts only the discarded bits make the result inexact
+        // (overflow is reported as invalid there); otherwise an overflow of
+        // a non-infinite source is inexact as well
+        wire inexact_s3 = is_itof_s3 ? fp_round_has_sticky_s3[i]
+                                     : (fp_round_has_sticky_s3[i] || (~fclass_s3[i].is_inf && (of_before_round_s3[i] || of_after_round_s3[i])));
+
+        assign fp_regular_status_s3.NV = is_itof_s3 & (of_before_round_s3[i] | of_after_round_s3[i]); // overflow is invalid for I2F casts
+        assign fp_regular_status_s3.DZ = 1'b0; // no divisions
+        assign fp_regular_status_s3.OF = ~is_itof_s3 & (~fclass_s3[i].is_inf & (of_before_round_s3[i] | of_after_round_s3[i])); // inf casts no OF
+        assign fp_regular_status_s3.UF = uf_after_round_s3[i] & inexact_s3;
+        assign fp_regular_status_s3.NX = inexact_s3;
+
+        // Integer destination: only the inexact flag (LSB) can be raised on the regular path
+        assign int_regular_status_s3 = int_round_has_sticky_s3[i] ? {4'h0, 1'b1} : 5'h0;
+
+        assign fp_result_s3  = fp_result_is_special_s3[i]  ? fp_special_result_s3[i]  : fmt_result_s3[i];
+        assign int_result_s3 = int_result_is_special_s3[i] ? int_special_result_s3[i] : rounded_int_res_s3[i];
+
+        assign fp_status_s3  = fp_result_is_special_s3[i]  ? fp_special_status_s3[i]  : fp_regular_status_s3;
+        assign int_status_s3 = int_result_is_special_s3[i] ? int_special_status_s3[i] : int_regular_status_s3;
+
+        // Select output depending on special case detection
+        assign tmp_result_s3[i] = is_itof_s3 ? fp_result_s3 : int_result_s3;
+        assign tmp_fflags_s3[i] = is_itof_s3 ? fp_status_s3 : int_status_s3;
+    end
+
+    // Hold the whole pipeline while a valid output is not being accepted
+    assign stall = ~ready_out && valid_out;
+
+    // OR together the flags of the lanes enabled in lane_mask_s3
+    fflags_t fflags_merged;
+    `FPU_MERGE_FFLAGS(fflags_merged, tmp_fflags_s3, lane_mask_s3, NUM_LANES);
+
+    VX_pipe_register #(
+        .DATAW  (1 + TAGW + (NUM_LANES * 32) + `FP_FLAGS_BITS),
+        .RESETW (1)
+    ) pipe_reg4 (
+        .clk      (clk),
+        .reset    (reset),
+        .enable   (!stall),
+        .data_in  ({valid_in_s3, tag_in_s3, tmp_result_s3, fflags_merged}),
+        .data_out ({valid_out, tag_out, result, fflags})
+    );
+
+    assign ready_in = ~stall;
+
+    assign has_fflags = 1'b1;
+
+endmodule
+`endif
--- a/hw/rtl/fpu/VX_fpu_define.vh
+++ b/hw/rtl/fpu/VX_fpu_define.vh
@@ -0,0 +1,42 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+`ifndef VX_FPU_DEFINE_VH
+`define VX_FPU_DEFINE_VH
+
+`include "VX_define.vh"
+
+`ifndef SYNTHESIS
+`include "float_dpi.vh"
+`endif
+
+// FPU_MERGE_FFLAGS(out, in, mask, lanes)
+// Declares a helper variable named __<out>, clears it, then ORs into it the
+// NV/DZ/OF/UF/NX fields of every in[k] whose mask[k] bit is set, for k in
+// 0..lanes-1, and finally drives <out> from that helper with a continuous
+// assignment. The macro text ends without a semicolon, so the invocation
+// supplies it.
+`define FPU_MERGE_FFLAGS(out, in, mask, lanes) \
+    fflags_t __``out; \
+    always @(*) begin \
+        __``out = '0; \
+        for (integer __lane = 0; __lane < lanes; __lane = __lane + 1) begin \
+            if (mask[__lane]) begin \
+                __``out.NV = __``out.NV | in[__lane].NV; \
+                __``out.DZ = __``out.DZ | in[__lane].DZ; \
+                __``out.OF = __``out.OF | in[__lane].OF; \
+                __``out.UF = __``out.UF | in[__lane].UF; \
+                __``out.NX = __``out.NX | in[__lane].NX; \
+            end \
+        end \
+    end \
+    assign out = __``out
+    
+// Bit widths (via $bits) of the fclass_t and fflags_t types declared in VX_fpu_pkg
+`define FP_CLASS_BITS   $bits(VX_fpu_pkg::fclass_t)
+`define FP_FLAGS_BITS   $bits(VX_fpu_pkg::fflags_t)
+
+`endif // VX_FPU_DEFINE_VH
--- a/hw/rtl/fpu/VX_fpu_div.sv
+++ b/hw/rtl/fpu/VX_fpu_div.sv
@@ -0,0 +1,137 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+`include "VX_fpu_define.vh"
+
+`ifdef FPU_DSP
+
+// VX_fpu_div: multi-lane floating-point divider wrapper (result = dataa / datab
+// per lane, as delegated to the selected backend).
+//
+// The valid bit, lane mask and tag travel through a shift register of depth
+// `LATENCY_FDIV; the data path is one of:
+//   QUARTUS : acl_fdiv cores (no exception flags, has_fflags = 0)
+//   VIVADO  : xil_fdiv cores (flags taken from m_axis_result_tuser)
+//   default : dpi_fdiv call followed by a matching shift register
+// The per-lane flags are merged over the lanes set in the delayed lane mask.
+module VX_fpu_div import VX_fpu_pkg::*; #(
+    parameter NUM_LANES = 1,
+    parameter TAGW = 1
+) (
+    input wire clk,
+    input wire reset, 
+
+    output wire ready_in,
+    input wire  valid_in,
+
+    input wire [NUM_LANES-1:0] lane_mask,
+
+    input wire [TAGW-1:0] tag_in,
+
+    input wire [`INST_FRM_BITS-1:0] frm,
+    
+    input wire [NUM_LANES-1:0][31:0]  dataa,
+    input wire [NUM_LANES-1:0][31:0]  datab,
+    output wire [NUM_LANES-1:0][31:0] result, 
+
+    output wire has_fflags,
+    output wire [`FP_FLAGS_BITS-1:0] fflags,
+
+    output wire [TAGW-1:0] tag_out,
+
+    input wire  ready_out,
+    output wire valid_out
+);
+    `UNUSED_VAR (frm)
+
+    // Advance unless a valid result is pending and the consumer is not ready
+    wire enable = ready_out || ~valid_out;
+
+    // New operations are accepted whenever the pipeline advances
+    assign ready_in = enable;
+
+    wire [NUM_LANES-1:0] lane_mask_out;
+    fflags_t [NUM_LANES-1:0] per_lane_fflags;
+
+    // Control path: delay valid, lane mask and tag by the divider latency
+    VX_shift_register #(
+        .DATAW  (1 + NUM_LANES + TAGW),
+        .DEPTH  (`LATENCY_FDIV),
+        .RESETW (1)
+    ) shift_reg (
+        .clk      (clk),
+        .reset    (reset),
+        .enable   (enable),
+        .data_in  ({valid_in, lane_mask, tag_in}),
+        .data_out ({valid_out, lane_mask_out, tag_out})
+    );
+
+`ifdef QUARTUS
+
+    // Intel backend: one acl_fdiv core per lane, no exception flags reported
+    assign has_fflags = 1'b0;
+    assign per_lane_fflags = 'x;
+
+    for (genvar i = 0; i < NUM_LANES; ++i) begin
+        acl_fdiv fdiv (
+            .clk    (clk),
+            .areset (1'b0),
+            .en     (enable),
+            .a      (dataa[i]),
+            .b      (datab[i]),
+            .q      (result[i])
+        );
+    end
+
+`elsif VIVADO
+
+    // Xilinx backend: one xil_fdiv core per lane, flags come from tuser
+    assign has_fflags = 1'b1;
+
+    for (genvar i = 0; i < NUM_LANES; ++i) begin
+        wire [3:0] tuser;
+
+        xil_fdiv fdiv (
+            .aclk                (clk),
+            .aclken              (enable),
+            .s_axis_a_tvalid     (1'b1),
+            .s_axis_a_tdata      (dataa[i]),
+            .s_axis_b_tvalid     (1'b1),
+            .s_axis_b_tdata      (datab[i]),
+            `UNUSED_PIN (m_axis_result_tvalid),
+            .m_axis_result_tdata (result[i]),
+            .m_axis_result_tuser (tuser)
+        );
+
+        // Reorder the core's status bits into the {NV, DZ, OF, UF, NX} layout
+        wire tuser_nv = tuser[2];
+        wire tuser_dz = tuser[3];
+        wire tuser_of = tuser[1];
+        wire tuser_uf = tuser[0];
+
+        assign per_lane_fflags[i] = {tuser_nv, tuser_dz, tuser_of, tuser_uf, 1'b0};
+    end
+
+`else
+
+    // Simulation backend: DPI call, then delay result and flags by the divider latency
+    assign has_fflags = 1'b1;
+
+    for (genvar i = 0; i < NUM_LANES; ++i) begin
+        reg [63:0] dpi_result;
+        `UNUSED_VAR (dpi_result)
+
+        fflags_t dpi_fflags;
+
+        always @(*) begin
+            dpi_fdiv (enable && valid_in, int'(0), {32'hffffffff, dataa[i]}, {32'hffffffff,  datab[i]}, frm, dpi_result, dpi_fflags);
+        end
+
+        VX_shift_register #(
+            .DATAW  (32 + $bits(fflags_t)),
+            .DEPTH  (`LATENCY_FDIV)
+        ) shift_req_dpi (
+            .clk      (clk),
+            `UNUSED_PIN (reset),
+            .enable   (enable),
+            .data_in  ({dpi_result[31:0], dpi_fflags}),
+            .data_out ({result[i], per_lane_fflags[i]})
+        );
+    end
+
+`endif
+
+// OR together the flags of the lanes enabled in the delayed lane mask
+`FPU_MERGE_FFLAGS(fflags, per_lane_fflags, lane_mask_out, NUM_LANES);
+
+endmodule
+`endif
--- a/hw/rtl/fpu/VX_fpu_dpi.sv
+++ b/hw/rtl/fpu/VX_fpu_dpi.sv
@@ -0,0 +1,490 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+`include "VX_fpu_define.vh"
+
+`ifdef FPU_DPI
+// VX_fpu_dpi: FPU variant that evaluates every operation through dpi_* calls, delays each op group in a shift register, and arbitrates the responses onto one output.
+module VX_fpu_dpi import VX_fpu_pkg::*; #( 
+    parameter NUM_LANES = 1, // number of data lanes per request
+    parameter TAGW      = 1, // width of tag_in / tag_out
+    parameter OUT_REG   = 0  // passed to the OUT_REG parameter of the response arbiter (rsp_arb)
+) (
+    input wire clk,
+    input wire reset,
+    // request handshake; ready_in reflects the group selected by the current op_type
+    input wire  valid_in,
+    output wire ready_in,
+    // passed to FPU_MERGE_FFLAGS together with the per-lane flags of each group
+    input wire [NUM_LANES-1:0] lane_mask,
+    // carried alongside the result and returned on tag_out
+    input wire [TAGW-1:0] tag_in,
+    // op_type selects the operation; frm is forwarded to the dpi_* calls and also selects the sub-op in the fncp group
+    input wire [`INST_FPU_BITS-1:0] op_type,
+    input wire [`INST_FMT_BITS-1:0] fmt,
+    input wire [`INST_FRM_BITS-1:0] frm,
+
+    input wire [NUM_LANES-1:0][`XLEN-1:0]  dataa,
+    input wire [NUM_LANES-1:0][`XLEN-1:0]  datab,
+    input wire [NUM_LANES-1:0][`XLEN-1:0]  datac,
+    output wire [NUM_LANES-1:0][`XLEN-1:0] result, 
+
+    output wire has_fflags,
+    output wire [`FP_FLAGS_BITS-1:0] fflags,
+
+    output wire [TAGW-1:0] tag_out,
+
+    input wire  ready_out,
+    output wire valid_out
+);
+    localparam FPU_FMA     = 0; // FPU_* constants index the per_core_* arrays
+    localparam FPU_DIVSQRT = 1;
+    localparam FPU_CVT     = 2;
+    localparam FPU_NCP     = 3;
+    localparam NUM_FPC     = 4; // number of groups
+    localparam FPC_BITS    = `LOG2UP(NUM_FPC);
+
+    localparam RSP_DATAW = (NUM_LANES * `XLEN) + 1 + $bits(fflags_t) + TAGW; // lane results + has_fflags bit + fflags + tag
+    // Per-group response channels, indexed by the FPU_* constants above.
+    wire [NUM_FPC-1:0] per_core_ready_in;
+    wire [NUM_FPC-1:0][NUM_LANES-1:0][`XLEN-1:0] per_core_result;
+    wire [NUM_FPC-1:0][TAGW-1:0] per_core_tag_out;
+    reg  [NUM_FPC-1:0] per_core_ready_out;
+    wire [NUM_FPC-1:0] per_core_valid_out;    
+    wire [NUM_FPC-1:0] per_core_has_fflags;  
+    fflags_t [NUM_FPC-1:0] per_core_fflags;  
+    // Divide and square-root use separate pipelines; div_sqrt_arb merges them into the FPU_DIVSQRT channel.
+    wire div_ready_in, sqrt_ready_in;
+    wire [NUM_LANES-1:0][`XLEN-1:0] div_result, sqrt_result;
+    wire [TAGW-1:0] div_tag_out, sqrt_tag_out;
+    wire div_ready_out, sqrt_ready_out;
+    wire div_valid_out, sqrt_valid_out;    
+    wire div_has_fflags, sqrt_has_fflags;  
+    fflags_t div_fflags, sqrt_fflags;
+
+    reg [FPC_BITS-1:0] core_select;
+
+    reg is_fadd, is_fsub, is_fmul, is_fmadd, is_fmsub, is_fnmadd, is_fnmsub;
+    reg is_div, is_fcmp, is_itof, is_utof, is_ftoi, is_ftou, is_f2f;    
+    reg dst_fmt, int_fmt;
+    // operands[k][lane]: k = 0/1/2 holds dataa/datab/datac cast to 64 bits for the dpi_* calls.
+    reg [NUM_LANES-1:0][63:0] operands [3];
+    
+    always @(*) begin
+        for (integer i = 0; i < NUM_LANES; ++i) begin
+            operands[0][i] = 64'(dataa[i]);
+            operands[1][i] = 64'(datab[i]);
+            operands[2][i] = 64'(datac[i]);
+        end
+    end
+
+    `UNUSED_VAR (fmt)
+    // Decode op_type into the target group (core_select) and per-operation flags; all flags default to 0.
+    always @(*) begin
+        is_fadd   = 0;
+        is_fsub   = 0;        
+        is_fmul   = 0;        
+        is_fmadd  = 0;
+        is_fmsub  = 0;
+        is_fnmadd = 0;           
+        is_fnmsub = 0; 
+        is_div    = 0;      
+        is_fcmp   = 0;
+        is_itof   = 0;
+        is_utof   = 0;
+        is_ftoi   = 0;
+        is_ftou   = 0;
+        is_f2f    = 0;
+        
+        dst_fmt   = 0;
+        int_fmt   = 0;
+        // dst_fmt / int_fmt stay 0 unless FLEN_64 / XLEN_64 is defined, in which case they take fmt[0] / fmt[1]
+    `ifdef FLEN_64
+        dst_fmt = fmt[0];
+    `endif
+
+    `ifdef XLEN_64
+        int_fmt = fmt[1];
+    `endif
+        // INST_FPU_SQRT sets no flag: inside FPU_DIVSQRT, sqrt is identified by is_div == 0
+        case (op_type)
+            `INST_FPU_ADD:   begin core_select = FPU_FMA; is_fadd = 1; end
+            `INST_FPU_SUB:   begin core_select = FPU_FMA; is_fsub = 1; end
+            `INST_FPU_MUL:   begin core_select = FPU_FMA; is_fmul = 1; end
+            `INST_FPU_MADD:  begin core_select = FPU_FMA; is_fmadd = 1; end
+            `INST_FPU_MSUB:  begin core_select = FPU_FMA; is_fmsub = 1; end
+            `INST_FPU_NMADD: begin core_select = FPU_FMA; is_fnmadd = 1; end
+            `INST_FPU_NMSUB: begin core_select = FPU_FMA; is_fnmsub = 1; end
+            `INST_FPU_DIV:   begin core_select = FPU_DIVSQRT; is_div = 1; end
+            `INST_FPU_SQRT:  begin core_select = FPU_DIVSQRT; end
+            `INST_FPU_CMP:   begin core_select = FPU_NCP; is_fcmp = 1; end
+            `INST_FPU_F2I:   begin core_select = FPU_CVT; is_ftoi = 1; end
+            `INST_FPU_F2U:   begin core_select = FPU_CVT; is_ftou = 1; end
+            `INST_FPU_I2F:   begin core_select = FPU_CVT; is_itof = 1; end
+            `INST_FPU_U2F:   begin core_select = FPU_CVT; is_utof = 1; end
+            `INST_FPU_F2F:   begin core_select = FPU_CVT; is_f2f  = 1; end            
+            default:         begin core_select = FPU_NCP; end // every other op_type is routed to the fncp group
+        endcase
+    end
+
+    generate 
+    begin : fma
+        // FMA group: add, sub, mul and the four fused multiply-add variants, all evaluated through dpi_* calls.
+        reg [NUM_LANES-1:0][`XLEN-1:0] result_fma;
+        wire [NUM_LANES-1:0][63:0] result_fadd;
+        wire [NUM_LANES-1:0][63:0] result_fsub;
+        wire [NUM_LANES-1:0][63:0] result_fmul;
+        wire [NUM_LANES-1:0][63:0] result_fmadd;
+        wire [NUM_LANES-1:0][63:0] result_fmsub;
+        wire [NUM_LANES-1:0][63:0] result_fnmadd;
+        wire [NUM_LANES-1:0][63:0] result_fnmsub;
+        
+        fflags_t [NUM_LANES-1:0] fflags_fma;
+        fflags_t [NUM_LANES-1:0] fflags_fadd;
+        fflags_t [NUM_LANES-1:0] fflags_fsub;
+        fflags_t [NUM_LANES-1:0] fflags_fmul;
+        fflags_t [NUM_LANES-1:0] fflags_fmadd;
+        fflags_t [NUM_LANES-1:0] fflags_fmsub;
+        fflags_t [NUM_LANES-1:0] fflags_fnmadd;
+        fflags_t [NUM_LANES-1:0] fflags_fnmsub;
+        // fma_ready: the response arbiter accepts this group's output, or that output is not valid
+        wire fma_valid = (valid_in && core_select == FPU_FMA);
+        wire fma_ready = per_core_ready_out[FPU_FMA] || ~per_core_valid_out[FPU_FMA];
+        wire fma_fire  = fma_valid && fma_ready;
+        // All seven dpi_* ops are called for every lane with fma_fire as first argument; the is_* flags pick one.
+        always @(*) begin        
+            for (integer i = 0; i < NUM_LANES; ++i) begin
+                dpi_fadd   (fma_fire, int'(dst_fmt), operands[0][i], operands[1][i], frm, result_fadd[i], fflags_fadd[i]);
+                dpi_fsub   (fma_fire, int'(dst_fmt), operands[0][i], operands[1][i], frm, result_fsub[i], fflags_fsub[i]);
+                dpi_fmul   (fma_fire, int'(dst_fmt), operands[0][i], operands[1][i], frm, result_fmul[i], fflags_fmul[i]);
+                dpi_fmadd  (fma_fire, int'(dst_fmt), operands[0][i], operands[1][i], operands[2][i], frm, result_fmadd[i], fflags_fmadd[i]);
+                dpi_fmsub  (fma_fire, int'(dst_fmt), operands[0][i], operands[1][i], operands[2][i], frm, result_fmsub[i], fflags_fmsub[i]);
+                dpi_fnmadd (fma_fire, int'(dst_fmt), operands[0][i], operands[1][i], operands[2][i], frm, result_fnmadd[i], fflags_fnmadd[i]);
+                dpi_fnmsub (fma_fire, int'(dst_fmt), operands[0][i], operands[1][i], operands[2][i], frm, result_fnmsub[i], fflags_fnmsub[i]);
+                // keep the low XLEN bits of the decoded op's result ('0 when no flag is set)
+                result_fma[i] = is_fadd   ? result_fadd[i][`XLEN-1:0] :
+                                is_fsub   ? result_fsub[i][`XLEN-1:0] :
+                                is_fmul   ? result_fmul[i][`XLEN-1:0] :
+                                is_fmadd  ? result_fmadd[i][`XLEN-1:0] :               
+                                is_fmsub  ? result_fmsub[i][`XLEN-1:0] :
+                                is_fnmadd ? result_fnmadd[i][`XLEN-1:0] :               
+                                is_fnmsub ? result_fnmsub[i][`XLEN-1:0] :
+                                            '0;
+
+                fflags_fma[i] = is_fadd   ? fflags_fadd[i] :
+                                is_fsub   ? fflags_fsub[i] :
+                                is_fmul   ? fflags_fmul[i] :
+                                is_fmadd  ? fflags_fmadd[i] :               
+                                is_fmsub  ? fflags_fmsub[i] :
+                                is_fnmadd ? fflags_fnmadd[i] :               
+                                is_fnmsub ? fflags_fnmsub[i] : 
+                                            '0;                
+            end
+        end
+
+        fflags_t fflags_merged;
+        `FPU_MERGE_FFLAGS(fflags_merged, fflags_fma, lane_mask, NUM_LANES);
+        // Shift register of depth LATENCY_FMA carrying {valid, tag, result, flags}; it advances only while fma_ready.
+        VX_shift_register #(
+            .DATAW  (1 + TAGW + NUM_LANES * `XLEN + $bits(fflags_t)),
+            .DEPTH  (`LATENCY_FMA),
+            .RESETW (1)
+        ) shift_reg (
+            .clk      (clk),
+            .reset    (reset),
+            .enable   (fma_ready),
+            .data_in  ({fma_valid, tag_in, result_fma, fflags_merged}),
+            .data_out ({per_core_valid_out[FPU_FMA], per_core_tag_out[FPU_FMA], per_core_result[FPU_FMA], per_core_fflags[FPU_FMA]})
+        );
+
+        assign per_core_has_fflags[FPU_FMA] = 1;
+        assign per_core_ready_in[FPU_FMA] = fma_ready;
+
+    end
+    endgenerate
+
+    generate 
+    begin : fdiv
+        // Divide pipeline: one half of the FPU_DIVSQRT group, selected when is_div is set.
+        reg [NUM_LANES-1:0][`XLEN-1:0] result_fdiv_r;
+        wire [NUM_LANES-1:0][63:0] result_fdiv;
+        fflags_t [NUM_LANES-1:0] fflags_fdiv;
+        // fdiv_ready: div_sqrt_arb accepts the divide output, or that output is not valid
+        wire fdiv_valid = (valid_in && core_select == FPU_DIVSQRT) && is_div;
+        wire fdiv_ready = div_ready_out || ~div_valid_out;
+        wire fdiv_fire  = fdiv_valid && fdiv_ready;
+        
+        always @(*) begin        
+            for (integer i = 0; i < NUM_LANES; ++i) begin                
+                dpi_fdiv (fdiv_fire, int'(dst_fmt), operands[0][i], operands[1][i], frm, result_fdiv[i], fflags_fdiv[i]);
+                result_fdiv_r[i] = result_fdiv[i][`XLEN-1:0]; // keep the low XLEN bits of the 64-bit DPI result
+            end
+        end
+
+        fflags_t fflags_merged;
+        `FPU_MERGE_FFLAGS(fflags_merged, fflags_fdiv, lane_mask, NUM_LANES);
+        // Shift register of depth LATENCY_FDIV carrying {valid, tag, result, flags}; it advances only while fdiv_ready.
+        VX_shift_register #(
+            .DATAW  (1 + TAGW + NUM_LANES * `XLEN + $bits(fflags_t)),
+            .DEPTH  (`LATENCY_FDIV),
+            .RESETW (1)
+        ) shift_reg (
+            .clk      (clk),
+            .reset    (reset),
+            .enable   (fdiv_ready),
+            .data_in  ({fdiv_valid, tag_in, result_fdiv_r, fflags_merged}),
+            .data_out ({div_valid_out, div_tag_out, div_result, div_fflags})
+        );
+
+        assign div_has_fflags = 1;
+        assign div_ready_in = fdiv_ready;
+
+    end
+    endgenerate
+
+    generate 
+    begin : fsqrt
+        // Square-root pipeline: the other half of the FPU_DIVSQRT group, selected when is_div is clear.
+        reg [NUM_LANES-1:0][`XLEN-1:0] result_fsqrt_r;
+        wire [NUM_LANES-1:0][63:0] result_fsqrt;
+        fflags_t [NUM_LANES-1:0] fflags_fsqrt;
+        // fsqrt_ready: div_sqrt_arb accepts the square-root output, or that output is not valid
+        wire fsqrt_valid = (valid_in && core_select == FPU_DIVSQRT) && ~is_div;
+        wire fsqrt_ready = sqrt_ready_out || ~sqrt_valid_out;                
+        wire fsqrt_fire  = fsqrt_valid && fsqrt_ready;
+        
+        always @(*) begin        
+            for (integer i = 0; i < NUM_LANES; ++i) begin
+                dpi_fsqrt (fsqrt_fire, int'(dst_fmt), operands[0][i], frm, result_fsqrt[i], fflags_fsqrt[i]);
+                result_fsqrt_r[i] = result_fsqrt[i][`XLEN-1:0]; // keep the low XLEN bits of the 64-bit DPI result
+            end
+        end
+
+        fflags_t fflags_merged;
+        `FPU_MERGE_FFLAGS(fflags_merged, fflags_fsqrt, lane_mask, NUM_LANES);
+        // Shift register of depth LATENCY_FSQRT carrying {valid, tag, result, flags}; it advances only while fsqrt_ready.
+        VX_shift_register #(
+            .DATAW  (1 + TAGW + NUM_LANES * `XLEN + $bits(fflags_t)),
+            .DEPTH  (`LATENCY_FSQRT),
+            .RESETW (1)
+        ) shift_reg (
+            .clk      (clk),
+            .reset    (reset),
+            .enable   (fsqrt_ready),
+            .data_in  ({fsqrt_valid, tag_in, result_fsqrt_r, fflags_merged}),
+            .data_out ({sqrt_valid_out, sqrt_tag_out, sqrt_result, sqrt_fflags})
+        );
+
+        assign sqrt_has_fflags = 1;
+        assign sqrt_ready_in = fsqrt_ready;
+
+    end
+    endgenerate
+
+    generate
+    begin : fcvt
+        // Conversion group: int/uint <-> float and float-to-float, all evaluated through dpi_* calls.
+        reg [NUM_LANES-1:0][`XLEN-1:0] result_fcvt;
+        wire [NUM_LANES-1:0][63:0] result_itof;
+        wire [NUM_LANES-1:0][63:0] result_utof;
+        wire [NUM_LANES-1:0][63:0] result_ftoi;
+        wire [NUM_LANES-1:0][63:0] result_ftou;
+        wire [NUM_LANES-1:0][63:0] result_f2f;
+        
+        fflags_t [NUM_LANES-1:0] fflags_fcvt;
+        fflags_t [NUM_LANES-1:0] fflags_itof;
+        fflags_t [NUM_LANES-1:0] fflags_utof;
+        fflags_t [NUM_LANES-1:0] fflags_ftoi;
+        fflags_t [NUM_LANES-1:0] fflags_ftou;
+        // fcvt_ready: the response arbiter accepts this group's output, or that output is not valid
+        wire fcvt_valid = (valid_in && core_select == FPU_CVT);
+        wire fcvt_ready = per_core_ready_out[FPU_CVT] || ~per_core_valid_out[FPU_CVT];
+        wire fcvt_fire  = fcvt_valid && fcvt_ready;
+        // Format arguments are ordered (dst_fmt, int_fmt) for itof/utof and (int_fmt, dst_fmt) for ftoi/ftou.
+        always @(*) begin        
+            for (integer i = 0; i < NUM_LANES; ++i) begin
+                dpi_itof (fcvt_fire, int'(dst_fmt), int'(int_fmt), operands[0][i], frm, result_itof[i], fflags_itof[i]);
+                dpi_utof (fcvt_fire, int'(dst_fmt), int'(int_fmt), operands[0][i], frm, result_utof[i], fflags_utof[i]);
+                dpi_ftoi (fcvt_fire, int'(int_fmt), int'(dst_fmt), operands[0][i], frm, result_ftoi[i], fflags_ftoi[i]);
+                dpi_ftou (fcvt_fire, int'(int_fmt), int'(dst_fmt), operands[0][i], frm, result_ftou[i], fflags_ftou[i]);
+                dpi_f2f  (fcvt_fire, int'(dst_fmt), operands[0][i], result_f2f[i]);                
+                // dpi_f2f takes no frm and returns no flags, so fflags_fcvt falls to '0 for f2f
+                result_fcvt[i] = is_itof ? result_itof[i][`XLEN-1:0] :
+                                is_utof ? result_utof[i][`XLEN-1:0] :
+                                is_ftoi ? result_ftoi[i][`XLEN-1:0] :
+                                is_ftou ? result_ftou[i][`XLEN-1:0] : 
+                                is_f2f  ? result_f2f[i][`XLEN-1:0] : 
+                                        '0;
+
+                fflags_fcvt[i] = is_itof ? fflags_itof[i] :
+                                is_utof ? fflags_utof[i] :
+                                is_ftoi ? fflags_ftoi[i] :
+                                is_ftou ? fflags_ftou[i] :
+                                       '0;
+            end
+        end
+
+        fflags_t fflags_merged;
+        `FPU_MERGE_FFLAGS(fflags_merged, fflags_fcvt, lane_mask, NUM_LANES);
+        // Shift register of depth LATENCY_FCVT carrying {valid, tag, result, flags}; it advances only while fcvt_ready.
+        VX_shift_register #(
+            .DATAW  (1 + TAGW + NUM_LANES * `XLEN + $bits(fflags_t)),
+            .DEPTH  (`LATENCY_FCVT),
+            .RESETW (1)
+        ) shift_reg (
+            .clk      (clk),
+            .reset    (reset),
+            .enable   (fcvt_ready),
+            .data_in  ({fcvt_valid, tag_in, result_fcvt, fflags_merged}),
+            .data_out ({per_core_valid_out[FPU_CVT], per_core_tag_out[FPU_CVT], per_core_result[FPU_CVT], per_core_fflags[FPU_CVT]})
+        );
+
+        assign per_core_has_fflags[FPU_CVT] = 1;
+        assign per_core_ready_in[FPU_CVT] = fcvt_ready;
+
+    end
+    endgenerate
+
+    generate 
+    begin : fncp
+        // NCP group: classify, compares, min/max, sign-injection and moves; frm selects the sub-operation.
+        reg [NUM_LANES-1:0][`XLEN-1:0]  result_fncp;
+        wire [NUM_LANES-1:0][63:0] result_fclss;
+        wire [NUM_LANES-1:0][63:0] result_flt;
+        wire [NUM_LANES-1:0][63:0] result_fle;
+        wire [NUM_LANES-1:0][63:0] result_feq;
+        wire [NUM_LANES-1:0][63:0] result_fmin;
+        wire [NUM_LANES-1:0][63:0] result_fmax;
+        wire [NUM_LANES-1:0][63:0] result_fsgnj;
+        wire [NUM_LANES-1:0][63:0] result_fsgnjn;
+        wire [NUM_LANES-1:0][63:0] result_fsgnjx;
+        reg [NUM_LANES-1:0][63:0] result_fmvx;
+        reg [NUM_LANES-1:0][63:0] result_fmvf;
+
+        fflags_t [NUM_LANES-1:0] fflags_fncp;
+        fflags_t [NUM_LANES-1:0] fflags_flt;
+        fflags_t [NUM_LANES-1:0] fflags_fle;
+        fflags_t [NUM_LANES-1:0] fflags_feq;
+        fflags_t [NUM_LANES-1:0] fflags_fmin;
+        fflags_t [NUM_LANES-1:0] fflags_fmax;
+        // fncp_ready: the response arbiter accepts this group's output, or that output is not valid
+        wire fncp_valid = (valid_in && core_select == FPU_NCP);
+        wire fncp_ready = per_core_ready_out[FPU_NCP] || ~per_core_valid_out[FPU_NCP];
+        wire fncp_fire  = fncp_valid && fncp_ready;
+        // Every dpi_* op is called for each lane; the two move results (fmvx, fmvf) are computed locally.
+        always @(*) begin        
+            for (integer i = 0; i < NUM_LANES; ++i) begin
+                dpi_fclss  (fncp_fire, int'(dst_fmt), operands[0][i], result_fclss[i]);
+                dpi_fle    (fncp_fire, int'(dst_fmt), operands[0][i], operands[1][i], result_fle[i], fflags_fle[i]);
+                dpi_flt    (fncp_fire, int'(dst_fmt), operands[0][i], operands[1][i], result_flt[i], fflags_flt[i]);                
+                dpi_feq    (fncp_fire, int'(dst_fmt), operands[0][i], operands[1][i], result_feq[i], fflags_feq[i]);
+                dpi_fmin   (fncp_fire, int'(dst_fmt), operands[0][i], operands[1][i], result_fmin[i], fflags_fmin[i]);
+                dpi_fmax   (fncp_fire, int'(dst_fmt), operands[0][i], operands[1][i], result_fmax[i], fflags_fmax[i]);            
+                dpi_fsgnj  (fncp_fire, int'(dst_fmt), operands[0][i], operands[1][i], result_fsgnj[i]);
+                dpi_fsgnjn (fncp_fire, int'(dst_fmt), operands[0][i], operands[1][i], result_fsgnjn[i]);
+                dpi_fsgnjx (fncp_fire, int'(dst_fmt), operands[0][i], operands[1][i], result_fsgnjx[i]);
+                result_fmvx[i] = dst_fmt ? operands[0][i] : 64'($signed(operands[0][i][31:0]));      // sign-extension
+                result_fmvf[i] = dst_fmt ? operands[0][i] : (operands[0][i] | 64'hffffffff00000000); // nan-boxing
+            end
+        end
+        // frm picks the lane result; for frm 0..2, is_fcmp chooses the compare (fle/flt/feq) over sign-injection (fsgnj/fsgnjn/fsgnjx).
+        always @(*) begin
+            result_fncp = 'x;
+            fflags_fncp = 'x; // frm 3..5 assign no flags, so fflags_fncp stays 'x for those sub-ops
+            for (integer i = 0; i < NUM_LANES; ++i) begin
+                case (frm)
+                0:  begin result_fncp[i] = is_fcmp ? result_fle[i][`XLEN-1:0] : result_fsgnj[i][`XLEN-1:0];  fflags_fncp[i] = fflags_fle[i]; end
+                1:  begin result_fncp[i] = is_fcmp ? result_flt[i][`XLEN-1:0] : result_fsgnjn[i][`XLEN-1:0]; fflags_fncp[i] = fflags_flt[i]; end
+                2:  begin result_fncp[i] = is_fcmp ? result_feq[i][`XLEN-1:0] : result_fsgnjx[i][`XLEN-1:0]; fflags_fncp[i] = fflags_feq[i]; end
+                3:  begin result_fncp[i] = result_fclss[i][`XLEN-1:0]; end
+                4:  begin result_fncp[i] = result_fmvx[i][`XLEN-1:0]; end
+                5:  begin result_fncp[i] = result_fmvf[i][`XLEN-1:0]; end
+                6:  begin result_fncp[i] = result_fmin[i][`XLEN-1:0]; fflags_fncp[i] = fflags_fmin[i]; end
+                7:  begin result_fncp[i] = result_fmax[i][`XLEN-1:0]; fflags_fncp[i] = fflags_fmax[i]; end
+                endcase
+            end
+        end
+
+        fflags_t fflags_merged;
+        `FPU_MERGE_FFLAGS(fflags_merged, fflags_fncp, lane_mask, NUM_LANES);
+        // has_fflags is asserted only for fmin/fmax (frm >= 6) and for compares
+        wire has_fflags_fncp = (frm >= 6) || is_fcmp;
+        // Shift register of depth LATENCY_FNCP; unlike the other groups it also carries the has_fflags bit.
+        VX_shift_register #(
+            .DATAW  (1 + TAGW + 1 + NUM_LANES * `XLEN + $bits(fflags_t)),
+            .DEPTH  (`LATENCY_FNCP),
+            .RESETW (1)
+        ) shift_reg (
+            .clk      (clk),
+            .reset    (reset),
+            .enable   (fncp_ready),
+            .data_in  ({fncp_valid, tag_in, has_fflags_fncp, result_fncp, fflags_merged}),
+            .data_out ({per_core_valid_out[FPU_NCP], per_core_tag_out[FPU_NCP], per_core_has_fflags[FPU_NCP], per_core_result[FPU_NCP], per_core_fflags[FPU_NCP]})
+        );
+        
+        assign per_core_ready_in[FPU_NCP] = fncp_ready;
+
+    end
+    endgenerate
+
+    ///////////////////////////////////////////////////////////////////////////
+    // The FPU_DIVSQRT request-ready comes from whichever pipeline the current op targets; div_sqrt_arb merges the two response streams.
+    assign per_core_ready_in[FPU_DIVSQRT] = is_div ? div_ready_in : sqrt_ready_in;
+    // NOTE(review): ARBITER "R" presumably selects round-robin arbitration - confirm in VX_stream_arb.
+    VX_stream_arb #(
+        .NUM_INPUTS (2),
+        .DATAW      (RSP_DATAW), 
+        .ARBITER    ("R"),
+        .OUT_REG    (0)
+    ) div_sqrt_arb (
+        .clk       (clk),
+        .reset     (reset),
+        .valid_in  ({sqrt_valid_out, div_valid_out}), 
+        .ready_in  ({sqrt_ready_out, div_ready_out}),
+        .data_in   ({{sqrt_result, sqrt_has_fflags, sqrt_fflags, sqrt_tag_out}, 
+                     {div_result, div_has_fflags, div_fflags, div_tag_out}}),
+        .data_out  ({per_core_result[FPU_DIVSQRT], per_core_has_fflags[FPU_DIVSQRT], per_core_fflags[FPU_DIVSQRT], per_core_tag_out[FPU_DIVSQRT]}),
+        .valid_out (per_core_valid_out[FPU_DIVSQRT]),
+        .ready_out (per_core_ready_out[FPU_DIVSQRT]),
+        `UNUSED_PIN (sel_out)
+    );
+
+    ///////////////////////////////////////////////////////////////////////////
+    // Pack each group's response as {result, has_fflags, fflags, tag}, the same field order unpacked at the rsp_arb output.
+    wire [NUM_FPC-1:0][RSP_DATAW-1:0] per_core_data_out;
+
+    for (genvar i = 0; i < NUM_FPC; ++i) begin
+        assign per_core_data_out[i] = {per_core_result[i], per_core_has_fflags[i], per_core_fflags[i], per_core_tag_out[i]};
+    end
+    // Response arbiter across the NUM_FPC groups; its ready_in port drives per_core_ready_out back to each group.
+    VX_stream_arb #(
+        .NUM_INPUTS (NUM_FPC),
+        .DATAW      (RSP_DATAW), 
+        .ARBITER    ("R"),
+        .OUT_REG    (OUT_REG)
+    ) rsp_arb (
+        .clk       (clk),
+        .reset     (reset),
+        .valid_in  (per_core_valid_out), 
+        .ready_in  (per_core_ready_out),
+        .data_in   (per_core_data_out),
+        .data_out  ({result, has_fflags, fflags, tag_out}),
+        .valid_out (valid_out),
+        .ready_out (ready_out),
+        `UNUSED_PIN (sel_out)
+    );
+    // a request is accepted when the group its op_type decodes to is ready
+    assign ready_in = per_core_ready_in[core_select];
+
+endmodule
+`endif
--- a/hw/rtl/fpu/VX_fpu_dsp.sv
+++ b/hw/rtl/fpu/VX_fpu_dsp.sv
@@ -0,0 +1,325 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+`include "VX_fpu_define.vh"
+
+`ifdef FPU_DSP
+// VX_fpu_dsp: FPU variant built from the VX_fpu_fma/div/sqrt/cvt/ncomp units, which receive the low 32 bits of each lane; responses are arbitrated onto one output.
+module VX_fpu_dsp import VX_fpu_pkg::*; #(
+    parameter NUM_LANES = 4, // number of data lanes per request
+    parameter TAGW      = 4, // width of tag_in / tag_out
+    parameter OUT_REG   = 0  // passed to the OUT_REG parameter of the response arbiter (rsp_arb)
+) (
+    input wire clk,
+    input wire reset,
+    // request handshake; ready_in reflects the unit selected by the current op_type
+    input wire  valid_in,
+    output wire ready_in,
+    // forwarded to every unit
+    input wire [NUM_LANES-1:0] lane_mask,
+    // forwarded to the selected unit and returned on tag_out
+    input wire [TAGW-1:0] tag_in,
+    // op_type selects the unit and operation; frm is forwarded to every unit; fmt is marked unused in this module
+    input wire [`INST_FPU_BITS-1:0] op_type,
+    input wire [`INST_FMT_BITS-1:0] fmt,
+    input wire [`INST_FRM_BITS-1:0] frm,
+
+    input wire [NUM_LANES-1:0][`XLEN-1:0]  dataa,
+    input wire [NUM_LANES-1:0][`XLEN-1:0]  datab,
+    input wire [NUM_LANES-1:0][`XLEN-1:0]  datac,
+    output wire [NUM_LANES-1:0][`XLEN-1:0] result, 
+
+    output wire has_fflags,
+    output wire [`FP_FLAGS_BITS-1:0] fflags,
+
+    output wire [TAGW-1:0] tag_out,
+
+    input wire  ready_out,
+    output wire valid_out
+);
+    localparam FPU_FMA     = 0; // FPU_* constants index the per_core_* arrays
+    localparam FPU_DIVSQRT = 1;
+    localparam FPU_CVT     = 2;
+    localparam FPU_NCP     = 3;
+    localparam NUM_FPC     = 4; // number of units seen by the response arbiter
+    localparam FPC_BITS    = `LOG2UP(NUM_FPC);
+
+    localparam RSP_DATAW = (NUM_LANES * 32) + 1 + $bits(fflags_t) + TAGW; // 32-bit lane results + has_fflags bit + fflags + tag
+
+    `UNUSED_VAR (fmt)    
+    // Per-unit response channels, indexed by the FPU_* constants above.
+    wire [NUM_FPC-1:0] per_core_ready_in;
+    wire [NUM_FPC-1:0][NUM_LANES-1:0][31:0] per_core_result;
+    wire [NUM_FPC-1:0][TAGW-1:0] per_core_tag_out;
+    wire [NUM_FPC-1:0] per_core_ready_out;
+    wire [NUM_FPC-1:0] per_core_valid_out;    
+    wire [NUM_FPC-1:0] per_core_has_fflags;  
+    fflags_t [NUM_FPC-1:0] per_core_fflags;
+    // Divide and square-root are separate units merged into the FPU_DIVSQRT channel by div_sqrt_arb.
+    wire div_ready_in, sqrt_ready_in;
+    wire [NUM_LANES-1:0][31:0] div_result, sqrt_result;
+    wire [TAGW-1:0] div_tag_out, sqrt_tag_out;
+    wire div_ready_out, sqrt_ready_out;
+    wire div_valid_out, sqrt_valid_out;    
+    wire div_has_fflags, sqrt_has_fflags;  
+    fflags_t div_fflags, sqrt_fflags;
+
+    reg [FPC_BITS-1:0] core_select;
+    reg is_madd, is_sub, is_neg, is_div, is_itof, is_signed;
+    // Decode op_type into the target unit (core_select) and the control flags wired to the units; all flags default to 0.
+    always @(*) begin
+        is_madd   = 0;
+        is_sub    = 0;        
+        is_neg    = 0;
+        is_div    = 0;
+        is_itof   = 0;
+        is_signed = 0;
+        case (op_type)
+            `INST_FPU_ADD:    begin core_select = FPU_FMA; end
+            `INST_FPU_SUB:    begin core_select = FPU_FMA; is_sub = 1; end
+            `INST_FPU_MUL:    begin core_select = FPU_FMA; is_neg = 1; end // NOTE(review): MUL is encoded as is_neg without is_madd - presumably how VX_fpu_fma tells MUL from ADD; confirm there
+            `INST_FPU_MADD:   begin core_select = FPU_FMA; is_madd = 1; end
+            `INST_FPU_MSUB:   begin core_select = FPU_FMA; is_madd = 1; is_sub = 1; end
+            `INST_FPU_NMADD:  begin core_select = FPU_FMA; is_madd = 1; is_neg = 1; end
+            `INST_FPU_NMSUB:  begin core_select = FPU_FMA; is_madd = 1; is_sub = 1; is_neg = 1; end
+            `INST_FPU_DIV:    begin core_select = FPU_DIVSQRT; is_div = 1; end
+            `INST_FPU_SQRT:   begin core_select = FPU_DIVSQRT; end
+            `INST_FPU_F2I:    begin core_select = FPU_CVT; is_signed = 1; end
+            `INST_FPU_F2U:    begin core_select = FPU_CVT; end
+            `INST_FPU_I2F:    begin core_select = FPU_CVT; is_itof = 1; is_signed = 1; end
+            `INST_FPU_U2F:    begin core_select = FPU_CVT; is_itof = 1; end
+            default:          begin core_select = FPU_NCP; end // every other op_type is routed to the ncomp unit
+        endcase
+    end
+    // NOTE(review): RESET_RELAY presumably declares a per-unit copy of reset (fma_reset, div_reset, ...) - confirm in the macro definition.
+    `RESET_RELAY (fma_reset, reset);
+    `RESET_RELAY (div_reset, reset);
+    `RESET_RELAY (sqrt_reset, reset);
+    `RESET_RELAY (cvt_reset, reset);
+    `RESET_RELAY (ncp_reset, reset);
+    // Only bits [31:0] of each XLEN-wide lane operand are passed to the units.
+    wire [NUM_LANES-1:0][31:0] dataa_s;
+    wire [NUM_LANES-1:0][31:0] datab_s;
+    wire [NUM_LANES-1:0][31:0] datac_s;
+
+    for (genvar i = 0; i < NUM_LANES; ++i) begin
+        assign dataa_s[i] = dataa[i][31:0];
+        assign datab_s[i] = datab[i][31:0];
+        assign datac_s[i] = datac[i][31:0];
+    end
+
+    `UNUSED_VAR (dataa)
+    `UNUSED_VAR (datab)
+    `UNUSED_VAR (datac)
+    // FMA unit: receives requests decoded to FPU_FMA together with the is_madd/is_sub/is_neg controls.
+    VX_fpu_fma #(
+        .NUM_LANES (NUM_LANES),
+        .TAGW      (TAGW)
+    ) fpu_fma (
+        .clk        (clk), 
+        .reset      (fma_reset), 
+        .valid_in   (valid_in && (core_select == FPU_FMA)),
+        .ready_in   (per_core_ready_in[FPU_FMA]),
+        .lane_mask  (lane_mask),
+        .tag_in     (tag_in), 
+        .frm        (frm),
+        .is_madd    (is_madd),
+        .is_sub     (is_sub),
+        .is_neg     (is_neg),
+        .dataa      (dataa_s), 
+        .datab      (datab_s), 
+        .datac      (datac_s), 
+        .has_fflags (per_core_has_fflags[FPU_FMA]),
+        .fflags     (per_core_fflags[FPU_FMA]),
+        .result     (per_core_result[FPU_FMA]),
+        .tag_out    (per_core_tag_out[FPU_FMA]),
+        .ready_out  (per_core_ready_out[FPU_FMA]),
+        .valid_out  (per_core_valid_out[FPU_FMA])
+    );
+    // Divide unit: fed when the op decodes to FPU_DIVSQRT with is_div set; its response goes to div_sqrt_arb.
+    VX_fpu_div #(
+        .NUM_LANES (NUM_LANES),
+        .TAGW      (TAGW)
+    ) fpu_div (
+        .clk        (clk), 
+        .reset      (div_reset), 
+        .valid_in   (valid_in && (core_select == FPU_DIVSQRT) && is_div),
+        .ready_in   (div_ready_in),
+        .lane_mask  (lane_mask),
+        .tag_in     (tag_in),
+        .frm        (frm), 
+        .dataa      (dataa_s), 
+        .datab      (datab_s), 
+        .has_fflags (div_has_fflags),
+        .fflags     (div_fflags), 
+        .result     (div_result),
+        .tag_out    (div_tag_out),
+        .valid_out  (div_valid_out),
+        .ready_out  (div_ready_out)
+    );
+    // Square-root unit: fed when the op decodes to FPU_DIVSQRT with is_div clear; only dataa is connected.
+    VX_fpu_sqrt #(
+        .NUM_LANES (NUM_LANES),
+        .TAGW      (TAGW)
+    ) fpu_sqrt (
+        .clk        (clk), 
+        .reset      (sqrt_reset), 
+        .valid_in   (valid_in && (core_select == FPU_DIVSQRT) && ~is_div),
+        .ready_in   (sqrt_ready_in),
+        .lane_mask  (lane_mask),
+        .tag_in     (tag_in),
+        .frm        (frm), 
+        .dataa      (dataa_s), 
+        .has_fflags (sqrt_has_fflags),
+        .fflags     (sqrt_fflags),
+        .result     (sqrt_result),
+        .tag_out    (sqrt_tag_out),
+        .valid_out  (sqrt_valid_out),
+        .ready_out  (sqrt_ready_out)
+    );
+    // Convert unit. cvt_rt_int_in (1 for F2I/F2U, 0 for I2F/U2F) is prepended to the tag (hence TAGW+1) and split off again at tag_out.
+    wire cvt_rt_int_in = ~is_itof;
+    wire cvt_rt_int_out;
+
+    VX_fpu_cvt #(
+        .NUM_LANES (NUM_LANES),
+        .TAGW      (TAGW+1)
+    ) fpu_cvt (
+        .clk        (clk), 
+        .reset      (cvt_reset), 
+        .valid_in   (valid_in && (core_select == FPU_CVT)),
+        .ready_in   (per_core_ready_in[FPU_CVT]),
+        .lane_mask  (lane_mask),
+        .tag_in     ({cvt_rt_int_in, tag_in}), 
+        .frm        (frm),
+        .is_itof    (is_itof), 
+        .is_signed  (is_signed), 
+        .dataa      (dataa_s), 
+        .has_fflags (per_core_has_fflags[FPU_CVT]),
+        .fflags     (per_core_fflags[FPU_CVT]),
+        .result     (per_core_result[FPU_CVT]),
+        .tag_out    ({cvt_rt_int_out, per_core_tag_out[FPU_CVT]}),
+        .valid_out  (per_core_valid_out[FPU_CVT]),
+        .ready_out  (per_core_ready_out[FPU_CVT])
+    );
+    // NCP unit. Two extra tag bits (hence TAGW+2) carry ncp_rt_sext_in and ncp_rt_int_in through the unit to the response side.
+    wire ncp_rt_int_in = (op_type == `INST_FPU_CMP)
+                      || `INST_FPU_IS_CLASS(op_type, frm) 
+                      || `INST_FPU_IS_MVXW(op_type, frm);
+    wire ncp_rt_int_out;
+    // set only for the op matched by INST_FPU_IS_MVXW; the response mux sign-extends when both bits are set
+    wire ncp_rt_sext_in = `INST_FPU_IS_MVXW(op_type, frm);
+    wire ncp_rt_sext_out;
+    
+    VX_fpu_ncomp #(
+        .NUM_LANES (NUM_LANES),
+        .TAGW      (TAGW+2)
+    ) fpu_ncomp (
+        .clk        (clk),
+        .reset      (ncp_reset), 
+        .valid_in   (valid_in && (core_select == FPU_NCP)),
+        .ready_in   (per_core_ready_in[FPU_NCP]),
+        .lane_mask  (lane_mask),
+        .tag_in     ({ncp_rt_sext_in, ncp_rt_int_in, tag_in}),
+        .op_type    (op_type),
+        .frm        (frm),
+        .dataa      (dataa_s),
+        .datab      (datab_s), 
+        .result     (per_core_result[FPU_NCP]), 
+        .has_fflags (per_core_has_fflags[FPU_NCP]),
+        .fflags     (per_core_fflags[FPU_NCP]),
+        .tag_out    ({ncp_rt_sext_out, ncp_rt_int_out, per_core_tag_out[FPU_NCP]}),
+        .valid_out  (per_core_valid_out[FPU_NCP]),
+        .ready_out  (per_core_ready_out[FPU_NCP])
+    );
+
+    ///////////////////////////////////////////////////////////////////////////
+    // The FPU_DIVSQRT request-ready comes from whichever unit the current op targets; div_sqrt_arb merges the two response streams.
+    assign per_core_ready_in[FPU_DIVSQRT] = is_div ? div_ready_in : sqrt_ready_in;
+    // NOTE(review): ARBITER "R" presumably selects round-robin arbitration - confirm in VX_stream_arb.
+    VX_stream_arb #(
+        .NUM_INPUTS (2),
+        .DATAW      (RSP_DATAW), 
+        .ARBITER    ("R"),
+        .OUT_REG    (0)
+    ) div_sqrt_arb (
+        .clk       (clk),
+        .reset     (reset),
+        .valid_in  ({sqrt_valid_out, div_valid_out}), 
+        .ready_in  ({sqrt_ready_out, div_ready_out}),
+        .data_in   ({{sqrt_result, sqrt_has_fflags, sqrt_fflags, sqrt_tag_out}, 
+                     {div_result, div_has_fflags, div_fflags, div_tag_out}}),
+        .data_out  ({per_core_result[FPU_DIVSQRT], per_core_has_fflags[FPU_DIVSQRT], per_core_fflags[FPU_DIVSQRT], per_core_tag_out[FPU_DIVSQRT]}),
+        .valid_out (per_core_valid_out[FPU_DIVSQRT]),
+        .ready_out (per_core_ready_out[FPU_DIVSQRT]),
+        `UNUSED_PIN (sel_out)
+    );
+
+    ///////////////////////////////////////////////////////////////////////////
+    // Append two extension-control bits [1:0] to each unit's response: '0 by default, overridden for FPU_CVT and FPU_NCP.
+    reg [NUM_FPC-1:0][RSP_DATAW+2-1:0] per_core_data_out;
+    
+    always @(*) begin
+        for (integer i = 0; i < NUM_FPC; ++i) begin
+            per_core_data_out[i][RSP_DATAW+1:2] = {per_core_result[i], per_core_has_fflags[i], per_core_fflags[i], per_core_tag_out[i]};
+            per_core_data_out[i][1:0] = '0;
+        end        
+        per_core_data_out[FPU_CVT][1:0] = {1'b1, cvt_rt_int_out};            // 2'b11 when the result is an integer, else 2'b10
+        per_core_data_out[FPU_NCP][1:0] = {ncp_rt_sext_out, ncp_rt_int_out}; // {sign-extend request, integer result}
+    end
+
+    wire [NUM_LANES-1:0][31:0] result_s;
+    wire [1:0] op_rt_int_out;
+    // Response arbiter across the NUM_FPC units; its ready_in port drives per_core_ready_out back to each unit.
+    VX_stream_arb #(
+        .NUM_INPUTS (NUM_FPC),
+        .DATAW      (RSP_DATAW + 2), 
+        .ARBITER    ("R"),
+        .OUT_REG    (OUT_REG)
+    ) rsp_arb (
+        .clk       (clk),
+        .reset     (reset),
+        .valid_in  (per_core_valid_out), 
+        .ready_in  (per_core_ready_out),
+        .data_in   (per_core_data_out),
+        .data_out  ({result_s, has_fflags, fflags, tag_out, op_rt_int_out}),
+        .valid_out (valid_out),
+        .ready_out (ready_out),
+        `UNUSED_PIN (sel_out)
+    );
+
+`ifndef FPU_RV64F
+    `UNUSED_VAR (op_rt_int_out)
+`endif
+    // With FPU_RV64F, widen each 32-bit lane result to XLEN according to op_rt_int_out; otherwise pass it through unchanged.
+    for (genvar i = 0; i < NUM_LANES; ++i) begin        
+    `ifdef FPU_RV64F
+        reg [`XLEN-1:0] result_r;
+        always @(*) begin
+            case (op_rt_int_out)
+            2'b11:   result_r = `XLEN'($signed(result_s[i]));  // sign-extend
+            2'b01:   result_r = {32'h00000000, result_s[i]};   // zero-extend
+            default: result_r = {32'hffffffff, result_s[i]};   // upper 32 bits set to all ones
+            endcase
+        end
+        assign result[i] = result_r;
+    `else
+        assign result[i] = result_s[i];
+    `endif
+    end
+
+    // can accept new request?
+    assign ready_in = per_core_ready_in[core_select];
+
+endmodule
+`endif 
--- a/hw/rtl/fpu/VX_fpu_fma.sv
+++ b/hw/rtl/fpu/VX_fpu_fma.sv
@@ -0,0 +1,170 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+`include "VX_fpu_define.vh"
+
+`ifdef FPU_DSP
+
+module VX_fpu_fma import VX_fpu_pkg::*; #(
+    parameter NUM_LANES = 1, 
+    parameter TAGW = 1
+) (
+    input wire clk,
+    input wire reset, 
+
+    output wire ready_in,
+    input wire  valid_in,
+
+    input wire [NUM_LANES-1:0] lane_mask,
+
+    input wire [TAGW-1:0] tag_in,
+    
+    input wire [`INST_FRM_BITS-1:0] frm,
+
+    input wire  is_madd,
+    input wire  is_sub,
+    input wire  is_neg,
+
+    input wire [NUM_LANES-1:0][31:0]  dataa,
+    input wire [NUM_LANES-1:0][31:0]  datab,
+    input wire [NUM_LANES-1:0][31:0]  datac,
+    output wire [NUM_LANES-1:0][31:0] result, 
+
+    output wire has_fflags,
+    output wire [`FP_FLAGS_BITS-1:0] fflags,
+
+    output wire [TAGW-1:0] tag_out,
+
+    input wire  ready_out,
+    output wire valid_out
+);
+    `UNUSED_VAR (frm)
+
+    // Back-pressure: the whole pipeline freezes while a valid result is waiting
+    // at the output and the consumer is not ready to take it.
+    wire stall = ~ready_out && valid_out;
+    wire enable = ~stall;
+
+    fflags_t [NUM_LANES-1:0] per_lane_fflags;
+    wire [NUM_LANES-1:0] lane_mask_out;
+
+    // Side-band pipeline carrying {valid, lane_mask, tag} with DEPTH = `LATENCY_FMA,
+    // advanced by the same enable as the arithmetic cores below.
+    // NOTE(review): RESETW (1) presumably resets only the valid bit - confirm
+    // against VX_shift_register.
+    VX_shift_register #(
+        .DATAW  (1 + NUM_LANES + TAGW),
+        .DEPTH  (`LATENCY_FMA),
+        .RESETW (1)
+    ) shift_reg (
+        .clk(clk),
+        .reset    (reset),
+        .enable   (enable),
+        .data_in  ({valid_in, lane_mask, tag_in}),
+        .data_out ({valid_out, lane_mask_out, tag_out})
+    );
+
+    // A new request is accepted whenever the pipeline is not stalled.
+    assign ready_in = enable;
+
+    // Operands handed to the fused multiply-add core, which evaluates a * b + c.
+    // Every supported operation is mapped onto that single form:
+    //   is_madd            : a = +/-dataa, b = datab, c = +/-datac
+    //   !is_madd &&  is_neg: MUL      -> dataa * datab + (signed zero)
+    //   !is_madd && !is_neg: ADD/SUB  -> 1.0 * dataa + (+/-datab)
+    // Negation is done by flipping the IEEE-754 sign bit (bit 31).
+    reg [NUM_LANES-1:0][31:0] a, b, c;
+
+    for (genvar i = 0; i < NUM_LANES; ++i) begin
+        always @(*) begin
+            if (is_madd) begin
+                // MADD / MSUB / NMADD / NMSUB
+                a[i] = is_neg ? {~dataa[i][31], dataa[i][30:0]} : dataa[i];
+                b[i] = datab[i];
+                c[i] = (is_neg ^ is_sub) ? {~datac[i][31], datac[i][30:0]} : datac[i];
+            end else begin
+                if (is_neg) begin
+                    // MUL
+                    a[i] = dataa[i];
+                    b[i] = datab[i];
+                    // The addend must be a zero with the same sign as the product.
+                    // With a fixed +0, a product of -0 (e.g. -1.0 * 0.0) would yield
+                    // (-0) + (+0) = +0 in every rounding mode except round-down,
+                    // i.e. the wrong sign for a plain multiply. A sign-matched zero
+                    // leaves non-zero, infinite and NaN products untouched.
+                    c[i] = {dataa[i][31] ^ datab[i][31], 31'b0};
+                end else begin
+                    // ADD / SUB
+                    a[i] = 32'h3f800000; // 1.0f
+                    b[i] = dataa[i];
+                    c[i] = is_sub ? {~datab[i][31], datab[i][30:0]} : datab[i];
+                end
+            end
+        end
+    end
+
+// Arithmetic backend, selected at compile time. Each variant consumes the
+// muxed operands a/b/c, is clock-enabled by `enable`, and must drive
+// result[], per_lane_fflags[] and has_fflags.
+`ifdef QUARTUS
+    
+    // Quartus build: one acl_fmadd instance per lane.
+    for (genvar i = 0; i < NUM_LANES; ++i) begin
+        acl_fmadd fmadd (
+            .clk    (clk),
+            .areset (1'b0),
+            .en     (enable),
+            .a      (a[i]),
+            .b      (b[i]),
+            .c      (c[i]),
+            .q      (result[i])
+        );
+    end
+    
+    // This path reports no exception flags; the flag value is a don't-care.
+    assign has_fflags = 0;
+    assign per_lane_fflags = 'x;
+
+`elsif VIVADO
+
+    // Vivado build: one xil_fma instance per lane; inputs are always marked valid
+    // and flow control is done solely through aclken.
+    for (genvar i = 0; i < NUM_LANES; ++i) begin
+        wire [2:0] tuser;
+        
+        xil_fma fma (
+            .aclk                (clk),
+            .aclken              (enable),
+            .s_axis_a_tvalid     (1'b1),
+            .s_axis_a_tdata      (a[i]),
+            .s_axis_b_tvalid     (1'b1),
+            .s_axis_b_tdata      (b[i]),
+            .s_axis_c_tvalid     (1'b1),
+            .s_axis_c_tdata      (c[i]),
+            `UNUSED_PIN (m_axis_result_tvalid),
+            .m_axis_result_tdata (result[i]),
+            .m_axis_result_tuser (tuser)
+        );
+        // tuser[2:0] is mapped to NV / OF / UF; DZ and NX are tied to zero here.
+                                    // NV, DZ, OF, UF, NX
+        assign per_lane_fflags[i] = {tuser[2], 1'b0, tuser[1], tuser[0], 1'b0};
+    end
+
+    assign has_fflags = 1;
+
+`else
+
+    // Fallback (neither QUARTUS nor VIVADO): the result is computed by the
+    // dpi_fmadd call and then delayed through a shift register of depth
+    // `LATENCY_FMA so that it lines up with the side-band pipeline.
+    for (genvar i = 0; i < NUM_LANES; ++i) begin
+        reg [63:0] r;
+        `UNUSED_VAR (r)
+
+        fflags_t f;
+
+        // Operands are widened to 64 bits with the upper word forced to all ones;
+        // only the low 32 bits of the returned value are used.
+        always @(*) begin        
+            dpi_fmadd (enable && valid_in, int'(0), {32'hffffffff, a[i]}, {32'hffffffff, b[i]}, {32'hffffffff, c[i]}, frm, r, f);
+        end
+
+        VX_shift_register #(
+            .DATAW  (32 + $bits(fflags_t)),
+            .DEPTH  (`LATENCY_FMA)
+        ) shift_req_dpi (
+            .clk      (clk),
+            `UNUSED_PIN (reset),
+            .enable   (enable),
+            .data_in  ({r[31:0], f}),
+            .data_out ({result[i], per_lane_fflags[i]})
+        );
+    end
+
+    assign has_fflags = 1;
+
+`endif
+
+`FPU_MERGE_FFLAGS(fflags, per_lane_fflags, lane_mask_out, NUM_LANES);
+
+endmodule
+`endif
--- a/hw/rtl/fpu/VX_fpu_fpnew.sv
+++ b/hw/rtl/fpu/VX_fpu_fpnew.sv
@@ -0,0 +1,286 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+`include "VX_fpu_define.vh"
+
+`ifdef FPU_FPNEW
+
+module VX_fpu_fpnew 
+    import VX_fpu_pkg::*; 
+    import fpnew_pkg::*; 
+    import cf_math_pkg::*; 
+    import defs_div_sqrt_mvp::*;
+#(      
+    parameter NUM_LANES = 1,
+    parameter TAGW      = 1,
+    parameter OUT_REG   = 0
+) (
+    input wire clk,
+    input wire reset,
+
+    input wire  valid_in,
+    output wire ready_in,
+
+    input wire [NUM_LANES-1:0] lane_mask,
+
+    input wire [TAGW-1:0] tag_in,
+    
+    input wire [`INST_FPU_BITS-1:0] op_type,
+    input wire [`INST_FMT_BITS-1:0] fmt,
+    input wire [`INST_FRM_BITS-1:0] frm,
+
+    input wire [NUM_LANES-1:0][`XLEN-1:0]  dataa,
+    input wire [NUM_LANES-1:0][`XLEN-1:0]  datab,
+    input wire [NUM_LANES-1:0][`XLEN-1:0]  datac,
+    output wire [NUM_LANES-1:0][`XLEN-1:0] result, 
+
+    output wire has_fflags,
+    output wire [`FP_FLAGS_BITS-1:0] fflags,
+
+    output wire [TAGW-1:0] tag_out,
+
+    input wire  ready_out,
+    output wire valid_out
+);  
+    localparam LATENCY_FDIVSQRT = `MAX(`LATENCY_FDIV, `LATENCY_FSQRT);
+    localparam RSP_DATAW = (NUM_LANES * `XLEN) + 1 + $bits(fflags_t) + TAGW;
+
+`ifdef XLEN_64
+    // use scalar configuration for mixed formats
+    localparam fpnew_pkg::fpu_features_t FPU_FEATURES = '{
+        Width:         unsigned'(`XLEN),
+        EnableVectors: 1'b0,
+        EnableNanBox:  1'b1,
+    `ifdef FLEN_64
+        FpFmtMask:     5'b11000,
+    `else
+        FpFmtMask:     5'b11000, // TODO: added FP64 to fix CVT bug in FpNew
+    `endif
+        IntFmtMask:    4'b0011
+    };
+`else
+    localparam fpnew_pkg::fpu_features_t FPU_FEATURES = '{
+        Width:         unsigned'(`XLEN * NUM_LANES),
+        EnableVectors: 1'b1,
+        EnableNanBox:  1'b0,
+        FpFmtMask:     5'b10000,
+        IntFmtMask:    4'b0010
+    };
+`endif
+
+    localparam fpnew_pkg::fpu_implementation_t FPU_IMPLEMENTATION = '{
+      PipeRegs:'{'{`LATENCY_FMA, 0, 0, 0, 0}, // ADDMUL
+                 '{default: unsigned'(LATENCY_FDIVSQRT)}, // DIVSQRT
+                 '{default: `LATENCY_FNCP}, // NONCOMP
+                 '{default: `LATENCY_FCVT}}, // CONV
+      UnitTypes:'{'{default: fpnew_pkg::PARALLEL}, // ADDMUL
+                  '{default: fpnew_pkg::MERGED}, // DIVSQRT
+                  '{default: fpnew_pkg::PARALLEL}, // NONCOMP
+                  '{default: fpnew_pkg::MERGED}}, // CONV
+      PipeConfig: fpnew_pkg::DISTRIBUTED
+    };
+    
+    wire fpu_ready_in, fpu_valid_in;    
+    wire fpu_ready_out, fpu_valid_out;
+
+    reg [TAGW-1:0] fpu_tag_in, fpu_tag_out;
+    
+    reg [2:0][NUM_LANES-1:0][`XLEN-1:0] fpu_operands;
+
+    wire [NUM_LANES-1:0][`XLEN-1:0] fpu_result;
+    fpnew_pkg::status_t fpu_status;
+
+    fpnew_pkg::operation_e fpu_op;
+    reg [`INST_FRM_BITS-1:0] fpu_rnd;
+    reg fpu_op_mod;
+    reg fpu_has_fflags, fpu_has_fflags_out;
+    fpnew_pkg::fp_format_e fpu_src_fmt, fpu_dst_fmt;
+    fpnew_pkg::int_format_e fpu_int_fmt;
+
+    `UNUSED_VAR (fmt)
+
+    // Decode op_type / fmt / frm into the control signals and operand ordering
+    // presented to the FPnew core(s). Defaults are set first and individual
+    // opcodes override only what differs.
+    always @(*) begin
+        fpu_op          = 'x;
+        fpu_rnd         = frm;  
+        fpu_op_mod      = 0;        
+        fpu_has_fflags  = 1;
+        fpu_operands[0] = dataa;
+        fpu_operands[1] = datab;
+        fpu_operands[2] = datac;    
+        fpu_dst_fmt     = fpnew_pkg::FP32;
+        fpu_int_fmt     = fpnew_pkg::INT32;
+
+    // fmt[0] selects a 64-bit floating-point format (only with FLEN_64)
+    `ifdef FLEN_64
+        if (fmt[0]) begin
+            fpu_dst_fmt = fpnew_pkg::FP64;
+        end
+    `endif
+
+    // fmt[1] selects a 64-bit integer format (only with XLEN_64)
+    `ifdef XLEN_64
+        if (fmt[1]) begin
+            fpu_int_fmt = fpnew_pkg::INT64;
+        end
+    `endif
+
+        // source format follows the destination unless an opcode overrides it (F2F)
+        fpu_src_fmt = fpu_dst_fmt;
+        
+        case (op_type)
+            // ADD/SUB place their two inputs in operand slots 1 and 2
+            // (presumably the slots FPnew's ADD reads - confirm against fpnew docs);
+            // SUB is ADD with op_mod set.
+            `INST_FPU_ADD: begin
+                fpu_op = fpnew_pkg::ADD;
+                fpu_operands[1] = dataa;
+                fpu_operands[2] = datab;
+            end
+            `INST_FPU_SUB: begin 
+                fpu_op = fpnew_pkg::ADD; 
+                fpu_operands[1] = dataa;
+                fpu_operands[2] = datab;
+                fpu_op_mod = 1; 
+            end
+            `INST_FPU_MUL:   begin fpu_op = fpnew_pkg::MUL; end
+            `INST_FPU_DIV:   begin fpu_op = fpnew_pkg::DIV; end
+            `INST_FPU_SQRT:  begin fpu_op = fpnew_pkg::SQRT; end
+            // the four fused ops map onto FMADD / FNMSUB, with op_mod picking the variant
+            `INST_FPU_MADD:  begin fpu_op = fpnew_pkg::FMADD; end
+            `INST_FPU_MSUB:  begin fpu_op = fpnew_pkg::FMADD; fpu_op_mod = 1; end            
+            `INST_FPU_NMADD: begin fpu_op = fpnew_pkg::FNMSUB; fpu_op_mod = 1; end
+            `INST_FPU_NMSUB: begin fpu_op = fpnew_pkg::FNMSUB; end
+        `ifdef FLEN_64
+            // float-to-float conversion: the source is the format NOT selected by fmt[0]
+            `INST_FPU_F2F: begin fpu_op = fpnew_pkg::F2F; fpu_src_fmt = fmt[0] ? fpnew_pkg::FP32 : fpnew_pkg::FP64; end
+        `endif
+            // for the integer conversions op_type[0] is forwarded as op_mod
+            // (presumably the unsigned selector - confirm against the opcode encoding)
+            `INST_FPU_F2I,
+            `INST_FPU_F2U: begin fpu_op = fpnew_pkg::F2I; fpu_op_mod = op_type[0]; end
+            `INST_FPU_I2F,
+            `INST_FPU_U2F: begin fpu_op = fpnew_pkg::I2F; fpu_op_mod = op_type[0]; end
+            `INST_FPU_CMP: begin fpu_op = fpnew_pkg::CMP; end
+            // MISC: frm is not a rounding mode here, it selects the sub-operation
+            `INST_FPU_MISC:begin
+                case (frm)
+                    0,1,2: begin fpu_op = fpnew_pkg::SGNJ; fpu_rnd = {1'b0, frm[1:0]}; fpu_has_fflags = 0; end // FSGNJ
+                    3:     begin fpu_op = fpnew_pkg::CLASSIFY; fpu_has_fflags = 0; end // CLASS                     
+                    4,5:   begin fpu_op = fpnew_pkg::SGNJ; fpu_rnd = 3'b011; fpu_op_mod = ~frm[0]; fpu_has_fflags = 0; end // FMV.X.W, FMV.W.X
+                    6,7:   begin fpu_op = fpnew_pkg::MINMAX; fpu_rnd = {2'b00, frm[0]}; end // MIN, MAX
+                endcase    
+            end
+            // any other opcode leaves fpu_op at 'x
+            default:;
+        endcase
+
+    `ifdef FPU_RV64F
+        // apply nan-boxing to floating-point operands
+        // (upper 32 bits forced to ones; operand 0 is left as-is for I2F/U2F)
+        for (integer i = 0; i < NUM_LANES; ++i) begin                    
+            if (op_type != `INST_FPU_I2F && op_type != `INST_FPU_U2F) begin
+                fpu_operands[0][i] |= 64'hffffffff00000000;
+            end
+            fpu_operands[1][i] |= 64'hffffffff00000000;
+            fpu_operands[2][i] |= 64'hffffffff00000000;        
+        end
+    `endif
+    end
+
+`ifdef XLEN_64
+    `UNUSED_VAR (lane_mask)
+    // XLEN_64 configuration: one scalar fpnew_top instance per lane. All lanes
+    // receive the same control signals, tag and valid, so they advance in
+    // lock-step; each lane only differs in its operands and result.
+    for (genvar i = 0; i < NUM_LANES; ++i) begin
+        wire [(TAGW+1)-1:0] fpu_tag;        
+        wire fpu_valid_out_uq;
+        wire fpu_ready_in_uq;
+        fpnew_pkg::status_t fpu_status_uq;
+        `UNUSED_VAR (fpu_tag)
+        `UNUSED_VAR (fpu_valid_out_uq)
+        `UNUSED_VAR (fpu_ready_in_uq)
+        `UNUSED_VAR (fpu_status_uq)
+
+        fpnew_top #( 
+            .Features       (FPU_FEATURES),
+            .Implementation (FPU_IMPLEMENTATION),
+            .TagType        (logic[(TAGW+1)-1:0])
+        ) fpnew_core (
+            .clk_i          (clk),
+            .rst_ni         (~reset),
+            .operands_i     ({fpu_operands[2][i], fpu_operands[1][i], fpu_operands[0][i]}),
+            .rnd_mode_i     (fpnew_pkg::roundmode_e'(fpu_rnd)),
+            .op_i           (fpu_op),
+            .op_mod_i       (fpu_op_mod),
+            .src_fmt_i      (fpu_src_fmt),
+            .dst_fmt_i      (fpu_dst_fmt),
+            .int_fmt_i      (fpu_int_fmt),
+            `UNUSED_PIN (vectorial_op_i),
+            `UNUSED_PIN (simd_mask_i),
+            .tag_i          ({fpu_tag_in, fpu_has_fflags}),            
+            .in_valid_i     (fpu_valid_in),
+            .in_ready_o     (fpu_ready_in_uq),
+            .flush_i        (reset),
+            .result_o       (fpu_result[i]),
+            .status_o       (fpu_status_uq),
+            .tag_o          (fpu_tag),
+            .out_valid_o    (fpu_valid_out_uq),
+            .out_ready_i    (fpu_ready_out),
+            `UNUSED_PIN (busy_o)
+        );
+        
+        // Lane 0 is the only lane whose handshake, tag and status are observed.
+        // NOTE(review): the status of lanes 1..NUM_LANES-1 is discarded and
+        // lane_mask is not applied on this path, so the reported fflags reflect
+        // lane 0 only (even if lane 0 is masked off) - confirm this is intended.
+        if (i == 0) begin
+            assign {fpu_tag_out, fpu_has_fflags_out} = fpu_tag;            
+            assign fpu_valid_out = fpu_valid_out_uq;
+            assign fpu_ready_in = fpu_ready_in_uq;
+            assign fpu_status = fpu_status_uq;
+        end
+    end
+`else
+    fpnew_top #( 
+        .Features       (FPU_FEATURES),
+        .Implementation (FPU_IMPLEMENTATION),
+        .TagType        (logic[(TAGW+1)-1:0]),
+        .TrueSIMDClass  (1),
+        .EnableSIMDMask (1)
+    ) fpnew_core (
+        .clk_i          (clk),
+        .rst_ni         (~reset),
+        .operands_i     (fpu_operands),
+        .rnd_mode_i     (fpnew_pkg::roundmode_e'(fpu_rnd)),
+        .op_i           (fpu_op),
+        .op_mod_i       (fpu_op_mod),
+        .src_fmt_i      (fpu_src_fmt),
+        .dst_fmt_i      (fpu_dst_fmt),
+        .int_fmt_i      (fpu_int_fmt),
+        .vectorial_op_i (1'b1),
+        .simd_mask_i    (lane_mask),
+        .tag_i          ({fpu_tag_in, fpu_has_fflags}),        
+        .in_valid_i     (fpu_valid_in),
+        .in_ready_o     (fpu_ready_in),
+        .flush_i        (reset),
+        .result_o       (fpu_result),
+        .status_o       (fpu_status),
+        .tag_o          ({fpu_tag_out, fpu_has_fflags_out}),
+        .out_valid_o    (fpu_valid_out),
+        .out_ready_i    (fpu_ready_out),
+        `UNUSED_PIN (busy_o)
+    );
+`endif
+
+    assign fpu_valid_in = valid_in;
+    assign ready_in = fpu_ready_in;
+    assign fpu_tag_in = tag_in;
+
+    VX_elastic_buffer #(
+        .DATAW   (RSP_DATAW),
+        .SIZE    (`OUT_REG_TO_EB_SIZE(OUT_REG)),
+        .OUT_REG (`OUT_REG_TO_EB_REG(OUT_REG))
+    ) rsp_buf (
+        .clk       (clk),
+        .reset     (reset),
+        .valid_in  (fpu_valid_out),
+        .ready_in  (fpu_ready_out),
+        .data_in   ({fpu_result, fpu_has_fflags_out, fpu_status, fpu_tag_out}),
+        .data_out  ({result, has_fflags, fflags, tag_out}),
+        .valid_out (valid_out),
+        .ready_out (ready_out)
+    );
+
+endmodule
+`endif
--- a/hw/rtl/fpu/VX_fpu_ncomp.sv
+++ b/hw/rtl/fpu/VX_fpu_ncomp.sv
@@ -0,0 +1,292 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+`include "VX_fpu_define.vh"
+
+`ifdef FPU_DSP
+
+/// Modified port of the noncomp module from the FPnew library
+/// reference: https://github.com/pulp-platform/fpnew
+
+module VX_fpu_ncomp import VX_fpu_pkg::*; #(
+    parameter NUM_LANES = 1,
+    parameter TAGW = 1
+) (
+    input wire clk,
+    input wire reset,
+
+    output wire ready_in,
+    input wire  valid_in,
+
+    input wire [NUM_LANES-1:0] lane_mask,
+
+    input wire [TAGW-1:0] tag_in,
+
+    input wire [`INST_FPU_BITS-1:0] op_type,
+    input wire [`INST_FRM_BITS-1:0] frm,
+
+    input wire [NUM_LANES-1:0][31:0]  dataa,
+    input wire [NUM_LANES-1:0][31:0]  datab,
+    output wire [NUM_LANES-1:0][31:0] result, 
+
+    output wire has_fflags,
+    output wire [`FP_FLAGS_BITS-1:0] fflags,
+
+    output wire [TAGW-1:0] tag_out,
+
+    input wire  ready_out,
+    output wire valid_out
+);  
+    localparam  EXP_BITS = 8;
+    localparam  MAN_BITS = 23;
+        
+    localparam  NEG_INF     = 32'h00000001,
+                NEG_NORM    = 32'h00000002,
+                NEG_SUBNORM = 32'h00000004,
+                NEG_ZERO    = 32'h00000008,
+                POS_ZERO    = 32'h00000010,
+                POS_SUBNORM = 32'h00000020,
+                POS_NORM    = 32'h00000040,
+                POS_INF     = 32'h00000080,
+                //SIG_NAN   = 32'h00000100,
+                QUT_NAN     = 32'h00000200;
+
+    wire [NUM_LANES-1:0]        a_sign, b_sign;
+    wire [NUM_LANES-1:0][7:0]   a_exponent, b_exponent;
+    wire [NUM_LANES-1:0][22:0]  a_mantissa, b_mantissa;
+    fclass_t [NUM_LANES-1:0]    a_fclass, b_fclass;
+    wire [NUM_LANES-1:0]        a_smaller, ab_equal;
+
+    // Setup
+    // Per-lane operand unpacking, classification and raw comparisons.
+    // Everything computed here is combinational and is registered by pipe_reg0.
+    for (genvar i = 0; i < NUM_LANES; ++i) begin
+        // split the 32-bit operands into sign (bit 31), exponent (30:23), mantissa (22:0)
+        assign     a_sign[i] = dataa[i][31]; 
+        assign a_exponent[i] = dataa[i][30:23];
+        assign a_mantissa[i] = dataa[i][22:0];
+
+        assign     b_sign[i] = datab[i][31]; 
+        assign b_exponent[i] = datab[i][30:23];
+        assign b_mantissa[i] = datab[i][22:0];
+
+        VX_fpu_class #( 
+            .EXP_BITS (EXP_BITS),
+            .MAN_BITS (MAN_BITS)
+        ) fp_class_a (
+            .exp_i  (a_exponent[i]),
+            .man_i  (a_mantissa[i]),
+            .clss_o (a_fclass[i])
+        );
+
+        VX_fpu_class #( 
+            .EXP_BITS (EXP_BITS),
+            .MAN_BITS (MAN_BITS)
+        ) fp_class_b (
+            .exp_i  (b_exponent[i]),
+            .man_i  (b_mantissa[i]),
+            .clss_o (b_fclass[i])
+        );
+
+        // "a < b" from an unsigned compare of the raw bit patterns, inverted when
+        // either operand has its sign bit set. NaN operands are not filtered here;
+        // the consumers of a_smaller check the class flags first.
+        assign a_smaller[i] = (dataa[i] < datab[i]) ^ (a_sign[i] || b_sign[i]);
+        // bitwise-equal, or both zero regardless of sign
+        assign ab_equal[i]  = (dataa[i] == datab[i]) 
+                           || (a_fclass[i].is_zero && b_fclass[i].is_zero); // +0 == -0
+    end  
+
+    // Pipeline stage0
+
+    wire                        valid_in_s0;
+    wire [NUM_LANES-1:0]        lane_mask_s0;
+    wire [TAGW-1:0]             tag_in_s0;
+    wire [3:0]                  op_mod_s0;
+    wire [NUM_LANES-1:0][31:0]  dataa_s0, datab_s0;
+    wire [NUM_LANES-1:0]        a_sign_s0, b_sign_s0;
+    wire [NUM_LANES-1:0][7:0]   a_exponent_s0;
+    wire [NUM_LANES-1:0][22:0]  a_mantissa_s0;
+    fclass_t [NUM_LANES-1:0]    a_fclass_s0, b_fclass_s0;
+    wire [NUM_LANES-1:0]        a_smaller_s0, ab_equal_s0;
+
+    wire stall;
+
+    wire [3:0] op_mod = {(op_type == `INST_FPU_CMP), frm};
+
+    VX_pipe_register #(
+        .DATAW  (1 + NUM_LANES + TAGW + 4 + NUM_LANES * (2 * 32 + 1 + 1 + 8 + 23 + 2 * $bits(fclass_t) + 1 + 1)),
+        .RESETW (1)
+    ) pipe_reg0 (
+        .clk      (clk),
+        .reset    (reset),
+        .enable   (!stall),
+        .data_in  ({valid_in, lane_mask, tag_in, op_mod, dataa, datab, a_sign, b_sign, a_exponent, a_mantissa, a_fclass, b_fclass, a_smaller, ab_equal}),
+        .data_out ({valid_in_s0, lane_mask_s0, tag_in_s0, op_mod_s0, dataa_s0, datab_s0, a_sign_s0, b_sign_s0, a_exponent_s0, a_mantissa_s0, a_fclass_s0, b_fclass_s0, a_smaller_s0, ab_equal_s0})
+    ); 
+
+    // FCLASS: one-hot class mask (bits 0..9, see the *_INF/*_NORM/... localparams),
+    // zero-extended to 32 bits for the integer register file
+    reg [NUM_LANES-1:0][31:0] fclass_mask_s0;
+    for (genvar i = 0; i < NUM_LANES; ++i) begin
+        wire        a_neg    = a_sign_s0[i];
+        // NaN: bit 9 = quiet, bit 8 = signaling
+        wire [31:0] nan_mask = {22'h0, a_fclass_s0[i].is_quiet, a_fclass_s0[i].is_signaling, 8'h0};
+
+        always @(*) begin
+            // fallback when no class flag is asserted
+            fclass_mask_s0[i] = QUT_NAN;
+            if (a_fclass_s0[i].is_normal) begin
+                fclass_mask_s0[i] = a_neg ? NEG_NORM : POS_NORM;
+            end else if (a_fclass_s0[i].is_inf) begin
+                fclass_mask_s0[i] = a_neg ? NEG_INF : POS_INF;
+            end else if (a_fclass_s0[i].is_zero) begin
+                fclass_mask_s0[i] = a_neg ? NEG_ZERO : POS_ZERO;
+            end else if (a_fclass_s0[i].is_subnormal) begin
+                fclass_mask_s0[i] = a_neg ? NEG_SUBNORM : POS_SUBNORM;
+            end else if (a_fclass_s0[i].is_nan) begin
+                fclass_mask_s0[i] = nan_mask;
+            end
+        end
+    end
+
+    // FMIN / FMAX: a NaN operand loses to a number; two NaNs give the canonical qNaN
+    reg [NUM_LANES-1:0][31:0] fminmax_res_s0;
+    for (genvar i = 0; i < NUM_LANES; ++i) begin
+        wire a_nan  = a_fclass_s0[i].is_nan;
+        wire b_nan  = b_fclass_s0[i].is_nan;
+        // op_mod_s0[0] = 0 keeps the smaller operand, = 1 keeps the larger one
+        wire keep_a = op_mod_s0[0] ^ a_smaller_s0[i];
+
+        always @(*) begin
+            if (a_nan && b_nan) begin
+                fminmax_res_s0[i] = {1'b0, 8'hff, 1'b1, 22'd0}; // canonical qNaN
+            end else if (a_nan) begin
+                fminmax_res_s0[i] = datab_s0[i];
+            end else if (b_nan) begin
+                fminmax_res_s0[i] = dataa_s0[i];
+            end else begin
+                fminmax_res_s0[i] = keep_a ? dataa_s0[i] : datab_s0[i];
+            end
+        end
+    end
+
+    // Sign injection: magnitude always comes from operand A, only the sign bit varies
+    reg [NUM_LANES-1:0][31:0] fsgnj_res_s0;
+    for (genvar i = 0; i < NUM_LANES; ++i) begin
+        reg injected_sign;
+        always @(*) begin
+            case (op_mod_s0[1:0])
+                2'd0:    injected_sign = b_sign_s0[i];                // sign of B
+                2'd1:    injected_sign = ~b_sign_s0[i];               // inverted sign of B
+                default: injected_sign = a_sign_s0[i] ^ b_sign_s0[i]; // XOR of both signs
+            endcase
+            fsgnj_res_s0[i] = {injected_sign, a_exponent_s0[i], a_mantissa_s0[i]};
+        end
+    end
+
+    // Comparison: op_mod_s0[1:0] selects LE (0), LT (1) or EQ (2).
+    // Any NaN operand forces a result of 0; LE/LT then always raise NV,
+    // EQ raises NV only when one of the operands is a signaling NaN.
+    reg [NUM_LANES-1:0] fcmp_res_s0;        // comparison outcome
+    reg [NUM_LANES-1:0] fcmp_fflags_NV_s0;  // invalid-operation flag
+    for (genvar i = 0; i < NUM_LANES; ++i) begin
+        wire any_nan  = a_fclass_s0[i].is_nan || b_fclass_s0[i].is_nan;
+        wire any_snan = a_fclass_s0[i].is_signaling | b_fclass_s0[i].is_signaling;
+
+        always @(*) begin
+            case (op_mod_s0[1:0])
+                2'd0: begin // LE
+                    if (any_nan) begin
+                        fcmp_fflags_NV_s0[i] = 1;
+                        fcmp_res_s0[i]       = 0;
+                    end else begin
+                        fcmp_fflags_NV_s0[i] = 0;
+                        fcmp_res_s0[i]       = (a_smaller_s0[i] | ab_equal_s0[i]);
+                    end
+                end
+                2'd1: begin // LT
+                    if (any_nan) begin
+                        fcmp_fflags_NV_s0[i] = 1;
+                        fcmp_res_s0[i]       = 0;
+                    end else begin
+                        fcmp_fflags_NV_s0[i] = 0;
+                        fcmp_res_s0[i]       = (a_smaller_s0[i] & ~ab_equal_s0[i]);
+                    end
+                end
+                2'd2: begin // EQ
+                    if (any_nan) begin
+                        fcmp_fflags_NV_s0[i] = any_snan;
+                        fcmp_res_s0[i]       = 0;
+                    end else begin
+                        fcmp_fflags_NV_s0[i] = 0;
+                        fcmp_res_s0[i]       = ab_equal_s0[i];
+                    end
+                end
+                default: begin
+                    fcmp_fflags_NV_s0[i] = 'x;
+                    fcmp_res_s0[i]       = 'x;
+                end
+            endcase
+        end
+    end
+
+    // Output selection: op_mod_s0[2:0] picks the operation group,
+    // op_mod_s0[3] distinguishes CMP from SGNJ within group 0..2.
+
+    reg [NUM_LANES-1:0][31:0] result_s0;
+    reg [NUM_LANES-1:0] fflags_NV_s0;
+
+    for (genvar i = 0; i < NUM_LANES; ++i) begin
+        // result value
+        always @(*) begin
+            case (op_mod_s0[2:0])
+                3'd0, 3'd1, 3'd2: begin // SGNJ, CMP
+                    if (op_mod_s0[3]) begin
+                        result_s0[i] = 32'(fcmp_res_s0[i]);
+                    end else begin
+                        result_s0[i] = fsgnj_res_s0[i];
+                    end
+                end
+                3'd3:       result_s0[i] = fclass_mask_s0[i]; // CLASS
+                3'd4, 3'd5: result_s0[i] = dataa_s0[i];       // FMV
+                3'd6, 3'd7: result_s0[i] = fminmax_res_s0[i]; // MIN/MAX
+            endcase
+        end
+
+        // invalid-operation flag (don't-care for CLASS and FMV)
+        always @(*) begin
+            case (op_mod_s0[2:0])
+                3'd0, 3'd1, 3'd2: fflags_NV_s0[i] = fcmp_fflags_NV_s0[i];
+                3'd3:             fflags_NV_s0[i] = 'x;
+                3'd4, 3'd5:       fflags_NV_s0[i] = 'x;
+                3'd6, 3'd7:       fflags_NV_s0[i] = a_fclass_s0[i].is_signaling | b_fclass_s0[i].is_signaling;
+            endcase
+        end
+    end
+
+    // only MIN/MAX and CMP return status flags
+    wire has_fflags_s0 = (op_mod_s0[2:0] >= 6) || op_mod_s0[3];
+
+    assign stall = ~ready_out && valid_out;
+
+    wire fflags_NV;
+    reg fflags_merged;
+
+    // OR together the NV flag of every lane enabled in the lane mask
+    always @(*) begin
+        fflags_merged = 1'b0;
+        for (integer lane = 0; lane < NUM_LANES; ++lane) begin
+            if (lane_mask_s0[lane]) begin
+                fflags_merged = fflags_merged | fflags_NV_s0[lane];
+            end
+        end
+    end
+
+    VX_pipe_register #(
+        .DATAW  (1 + TAGW + (NUM_LANES * 32) + 1 + 1),
+        .RESETW (1)
+    ) pipe_reg1 (
+        .clk      (clk),
+        .reset    (reset),
+        .enable   (!stall),
+        .data_in  ({valid_in_s0, tag_in_s0, result_s0, has_fflags_s0, fflags_merged}),
+        .data_out ({valid_out, tag_out, result, has_fflags, fflags_NV})
+    );
+
+    assign ready_in = ~stall;
+
+                  // NV, DZ, OF, UF, NX
+    assign fflags = {fflags_NV, 1'b0, 1'b0, 1'b0, 1'b0};
+
+endmodule
+`endif
--- a/hw/rtl/fpu/VX_fpu_pkg.sv
+++ b/hw/rtl/fpu/VX_fpu_pkg.sv
@@ -0,0 +1,41 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+`ifndef VX_FPU_PKG_VH
+`define VX_FPU_PKG_VH
+
+`include "VX_define.vh"
+
+// Shared FPU types.
+package VX_fpu_pkg;
+
+// Classification of a floating-point operand, as produced by VX_fpu_class
+// from its exponent and mantissa fields.
+typedef struct packed {
+    logic is_normal;     // exponent neither all zeros nor all ones
+    logic is_zero;       // exponent and mantissa both zero
+    logic is_subnormal;  // exponent zero, mantissa non-zero
+    logic is_inf;        // exponent all ones, mantissa zero
+    logic is_nan;        // exponent all ones, mantissa non-zero
+    logic is_quiet;      // quiet NaN (presumably mantissa MSB set - confirm in VX_fpu_class)
+    logic is_signaling;  // signaling NaN (presumably mantissa MSB clear - confirm in VX_fpu_class)
+} fclass_t;
+
+// Exception flags; as a packed struct the first field (NV) is the MSB,
+// matching the bit numbers given on each line.
+typedef struct packed {
+    logic NV; // 4-Invalid
+    logic DZ; // 3-Divide by zero
+    logic OF; // 2-Overflow
+    logic UF; // 1-Underflow
+    logic NX; // 0-Inexact
+} fflags_t;
+
+endpackage
+
+`endif // VX_FPU_PKG_VH
--- a/hw/rtl/fpu/VX_fpu_rounding.sv
+++ b/hw/rtl/fpu/VX_fpu_rounding.sv
@@ -0,0 +1,79 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+`include "VX_fpu_define.vh"
+
+`ifdef FPU_DSP
+
+/// Modified port of the rounding module from the FPnew library
+/// reference: https://github.com/pulp-platform/fpnew
+
+module VX_fpu_rounding #(
+    parameter DAT_WIDTH = 2 // Width of the absolute value, without sign bit
+) (
+    // inputs
+    input wire [DAT_WIDTH-1:0]  abs_value_i, // magnitude to round (no sign bit)
+    input wire                  sign_i,
+    // rounding information
+    input wire [1:0]            round_sticky_bits_i, // round and sticky bits {RS}
+    input wire [2:0]            rnd_mode_i,
+    input wire                  effective_subtraction_i, // sign of inputs affects rounding of zeroes
+    // outputs
+    output wire [DAT_WIDTH-1:0] abs_rounded_o, // rounded magnitude (no sign bit)
+    output wire                 sign_o,
+    output wire                 exact_zero_o             // output is an exact zero
+);
+
+    // Rounding modes handled below:
+    //   RNE  round to nearest, ties to even
+    //   RTZ  round towards zero
+    //   RDN  round down (towards -infinity)
+    //   RUP  round up (towards +infinity)
+    //   RMM  round to nearest, ties to max magnitude
+    // Any other mode yields an unknown (x) decision.
+
+    // set when any discarded bit is non-zero
+    wire has_remainder = (| round_sticky_bits_i);
+
+    reg incr; // 1 => add one ulp to the magnitude
+
+    always @(*) begin
+        case (rnd_mode_i)
+            `INST_FRM_RNE: begin
+                // decide from the round/sticky pair
+                case (round_sticky_bits_i)
+                    2'b00,
+                    2'b01:   incr = 1'b0;           // below half an ulp: keep
+                    2'b10:   incr = abs_value_i[0]; // exactly half an ulp: make the LSB even
+                    2'b11:   incr = 1'b1;           // above half an ulp: bump
+                    default: incr = 1'bx;
+                endcase
+            end
+            `INST_FRM_RTZ: incr = 1'b0;                    // truncate
+            `INST_FRM_RDN: incr = has_remainder & sign_i;  // away from zero only for negatives
+            `INST_FRM_RUP: incr = has_remainder & ~sign_i; // away from zero only for positives
+            `INST_FRM_RMM: incr = round_sticky_bits_i[1];  // bump from half an ulp upwards
+            default:       incr = 1'bx;                    // propagate x
+        endcase
+    end
+
+    // A true zero has a zero magnitude and no discarded bits.
+    assign exact_zero_o = (abs_value_i == 0) && (round_sticky_bits_i == 0);
+
+    // The increment may carry into the exponent bits of the packed magnitude;
+    // that is how exponent change and overflow to infinity come about.
+    assign abs_rounded_o = abs_value_i + DAT_WIDTH'(incr);
+
+    // An exact zero produced by an effective subtraction is negative only in
+    // RDN mode; in every other case the incoming sign is passed through.
+    wire zero_from_cancel = exact_zero_o && effective_subtraction_i;
+    assign sign_o = zero_from_cancel ? (rnd_mode_i == `INST_FRM_RDN)
+                                     : sign_i;
+
+endmodule
+`endif
--- a/hw/rtl/fpu/VX_fpu_sqrt.sv
+++ b/hw/rtl/fpu/VX_fpu_sqrt.sv
@@ -0,0 +1,134 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+`include "VX_fpu_define.vh"
+
+`ifdef FPU_DSP
+
+module VX_fpu_sqrt import VX_fpu_pkg::*; #( // multi-lane 32-bit FP square-root unit with a valid/ready handshake
+    parameter NUM_LANES = 1, // number of lanes; one sqrt core is generated per lane
+    parameter TAGW = 1       // width of the tag carried from tag_in to tag_out
+) (
+    input wire clk,
+    input wire reset, // only connected to the control shift register (shift_reg)
+
+    output wire ready_in, // equals 'enable': high unless a valid output is being refused
+    input wire  valid_in,
+
+    input wire [NUM_LANES-1:0] lane_mask, // delayed to lane_mask_out and used by the fflags merge
+
+    input wire [TAGW-1:0] tag_in,
+    
+    input wire [`INST_FRM_BITS-1:0] frm, // rounding mode; only consumed by the dpi_fsqrt branch
+
+    input wire [NUM_LANES-1:0][31:0]  dataa,
+    output wire [NUM_LANES-1:0][31:0] result, 
+
+    output wire has_fflags, // build-time constant: 0 under QUARTUS, 1 otherwise
+    output wire [`FP_FLAGS_BITS-1:0] fflags,
+
+    output wire [TAGW-1:0] tag_out,
+
+    input wire  ready_out,
+    output wire valid_out
+);
+    // frm is not read by the QUARTUS and VIVADO branches (UNUSED_VAR presumably waives the lint warning - confirm)
+    `UNUSED_VAR (frm)
+    // Stall while a valid output is not accepted; 'enable' gates every pipeline stage below
+    wire stall = ~ready_out && valid_out;
+    wire enable = ~stall;
+
+    fflags_t [NUM_LANES-1:0] per_lane_fflags;
+    wire [NUM_LANES-1:0] lane_mask_out;
+    // Control path: delay {valid, lane_mask, tag} by LATENCY_FSQRT stages alongside the datapath
+    VX_shift_register #(
+        .DATAW  (1 + NUM_LANES + TAGW),
+        .DEPTH  (`LATENCY_FSQRT),
+        .RESETW (1) // NOTE(review): presumably resets only the valid bit - confirm in VX_shift_register
+    ) shift_reg (
+        .clk(clk),
+        .reset    (reset),
+        .enable   (enable),
+        .data_in  ({valid_in, lane_mask, tag_in}),
+        .data_out ({valid_out, lane_mask_out, tag_out})
+    );
+
+    assign ready_in = enable;    
+
+`ifdef QUARTUS
+    // QUARTUS build: one acl_fsqrt core per lane; this branch reports no exception flags
+    for (genvar i = 0; i < NUM_LANES; ++i) begin
+        acl_fsqrt fsqrt (
+            .clk    (clk),
+            .areset (1'b0), // core reset is tied off
+            .en     (enable),
+            .a      (dataa[i]),
+            .q      (result[i])
+        );
+    end
+
+    assign has_fflags = 0;
+    assign per_lane_fflags = 'x; // flags are left as X here; has_fflags = 0 is what this branch reports
+
+`elsif VIVADO
+    // VIVADO build: one xil_fsqrt core per lane; its tuser output is the only flag source
+    for (genvar i = 0; i < NUM_LANES; ++i) begin
+        wire tuser;       
+
+        xil_fsqrt fsqrt (
+            .aclk                (clk),
+            .aclken              (enable),
+            .s_axis_a_tvalid     (1'b1), // input always marked valid; request validity is tracked by shift_reg
+            .s_axis_a_tdata      (dataa[i][31:0]),
+            `UNUSED_PIN (m_axis_result_tvalid),
+            .m_axis_result_tdata (result[i][31:0]),
+            .m_axis_result_tuser (tuser)
+        );
+                                    // NV, DZ, OF, UF, NX
+        assign per_lane_fflags[i] = {tuser, 1'b0, 1'b0, 1'b0, 1'b0};
+    end
+
+    assign has_fflags = 1;
+
+`else
+    // Default branch: compute through the dpi_fsqrt call and model the latency with a shift register
+    for (genvar i = 0; i < NUM_LANES; ++i) begin
+        reg [63:0] r; // 64-bit DPI result; only r[31:0] is forwarded
+        `UNUSED_VAR (r)
+
+        fflags_t f;
+        // Operand is widened to 64 bits with an all-ones upper word (NOTE(review): looks like NaN-boxing - confirm)
+        always @(*) begin        
+            dpi_fsqrt (enable && valid_in, int'(0), {32'hffffffff, dataa[i]}, frm, r, f);
+        end
+        // Delay result and flags by LATENCY_FSQRT, the same depth as the control shift_reg
+        VX_shift_register #(
+            .DATAW  (32 + $bits(fflags_t)),
+            .DEPTH  (`LATENCY_FSQRT)
+        ) shift_req_dpi (
+            .clk      (clk),
+            `UNUSED_PIN (reset),
+            .enable   (enable),
+            .data_in  ({r[31:0], f}),
+            .data_out ({result[i], per_lane_fflags[i]})
+        );
+    end
+
+    assign has_fflags = 1;
+
+`endif
+    // Reduce per_lane_fflags to 'fflags' (macro defined elsewhere; presumably ORs the lanes set in lane_mask_out - confirm)
+`FPU_MERGE_FFLAGS(fflags, per_lane_fflags, lane_mask_out, NUM_LANES);
+
+endmodule
+`endif
--- a/hw/rtl/fpu/VX_fpu_to_csr_if.sv
+++ b/hw/rtl/fpu/VX_fpu_to_csr_if.sv
@@ -0,0 +1,43 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+`include "VX_fpu_define.vh"
+
+interface VX_fpu_to_csr_if import VX_fpu_pkg::*; (); // FPU <-> CSR link: fflags write channel plus rounding-mode (frm) read channel
+    // Write channel: when write_enable is set, write_fflags is reported for write_wid
+    wire                    write_enable;
+    wire [`NW_WIDTH-1:0]    write_wid;
+    fflags_t                write_fflags;
+    // Read channel: the master supplies read_wid and the slave answers with read_frm
+    wire [`NW_WIDTH-1:0]    read_wid;
+    wire [`INST_FRM_BITS-1:0] read_frm;
+    // Master side (used as VX_fpu_to_csr_if.master by VX_fpu_unit): drives every signal except read_frm
+    modport master (
+        output write_enable,
+        output write_wid,
+        output write_fflags,
+
+        output read_wid,
+        input  read_frm
+    );
+    // Slave side: mirror image of the master modport; the only signal it drives is read_frm
+    modport slave (
+        input  write_enable,
+        input  write_wid,
+        input  write_fflags,
+        
+        input  read_wid,
+        output read_frm
+    );
+
+endinterface
--- a/hw/rtl/fpu/VX_fpu_unit.sv
+++ b/hw/rtl/fpu/VX_fpu_unit.sv
@@ -0,0 +1,259 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+`include "VX_define.vh"
+`include "VX_fpu_define.vh"
+
+module VX_fpu_unit import VX_fpu_pkg::*; #( // FPU execute stage: dispatch -> one FPU backend per block -> gather into commit
+    parameter CORE_ID = 0
+) (
+    input wire clk,
+    input wire reset,
+
+    VX_dispatch_if.slave    dispatch_if [`ISSUE_WIDTH],
+    VX_fpu_to_csr_if.master fpu_to_csr_if[`NUM_FPU_BLOCKS],
+
+    VX_commit_if.master     commit_if [`ISSUE_WIDTH]
+);
+    `UNUSED_PARAM (CORE_ID)
+    localparam BLOCK_SIZE = `NUM_FPU_BLOCKS; // number of FPU pipelines instantiated by the generate loop below
+    localparam NUM_LANES  = `NUM_FPU_LANES;  // lanes handled by each pipeline per request
+    localparam PID_BITS   = `CLOG2(`NUM_THREADS / NUM_LANES); // presumably log2 of the packets needed per instruction - confirm
+    localparam PID_WIDTH  = `UP(PID_BITS); // NOTE(review): UP presumably clamps the width to at least 1 - confirm
+    localparam TAG_WIDTH  = `LOG2UP(`FPU_REQ_QUEUE_SIZE); // tag addresses one entry of the FPU_REQ_QUEUE_SIZE-deep tag_store
+    localparam PARTIAL_BW = (BLOCK_SIZE != `ISSUE_WIDTH) || (NUM_LANES != `NUM_THREADS); // block or lane count differs from issue/thread count
+    // Per-block request interfaces; connected to dispatch_unit below and consumed by the generate loop
+    VX_execute_if #(
+        .NUM_LANES (NUM_LANES)
+    ) execute_if[BLOCK_SIZE]();
+
+    `RESET_RELAY (dispatch_reset, reset);
+
+    VX_dispatch_unit #(
+        .BLOCK_SIZE (BLOCK_SIZE),
+        .NUM_LANES  (NUM_LANES),
+        .OUT_REG    (PARTIAL_BW ? 1 : 0)
+    ) dispatch_unit (
+        .clk        (clk),
+        .reset      (dispatch_reset),
+        .dispatch_if(dispatch_if),
+        .execute_if (execute_if)
+    );
+    // Per-block results; passed together with commit_if to gather_unit at the end of the module
+    VX_commit_if #(
+        .NUM_LANES (NUM_LANES)
+    ) commit_block_if[BLOCK_SIZE]();
+    // One independent FPU pipeline per block
+    for (genvar block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin
+        `UNUSED_VAR (execute_if[block_idx].data.tid)
+        `UNUSED_VAR (execute_if[block_idx].data.wb)
+        `UNUSED_VAR (execute_if[block_idx].data.use_PC)
+        `UNUSED_VAR (execute_if[block_idx].data.use_imm)
+
+        // FPU request/response handshake and response payload
+        wire fpu_req_valid, fpu_req_ready;
+        wire fpu_rsp_valid, fpu_rsp_ready;    
+        wire [NUM_LANES-1:0][`XLEN-1:0] fpu_rsp_result;
+        fflags_t fpu_rsp_fflags;
+        wire fpu_rsp_has_fflags;
+        // Request metadata read back from tag_store (addressed by fpu_rsp_tag) when the response returns
+        wire [`UUID_WIDTH-1:0]  fpu_rsp_uuid;
+        wire [`NW_WIDTH-1:0]    fpu_rsp_wid;
+        wire [NUM_LANES-1:0]    fpu_rsp_tmask;
+        wire [`XLEN-1:0]        fpu_rsp_PC;
+        wire [`NR_BITS-1:0]     fpu_rsp_rd;
+        wire [PID_WIDTH-1:0]    fpu_rsp_pid;
+        wire                    fpu_rsp_sop;
+        wire                    fpu_rsp_eop;
+
+        wire [TAG_WIDTH-1:0] fpu_req_tag, fpu_rsp_tag;    
+        wire mdata_full;
+        // Format is taken from the low imm bits, rounding mode from the low op_mod bits
+        wire [`INST_FMT_BITS-1:0] fpu_fmt = execute_if[block_idx].data.imm[`INST_FMT_BITS-1:0];
+        wire [`INST_FRM_BITS-1:0] fpu_frm = execute_if[block_idx].data.op_mod[`INST_FRM_BITS-1:0];
+
+        wire execute_fire = execute_if[block_idx].valid && execute_if[block_idx].ready;
+        wire fpu_rsp_fire = fpu_rsp_valid && fpu_rsp_ready;
+        // Park request metadata under a tag while the FPU works: acquired on execute_fire, released on fpu_rsp_fire
+        VX_index_buffer #(
+            .DATAW  (`UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS + PID_WIDTH + 1 + 1),
+            .SIZE   (`FPU_REQ_QUEUE_SIZE)
+        ) tag_store (
+            .clk          (clk),
+            .reset        (reset),
+            .acquire_en   (execute_fire), 
+            .write_addr   (fpu_req_tag), // no other driver in this module, so presumably an output of VX_index_buffer
+            .write_data   ({execute_if[block_idx].data.uuid, execute_if[block_idx].data.wid, execute_if[block_idx].data.tmask, execute_if[block_idx].data.PC, execute_if[block_idx].data.rd, execute_if[block_idx].data.pid, execute_if[block_idx].data.sop, execute_if[block_idx].data.eop}),
+            .read_data    ({fpu_rsp_uuid, fpu_rsp_wid, fpu_rsp_tmask, fpu_rsp_PC, fpu_rsp_rd, fpu_rsp_pid, fpu_rsp_sop, fpu_rsp_eop}),
+            .read_addr    (fpu_rsp_tag),
+            .release_en   (fpu_rsp_fire), 
+            .full         (mdata_full),
+            `UNUSED_PIN (empty)
+        );
+
+        // resolve dynamic FRM from CSR (INST_FPU_MISC ops always keep the instruction's own frm field)
+        wire [`INST_FRM_BITS-1:0] fpu_req_frm; 
+        `ASSIGN_BLOCKED_WID (fpu_to_csr_if[block_idx].read_wid, execute_if[block_idx].data.wid, block_idx, `NUM_FPU_BLOCKS)
+        assign fpu_req_frm = (execute_if[block_idx].data.op_type != `INST_FPU_MISC 
+                           && fpu_frm == `INST_FRM_DYN) ? fpu_to_csr_if[block_idx].read_frm : fpu_frm;
+
+        // submit FPU request
+        // mdata_full gates both valid and ready, so a request only fires when tag_store has a free entry
+        assign fpu_req_valid = execute_if[block_idx].valid && ~mdata_full;
+        assign execute_if[block_idx].ready = fpu_req_ready && ~mdata_full;
+
+        `RESET_RELAY (fpu_reset, reset);   
+        // Backend is chosen at build time (FPU_DPI, FPU_FPNEW or FPU_DSP); all three are wired to the same signals
+    `ifdef FPU_DPI
+
+        VX_fpu_dpi #(
+            .NUM_LANES  (NUM_LANES),
+            .TAGW       (TAG_WIDTH),
+            .OUT_REG    (PARTIAL_BW ? 1 : 3)
+        ) fpu_dpi (
+            .clk        (clk),
+            .reset      (fpu_reset),
+
+            .valid_in   (fpu_req_valid),
+            .op_type    (execute_if[block_idx].data.op_type),
+            .lane_mask  (execute_if[block_idx].data.tmask),
+            .fmt        (fpu_fmt),
+            .frm        (fpu_req_frm),
+            .dataa      (execute_if[block_idx].data.rs1_data),
+            .datab      (execute_if[block_idx].data.rs2_data),
+            .datac      (execute_if[block_idx].data.rs3_data),
+            .tag_in     (fpu_req_tag),
+            .ready_in   (fpu_req_ready),
+
+            .valid_out  (fpu_rsp_valid),
+            .result     (fpu_rsp_result),
+            .has_fflags (fpu_rsp_has_fflags),
+            .fflags     (fpu_rsp_fflags),
+            .tag_out    (fpu_rsp_tag),
+            .ready_out  (fpu_rsp_ready)     
+        );   
+
+    `elsif FPU_FPNEW
+
+        VX_fpu_fpnew #(
+            .NUM_LANES  (NUM_LANES),
+            .TAGW       (TAG_WIDTH),
+            .OUT_REG    (PARTIAL_BW ? 1 : 3)
+        ) fpu_fpnew (
+            .clk        (clk),
+            .reset      (fpu_reset), 
+
+            .valid_in   (fpu_req_valid),
+            .op_type    (execute_if[block_idx].data.op_type),
+            .lane_mask  (execute_if[block_idx].data.tmask),
+            .fmt        (fpu_fmt),
+            .frm        (fpu_req_frm),
+            .dataa      (execute_if[block_idx].data.rs1_data),
+            .datab      (execute_if[block_idx].data.rs2_data),
+            .datac      (execute_if[block_idx].data.rs3_data), 
+            .tag_in     (fpu_req_tag),
+            .ready_in   (fpu_req_ready),
+
+            .valid_out  (fpu_rsp_valid), 
+            .result     (fpu_rsp_result),
+            .has_fflags (fpu_rsp_has_fflags),
+            .fflags     (fpu_rsp_fflags),
+            .tag_out    (fpu_rsp_tag), 
+            .ready_out  (fpu_rsp_ready)        
+        );
+
+    `elsif FPU_DSP
+
+        VX_fpu_dsp #(
+            .NUM_LANES  (NUM_LANES),
+            .TAGW       (TAG_WIDTH),
+            .OUT_REG    (PARTIAL_BW ? 1 : 3)
+        ) fpu_dsp (
+            .clk        (clk),
+            .reset      (fpu_reset), 
+
+            .valid_in   (fpu_req_valid),
+            .lane_mask  (execute_if[block_idx].data.tmask),
+            .op_type    (execute_if[block_idx].data.op_type),
+            .fmt        (fpu_fmt),
+            .frm        (fpu_req_frm),
+            .dataa      (execute_if[block_idx].data.rs1_data),
+            .datab      (execute_if[block_idx].data.rs2_data),
+            .datac      (execute_if[block_idx].data.rs3_data), 
+            .tag_in     (fpu_req_tag),
+            .ready_in   (fpu_req_ready),
+
+            .valid_out  (fpu_rsp_valid), 
+            .result     (fpu_rsp_result), 
+            .has_fflags (fpu_rsp_has_fflags),
+            .fflags     (fpu_rsp_fflags),
+            .tag_out    (fpu_rsp_tag),
+            .ready_out  (fpu_rsp_ready)
+        );
+        
+    `endif
+
+        // handle FPU response
+        // When PID_BITS != 0, flags of successive responses are OR-ed into a register that is cleared on eop
+        fflags_t fpu_rsp_fflags_q;
+        // NOTE(review): the accumulator ignores fpu_rsp_has_fflags and fpu_rsp_wid - confirm responses of different instructions never interleave
+        if (PID_BITS != 0) begin
+            fflags_t fpu_rsp_fflags_r;
+            always @(posedge clk) begin
+                if (reset) begin
+                    fpu_rsp_fflags_r <= '0;
+                end else if (fpu_rsp_fire) begin
+                    fpu_rsp_fflags_r <= fpu_rsp_eop ? '0 : (fpu_rsp_fflags_r | fpu_rsp_fflags);
+                end
+            end
+            assign fpu_rsp_fflags_q = fpu_rsp_fflags_r | fpu_rsp_fflags; // include the current response's flags
+        end else begin
+            assign fpu_rsp_fflags_q = fpu_rsp_fflags;
+        end
+        // CSR flags are written only when an eop response fires and the backend reports has_fflags
+        assign fpu_to_csr_if[block_idx].write_enable = fpu_rsp_fire && fpu_rsp_eop && fpu_rsp_has_fflags;
+        `ASSIGN_BLOCKED_WID (fpu_to_csr_if[block_idx].write_wid, fpu_rsp_wid, block_idx, `NUM_FPU_BLOCKS)
+        assign fpu_to_csr_if[block_idx].write_fflags = fpu_rsp_fflags_q;
+
+        // send response
+
+        VX_elastic_buffer #(
+            .DATAW (`UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS + (NUM_LANES * `XLEN) + PID_WIDTH + 1 + 1),
+            .SIZE  (0) // NOTE(review): SIZE=0 presumably makes this a pass-through - confirm in VX_elastic_buffer
+        ) rsp_buf (
+            .clk       (clk),
+            .reset     (reset),
+            .valid_in  (fpu_rsp_valid),
+            .ready_in  (fpu_rsp_ready),
+            .data_in   ({fpu_rsp_uuid, fpu_rsp_wid, fpu_rsp_tmask, fpu_rsp_PC, fpu_rsp_rd, fpu_rsp_result, fpu_rsp_pid, fpu_rsp_sop, fpu_rsp_eop}),
+            .data_out  ({commit_block_if[block_idx].data.uuid, commit_block_if[block_idx].data.wid, commit_block_if[block_idx].data.tmask, commit_block_if[block_idx].data.PC, commit_block_if[block_idx].data.rd, commit_block_if[block_idx].data.data, commit_block_if[block_idx].data.pid, commit_block_if[block_idx].data.sop, commit_block_if[block_idx].data.eop}),
+            .valid_out (commit_block_if[block_idx].valid),
+            .ready_out (commit_block_if[block_idx].ready)
+        );
+        assign commit_block_if[block_idx].data.wb = 1'b1; // wb is tied high for every FPU response
+    end
+
+    `RESET_RELAY (commit_reset, reset);
+    // Merge the BLOCK_SIZE per-block commit streams into the ISSUE_WIDTH commit_if ports
+    VX_gather_unit #(
+        .BLOCK_SIZE (BLOCK_SIZE),
+        .NUM_LANES  (NUM_LANES),
+        .OUT_REG    (PARTIAL_BW ? 3 : 0)
+    ) gather_unit (
+        .clk           (clk),
+        .reset         (commit_reset),
+        .commit_in_if  (commit_block_if),
+        .commit_out_if (commit_if)
+    );
+
+endmodule