fpga fixes: warp scheduler, fnmadd, fdiv, fsqrt

2020-09-08 07:05:26 -07:00
parent 4c08929c56
commit 36ec603d17
22 changed files with 2374 additions and 2588 deletions
--- a/hw/rtl/fp_cores/VX_fp_addmul.v
+++ b/hw/rtl/fp_cores/VX_fp_addmul.v
@@ -51,9 +51,9 @@ module VX_fp_addmul #(
            .ax(dataa[i]),
            .ay(datab[i]),
            .az(),
-            .clk({2'b00,clk}),
-            .ena({2'b11,enable}),
-            .aclr(2'b00),
+            .clk({2'b00, clk}),
+            .ena({2'b00, enable}),
+            .aclr({reset, reset}),
            .chainin(),
            // outputs
            .overflow(),
@@ -91,9 +91,9 @@ module VX_fp_addmul #(
            .ax(dataa[i]),
            .ay(datab[i]),
            .az(),
-            .clk({2'b00,clk}),
-            .ena({2'b11,enable}),
-            .aclr(2'b00),
+            .clk({2'b00, clk}),
+            .ena({2'b00, enable}),
+            .aclr({reset, reset}),
            .chainin(),
            // outputs
            .overflow(),
@@ -131,9 +131,9 @@ module VX_fp_addmul #(
            .ax(),
            .ay(datab[i]),
            .az(dataa[i]),
-            .clk({2'b00,clk}),
-            .ena({2'b11,enable}),
-            .aclr(2'b00),
+            .clk({2'b00, clk}),
+            .ena({2'b00, enable}),
+            .aclr({reset, reset}),
            .chainin(),
            // outputs
            .overflow(),
--- a/hw/rtl/fp_cores/VX_fp_div.v
+++ b/hw/rtl/fp_cores/VX_fp_div.v
@@ -32,7 +32,7 @@ module VX_fp_div #(
    `ifdef QUARTUS
        acl_fdiv fdiv (
            .clk    (clk),
-            .areset (1'b0),
+            .areset (reset),
            .en     (enable),
            .a      (dataa[i]),
            .b      (datab[i]),
--- a/hw/rtl/fp_cores/VX_fp_fpga.v
+++ b/hw/rtl/fp_cores/VX_fp_fpga.v
@@ -27,7 +27,7 @@ module VX_fp_fpga #(
    input wire  ready_out,
    output wire valid_out
 );
-    localparam NUM_FPC  = 8;
+    localparam NUM_FPC  = 7;
    localparam FPC_BITS = `LOG2UP(NUM_FPC);
    
    wire [NUM_FPC-1:0] per_core_ready_in;
@@ -40,28 +40,28 @@ module VX_fp_fpga #(
    fflags_t [`NUM_THREADS-1:0] fpnew_fflags;  

    reg [FPC_BITS-1:0] core_select;
-    reg do_sub, do_mul;
+    reg do_sub, do_mul, do_neg;
    reg is_signed;

    always @(*) begin
-        core_select = 'x;
-        do_sub      = 'x;
-        do_mul      = 'x;
-        is_signed   = 'x;
+        do_sub    = 'x;
+        do_mul    = 'x;
+        do_neg    = 'x;
+        is_signed = 'x;
        case (op_type)
            `FPU_ADD:    begin core_select = 1; do_mul = 0; do_sub = 0; end
            `FPU_SUB:    begin core_select = 1; do_mul = 0; do_sub = 1; end
            `FPU_MUL:    begin core_select = 1; do_mul = 1; do_sub = 0; end
-            `FPU_MADD:   begin core_select = 2; do_sub = 0; end
-            `FPU_MSUB:   begin core_select = 2; do_sub = 1; end
-            `FPU_NMADD:  begin core_select = 3; do_sub = 0; end
-            `FPU_NMSUB:  begin core_select = 3; do_sub = 1; end
-            `FPU_DIV:    begin core_select = 4; end
-            `FPU_SQRT:   begin core_select = 5; end
-            `FPU_CVTWS:  begin core_select = 6; is_signed = 1; end
-            `FPU_CVTWUS: begin core_select = 6; is_signed = 0; end
-            `FPU_CVTSW:  begin core_select = 7; is_signed = 1; end
-            `FPU_CVTSWU: begin core_select = 7; is_signed = 0; end
+            `FPU_MADD:   begin core_select = 2; do_sub = 0; do_neg = 0; end
+            `FPU_MSUB:   begin core_select = 2; do_sub = 1; do_neg = 0; end
+            `FPU_NMADD:  begin core_select = 2; do_sub = 0; do_neg = 1; end
+            `FPU_NMSUB:  begin core_select = 2; do_sub = 1; do_neg = 1; end
+            `FPU_DIV:    begin core_select = 3; end
+            `FPU_SQRT:   begin core_select = 4; end
+            `FPU_CVTWS:  begin core_select = 5; is_signed = 1; end
+            `FPU_CVTWUS: begin core_select = 5; is_signed = 0; end
+            `FPU_CVTSW:  begin core_select = 6; is_signed = 1; end
+            `FPU_CVTSWU: begin core_select = 6; is_signed = 0; end
            default:     begin core_select = 0; end
        endcase
    end
@@ -116,6 +116,7 @@ module VX_fp_fpga #(
        .ready_in   (per_core_ready_in[2]),    
        .tag_in     (tag_in),  
        .do_sub     (do_sub),
+        .do_neg     (do_neg),
        .dataa      (dataa), 
        .datab      (datab),      
        .datac      (datac),   
@@ -125,40 +126,21 @@ module VX_fp_fpga #(
        .valid_out  (per_core_valid_out[2])
    );

-    VX_fp_nmadd #(
-        .TAGW (TAGW),
-        .LANES(`NUM_THREADS)
-    ) fp_nmadd (
-        .clk        (clk), 
-        .reset      (reset),   
-        .valid_in   (valid_in && (core_select == 3)),
-        .ready_in   (per_core_ready_in[3]),    
-        .tag_in     (tag_in),  
-        .do_sub     (do_sub),
-        .dataa      (dataa), 
-        .datab      (datab),   
-        .datac      (datac),              
-        .result     (per_core_result[3]),
-        .tag_out    (per_core_tag_out[3]),
-        .ready_out  (per_core_ready_out[3]),
-        .valid_out  (per_core_valid_out[3])
-    );
-
    VX_fp_div #(
        .TAGW (TAGW),
        .LANES(`NUM_THREADS)
    ) fp_div (
        .clk        (clk), 
        .reset      (reset),   
-        .valid_in   (valid_in && (core_select == 4)),
-        .ready_in   (per_core_ready_in[4]),    
+        .valid_in   (valid_in && (core_select == 3)),
+        .ready_in   (per_core_ready_in[3]),    
        .tag_in     (tag_in),    
        .dataa      (dataa), 
        .datab      (datab),         
-        .result     (per_core_result[4]),
-        .tag_out    (per_core_tag_out[4]),
-        .ready_out  (per_core_ready_out[4]),
-        .valid_out  (per_core_valid_out[4])
+        .result     (per_core_result[3]),
+        .tag_out    (per_core_tag_out[3]),
+        .ready_out  (per_core_ready_out[3]),
+        .valid_out  (per_core_valid_out[3])
    );

    VX_fp_sqrt #(
@@ -167,14 +149,14 @@ module VX_fp_fpga #(
    ) fp_sqrt (
        .clk        (clk), 
        .reset      (reset),   
-        .valid_in   (valid_in && (core_select == 5)),
-        .ready_in   (per_core_ready_in[5]),    
+        .valid_in   (valid_in && (core_select == 4)),
+        .ready_in   (per_core_ready_in[4]),    
        .tag_in     (tag_in),    
        .dataa      (dataa),  
-        .result     (per_core_result[5]),
-        .tag_out    (per_core_tag_out[5]),
-        .ready_out  (per_core_ready_out[5]),
-        .valid_out  (per_core_valid_out[5])
+        .result     (per_core_result[4]),
+        .tag_out    (per_core_tag_out[4]),
+        .ready_out  (per_core_ready_out[4]),
+        .valid_out  (per_core_valid_out[4])
    );

    VX_fp_ftoi #(
@@ -183,32 +165,32 @@ module VX_fp_fpga #(
    ) fp_ftoi (
        .clk        (clk), 
        .reset      (reset),   
-        .valid_in   (valid_in && (core_select == 6)),
-        .ready_in   (per_core_ready_in[6]),    
+        .valid_in   (valid_in && (core_select == 5)),
+        .ready_in   (per_core_ready_in[5]),    
        .tag_in     (tag_in), 
        .is_signed  (is_signed),   
        .dataa      (dataa),  
-        .result     (per_core_result[6]),
-        .tag_out    (per_core_tag_out[6]),
-        .ready_out  (per_core_ready_out[6]),
-        .valid_out  (per_core_valid_out[6])
+        .result     (per_core_result[5]),
+        .tag_out    (per_core_tag_out[5]),
+        .ready_out  (per_core_ready_out[5]),
+        .valid_out  (per_core_valid_out[5])
    );

    VX_fp_itof #(
        .TAGW (TAGW),
        .LANES(`NUM_THREADS)
    ) fp_itof (
-        .clk        (clk), 
-        .reset      (reset),   
-        .valid_in   (valid_in && (core_select == 7)),
-        .ready_in   (per_core_ready_in[7]),    
-        .tag_in     (tag_in), 
-        .is_signed  (is_signed),      
-        .dataa      (dataa),  
-        .result     (per_core_result[7]),
-        .tag_out    (per_core_tag_out[7]),
-        .ready_out  (per_core_ready_out[7]),
-        .valid_out  (per_core_valid_out[7])
+        .clk        (clk),
+        .reset      (reset),
+        .valid_in   (valid_in && (core_select == 6)),
+        .ready_in   (per_core_ready_in[6]),
+        .tag_in     (tag_in),
+        .is_signed  (is_signed),
+        .dataa      (dataa),
+        .result     (per_core_result[6]),
+        .tag_out    (per_core_tag_out[6]),
+        .ready_out  (per_core_ready_out[6]),
+        .valid_out  (per_core_valid_out[6])
    );

    reg valid_out_n;
@@ -234,7 +216,7 @@ module VX_fp_fpga #(
        end
    end

-    assign ready_in   = (& per_core_ready_in);
+    assign ready_in   = per_core_ready_in[core_select];
    assign valid_out  = valid_out_n;
    assign has_fflags = has_fflags_n;
    assign tag_out    = tag_out_n;
--- a/hw/rtl/fp_cores/VX_fp_ftoi.v
+++ b/hw/rtl/fp_cores/VX_fp_ftoi.v
@@ -39,7 +39,7 @@ module VX_fp_ftoi #(
    `ifdef QUARTUS       
        acl_ftoi ftoi (
            .clk    (clk),
-            .areset (1'b0),
+            .areset (reset),
            .en     (enable),
            .a      (dataa[i]),
            .q      (result_s)
@@ -47,7 +47,7 @@ module VX_fp_ftoi #(

        acl_ftou ftou (
            .clk    (clk),
-            .areset (1'b0),
+            .areset (reset),
            .en     (enable),
            .a      (dataa[i]),
            .q      (result_u)
--- a/hw/rtl/fp_cores/VX_fp_itof.v
+++ b/hw/rtl/fp_cores/VX_fp_itof.v
@@ -39,7 +39,7 @@ module VX_fp_itof #(
    `ifdef QUARTUS
        acl_itof itof (
            .clk    (clk),
-            .areset (1'b0),
+            .areset (reset),
            .en     (enable),
            .a      (dataa[i]),
            .q      (result_s)
@@ -47,7 +47,7 @@ module VX_fp_itof #(

        acl_utof utof (
            .clk    (clk),
-            .areset (1'b0),
+            .areset (reset),
            .en     (enable),
            .a      (dataa[i]),
            .q      (result_u)
--- a/hw/rtl/fp_cores/VX_fp_madd.v
+++ b/hw/rtl/fp_cores/VX_fp_madd.v
@@ -17,7 +17,8 @@ module VX_fp_madd #(
    input wire [TAGW-1:0] tag_in,

    input wire  do_sub,  
-
+    input wire  do_neg, 
+    
    input wire [LANES-1:0][31:0]  dataa,
    input wire [LANES-1:0][31:0]  datab,
    input wire [LANES-1:0][31:0]  datac,
@@ -32,7 +33,7 @@ module VX_fp_madd #(
    wire stall = ~ready_out && valid_out;
    wire enable = ~stall;

-    reg do_sub_r;
+    reg do_sub_r, do_neg_r;

    for (genvar i = 0; i < LANES; i++) begin
        
@@ -50,9 +51,9 @@ module VX_fp_madd #(
            .ax(datac[i]),
            .ay(datab[i]),
            .az(dataa[i]),
-            .clk({2'b00,clk}),
-            .ena({2'b11,enable}),
-            .aclr(2'b00),
+            .clk({2'b00, clk}),
+            .ena({2'b00, enable}),
+            .aclr({reset, reset}),
            .chainin(),
            // outputs
            .overflow(),
@@ -90,9 +91,9 @@ module VX_fp_madd #(
            .ax(datac[i]),
            .ay(datab[i]),
            .az(dataa[i]),
-            .clk({2'b00,clk}),
-            .ena({2'b11,enable}),
-            .aclr(2'b00),
+            .clk({2'b00, clk}),
+            .ena({2'b00, enable}),
+            .aclr({reset, reset}),
            .chainin(),
            // outputs
            .overflow(),
@@ -126,18 +127,20 @@ module VX_fp_madd #(
        end
    `endif

-        assign result[i] = do_sub_r ? result_msub : result_madd;
+        wire [31:0] result_unqual = do_sub_r ? result_msub : result_madd;
+        assign result[i][31]   = result_unqual[31] ^ do_neg_r;
+        assign result[i][30:0] = result_unqual[30:0];
    end
    
    VX_shift_register #(
-        .DATAW(TAGW + 1 + 1),
+        .DATAW(TAGW + 1 + 1 + 1),
        .DEPTH(`LATENCY_FMADD)
    ) shift_reg (
        .clk(clk),
        .reset(reset),
        .enable(enable),
-        .in({tag_in,   valid_in,  do_sub}),
-        .out({tag_out, valid_out, do_sub_r})
+        .in({tag_in,   valid_in,  do_sub,   do_neg}),
+        .out({tag_out, valid_out, do_sub_r, do_neg_r})
    );

    assign ready_in = enable;
--- a/hw/rtl/fp_cores/VX_fp_nmadd.v
+++ b/hw/rtl/fp_cores/VX_fp_nmadd.v
@@ -1,197 +0,0 @@
-`include "VX_define.vh"
-
-`ifndef SYNTHESIS
-`include "float_dpi.vh"
-`endif
-
-module VX_fp_nmadd #( 
-    parameter TAGW = 1,
-    parameter LANES = 1
-) (
-    input wire clk,
-    input wire reset,   
-
-    output wire ready_in,
-    input wire  valid_in,
-
-    input wire [TAGW-1:0] tag_in,
-
-    input wire  do_sub,
-
-    input wire [LANES-1:0][31:0]  dataa,
-    input wire [LANES-1:0][31:0]  datab,
-    input wire [LANES-1:0][31:0]  datac,
-    output wire [LANES-1:0][31:0] result, 
-
-    output wire [TAGW-1:0] tag_out,
-
-    input wire  ready_out,
-    output wire valid_out
-); 
-
-    wire stall = ~ready_out && valid_out;
-    wire enable = ~stall;
-
-    reg do_sub_r;
-
-    for (genvar i = 0; i < LANES; i++) begin
-
-        wire [31:0] result_madd;
-        wire [31:0] result_msub;     
-
-        wire [31:0] result_st0 = do_sub_r ? result_msub : result_madd;
-
-    `ifdef QUARTUS
-        twentynm_fp_mac mac_fp_madd (
-            // inputs
-            .accumulate(),
-            .chainin_overflow(),
-            .chainin_invalid(),
-            .chainin_underflow(),
-            .chainin_inexact(),
-            .ax(datac[i]),
-            .ay(datab[i]),
-            .az(dataa[i]),
-            .clk({2'b00,clk}),
-            .ena({2'b11,enable}),
-            .aclr(2'b00),
-            .chainin(),
-            // outputs
-            .overflow(),
-            .invalid(),
-            .underflow(),
-            .inexact(),
-            .chainout_overflow(),
-            .chainout_invalid(),
-            .chainout_underflow(),
-            .chainout_inexact(),
-            .resulta(result_madd),
-            .chainout()
-        );
-        defparam mac_fp_madd.operation_mode = "sp_mult_add"; 
-        defparam mac_fp_madd.use_chainin = "false"; 
-        defparam mac_fp_madd.adder_subtract = "false"; 
-        defparam mac_fp_madd.ax_clock = "0"; 
-        defparam mac_fp_madd.ay_clock = "0"; 
-        defparam mac_fp_madd.az_clock = "0"; 
-        defparam mac_fp_madd.output_clock = "0"; 
-        defparam mac_fp_madd.accumulate_clock = "none"; 
-        defparam mac_fp_madd.ax_chainin_pl_clock = "0"; 
-        defparam mac_fp_madd.accum_pipeline_clock = "none"; 
-        defparam mac_fp_madd.mult_pipeline_clock = "0"; 
-        defparam mac_fp_madd.adder_input_clock = "0"; 
-        defparam mac_fp_madd.accum_adder_clock = "none"; 
-
-        twentynm_fp_mac mac_fp_msub (
-            // inputs
-            .accumulate(),
-            .chainin_overflow(),
-            .chainin_invalid(),
-            .chainin_underflow(),
-            .chainin_inexact(),
-            .ax(datac[i]),
-            .ay(datab[i]),
-            .az(dataa[i]),
-            .clk({2'b00,clk}),
-            .ena({2'b11,enable}),
-            .aclr(2'b00),
-            .chainin(),
-            // outputs
-            .overflow(),
-            .invalid(),
-            .underflow(),
-            .inexact(),
-            .chainout_overflow(),
-            .chainout_invalid(),
-            .chainout_underflow(),
-            .chainout_inexact(),
-            .resulta(result_msub),
-            .chainout()
-        );
-        defparam mac_fp_msub.operation_mode = "sp_mult_add"; 
-        defparam mac_fp_msub.use_chainin = "false"; 
-        defparam mac_fp_msub.adder_subtract = "true"; 
-        defparam mac_fp_msub.ax_clock = "0"; 
-        defparam mac_fp_msub.ay_clock = "0"; 
-        defparam mac_fp_msub.az_clock = "0"; 
-        defparam mac_fp_msub.output_clock = "0"; 
-        defparam mac_fp_msub.accumulate_clock = "none"; 
-        defparam mac_fp_msub.ax_chainin_pl_clock = "0"; 
-        defparam mac_fp_msub.accum_pipeline_clock = "none"; 
-        defparam mac_fp_msub.mult_pipeline_clock = "0"; 
-        defparam mac_fp_msub.adder_input_clock = "0"; 
-        defparam mac_fp_msub.accum_adder_clock = "none";
-
-        twentynm_fp_mac mac_fp_neg (
-            // inputs
-            .accumulate(),
-            .chainin_overflow(),
-            .chainin_invalid(),
-            .chainin_underflow(),
-            .chainin_inexact(),
-            .ax(32'h0),
-            .ay(result_st0),
-            .az(),
-            .clk({2'b00,clk}),
-            .ena({2'b11,enable}),
-            .aclr(2'b00),
-            .chainin(),
-            // outputs
-            .overflow(),
-            .invalid(),
-            .underflow(),
-            .inexact(),
-            .chainout_overflow(),
-            .chainout_invalid(),
-            .chainout_underflow(),
-            .chainout_inexact(),
-            .resulta(result[i]),
-            .chainout()
-        );
-        defparam mac_fp_neg.operation_mode = "sp_add"; 
-        defparam mac_fp_neg.use_chainin = "false"; 
-        defparam mac_fp_neg.adder_subtract = "true"; 
-        defparam mac_fp_neg.ax_clock = "0"; 
-        defparam mac_fp_neg.ay_clock = "0"; 
-        defparam mac_fp_neg.az_clock = "none"; 
-        defparam mac_fp_neg.output_clock = "0"; 
-        defparam mac_fp_neg.accumulate_clock = "none"; 
-        defparam mac_fp_neg.ax_chainin_pl_clock = "none"; 
-        defparam mac_fp_neg.accum_pipeline_clock = "none"; 
-        defparam mac_fp_neg.mult_pipeline_clock = "none"; 
-        defparam mac_fp_neg.adder_input_clock = "0"; 
-        defparam mac_fp_neg.accum_adder_clock = "none";
-    `else
-        always @(posedge clk) begin
-           dpi_fmadd(5*LANES+i, enable, dataa[i], datab[i], datac[i], result_madd);
-           dpi_fmsub(6*LANES+i, enable, dataa[i], datab[i], datac[i], result_msub);
-           dpi_fsub(7*LANES+i, enable, 32'b0, result_st0, result[i]);
-        end
-    `endif
-    end
-
-    VX_shift_register #(
-        .DATAW(1),
-        .DEPTH(`LATENCY_FMADD)
-    ) shift_reg0 (
-        .clk(clk),
-        .reset(reset),
-        .enable(enable),
-        .in({do_sub}),
-        .out({do_sub_r})
-    );
-
-    VX_shift_register #(
-        .DATAW(TAGW + 1),
-        .DEPTH(`LATENCY_FMADD + `LATENCY_FADDMUL)
-    ) shift_reg1 (
-        .clk(clk),
-        .reset(reset),
-        .enable(enable),
-        .in({tag_in,   valid_in}),
-        .out({tag_out, valid_out})
-    );
-
-    assign ready_in = enable;
-
-endmodule
--- a/hw/rtl/fp_cores/VX_fp_sqrt.v
+++ b/hw/rtl/fp_cores/VX_fp_sqrt.v
@@ -31,7 +31,7 @@ module VX_fp_sqrt #(
    `ifdef QUARTUS
        acl_fsqrt fsqrt (
            .clk    (clk),
-            .areset (1'b0),
+            .areset (reset),
            .en     (enable),
            .a      (dataa[i]),
            .q      (result[i])
--- a/hw/rtl/fp_cores/VX_fpnew.v
+++ b/hw/rtl/fp_cores/VX_fpnew.v
@@ -91,6 +91,7 @@ module VX_fpnew #(
        fpu_operands[0] = dataa;
        fpu_operands[1] = datab;
        fpu_operands[2] = datac;
+
        case (op_type)
            `FPU_ADD: begin
                    fpu_op = fpnew_pkg::ADD;
@@ -107,23 +108,23 @@ module VX_fpnew #(
            `FPU_DIV:   begin fpu_op = fpnew_pkg::DIV; end
            `FPU_SQRT:  begin fpu_op = fpnew_pkg::SQRT; end
            `FPU_MADD:  begin fpu_op = fpnew_pkg::FMADD; end
-            `FPU_MSUB:  begin fpu_op = fpnew_pkg::FMADD;  fpu_op_mod = 1; end
-            `FPU_NMSUB: begin fpu_op = fpnew_pkg::FNMSUB; end
+            `FPU_MSUB:  begin fpu_op = fpnew_pkg::FMADD;  fpu_op_mod = 1; end            
            `FPU_NMADD: begin fpu_op = fpnew_pkg::FNMSUB; fpu_op_mod = 1; end
+            `FPU_NMSUB: begin fpu_op = fpnew_pkg::FNMSUB; end
            `FPU_CVTWS: begin fpu_op = fpnew_pkg::F2I; end
-            `FPU_CVTWUS:begin fpu_op = fpnew_pkg::F2I;  fpu_op_mod = 1; end
+            `FPU_CVTWUS:begin fpu_op = fpnew_pkg::F2I; fpu_op_mod = 1; end
            `FPU_CVTSW: begin fpu_op = fpnew_pkg::I2F; end
-            `FPU_CVTSWU:begin fpu_op = fpnew_pkg::I2F;  fpu_op_mod = 1; end
+            `FPU_CVTSWU:begin fpu_op = fpnew_pkg::I2F; fpu_op_mod = 1; end
            `FPU_CLASS: begin fpu_op = fpnew_pkg::CLASSIFY; fpu_has_fflags = 0; end
            `FPU_CMP:   begin fpu_op = fpnew_pkg::CMP; end
            `FPU_MISC:  begin
                case (frm)
-                  0: begin fpu_op = fpnew_pkg::SGNJ;   fpu_rnd = `FRM_RNE; fpu_has_fflags = 0; end
-                  1: begin fpu_op = fpnew_pkg::SGNJ;   fpu_rnd = `FRM_RTZ; fpu_has_fflags = 0; end
-                  2: begin fpu_op = fpnew_pkg::SGNJ;   fpu_rnd = `FRM_RDN; fpu_has_fflags = 0; end
-                  3: begin fpu_op = fpnew_pkg::MINMAX; fpu_rnd = `FRM_RNE; end
-                  4: begin fpu_op = fpnew_pkg::MINMAX; fpu_rnd = `FRM_RTZ; end    
-            default: begin fpu_op = fpnew_pkg::SGNJ;   fpu_rnd = `FRM_RUP; fpu_has_fflags = 0; end
+                      0: begin fpu_op = fpnew_pkg::SGNJ;   fpu_rnd = `FRM_RNE; fpu_has_fflags = 0; end
+                      1: begin fpu_op = fpnew_pkg::SGNJ;   fpu_rnd = `FRM_RTZ; fpu_has_fflags = 0; end
+                      2: begin fpu_op = fpnew_pkg::SGNJ;   fpu_rnd = `FRM_RDN; fpu_has_fflags = 0; end
+                      3: begin fpu_op = fpnew_pkg::MINMAX; fpu_rnd = `FRM_RNE; end
+                      4: begin fpu_op = fpnew_pkg::MINMAX; fpu_rnd = `FRM_RTZ; end    
+                default: begin fpu_op = fpnew_pkg::SGNJ;   fpu_rnd = `FRM_RUP; fpu_has_fflags = 0; end
                endcase    
            end
            default:;