Merge remote-tracking branch 'upstream/master' into vortex2

This commit is contained in:
Hansung Kim
2024-02-01 23:35:58 -08:00
203 changed files with 4383 additions and 21981 deletions

View File

@@ -52,30 +52,24 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
localparam MAN_BITS = 23;
localparam EXP_BITS = 8;
localparam EXP_BIAS = 2**(EXP_BITS-1)-1;
localparam logic [EXP_BITS-1:0] QNAN_EXPONENT = 2**EXP_BITS-1;
localparam logic [MAN_BITS-1:0] QNAN_MANTISSA = 2**(MAN_BITS-1);
localparam EXP_BIAS = 2**(EXP_BITS-1)-1;
// Use 32-bit integer
localparam MAX_INT_WIDTH = 32;
localparam INT_WIDTH = 32;
// The internal mantissa includes normal bit or an entire integer
localparam INT_MAN_WIDTH = `MAX(MAN_BITS + 1, MAX_INT_WIDTH);
localparam INT_MAN_WIDTH = `MAX(MAN_BITS + 1, INT_WIDTH);
// The lower 2p+3 bits of the internal FMA result will be needed for leading-zero detection
localparam LZC_RESULT_WIDTH = `CLOG2(INT_MAN_WIDTH);
// The internal exponent must be able to represent the smallest denormal input value as signed
// or the number of bits in an integer
localparam INT_EXP_WIDTH = `MAX(`CLOG2(MAX_INT_WIDTH), `MAX(EXP_BITS, `CLOG2(EXP_BIAS + MAN_BITS))) + 1;
// shift amount for denormalization
localparam SHAMT_BITS = `CLOG2(INT_MAN_WIDTH+1);
localparam INT_EXP_WIDTH = `MAX(`CLOG2(INT_WIDTH), `MAX(EXP_BITS, `CLOG2(EXP_BIAS + MAN_BITS))) + 1;
localparam FMT_SHIFT_COMPENSATION = INT_MAN_WIDTH - 1 - MAN_BITS;
localparam NUM_FP_STICKY = 2 * INT_MAN_WIDTH - MAN_BITS - 1; // removed mantissa, 1. and R
localparam NUM_INT_STICKY = 2 * INT_MAN_WIDTH - MAX_INT_WIDTH; // removed int and R
localparam NUM_INT_STICKY = 2 * INT_MAN_WIDTH - INT_WIDTH; // removed int and R
// Input processing
@@ -86,8 +80,8 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
.EXP_BITS (EXP_BITS),
.MAN_BITS (MAN_BITS)
) fp_class (
.exp_i (dataa[i][30:23]),
.man_i (dataa[i][22:0]),
.exp_i (dataa[i][INT_WIDTH-2:MAN_BITS]),
.man_i (dataa[i][MAN_BITS-1:0]),
.clss_o (fclass[i])
);
end
@@ -97,27 +91,25 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
wire [NUM_LANES-1:0] input_sign;
for (genvar i = 0; i < NUM_LANES; ++i) begin
wire [INT_MAN_WIDTH-1:0] int_mantissa;
wire [INT_MAN_WIDTH-1:0] fmt_mantissa;
wire fmt_sign = dataa[i][31];
wire int_sign = dataa[i][31] && is_signed;
assign int_mantissa = int_sign ? (-dataa[i]) : dataa[i];
assign fmt_mantissa = INT_MAN_WIDTH'({fclass[i].is_normal, dataa[i][MAN_BITS-1:0]});
wire i2f_sign = dataa[i][INT_WIDTH-1];
wire f2i_sign = dataa[i][INT_WIDTH-1] && is_signed;
wire [INT_MAN_WIDTH-1:0] f2i_mantissa = f2i_sign ? (-dataa[i]) : dataa[i];
wire [INT_MAN_WIDTH-1:0] i2f_mantissa = INT_MAN_WIDTH'({fclass[i].is_normal, dataa[i][MAN_BITS-1:0]});
assign input_exp[i] = {1'b0, dataa[i][MAN_BITS +: EXP_BITS]} + INT_EXP_WIDTH'({1'b0, fclass[i].is_subnormal});
assign input_mant[i] = is_itof ? int_mantissa : fmt_mantissa;
assign input_sign[i] = is_itof ? int_sign : fmt_sign;
assign input_mant[i] = is_itof ? f2i_mantissa : i2f_mantissa;
assign input_sign[i] = is_itof ? f2i_sign : i2f_sign;
end
// Pipeline stage0
wire valid_in_s0;
wire [NUM_LANES-1:0] lane_mask_s0;
wire [TAGW-1:0] tag_in_s0;
wire is_itof_s0;
wire unsigned_s0;
wire [2:0] rnd_mode_s0;
wire valid_in_s0;
wire [NUM_LANES-1:0] lane_mask_s0;
wire [TAGW-1:0] tag_in_s0;
wire is_itof_s0;
wire is_signed_s0;
wire [2:0] rnd_mode_s0;
fclass_t [NUM_LANES-1:0] fclass_s0;
wire [NUM_LANES-1:0] input_sign_s0;
wire [NUM_LANES-1:0] input_sign_s0;
wire [NUM_LANES-1:0][INT_EXP_WIDTH-1:0] fmt_exponent_s0;
wire [NUM_LANES-1:0][INT_MAN_WIDTH-1:0] encoded_mant_s0;
@@ -130,8 +122,8 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
.clk (clk),
.reset (reset),
.enable (~stall),
.data_in ({valid_in, lane_mask, tag_in, is_itof, !is_signed, frm, fclass, input_sign, input_exp, input_mant}),
.data_out ({valid_in_s0, lane_mask_s0, tag_in_s0, is_itof_s0, unsigned_s0, rnd_mode_s0, fclass_s0, input_sign_s0, fmt_exponent_s0, encoded_mant_s0})
.data_in ({valid_in, lane_mask, tag_in, is_itof, is_signed, frm, fclass, input_sign, input_exp, input_mant}),
.data_out ({valid_in_s0, lane_mask_s0, tag_in_s0, is_itof_s0, is_signed_s0, rnd_mode_s0, fclass_s0, input_sign_s0, fmt_exponent_s0, encoded_mant_s0})
);
// Normalization
@@ -159,22 +151,22 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
assign input_mant_n_s0[i] = encoded_mant_s0[i] << renorm_shamt_s0[i];
// Unbias exponent and compensate for shift
wire [INT_EXP_WIDTH-1:0] fp_input_exp_s0 = fmt_exponent_s0[i] + INT_EXP_WIDTH'(FMT_SHIFT_COMPENSATION - EXP_BIAS) - INT_EXP_WIDTH'({1'b0, renorm_shamt_s0[i]});
wire [INT_EXP_WIDTH-1:0] int_input_exp_s0 = INT_EXP_WIDTH'(INT_MAN_WIDTH-1) - INT_EXP_WIDTH'({1'b0, renorm_shamt_s0[i]});
assign input_exp_n_s0[i] = is_itof_s0 ? int_input_exp_s0 : fp_input_exp_s0;
wire [INT_EXP_WIDTH-1:0] i2f_input_exp_s0 = fmt_exponent_s0[i] + INT_EXP_WIDTH'(FMT_SHIFT_COMPENSATION - EXP_BIAS) - INT_EXP_WIDTH'({1'b0, renorm_shamt_s0[i]});
wire [INT_EXP_WIDTH-1:0] f2i_input_exp_s0 = INT_EXP_WIDTH'(INT_MAN_WIDTH-1) - INT_EXP_WIDTH'({1'b0, renorm_shamt_s0[i]});
assign input_exp_n_s0[i] = is_itof_s0 ? f2i_input_exp_s0 : i2f_input_exp_s0;
end
// Pipeline stage1
wire valid_in_s1;
wire [NUM_LANES-1:0] lane_mask_s1;
wire [TAGW-1:0] tag_in_s1;
wire is_itof_s1;
wire unsigned_s1;
wire [2:0] rnd_mode_s1;
wire valid_in_s1;
wire [NUM_LANES-1:0] lane_mask_s1;
wire [TAGW-1:0] tag_in_s1;
wire is_itof_s1;
wire is_signed_s1;
wire [2:0] rnd_mode_s1;
fclass_t [NUM_LANES-1:0] fclass_s1;
wire [NUM_LANES-1:0] input_sign_s1;
wire [NUM_LANES-1:0] mant_is_zero_s1;
wire [NUM_LANES-1:0] input_sign_s1;
wire [NUM_LANES-1:0] mant_is_zero_s1;
wire [NUM_LANES-1:0][INT_MAN_WIDTH-1:0] input_mant_s1;
wire [NUM_LANES-1:0][INT_EXP_WIDTH-1:0] input_exp_s1;
@@ -185,76 +177,49 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
.clk (clk),
.reset (reset),
.enable (~stall),
.data_in ({valid_in_s0, lane_mask_s0, tag_in_s0, is_itof_s0, unsigned_s0, rnd_mode_s0, fclass_s0, input_sign_s0, mant_is_zero_s0, input_mant_n_s0, input_exp_n_s0}),
.data_out ({valid_in_s1, lane_mask_s1, tag_in_s1, is_itof_s1, unsigned_s1, rnd_mode_s1, fclass_s1, input_sign_s1, mant_is_zero_s1, input_mant_s1, input_exp_s1})
.data_in ({valid_in_s0, lane_mask_s0, tag_in_s0, is_itof_s0, is_signed_s0, rnd_mode_s0, fclass_s0, input_sign_s0, mant_is_zero_s0, input_mant_n_s0, input_exp_n_s0}),
.data_out ({valid_in_s1, lane_mask_s1, tag_in_s1, is_itof_s1, is_signed_s1, rnd_mode_s1, fclass_s1, input_sign_s1, mant_is_zero_s1, input_mant_s1, input_exp_s1})
);
// Perform adjustments to mantissa and exponent
wire [NUM_LANES-1:0][2*INT_MAN_WIDTH:0] destination_mant_s1;
wire [NUM_LANES-1:0][INT_EXP_WIDTH-1:0] final_exp_s1;
wire [NUM_LANES-1:0] of_before_round_s1;
wire [NUM_LANES-1:0] of_before_round_s1;
for (genvar i = 0; i < NUM_LANES; ++i) begin
reg [2*INT_MAN_WIDTH:0] preshift_mant_s1; // mantissa before final shift
reg [SHAMT_BITS-1:0] denorm_shamt_s1; // shift amount for denormalization
reg [INT_EXP_WIDTH-1:0] final_exp_tmp_s1; // after eventual adjustments
reg of_before_round_tmp_s1;
wire [INT_EXP_WIDTH-1:0] denorm_shamt = INT_EXP_WIDTH'(INT_WIDTH-1) - input_exp_s1[i];
wire overflow = ($signed(denorm_shamt) <= -$signed(INT_EXP_WIDTH'(!is_signed_s1)));
wire underflow = ($signed(input_exp_s1[i]) < INT_EXP_WIDTH'($signed(-1)));
reg [INT_EXP_WIDTH-1:0] denorm_shamt_q;
always @(*) begin
final_exp_tmp_s1 = input_exp_s1[i] + INT_EXP_WIDTH'(EXP_BIAS); // take exponent as is, only look at lower bits
preshift_mant_s1 = {input_mant_s1[i], 33'b0};
denorm_shamt_s1 = '0;
of_before_round_tmp_s1 = 1'b0;
if (is_itof_s1) begin
if ($signed(input_exp_s1[i]) >= INT_EXP_WIDTH'($signed(2**EXP_BITS-1-EXP_BIAS))) begin
// Overflow or infinities (for proper rounding)
final_exp_tmp_s1 = (2**EXP_BITS-2); // largest normal value
preshift_mant_s1 = ~0; // largest normal value and RS bits set
of_before_round_tmp_s1 = 1'b1;
end else if ($signed(input_exp_s1[i]) < INT_EXP_WIDTH'($signed(-MAN_BITS-EXP_BIAS))) begin
// Limit the shift to retain sticky bits
final_exp_tmp_s1 = '0; // denormal result
denorm_shamt_s1 = (2 + MAN_BITS); // to sticky
end else if ($signed(input_exp_s1[i]) < INT_EXP_WIDTH'($signed(1-EXP_BIAS))) begin
// Denormalize underflowing values
final_exp_tmp_s1 = '0; // denormal result
denorm_shamt_s1 = SHAMT_BITS'(1-EXP_BIAS) - SHAMT_BITS'(input_exp_s1[i]); // adjust right shifting
end
if (overflow) begin
denorm_shamt_q = '0;
end else if (underflow) begin
denorm_shamt_q = INT_WIDTH+1;
end else begin
if ($signed(input_exp_s1[i]) >= $signed(INT_EXP_WIDTH'(MAX_INT_WIDTH-1) + INT_EXP_WIDTH'(unsigned_s1))) begin
// overflow: when converting to unsigned the range is larger by one
of_before_round_tmp_s1 = 1'b1;
end else if ($signed(input_exp_s1[i]) < INT_EXP_WIDTH'($signed(-1))) begin
// underflow
denorm_shamt_s1 = MAX_INT_WIDTH+1; // all bits go to the sticky
end else begin
// By default right shift mantissa to be an integer
denorm_shamt_s1 = SHAMT_BITS'(MAX_INT_WIDTH-1) - SHAMT_BITS'(input_exp_s1[i]);
end
denorm_shamt_q = denorm_shamt;
end
end
assign destination_mant_s1[i] = preshift_mant_s1 >> denorm_shamt_s1;
assign final_exp_s1[i] = final_exp_tmp_s1;
assign of_before_round_s1[i] = of_before_round_tmp_s1;
assign destination_mant_s1[i] = is_itof_s1 ? {input_mant_s1[i], 33'b0} : ({input_mant_s1[i], 33'b0} >> denorm_shamt_q);
assign final_exp_s1[i] = input_exp_s1[i] + INT_EXP_WIDTH'(EXP_BIAS);
assign of_before_round_s1[i] = overflow;
end
// Pipeline stage2
wire valid_in_s2;
wire [NUM_LANES-1:0] lane_mask_s2;
wire [TAGW-1:0] tag_in_s2;
wire is_itof_s2;
wire unsigned_s2;
wire [2:0] rnd_mode_s2;
wire valid_in_s2;
wire [NUM_LANES-1:0] lane_mask_s2;
wire [TAGW-1:0] tag_in_s2;
wire is_itof_s2;
wire is_signed_s2;
wire [2:0] rnd_mode_s2;
fclass_t [NUM_LANES-1:0] fclass_s2;
wire [NUM_LANES-1:0] mant_is_zero_s2;
wire [NUM_LANES-1:0] input_sign_s2;
wire [NUM_LANES-1:0] mant_is_zero_s2;
wire [NUM_LANES-1:0] input_sign_s2;
wire [NUM_LANES-1:0][2*INT_MAN_WIDTH:0] destination_mant_s2;
wire [NUM_LANES-1:0][INT_EXP_WIDTH-1:0] final_exp_s2;
wire [NUM_LANES-1:0] of_before_round_s2;
wire [NUM_LANES-1:0] of_before_round_s2;
VX_pipe_register #(
.DATAW (1 + NUM_LANES + TAGW + 1 + 1 + `INST_FRM_BITS + NUM_LANES * ($bits(fclass_t) + 1 + 1 + (2*INT_MAN_WIDTH+1) + INT_EXP_WIDTH + 1)),
@@ -263,37 +228,37 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
.clk (clk),
.reset (reset),
.enable (~stall),
.data_in ({valid_in_s1, lane_mask_s1, tag_in_s1, is_itof_s1, unsigned_s1, rnd_mode_s1, fclass_s1, mant_is_zero_s1, input_sign_s1, destination_mant_s1, final_exp_s1, of_before_round_s1}),
.data_out ({valid_in_s2, lane_mask_s2, tag_in_s2, is_itof_s2, unsigned_s2, rnd_mode_s2, fclass_s2, mant_is_zero_s2, input_sign_s2, destination_mant_s2, final_exp_s2, of_before_round_s2})
.data_in ({valid_in_s1, lane_mask_s1, tag_in_s1, is_itof_s1, is_signed_s1, rnd_mode_s1, fclass_s1, mant_is_zero_s1, input_sign_s1, destination_mant_s1, final_exp_s1, of_before_round_s1}),
.data_out ({valid_in_s2, lane_mask_s2, tag_in_s2, is_itof_s2, is_signed_s2, rnd_mode_s2, fclass_s2, mant_is_zero_s2, input_sign_s2, destination_mant_s2, final_exp_s2, of_before_round_s2})
);
wire [NUM_LANES-1:0] rounded_sign_s2;
wire [NUM_LANES-1:0][31:0] rounded_abs_s2; // absolute value of result after rounding
wire [NUM_LANES-1:0] int_round_has_sticky_s2;
wire [NUM_LANES-1:0] fp_round_has_sticky_s2;
wire [NUM_LANES-1:0] rounded_sign_s2;
wire [NUM_LANES-1:0][INT_WIDTH-1:0] rounded_abs_s2; // absolute value of result after rounding
wire [NUM_LANES-1:0] f2i_round_has_sticky_s2;
wire [NUM_LANES-1:0] i2f_round_has_sticky_s2;
// Rouding and classification
for (genvar i = 0; i < NUM_LANES; ++i) begin
wire [MAN_BITS-1:0] final_mant_s2; // mantissa after adjustments
wire [MAX_INT_WIDTH-1:0] final_int_s2; // integer shifted in position
wire [1:0] round_sticky_bits_s2;
wire [31:0] fmt_pre_round_abs_s2;
wire [31:0] pre_round_abs_s2;
wire [1:0] int_round_sticky_bits_s2, fp_round_sticky_bits_s2;
wire [MAN_BITS-1:0] final_mant_s2; // mantissa after adjustments
wire [INT_WIDTH-1:0] final_int_s2; // integer shifted in position
wire [1:0] round_sticky_bits_s2;
wire [INT_WIDTH-1:0] fmt_pre_round_abs_s2;
wire [INT_WIDTH-1:0] pre_round_abs_s2;
wire [1:0] f2i_round_sticky_bits_s2, i2f_round_sticky_bits_s2;
// Extract final mantissa and round bit, discard the normal bit (for FP)
assign {final_mant_s2, fp_round_sticky_bits_s2[1]} = destination_mant_s2[i][2*INT_MAN_WIDTH-1 : 2*INT_MAN_WIDTH-1 - (MAN_BITS+1) + 1];
assign {final_int_s2, int_round_sticky_bits_s2[1]} = destination_mant_s2[i][2*INT_MAN_WIDTH : 2*INT_MAN_WIDTH - (MAX_INT_WIDTH+1) + 1];
assign {final_mant_s2, i2f_round_sticky_bits_s2[1]} = destination_mant_s2[i][2*INT_MAN_WIDTH-1 : 2*INT_MAN_WIDTH-1 - (MAN_BITS+1) + 1];
assign {final_int_s2, f2i_round_sticky_bits_s2[1]} = destination_mant_s2[i][2*INT_MAN_WIDTH : 2*INT_MAN_WIDTH - (INT_WIDTH+1) + 1];
// Collapse sticky bits
assign fp_round_sticky_bits_s2[0] = (| destination_mant_s2[i][NUM_FP_STICKY-1:0]);
assign int_round_sticky_bits_s2[0] = (| destination_mant_s2[i][NUM_INT_STICKY-1:0]);
assign fp_round_has_sticky_s2[i] = (| fp_round_sticky_bits_s2);
assign int_round_has_sticky_s2[i] = (| int_round_sticky_bits_s2);
assign i2f_round_sticky_bits_s2[0] = (| destination_mant_s2[i][NUM_FP_STICKY-1:0]);
assign f2i_round_sticky_bits_s2[0] = (| destination_mant_s2[i][NUM_INT_STICKY-1:0]);
assign i2f_round_has_sticky_s2[i] = (| i2f_round_sticky_bits_s2);
assign f2i_round_has_sticky_s2[i] = (| f2i_round_sticky_bits_s2);
// select RS bits for destination operation
assign round_sticky_bits_s2 = is_itof_s2 ? fp_round_sticky_bits_s2 : int_round_sticky_bits_s2;
assign round_sticky_bits_s2 = is_itof_s2 ? i2f_round_sticky_bits_s2 : f2i_round_sticky_bits_s2;
// Pack exponent and mantissa into proper rounding form
assign fmt_pre_round_abs_s2 = {1'b0, final_exp_s2[i][EXP_BITS-1:0], final_mant_s2[MAN_BITS-1:0]};
@@ -322,15 +287,15 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
wire [NUM_LANES-1:0] lane_mask_s3;
wire [TAGW-1:0] tag_in_s3;
wire is_itof_s3;
wire unsigned_s3;
wire is_signed_s3;
fclass_t [NUM_LANES-1:0] fclass_s3;
wire [NUM_LANES-1:0] mant_is_zero_s3;
wire [NUM_LANES-1:0] input_sign_s3;
wire [NUM_LANES-1:0] rounded_sign_s3;
wire [NUM_LANES-1:0][31:0] rounded_abs_s3;
wire [NUM_LANES-1:0][INT_WIDTH-1:0] rounded_abs_s3;
wire [NUM_LANES-1:0] of_before_round_s3;
wire [NUM_LANES-1:0] int_round_has_sticky_s3;
wire [NUM_LANES-1:0] fp_round_has_sticky_s3;
wire [NUM_LANES-1:0] f2i_round_has_sticky_s3;
wire [NUM_LANES-1:0] i2f_round_has_sticky_s3;
VX_pipe_register #(
.DATAW (1 + NUM_LANES + TAGW + 1 + 1 + NUM_LANES * ($bits(fclass_t) + 1 + 1 + 32 + 1 + 1 + 1 + 1)),
@@ -339,105 +304,71 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
.clk (clk),
.reset (reset),
.enable (~stall),
.data_in ({valid_in_s2, lane_mask_s2, tag_in_s2, is_itof_s2, unsigned_s2, fclass_s2, mant_is_zero_s2, input_sign_s2, rounded_abs_s2, rounded_sign_s2, of_before_round_s2, int_round_has_sticky_s2, fp_round_has_sticky_s2}),
.data_out ({valid_in_s3, lane_mask_s3, tag_in_s3, is_itof_s3, unsigned_s3, fclass_s3, mant_is_zero_s3, input_sign_s3, rounded_abs_s3, rounded_sign_s3, of_before_round_s3, int_round_has_sticky_s3, fp_round_has_sticky_s3})
.data_in ({valid_in_s2, lane_mask_s2, tag_in_s2, is_itof_s2, is_signed_s2, fclass_s2, mant_is_zero_s2, input_sign_s2, rounded_abs_s2, rounded_sign_s2, of_before_round_s2, f2i_round_has_sticky_s2, i2f_round_has_sticky_s2}),
.data_out ({valid_in_s3, lane_mask_s3, tag_in_s3, is_itof_s3, is_signed_s3, fclass_s3, mant_is_zero_s3, input_sign_s3, rounded_abs_s3, rounded_sign_s3, of_before_round_s3, f2i_round_has_sticky_s3, i2f_round_has_sticky_s3})
);
wire [NUM_LANES-1:0] of_after_round_s3;
wire [NUM_LANES-1:0] uf_after_round_s3;
wire [NUM_LANES-1:0][31:0] fmt_result_s3;
wire [NUM_LANES-1:0][31:0] rounded_int_res_s3; // after possible inversion
wire [NUM_LANES-1:0][INT_WIDTH-1:0] fmt_result_s3;
wire [NUM_LANES-1:0][INT_WIDTH-1:0] rounded_int_res_s3; // after possible inversion
wire [NUM_LANES-1:0] rounded_int_res_zero_s3; // after rounding
for (genvar i = 0; i < NUM_LANES; ++i) begin
// Assemble regular result, nan box short ones. Int zeroes need to be detected
assign fmt_result_s3[i] = (is_itof_s3 & mant_is_zero_s3[i]) ? 0 : {rounded_sign_s3[i], rounded_abs_s3[i][EXP_BITS+MAN_BITS-1:0]};
// Classification after rounding select by destination format
assign uf_after_round_s3[i] = (rounded_abs_s3[i][EXP_BITS+MAN_BITS-1:MAN_BITS] == 0); // denormal
assign of_after_round_s3[i] = (rounded_abs_s3[i][EXP_BITS+MAN_BITS-1:MAN_BITS] == ~0); // inf exp.
assign fmt_result_s3[i] = mant_is_zero_s3[i] ? 0 : {rounded_sign_s3[i], rounded_abs_s3[i][EXP_BITS+MAN_BITS-1:0]};
// Negative integer result needs to be brought into two's complement
assign rounded_int_res_s3[i] = rounded_sign_s3[i] ? (-rounded_abs_s3[i]) : rounded_abs_s3[i];
assign rounded_int_res_zero_s3[i] = (rounded_int_res_s3[i] == 0);
end
// FP Special case handling
// F2I Special case handling
wire [NUM_LANES-1:0][31:0] fp_special_result_s3;
fflags_t [NUM_LANES-1:0] fp_special_status_s3;
wire [NUM_LANES-1:0] fp_result_is_special_s3;
for (genvar i = 0; i < NUM_LANES; ++i) begin
// Detect special case from source format, I2F casts don't produce a special result
assign fp_result_is_special_s3[i] = ~is_itof_s3 & (fclass_s3[i].is_zero | fclass_s3[i].is_nan);
// Signalling input NaNs raise invalid flag, otherwise no flags set
assign fp_special_status_s3[i] = fclass_s3[i].is_signaling ? {1'b1, 4'h0} : 5'h0; // invalid operation
// Assemble result according to destination format
assign fp_special_result_s3[i] = fclass_s3[i].is_zero ? (32'(input_sign_s3) << 31) // signed zero
: {1'b0, QNAN_EXPONENT, QNAN_MANTISSA}; // qNaN
end
// INT Special case handling
reg [NUM_LANES-1:0][31:0] int_special_result_s3;
fflags_t [NUM_LANES-1:0] int_special_status_s3;
wire [NUM_LANES-1:0] int_result_is_special_s3;
reg [NUM_LANES-1:0][INT_WIDTH-1:0] f2i_special_result_s3;
fflags_t [NUM_LANES-1:0] f2i_special_status_s3;
wire [NUM_LANES-1:0] f2i_result_is_special_s3;
for (genvar i = 0; i < NUM_LANES; ++i) begin
// Assemble result according to destination format
always @(*) begin
if (input_sign_s3[i] && !fclass_s3[i].is_nan) begin
int_special_result_s3[i][30:0] = '0; // alone yields 2**(31)-1
int_special_result_s3[i][31] = ~unsigned_s3; // for unsigned casts yields 2**31
f2i_special_result_s3[i][INT_WIDTH-2:0] = '0; // alone yields 2**(31)-1
f2i_special_result_s3[i][INT_WIDTH-1] = is_signed_s3; // for unsigned casts yields 2**31
end else begin
int_special_result_s3[i][30:0] = 2**(31) - 1; // alone yields 2**(31)-1
int_special_result_s3[i][31] = unsigned_s3; // for unsigned casts yields 2**31
f2i_special_result_s3[i][INT_WIDTH-2:0] = 2**(INT_WIDTH-1) - 1; // alone yields 2**(31)-1
f2i_special_result_s3[i][INT_WIDTH-1] = ~is_signed_s3; // for unsigned casts yields 2**31
end
end
// Detect special case from source format (inf, nan, overflow, nan-boxing or negative unsigned)
assign int_result_is_special_s3[i] = fclass_s3[i].is_nan
assign f2i_result_is_special_s3[i] = fclass_s3[i].is_nan
| fclass_s3[i].is_inf
| of_before_round_s3[i]
| (input_sign_s3[i] & unsigned_s3 & ~rounded_int_res_zero_s3[i]);
| (input_sign_s3[i] & ~is_signed_s3 & ~rounded_int_res_zero_s3[i]);
// All integer special cases are invalid
assign int_special_status_s3[i] = {1'b1, 4'h0};
assign f2i_special_status_s3[i] = {1'b1, 4'h0};
end
// Result selection and Output handshake
fflags_t [NUM_LANES-1:0] tmp_fflags_s3;
wire [NUM_LANES-1:0][31:0] tmp_result_s3;
wire [NUM_LANES-1:0][INT_WIDTH-1:0] tmp_result_s3;
for (genvar i = 0; i < NUM_LANES; ++i) begin
fflags_t fp_regular_status_s3, int_regular_status_s3;
fflags_t fp_status_s3, int_status_s3;
wire [31:0] fp_result_s3, int_result_s3;
for (genvar i = 0; i < NUM_LANES; ++i) begin
fflags_t i2f_regular_status_s3, f2i_regular_status_s3;
fflags_t i2f_status_s3, f2i_status_s3;
wire inexact_s3 = is_itof_s3 ? fp_round_has_sticky_s3[i] // overflow is invalid in i2f;
: (fp_round_has_sticky_s3[i] || (~fclass_s3[i].is_inf && (of_before_round_s3[i] || of_after_round_s3[i])));
assign fp_regular_status_s3.NV = is_itof_s3 & (of_before_round_s3[i] | of_after_round_s3[i]); // overflow is invalid for I2F casts
assign fp_regular_status_s3.DZ = 1'b0; // no divisions
assign fp_regular_status_s3.OF = ~is_itof_s3 & (~fclass_s3[i].is_inf & (of_before_round_s3[i] | of_after_round_s3[i])); // inf casts no OF
assign fp_regular_status_s3.UF = uf_after_round_s3[i] & inexact_s3;
assign fp_regular_status_s3.NX = inexact_s3;
assign i2f_regular_status_s3 = {4'h0, i2f_round_has_sticky_s3[i]};
assign f2i_regular_status_s3 = {4'h0, f2i_round_has_sticky_s3[i]};
assign int_regular_status_s3 = int_round_has_sticky_s3[i] ? {4'h0, 1'b1} : 5'h0;
assign i2f_status_s3 = i2f_regular_status_s3;
assign f2i_status_s3 = f2i_result_is_special_s3[i] ? f2i_special_status_s3[i] : f2i_regular_status_s3;
assign fp_result_s3 = fp_result_is_special_s3[i] ? fp_special_result_s3[i] : fmt_result_s3[i];
assign int_result_s3 = int_result_is_special_s3[i] ? int_special_result_s3[i] : rounded_int_res_s3[i];
wire [INT_WIDTH-1:0] i2f_result_s3 = fmt_result_s3[i];
wire [INT_WIDTH-1:0] f2i_result_s3 = f2i_result_is_special_s3[i] ? f2i_special_result_s3[i] : rounded_int_res_s3[i];
assign fp_status_s3 = fp_result_is_special_s3[i] ? fp_special_status_s3[i] : fp_regular_status_s3;
assign int_status_s3 = int_result_is_special_s3[i] ? int_special_status_s3[i] : int_regular_status_s3;
// Select output depending on special case detection
assign tmp_result_s3[i] = is_itof_s3 ? fp_result_s3 : int_result_s3;
assign tmp_fflags_s3[i] = is_itof_s3 ? fp_status_s3 : int_status_s3;
assign tmp_result_s3[i] = is_itof_s3 ? i2f_result_s3 : f2i_result_s3;
assign tmp_fflags_s3[i] = is_itof_s3 ? i2f_status_s3 : f2i_status_s3;
end
assign stall = ~ready_out && valid_out;
@@ -457,7 +388,6 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
);
assign ready_in = ~stall;
assign has_fflags = 1'b1;
endmodule

View File

@@ -16,7 +16,7 @@
`include "VX_define.vh"
`ifndef SYNTHESIS
`ifdef SV_DPI
`include "float_dpi.vh"
`endif

View File

@@ -54,7 +54,6 @@ module VX_fpu_rounding #(
2'b01: round_up = 1'b0; // < ulp/2 away, round down
2'b10: round_up = abs_value_i[0]; // = ulp/2 away, round towards even result
2'b11: round_up = 1'b1; // > ulp/2 away, round up
default: round_up = 1'bx;
endcase
`INST_FRM_RTZ: round_up = 1'b0; // always round down
`INST_FRM_RDN: round_up = (| round_sticky_bits_i) & sign_i; // to 0 if +, away if -

View File

@@ -1,259 +0,0 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
`include "VX_fpu_define.vh"
module VX_fpu_unit import VX_fpu_pkg::*; #(
parameter CORE_ID = 0
) (
input wire clk,
input wire reset,
VX_dispatch_if.slave dispatch_if [`ISSUE_WIDTH],
VX_fpu_to_csr_if.master fpu_to_csr_if[`NUM_FPU_BLOCKS],
VX_commit_if.master commit_if [`ISSUE_WIDTH]
);
`UNUSED_PARAM (CORE_ID)
localparam BLOCK_SIZE = `NUM_FPU_BLOCKS;
localparam NUM_LANES = `NUM_FPU_LANES;
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
localparam PID_WIDTH = `UP(PID_BITS);
localparam TAG_WIDTH = `LOG2UP(`FPU_REQ_QUEUE_SIZE);
localparam PARTIAL_BW = (BLOCK_SIZE != `ISSUE_WIDTH) || (NUM_LANES != `NUM_THREADS);
VX_execute_if #(
.NUM_LANES (NUM_LANES)
) execute_if[BLOCK_SIZE]();
`RESET_RELAY (dispatch_reset, reset);
VX_dispatch_unit #(
.BLOCK_SIZE (BLOCK_SIZE),
.NUM_LANES (NUM_LANES),
.OUT_REG (PARTIAL_BW ? 1 : 0)
) dispatch_unit (
.clk (clk),
.reset (dispatch_reset),
.dispatch_if(dispatch_if),
.execute_if (execute_if)
);
VX_commit_if #(
.NUM_LANES (NUM_LANES)
) commit_block_if[BLOCK_SIZE]();
for (genvar block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin
`UNUSED_VAR (execute_if[block_idx].data.tid)
`UNUSED_VAR (execute_if[block_idx].data.wb)
`UNUSED_VAR (execute_if[block_idx].data.use_PC)
`UNUSED_VAR (execute_if[block_idx].data.use_imm)
// Store request info
wire fpu_req_valid, fpu_req_ready;
wire fpu_rsp_valid, fpu_rsp_ready;
wire [NUM_LANES-1:0][`XLEN-1:0] fpu_rsp_result;
fflags_t fpu_rsp_fflags;
wire fpu_rsp_has_fflags;
wire [`UUID_WIDTH-1:0] fpu_rsp_uuid;
wire [`NW_WIDTH-1:0] fpu_rsp_wid;
wire [NUM_LANES-1:0] fpu_rsp_tmask;
wire [`XLEN-1:0] fpu_rsp_PC;
wire [`NR_BITS-1:0] fpu_rsp_rd;
wire [PID_WIDTH-1:0] fpu_rsp_pid;
wire fpu_rsp_sop;
wire fpu_rsp_eop;
wire [TAG_WIDTH-1:0] fpu_req_tag, fpu_rsp_tag;
wire mdata_full;
wire [`INST_FMT_BITS-1:0] fpu_fmt = execute_if[block_idx].data.imm[`INST_FMT_BITS-1:0];
wire [`INST_FRM_BITS-1:0] fpu_frm = execute_if[block_idx].data.op_mod[`INST_FRM_BITS-1:0];
wire execute_fire = execute_if[block_idx].valid && execute_if[block_idx].ready;
wire fpu_rsp_fire = fpu_rsp_valid && fpu_rsp_ready;
VX_index_buffer #(
.DATAW (`UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS + PID_WIDTH + 1 + 1),
.SIZE (`FPU_REQ_QUEUE_SIZE)
) tag_store (
.clk (clk),
.reset (reset),
.acquire_en (execute_fire),
.write_addr (fpu_req_tag),
.write_data ({execute_if[block_idx].data.uuid, execute_if[block_idx].data.wid, execute_if[block_idx].data.tmask, execute_if[block_idx].data.PC, execute_if[block_idx].data.rd, execute_if[block_idx].data.pid, execute_if[block_idx].data.sop, execute_if[block_idx].data.eop}),
.read_data ({fpu_rsp_uuid, fpu_rsp_wid, fpu_rsp_tmask, fpu_rsp_PC, fpu_rsp_rd, fpu_rsp_pid, fpu_rsp_sop, fpu_rsp_eop}),
.read_addr (fpu_rsp_tag),
.release_en (fpu_rsp_fire),
.full (mdata_full),
`UNUSED_PIN (empty)
);
// resolve dynamic FRM from CSR
wire [`INST_FRM_BITS-1:0] fpu_req_frm;
`ASSIGN_BLOCKED_WID (fpu_to_csr_if[block_idx].read_wid, execute_if[block_idx].data.wid, block_idx, `NUM_FPU_BLOCKS)
assign fpu_req_frm = (execute_if[block_idx].data.op_type != `INST_FPU_MISC
&& fpu_frm == `INST_FRM_DYN) ? fpu_to_csr_if[block_idx].read_frm : fpu_frm;
// submit FPU request
assign fpu_req_valid = execute_if[block_idx].valid && ~mdata_full;
assign execute_if[block_idx].ready = fpu_req_ready && ~mdata_full;
`RESET_RELAY (fpu_reset, reset);
`ifdef FPU_DPI
VX_fpu_dpi #(
.NUM_LANES (NUM_LANES),
.TAGW (TAG_WIDTH),
.OUT_REG (PARTIAL_BW ? 1 : 3)
) fpu_dpi (
.clk (clk),
.reset (fpu_reset),
.valid_in (fpu_req_valid),
.op_type (execute_if[block_idx].data.op_type),
.lane_mask (execute_if[block_idx].data.tmask),
.fmt (fpu_fmt),
.frm (fpu_req_frm),
.dataa (execute_if[block_idx].data.rs1_data),
.datab (execute_if[block_idx].data.rs2_data),
.datac (execute_if[block_idx].data.rs3_data),
.tag_in (fpu_req_tag),
.ready_in (fpu_req_ready),
.valid_out (fpu_rsp_valid),
.result (fpu_rsp_result),
.has_fflags (fpu_rsp_has_fflags),
.fflags (fpu_rsp_fflags),
.tag_out (fpu_rsp_tag),
.ready_out (fpu_rsp_ready)
);
`elsif FPU_FPNEW
VX_fpu_fpnew #(
.NUM_LANES (NUM_LANES),
.TAGW (TAG_WIDTH),
.OUT_REG (PARTIAL_BW ? 1 : 3)
) fpu_fpnew (
.clk (clk),
.reset (fpu_reset),
.valid_in (fpu_req_valid),
.op_type (execute_if[block_idx].data.op_type),
.lane_mask (execute_if[block_idx].data.tmask),
.fmt (fpu_fmt),
.frm (fpu_req_frm),
.dataa (execute_if[block_idx].data.rs1_data),
.datab (execute_if[block_idx].data.rs2_data),
.datac (execute_if[block_idx].data.rs3_data),
.tag_in (fpu_req_tag),
.ready_in (fpu_req_ready),
.valid_out (fpu_rsp_valid),
.result (fpu_rsp_result),
.has_fflags (fpu_rsp_has_fflags),
.fflags (fpu_rsp_fflags),
.tag_out (fpu_rsp_tag),
.ready_out (fpu_rsp_ready)
);
`elsif FPU_DSP
VX_fpu_dsp #(
.NUM_LANES (NUM_LANES),
.TAGW (TAG_WIDTH),
.OUT_REG (PARTIAL_BW ? 1 : 3)
) fpu_dsp (
.clk (clk),
.reset (fpu_reset),
.valid_in (fpu_req_valid),
.lane_mask (execute_if[block_idx].data.tmask),
.op_type (execute_if[block_idx].data.op_type),
.fmt (fpu_fmt),
.frm (fpu_req_frm),
.dataa (execute_if[block_idx].data.rs1_data),
.datab (execute_if[block_idx].data.rs2_data),
.datac (execute_if[block_idx].data.rs3_data),
.tag_in (fpu_req_tag),
.ready_in (fpu_req_ready),
.valid_out (fpu_rsp_valid),
.result (fpu_rsp_result),
.has_fflags (fpu_rsp_has_fflags),
.fflags (fpu_rsp_fflags),
.tag_out (fpu_rsp_tag),
.ready_out (fpu_rsp_ready)
);
`endif
// handle FPU response
fflags_t fpu_rsp_fflags_q;
if (PID_BITS != 0) begin
fflags_t fpu_rsp_fflags_r;
always @(posedge clk) begin
if (reset) begin
fpu_rsp_fflags_r <= '0;
end else if (fpu_rsp_fire) begin
fpu_rsp_fflags_r <= fpu_rsp_eop ? '0 : (fpu_rsp_fflags_r | fpu_rsp_fflags);
end
end
assign fpu_rsp_fflags_q = fpu_rsp_fflags_r | fpu_rsp_fflags;
end else begin
assign fpu_rsp_fflags_q = fpu_rsp_fflags;
end
assign fpu_to_csr_if[block_idx].write_enable = fpu_rsp_fire && fpu_rsp_eop && fpu_rsp_has_fflags;
`ASSIGN_BLOCKED_WID (fpu_to_csr_if[block_idx].write_wid, fpu_rsp_wid, block_idx, `NUM_FPU_BLOCKS)
assign fpu_to_csr_if[block_idx].write_fflags = fpu_rsp_fflags_q;
// send response
VX_elastic_buffer #(
.DATAW (`UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS + (NUM_LANES * `XLEN) + PID_WIDTH + 1 + 1),
.SIZE (0)
) rsp_buf (
.clk (clk),
.reset (reset),
.valid_in (fpu_rsp_valid),
.ready_in (fpu_rsp_ready),
.data_in ({fpu_rsp_uuid, fpu_rsp_wid, fpu_rsp_tmask, fpu_rsp_PC, fpu_rsp_rd, fpu_rsp_result, fpu_rsp_pid, fpu_rsp_sop, fpu_rsp_eop}),
.data_out ({commit_block_if[block_idx].data.uuid, commit_block_if[block_idx].data.wid, commit_block_if[block_idx].data.tmask, commit_block_if[block_idx].data.PC, commit_block_if[block_idx].data.rd, commit_block_if[block_idx].data.data, commit_block_if[block_idx].data.pid, commit_block_if[block_idx].data.sop, commit_block_if[block_idx].data.eop}),
.valid_out (commit_block_if[block_idx].valid),
.ready_out (commit_block_if[block_idx].ready)
);
assign commit_block_if[block_idx].data.wb = 1'b1;
end
`RESET_RELAY (commit_reset, reset);
VX_gather_unit #(
.BLOCK_SIZE (BLOCK_SIZE),
.NUM_LANES (NUM_LANES),
.OUT_REG (PARTIAL_BW ? 3 : 0)
) gather_unit (
.clk (clk),
.reset (commit_reset),
.commit_in_if (commit_block_if),
.commit_out_if (commit_if)
);
endmodule