Vortex 2.0 changes:
+ Microarchitecture optimizations + 64-bit support + Xilinx FPGA support + LLVM-16 support + Refactoring and quality control fixes
This commit is contained in:
45
hw/rtl/fpu/VX_fpu_class.sv
Normal file
45
hw/rtl/fpu/VX_fpu_class.sv
Normal file
@@ -0,0 +1,45 @@
|
||||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
|
||||
`include "VX_fpu_define.vh"
|
||||
|
||||
`ifdef FPU_DSP
|
||||
|
||||
module VX_fpu_class import VX_fpu_pkg::*; #(
|
||||
parameter MAN_BITS = 23,
|
||||
parameter EXP_BITS = 8
|
||||
) (
|
||||
input [EXP_BITS-1:0] exp_i,
|
||||
input [MAN_BITS-1:0] man_i,
|
||||
output fclass_t clss_o
|
||||
);
|
||||
wire is_normal = (exp_i != '0) && (exp_i != '1);
|
||||
wire is_zero = (exp_i == '0) && (man_i == '0);
|
||||
wire is_subnormal = (exp_i == '0) && (man_i != '0);
|
||||
wire is_inf = (exp_i == '1) && (man_i == '0);
|
||||
wire is_nan = (exp_i == '1) && (man_i != '0);
|
||||
wire is_signaling = is_nan && ~man_i[MAN_BITS-1];
|
||||
wire is_quiet = is_nan && ~is_signaling;
|
||||
|
||||
assign clss_o.is_normal = is_normal;
|
||||
assign clss_o.is_zero = is_zero;
|
||||
assign clss_o.is_subnormal = is_subnormal;
|
||||
assign clss_o.is_inf = is_inf;
|
||||
assign clss_o.is_nan = is_nan;
|
||||
assign clss_o.is_quiet = is_quiet;
|
||||
assign clss_o.is_signaling = is_signaling;
|
||||
|
||||
endmodule
|
||||
`endif
|
||||
|
||||
464
hw/rtl/fpu/VX_fpu_cvt.sv
Normal file
464
hw/rtl/fpu/VX_fpu_cvt.sv
Normal file
@@ -0,0 +1,464 @@
|
||||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
`include "VX_fpu_define.vh"
|
||||
|
||||
`ifdef FPU_DSP
|
||||
|
||||
/// Modified port of cast module from fpnew Libray
|
||||
/// reference: https://github.com/pulp-platform/fpnew
|
||||
|
||||
module VX_fpu_cvt import VX_fpu_pkg::*; #(
|
||||
parameter NUM_LANES = 1,
|
||||
parameter TAGW = 1
|
||||
) (
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
output wire ready_in,
|
||||
input wire valid_in,
|
||||
|
||||
input wire [NUM_LANES-1:0] lane_mask,
|
||||
|
||||
input wire [TAGW-1:0] tag_in,
|
||||
|
||||
input wire [`INST_FRM_BITS-1:0] frm,
|
||||
|
||||
input wire is_itof,
|
||||
input wire is_signed,
|
||||
|
||||
input wire [NUM_LANES-1:0][31:0] dataa,
|
||||
output wire [NUM_LANES-1:0][31:0] result,
|
||||
|
||||
output wire has_fflags,
|
||||
output wire [`FP_FLAGS_BITS-1:0] fflags,
|
||||
|
||||
output wire [TAGW-1:0] tag_out,
|
||||
|
||||
input wire ready_out,
|
||||
output wire valid_out
|
||||
);
|
||||
// Constants
|
||||
|
||||
localparam MAN_BITS = 23;
|
||||
localparam EXP_BITS = 8;
|
||||
localparam EXP_BIAS = 2**(EXP_BITS-1)-1;
|
||||
|
||||
localparam logic [EXP_BITS-1:0] QNAN_EXPONENT = 2**EXP_BITS-1;
|
||||
localparam logic [MAN_BITS-1:0] QNAN_MANTISSA = 2**(MAN_BITS-1);
|
||||
|
||||
// Use 32-bit integer
|
||||
localparam MAX_INT_WIDTH = 32;
|
||||
|
||||
// The internal mantissa includes normal bit or an entire integer
|
||||
localparam INT_MAN_WIDTH = `MAX(MAN_BITS + 1, MAX_INT_WIDTH);
|
||||
|
||||
// The lower 2p+3 bits of the internal FMA result will be needed for leading-zero detection
|
||||
localparam LZC_RESULT_WIDTH = `CLOG2(INT_MAN_WIDTH);
|
||||
|
||||
// The internal exponent must be able to represent the smallest denormal input value as signed
|
||||
// or the number of bits in an integer
|
||||
localparam INT_EXP_WIDTH = `MAX(`CLOG2(MAX_INT_WIDTH), `MAX(EXP_BITS, `CLOG2(EXP_BIAS + MAN_BITS))) + 1;
|
||||
|
||||
// shift amount for denormalization
|
||||
localparam SHAMT_BITS = `CLOG2(INT_MAN_WIDTH+1);
|
||||
|
||||
localparam FMT_SHIFT_COMPENSATION = INT_MAN_WIDTH - 1 - MAN_BITS;
|
||||
localparam NUM_FP_STICKY = 2 * INT_MAN_WIDTH - MAN_BITS - 1; // removed mantissa, 1. and R
|
||||
localparam NUM_INT_STICKY = 2 * INT_MAN_WIDTH - MAX_INT_WIDTH; // removed int and R
|
||||
|
||||
// Input processing
|
||||
|
||||
fclass_t [NUM_LANES-1:0] fclass;
|
||||
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
VX_fpu_class #(
|
||||
.EXP_BITS (EXP_BITS),
|
||||
.MAN_BITS (MAN_BITS)
|
||||
) fp_class (
|
||||
.exp_i (dataa[i][30:23]),
|
||||
.man_i (dataa[i][22:0]),
|
||||
.clss_o (fclass[i])
|
||||
);
|
||||
end
|
||||
|
||||
wire [NUM_LANES-1:0][INT_MAN_WIDTH-1:0] input_mant;
|
||||
wire [NUM_LANES-1:0][INT_EXP_WIDTH-1:0] input_exp;
|
||||
wire [NUM_LANES-1:0] input_sign;
|
||||
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
wire [INT_MAN_WIDTH-1:0] int_mantissa;
|
||||
wire [INT_MAN_WIDTH-1:0] fmt_mantissa;
|
||||
wire fmt_sign = dataa[i][31];
|
||||
wire int_sign = dataa[i][31] && is_signed;
|
||||
assign int_mantissa = int_sign ? (-dataa[i]) : dataa[i];
|
||||
assign fmt_mantissa = INT_MAN_WIDTH'({fclass[i].is_normal, dataa[i][MAN_BITS-1:0]});
|
||||
assign input_exp[i] = {1'b0, dataa[i][MAN_BITS +: EXP_BITS]} + INT_EXP_WIDTH'({1'b0, fclass[i].is_subnormal});
|
||||
assign input_mant[i] = is_itof ? int_mantissa : fmt_mantissa;
|
||||
assign input_sign[i] = is_itof ? int_sign : fmt_sign;
|
||||
end
|
||||
|
||||
// Pipeline stage0
|
||||
|
||||
wire valid_in_s0;
|
||||
wire [NUM_LANES-1:0] lane_mask_s0;
|
||||
wire [TAGW-1:0] tag_in_s0;
|
||||
wire is_itof_s0;
|
||||
wire unsigned_s0;
|
||||
wire [2:0] rnd_mode_s0;
|
||||
fclass_t [NUM_LANES-1:0] fclass_s0;
|
||||
wire [NUM_LANES-1:0] input_sign_s0;
|
||||
wire [NUM_LANES-1:0][INT_EXP_WIDTH-1:0] fmt_exponent_s0;
|
||||
wire [NUM_LANES-1:0][INT_MAN_WIDTH-1:0] encoded_mant_s0;
|
||||
|
||||
wire stall;
|
||||
|
||||
VX_pipe_register #(
|
||||
.DATAW (1 + NUM_LANES + TAGW + 1 + `INST_FRM_BITS + 1 + NUM_LANES * ($bits(fclass_t) + 1 + INT_EXP_WIDTH + INT_MAN_WIDTH)),
|
||||
.RESETW (1)
|
||||
) pipe_reg0 (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.enable (~stall),
|
||||
.data_in ({valid_in, lane_mask, tag_in, is_itof, !is_signed, frm, fclass, input_sign, input_exp, input_mant}),
|
||||
.data_out ({valid_in_s0, lane_mask_s0, tag_in_s0, is_itof_s0, unsigned_s0, rnd_mode_s0, fclass_s0, input_sign_s0, fmt_exponent_s0, encoded_mant_s0})
|
||||
);
|
||||
|
||||
// Normalization
|
||||
|
||||
wire [NUM_LANES-1:0][LZC_RESULT_WIDTH-1:0] renorm_shamt_s0; // renormalization shift amount
|
||||
wire [NUM_LANES-1:0] mant_is_zero_s0; // for integer zeroes
|
||||
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
wire mant_is_nonzero_s0;
|
||||
VX_lzc #(
|
||||
.N (INT_MAN_WIDTH)
|
||||
) lzc (
|
||||
.data_in (encoded_mant_s0[i]),
|
||||
.data_out (renorm_shamt_s0[i]),
|
||||
.valid_out (mant_is_nonzero_s0)
|
||||
);
|
||||
assign mant_is_zero_s0[i] = ~mant_is_nonzero_s0;
|
||||
end
|
||||
|
||||
wire [NUM_LANES-1:0][INT_MAN_WIDTH-1:0] input_mant_n_s0; // normalized input mantissa
|
||||
wire [NUM_LANES-1:0][INT_EXP_WIDTH-1:0] input_exp_n_s0; // unbiased true exponent
|
||||
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
// Realign input mantissa, append zeroes if destination is wider
|
||||
assign input_mant_n_s0[i] = encoded_mant_s0[i] << renorm_shamt_s0[i];
|
||||
|
||||
// Unbias exponent and compensate for shift
|
||||
wire [INT_EXP_WIDTH-1:0] fp_input_exp_s0 = fmt_exponent_s0[i] + INT_EXP_WIDTH'(FMT_SHIFT_COMPENSATION - EXP_BIAS) - INT_EXP_WIDTH'({1'b0, renorm_shamt_s0[i]});
|
||||
wire [INT_EXP_WIDTH-1:0] int_input_exp_s0 = INT_EXP_WIDTH'(INT_MAN_WIDTH-1) - INT_EXP_WIDTH'({1'b0, renorm_shamt_s0[i]});
|
||||
assign input_exp_n_s0[i] = is_itof_s0 ? int_input_exp_s0 : fp_input_exp_s0;
|
||||
end
|
||||
|
||||
// Pipeline stage1
|
||||
|
||||
wire valid_in_s1;
|
||||
wire [NUM_LANES-1:0] lane_mask_s1;
|
||||
wire [TAGW-1:0] tag_in_s1;
|
||||
wire is_itof_s1;
|
||||
wire unsigned_s1;
|
||||
wire [2:0] rnd_mode_s1;
|
||||
fclass_t [NUM_LANES-1:0] fclass_s1;
|
||||
wire [NUM_LANES-1:0] input_sign_s1;
|
||||
wire [NUM_LANES-1:0] mant_is_zero_s1;
|
||||
wire [NUM_LANES-1:0][INT_MAN_WIDTH-1:0] input_mant_s1;
|
||||
wire [NUM_LANES-1:0][INT_EXP_WIDTH-1:0] input_exp_s1;
|
||||
|
||||
VX_pipe_register #(
|
||||
.DATAW (1 + NUM_LANES + TAGW + 1 + `INST_FRM_BITS + 1 + NUM_LANES * ($bits(fclass_t) + 1 + 1 + INT_MAN_WIDTH + INT_EXP_WIDTH)),
|
||||
.RESETW (1)
|
||||
) pipe_reg1 (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.enable (~stall),
|
||||
.data_in ({valid_in_s0, lane_mask_s0, tag_in_s0, is_itof_s0, unsigned_s0, rnd_mode_s0, fclass_s0, input_sign_s0, mant_is_zero_s0, input_mant_n_s0, input_exp_n_s0}),
|
||||
.data_out ({valid_in_s1, lane_mask_s1, tag_in_s1, is_itof_s1, unsigned_s1, rnd_mode_s1, fclass_s1, input_sign_s1, mant_is_zero_s1, input_mant_s1, input_exp_s1})
|
||||
);
|
||||
|
||||
// Perform adjustments to mantissa and exponent
|
||||
|
||||
wire [NUM_LANES-1:0][2*INT_MAN_WIDTH:0] destination_mant_s1;
|
||||
wire [NUM_LANES-1:0][INT_EXP_WIDTH-1:0] final_exp_s1;
|
||||
wire [NUM_LANES-1:0] of_before_round_s1;
|
||||
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
reg [2*INT_MAN_WIDTH:0] preshift_mant_s1; // mantissa before final shift
|
||||
reg [SHAMT_BITS-1:0] denorm_shamt_s1; // shift amount for denormalization
|
||||
reg [INT_EXP_WIDTH-1:0] final_exp_tmp_s1; // after eventual adjustments
|
||||
reg of_before_round_tmp_s1;
|
||||
|
||||
always @(*) begin
|
||||
final_exp_tmp_s1 = input_exp_s1[i] + INT_EXP_WIDTH'(EXP_BIAS); // take exponent as is, only look at lower bits
|
||||
preshift_mant_s1 = {input_mant_s1[i], 33'b0};
|
||||
denorm_shamt_s1 = '0;
|
||||
of_before_round_tmp_s1 = 1'b0;
|
||||
|
||||
if (is_itof_s1) begin
|
||||
if ($signed(input_exp_s1[i]) >= INT_EXP_WIDTH'($signed(2**EXP_BITS-1-EXP_BIAS))) begin
|
||||
// Overflow or infinities (for proper rounding)
|
||||
final_exp_tmp_s1 = (2**EXP_BITS-2); // largest normal value
|
||||
preshift_mant_s1 = ~0; // largest normal value and RS bits set
|
||||
of_before_round_tmp_s1 = 1'b1;
|
||||
end else if ($signed(input_exp_s1[i]) < INT_EXP_WIDTH'($signed(-MAN_BITS-EXP_BIAS))) begin
|
||||
// Limit the shift to retain sticky bits
|
||||
final_exp_tmp_s1 = '0; // denormal result
|
||||
denorm_shamt_s1 = (2 + MAN_BITS); // to sticky
|
||||
end else if ($signed(input_exp_s1[i]) < INT_EXP_WIDTH'($signed(1-EXP_BIAS))) begin
|
||||
// Denormalize underflowing values
|
||||
final_exp_tmp_s1 = '0; // denormal result
|
||||
denorm_shamt_s1 = SHAMT_BITS'(1-EXP_BIAS) - SHAMT_BITS'(input_exp_s1[i]); // adjust right shifting
|
||||
end
|
||||
end else begin
|
||||
if ($signed(input_exp_s1[i]) >= $signed(INT_EXP_WIDTH'(MAX_INT_WIDTH-1) + INT_EXP_WIDTH'(unsigned_s1))) begin
|
||||
// overflow: when converting to unsigned the range is larger by one
|
||||
of_before_round_tmp_s1 = 1'b1;
|
||||
end else if ($signed(input_exp_s1[i]) < INT_EXP_WIDTH'($signed(-1))) begin
|
||||
// underflow
|
||||
denorm_shamt_s1 = MAX_INT_WIDTH+1; // all bits go to the sticky
|
||||
end else begin
|
||||
// By default right shift mantissa to be an integer
|
||||
denorm_shamt_s1 = SHAMT_BITS'(MAX_INT_WIDTH-1) - SHAMT_BITS'(input_exp_s1[i]);
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
assign destination_mant_s1[i] = preshift_mant_s1 >> denorm_shamt_s1;
|
||||
assign final_exp_s1[i] = final_exp_tmp_s1;
|
||||
assign of_before_round_s1[i] = of_before_round_tmp_s1;
|
||||
end
|
||||
|
||||
// Pipeline stage2
|
||||
|
||||
wire valid_in_s2;
|
||||
wire [NUM_LANES-1:0] lane_mask_s2;
|
||||
wire [TAGW-1:0] tag_in_s2;
|
||||
wire is_itof_s2;
|
||||
wire unsigned_s2;
|
||||
wire [2:0] rnd_mode_s2;
|
||||
fclass_t [NUM_LANES-1:0] fclass_s2;
|
||||
wire [NUM_LANES-1:0] mant_is_zero_s2;
|
||||
wire [NUM_LANES-1:0] input_sign_s2;
|
||||
wire [NUM_LANES-1:0][2*INT_MAN_WIDTH:0] destination_mant_s2;
|
||||
wire [NUM_LANES-1:0][INT_EXP_WIDTH-1:0] final_exp_s2;
|
||||
wire [NUM_LANES-1:0] of_before_round_s2;
|
||||
|
||||
VX_pipe_register #(
|
||||
.DATAW (1 + NUM_LANES + TAGW + 1 + 1 + `INST_FRM_BITS + NUM_LANES * ($bits(fclass_t) + 1 + 1 + (2*INT_MAN_WIDTH+1) + INT_EXP_WIDTH + 1)),
|
||||
.RESETW (1)
|
||||
) pipe_reg2 (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.enable (~stall),
|
||||
.data_in ({valid_in_s1, lane_mask_s1, tag_in_s1, is_itof_s1, unsigned_s1, rnd_mode_s1, fclass_s1, mant_is_zero_s1, input_sign_s1, destination_mant_s1, final_exp_s1, of_before_round_s1}),
|
||||
.data_out ({valid_in_s2, lane_mask_s2, tag_in_s2, is_itof_s2, unsigned_s2, rnd_mode_s2, fclass_s2, mant_is_zero_s2, input_sign_s2, destination_mant_s2, final_exp_s2, of_before_round_s2})
|
||||
);
|
||||
|
||||
wire [NUM_LANES-1:0] rounded_sign_s2;
|
||||
wire [NUM_LANES-1:0][31:0] rounded_abs_s2; // absolute value of result after rounding
|
||||
wire [NUM_LANES-1:0] int_round_has_sticky_s2;
|
||||
wire [NUM_LANES-1:0] fp_round_has_sticky_s2;
|
||||
|
||||
// Rouding and classification
|
||||
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
wire [MAN_BITS-1:0] final_mant_s2; // mantissa after adjustments
|
||||
wire [MAX_INT_WIDTH-1:0] final_int_s2; // integer shifted in position
|
||||
wire [1:0] round_sticky_bits_s2;
|
||||
wire [31:0] fmt_pre_round_abs_s2;
|
||||
wire [31:0] pre_round_abs_s2;
|
||||
wire [1:0] int_round_sticky_bits_s2, fp_round_sticky_bits_s2;
|
||||
|
||||
// Extract final mantissa and round bit, discard the normal bit (for FP)
|
||||
assign {final_mant_s2, fp_round_sticky_bits_s2[1]} = destination_mant_s2[i][2*INT_MAN_WIDTH-1 : 2*INT_MAN_WIDTH-1 - (MAN_BITS+1) + 1];
|
||||
assign {final_int_s2, int_round_sticky_bits_s2[1]} = destination_mant_s2[i][2*INT_MAN_WIDTH : 2*INT_MAN_WIDTH - (MAX_INT_WIDTH+1) + 1];
|
||||
|
||||
// Collapse sticky bits
|
||||
assign fp_round_sticky_bits_s2[0] = (| destination_mant_s2[i][NUM_FP_STICKY-1:0]);
|
||||
assign int_round_sticky_bits_s2[0] = (| destination_mant_s2[i][NUM_INT_STICKY-1:0]);
|
||||
assign fp_round_has_sticky_s2[i] = (| fp_round_sticky_bits_s2);
|
||||
assign int_round_has_sticky_s2[i] = (| int_round_sticky_bits_s2);
|
||||
|
||||
// select RS bits for destination operation
|
||||
assign round_sticky_bits_s2 = is_itof_s2 ? fp_round_sticky_bits_s2 : int_round_sticky_bits_s2;
|
||||
|
||||
// Pack exponent and mantissa into proper rounding form
|
||||
assign fmt_pre_round_abs_s2 = {1'b0, final_exp_s2[i][EXP_BITS-1:0], final_mant_s2[MAN_BITS-1:0]};
|
||||
|
||||
// Select output with destination format and operation
|
||||
assign pre_round_abs_s2 = is_itof_s2 ? fmt_pre_round_abs_s2 : final_int_s2;
|
||||
|
||||
// Perform the rounding
|
||||
VX_fpu_rounding #(
|
||||
.DAT_WIDTH (32)
|
||||
) fp_rounding (
|
||||
.abs_value_i (pre_round_abs_s2),
|
||||
.sign_i (input_sign_s2[i]),
|
||||
.round_sticky_bits_i (round_sticky_bits_s2),
|
||||
.rnd_mode_i (rnd_mode_s2),
|
||||
.effective_subtraction_i (1'b0),
|
||||
.abs_rounded_o (rounded_abs_s2[i]),
|
||||
.sign_o (rounded_sign_s2[i]),
|
||||
`UNUSED_PIN (exact_zero_o)
|
||||
);
|
||||
end
|
||||
|
||||
// Pipeline stage3
|
||||
|
||||
wire valid_in_s3;
|
||||
wire [NUM_LANES-1:0] lane_mask_s3;
|
||||
wire [TAGW-1:0] tag_in_s3;
|
||||
wire is_itof_s3;
|
||||
wire unsigned_s3;
|
||||
fclass_t [NUM_LANES-1:0] fclass_s3;
|
||||
wire [NUM_LANES-1:0] mant_is_zero_s3;
|
||||
wire [NUM_LANES-1:0] input_sign_s3;
|
||||
wire [NUM_LANES-1:0] rounded_sign_s3;
|
||||
wire [NUM_LANES-1:0][31:0] rounded_abs_s3;
|
||||
wire [NUM_LANES-1:0] of_before_round_s3;
|
||||
wire [NUM_LANES-1:0] int_round_has_sticky_s3;
|
||||
wire [NUM_LANES-1:0] fp_round_has_sticky_s3;
|
||||
|
||||
VX_pipe_register #(
|
||||
.DATAW (1 + NUM_LANES + TAGW + 1 + 1 + NUM_LANES * ($bits(fclass_t) + 1 + 1 + 32 + 1 + 1 + 1 + 1)),
|
||||
.RESETW (1)
|
||||
) pipe_reg3 (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.enable (~stall),
|
||||
.data_in ({valid_in_s2, lane_mask_s2, tag_in_s2, is_itof_s2, unsigned_s2, fclass_s2, mant_is_zero_s2, input_sign_s2, rounded_abs_s2, rounded_sign_s2, of_before_round_s2, int_round_has_sticky_s2, fp_round_has_sticky_s2}),
|
||||
.data_out ({valid_in_s3, lane_mask_s3, tag_in_s3, is_itof_s3, unsigned_s3, fclass_s3, mant_is_zero_s3, input_sign_s3, rounded_abs_s3, rounded_sign_s3, of_before_round_s3, int_round_has_sticky_s3, fp_round_has_sticky_s3})
|
||||
);
|
||||
|
||||
wire [NUM_LANES-1:0] of_after_round_s3;
|
||||
wire [NUM_LANES-1:0] uf_after_round_s3;
|
||||
wire [NUM_LANES-1:0][31:0] fmt_result_s3;
|
||||
wire [NUM_LANES-1:0][31:0] rounded_int_res_s3; // after possible inversion
|
||||
wire [NUM_LANES-1:0] rounded_int_res_zero_s3; // after rounding
|
||||
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
// Assemble regular result, nan box short ones. Int zeroes need to be detected
|
||||
assign fmt_result_s3[i] = (is_itof_s3 & mant_is_zero_s3[i]) ? 0 : {rounded_sign_s3[i], rounded_abs_s3[i][EXP_BITS+MAN_BITS-1:0]};
|
||||
|
||||
// Classification after rounding select by destination format
|
||||
assign uf_after_round_s3[i] = (rounded_abs_s3[i][EXP_BITS+MAN_BITS-1:MAN_BITS] == 0); // denormal
|
||||
assign of_after_round_s3[i] = (rounded_abs_s3[i][EXP_BITS+MAN_BITS-1:MAN_BITS] == ~0); // inf exp.
|
||||
|
||||
// Negative integer result needs to be brought into two's complement
|
||||
assign rounded_int_res_s3[i] = rounded_sign_s3[i] ? (-rounded_abs_s3[i]) : rounded_abs_s3[i];
|
||||
assign rounded_int_res_zero_s3[i] = (rounded_int_res_s3[i] == 0);
|
||||
end
|
||||
|
||||
// FP Special case handling
|
||||
|
||||
wire [NUM_LANES-1:0][31:0] fp_special_result_s3;
|
||||
fflags_t [NUM_LANES-1:0] fp_special_status_s3;
|
||||
wire [NUM_LANES-1:0] fp_result_is_special_s3;
|
||||
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
// Detect special case from source format, I2F casts don't produce a special result
|
||||
assign fp_result_is_special_s3[i] = ~is_itof_s3 & (fclass_s3[i].is_zero | fclass_s3[i].is_nan);
|
||||
|
||||
// Signalling input NaNs raise invalid flag, otherwise no flags set
|
||||
assign fp_special_status_s3[i] = fclass_s3[i].is_signaling ? {1'b1, 4'h0} : 5'h0; // invalid operation
|
||||
|
||||
// Assemble result according to destination format
|
||||
assign fp_special_result_s3[i] = fclass_s3[i].is_zero ? (32'(input_sign_s3) << 31) // signed zero
|
||||
: {1'b0, QNAN_EXPONENT, QNAN_MANTISSA}; // qNaN
|
||||
end
|
||||
|
||||
// INT Special case handling
|
||||
|
||||
reg [NUM_LANES-1:0][31:0] int_special_result_s3;
|
||||
fflags_t [NUM_LANES-1:0] int_special_status_s3;
|
||||
wire [NUM_LANES-1:0] int_result_is_special_s3;
|
||||
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
// Assemble result according to destination format
|
||||
always @(*) begin
|
||||
if (input_sign_s3[i] && !fclass_s3[i].is_nan) begin
|
||||
int_special_result_s3[i][30:0] = '0; // alone yields 2**(31)-1
|
||||
int_special_result_s3[i][31] = ~unsigned_s3; // for unsigned casts yields 2**31
|
||||
end else begin
|
||||
int_special_result_s3[i][30:0] = 2**(31) - 1; // alone yields 2**(31)-1
|
||||
int_special_result_s3[i][31] = unsigned_s3; // for unsigned casts yields 2**31
|
||||
end
|
||||
end
|
||||
|
||||
// Detect special case from source format (inf, nan, overflow, nan-boxing or negative unsigned)
|
||||
assign int_result_is_special_s3[i] = fclass_s3[i].is_nan
|
||||
| fclass_s3[i].is_inf
|
||||
| of_before_round_s3[i]
|
||||
| (input_sign_s3[i] & unsigned_s3 & ~rounded_int_res_zero_s3[i]);
|
||||
|
||||
// All integer special cases are invalid
|
||||
assign int_special_status_s3[i] = {1'b1, 4'h0};
|
||||
end
|
||||
|
||||
// Result selection and Output handshake
|
||||
|
||||
fflags_t [NUM_LANES-1:0] tmp_fflags_s3;
|
||||
wire [NUM_LANES-1:0][31:0] tmp_result_s3;
|
||||
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
fflags_t fp_regular_status_s3, int_regular_status_s3;
|
||||
fflags_t fp_status_s3, int_status_s3;
|
||||
wire [31:0] fp_result_s3, int_result_s3;
|
||||
|
||||
wire inexact_s3 = is_itof_s3 ? fp_round_has_sticky_s3[i] // overflow is invalid in i2f;
|
||||
: (fp_round_has_sticky_s3[i] || (~fclass_s3[i].is_inf && (of_before_round_s3[i] || of_after_round_s3[i])));
|
||||
|
||||
assign fp_regular_status_s3.NV = is_itof_s3 & (of_before_round_s3[i] | of_after_round_s3[i]); // overflow is invalid for I2F casts
|
||||
assign fp_regular_status_s3.DZ = 1'b0; // no divisions
|
||||
assign fp_regular_status_s3.OF = ~is_itof_s3 & (~fclass_s3[i].is_inf & (of_before_round_s3[i] | of_after_round_s3[i])); // inf casts no OF
|
||||
assign fp_regular_status_s3.UF = uf_after_round_s3[i] & inexact_s3;
|
||||
assign fp_regular_status_s3.NX = inexact_s3;
|
||||
|
||||
assign int_regular_status_s3 = int_round_has_sticky_s3[i] ? {4'h0, 1'b1} : 5'h0;
|
||||
|
||||
assign fp_result_s3 = fp_result_is_special_s3[i] ? fp_special_result_s3[i] : fmt_result_s3[i];
|
||||
assign int_result_s3 = int_result_is_special_s3[i] ? int_special_result_s3[i] : rounded_int_res_s3[i];
|
||||
|
||||
assign fp_status_s3 = fp_result_is_special_s3[i] ? fp_special_status_s3[i] : fp_regular_status_s3;
|
||||
assign int_status_s3 = int_result_is_special_s3[i] ? int_special_status_s3[i] : int_regular_status_s3;
|
||||
|
||||
// Select output depending on special case detection
|
||||
assign tmp_result_s3[i] = is_itof_s3 ? fp_result_s3 : int_result_s3;
|
||||
assign tmp_fflags_s3[i] = is_itof_s3 ? fp_status_s3 : int_status_s3;
|
||||
end
|
||||
|
||||
assign stall = ~ready_out && valid_out;
|
||||
|
||||
fflags_t fflags_merged;
|
||||
`FPU_MERGE_FFLAGS(fflags_merged, tmp_fflags_s3, lane_mask_s3, NUM_LANES);
|
||||
|
||||
VX_pipe_register #(
|
||||
.DATAW (1 + TAGW + (NUM_LANES * 32) + `FP_FLAGS_BITS),
|
||||
.RESETW (1)
|
||||
) pipe_reg4 (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.enable (!stall),
|
||||
.data_in ({valid_in_s3, tag_in_s3, tmp_result_s3, fflags_merged}),
|
||||
.data_out ({valid_out, tag_out, result, fflags})
|
||||
);
|
||||
|
||||
assign ready_in = ~stall;
|
||||
|
||||
assign has_fflags = 1'b1;
|
||||
|
||||
endmodule
|
||||
`endif
|
||||
42
hw/rtl/fpu/VX_fpu_define.vh
Normal file
42
hw/rtl/fpu/VX_fpu_define.vh
Normal file
@@ -0,0 +1,42 @@
|
||||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
`ifndef VX_FPU_DEFINE_VH
|
||||
`define VX_FPU_DEFINE_VH
|
||||
|
||||
`include "VX_define.vh"
|
||||
|
||||
`ifndef SYNTHESIS
|
||||
`include "float_dpi.vh"
|
||||
`endif
|
||||
|
||||
`define FPU_MERGE_FFLAGS(out, in, mask, lanes) \
|
||||
fflags_t __``out; \
|
||||
always @(*) begin \
|
||||
__``out = '0; \
|
||||
for (integer __i = 0; __i < lanes; ++__i) begin \
|
||||
if (mask[__i]) begin \
|
||||
__``out.NX |= in[__i].NX; \
|
||||
__``out.UF |= in[__i].UF; \
|
||||
__``out.OF |= in[__i].OF; \
|
||||
__``out.DZ |= in[__i].DZ; \
|
||||
__``out.NV |= in[__i].NV; \
|
||||
end \
|
||||
end \
|
||||
end \
|
||||
assign out = __``out
|
||||
|
||||
`define FP_CLASS_BITS $bits(VX_fpu_pkg::fclass_t)
|
||||
`define FP_FLAGS_BITS $bits(VX_fpu_pkg::fflags_t)
|
||||
|
||||
`endif // VX_FPU_DEFINE_VH
|
||||
137
hw/rtl/fpu/VX_fpu_div.sv
Normal file
137
hw/rtl/fpu/VX_fpu_div.sv
Normal file
@@ -0,0 +1,137 @@
|
||||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
`include "VX_fpu_define.vh"
|
||||
|
||||
`ifdef FPU_DSP
|
||||
|
||||
module VX_fpu_div import VX_fpu_pkg::*; #(
|
||||
parameter NUM_LANES = 1,
|
||||
parameter TAGW = 1
|
||||
) (
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
output wire ready_in,
|
||||
input wire valid_in,
|
||||
|
||||
input wire [NUM_LANES-1:0] lane_mask,
|
||||
|
||||
input wire [TAGW-1:0] tag_in,
|
||||
|
||||
input wire [`INST_FRM_BITS-1:0] frm,
|
||||
|
||||
input wire [NUM_LANES-1:0][31:0] dataa,
|
||||
input wire [NUM_LANES-1:0][31:0] datab,
|
||||
output wire [NUM_LANES-1:0][31:0] result,
|
||||
|
||||
output wire has_fflags,
|
||||
output wire [`FP_FLAGS_BITS-1:0] fflags,
|
||||
|
||||
output wire [TAGW-1:0] tag_out,
|
||||
|
||||
input wire ready_out,
|
||||
output wire valid_out
|
||||
);
|
||||
`UNUSED_VAR (frm)
|
||||
|
||||
wire stall = ~ready_out && valid_out;
|
||||
wire enable = ~stall;
|
||||
|
||||
fflags_t [NUM_LANES-1:0] per_lane_fflags;
|
||||
wire [NUM_LANES-1:0] lane_mask_out;
|
||||
|
||||
VX_shift_register #(
|
||||
.DATAW (1 + NUM_LANES + TAGW),
|
||||
.DEPTH (`LATENCY_FDIV),
|
||||
.RESETW (1)
|
||||
) shift_reg (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.enable (enable),
|
||||
.data_in ({valid_in, lane_mask, tag_in}),
|
||||
.data_out ({valid_out, lane_mask_out, tag_out})
|
||||
);
|
||||
|
||||
assign ready_in = enable;
|
||||
|
||||
`ifdef QUARTUS
|
||||
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
acl_fdiv fdiv (
|
||||
.clk (clk),
|
||||
.areset (1'b0),
|
||||
.en (enable),
|
||||
.a (dataa[i]),
|
||||
.b (datab[i]),
|
||||
.q (result[i])
|
||||
);
|
||||
end
|
||||
|
||||
assign has_fflags = 0;
|
||||
assign per_lane_fflags = 'x;
|
||||
|
||||
`elsif VIVADO
|
||||
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
wire [3:0] tuser;
|
||||
|
||||
xil_fdiv fdiv (
|
||||
.aclk (clk),
|
||||
.aclken (enable),
|
||||
.s_axis_a_tvalid (1'b1),
|
||||
.s_axis_a_tdata (dataa[i]),
|
||||
.s_axis_b_tvalid (1'b1),
|
||||
.s_axis_b_tdata (datab[i]),
|
||||
`UNUSED_PIN (m_axis_result_tvalid),
|
||||
.m_axis_result_tdata (result[i]),
|
||||
.m_axis_result_tuser (tuser)
|
||||
);
|
||||
// NV, DZ, OF, UF, NX
|
||||
assign per_lane_fflags[i] = {tuser[2], tuser[3], tuser[1], tuser[0], 1'b0};
|
||||
end
|
||||
|
||||
assign has_fflags = 1;
|
||||
|
||||
`else
|
||||
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
reg [63:0] r;
|
||||
`UNUSED_VAR (r)
|
||||
|
||||
fflags_t f;
|
||||
|
||||
always @(*) begin
|
||||
dpi_fdiv (enable && valid_in, int'(0), {32'hffffffff, dataa[i]}, {32'hffffffff, datab[i]}, frm, r, f);
|
||||
end
|
||||
|
||||
VX_shift_register #(
|
||||
.DATAW (32 + $bits(fflags_t)),
|
||||
.DEPTH (`LATENCY_FDIV)
|
||||
) shift_req_dpi (
|
||||
.clk (clk),
|
||||
`UNUSED_PIN (reset),
|
||||
.enable (enable),
|
||||
.data_in ({r[31:0], f}),
|
||||
.data_out ({result[i], per_lane_fflags[i]})
|
||||
);
|
||||
end
|
||||
|
||||
assign has_fflags = 1;
|
||||
|
||||
`endif
|
||||
|
||||
`FPU_MERGE_FFLAGS(fflags, per_lane_fflags, lane_mask_out, NUM_LANES);
|
||||
|
||||
endmodule
|
||||
`endif
|
||||
490
hw/rtl/fpu/VX_fpu_dpi.sv
Normal file
490
hw/rtl/fpu/VX_fpu_dpi.sv
Normal file
@@ -0,0 +1,490 @@
|
||||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
`include "VX_fpu_define.vh"
|
||||
|
||||
`ifdef FPU_DPI
|
||||
|
||||
module VX_fpu_dpi import VX_fpu_pkg::*; #(
|
||||
parameter NUM_LANES = 1,
|
||||
parameter TAGW = 1,
|
||||
parameter OUT_REG = 0
|
||||
) (
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
input wire valid_in,
|
||||
output wire ready_in,
|
||||
|
||||
input wire [NUM_LANES-1:0] lane_mask,
|
||||
|
||||
input wire [TAGW-1:0] tag_in,
|
||||
|
||||
input wire [`INST_FPU_BITS-1:0] op_type,
|
||||
input wire [`INST_FMT_BITS-1:0] fmt,
|
||||
input wire [`INST_FRM_BITS-1:0] frm,
|
||||
|
||||
input wire [NUM_LANES-1:0][`XLEN-1:0] dataa,
|
||||
input wire [NUM_LANES-1:0][`XLEN-1:0] datab,
|
||||
input wire [NUM_LANES-1:0][`XLEN-1:0] datac,
|
||||
output wire [NUM_LANES-1:0][`XLEN-1:0] result,
|
||||
|
||||
output wire has_fflags,
|
||||
output wire [`FP_FLAGS_BITS-1:0] fflags,
|
||||
|
||||
output wire [TAGW-1:0] tag_out,
|
||||
|
||||
input wire ready_out,
|
||||
output wire valid_out
|
||||
);
|
||||
localparam FPU_FMA = 0;
|
||||
localparam FPU_DIVSQRT = 1;
|
||||
localparam FPU_CVT = 2;
|
||||
localparam FPU_NCP = 3;
|
||||
localparam NUM_FPC = 4;
|
||||
localparam FPC_BITS = `LOG2UP(NUM_FPC);
|
||||
|
||||
localparam RSP_DATAW = (NUM_LANES * `XLEN) + 1 + $bits(fflags_t) + TAGW;
|
||||
|
||||
wire [NUM_FPC-1:0] per_core_ready_in;
|
||||
wire [NUM_FPC-1:0][NUM_LANES-1:0][`XLEN-1:0] per_core_result;
|
||||
wire [NUM_FPC-1:0][TAGW-1:0] per_core_tag_out;
|
||||
reg [NUM_FPC-1:0] per_core_ready_out;
|
||||
wire [NUM_FPC-1:0] per_core_valid_out;
|
||||
wire [NUM_FPC-1:0] per_core_has_fflags;
|
||||
fflags_t [NUM_FPC-1:0] per_core_fflags;
|
||||
|
||||
wire div_ready_in, sqrt_ready_in;
|
||||
wire [NUM_LANES-1:0][`XLEN-1:0] div_result, sqrt_result;
|
||||
wire [TAGW-1:0] div_tag_out, sqrt_tag_out;
|
||||
wire div_ready_out, sqrt_ready_out;
|
||||
wire div_valid_out, sqrt_valid_out;
|
||||
wire div_has_fflags, sqrt_has_fflags;
|
||||
fflags_t div_fflags, sqrt_fflags;
|
||||
|
||||
reg [FPC_BITS-1:0] core_select;
|
||||
|
||||
reg is_fadd, is_fsub, is_fmul, is_fmadd, is_fmsub, is_fnmadd, is_fnmsub;
|
||||
reg is_div, is_fcmp, is_itof, is_utof, is_ftoi, is_ftou, is_f2f;
|
||||
reg dst_fmt, int_fmt;
|
||||
|
||||
reg [NUM_LANES-1:0][63:0] operands [3];
|
||||
|
||||
always @(*) begin
|
||||
for (integer i = 0; i < NUM_LANES; ++i) begin
|
||||
operands[0][i] = 64'(dataa[i]);
|
||||
operands[1][i] = 64'(datab[i]);
|
||||
operands[2][i] = 64'(datac[i]);
|
||||
end
|
||||
end
|
||||
|
||||
`UNUSED_VAR (fmt)
|
||||
|
||||
always @(*) begin
|
||||
is_fadd = 0;
|
||||
is_fsub = 0;
|
||||
is_fmul = 0;
|
||||
is_fmadd = 0;
|
||||
is_fmsub = 0;
|
||||
is_fnmadd = 0;
|
||||
is_fnmsub = 0;
|
||||
is_div = 0;
|
||||
is_fcmp = 0;
|
||||
is_itof = 0;
|
||||
is_utof = 0;
|
||||
is_ftoi = 0;
|
||||
is_ftou = 0;
|
||||
is_f2f = 0;
|
||||
|
||||
dst_fmt = 0;
|
||||
int_fmt = 0;
|
||||
|
||||
`ifdef FLEN_64
|
||||
dst_fmt = fmt[0];
|
||||
`endif
|
||||
|
||||
`ifdef XLEN_64
|
||||
int_fmt = fmt[1];
|
||||
`endif
|
||||
|
||||
case (op_type)
|
||||
`INST_FPU_ADD: begin core_select = FPU_FMA; is_fadd = 1; end
|
||||
`INST_FPU_SUB: begin core_select = FPU_FMA; is_fsub = 1; end
|
||||
`INST_FPU_MUL: begin core_select = FPU_FMA; is_fmul = 1; end
|
||||
`INST_FPU_MADD: begin core_select = FPU_FMA; is_fmadd = 1; end
|
||||
`INST_FPU_MSUB: begin core_select = FPU_FMA; is_fmsub = 1; end
|
||||
`INST_FPU_NMADD: begin core_select = FPU_FMA; is_fnmadd = 1; end
|
||||
`INST_FPU_NMSUB: begin core_select = FPU_FMA; is_fnmsub = 1; end
|
||||
`INST_FPU_DIV: begin core_select = FPU_DIVSQRT; is_div = 1; end
|
||||
`INST_FPU_SQRT: begin core_select = FPU_DIVSQRT; end
|
||||
`INST_FPU_CMP: begin core_select = FPU_NCP; is_fcmp = 1; end
|
||||
`INST_FPU_F2I: begin core_select = FPU_CVT; is_ftoi = 1; end
|
||||
`INST_FPU_F2U: begin core_select = FPU_CVT; is_ftou = 1; end
|
||||
`INST_FPU_I2F: begin core_select = FPU_CVT; is_itof = 1; end
|
||||
`INST_FPU_U2F: begin core_select = FPU_CVT; is_utof = 1; end
|
||||
`INST_FPU_F2F: begin core_select = FPU_CVT; is_f2f = 1; end
|
||||
default: begin core_select = FPU_NCP; end
|
||||
endcase
|
||||
end
|
||||
|
||||
generate
|
||||
begin : fma
|
||||
|
||||
reg [NUM_LANES-1:0][`XLEN-1:0] result_fma;
|
||||
wire [NUM_LANES-1:0][63:0] result_fadd;
|
||||
wire [NUM_LANES-1:0][63:0] result_fsub;
|
||||
wire [NUM_LANES-1:0][63:0] result_fmul;
|
||||
wire [NUM_LANES-1:0][63:0] result_fmadd;
|
||||
wire [NUM_LANES-1:0][63:0] result_fmsub;
|
||||
wire [NUM_LANES-1:0][63:0] result_fnmadd;
|
||||
wire [NUM_LANES-1:0][63:0] result_fnmsub;
|
||||
|
||||
fflags_t [NUM_LANES-1:0] fflags_fma;
|
||||
fflags_t [NUM_LANES-1:0] fflags_fadd;
|
||||
fflags_t [NUM_LANES-1:0] fflags_fsub;
|
||||
fflags_t [NUM_LANES-1:0] fflags_fmul;
|
||||
fflags_t [NUM_LANES-1:0] fflags_fmadd;
|
||||
fflags_t [NUM_LANES-1:0] fflags_fmsub;
|
||||
fflags_t [NUM_LANES-1:0] fflags_fnmadd;
|
||||
fflags_t [NUM_LANES-1:0] fflags_fnmsub;
|
||||
|
||||
wire fma_valid = (valid_in && core_select == FPU_FMA);
|
||||
wire fma_ready = per_core_ready_out[FPU_FMA] || ~per_core_valid_out[FPU_FMA];
|
||||
wire fma_fire = fma_valid && fma_ready;
|
||||
|
||||
always @(*) begin
|
||||
for (integer i = 0; i < NUM_LANES; ++i) begin
|
||||
dpi_fadd (fma_fire, int'(dst_fmt), operands[0][i], operands[1][i], frm, result_fadd[i], fflags_fadd[i]);
|
||||
dpi_fsub (fma_fire, int'(dst_fmt), operands[0][i], operands[1][i], frm, result_fsub[i], fflags_fsub[i]);
|
||||
dpi_fmul (fma_fire, int'(dst_fmt), operands[0][i], operands[1][i], frm, result_fmul[i], fflags_fmul[i]);
|
||||
dpi_fmadd (fma_fire, int'(dst_fmt), operands[0][i], operands[1][i], operands[2][i], frm, result_fmadd[i], fflags_fmadd[i]);
|
||||
dpi_fmsub (fma_fire, int'(dst_fmt), operands[0][i], operands[1][i], operands[2][i], frm, result_fmsub[i], fflags_fmsub[i]);
|
||||
dpi_fnmadd (fma_fire, int'(dst_fmt), operands[0][i], operands[1][i], operands[2][i], frm, result_fnmadd[i], fflags_fnmadd[i]);
|
||||
dpi_fnmsub (fma_fire, int'(dst_fmt), operands[0][i], operands[1][i], operands[2][i], frm, result_fnmsub[i], fflags_fnmsub[i]);
|
||||
|
||||
result_fma[i] = is_fadd ? result_fadd[i][`XLEN-1:0] :
|
||||
is_fsub ? result_fsub[i][`XLEN-1:0] :
|
||||
is_fmul ? result_fmul[i][`XLEN-1:0] :
|
||||
is_fmadd ? result_fmadd[i][`XLEN-1:0] :
|
||||
is_fmsub ? result_fmsub[i][`XLEN-1:0] :
|
||||
is_fnmadd ? result_fnmadd[i][`XLEN-1:0] :
|
||||
is_fnmsub ? result_fnmsub[i][`XLEN-1:0] :
|
||||
'0;
|
||||
|
||||
fflags_fma[i] = is_fadd ? fflags_fadd[i] :
|
||||
is_fsub ? fflags_fsub[i] :
|
||||
is_fmul ? fflags_fmul[i] :
|
||||
is_fmadd ? fflags_fmadd[i] :
|
||||
is_fmsub ? fflags_fmsub[i] :
|
||||
is_fnmadd ? fflags_fnmadd[i] :
|
||||
is_fnmsub ? fflags_fnmsub[i] :
|
||||
'0;
|
||||
end
|
||||
end
|
||||
|
||||
fflags_t fflags_merged;
|
||||
`FPU_MERGE_FFLAGS(fflags_merged, fflags_fma, lane_mask, NUM_LANES);
|
||||
|
||||
VX_shift_register #(
|
||||
.DATAW (1 + TAGW + NUM_LANES * `XLEN + $bits(fflags_t)),
|
||||
.DEPTH (`LATENCY_FMA),
|
||||
.RESETW (1)
|
||||
) shift_reg (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.enable (fma_ready),
|
||||
.data_in ({fma_valid, tag_in, result_fma, fflags_merged}),
|
||||
.data_out ({per_core_valid_out[FPU_FMA], per_core_tag_out[FPU_FMA], per_core_result[FPU_FMA], per_core_fflags[FPU_FMA]})
|
||||
);
|
||||
|
||||
assign per_core_has_fflags[FPU_FMA] = 1;
|
||||
assign per_core_ready_in[FPU_FMA] = fma_ready;
|
||||
|
||||
end
|
||||
endgenerate
|
||||
|
||||
generate
|
||||
begin : fdiv
|
||||
|
||||
reg [NUM_LANES-1:0][`XLEN-1:0] result_fdiv_r;
|
||||
wire [NUM_LANES-1:0][63:0] result_fdiv;
|
||||
fflags_t [NUM_LANES-1:0] fflags_fdiv;
|
||||
|
||||
wire fdiv_valid = (valid_in && core_select == FPU_DIVSQRT) && is_div;
|
||||
wire fdiv_ready = div_ready_out || ~div_valid_out;
|
||||
wire fdiv_fire = fdiv_valid && fdiv_ready;
|
||||
|
||||
always @(*) begin
|
||||
for (integer i = 0; i < NUM_LANES; ++i) begin
|
||||
dpi_fdiv (fdiv_fire, int'(dst_fmt), operands[0][i], operands[1][i], frm, result_fdiv[i], fflags_fdiv[i]);
|
||||
result_fdiv_r[i] = result_fdiv[i][`XLEN-1:0];
|
||||
end
|
||||
end
|
||||
|
||||
fflags_t fflags_merged;
|
||||
`FPU_MERGE_FFLAGS(fflags_merged, fflags_fdiv, lane_mask, NUM_LANES);
|
||||
|
||||
VX_shift_register #(
|
||||
.DATAW (1 + TAGW + NUM_LANES * `XLEN + $bits(fflags_t)),
|
||||
.DEPTH (`LATENCY_FDIV),
|
||||
.RESETW (1)
|
||||
) shift_reg (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.enable (fdiv_ready),
|
||||
.data_in ({fdiv_valid, tag_in, result_fdiv_r, fflags_merged}),
|
||||
.data_out ({div_valid_out, div_tag_out, div_result, div_fflags})
|
||||
);
|
||||
|
||||
assign div_has_fflags = 1;
|
||||
assign div_ready_in = fdiv_ready;
|
||||
|
||||
end
|
||||
endgenerate
|
||||
|
||||
generate
|
||||
begin : fsqrt
|
||||
|
||||
reg [NUM_LANES-1:0][`XLEN-1:0] result_fsqrt_r;
|
||||
wire [NUM_LANES-1:0][63:0] result_fsqrt;
|
||||
fflags_t [NUM_LANES-1:0] fflags_fsqrt;
|
||||
|
||||
wire fsqrt_valid = (valid_in && core_select == FPU_DIVSQRT) && ~is_div;
|
||||
wire fsqrt_ready = sqrt_ready_out || ~sqrt_valid_out;
|
||||
wire fsqrt_fire = fsqrt_valid && fsqrt_ready;
|
||||
|
||||
always @(*) begin
|
||||
for (integer i = 0; i < NUM_LANES; ++i) begin
|
||||
dpi_fsqrt (fsqrt_fire, int'(dst_fmt), operands[0][i], frm, result_fsqrt[i], fflags_fsqrt[i]);
|
||||
result_fsqrt_r[i] = result_fsqrt[i][`XLEN-1:0];
|
||||
end
|
||||
end
|
||||
|
||||
fflags_t fflags_merged;
|
||||
`FPU_MERGE_FFLAGS(fflags_merged, fflags_fsqrt, lane_mask, NUM_LANES);
|
||||
|
||||
VX_shift_register #(
|
||||
.DATAW (1 + TAGW + NUM_LANES * `XLEN + $bits(fflags_t)),
|
||||
.DEPTH (`LATENCY_FSQRT),
|
||||
.RESETW (1)
|
||||
) shift_reg (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.enable (fsqrt_ready),
|
||||
.data_in ({fsqrt_valid, tag_in, result_fsqrt_r, fflags_merged}),
|
||||
.data_out ({sqrt_valid_out, sqrt_tag_out, sqrt_result, sqrt_fflags})
|
||||
);
|
||||
|
||||
assign sqrt_has_fflags = 1;
|
||||
assign sqrt_ready_in = fsqrt_ready;
|
||||
|
||||
end
|
||||
endgenerate
|
||||
|
||||
generate
|
||||
begin : fcvt
|
||||
|
||||
reg [NUM_LANES-1:0][`XLEN-1:0] result_fcvt;
|
||||
wire [NUM_LANES-1:0][63:0] result_itof;
|
||||
wire [NUM_LANES-1:0][63:0] result_utof;
|
||||
wire [NUM_LANES-1:0][63:0] result_ftoi;
|
||||
wire [NUM_LANES-1:0][63:0] result_ftou;
|
||||
wire [NUM_LANES-1:0][63:0] result_f2f;
|
||||
|
||||
fflags_t [NUM_LANES-1:0] fflags_fcvt;
|
||||
fflags_t [NUM_LANES-1:0] fflags_itof;
|
||||
fflags_t [NUM_LANES-1:0] fflags_utof;
|
||||
fflags_t [NUM_LANES-1:0] fflags_ftoi;
|
||||
fflags_t [NUM_LANES-1:0] fflags_ftou;
|
||||
|
||||
wire fcvt_valid = (valid_in && core_select == FPU_CVT);
|
||||
wire fcvt_ready = per_core_ready_out[FPU_CVT] || ~per_core_valid_out[FPU_CVT];
|
||||
wire fcvt_fire = fcvt_valid && fcvt_ready;
|
||||
|
||||
always @(*) begin
|
||||
for (integer i = 0; i < NUM_LANES; ++i) begin
|
||||
dpi_itof (fcvt_fire, int'(dst_fmt), int'(int_fmt), operands[0][i], frm, result_itof[i], fflags_itof[i]);
|
||||
dpi_utof (fcvt_fire, int'(dst_fmt), int'(int_fmt), operands[0][i], frm, result_utof[i], fflags_utof[i]);
|
||||
dpi_ftoi (fcvt_fire, int'(int_fmt), int'(dst_fmt), operands[0][i], frm, result_ftoi[i], fflags_ftoi[i]);
|
||||
dpi_ftou (fcvt_fire, int'(int_fmt), int'(dst_fmt), operands[0][i], frm, result_ftou[i], fflags_ftou[i]);
|
||||
dpi_f2f (fcvt_fire, int'(dst_fmt), operands[0][i], result_f2f[i]);
|
||||
|
||||
result_fcvt[i] = is_itof ? result_itof[i][`XLEN-1:0] :
|
||||
is_utof ? result_utof[i][`XLEN-1:0] :
|
||||
is_ftoi ? result_ftoi[i][`XLEN-1:0] :
|
||||
is_ftou ? result_ftou[i][`XLEN-1:0] :
|
||||
is_f2f ? result_f2f[i][`XLEN-1:0] :
|
||||
'0;
|
||||
|
||||
fflags_fcvt[i] = is_itof ? fflags_itof[i] :
|
||||
is_utof ? fflags_utof[i] :
|
||||
is_ftoi ? fflags_ftoi[i] :
|
||||
is_ftou ? fflags_ftou[i] :
|
||||
'0;
|
||||
end
|
||||
end
|
||||
|
||||
fflags_t fflags_merged;
|
||||
`FPU_MERGE_FFLAGS(fflags_merged, fflags_fcvt, lane_mask, NUM_LANES);
|
||||
|
||||
VX_shift_register #(
|
||||
.DATAW (1 + TAGW + NUM_LANES * `XLEN + $bits(fflags_t)),
|
||||
.DEPTH (`LATENCY_FCVT),
|
||||
.RESETW (1)
|
||||
) shift_reg (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.enable (fcvt_ready),
|
||||
.data_in ({fcvt_valid, tag_in, result_fcvt, fflags_merged}),
|
||||
.data_out ({per_core_valid_out[FPU_CVT], per_core_tag_out[FPU_CVT], per_core_result[FPU_CVT], per_core_fflags[FPU_CVT]})
|
||||
);
|
||||
|
||||
assign per_core_has_fflags[FPU_CVT] = 1;
|
||||
assign per_core_ready_in[FPU_CVT] = fcvt_ready;
|
||||
|
||||
end
|
||||
endgenerate
|
||||
|
||||
generate
|
||||
begin : fncp
|
||||
|
||||
reg [NUM_LANES-1:0][`XLEN-1:0] result_fncp;
|
||||
wire [NUM_LANES-1:0][63:0] result_fclss;
|
||||
wire [NUM_LANES-1:0][63:0] result_flt;
|
||||
wire [NUM_LANES-1:0][63:0] result_fle;
|
||||
wire [NUM_LANES-1:0][63:0] result_feq;
|
||||
wire [NUM_LANES-1:0][63:0] result_fmin;
|
||||
wire [NUM_LANES-1:0][63:0] result_fmax;
|
||||
wire [NUM_LANES-1:0][63:0] result_fsgnj;
|
||||
wire [NUM_LANES-1:0][63:0] result_fsgnjn;
|
||||
wire [NUM_LANES-1:0][63:0] result_fsgnjx;
|
||||
reg [NUM_LANES-1:0][63:0] result_fmvx;
|
||||
reg [NUM_LANES-1:0][63:0] result_fmvf;
|
||||
|
||||
fflags_t [NUM_LANES-1:0] fflags_fncp;
|
||||
fflags_t [NUM_LANES-1:0] fflags_flt;
|
||||
fflags_t [NUM_LANES-1:0] fflags_fle;
|
||||
fflags_t [NUM_LANES-1:0] fflags_feq;
|
||||
fflags_t [NUM_LANES-1:0] fflags_fmin;
|
||||
fflags_t [NUM_LANES-1:0] fflags_fmax;
|
||||
|
||||
wire fncp_valid = (valid_in && core_select == FPU_NCP);
|
||||
wire fncp_ready = per_core_ready_out[FPU_NCP] || ~per_core_valid_out[FPU_NCP];
|
||||
wire fncp_fire = fncp_valid && fncp_ready;
|
||||
|
||||
always @(*) begin
|
||||
for (integer i = 0; i < NUM_LANES; ++i) begin
|
||||
dpi_fclss (fncp_fire, int'(dst_fmt), operands[0][i], result_fclss[i]);
|
||||
dpi_fle (fncp_fire, int'(dst_fmt), operands[0][i], operands[1][i], result_fle[i], fflags_fle[i]);
|
||||
dpi_flt (fncp_fire, int'(dst_fmt), operands[0][i], operands[1][i], result_flt[i], fflags_flt[i]);
|
||||
dpi_feq (fncp_fire, int'(dst_fmt), operands[0][i], operands[1][i], result_feq[i], fflags_feq[i]);
|
||||
dpi_fmin (fncp_fire, int'(dst_fmt), operands[0][i], operands[1][i], result_fmin[i], fflags_fmin[i]);
|
||||
dpi_fmax (fncp_fire, int'(dst_fmt), operands[0][i], operands[1][i], result_fmax[i], fflags_fmax[i]);
|
||||
dpi_fsgnj (fncp_fire, int'(dst_fmt), operands[0][i], operands[1][i], result_fsgnj[i]);
|
||||
dpi_fsgnjn (fncp_fire, int'(dst_fmt), operands[0][i], operands[1][i], result_fsgnjn[i]);
|
||||
dpi_fsgnjx (fncp_fire, int'(dst_fmt), operands[0][i], operands[1][i], result_fsgnjx[i]);
|
||||
result_fmvx[i] = dst_fmt ? operands[0][i] : 64'($signed(operands[0][i][31:0])); // sign-extension
|
||||
result_fmvf[i] = dst_fmt ? operands[0][i] : (operands[0][i] | 64'hffffffff00000000); // nan-boxing
|
||||
end
|
||||
end
|
||||
|
||||
always @(*) begin
|
||||
result_fncp = 'x;
|
||||
fflags_fncp = 'x;
|
||||
for (integer i = 0; i < NUM_LANES; ++i) begin
|
||||
case (frm)
|
||||
0: begin result_fncp[i] = is_fcmp ? result_fle[i][`XLEN-1:0] : result_fsgnj[i][`XLEN-1:0]; fflags_fncp[i] = fflags_fle[i]; end
|
||||
1: begin result_fncp[i] = is_fcmp ? result_flt[i][`XLEN-1:0] : result_fsgnjn[i][`XLEN-1:0]; fflags_fncp[i] = fflags_flt[i]; end
|
||||
2: begin result_fncp[i] = is_fcmp ? result_feq[i][`XLEN-1:0] : result_fsgnjx[i][`XLEN-1:0]; fflags_fncp[i] = fflags_feq[i]; end
|
||||
3: begin result_fncp[i] = result_fclss[i][`XLEN-1:0]; end
|
||||
4: begin result_fncp[i] = result_fmvx[i][`XLEN-1:0]; end
|
||||
5: begin result_fncp[i] = result_fmvf[i][`XLEN-1:0]; end
|
||||
6: begin result_fncp[i] = result_fmin[i][`XLEN-1:0]; fflags_fncp[i] = fflags_fmin[i]; end
|
||||
7: begin result_fncp[i] = result_fmax[i][`XLEN-1:0]; fflags_fncp[i] = fflags_fmax[i]; end
|
||||
endcase
|
||||
end
|
||||
end
|
||||
|
||||
fflags_t fflags_merged;
|
||||
`FPU_MERGE_FFLAGS(fflags_merged, fflags_fncp, lane_mask, NUM_LANES);
|
||||
|
||||
wire has_fflags_fncp = (frm >= 6) || is_fcmp;
|
||||
|
||||
VX_shift_register #(
|
||||
.DATAW (1 + TAGW + 1 + NUM_LANES * `XLEN + $bits(fflags_t)),
|
||||
.DEPTH (`LATENCY_FNCP),
|
||||
.RESETW (1)
|
||||
) shift_reg (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.enable (fncp_ready),
|
||||
.data_in ({fncp_valid, tag_in, has_fflags_fncp, result_fncp, fflags_merged}),
|
||||
.data_out ({per_core_valid_out[FPU_NCP], per_core_tag_out[FPU_NCP], per_core_has_fflags[FPU_NCP], per_core_result[FPU_NCP], per_core_fflags[FPU_NCP]})
|
||||
);
|
||||
|
||||
assign per_core_ready_in[FPU_NCP] = fncp_ready;
|
||||
|
||||
end
|
||||
endgenerate
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
assign per_core_ready_in[FPU_DIVSQRT] = is_div ? div_ready_in : sqrt_ready_in;
|
||||
|
||||
VX_stream_arb #(
|
||||
.NUM_INPUTS (2),
|
||||
.DATAW (RSP_DATAW),
|
||||
.ARBITER ("R"),
|
||||
.OUT_REG (0)
|
||||
) div_sqrt_arb (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in ({sqrt_valid_out, div_valid_out}),
|
||||
.ready_in ({sqrt_ready_out, div_ready_out}),
|
||||
.data_in ({{sqrt_result, sqrt_has_fflags, sqrt_fflags, sqrt_tag_out},
|
||||
{div_result, div_has_fflags, div_fflags, div_tag_out}}),
|
||||
.data_out ({per_core_result[FPU_DIVSQRT], per_core_has_fflags[FPU_DIVSQRT], per_core_fflags[FPU_DIVSQRT], per_core_tag_out[FPU_DIVSQRT]}),
|
||||
.valid_out (per_core_valid_out[FPU_DIVSQRT]),
|
||||
.ready_out (per_core_ready_out[FPU_DIVSQRT]),
|
||||
`UNUSED_PIN (sel_out)
|
||||
);
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
wire [NUM_FPC-1:0][RSP_DATAW-1:0] per_core_data_out;
|
||||
|
||||
for (genvar i = 0; i < NUM_FPC; ++i) begin
|
||||
assign per_core_data_out[i] = {per_core_result[i], per_core_has_fflags[i], per_core_fflags[i], per_core_tag_out[i]};
|
||||
end
|
||||
|
||||
VX_stream_arb #(
|
||||
.NUM_INPUTS (NUM_FPC),
|
||||
.DATAW (RSP_DATAW),
|
||||
.ARBITER ("R"),
|
||||
.OUT_REG (OUT_REG)
|
||||
) rsp_arb (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (per_core_valid_out),
|
||||
.ready_in (per_core_ready_out),
|
||||
.data_in (per_core_data_out),
|
||||
.data_out ({result, has_fflags, fflags, tag_out}),
|
||||
.valid_out (valid_out),
|
||||
.ready_out (ready_out),
|
||||
`UNUSED_PIN (sel_out)
|
||||
);
|
||||
|
||||
assign ready_in = per_core_ready_in[core_select];
|
||||
|
||||
endmodule
|
||||
`endif
|
||||
325
hw/rtl/fpu/VX_fpu_dsp.sv
Normal file
325
hw/rtl/fpu/VX_fpu_dsp.sv
Normal file
@@ -0,0 +1,325 @@
|
||||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
`include "VX_fpu_define.vh"
|
||||
|
||||
`ifdef FPU_DSP
|
||||
|
||||
module VX_fpu_dsp import VX_fpu_pkg::*; #(
|
||||
parameter NUM_LANES = 4,
|
||||
parameter TAGW = 4,
|
||||
parameter OUT_REG = 0
|
||||
) (
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
input wire valid_in,
|
||||
output wire ready_in,
|
||||
|
||||
input wire [NUM_LANES-1:0] lane_mask,
|
||||
|
||||
input wire [TAGW-1:0] tag_in,
|
||||
|
||||
input wire [`INST_FPU_BITS-1:0] op_type,
|
||||
input wire [`INST_FMT_BITS-1:0] fmt,
|
||||
input wire [`INST_FRM_BITS-1:0] frm,
|
||||
|
||||
input wire [NUM_LANES-1:0][`XLEN-1:0] dataa,
|
||||
input wire [NUM_LANES-1:0][`XLEN-1:0] datab,
|
||||
input wire [NUM_LANES-1:0][`XLEN-1:0] datac,
|
||||
output wire [NUM_LANES-1:0][`XLEN-1:0] result,
|
||||
|
||||
output wire has_fflags,
|
||||
output wire [`FP_FLAGS_BITS-1:0] fflags,
|
||||
|
||||
output wire [TAGW-1:0] tag_out,
|
||||
|
||||
input wire ready_out,
|
||||
output wire valid_out
|
||||
);
|
||||
localparam FPU_FMA = 0;
|
||||
localparam FPU_DIVSQRT = 1;
|
||||
localparam FPU_CVT = 2;
|
||||
localparam FPU_NCP = 3;
|
||||
localparam NUM_FPC = 4;
|
||||
localparam FPC_BITS = `LOG2UP(NUM_FPC);
|
||||
|
||||
localparam RSP_DATAW = (NUM_LANES * 32) + 1 + $bits(fflags_t) + TAGW;
|
||||
|
||||
`UNUSED_VAR (fmt)
|
||||
|
||||
wire [NUM_FPC-1:0] per_core_ready_in;
|
||||
wire [NUM_FPC-1:0][NUM_LANES-1:0][31:0] per_core_result;
|
||||
wire [NUM_FPC-1:0][TAGW-1:0] per_core_tag_out;
|
||||
wire [NUM_FPC-1:0] per_core_ready_out;
|
||||
wire [NUM_FPC-1:0] per_core_valid_out;
|
||||
wire [NUM_FPC-1:0] per_core_has_fflags;
|
||||
fflags_t [NUM_FPC-1:0] per_core_fflags;
|
||||
|
||||
wire div_ready_in, sqrt_ready_in;
|
||||
wire [NUM_LANES-1:0][31:0] div_result, sqrt_result;
|
||||
wire [TAGW-1:0] div_tag_out, sqrt_tag_out;
|
||||
wire div_ready_out, sqrt_ready_out;
|
||||
wire div_valid_out, sqrt_valid_out;
|
||||
wire div_has_fflags, sqrt_has_fflags;
|
||||
fflags_t div_fflags, sqrt_fflags;
|
||||
|
||||
reg [FPC_BITS-1:0] core_select;
|
||||
reg is_madd, is_sub, is_neg, is_div, is_itof, is_signed;
|
||||
|
||||
always @(*) begin
|
||||
is_madd = 0;
|
||||
is_sub = 0;
|
||||
is_neg = 0;
|
||||
is_div = 0;
|
||||
is_itof = 0;
|
||||
is_signed = 0;
|
||||
case (op_type)
|
||||
`INST_FPU_ADD: begin core_select = FPU_FMA; end
|
||||
`INST_FPU_SUB: begin core_select = FPU_FMA; is_sub = 1; end
|
||||
`INST_FPU_MUL: begin core_select = FPU_FMA; is_neg = 1; end
|
||||
`INST_FPU_MADD: begin core_select = FPU_FMA; is_madd = 1; end
|
||||
`INST_FPU_MSUB: begin core_select = FPU_FMA; is_madd = 1; is_sub = 1; end
|
||||
`INST_FPU_NMADD: begin core_select = FPU_FMA; is_madd = 1; is_neg = 1; end
|
||||
`INST_FPU_NMSUB: begin core_select = FPU_FMA; is_madd = 1; is_sub = 1; is_neg = 1; end
|
||||
`INST_FPU_DIV: begin core_select = FPU_DIVSQRT; is_div = 1; end
|
||||
`INST_FPU_SQRT: begin core_select = FPU_DIVSQRT; end
|
||||
`INST_FPU_F2I: begin core_select = FPU_CVT; is_signed = 1; end
|
||||
`INST_FPU_F2U: begin core_select = FPU_CVT; end
|
||||
`INST_FPU_I2F: begin core_select = FPU_CVT; is_itof = 1; is_signed = 1; end
|
||||
`INST_FPU_U2F: begin core_select = FPU_CVT; is_itof = 1; end
|
||||
default: begin core_select = FPU_NCP; end
|
||||
endcase
|
||||
end
|
||||
|
||||
`RESET_RELAY (fma_reset, reset);
|
||||
`RESET_RELAY (div_reset, reset);
|
||||
`RESET_RELAY (sqrt_reset, reset);
|
||||
`RESET_RELAY (cvt_reset, reset);
|
||||
`RESET_RELAY (ncp_reset, reset);
|
||||
|
||||
wire [NUM_LANES-1:0][31:0] dataa_s;
|
||||
wire [NUM_LANES-1:0][31:0] datab_s;
|
||||
wire [NUM_LANES-1:0][31:0] datac_s;
|
||||
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
assign dataa_s[i] = dataa[i][31:0];
|
||||
assign datab_s[i] = datab[i][31:0];
|
||||
assign datac_s[i] = datac[i][31:0];
|
||||
end
|
||||
|
||||
`UNUSED_VAR (dataa)
|
||||
`UNUSED_VAR (datab)
|
||||
`UNUSED_VAR (datac)
|
||||
|
||||
VX_fpu_fma #(
|
||||
.NUM_LANES (NUM_LANES),
|
||||
.TAGW (TAGW)
|
||||
) fpu_fma (
|
||||
.clk (clk),
|
||||
.reset (fma_reset),
|
||||
.valid_in (valid_in && (core_select == FPU_FMA)),
|
||||
.ready_in (per_core_ready_in[FPU_FMA]),
|
||||
.lane_mask (lane_mask),
|
||||
.tag_in (tag_in),
|
||||
.frm (frm),
|
||||
.is_madd (is_madd),
|
||||
.is_sub (is_sub),
|
||||
.is_neg (is_neg),
|
||||
.dataa (dataa_s),
|
||||
.datab (datab_s),
|
||||
.datac (datac_s),
|
||||
.has_fflags (per_core_has_fflags[FPU_FMA]),
|
||||
.fflags (per_core_fflags[FPU_FMA]),
|
||||
.result (per_core_result[FPU_FMA]),
|
||||
.tag_out (per_core_tag_out[FPU_FMA]),
|
||||
.ready_out (per_core_ready_out[FPU_FMA]),
|
||||
.valid_out (per_core_valid_out[FPU_FMA])
|
||||
);
|
||||
|
||||
VX_fpu_div #(
|
||||
.NUM_LANES (NUM_LANES),
|
||||
.TAGW (TAGW)
|
||||
) fpu_div (
|
||||
.clk (clk),
|
||||
.reset (div_reset),
|
||||
.valid_in (valid_in && (core_select == FPU_DIVSQRT) && is_div),
|
||||
.ready_in (div_ready_in),
|
||||
.lane_mask (lane_mask),
|
||||
.tag_in (tag_in),
|
||||
.frm (frm),
|
||||
.dataa (dataa_s),
|
||||
.datab (datab_s),
|
||||
.has_fflags (div_has_fflags),
|
||||
.fflags (div_fflags),
|
||||
.result (div_result),
|
||||
.tag_out (div_tag_out),
|
||||
.valid_out (div_valid_out),
|
||||
.ready_out (div_ready_out)
|
||||
);
|
||||
|
||||
VX_fpu_sqrt #(
|
||||
.NUM_LANES (NUM_LANES),
|
||||
.TAGW (TAGW)
|
||||
) fpu_sqrt (
|
||||
.clk (clk),
|
||||
.reset (sqrt_reset),
|
||||
.valid_in (valid_in && (core_select == FPU_DIVSQRT) && ~is_div),
|
||||
.ready_in (sqrt_ready_in),
|
||||
.lane_mask (lane_mask),
|
||||
.tag_in (tag_in),
|
||||
.frm (frm),
|
||||
.dataa (dataa_s),
|
||||
.has_fflags (sqrt_has_fflags),
|
||||
.fflags (sqrt_fflags),
|
||||
.result (sqrt_result),
|
||||
.tag_out (sqrt_tag_out),
|
||||
.valid_out (sqrt_valid_out),
|
||||
.ready_out (sqrt_ready_out)
|
||||
);
|
||||
|
||||
wire cvt_rt_int_in = ~is_itof;
|
||||
wire cvt_rt_int_out;
|
||||
|
||||
VX_fpu_cvt #(
|
||||
.NUM_LANES (NUM_LANES),
|
||||
.TAGW (TAGW+1)
|
||||
) fpu_cvt (
|
||||
.clk (clk),
|
||||
.reset (cvt_reset),
|
||||
.valid_in (valid_in && (core_select == FPU_CVT)),
|
||||
.ready_in (per_core_ready_in[FPU_CVT]),
|
||||
.lane_mask (lane_mask),
|
||||
.tag_in ({cvt_rt_int_in, tag_in}),
|
||||
.frm (frm),
|
||||
.is_itof (is_itof),
|
||||
.is_signed (is_signed),
|
||||
.dataa (dataa_s),
|
||||
.has_fflags (per_core_has_fflags[FPU_CVT]),
|
||||
.fflags (per_core_fflags[FPU_CVT]),
|
||||
.result (per_core_result[FPU_CVT]),
|
||||
.tag_out ({cvt_rt_int_out, per_core_tag_out[FPU_CVT]}),
|
||||
.valid_out (per_core_valid_out[FPU_CVT]),
|
||||
.ready_out (per_core_ready_out[FPU_CVT])
|
||||
);
|
||||
|
||||
wire ncp_rt_int_in = (op_type == `INST_FPU_CMP)
|
||||
|| `INST_FPU_IS_CLASS(op_type, frm)
|
||||
|| `INST_FPU_IS_MVXW(op_type, frm);
|
||||
wire ncp_rt_int_out;
|
||||
|
||||
wire ncp_rt_sext_in = `INST_FPU_IS_MVXW(op_type, frm);
|
||||
wire ncp_rt_sext_out;
|
||||
|
||||
VX_fpu_ncomp #(
|
||||
.NUM_LANES (NUM_LANES),
|
||||
.TAGW (TAGW+2)
|
||||
) fpu_ncomp (
|
||||
.clk (clk),
|
||||
.reset (ncp_reset),
|
||||
.valid_in (valid_in && (core_select == FPU_NCP)),
|
||||
.ready_in (per_core_ready_in[FPU_NCP]),
|
||||
.lane_mask (lane_mask),
|
||||
.tag_in ({ncp_rt_sext_in, ncp_rt_int_in, tag_in}),
|
||||
.op_type (op_type),
|
||||
.frm (frm),
|
||||
.dataa (dataa_s),
|
||||
.datab (datab_s),
|
||||
.result (per_core_result[FPU_NCP]),
|
||||
.has_fflags (per_core_has_fflags[FPU_NCP]),
|
||||
.fflags (per_core_fflags[FPU_NCP]),
|
||||
.tag_out ({ncp_rt_sext_out, ncp_rt_int_out, per_core_tag_out[FPU_NCP]}),
|
||||
.valid_out (per_core_valid_out[FPU_NCP]),
|
||||
.ready_out (per_core_ready_out[FPU_NCP])
|
||||
);
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
assign per_core_ready_in[FPU_DIVSQRT] = is_div ? div_ready_in : sqrt_ready_in;
|
||||
|
||||
VX_stream_arb #(
|
||||
.NUM_INPUTS (2),
|
||||
.DATAW (RSP_DATAW),
|
||||
.ARBITER ("R"),
|
||||
.OUT_REG (0)
|
||||
) div_sqrt_arb (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in ({sqrt_valid_out, div_valid_out}),
|
||||
.ready_in ({sqrt_ready_out, div_ready_out}),
|
||||
.data_in ({{sqrt_result, sqrt_has_fflags, sqrt_fflags, sqrt_tag_out},
|
||||
{div_result, div_has_fflags, div_fflags, div_tag_out}}),
|
||||
.data_out ({per_core_result[FPU_DIVSQRT], per_core_has_fflags[FPU_DIVSQRT], per_core_fflags[FPU_DIVSQRT], per_core_tag_out[FPU_DIVSQRT]}),
|
||||
.valid_out (per_core_valid_out[FPU_DIVSQRT]),
|
||||
.ready_out (per_core_ready_out[FPU_DIVSQRT]),
|
||||
`UNUSED_PIN (sel_out)
|
||||
);
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
reg [NUM_FPC-1:0][RSP_DATAW+2-1:0] per_core_data_out;
|
||||
|
||||
always @(*) begin
|
||||
for (integer i = 0; i < NUM_FPC; ++i) begin
|
||||
per_core_data_out[i][RSP_DATAW+1:2] = {per_core_result[i], per_core_has_fflags[i], per_core_fflags[i], per_core_tag_out[i]};
|
||||
per_core_data_out[i][1:0] = '0;
|
||||
end
|
||||
per_core_data_out[FPU_CVT][1:0] = {1'b1, cvt_rt_int_out};
|
||||
per_core_data_out[FPU_NCP][1:0] = {ncp_rt_sext_out, ncp_rt_int_out};
|
||||
end
|
||||
|
||||
wire [NUM_LANES-1:0][31:0] result_s;
|
||||
wire [1:0] op_rt_int_out;
|
||||
|
||||
VX_stream_arb #(
|
||||
.NUM_INPUTS (NUM_FPC),
|
||||
.DATAW (RSP_DATAW + 2),
|
||||
.ARBITER ("R"),
|
||||
.OUT_REG (OUT_REG)
|
||||
) rsp_arb (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (per_core_valid_out),
|
||||
.ready_in (per_core_ready_out),
|
||||
.data_in (per_core_data_out),
|
||||
.data_out ({result_s, has_fflags, fflags, tag_out, op_rt_int_out}),
|
||||
.valid_out (valid_out),
|
||||
.ready_out (ready_out),
|
||||
`UNUSED_PIN (sel_out)
|
||||
);
|
||||
|
||||
`ifndef FPU_RV64F
|
||||
`UNUSED_VAR (op_rt_int_out)
|
||||
`endif
|
||||
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
`ifdef FPU_RV64F
|
||||
reg [`XLEN-1:0] result_r;
|
||||
always @(*) begin
|
||||
case (op_rt_int_out)
|
||||
2'b11: result_r = `XLEN'($signed(result_s[i]));
|
||||
2'b01: result_r = {32'h00000000, result_s[i]};
|
||||
default: result_r = {32'hffffffff, result_s[i]};
|
||||
endcase
|
||||
end
|
||||
assign result[i] = result_r;
|
||||
`else
|
||||
assign result[i] = result_s[i];
|
||||
`endif
|
||||
end
|
||||
|
||||
// can accept new request?
|
||||
assign ready_in = per_core_ready_in[core_select];
|
||||
|
||||
endmodule
|
||||
`endif
|
||||
170
hw/rtl/fpu/VX_fpu_fma.sv
Normal file
170
hw/rtl/fpu/VX_fpu_fma.sv
Normal file
@@ -0,0 +1,170 @@
|
||||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
`include "VX_fpu_define.vh"
|
||||
|
||||
`ifdef FPU_DSP
|
||||
|
||||
module VX_fpu_fma import VX_fpu_pkg::*; #(
|
||||
parameter NUM_LANES = 1,
|
||||
parameter TAGW = 1
|
||||
) (
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
output wire ready_in,
|
||||
input wire valid_in,
|
||||
|
||||
input wire [NUM_LANES-1:0] lane_mask,
|
||||
|
||||
input wire [TAGW-1:0] tag_in,
|
||||
|
||||
input wire [`INST_FRM_BITS-1:0] frm,
|
||||
|
||||
input wire is_madd,
|
||||
input wire is_sub,
|
||||
input wire is_neg,
|
||||
|
||||
input wire [NUM_LANES-1:0][31:0] dataa,
|
||||
input wire [NUM_LANES-1:0][31:0] datab,
|
||||
input wire [NUM_LANES-1:0][31:0] datac,
|
||||
output wire [NUM_LANES-1:0][31:0] result,
|
||||
|
||||
output wire has_fflags,
|
||||
output wire [`FP_FLAGS_BITS-1:0] fflags,
|
||||
|
||||
output wire [TAGW-1:0] tag_out,
|
||||
|
||||
input wire ready_out,
|
||||
output wire valid_out
|
||||
);
|
||||
`UNUSED_VAR (frm)
|
||||
|
||||
wire stall = ~ready_out && valid_out;
|
||||
wire enable = ~stall;
|
||||
|
||||
fflags_t [NUM_LANES-1:0] per_lane_fflags;
|
||||
wire [NUM_LANES-1:0] lane_mask_out;
|
||||
|
||||
VX_shift_register #(
|
||||
.DATAW (1 + NUM_LANES + TAGW),
|
||||
.DEPTH (`LATENCY_FMA),
|
||||
.RESETW (1)
|
||||
) shift_reg (
|
||||
.clk(clk),
|
||||
.reset (reset),
|
||||
.enable (enable),
|
||||
.data_in ({valid_in, lane_mask, tag_in}),
|
||||
.data_out ({valid_out, lane_mask_out, tag_out})
|
||||
);
|
||||
|
||||
assign ready_in = enable;
|
||||
|
||||
reg [NUM_LANES-1:0][31:0] a, b, c;
|
||||
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
always @(*) begin
|
||||
if (is_madd) begin
|
||||
// MADD / MSUB / NMADD / NMSUB
|
||||
a[i] = is_neg ? {~dataa[i][31], dataa[i][30:0]} : dataa[i];
|
||||
b[i] = datab[i];
|
||||
c[i] = (is_neg ^ is_sub) ? {~datac[i][31], datac[i][30:0]} : datac[i];
|
||||
end else begin
|
||||
if (is_neg) begin
|
||||
// MUL
|
||||
a[i] = dataa[i];
|
||||
b[i] = datab[i];
|
||||
c[i] = '0;
|
||||
end else begin
|
||||
// ADD / SUB
|
||||
a[i] = 32'h3f800000; // 1.0f
|
||||
b[i] = dataa[i];
|
||||
c[i] = is_sub ? {~datab[i][31], datab[i][30:0]} : datab[i];
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
`ifdef QUARTUS
|
||||
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
acl_fmadd fmadd (
|
||||
.clk (clk),
|
||||
.areset (1'b0),
|
||||
.en (enable),
|
||||
.a (a[i]),
|
||||
.b (b[i]),
|
||||
.c (c[i]),
|
||||
.q (result[i])
|
||||
);
|
||||
end
|
||||
|
||||
assign has_fflags = 0;
|
||||
assign per_lane_fflags = 'x;
|
||||
|
||||
`elsif VIVADO
|
||||
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
wire [2:0] tuser;
|
||||
|
||||
xil_fma fma (
|
||||
.aclk (clk),
|
||||
.aclken (enable),
|
||||
.s_axis_a_tvalid (1'b1),
|
||||
.s_axis_a_tdata (a[i]),
|
||||
.s_axis_b_tvalid (1'b1),
|
||||
.s_axis_b_tdata (b[i]),
|
||||
.s_axis_c_tvalid (1'b1),
|
||||
.s_axis_c_tdata (c[i]),
|
||||
`UNUSED_PIN (m_axis_result_tvalid),
|
||||
.m_axis_result_tdata (result[i]),
|
||||
.m_axis_result_tuser (tuser)
|
||||
);
|
||||
// NV, DZ, OF, UF, NX
|
||||
assign per_lane_fflags[i] = {tuser[2], 1'b0, tuser[1], tuser[0], 1'b0};
|
||||
end
|
||||
|
||||
assign has_fflags = 1;
|
||||
|
||||
`else
|
||||
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
reg [63:0] r;
|
||||
`UNUSED_VAR (r)
|
||||
|
||||
fflags_t f;
|
||||
|
||||
always @(*) begin
|
||||
dpi_fmadd (enable && valid_in, int'(0), {32'hffffffff, a[i]}, {32'hffffffff, b[i]}, {32'hffffffff, c[i]}, frm, r, f);
|
||||
end
|
||||
|
||||
VX_shift_register #(
|
||||
.DATAW (32 + $bits(fflags_t)),
|
||||
.DEPTH (`LATENCY_FMA)
|
||||
) shift_req_dpi (
|
||||
.clk (clk),
|
||||
`UNUSED_PIN (reset),
|
||||
.enable (enable),
|
||||
.data_in ({r[31:0], f}),
|
||||
.data_out ({result[i], per_lane_fflags[i]})
|
||||
);
|
||||
end
|
||||
|
||||
assign has_fflags = 1;
|
||||
|
||||
`endif
|
||||
|
||||
`FPU_MERGE_FFLAGS(fflags, per_lane_fflags, lane_mask_out, NUM_LANES);
|
||||
|
||||
endmodule
|
||||
`endif
|
||||
286
hw/rtl/fpu/VX_fpu_fpnew.sv
Normal file
286
hw/rtl/fpu/VX_fpu_fpnew.sv
Normal file
@@ -0,0 +1,286 @@
|
||||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
`include "VX_fpu_define.vh"
|
||||
|
||||
`ifdef FPU_FPNEW
|
||||
|
||||
module VX_fpu_fpnew
|
||||
import VX_fpu_pkg::*;
|
||||
import fpnew_pkg::*;
|
||||
import cf_math_pkg::*;
|
||||
import defs_div_sqrt_mvp::*;
|
||||
#(
|
||||
parameter NUM_LANES = 1,
|
||||
parameter TAGW = 1,
|
||||
parameter OUT_REG = 0
|
||||
) (
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
input wire valid_in,
|
||||
output wire ready_in,
|
||||
|
||||
input wire [NUM_LANES-1:0] lane_mask,
|
||||
|
||||
input wire [TAGW-1:0] tag_in,
|
||||
|
||||
input wire [`INST_FPU_BITS-1:0] op_type,
|
||||
input wire [`INST_FMT_BITS-1:0] fmt,
|
||||
input wire [`INST_FRM_BITS-1:0] frm,
|
||||
|
||||
input wire [NUM_LANES-1:0][`XLEN-1:0] dataa,
|
||||
input wire [NUM_LANES-1:0][`XLEN-1:0] datab,
|
||||
input wire [NUM_LANES-1:0][`XLEN-1:0] datac,
|
||||
output wire [NUM_LANES-1:0][`XLEN-1:0] result,
|
||||
|
||||
output wire has_fflags,
|
||||
output wire [`FP_FLAGS_BITS-1:0] fflags,
|
||||
|
||||
output wire [TAGW-1:0] tag_out,
|
||||
|
||||
input wire ready_out,
|
||||
output wire valid_out
|
||||
);
|
||||
localparam LATENCY_FDIVSQRT = `MAX(`LATENCY_FDIV, `LATENCY_FSQRT);
|
||||
localparam RSP_DATAW = (NUM_LANES * `XLEN) + 1 + $bits(fflags_t) + TAGW;
|
||||
|
||||
`ifdef XLEN_64
|
||||
// use scalar configuration for mixed formats
|
||||
localparam fpnew_pkg::fpu_features_t FPU_FEATURES = '{
|
||||
Width: unsigned'(`XLEN),
|
||||
EnableVectors: 1'b0,
|
||||
EnableNanBox: 1'b1,
|
||||
`ifdef FLEN_64
|
||||
FpFmtMask: 5'b11000,
|
||||
`else
|
||||
FpFmtMask: 5'b11000, // TODO: added FP64 to fix CVT bug in FpNew
|
||||
`endif
|
||||
IntFmtMask: 4'b0011
|
||||
};
|
||||
`else
|
||||
localparam fpnew_pkg::fpu_features_t FPU_FEATURES = '{
|
||||
Width: unsigned'(`XLEN * NUM_LANES),
|
||||
EnableVectors: 1'b1,
|
||||
EnableNanBox: 1'b0,
|
||||
FpFmtMask: 5'b10000,
|
||||
IntFmtMask: 4'b0010
|
||||
};
|
||||
`endif
|
||||
|
||||
localparam fpnew_pkg::fpu_implementation_t FPU_IMPLEMENTATION = '{
|
||||
PipeRegs:'{'{`LATENCY_FMA, 0, 0, 0, 0}, // ADDMUL
|
||||
'{default: unsigned'(LATENCY_FDIVSQRT)}, // DIVSQRT
|
||||
'{default: `LATENCY_FNCP}, // NONCOMP
|
||||
'{default: `LATENCY_FCVT}}, // CONV
|
||||
UnitTypes:'{'{default: fpnew_pkg::PARALLEL}, // ADDMUL
|
||||
'{default: fpnew_pkg::MERGED}, // DIVSQRT
|
||||
'{default: fpnew_pkg::PARALLEL}, // NONCOMP
|
||||
'{default: fpnew_pkg::MERGED}}, // CONV
|
||||
PipeConfig: fpnew_pkg::DISTRIBUTED
|
||||
};
|
||||
|
||||
wire fpu_ready_in, fpu_valid_in;
|
||||
wire fpu_ready_out, fpu_valid_out;
|
||||
|
||||
reg [TAGW-1:0] fpu_tag_in, fpu_tag_out;
|
||||
|
||||
reg [2:0][NUM_LANES-1:0][`XLEN-1:0] fpu_operands;
|
||||
|
||||
wire [NUM_LANES-1:0][`XLEN-1:0] fpu_result;
|
||||
fpnew_pkg::status_t fpu_status;
|
||||
|
||||
fpnew_pkg::operation_e fpu_op;
|
||||
reg [`INST_FRM_BITS-1:0] fpu_rnd;
|
||||
reg fpu_op_mod;
|
||||
reg fpu_has_fflags, fpu_has_fflags_out;
|
||||
fpnew_pkg::fp_format_e fpu_src_fmt, fpu_dst_fmt;
|
||||
fpnew_pkg::int_format_e fpu_int_fmt;
|
||||
|
||||
`UNUSED_VAR (fmt)
|
||||
|
||||
always @(*) begin
|
||||
fpu_op = 'x;
|
||||
fpu_rnd = frm;
|
||||
fpu_op_mod = 0;
|
||||
fpu_has_fflags = 1;
|
||||
fpu_operands[0] = dataa;
|
||||
fpu_operands[1] = datab;
|
||||
fpu_operands[2] = datac;
|
||||
fpu_dst_fmt = fpnew_pkg::FP32;
|
||||
fpu_int_fmt = fpnew_pkg::INT32;
|
||||
|
||||
`ifdef FLEN_64
|
||||
if (fmt[0]) begin
|
||||
fpu_dst_fmt = fpnew_pkg::FP64;
|
||||
end
|
||||
`endif
|
||||
|
||||
`ifdef XLEN_64
|
||||
if (fmt[1]) begin
|
||||
fpu_int_fmt = fpnew_pkg::INT64;
|
||||
end
|
||||
`endif
|
||||
|
||||
fpu_src_fmt = fpu_dst_fmt;
|
||||
|
||||
case (op_type)
|
||||
`INST_FPU_ADD: begin
|
||||
fpu_op = fpnew_pkg::ADD;
|
||||
fpu_operands[1] = dataa;
|
||||
fpu_operands[2] = datab;
|
||||
end
|
||||
`INST_FPU_SUB: begin
|
||||
fpu_op = fpnew_pkg::ADD;
|
||||
fpu_operands[1] = dataa;
|
||||
fpu_operands[2] = datab;
|
||||
fpu_op_mod = 1;
|
||||
end
|
||||
`INST_FPU_MUL: begin fpu_op = fpnew_pkg::MUL; end
|
||||
`INST_FPU_DIV: begin fpu_op = fpnew_pkg::DIV; end
|
||||
`INST_FPU_SQRT: begin fpu_op = fpnew_pkg::SQRT; end
|
||||
`INST_FPU_MADD: begin fpu_op = fpnew_pkg::FMADD; end
|
||||
`INST_FPU_MSUB: begin fpu_op = fpnew_pkg::FMADD; fpu_op_mod = 1; end
|
||||
`INST_FPU_NMADD: begin fpu_op = fpnew_pkg::FNMSUB; fpu_op_mod = 1; end
|
||||
`INST_FPU_NMSUB: begin fpu_op = fpnew_pkg::FNMSUB; end
|
||||
`ifdef FLEN_64
|
||||
`INST_FPU_F2F: begin fpu_op = fpnew_pkg::F2F; fpu_src_fmt = fmt[0] ? fpnew_pkg::FP32 : fpnew_pkg::FP64; end
|
||||
`endif
|
||||
`INST_FPU_F2I,
|
||||
`INST_FPU_F2U: begin fpu_op = fpnew_pkg::F2I; fpu_op_mod = op_type[0]; end
|
||||
`INST_FPU_I2F,
|
||||
`INST_FPU_U2F: begin fpu_op = fpnew_pkg::I2F; fpu_op_mod = op_type[0]; end
|
||||
`INST_FPU_CMP: begin fpu_op = fpnew_pkg::CMP; end
|
||||
`INST_FPU_MISC:begin
|
||||
case (frm)
|
||||
0,1,2: begin fpu_op = fpnew_pkg::SGNJ; fpu_rnd = {1'b0, frm[1:0]}; fpu_has_fflags = 0; end // FSGNJ
|
||||
3: begin fpu_op = fpnew_pkg::CLASSIFY; fpu_has_fflags = 0; end // CLASS
|
||||
4,5: begin fpu_op = fpnew_pkg::SGNJ; fpu_rnd = 3'b011; fpu_op_mod = ~frm[0]; fpu_has_fflags = 0; end // FMV.X.W, FMV.W.X
|
||||
6,7: begin fpu_op = fpnew_pkg::MINMAX; fpu_rnd = {2'b00, frm[0]}; end // MIN, MAX
|
||||
endcase
|
||||
end
|
||||
default:;
|
||||
endcase
|
||||
|
||||
`ifdef FPU_RV64F
|
||||
// apply nan-boxing to floating-point operands
|
||||
for (integer i = 0; i < NUM_LANES; ++i) begin
|
||||
if (op_type != `INST_FPU_I2F && op_type != `INST_FPU_U2F) begin
|
||||
fpu_operands[0][i] |= 64'hffffffff00000000;
|
||||
end
|
||||
fpu_operands[1][i] |= 64'hffffffff00000000;
|
||||
fpu_operands[2][i] |= 64'hffffffff00000000;
|
||||
end
|
||||
`endif
|
||||
end
|
||||
|
||||
`ifdef XLEN_64
|
||||
`UNUSED_VAR (lane_mask)
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
wire [(TAGW+1)-1:0] fpu_tag;
|
||||
wire fpu_valid_out_uq;
|
||||
wire fpu_ready_in_uq;
|
||||
fpnew_pkg::status_t fpu_status_uq;
|
||||
`UNUSED_VAR (fpu_tag)
|
||||
`UNUSED_VAR (fpu_valid_out_uq)
|
||||
`UNUSED_VAR (fpu_ready_in_uq)
|
||||
`UNUSED_VAR (fpu_status_uq)
|
||||
|
||||
fpnew_top #(
|
||||
.Features (FPU_FEATURES),
|
||||
.Implementation (FPU_IMPLEMENTATION),
|
||||
.TagType (logic[(TAGW+1)-1:0])
|
||||
) fpnew_core (
|
||||
.clk_i (clk),
|
||||
.rst_ni (~reset),
|
||||
.operands_i ({fpu_operands[2][i], fpu_operands[1][i], fpu_operands[0][i]}),
|
||||
.rnd_mode_i (fpnew_pkg::roundmode_e'(fpu_rnd)),
|
||||
.op_i (fpu_op),
|
||||
.op_mod_i (fpu_op_mod),
|
||||
.src_fmt_i (fpu_src_fmt),
|
||||
.dst_fmt_i (fpu_dst_fmt),
|
||||
.int_fmt_i (fpu_int_fmt),
|
||||
`UNUSED_PIN (vectorial_op_i),
|
||||
`UNUSED_PIN (simd_mask_i),
|
||||
.tag_i ({fpu_tag_in, fpu_has_fflags}),
|
||||
.in_valid_i (fpu_valid_in),
|
||||
.in_ready_o (fpu_ready_in_uq),
|
||||
.flush_i (reset),
|
||||
.result_o (fpu_result[i]),
|
||||
.status_o (fpu_status_uq),
|
||||
.tag_o (fpu_tag),
|
||||
.out_valid_o (fpu_valid_out_uq),
|
||||
.out_ready_i (fpu_ready_out),
|
||||
`UNUSED_PIN (busy_o)
|
||||
);
|
||||
|
||||
if (i == 0) begin
|
||||
assign {fpu_tag_out, fpu_has_fflags_out} = fpu_tag;
|
||||
assign fpu_valid_out = fpu_valid_out_uq;
|
||||
assign fpu_ready_in = fpu_ready_in_uq;
|
||||
assign fpu_status = fpu_status_uq;
|
||||
end
|
||||
end
|
||||
`else
|
||||
fpnew_top #(
|
||||
.Features (FPU_FEATURES),
|
||||
.Implementation (FPU_IMPLEMENTATION),
|
||||
.TagType (logic[(TAGW+1)-1:0]),
|
||||
.TrueSIMDClass (1),
|
||||
.EnableSIMDMask (1)
|
||||
) fpnew_core (
|
||||
.clk_i (clk),
|
||||
.rst_ni (~reset),
|
||||
.operands_i (fpu_operands),
|
||||
.rnd_mode_i (fpnew_pkg::roundmode_e'(fpu_rnd)),
|
||||
.op_i (fpu_op),
|
||||
.op_mod_i (fpu_op_mod),
|
||||
.src_fmt_i (fpu_src_fmt),
|
||||
.dst_fmt_i (fpu_dst_fmt),
|
||||
.int_fmt_i (fpu_int_fmt),
|
||||
.vectorial_op_i (1'b1),
|
||||
.simd_mask_i (lane_mask),
|
||||
.tag_i ({fpu_tag_in, fpu_has_fflags}),
|
||||
.in_valid_i (fpu_valid_in),
|
||||
.in_ready_o (fpu_ready_in),
|
||||
.flush_i (reset),
|
||||
.result_o (fpu_result),
|
||||
.status_o (fpu_status),
|
||||
.tag_o ({fpu_tag_out, fpu_has_fflags_out}),
|
||||
.out_valid_o (fpu_valid_out),
|
||||
.out_ready_i (fpu_ready_out),
|
||||
`UNUSED_PIN (busy_o)
|
||||
);
|
||||
`endif
|
||||
|
||||
assign fpu_valid_in = valid_in;
|
||||
assign ready_in = fpu_ready_in;
|
||||
assign fpu_tag_in = tag_in;
|
||||
|
||||
VX_elastic_buffer #(
|
||||
.DATAW (RSP_DATAW),
|
||||
.SIZE (`OUT_REG_TO_EB_SIZE(OUT_REG)),
|
||||
.OUT_REG (`OUT_REG_TO_EB_REG(OUT_REG))
|
||||
) rsp_buf (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (fpu_valid_out),
|
||||
.ready_in (fpu_ready_out),
|
||||
.data_in ({fpu_result, fpu_has_fflags_out, fpu_status, fpu_tag_out}),
|
||||
.data_out ({result, has_fflags, fflags, tag_out}),
|
||||
.valid_out (valid_out),
|
||||
.ready_out (ready_out)
|
||||
);
|
||||
|
||||
endmodule
|
||||
`endif
|
||||
292
hw/rtl/fpu/VX_fpu_ncomp.sv
Normal file
292
hw/rtl/fpu/VX_fpu_ncomp.sv
Normal file
@@ -0,0 +1,292 @@
|
||||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
`include "VX_fpu_define.vh"
|
||||
|
||||
`ifdef FPU_DSP
|
||||
|
||||
/// Modified port of noncomp module from fpnew Libray
|
||||
/// reference: https://github.com/pulp-platform/fpnew
|
||||
|
||||
module VX_fpu_ncomp import VX_fpu_pkg::*; #(
|
||||
parameter NUM_LANES = 1,
|
||||
parameter TAGW = 1
|
||||
) (
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
output wire ready_in,
|
||||
input wire valid_in,
|
||||
|
||||
input wire [NUM_LANES-1:0] lane_mask,
|
||||
|
||||
input wire [TAGW-1:0] tag_in,
|
||||
|
||||
input wire [`INST_FPU_BITS-1:0] op_type,
|
||||
input wire [`INST_FRM_BITS-1:0] frm,
|
||||
|
||||
input wire [NUM_LANES-1:0][31:0] dataa,
|
||||
input wire [NUM_LANES-1:0][31:0] datab,
|
||||
output wire [NUM_LANES-1:0][31:0] result,
|
||||
|
||||
output wire has_fflags,
|
||||
output wire [`FP_FLAGS_BITS-1:0] fflags,
|
||||
|
||||
output wire [TAGW-1:0] tag_out,
|
||||
|
||||
input wire ready_out,
|
||||
output wire valid_out
|
||||
);
|
||||
localparam EXP_BITS = 8;
|
||||
localparam MAN_BITS = 23;
|
||||
|
||||
localparam NEG_INF = 32'h00000001,
|
||||
NEG_NORM = 32'h00000002,
|
||||
NEG_SUBNORM = 32'h00000004,
|
||||
NEG_ZERO = 32'h00000008,
|
||||
POS_ZERO = 32'h00000010,
|
||||
POS_SUBNORM = 32'h00000020,
|
||||
POS_NORM = 32'h00000040,
|
||||
POS_INF = 32'h00000080,
|
||||
//SIG_NAN = 32'h00000100,
|
||||
QUT_NAN = 32'h00000200;
|
||||
|
||||
wire [NUM_LANES-1:0] a_sign, b_sign;
|
||||
wire [NUM_LANES-1:0][7:0] a_exponent, b_exponent;
|
||||
wire [NUM_LANES-1:0][22:0] a_mantissa, b_mantissa;
|
||||
fclass_t [NUM_LANES-1:0] a_fclass, b_fclass;
|
||||
wire [NUM_LANES-1:0] a_smaller, ab_equal;
|
||||
|
||||
// Setup
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
assign a_sign[i] = dataa[i][31];
|
||||
assign a_exponent[i] = dataa[i][30:23];
|
||||
assign a_mantissa[i] = dataa[i][22:0];
|
||||
|
||||
assign b_sign[i] = datab[i][31];
|
||||
assign b_exponent[i] = datab[i][30:23];
|
||||
assign b_mantissa[i] = datab[i][22:0];
|
||||
|
||||
VX_fpu_class #(
|
||||
.EXP_BITS (EXP_BITS),
|
||||
.MAN_BITS (MAN_BITS)
|
||||
) fp_class_a (
|
||||
.exp_i (a_exponent[i]),
|
||||
.man_i (a_mantissa[i]),
|
||||
.clss_o (a_fclass[i])
|
||||
);
|
||||
|
||||
VX_fpu_class #(
|
||||
.EXP_BITS (EXP_BITS),
|
||||
.MAN_BITS (MAN_BITS)
|
||||
) fp_class_b (
|
||||
.exp_i (b_exponent[i]),
|
||||
.man_i (b_mantissa[i]),
|
||||
.clss_o (b_fclass[i])
|
||||
);
|
||||
|
||||
assign a_smaller[i] = (dataa[i] < datab[i]) ^ (a_sign[i] || b_sign[i]);
|
||||
assign ab_equal[i] = (dataa[i] == datab[i])
|
||||
|| (a_fclass[i].is_zero && b_fclass[i].is_zero); // +0 == -0
|
||||
end
|
||||
|
||||
// Pipeline stage0
|
||||
|
||||
wire valid_in_s0;
|
||||
wire [NUM_LANES-1:0] lane_mask_s0;
|
||||
wire [TAGW-1:0] tag_in_s0;
|
||||
wire [3:0] op_mod_s0;
|
||||
wire [NUM_LANES-1:0][31:0] dataa_s0, datab_s0;
|
||||
wire [NUM_LANES-1:0] a_sign_s0, b_sign_s0;
|
||||
wire [NUM_LANES-1:0][7:0] a_exponent_s0;
|
||||
wire [NUM_LANES-1:0][22:0] a_mantissa_s0;
|
||||
fclass_t [NUM_LANES-1:0] a_fclass_s0, b_fclass_s0;
|
||||
wire [NUM_LANES-1:0] a_smaller_s0, ab_equal_s0;
|
||||
|
||||
wire stall;
|
||||
|
||||
wire [3:0] op_mod = {(op_type == `INST_FPU_CMP), frm};
|
||||
|
||||
VX_pipe_register #(
|
||||
.DATAW (1 + NUM_LANES + TAGW + 4 + NUM_LANES * (2 * 32 + 1 + 1 + 8 + 23 + 2 * $bits(fclass_t) + 1 + 1)),
|
||||
.RESETW (1)
|
||||
) pipe_reg0 (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.enable (!stall),
|
||||
.data_in ({valid_in, lane_mask, tag_in, op_mod, dataa, datab, a_sign, b_sign, a_exponent, a_mantissa, a_fclass, b_fclass, a_smaller, ab_equal}),
|
||||
.data_out ({valid_in_s0, lane_mask_s0, tag_in_s0, op_mod_s0, dataa_s0, datab_s0, a_sign_s0, b_sign_s0, a_exponent_s0, a_mantissa_s0, a_fclass_s0, b_fclass_s0, a_smaller_s0, ab_equal_s0})
|
||||
);
|
||||
|
||||
// FCLASS
|
||||
reg [NUM_LANES-1:0][31:0] fclass_mask_s0; // generate a 10-bit mask for integer reg
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
always @(*) begin
|
||||
if (a_fclass_s0[i].is_normal) begin
|
||||
fclass_mask_s0[i] = a_sign_s0[i] ? NEG_NORM : POS_NORM;
|
||||
end
|
||||
else if (a_fclass_s0[i].is_inf) begin
|
||||
fclass_mask_s0[i] = a_sign_s0[i] ? NEG_INF : POS_INF;
|
||||
end
|
||||
else if (a_fclass_s0[i].is_zero) begin
|
||||
fclass_mask_s0[i] = a_sign_s0[i] ? NEG_ZERO : POS_ZERO;
|
||||
end
|
||||
else if (a_fclass_s0[i].is_subnormal) begin
|
||||
fclass_mask_s0[i] = a_sign_s0[i] ? NEG_SUBNORM : POS_SUBNORM;
|
||||
end
|
||||
else if (a_fclass_s0[i].is_nan) begin
|
||||
fclass_mask_s0[i] = {22'h0, a_fclass_s0[i].is_quiet, a_fclass_s0[i].is_signaling, 8'h0};
|
||||
end
|
||||
else begin
|
||||
fclass_mask_s0[i] = QUT_NAN;
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
// Min/Max
|
||||
reg [NUM_LANES-1:0][31:0] fminmax_res_s0;
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
always @(*) begin
|
||||
if (a_fclass_s0[i].is_nan && b_fclass_s0[i].is_nan)
|
||||
fminmax_res_s0[i] = {1'b0, 8'hff, 1'b1, 22'd0}; // canonical qNaN
|
||||
else if (a_fclass_s0[i].is_nan)
|
||||
fminmax_res_s0[i] = datab_s0[i];
|
||||
else if (b_fclass_s0[i].is_nan)
|
||||
fminmax_res_s0[i] = dataa_s0[i];
|
||||
else begin
|
||||
// FMIN, FMAX
|
||||
fminmax_res_s0[i] = (op_mod_s0[0] ^ a_smaller_s0[i]) ? dataa_s0[i] : datab_s0[i];
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
// Sign injection
|
||||
reg [NUM_LANES-1:0][31:0] fsgnj_res_s0; // result of sign injection
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
always @(*) begin
|
||||
case (op_mod_s0[1:0])
|
||||
0: fsgnj_res_s0[i] = { b_sign_s0[i], a_exponent_s0[i], a_mantissa_s0[i]};
|
||||
1: fsgnj_res_s0[i] = {~b_sign_s0[i], a_exponent_s0[i], a_mantissa_s0[i]};
|
||||
default: fsgnj_res_s0[i] = { a_sign_s0[i] ^ b_sign_s0[i], a_exponent_s0[i], a_mantissa_s0[i]};
|
||||
endcase
|
||||
end
|
||||
end
|
||||
|
||||
// Comparison
|
||||
reg [NUM_LANES-1:0] fcmp_res_s0; // result of comparison
|
||||
reg [NUM_LANES-1:0] fcmp_fflags_NV_s0; // comparison fflags
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
always @(*) begin
|
||||
case (op_mod_s0[1:0])
|
||||
0: begin // LE
|
||||
if (a_fclass_s0[i].is_nan || b_fclass_s0[i].is_nan) begin
|
||||
fcmp_res_s0[i] = 0;
|
||||
fcmp_fflags_NV_s0[i] = 1;
|
||||
end else begin
|
||||
fcmp_res_s0[i] = (a_smaller_s0[i] | ab_equal_s0[i]);
|
||||
fcmp_fflags_NV_s0[i] = 0;
|
||||
end
|
||||
end
|
||||
1: begin // LT
|
||||
if (a_fclass_s0[i].is_nan || b_fclass_s0[i].is_nan) begin
|
||||
fcmp_res_s0[i] = 0;
|
||||
fcmp_fflags_NV_s0[i] = 1;
|
||||
end else begin
|
||||
fcmp_res_s0[i] = (a_smaller_s0[i] & ~ab_equal_s0[i]);
|
||||
fcmp_fflags_NV_s0[i] = 0;
|
||||
end
|
||||
end
|
||||
2: begin // EQ
|
||||
if (a_fclass_s0[i].is_nan || b_fclass_s0[i].is_nan) begin
|
||||
fcmp_res_s0[i] = 0;
|
||||
fcmp_fflags_NV_s0[i] = a_fclass_s0[i].is_signaling | b_fclass_s0[i].is_signaling;
|
||||
end else begin
|
||||
fcmp_res_s0[i] = ab_equal_s0[i];
|
||||
fcmp_fflags_NV_s0[i] = 0;
|
||||
end
|
||||
end
|
||||
default: begin
|
||||
fcmp_res_s0[i] = 'x;
|
||||
fcmp_fflags_NV_s0[i] = 'x;
|
||||
end
|
||||
endcase
|
||||
end
|
||||
end
|
||||
|
||||
// outputs
|
||||
|
||||
reg [NUM_LANES-1:0][31:0] result_s0;
|
||||
reg [NUM_LANES-1:0] fflags_NV_s0;
|
||||
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
always @(*) begin
|
||||
case (op_mod_s0[2:0])
|
||||
0,1,2: begin
|
||||
// SGNJ, CMP
|
||||
result_s0[i] = op_mod_s0[3] ? 32'(fcmp_res_s0[i]) : fsgnj_res_s0[i];
|
||||
fflags_NV_s0[i] = fcmp_fflags_NV_s0[i];
|
||||
end
|
||||
3: begin
|
||||
// CLASS
|
||||
result_s0[i] = fclass_mask_s0[i];
|
||||
fflags_NV_s0[i] = 'x;
|
||||
end
|
||||
4,5: begin
|
||||
// FMV
|
||||
result_s0[i] = dataa_s0[i];
|
||||
fflags_NV_s0[i] = 'x;
|
||||
end
|
||||
6,7: begin
|
||||
// MIN/MAX
|
||||
result_s0[i] = fminmax_res_s0[i];
|
||||
fflags_NV_s0[i] = a_fclass_s0[i].is_signaling | b_fclass_s0[i].is_signaling;
|
||||
end
|
||||
endcase
|
||||
end
|
||||
end
|
||||
|
||||
// only MIN/MAX and CMP return status flags
|
||||
wire has_fflags_s0 = (op_mod_s0[2:0] >= 6) || op_mod_s0[3];
|
||||
|
||||
assign stall = ~ready_out && valid_out;
|
||||
|
||||
wire fflags_NV;
|
||||
reg fflags_merged;
|
||||
|
||||
always @(*) begin
|
||||
fflags_merged = 0;
|
||||
for (integer i = 0; i < NUM_LANES; ++i) begin
|
||||
if (lane_mask_s0[i]) begin
|
||||
fflags_merged |= fflags_NV_s0[i];
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
VX_pipe_register #(
|
||||
.DATAW (1 + TAGW + (NUM_LANES * 32) + 1 + 1),
|
||||
.RESETW (1)
|
||||
) pipe_reg1 (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.enable (!stall),
|
||||
.data_in ({valid_in_s0, tag_in_s0, result_s0, has_fflags_s0, fflags_merged}),
|
||||
.data_out ({valid_out, tag_out, result, has_fflags, fflags_NV})
|
||||
);
|
||||
|
||||
assign ready_in = ~stall;
|
||||
|
||||
// NV, DZ, OF, UF, NX
|
||||
assign fflags = {fflags_NV, 1'b0, 1'b0, 1'b0, 1'b0};
|
||||
|
||||
endmodule
|
||||
`endif
|
||||
41
hw/rtl/fpu/VX_fpu_pkg.sv
Normal file
41
hw/rtl/fpu/VX_fpu_pkg.sv
Normal file
@@ -0,0 +1,41 @@
|
||||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
`ifndef VX_FPU_PKG_VH
|
||||
`define VX_FPU_PKG_VH
|
||||
|
||||
`include "VX_define.vh"
|
||||
|
||||
package VX_fpu_pkg;
|
||||
|
||||
typedef struct packed {
|
||||
logic is_normal;
|
||||
logic is_zero;
|
||||
logic is_subnormal;
|
||||
logic is_inf;
|
||||
logic is_nan;
|
||||
logic is_quiet;
|
||||
logic is_signaling;
|
||||
} fclass_t;
|
||||
|
||||
typedef struct packed {
|
||||
logic NV; // 4-Invalid
|
||||
logic DZ; // 3-Divide by zero
|
||||
logic OF; // 2-Overflow
|
||||
logic UF; // 1-Underflow
|
||||
logic NX; // 0-Inexact
|
||||
} fflags_t;
|
||||
|
||||
endpackage
|
||||
|
||||
`endif // VX_FPU_PKG_VH
|
||||
79
hw/rtl/fpu/VX_fpu_rounding.sv
Normal file
79
hw/rtl/fpu/VX_fpu_rounding.sv
Normal file
@@ -0,0 +1,79 @@
|
||||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
`include "VX_fpu_define.vh"
|
||||
|
||||
`ifdef FPU_DSP
|
||||
|
||||
/// Modified port of rouding module from fpnew Libray
|
||||
/// reference: https://github.com/pulp-platform/fpnew
|
||||
|
||||
module VX_fpu_rounding #(
|
||||
parameter DAT_WIDTH = 2 // Width of the abolute value, without sign bit
|
||||
) (
|
||||
// inputs
|
||||
input wire [DAT_WIDTH-1:0] abs_value_i, // absolute value without sign
|
||||
input wire sign_i,
|
||||
// rounding information
|
||||
input wire [1:0] round_sticky_bits_i, // round and sticky bits {RS}
|
||||
input wire [2:0] rnd_mode_i,
|
||||
input wire effective_subtraction_i, // sign of inputs affects rounding of zeroes
|
||||
// outputs
|
||||
output wire [DAT_WIDTH-1:0] abs_rounded_o, // absolute value without sign
|
||||
output wire sign_o,
|
||||
output wire exact_zero_o // output is an exact zero
|
||||
);
|
||||
|
||||
reg round_up; // Rounding decision
|
||||
|
||||
// Take the rounding decision according to RISC-V spec
|
||||
// RoundMode | Mnemonic | Meaning
|
||||
// :--------:|:--------:|:-------
|
||||
// 000 | RNE | Round to Nearest, ties to Even
|
||||
// 001 | RTZ | Round towards Zero
|
||||
// 010 | RDN | Round Down (towards -\infty)
|
||||
// 011 | RUP | Round Up (towards \infty)
|
||||
// 100 | RMM | Round to Nearest, ties to Max Magnitude
|
||||
// others | | *invalid*
|
||||
|
||||
always @(*) begin
|
||||
case (rnd_mode_i)
|
||||
`INST_FRM_RNE: // Decide accoring to round/sticky bits
|
||||
case (round_sticky_bits_i)
|
||||
2'b00,
|
||||
2'b01: round_up = 1'b0; // < ulp/2 away, round down
|
||||
2'b10: round_up = abs_value_i[0]; // = ulp/2 away, round towards even result
|
||||
2'b11: round_up = 1'b1; // > ulp/2 away, round up
|
||||
default: round_up = 1'bx;
|
||||
endcase
|
||||
`INST_FRM_RTZ: round_up = 1'b0; // always round down
|
||||
`INST_FRM_RDN: round_up = (| round_sticky_bits_i) & sign_i; // to 0 if +, away if -
|
||||
`INST_FRM_RUP: round_up = (| round_sticky_bits_i) & ~sign_i; // to 0 if -, away if +
|
||||
`INST_FRM_RMM: round_up = round_sticky_bits_i[1]; // round down if < ulp/2 away, else up
|
||||
default: round_up = 1'bx; // propagate x
|
||||
endcase
|
||||
end
|
||||
|
||||
// Perform the rounding, exponent change and overflow to inf happens automagically
|
||||
assign abs_rounded_o = abs_value_i + DAT_WIDTH'(round_up);
|
||||
|
||||
// True zero result is a zero result without dirty round/sticky bits
|
||||
assign exact_zero_o = (abs_value_i == 0) && (round_sticky_bits_i == 0);
|
||||
|
||||
// In case of effective subtraction (thus signs of addition operands must have differed) and a
|
||||
// true zero result, the result sign is '-' in case of RDN and '+' for other modes.
|
||||
assign sign_o = (exact_zero_o && effective_subtraction_i) ? (rnd_mode_i == `INST_FRM_RDN)
|
||||
: sign_i;
|
||||
|
||||
endmodule
|
||||
`endif
|
||||
134
hw/rtl/fpu/VX_fpu_sqrt.sv
Normal file
134
hw/rtl/fpu/VX_fpu_sqrt.sv
Normal file
@@ -0,0 +1,134 @@
|
||||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
`include "VX_fpu_define.vh"
|
||||
|
||||
`ifdef FPU_DSP
|
||||
|
||||
module VX_fpu_sqrt import VX_fpu_pkg::*; #(
|
||||
parameter NUM_LANES = 1,
|
||||
parameter TAGW = 1
|
||||
) (
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
output wire ready_in,
|
||||
input wire valid_in,
|
||||
|
||||
input wire [NUM_LANES-1:0] lane_mask,
|
||||
|
||||
input wire [TAGW-1:0] tag_in,
|
||||
|
||||
input wire [`INST_FRM_BITS-1:0] frm,
|
||||
|
||||
input wire [NUM_LANES-1:0][31:0] dataa,
|
||||
output wire [NUM_LANES-1:0][31:0] result,
|
||||
|
||||
output wire has_fflags,
|
||||
output wire [`FP_FLAGS_BITS-1:0] fflags,
|
||||
|
||||
output wire [TAGW-1:0] tag_out,
|
||||
|
||||
input wire ready_out,
|
||||
output wire valid_out
|
||||
);
|
||||
|
||||
`UNUSED_VAR (frm)
|
||||
|
||||
wire stall = ~ready_out && valid_out;
|
||||
wire enable = ~stall;
|
||||
|
||||
fflags_t [NUM_LANES-1:0] per_lane_fflags;
|
||||
wire [NUM_LANES-1:0] lane_mask_out;
|
||||
|
||||
VX_shift_register #(
|
||||
.DATAW (1 + NUM_LANES + TAGW),
|
||||
.DEPTH (`LATENCY_FSQRT),
|
||||
.RESETW (1)
|
||||
) shift_reg (
|
||||
.clk(clk),
|
||||
.reset (reset),
|
||||
.enable (enable),
|
||||
.data_in ({valid_in, lane_mask, tag_in}),
|
||||
.data_out ({valid_out, lane_mask_out, tag_out})
|
||||
);
|
||||
|
||||
assign ready_in = enable;
|
||||
|
||||
`ifdef QUARTUS
|
||||
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
acl_fsqrt fsqrt (
|
||||
.clk (clk),
|
||||
.areset (1'b0),
|
||||
.en (enable),
|
||||
.a (dataa[i]),
|
||||
.q (result[i])
|
||||
);
|
||||
end
|
||||
|
||||
assign has_fflags = 0;
|
||||
assign per_lane_fflags = 'x;
|
||||
|
||||
`elsif VIVADO
|
||||
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
wire tuser;
|
||||
|
||||
xil_fsqrt fsqrt (
|
||||
.aclk (clk),
|
||||
.aclken (enable),
|
||||
.s_axis_a_tvalid (1'b1),
|
||||
.s_axis_a_tdata (dataa[i][31:0]),
|
||||
`UNUSED_PIN (m_axis_result_tvalid),
|
||||
.m_axis_result_tdata (result[i][31:0]),
|
||||
.m_axis_result_tuser (tuser)
|
||||
);
|
||||
// NV, DZ, OF, UF, NX
|
||||
assign per_lane_fflags[i] = {tuser, 1'b0, 1'b0, 1'b0, 1'b0};
|
||||
end
|
||||
|
||||
assign has_fflags = 1;
|
||||
|
||||
`else
|
||||
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
reg [63:0] r;
|
||||
`UNUSED_VAR (r)
|
||||
|
||||
fflags_t f;
|
||||
|
||||
always @(*) begin
|
||||
dpi_fsqrt (enable && valid_in, int'(0), {32'hffffffff, dataa[i]}, frm, r, f);
|
||||
end
|
||||
|
||||
VX_shift_register #(
|
||||
.DATAW (32 + $bits(fflags_t)),
|
||||
.DEPTH (`LATENCY_FSQRT)
|
||||
) shift_req_dpi (
|
||||
.clk (clk),
|
||||
`UNUSED_PIN (reset),
|
||||
.enable (enable),
|
||||
.data_in ({r[31:0], f}),
|
||||
.data_out ({result[i], per_lane_fflags[i]})
|
||||
);
|
||||
end
|
||||
|
||||
assign has_fflags = 1;
|
||||
|
||||
`endif
|
||||
|
||||
`FPU_MERGE_FFLAGS(fflags, per_lane_fflags, lane_mask_out, NUM_LANES);
|
||||
|
||||
endmodule
|
||||
`endif
|
||||
43
hw/rtl/fpu/VX_fpu_to_csr_if.sv
Normal file
43
hw/rtl/fpu/VX_fpu_to_csr_if.sv
Normal file
@@ -0,0 +1,43 @@
|
||||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
`include "VX_fpu_define.vh"
|
||||
|
||||
interface VX_fpu_to_csr_if import VX_fpu_pkg::*; ();
|
||||
|
||||
wire write_enable;
|
||||
wire [`NW_WIDTH-1:0] write_wid;
|
||||
fflags_t write_fflags;
|
||||
|
||||
wire [`NW_WIDTH-1:0] read_wid;
|
||||
wire [`INST_FRM_BITS-1:0] read_frm;
|
||||
|
||||
modport master (
|
||||
output write_enable,
|
||||
output write_wid,
|
||||
output write_fflags,
|
||||
|
||||
output read_wid,
|
||||
input read_frm
|
||||
);
|
||||
|
||||
modport slave (
|
||||
input write_enable,
|
||||
input write_wid,
|
||||
input write_fflags,
|
||||
|
||||
input read_wid,
|
||||
output read_frm
|
||||
);
|
||||
|
||||
endinterface
|
||||
259
hw/rtl/fpu/VX_fpu_unit.sv
Normal file
259
hw/rtl/fpu/VX_fpu_unit.sv
Normal file
@@ -0,0 +1,259 @@
|
||||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
`include "VX_define.vh"
|
||||
`include "VX_fpu_define.vh"
|
||||
|
||||
module VX_fpu_unit import VX_fpu_pkg::*; #(
|
||||
parameter CORE_ID = 0
|
||||
) (
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
VX_dispatch_if.slave dispatch_if [`ISSUE_WIDTH],
|
||||
VX_fpu_to_csr_if.master fpu_to_csr_if[`NUM_FPU_BLOCKS],
|
||||
|
||||
VX_commit_if.master commit_if [`ISSUE_WIDTH]
|
||||
);
|
||||
`UNUSED_PARAM (CORE_ID)
|
||||
localparam BLOCK_SIZE = `NUM_FPU_BLOCKS;
|
||||
localparam NUM_LANES = `NUM_FPU_LANES;
|
||||
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
|
||||
localparam PID_WIDTH = `UP(PID_BITS);
|
||||
localparam TAG_WIDTH = `LOG2UP(`FPU_REQ_QUEUE_SIZE);
|
||||
localparam PARTIAL_BW = (BLOCK_SIZE != `ISSUE_WIDTH) || (NUM_LANES != `NUM_THREADS);
|
||||
|
||||
VX_execute_if #(
|
||||
.NUM_LANES (NUM_LANES)
|
||||
) execute_if[BLOCK_SIZE]();
|
||||
|
||||
`RESET_RELAY (dispatch_reset, reset);
|
||||
|
||||
VX_dispatch_unit #(
|
||||
.BLOCK_SIZE (BLOCK_SIZE),
|
||||
.NUM_LANES (NUM_LANES),
|
||||
.OUT_REG (PARTIAL_BW ? 1 : 0)
|
||||
) dispatch_unit (
|
||||
.clk (clk),
|
||||
.reset (dispatch_reset),
|
||||
.dispatch_if(dispatch_if),
|
||||
.execute_if (execute_if)
|
||||
);
|
||||
|
||||
VX_commit_if #(
|
||||
.NUM_LANES (NUM_LANES)
|
||||
) commit_block_if[BLOCK_SIZE]();
|
||||
|
||||
for (genvar block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin
|
||||
`UNUSED_VAR (execute_if[block_idx].data.tid)
|
||||
`UNUSED_VAR (execute_if[block_idx].data.wb)
|
||||
`UNUSED_VAR (execute_if[block_idx].data.use_PC)
|
||||
`UNUSED_VAR (execute_if[block_idx].data.use_imm)
|
||||
|
||||
// Store request info
|
||||
wire fpu_req_valid, fpu_req_ready;
|
||||
wire fpu_rsp_valid, fpu_rsp_ready;
|
||||
wire [NUM_LANES-1:0][`XLEN-1:0] fpu_rsp_result;
|
||||
fflags_t fpu_rsp_fflags;
|
||||
wire fpu_rsp_has_fflags;
|
||||
|
||||
wire [`UUID_WIDTH-1:0] fpu_rsp_uuid;
|
||||
wire [`NW_WIDTH-1:0] fpu_rsp_wid;
|
||||
wire [NUM_LANES-1:0] fpu_rsp_tmask;
|
||||
wire [`XLEN-1:0] fpu_rsp_PC;
|
||||
wire [`NR_BITS-1:0] fpu_rsp_rd;
|
||||
wire [PID_WIDTH-1:0] fpu_rsp_pid;
|
||||
wire fpu_rsp_sop;
|
||||
wire fpu_rsp_eop;
|
||||
|
||||
wire [TAG_WIDTH-1:0] fpu_req_tag, fpu_rsp_tag;
|
||||
wire mdata_full;
|
||||
|
||||
wire [`INST_FMT_BITS-1:0] fpu_fmt = execute_if[block_idx].data.imm[`INST_FMT_BITS-1:0];
|
||||
wire [`INST_FRM_BITS-1:0] fpu_frm = execute_if[block_idx].data.op_mod[`INST_FRM_BITS-1:0];
|
||||
|
||||
wire execute_fire = execute_if[block_idx].valid && execute_if[block_idx].ready;
|
||||
wire fpu_rsp_fire = fpu_rsp_valid && fpu_rsp_ready;
|
||||
|
||||
VX_index_buffer #(
|
||||
.DATAW (`UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS + PID_WIDTH + 1 + 1),
|
||||
.SIZE (`FPU_REQ_QUEUE_SIZE)
|
||||
) tag_store (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.acquire_en (execute_fire),
|
||||
.write_addr (fpu_req_tag),
|
||||
.write_data ({execute_if[block_idx].data.uuid, execute_if[block_idx].data.wid, execute_if[block_idx].data.tmask, execute_if[block_idx].data.PC, execute_if[block_idx].data.rd, execute_if[block_idx].data.pid, execute_if[block_idx].data.sop, execute_if[block_idx].data.eop}),
|
||||
.read_data ({fpu_rsp_uuid, fpu_rsp_wid, fpu_rsp_tmask, fpu_rsp_PC, fpu_rsp_rd, fpu_rsp_pid, fpu_rsp_sop, fpu_rsp_eop}),
|
||||
.read_addr (fpu_rsp_tag),
|
||||
.release_en (fpu_rsp_fire),
|
||||
.full (mdata_full),
|
||||
`UNUSED_PIN (empty)
|
||||
);
|
||||
|
||||
// resolve dynamic FRM from CSR
|
||||
wire [`INST_FRM_BITS-1:0] fpu_req_frm;
|
||||
`ASSIGN_BLOCKED_WID (fpu_to_csr_if[block_idx].read_wid, execute_if[block_idx].data.wid, block_idx, `NUM_FPU_BLOCKS)
|
||||
assign fpu_req_frm = (execute_if[block_idx].data.op_type != `INST_FPU_MISC
|
||||
&& fpu_frm == `INST_FRM_DYN) ? fpu_to_csr_if[block_idx].read_frm : fpu_frm;
|
||||
|
||||
// submit FPU request
|
||||
|
||||
assign fpu_req_valid = execute_if[block_idx].valid && ~mdata_full;
|
||||
assign execute_if[block_idx].ready = fpu_req_ready && ~mdata_full;
|
||||
|
||||
`RESET_RELAY (fpu_reset, reset);
|
||||
|
||||
`ifdef FPU_DPI
|
||||
|
||||
VX_fpu_dpi #(
|
||||
.NUM_LANES (NUM_LANES),
|
||||
.TAGW (TAG_WIDTH),
|
||||
.OUT_REG (PARTIAL_BW ? 1 : 3)
|
||||
) fpu_dpi (
|
||||
.clk (clk),
|
||||
.reset (fpu_reset),
|
||||
|
||||
.valid_in (fpu_req_valid),
|
||||
.op_type (execute_if[block_idx].data.op_type),
|
||||
.lane_mask (execute_if[block_idx].data.tmask),
|
||||
.fmt (fpu_fmt),
|
||||
.frm (fpu_req_frm),
|
||||
.dataa (execute_if[block_idx].data.rs1_data),
|
||||
.datab (execute_if[block_idx].data.rs2_data),
|
||||
.datac (execute_if[block_idx].data.rs3_data),
|
||||
.tag_in (fpu_req_tag),
|
||||
.ready_in (fpu_req_ready),
|
||||
|
||||
.valid_out (fpu_rsp_valid),
|
||||
.result (fpu_rsp_result),
|
||||
.has_fflags (fpu_rsp_has_fflags),
|
||||
.fflags (fpu_rsp_fflags),
|
||||
.tag_out (fpu_rsp_tag),
|
||||
.ready_out (fpu_rsp_ready)
|
||||
);
|
||||
|
||||
`elsif FPU_FPNEW
|
||||
|
||||
VX_fpu_fpnew #(
|
||||
.NUM_LANES (NUM_LANES),
|
||||
.TAGW (TAG_WIDTH),
|
||||
.OUT_REG (PARTIAL_BW ? 1 : 3)
|
||||
) fpu_fpnew (
|
||||
.clk (clk),
|
||||
.reset (fpu_reset),
|
||||
|
||||
.valid_in (fpu_req_valid),
|
||||
.op_type (execute_if[block_idx].data.op_type),
|
||||
.lane_mask (execute_if[block_idx].data.tmask),
|
||||
.fmt (fpu_fmt),
|
||||
.frm (fpu_req_frm),
|
||||
.dataa (execute_if[block_idx].data.rs1_data),
|
||||
.datab (execute_if[block_idx].data.rs2_data),
|
||||
.datac (execute_if[block_idx].data.rs3_data),
|
||||
.tag_in (fpu_req_tag),
|
||||
.ready_in (fpu_req_ready),
|
||||
|
||||
.valid_out (fpu_rsp_valid),
|
||||
.result (fpu_rsp_result),
|
||||
.has_fflags (fpu_rsp_has_fflags),
|
||||
.fflags (fpu_rsp_fflags),
|
||||
.tag_out (fpu_rsp_tag),
|
||||
.ready_out (fpu_rsp_ready)
|
||||
);
|
||||
|
||||
`elsif FPU_DSP
|
||||
|
||||
VX_fpu_dsp #(
|
||||
.NUM_LANES (NUM_LANES),
|
||||
.TAGW (TAG_WIDTH),
|
||||
.OUT_REG (PARTIAL_BW ? 1 : 3)
|
||||
) fpu_dsp (
|
||||
.clk (clk),
|
||||
.reset (fpu_reset),
|
||||
|
||||
.valid_in (fpu_req_valid),
|
||||
.lane_mask (execute_if[block_idx].data.tmask),
|
||||
.op_type (execute_if[block_idx].data.op_type),
|
||||
.fmt (fpu_fmt),
|
||||
.frm (fpu_req_frm),
|
||||
.dataa (execute_if[block_idx].data.rs1_data),
|
||||
.datab (execute_if[block_idx].data.rs2_data),
|
||||
.datac (execute_if[block_idx].data.rs3_data),
|
||||
.tag_in (fpu_req_tag),
|
||||
.ready_in (fpu_req_ready),
|
||||
|
||||
.valid_out (fpu_rsp_valid),
|
||||
.result (fpu_rsp_result),
|
||||
.has_fflags (fpu_rsp_has_fflags),
|
||||
.fflags (fpu_rsp_fflags),
|
||||
.tag_out (fpu_rsp_tag),
|
||||
.ready_out (fpu_rsp_ready)
|
||||
);
|
||||
|
||||
`endif
|
||||
|
||||
// handle FPU response
|
||||
|
||||
fflags_t fpu_rsp_fflags_q;
|
||||
|
||||
if (PID_BITS != 0) begin
|
||||
fflags_t fpu_rsp_fflags_r;
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
fpu_rsp_fflags_r <= '0;
|
||||
end else if (fpu_rsp_fire) begin
|
||||
fpu_rsp_fflags_r <= fpu_rsp_eop ? '0 : (fpu_rsp_fflags_r | fpu_rsp_fflags);
|
||||
end
|
||||
end
|
||||
assign fpu_rsp_fflags_q = fpu_rsp_fflags_r | fpu_rsp_fflags;
|
||||
end else begin
|
||||
assign fpu_rsp_fflags_q = fpu_rsp_fflags;
|
||||
end
|
||||
|
||||
assign fpu_to_csr_if[block_idx].write_enable = fpu_rsp_fire && fpu_rsp_eop && fpu_rsp_has_fflags;
|
||||
`ASSIGN_BLOCKED_WID (fpu_to_csr_if[block_idx].write_wid, fpu_rsp_wid, block_idx, `NUM_FPU_BLOCKS)
|
||||
assign fpu_to_csr_if[block_idx].write_fflags = fpu_rsp_fflags_q;
|
||||
|
||||
// send response
|
||||
|
||||
VX_elastic_buffer #(
|
||||
.DATAW (`UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS + (NUM_LANES * `XLEN) + PID_WIDTH + 1 + 1),
|
||||
.SIZE (0)
|
||||
) rsp_buf (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (fpu_rsp_valid),
|
||||
.ready_in (fpu_rsp_ready),
|
||||
.data_in ({fpu_rsp_uuid, fpu_rsp_wid, fpu_rsp_tmask, fpu_rsp_PC, fpu_rsp_rd, fpu_rsp_result, fpu_rsp_pid, fpu_rsp_sop, fpu_rsp_eop}),
|
||||
.data_out ({commit_block_if[block_idx].data.uuid, commit_block_if[block_idx].data.wid, commit_block_if[block_idx].data.tmask, commit_block_if[block_idx].data.PC, commit_block_if[block_idx].data.rd, commit_block_if[block_idx].data.data, commit_block_if[block_idx].data.pid, commit_block_if[block_idx].data.sop, commit_block_if[block_idx].data.eop}),
|
||||
.valid_out (commit_block_if[block_idx].valid),
|
||||
.ready_out (commit_block_if[block_idx].ready)
|
||||
);
|
||||
assign commit_block_if[block_idx].data.wb = 1'b1;
|
||||
end
|
||||
|
||||
`RESET_RELAY (commit_reset, reset);
|
||||
|
||||
VX_gather_unit #(
|
||||
.BLOCK_SIZE (BLOCK_SIZE),
|
||||
.NUM_LANES (NUM_LANES),
|
||||
.OUT_REG (PARTIAL_BW ? 3 : 0)
|
||||
) gather_unit (
|
||||
.clk (clk),
|
||||
.reset (commit_reset),
|
||||
.commit_in_if (commit_block_if),
|
||||
.commit_out_if (commit_if)
|
||||
);
|
||||
|
||||
endmodule
|
||||
Reference in New Issue
Block a user