fpga fixes: warp scheduler, fnmadd, fdiv, fsqrt

This commit is contained in:
Blaise Tine
2020-09-08 07:05:26 -07:00
parent 4c08929c56
commit 36ec603d17
22 changed files with 2374 additions and 2588 deletions

View File

@@ -51,9 +51,9 @@ module VX_fp_addmul #(
.ax(dataa[i]),
.ay(datab[i]),
.az(),
.clk({2'b00,clk}),
.ena({2'b11,enable}),
.aclr(2'b00),
.clk({2'b00, clk}),
.ena({2'b00, enable}),
.aclr({reset, reset}),
.chainin(),
// outputs
.overflow(),
@@ -91,9 +91,9 @@ module VX_fp_addmul #(
.ax(dataa[i]),
.ay(datab[i]),
.az(),
.clk({2'b00,clk}),
.ena({2'b11,enable}),
.aclr(2'b00),
.clk({2'b00, clk}),
.ena({2'b00, enable}),
.aclr({reset, reset}),
.chainin(),
// outputs
.overflow(),
@@ -131,9 +131,9 @@ module VX_fp_addmul #(
.ax(),
.ay(datab[i]),
.az(dataa[i]),
.clk({2'b00,clk}),
.ena({2'b11,enable}),
.aclr(2'b00),
.clk({2'b00, clk}),
.ena({2'b00, enable}),
.aclr({reset, reset}),
.chainin(),
// outputs
.overflow(),

View File

@@ -32,7 +32,7 @@ module VX_fp_div #(
`ifdef QUARTUS
acl_fdiv fdiv (
.clk (clk),
.areset (1'b0),
.areset (reset),
.en (enable),
.a (dataa[i]),
.b (datab[i]),

View File

@@ -27,7 +27,7 @@ module VX_fp_fpga #(
input wire ready_out,
output wire valid_out
);
localparam NUM_FPC = 8;
localparam NUM_FPC = 7;
localparam FPC_BITS = `LOG2UP(NUM_FPC);
wire [NUM_FPC-1:0] per_core_ready_in;
@@ -40,28 +40,28 @@ module VX_fp_fpga #(
fflags_t [`NUM_THREADS-1:0] fpnew_fflags;
reg [FPC_BITS-1:0] core_select;
reg do_sub, do_mul;
reg do_sub, do_mul, do_neg;
reg is_signed;
always @(*) begin
core_select = 'x;
do_sub = 'x;
do_mul = 'x;
is_signed = 'x;
do_sub = 'x;
do_mul = 'x;
do_neg = 'x;
is_signed = 'x;
case (op_type)
`FPU_ADD: begin core_select = 1; do_mul = 0; do_sub = 0; end
`FPU_SUB: begin core_select = 1; do_mul = 0; do_sub = 1; end
`FPU_MUL: begin core_select = 1; do_mul = 1; do_sub = 0; end
`FPU_MADD: begin core_select = 2; do_sub = 0; end
`FPU_MSUB: begin core_select = 2; do_sub = 1; end
`FPU_NMADD: begin core_select = 3; do_sub = 0; end
`FPU_NMSUB: begin core_select = 3; do_sub = 1; end
`FPU_DIV: begin core_select = 4; end
`FPU_SQRT: begin core_select = 5; end
`FPU_CVTWS: begin core_select = 6; is_signed = 1; end
`FPU_CVTWUS: begin core_select = 6; is_signed = 0; end
`FPU_CVTSW: begin core_select = 7; is_signed = 1; end
`FPU_CVTSWU: begin core_select = 7; is_signed = 0; end
`FPU_MADD: begin core_select = 2; do_sub = 0; do_neg = 0; end
`FPU_MSUB: begin core_select = 2; do_sub = 1; do_neg = 0; end
`FPU_NMADD: begin core_select = 2; do_sub = 0; do_neg = 1; end
`FPU_NMSUB: begin core_select = 2; do_sub = 1; do_neg = 1; end
`FPU_DIV: begin core_select = 3; end
`FPU_SQRT: begin core_select = 4; end
`FPU_CVTWS: begin core_select = 5; is_signed = 1; end
`FPU_CVTWUS: begin core_select = 5; is_signed = 0; end
`FPU_CVTSW: begin core_select = 6; is_signed = 1; end
`FPU_CVTSWU: begin core_select = 6; is_signed = 0; end
default: begin core_select = 0; end
endcase
end
@@ -116,6 +116,7 @@ module VX_fp_fpga #(
.ready_in (per_core_ready_in[2]),
.tag_in (tag_in),
.do_sub (do_sub),
.do_neg (do_neg),
.dataa (dataa),
.datab (datab),
.datac (datac),
@@ -125,40 +126,21 @@ module VX_fp_fpga #(
.valid_out (per_core_valid_out[2])
);
VX_fp_nmadd #(
.TAGW (TAGW),
.LANES(`NUM_THREADS)
) fp_nmadd (
.clk (clk),
.reset (reset),
.valid_in (valid_in && (core_select == 3)),
.ready_in (per_core_ready_in[3]),
.tag_in (tag_in),
.do_sub (do_sub),
.dataa (dataa),
.datab (datab),
.datac (datac),
.result (per_core_result[3]),
.tag_out (per_core_tag_out[3]),
.ready_out (per_core_ready_out[3]),
.valid_out (per_core_valid_out[3])
);
VX_fp_div #(
.TAGW (TAGW),
.LANES(`NUM_THREADS)
) fp_div (
.clk (clk),
.reset (reset),
.valid_in (valid_in && (core_select == 4)),
.ready_in (per_core_ready_in[4]),
.valid_in (valid_in && (core_select == 3)),
.ready_in (per_core_ready_in[3]),
.tag_in (tag_in),
.dataa (dataa),
.datab (datab),
.result (per_core_result[4]),
.tag_out (per_core_tag_out[4]),
.ready_out (per_core_ready_out[4]),
.valid_out (per_core_valid_out[4])
.result (per_core_result[3]),
.tag_out (per_core_tag_out[3]),
.ready_out (per_core_ready_out[3]),
.valid_out (per_core_valid_out[3])
);
VX_fp_sqrt #(
@@ -167,14 +149,14 @@ module VX_fp_fpga #(
) fp_sqrt (
.clk (clk),
.reset (reset),
.valid_in (valid_in && (core_select == 5)),
.ready_in (per_core_ready_in[5]),
.valid_in (valid_in && (core_select == 4)),
.ready_in (per_core_ready_in[4]),
.tag_in (tag_in),
.dataa (dataa),
.result (per_core_result[5]),
.tag_out (per_core_tag_out[5]),
.ready_out (per_core_ready_out[5]),
.valid_out (per_core_valid_out[5])
.result (per_core_result[4]),
.tag_out (per_core_tag_out[4]),
.ready_out (per_core_ready_out[4]),
.valid_out (per_core_valid_out[4])
);
VX_fp_ftoi #(
@@ -183,32 +165,32 @@ module VX_fp_fpga #(
) fp_ftoi (
.clk (clk),
.reset (reset),
.valid_in (valid_in && (core_select == 6)),
.ready_in (per_core_ready_in[6]),
.valid_in (valid_in && (core_select == 5)),
.ready_in (per_core_ready_in[5]),
.tag_in (tag_in),
.is_signed (is_signed),
.dataa (dataa),
.result (per_core_result[6]),
.tag_out (per_core_tag_out[6]),
.ready_out (per_core_ready_out[6]),
.valid_out (per_core_valid_out[6])
.result (per_core_result[5]),
.tag_out (per_core_tag_out[5]),
.ready_out (per_core_ready_out[5]),
.valid_out (per_core_valid_out[5])
);
VX_fp_itof #(
.TAGW (TAGW),
.LANES(`NUM_THREADS)
) fp_itof (
.clk (clk),
.reset (reset),
.valid_in (valid_in && (core_select == 7)),
.ready_in (per_core_ready_in[7]),
.tag_in (tag_in),
.is_signed (is_signed),
.dataa (dataa),
.result (per_core_result[7]),
.tag_out (per_core_tag_out[7]),
.ready_out (per_core_ready_out[7]),
.valid_out (per_core_valid_out[7])
.clk (clk),
.reset (reset),
.valid_in (valid_in && (core_select == 6)),
.ready_in (per_core_ready_in[6]),
.tag_in (tag_in),
.is_signed (is_signed),
.dataa (dataa),
.result (per_core_result[6]),
.tag_out (per_core_tag_out[6]),
.ready_out (per_core_ready_out[6]),
.valid_out (per_core_valid_out[6])
);
reg valid_out_n;
@@ -234,7 +216,7 @@ module VX_fp_fpga #(
end
end
assign ready_in = (& per_core_ready_in);
assign ready_in = per_core_ready_in[core_select];
assign valid_out = valid_out_n;
assign has_fflags = has_fflags_n;
assign tag_out = tag_out_n;

View File

@@ -39,7 +39,7 @@ module VX_fp_ftoi #(
`ifdef QUARTUS
acl_ftoi ftoi (
.clk (clk),
.areset (1'b0),
.areset (reset),
.en (enable),
.a (dataa[i]),
.q (result_s)
@@ -47,7 +47,7 @@ module VX_fp_ftoi #(
acl_ftou ftou (
.clk (clk),
.areset (1'b0),
.areset (reset),
.en (enable),
.a (dataa[i]),
.q (result_u)

View File

@@ -39,7 +39,7 @@ module VX_fp_itof #(
`ifdef QUARTUS
acl_itof itof (
.clk (clk),
.areset (1'b0),
.areset (reset),
.en (enable),
.a (dataa[i]),
.q (result_s)
@@ -47,7 +47,7 @@ module VX_fp_itof #(
acl_utof utof (
.clk (clk),
.areset (1'b0),
.areset (reset),
.en (enable),
.a (dataa[i]),
.q (result_u)

View File

@@ -17,7 +17,8 @@ module VX_fp_madd #(
input wire [TAGW-1:0] tag_in,
input wire do_sub,
input wire do_neg,
input wire [LANES-1:0][31:0] dataa,
input wire [LANES-1:0][31:0] datab,
input wire [LANES-1:0][31:0] datac,
@@ -32,7 +33,7 @@ module VX_fp_madd #(
wire stall = ~ready_out && valid_out;
wire enable = ~stall;
reg do_sub_r;
reg do_sub_r, do_neg_r;
for (genvar i = 0; i < LANES; i++) begin
@@ -50,9 +51,9 @@ module VX_fp_madd #(
.ax(datac[i]),
.ay(datab[i]),
.az(dataa[i]),
.clk({2'b00,clk}),
.ena({2'b11,enable}),
.aclr(2'b00),
.clk({2'b00, clk}),
.ena({2'b00, enable}),
.aclr({reset, reset}),
.chainin(),
// outputs
.overflow(),
@@ -90,9 +91,9 @@ module VX_fp_madd #(
.ax(datac[i]),
.ay(datab[i]),
.az(dataa[i]),
.clk({2'b00,clk}),
.ena({2'b11,enable}),
.aclr(2'b00),
.clk({2'b00, clk}),
.ena({2'b00, enable}),
.aclr({reset, reset}),
.chainin(),
// outputs
.overflow(),
@@ -126,18 +127,20 @@ module VX_fp_madd #(
end
`endif
assign result[i] = do_sub_r ? result_msub : result_madd;
wire [31:0] result_unqual = do_sub_r ? result_msub : result_madd;
assign result[i][31] = result_unqual[31] ^ do_neg_r;
assign result[i][30:0] = result_unqual[30:0];
end
VX_shift_register #(
.DATAW(TAGW + 1 + 1),
.DATAW(TAGW + 1 + 1 + 1),
.DEPTH(`LATENCY_FMADD)
) shift_reg (
.clk(clk),
.reset(reset),
.enable(enable),
.in({tag_in, valid_in, do_sub}),
.out({tag_out, valid_out, do_sub_r})
.in({tag_in, valid_in, do_sub, do_neg}),
.out({tag_out, valid_out, do_sub_r, do_neg_r})
);
assign ready_in = enable;

View File

@@ -1,197 +0,0 @@
`include "VX_define.vh"
`ifndef SYNTHESIS
`include "float_dpi.vh"
`endif
// VX_fp_nmadd
//
// Multi-lane floating-point pipeline built from two back-to-back stages:
//   stage 0: a multiply-add and a multiply-subtract unit run in parallel on
//            (dataa, datab, datac); do_sub (delayed to do_sub_r) picks one.
//   stage 1: the selected value is fed, together with a constant zero, to a
//            subtract unit whose output drives `result`.
// In simulation the stages are modelled by dpi_fmadd / dpi_fmsub / dpi_fsub;
// under QUARTUS they are twentynm_fp_mac hard blocks.
//
// Handshake: the whole pipeline advances only while `enable` is high, i.e.
// unless a valid output is waiting on a consumer that is not ready.
//
// Parameters:
//   TAGW  - width of the tag carried alongside each request.
//   LANES - number of independent 32-bit lanes processed per request.
//
// NOTE(review): the zero-minus-value negation relies on the operand order of
// the subtract unit (see the stage-1 notes below) and, being an arithmetic
// subtract rather than a sign-bit flip, may not flip the sign of zero or NaN
// results -- confirm against the intended FNMADD/FNMSUB semantics.
module VX_fp_nmadd #(
parameter TAGW = 1,
parameter LANES = 1
) (
input wire clk,
input wire reset,
// Request side: ready_in is high whenever the pipeline is not stalled.
output wire ready_in,
input wire valid_in,
// Opaque tag, delayed by the full pipeline depth and returned on tag_out.
input wire [TAGW-1:0] tag_in,
// Selects the multiply-subtract result instead of multiply-add in stage 0.
input wire do_sub,
// Per-lane 32-bit operands.
input wire [LANES-1:0][31:0] dataa,
input wire [LANES-1:0][31:0] datab,
input wire [LANES-1:0][31:0] datac,
// Per-lane 32-bit results (output of stage 1).
output wire [LANES-1:0][31:0] result,
output wire [TAGW-1:0] tag_out,
// Response side handshake.
input wire ready_out,
output wire valid_out
);
// Stall only when an output is pending and the consumer cannot take it.
wire stall = ~ready_out && valid_out;
// Clock-enable shared by every MAC and both shift registers.
wire enable = ~stall;
// do_sub delayed by `LATENCY_FMADD cycles (driven by shift_reg0 below) so it
// lines up with the stage-0 results it selects between.
reg do_sub_r;
for (genvar i = 0; i < LANES; i++) begin
wire [31:0] result_madd;
wire [31:0] result_msub;
// Stage-0 output: pick multiply-subtract or multiply-add for this lane.
wire [31:0] result_st0 = do_sub_r ? result_msub : result_madd;
`ifdef QUARTUS
// Stage 0a: multiply-add ("sp_mult_add", adder_subtract = "false").
// Presumably computes ay*az + ax, i.e. datab*dataa + datac -- confirm
// against the vendor primitive documentation.
twentynm_fp_mac mac_fp_madd (
// inputs
.accumulate(),
.chainin_overflow(),
.chainin_invalid(),
.chainin_underflow(),
.chainin_inexact(),
.ax(datac[i]),
.ay(datab[i]),
.az(dataa[i]),
// Only bit 0 of the clock bus carries clk; all registers below are
// configured with clock "0".
.clk({2'b00,clk}),
.ena({2'b11,enable}),
// Asynchronous clear is tied off: `reset` does not reach this primitive.
.aclr(2'b00),
.chainin(),
// outputs
.overflow(),
.invalid(),
.underflow(),
.inexact(),
.chainout_overflow(),
.chainout_invalid(),
.chainout_underflow(),
.chainout_inexact(),
.resulta(result_madd),
.chainout()
);
defparam mac_fp_madd.operation_mode = "sp_mult_add";
defparam mac_fp_madd.use_chainin = "false";
defparam mac_fp_madd.adder_subtract = "false";
defparam mac_fp_madd.ax_clock = "0";
defparam mac_fp_madd.ay_clock = "0";
defparam mac_fp_madd.az_clock = "0";
defparam mac_fp_madd.output_clock = "0";
defparam mac_fp_madd.accumulate_clock = "none";
defparam mac_fp_madd.ax_chainin_pl_clock = "0";
defparam mac_fp_madd.accum_pipeline_clock = "none";
defparam mac_fp_madd.mult_pipeline_clock = "0";
defparam mac_fp_madd.adder_input_clock = "0";
defparam mac_fp_madd.accum_adder_clock = "none";
// Stage 0b: multiply-subtract. Same wiring as mac_fp_madd; the only
// configuration difference is adder_subtract = "true".
twentynm_fp_mac mac_fp_msub (
// inputs
.accumulate(),
.chainin_overflow(),
.chainin_invalid(),
.chainin_underflow(),
.chainin_inexact(),
.ax(datac[i]),
.ay(datab[i]),
.az(dataa[i]),
.clk({2'b00,clk}),
.ena({2'b11,enable}),
.aclr(2'b00),
.chainin(),
// outputs
.overflow(),
.invalid(),
.underflow(),
.inexact(),
.chainout_overflow(),
.chainout_invalid(),
.chainout_underflow(),
.chainout_inexact(),
.resulta(result_msub),
.chainout()
);
defparam mac_fp_msub.operation_mode = "sp_mult_add";
defparam mac_fp_msub.use_chainin = "false";
defparam mac_fp_msub.adder_subtract = "true";
defparam mac_fp_msub.ax_clock = "0";
defparam mac_fp_msub.ay_clock = "0";
defparam mac_fp_msub.az_clock = "0";
defparam mac_fp_msub.output_clock = "0";
defparam mac_fp_msub.accumulate_clock = "none";
defparam mac_fp_msub.ax_chainin_pl_clock = "0";
defparam mac_fp_msub.accum_pipeline_clock = "none";
defparam mac_fp_msub.mult_pipeline_clock = "none" == "none" ? "0" : "0";
defparam mac_fp_msub.adder_input_clock = "0";
defparam mac_fp_msub.accum_adder_clock = "none";
// Stage 1: adder in subtract mode ("sp_add", adder_subtract = "true") with
// ax = 0 and ay = stage-0 result; the multiplier input az is unused.
// NOTE(review): whether this yields (0 - result_st0) or (result_st0 - 0)
// depends on the primitive's operand order for subtraction -- verify
// against the vendor documentation; the simulation branch below passes
// the zero as the first data operand of dpi_fsub.
twentynm_fp_mac mac_fp_neg (
// inputs
.accumulate(),
.chainin_overflow(),
.chainin_invalid(),
.chainin_underflow(),
.chainin_inexact(),
.ax(32'h0),
.ay(result_st0),
.az(),
.clk({2'b00,clk}),
.ena({2'b11,enable}),
.aclr(2'b00),
.chainin(),
// outputs
.overflow(),
.invalid(),
.underflow(),
.inexact(),
.chainout_overflow(),
.chainout_invalid(),
.chainout_underflow(),
.chainout_inexact(),
.resulta(result[i]),
.chainout()
);
defparam mac_fp_neg.operation_mode = "sp_add";
defparam mac_fp_neg.use_chainin = "false";
defparam mac_fp_neg.adder_subtract = "true";
defparam mac_fp_neg.ax_clock = "0";
defparam mac_fp_neg.ay_clock = "0";
defparam mac_fp_neg.az_clock = "none";
defparam mac_fp_neg.output_clock = "0";
defparam mac_fp_neg.accumulate_clock = "none";
defparam mac_fp_neg.ax_chainin_pl_clock = "none";
defparam mac_fp_neg.accum_pipeline_clock = "none";
defparam mac_fp_neg.mult_pipeline_clock = "none";
defparam mac_fp_neg.adder_input_clock = "0";
defparam mac_fp_neg.accum_adder_clock = "none";
`else
// Simulation model: one DPI call per stage per lane, every clock edge.
// The first argument (k*LANES+i) is distinct for each call site and lane;
// presumably an instance id used by the DPI side to keep per-unit pipeline
// state -- confirm in float_dpi.vh.
always @(posedge clk) begin
dpi_fmadd(5*LANES+i, enable, dataa[i], datab[i], datac[i], result_madd);
dpi_fmsub(6*LANES+i, enable, dataa[i], datab[i], datac[i], result_msub);
// Zero and the stage-0 result go in as the two data operands; presumably
// computes 0 - result_st0 (operand order defined by the DPI function).
dpi_fsub(7*LANES+i, enable, 32'b0, result_st0, result[i]);
end
`endif
end
// Delays do_sub by the stage-0 latency only, so do_sub_r is aligned with
// result_madd / result_msub when result_st0 is selected.
VX_shift_register #(
.DATAW(1),
.DEPTH(`LATENCY_FMADD)
) shift_reg0 (
.clk(clk),
.reset(reset),
.enable(enable),
.in({do_sub}),
.out({do_sub_r})
);
// Delays tag and valid by the latency of both stages (stage 0 plus the
// stage-1 adder) so they emerge together with `result`.
VX_shift_register #(
.DATAW(TAGW + 1),
.DEPTH(`LATENCY_FMADD + `LATENCY_FADDMUL)
) shift_reg1 (
.clk(clk),
.reset(reset),
.enable(enable),
.in({tag_in, valid_in}),
.out({tag_out, valid_out})
);
// A new request is accepted on any cycle in which the pipeline advances.
assign ready_in = enable;
endmodule

View File

@@ -31,7 +31,7 @@ module VX_fp_sqrt #(
`ifdef QUARTUS
acl_fsqrt fsqrt (
.clk (clk),
.areset (1'b0),
.areset (reset),
.en (enable),
.a (dataa[i]),
.q (result[i])

View File

@@ -91,6 +91,7 @@ module VX_fpnew #(
fpu_operands[0] = dataa;
fpu_operands[1] = datab;
fpu_operands[2] = datac;
case (op_type)
`FPU_ADD: begin
fpu_op = fpnew_pkg::ADD;
@@ -107,23 +108,23 @@ module VX_fpnew #(
`FPU_DIV: begin fpu_op = fpnew_pkg::DIV; end
`FPU_SQRT: begin fpu_op = fpnew_pkg::SQRT; end
`FPU_MADD: begin fpu_op = fpnew_pkg::FMADD; end
`FPU_MSUB: begin fpu_op = fpnew_pkg::FMADD; fpu_op_mod = 1; end
`FPU_NMSUB: begin fpu_op = fpnew_pkg::FNMSUB; end
`FPU_MSUB: begin fpu_op = fpnew_pkg::FMADD; fpu_op_mod = 1; end
`FPU_NMADD: begin fpu_op = fpnew_pkg::FNMSUB; fpu_op_mod = 1; end
`FPU_NMSUB: begin fpu_op = fpnew_pkg::FNMSUB; end
`FPU_CVTWS: begin fpu_op = fpnew_pkg::F2I; end
`FPU_CVTWUS:begin fpu_op = fpnew_pkg::F2I; fpu_op_mod = 1; end
`FPU_CVTWUS:begin fpu_op = fpnew_pkg::F2I; fpu_op_mod = 1; end
`FPU_CVTSW: begin fpu_op = fpnew_pkg::I2F; end
`FPU_CVTSWU:begin fpu_op = fpnew_pkg::I2F; fpu_op_mod = 1; end
`FPU_CVTSWU:begin fpu_op = fpnew_pkg::I2F; fpu_op_mod = 1; end
`FPU_CLASS: begin fpu_op = fpnew_pkg::CLASSIFY; fpu_has_fflags = 0; end
`FPU_CMP: begin fpu_op = fpnew_pkg::CMP; end
`FPU_MISC: begin
case (frm)
0: begin fpu_op = fpnew_pkg::SGNJ; fpu_rnd = `FRM_RNE; fpu_has_fflags = 0; end
1: begin fpu_op = fpnew_pkg::SGNJ; fpu_rnd = `FRM_RTZ; fpu_has_fflags = 0; end
2: begin fpu_op = fpnew_pkg::SGNJ; fpu_rnd = `FRM_RDN; fpu_has_fflags = 0; end
3: begin fpu_op = fpnew_pkg::MINMAX; fpu_rnd = `FRM_RNE; end
4: begin fpu_op = fpnew_pkg::MINMAX; fpu_rnd = `FRM_RTZ; end
default: begin fpu_op = fpnew_pkg::SGNJ; fpu_rnd = `FRM_RUP; fpu_has_fflags = 0; end
0: begin fpu_op = fpnew_pkg::SGNJ; fpu_rnd = `FRM_RNE; fpu_has_fflags = 0; end
1: begin fpu_op = fpnew_pkg::SGNJ; fpu_rnd = `FRM_RTZ; fpu_has_fflags = 0; end
2: begin fpu_op = fpnew_pkg::SGNJ; fpu_rnd = `FRM_RDN; fpu_has_fflags = 0; end
3: begin fpu_op = fpnew_pkg::MINMAX; fpu_rnd = `FRM_RNE; end
4: begin fpu_op = fpnew_pkg::MINMAX; fpu_rnd = `FRM_RTZ; end
default: begin fpu_op = fpnew_pkg::SGNJ; fpu_rnd = `FRM_RUP; fpu_has_fflags = 0; end
endcase
end
default:;