Multiplier unit optimization: use a FIFO for request metadata; shift register optimization

This commit is contained in:
Blaise Tine
2020-12-26 11:23:21 -08:00
parent b459192dec
commit 33c431ed44
13 changed files with 171 additions and 76 deletions

View File

@@ -246,12 +246,12 @@
// Size of MUL Request Queue // Size of MUL Request Queue
`ifndef MULQ_SIZE `ifndef MULQ_SIZE
`define MULQ_SIZE 4 `define MULQ_SIZE 8
`endif `endif
// Size of FPU Request Queue // Size of FPU Request Queue
`ifndef FPUQ_SIZE `ifndef FPUQ_SIZE
`define FPUQ_SIZE 4 `define FPUQ_SIZE 8
`endif `endif
// Icache Configurable Knobs ////////////////////////////////////////////////// // Icache Configurable Knobs //////////////////////////////////////////////////

View File

@@ -43,7 +43,7 @@ module VX_fpu_unit #(
.DATAW (`NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1), .DATAW (`NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1),
.SIZE (`FPUQ_SIZE), .SIZE (`FPUQ_SIZE),
.FASTRAM (1) .FASTRAM (1)
) req_metadata_buf ( ) req_metadata (
.clk (clk), .clk (clk),
.reset (reset), .reset (reset),
.acquire_slot (fpuq_push), .acquire_slot (fpuq_push),

View File

@@ -119,7 +119,7 @@ module VX_lsu_unit #(
.DATAW (`NW_BITS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 2) + 2), .DATAW (`NW_BITS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 2) + 2),
.SIZE (`LSUQ_SIZE), .SIZE (`LSUQ_SIZE),
.FASTRAM (1) .FASTRAM (1)
) req_metadata_buf ( ) req_metadata (
.clk (clk), .clk (clk),
.reset (reset), .reset (reset),
.write_addr (mbuf_waddr), .write_addr (mbuf_waddr),

View File

@@ -19,47 +19,43 @@ module VX_mul_unit #(
wire [`NUM_THREADS-1:0][31:0] alu_in1 = mul_req_if.rs1_data; wire [`NUM_THREADS-1:0][31:0] alu_in1 = mul_req_if.rs1_data;
wire [`NUM_THREADS-1:0][31:0] alu_in2 = mul_req_if.rs2_data; wire [`NUM_THREADS-1:0][31:0] alu_in2 = mul_req_if.rs2_data;
wire [`NW_BITS-1:0] rsp_wid; wire ready_out;
wire [`NUM_THREADS-1:0] rsp_tmask;
wire [31:0] rsp_PC;
wire [`NR_BITS-1:0] rsp_rd;
wire rsp_wb;
wire [MULQ_BITS-1:0] tag_in, tag_out;
wire valid_out, ready_out;
wire mulq_full;
wire mulq_push = mul_req_if.valid && mul_req_if.ready;
wire mulq_pop = valid_out && ready_out;
VX_index_buffer #(
.DATAW (`NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1),
.SIZE (`MULQ_SIZE),
.FASTRAM (1)
) req_metadata_buf (
.clk (clk),
.reset (reset),
.acquire_slot (mulq_push),
.write_addr (tag_in),
.read_addr (tag_out),
.release_addr (tag_out),
.write_data ({mul_req_if.wid, mul_req_if.tmask, mul_req_if.PC, mul_req_if.rd, mul_req_if.wb}),
.read_data ({rsp_wid, rsp_tmask, rsp_PC, rsp_rd, rsp_wb}),
.release_slot (mulq_pop),
.full (mulq_full)
);
wire valid_in = mul_req_if.valid && ~mulq_full;
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
wire [`NUM_THREADS-1:0][31:0] mul_result; wire [`NUM_THREADS-1:0][31:0] mul_result;
wire [MULQ_BITS-1:0] mul_tag; wire [`NW_BITS-1:0] mul_wid_out;
wire is_mul_in = (alu_op == `MUL_MUL); wire [`NUM_THREADS-1:0] mul_tmask_out;
wire is_mul_out; wire [31:0] mul_PC_out;
wire [`NR_BITS-1:0] mul_rd_out;
wire mul_wb_out;
wire mul_valid_out; wire mul_valid_out;
wire mul_valid_in = valid_in && !is_div_op; wire mul_valid_in = mul_req_if.valid && !is_div_op && ~mulq_full;
wire mul_ready_in = ready_out || ~mul_valid_out; wire mul_ready_in = ready_out || ~mul_valid_out;
wire mulq_push = mul_valid_in && mul_ready_in;
wire mulq_pop = mul_valid_out && ready_out;
wire mulq_full;
wire is_mulh_in = (alu_op != `MUL_MUL);
wire is_mulh_out;
VX_generic_queue #(
.DATAW (`NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + 1),
.SIZE (`MULQ_SIZE),
.FASTRAM (1)
) mul_metadata (
.clk (clk),
.reset (reset),
.push (mulq_push),
.pop (mulq_pop),
.data_in ({mul_req_if.wid, mul_req_if.tmask, mul_req_if.PC, mul_req_if.rd, mul_req_if.wb, is_mulh_in}),
.data_out ({mul_wid_out, mul_tmask_out, mul_PC_out, mul_rd_out, mul_wb_out, is_mulh_out}),
.full (mulq_full),
`UNUSED_PIN (empty),
`UNUSED_PIN (size)
);
for (genvar i = 0; i < `NUM_THREADS; i++) begin for (genvar i = 0; i < `NUM_THREADS; i++) begin
@@ -83,32 +79,36 @@ module VX_mul_unit #(
.result(mul_result_tmp) .result(mul_result_tmp)
); );
assign mul_result[i] = is_mul_out ? mul_result_tmp[31:0] : mul_result_tmp[63:32]; assign mul_result[i] = is_mulh_out ? mul_result_tmp[63:32] : mul_result_tmp[31:0];
end end
VX_shift_register #( VX_shift_register #(
.DATAW(1 + MULQ_BITS + 1), .DATAW(1),
.DEPTH(`LATENCY_IMUL) .DEPTH(`LATENCY_IMUL)
) mul_shift_reg ( ) mul_shift_reg (
.clk(clk), .clk(clk),
.reset(reset), .reset(reset),
.enable(mul_ready_in), .enable(mul_ready_in),
.in({mul_valid_in, tag_in, is_mul_in}), .data_in(mul_valid_in),
.out({mul_valid_out, mul_tag, is_mul_out}) .data_out(mul_valid_out)
); );
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
wire [`NUM_THREADS-1:0][31:0] div_result_tmp, rem_result_tmp; wire [`NUM_THREADS-1:0][31:0] div_result_tmp, rem_result_tmp;
wire [`NW_BITS-1:0] div_wid_out;
wire [`NUM_THREADS-1:0] div_tmask_out;
wire [31:0] div_PC_out;
wire [`NR_BITS-1:0] div_rd_out;
wire div_wb_out;
wire is_rem_op = (alu_op == `MUL_REM) || (alu_op == `MUL_REMU); wire is_rem_op_in = (alu_op == `MUL_REM) || (alu_op == `MUL_REMU);
wire is_signed_div = (alu_op == `MUL_DIV) || (alu_op == `MUL_REM); wire is_signed_div = (alu_op == `MUL_DIV) || (alu_op == `MUL_REM);
wire div_valid_in = valid_in && is_div_op; wire div_valid_in = mul_req_if.valid && is_div_op;
wire div_ready_out = ready_out && ~mul_valid_out; // arbitration prioritizes MUL wire div_ready_out = ready_out && ~mul_valid_out; // arbitration prioritizes MUL
wire div_ready_in; wire div_ready_in;
wire div_valid_out; wire div_valid_out;
wire is_rem_op_out; wire is_rem_op_out;
wire [MULQ_BITS-1:0] div_tag;
VX_serial_div #( VX_serial_div #(
.WIDTHN(32), .WIDTHN(32),
@@ -116,21 +116,21 @@ module VX_mul_unit #(
.WIDTHQ(32), .WIDTHQ(32),
.WIDTHR(32), .WIDTHR(32),
.LANES(`NUM_THREADS), .LANES(`NUM_THREADS),
.TAGW(MULQ_BITS + 1) .TAGW(`NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + 1)
) divide ( ) divide (
.clk(clk), .clk(clk),
.reset(reset), .reset(reset),
.ready_in(div_ready_in),
.valid_in(div_valid_in), .valid_in(div_valid_in),
.ready_in(div_ready_in),
.signed_mode(is_signed_div), .signed_mode(is_signed_div),
.tag_in({tag_in, is_rem_op}), .tag_in({mul_req_if.wid, mul_req_if.tmask, mul_req_if.PC, mul_req_if.rd, mul_req_if.wb, is_rem_op_in}),
.numer(alu_in1), .numer(alu_in1),
.denom(alu_in2), .denom(alu_in2),
.quotient(div_result_tmp), .quotient(div_result_tmp),
.remainder(rem_result_tmp), .remainder(rem_result_tmp),
.ready_out(div_ready_out), .ready_out(div_ready_out),
.valid_out(div_valid_out), .valid_out(div_valid_out),
.tag_out({div_tag, is_rem_op_out}) .tag_out({div_wid_out, div_tmask_out, div_PC_out, div_rd_out, div_wb_out, is_rem_op_out})
); );
wire [`NUM_THREADS-1:0][31:0] div_result = is_rem_op_out ? rem_result_tmp : div_result_tmp; wire [`NUM_THREADS-1:0][31:0] div_result = is_rem_op_out ? rem_result_tmp : div_result_tmp;
@@ -140,9 +140,13 @@ module VX_mul_unit #(
wire stall_out = ~mul_commit_if.ready && mul_commit_if.valid; wire stall_out = ~mul_commit_if.ready && mul_commit_if.valid;
assign ready_out = ~stall_out; assign ready_out = ~stall_out;
assign valid_out = mul_valid_out || div_valid_out; wire rsp_valid = mul_valid_out || div_valid_out;
assign tag_out = mul_valid_out ? mul_tag : div_tag; wire [`NW_BITS-1:0] rsp_wid = mul_valid_out ? mul_wid_out : div_wid_out;
wire [`NUM_THREADS-1:0][31:0] result = mul_valid_out ? mul_result : div_result; wire [`NUM_THREADS-1:0] rsp_tmask = mul_valid_out ? mul_tmask_out : div_tmask_out;
wire [31:0] rsp_PC = mul_valid_out ? mul_PC_out : div_PC_out;
wire [`NR_BITS-1:0] rsp_rd = mul_valid_out ? mul_rd_out : div_rd_out;
wire rsp_wb = mul_valid_out ? mul_wb_out : div_wb_out;
wire [`NUM_THREADS-1:0][31:0] rsp_data = mul_valid_out ? mul_result : div_result;
VX_generic_register #( VX_generic_register #(
.N(1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32)), .N(1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32)),
@@ -152,11 +156,11 @@ module VX_mul_unit #(
.reset (reset), .reset (reset),
.stall (stall_out), .stall (stall_out),
.flush (1'b0), .flush (1'b0),
.data_in ({valid_out, rsp_wid, rsp_tmask, rsp_PC, rsp_rd, rsp_wb, result}), .data_in ({rsp_valid, rsp_wid, rsp_tmask, rsp_PC, rsp_rd, rsp_wb, rsp_data}),
.data_out ({mul_commit_if.valid, mul_commit_if.wid, mul_commit_if.tmask, mul_commit_if.PC, mul_commit_if.rd, mul_commit_if.wb, mul_commit_if.data}) .data_out ({mul_commit_if.valid, mul_commit_if.wid, mul_commit_if.tmask, mul_commit_if.PC, mul_commit_if.rd, mul_commit_if.wb, mul_commit_if.data})
); );
// can accept new request? // can accept new request?
assign mul_req_if.ready = (is_div_op ? div_ready_in : mul_ready_in) && ~mulq_full; assign mul_req_if.ready = is_div_op ? div_ready_in : (mul_ready_in && ~mulq_full);
endmodule endmodule

View File

@@ -718,8 +718,8 @@ VX_generic_queue #(
.clk (clk), .clk (clk),
.reset (reset), .reset (reset),
.push (cci_rdq_push), .push (cci_rdq_push),
.data_in (cci_rdq_din),
.pop (cci_rdq_pop), .pop (cci_rdq_pop),
.data_in (cci_rdq_din),
.data_out (cci_rdq_dout), .data_out (cci_rdq_dout),
.empty (cci_rdq_empty), .empty (cci_rdq_empty),
`UNUSED_PIN (full), `UNUSED_PIN (full),

View File

@@ -184,8 +184,8 @@ module VX_fp_addmul #(
.clk(clk), .clk(clk),
.reset(reset), .reset(reset),
.enable(enable), .enable(enable),
.in({valid_in, tag_in, do_sub, do_mul}), .data_in({valid_in, tag_in, do_sub, do_mul}),
.out({valid_out, tag_out, do_sub_r, do_mul_r}) .data_out({valid_out, tag_out, do_sub_r, do_mul_r})
); );
assign ready_in = enable; assign ready_in = enable;

View File

@@ -56,8 +56,8 @@ module VX_fp_div #(
.clk(clk), .clk(clk),
.reset(reset), .reset(reset),
.enable(enable), .enable(enable),
.in ({valid_in, tag_in}), .data_in ({valid_in, tag_in}),
.out({valid_out, tag_out}) .data_out({valid_out, tag_out})
); );
assign ready_in = enable; assign ready_in = enable;

View File

@@ -74,8 +74,8 @@ module VX_fp_ftoi #(
.clk(clk), .clk(clk),
.reset(reset), .reset(reset),
.enable(enable), .enable(enable),
.in ({valid_in, tag_in, is_signed}), .data_in ({valid_in, tag_in, is_signed}),
.out({valid_out, tag_out, is_signed_r}) .data_out({valid_out, tag_out, is_signed_r})
); );
assign ready_in = enable; assign ready_in = enable;

View File

@@ -74,8 +74,8 @@ module VX_fp_itof #(
.clk(clk), .clk(clk),
.reset(reset), .reset(reset),
.enable(enable), .enable(enable),
.in ({valid_in, tag_in, is_signed}), .data_in ({valid_in, tag_in, is_signed}),
.out({valid_out, tag_out, is_signed_r}) .data_out({valid_out, tag_out, is_signed_r})
); );
assign ready_in = enable; assign ready_in = enable;

View File

@@ -144,8 +144,8 @@ module VX_fp_madd #(
.clk(clk), .clk(clk),
.reset(reset), .reset(reset),
.enable(enable), .enable(enable),
.in({valid_in, tag_in, do_sub, do_neg}), .data_in({valid_in, tag_in, do_sub, do_neg}),
.out({valid_out, tag_out, do_sub_r, do_neg_r}) .data_out({valid_out, tag_out, do_sub_r, do_neg_r})
); );
assign ready_in = enable; assign ready_in = enable;

View File

@@ -54,8 +54,8 @@ module VX_fp_sqrt #(
.clk(clk), .clk(clk),
.reset(reset), .reset(reset),
.enable(enable), .enable(enable),
.in ({valid_in, tag_in}), .data_in ({valid_in, tag_in}),
.out({valid_out, tag_out}) .data_out({valid_out, tag_out})
); );
assign ready_in = enable; assign ready_in = enable;

View File

@@ -1,6 +1,6 @@
`include "VX_platform.vh" `include "VX_platform.vh"
module VX_divide #( module VX_divider #(
parameter WIDTHN = 1, parameter WIDTHN = 1,
parameter WIDTHD = 1, parameter WIDTHD = 1,
parameter WIDTHQ = 1, parameter WIDTHQ = 1,

View File

@@ -1,14 +1,105 @@
`include "VX_platform.vh" `include "VX_platform.vh"
// Shift register front end that picks a reset strategy from RESETW.
//
//   RESETW == DATAW     : the whole entry goes through VX_shift_register_wr
//                         (the variant that has a reset input).
//   0 < RESETW < DATAW  : the upper RESETW bits go through VX_shift_register_wr
//                         and the remaining lower bits go through
//                         VX_shift_register_nr (the variant with no reset input).
//   RESETW == 0         : the whole entry goes through VX_shift_register_nr and
//                         the reset input is unused.
//
// Parameters:
//   DATAW  - width in bits of data_in / data_out.
//   RESETW - number of most-significant bits routed to the resettable variant.
//            NOTE(review): values greater than DATAW are not handled here.
//   DEPTH  - number of stages, forwarded unchanged to the selected variant(s).
//
// Ports:
//   clk, reset, enable - forwarded to the sub-instances (reset only to _wr).
//   data_in            - value shifted in.
//   data_out           - value produced by the selected variant(s).
module VX_shift_register #(
    parameter DATAW  = 1,
    parameter RESETW = DATAW,
    parameter DEPTH  = 1
) (
    input wire              clk,
    input wire              reset,
    input wire              enable,
    input wire [DATAW-1:0]  data_in,
    output wire [DATAW-1:0] data_out
);
    if (RESETW != 0) begin
        if (RESETW == DATAW) begin
            // Every bit is resettable: a single resettable instance.
            VX_shift_register_wr #(
                .DATAW (DATAW),
                .DEPTH (DEPTH)
            ) sr (
                .clk      (clk),
                .reset    (reset),
                .enable   (enable),
                .data_in  (data_in),
                .data_out (data_out)
            );
        end else begin
            // Split the entry. Each half is sized to the slice it actually
            // carries so the port widths match the connected expressions
            // and no unused register bits are instantiated.
            VX_shift_register_wr #(
                .DATAW (RESETW),
                .DEPTH (DEPTH)
            ) sr_wr (
                .clk      (clk),
                .reset    (reset),
                .enable   (enable),
                .data_in  (data_in[DATAW-1:DATAW-RESETW]),
                .data_out (data_out[DATAW-1:DATAW-RESETW])
            );

            VX_shift_register_nr #(
                .DATAW (DATAW-RESETW),
                .DEPTH (DEPTH)
            ) sr_nr (
                .clk      (clk),
                .enable   (enable),
                .data_in  (data_in[DATAW-RESETW-1:0]),
                .data_out (data_out[DATAW-RESETW-1:0])
            );
        end
    end else begin
        // No resettable bits requested: reset is intentionally unconnected.
        `UNUSED_VAR (reset)
        VX_shift_register_nr #(
            .DATAW (DATAW),
            .DEPTH (DEPTH)
        ) sr (
            .clk      (clk),
            .enable   (enable),
            .data_in  (data_in),
            .data_out (data_out)
        );
    end

endmodule
// Shift register without a reset input.
//
// On each rising clock edge where `enable` is high, `data_in` is captured
// into stage 0 and every other stage takes the previous value of the stage
// before it. `data_out` continuously reflects the last stage (DEPTH-1).
// When `enable` is low the stages hold their contents.
//
// Parameters:
//   DATAW - width in bits of each stage.
//   DEPTH - number of stages.
module VX_shift_register_nr #(
    parameter DATAW = 1,
    parameter DEPTH = 1
) (
    input wire              clk,
    input wire              enable,
    input wire [DATAW-1:0]  data_in,
    output wire [DATAW-1:0] data_out
);
    // One DATAW-wide word per stage; no reset, so contents are undefined
    // in simulation until DEPTH enabled cycles have elapsed.
    reg [DATAW-1:0] pipe [DEPTH-1:0];

    always @(posedge clk) begin
        if (enable) begin
            pipe[0] <= data_in;
            // Non-blocking assignments: every stage samples its
            // predecessor's pre-edge value, so iteration order is irrelevant.
            for (integer k = 1; k < DEPTH; k++) begin
                pipe[k] <= pipe[k-1];
            end
        end
    end

    assign data_out = pipe[DEPTH-1];

endmodule
module VX_shift_register_wr #(
parameter DATAW = 1, parameter DATAW = 1,
parameter DEPTH = 1 parameter DEPTH = 1
) ( ) (
input wire clk, input wire clk,
input wire reset, input wire reset,
input wire enable, input wire enable,
input wire [DATAW-1:0] in, input wire [DATAW-1:0] data_in,
output wire [DATAW-1:0] out output wire [DATAW-1:0] data_out
); );
reg [DEPTH-1:0][DATAW-1:0] entries; reg [DEPTH-1:0][DATAW-1:0] entries;
@@ -19,7 +110,7 @@ module VX_shift_register #(
entries <= (DEPTH * DATAW)'(0); entries <= (DEPTH * DATAW)'(0);
end else begin end else begin
if (enable) begin if (enable) begin
entries <= in; entries <= data_in;
end end
end end
end end
@@ -31,12 +122,12 @@ module VX_shift_register #(
entries <= (DEPTH * DATAW)'(0); entries <= (DEPTH * DATAW)'(0);
end else begin end else begin
if (enable) begin if (enable) begin
entries <= {entries[DEPTH-2:0], in}; entries <= {entries[DEPTH-2:0], data_in};
end end
end end
end end
end end
assign out = entries [DEPTH-1]; assign data_out = entries [DEPTH-1];
endmodule endmodule