Files
kernels/hw/rtl/core/VX_muldiv_unit.sv
Blaise Tine c1e168fdbe Vortex 2.0 changes:
+ Microarchitecture optimizations
+ 64-bit support
+ Xilinx FPGA support
+ LLVM-16 support
+ Refactoring and quality control fixes

minor update

minor update

minor update

minor update

minor update

minor update

cleanup

cleanup

cache bindings and memory perf refactoring

minor update

minor update

hw unit tests fixes

minor update

minor update

minor update

minor update

minor update

minor update

minor update

minor update

minor update

minor update

minor update

minor update

minor update

minor updates

minor updates

minor update

minor update

minor update

minor update

minor update

minor update

minor updates

minor updates

minor updates

minor updates

minor update

minor update
2023-11-10 02:47:05 -08:00

337 lines
13 KiB
SystemVerilog

// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
// VX_muldiv_unit: integer multiply / divide execution unit.
//
// Each request arriving on execute_if is steered to one of two datapaths:
//   - the multiplier path when INST_M_IS_MULX(op) is true,
//   - the divider path otherwise.
// The two result streams are merged onto commit_if by a 2-input VX_stream_arb.
//
// Compile-time configuration seen in this module:
//   IMUL_DPI : multiply is computed by the dpi_imul call and delayed through a
//              shift register instead of using a hardware multiplier.
//   IDIV_DPI : divide is computed by the dpi_idiv call and delayed through a
//              shift register instead of using VX_serial_div.
//   XLEN_64  : enables the 32-bit "W" operation handling and, when IMUL_DPI is
//              not defined, selects VX_serial_mul instead of VX_multiplier.
//
// Parameters:
//   CORE_ID   : not used in this module (flagged with UNUSED_PARAM).
//   NUM_LANES : number of lanes (operand/result slots) handled per request.
//
// Ports:
//   clk, reset : clock and reset forwarded to all sub-blocks.
//   execute_if : incoming request (valid/ready + data: op_type, op_mod, operands, tag fields).
//   commit_if  : outgoing result (valid/ready + data: tag fields and per-lane result).
module VX_muldiv_unit #(
parameter CORE_ID = 0,
parameter NUM_LANES = 1
) (
input wire clk,
input wire reset,
// Inputs
VX_execute_if.slave execute_if,
// Outputs
VX_commit_if.master commit_if
);
`UNUSED_PARAM (CORE_ID)
// Width of the pid field: log2 of (NUM_THREADS / NUM_LANES).
// NOTE(review): UP() presumably clamps the width to at least 1 bit - confirm in VX_define.vh.
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
localparam PID_WIDTH = `UP(PID_BITS);
// Width of the per-request tag that travels alongside the computation:
// uuid + wid + tmask + PC + rd + wb(1) + pid + sop(1) + eop(1).
localparam TAGW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS + 1 + PID_WIDTH + 1 + 1;
// rs3 is not read by this unit.
`UNUSED_VAR (execute_if.data.rs3_data)
// Decode of the operation; the INST_M_* macros are defined outside this file.
wire [`INST_M_BITS-1:0] muldiv_op = `INST_M_BITS'(execute_if.data.op_type);
wire is_mulx_op = `INST_M_IS_MULX(muldiv_op);
wire is_signed_op = `INST_M_SIGNED(muldiv_op);
// is_alu_w marks a 32-bit "W" operation; it is tied to 0 unless XLEN_64 is defined.
`ifdef XLEN_64
wire is_alu_w = `INST_ALU_IS_W(execute_if.data.op_mod);
`else
wire is_alu_w = 0;
`endif
// ---------------------------------------------------------------------------
// Multiplier path
// ---------------------------------------------------------------------------
// Result and tag fields presented to the response arbiter.
wire [NUM_LANES-1:0][`XLEN-1:0] mul_result_out;
wire [`UUID_WIDTH-1:0] mul_uuid_out;
wire [`NW_WIDTH-1:0] mul_wid_out;
wire [NUM_LANES-1:0] mul_tmask_out;
wire [`XLEN-1:0] mul_PC_out;
wire [`NR_BITS-1:0] mul_rd_out;
wire mul_wb_out;
wire [PID_WIDTH-1:0] mul_pid_out;
wire mul_sop_out, mul_eop_out;
// A request enters the multiplier only when it is valid and decoded as a MULX op.
wire mul_valid_in = execute_if.valid && is_mulx_op;
wire mul_ready_in;
wire mul_valid_out;
wire mul_ready_out;
// is_mulh_in selects the upper half of the product as the result.
// Operand A uses its own signedness flag; operand B follows is_signed_op.
wire is_mulh_in = `INST_M_IS_MULH(muldiv_op);
wire is_signed_mul_a = `INST_M_SIGNED_A(muldiv_op);
wire is_signed_mul_b = is_signed_op;
`ifdef IMUL_DPI
// DPI variant: the product is produced combinationally by dpi_imul and the
// selected result word is then delayed, together with the tag, by a shift register.
wire [NUM_LANES-1:0][`XLEN-1:0] mul_result_tmp;
wire mul_fire_in = mul_valid_in && mul_ready_in;
for (genvar i = 0; i < NUM_LANES; ++i) begin
wire [`XLEN-1:0] mul_resultl, mul_resulth;
// For W operations only the low 32 bits of each operand are kept (upper bits masked to 0).
wire [`XLEN-1:0] mul_in1 = is_alu_w ? (execute_if.data.rs1_data[i] & `XLEN'hFFFFFFFF) : execute_if.data.rs1_data[i];
wire [`XLEN-1:0] mul_in2 = is_alu_w ? (execute_if.data.rs2_data[i] & `XLEN'hFFFFFFFF) : execute_if.data.rs2_data[i];
always @(*) begin
dpi_imul (mul_fire_in, is_signed_mul_a, is_signed_mul_b, mul_in1, mul_in2, mul_resultl, mul_resulth);
end
// MULH-type ops return the high word; W ops return the low 32 bits sign-extended
// to XLEN; everything else returns the low word.
assign mul_result_tmp[i] = is_mulh_in ? mul_resulth : (is_alu_w ? `XLEN'($signed(mul_resultl[31:0])) : mul_resultl);
end
// Delay line of LATENCY_IMUL stages carrying {valid, tag, results}.
// NOTE(review): RESETW(1) presumably resets only the top (valid) bit - confirm in VX_shift_register.
VX_shift_register #(
.DATAW (1 + TAGW + (NUM_LANES * `XLEN)),
.DEPTH (`LATENCY_IMUL),
.RESETW (1)
) mul_shift_reg (
.clk(clk),
.reset (reset),
.enable (mul_ready_in),
.data_in ({mul_valid_in, execute_if.data.uuid, execute_if.data.wid, execute_if.data.tmask, execute_if.data.PC, execute_if.data.rd, execute_if.data.wb, execute_if.data.pid, execute_if.data.sop, execute_if.data.eop, mul_result_tmp}),
.data_out ({mul_valid_out, mul_uuid_out, mul_wid_out, mul_tmask_out, mul_PC_out, mul_rd_out, mul_wb_out, mul_pid_out, mul_sop_out, mul_eop_out, mul_result_out})
);
// The delay line advances when its output is being consumed or holds no valid entry.
assign mul_ready_in = mul_ready_out || ~mul_valid_out;
`else
// Hardware variant: operands are widened to XLEN+1 bits and multiplied as signed
// values, giving a 2*(XLEN+1)-bit product per lane.
wire [NUM_LANES-1:0][2*(`XLEN+1)-1:0] mul_result_tmp;
// Operation flags carried with the tag so the result can be selected at the output.
wire is_mulh_out;
wire is_mul_w_out;
`ifdef XLEN_64
// 64-bit build: multi-lane VX_serial_mul driven through VX_elastic_adapter.
wire [NUM_LANES-1:0][`XLEN:0] mul_in1;
wire [NUM_LANES-1:0][`XLEN:0] mul_in2;
for (genvar i = 0; i < NUM_LANES; ++i) begin
// W ops: low 32 bits sign-extended from bit 31 to XLEN+1 bits.
// Other ops: extra MSB is the operand's sign bit when that operand is signed, else 0.
assign mul_in1[i] = is_alu_w ? {{(`XLEN-31){execute_if.data.rs1_data[i][31]}}, execute_if.data.rs1_data[i][31:0]} : {is_signed_mul_a && execute_if.data.rs1_data[i][`XLEN-1], execute_if.data.rs1_data[i]};
assign mul_in2[i] = is_alu_w ? {{(`XLEN-31){execute_if.data.rs2_data[i][31]}}, execute_if.data.rs2_data[i][31:0]} : {is_signed_mul_b && execute_if.data.rs2_data[i][`XLEN-1], execute_if.data.rs2_data[i]};
end
// NOTE(review): "strode" looks like a misspelling of "strobe"; the wire is
// connected to the .strobe ports below.
wire mul_strode;
wire mul_busy;
// Connects the valid/ready handshake to the strobe/busy pins of the serial multiplier.
// NOTE(review): exact handshake timing is defined in VX_elastic_adapter (not visible here).
VX_elastic_adapter mul_elastic_adapter (
.clk (clk),
.reset (reset),
.valid_in (mul_valid_in),
.ready_in (mul_ready_in),
.valid_out (mul_valid_out),
.ready_out (mul_ready_out),
.strobe (mul_strode),
.busy (mul_busy)
);
VX_serial_mul #(
.A_WIDTH (`XLEN+1),
.LANES (NUM_LANES),
.SIGNED (1)
) serial_mul (
.clk (clk),
.reset (reset),
.strobe (mul_strode),
.busy (mul_busy),
.dataa (mul_in1),
.datab (mul_in2),
.result (mul_result_tmp)
);
// Tag (plus the mulh / W flags) is latched when a request is accepted and held
// until the next accepted request. The register has no reset.
reg [TAGW+2-1:0] mul_tag_r;
always @(posedge clk) begin
if (mul_valid_in && mul_ready_in) begin
mul_tag_r <= {execute_if.data.uuid, execute_if.data.wid, execute_if.data.tmask, execute_if.data.PC, execute_if.data.rd, execute_if.data.wb, is_mulh_in, is_alu_w, execute_if.data.pid, execute_if.data.sop, execute_if.data.eop};
end
end
// Unpack in exactly the order used when the register was written above.
assign {mul_uuid_out, mul_wid_out, mul_tmask_out, mul_PC_out, mul_rd_out, mul_wb_out, is_mulh_out, is_mul_w_out, mul_pid_out, mul_sop_out, mul_eop_out} = mul_tag_r;
`else
// Non-64-bit build: one VX_multiplier per lane with LATENCY parameter LATENCY_IMUL.
for (genvar i = 0; i < NUM_LANES; ++i) begin
// Extra MSB is the operand's sign bit when that operand is signed, else 0.
wire [`XLEN:0] mul_in1 = {is_signed_mul_a && execute_if.data.rs1_data[i][`XLEN-1], execute_if.data.rs1_data[i]};
wire [`XLEN:0] mul_in2 = {is_signed_mul_b && execute_if.data.rs2_data[i][`XLEN-1], execute_if.data.rs2_data[i]};
VX_multiplier #(
.A_WIDTH (`XLEN+1),
.B_WIDTH (`XLEN+1),
.R_WIDTH (2*(`XLEN+1)),
.SIGNED (1),
.LATENCY (`LATENCY_IMUL)
) multiplier (
.clk (clk),
.enable (mul_ready_in),
.dataa (mul_in1),
.datab (mul_in2),
.result (mul_result_tmp[i])
);
end
// {valid, tag, is_mulh, is_alu_w} is delayed by the same LATENCY_IMUL value and
// the same enable as the multipliers above.
VX_shift_register #(
.DATAW (1 + TAGW + 1 + 1),
.DEPTH (`LATENCY_IMUL),
.RESETW (1)
) mul_shift_reg (
.clk(clk),
.reset (reset),
.enable (mul_ready_in),
.data_in ({mul_valid_in, execute_if.data.uuid, execute_if.data.wid, execute_if.data.tmask, execute_if.data.PC, execute_if.data.rd, execute_if.data.wb, execute_if.data.pid, execute_if.data.sop, execute_if.data.eop, is_mulh_in, is_alu_w}),
.data_out ({mul_valid_out, mul_uuid_out, mul_wid_out, mul_tmask_out, mul_PC_out, mul_rd_out, mul_wb_out, mul_pid_out, mul_sop_out, mul_eop_out, is_mulh_out, is_mul_w_out})
);
// The pipeline advances when its output is being consumed or holds no valid entry.
assign mul_ready_in = mul_ready_out || ~mul_valid_out;
`endif
// Result selection from the wide product, per lane:
//   mulh -> bits [2*XLEN-1 : XLEN]
//   W op -> low 32 bits sign-extended to XLEN (XLEN_64 only)
//   else -> bits [XLEN-1 : 0]
for (genvar i = 0; i < NUM_LANES; ++i) begin
`ifdef XLEN_64
assign mul_result_out[i] = is_mulh_out ? mul_result_tmp[i][2*(`XLEN)-1:`XLEN] :
(is_mul_w_out ? `XLEN'($signed(mul_result_tmp[i][31:0])) :
mul_result_tmp[i][`XLEN-1:0]);
`else
assign mul_result_out[i] = is_mulh_out ? mul_result_tmp[i][2*(`XLEN)-1:`XLEN] : mul_result_tmp[i][`XLEN-1:0];
`UNUSED_VAR (is_mul_w_out)
`endif
end
`endif
///////////////////////////////////////////////////////////////////////////
// ---------------------------------------------------------------------------
// Divider path
// ---------------------------------------------------------------------------
// Result and tag fields presented to the response arbiter.
wire [NUM_LANES-1:0][`XLEN-1:0] div_result_out;
wire [`UUID_WIDTH-1:0] div_uuid_out;
wire [`NW_WIDTH-1:0] div_wid_out;
wire [NUM_LANES-1:0] div_tmask_out;
wire [`XLEN-1:0] div_PC_out;
wire [`NR_BITS-1:0] div_rd_out;
wire div_wb_out;
wire [PID_WIDTH-1:0] div_pid_out;
wire div_sop_out, div_eop_out;
// is_rem_op selects the remainder instead of the quotient as the result.
wire is_rem_op = `INST_M_IS_REM(muldiv_op);
// Every valid request that is not a MULX op goes to the divider.
wire div_valid_in = execute_if.valid && ~is_mulx_op;
wire div_ready_in;
wire div_valid_out;
wire div_ready_out;
wire [NUM_LANES-1:0][`XLEN-1:0] div_in1;
wire [NUM_LANES-1:0][`XLEN-1:0] div_in2;
for (genvar i = 0; i < NUM_LANES; ++i) begin
// W ops use only the low 32 bits of each operand, extended to XLEN with bit 31
// when the operation is signed and with 0 otherwise. Other ops pass through unchanged.
assign div_in1[i] = is_alu_w ? {{(`XLEN-32){is_signed_op && execute_if.data.rs1_data[i][31]}}, execute_if.data.rs1_data[i][31:0]}: execute_if.data.rs1_data[i];
assign div_in2[i] = is_alu_w ? {{(`XLEN-32){is_signed_op && execute_if.data.rs2_data[i][31]}}, execute_if.data.rs2_data[i][31:0]}: execute_if.data.rs2_data[i];
end
`ifdef IDIV_DPI
// DPI variant: quotient and remainder are produced combinationally by dpi_idiv
// and the selected value is then delayed, together with the tag, by a shift register.
wire [NUM_LANES-1:0][`XLEN-1:0] div_result_in;
wire div_fire_in = div_valid_in && div_ready_in;
for (genvar i = 0; i < NUM_LANES; ++i) begin
wire [`XLEN-1:0] div_quotient, div_remainder;
always @(*) begin
dpi_idiv (div_fire_in, is_signed_op, div_in1[i], div_in2[i], div_quotient, div_remainder);
end
// Pick remainder or quotient; for W ops the low 32 bits are sign-extended to XLEN.
assign div_result_in[i] = is_rem_op ? (is_alu_w ? `XLEN'($signed(div_remainder[31:0])) : div_remainder) :
(is_alu_w ? `XLEN'($signed(div_quotient[31:0])) : div_quotient);
end
// Delay line carrying {valid, tag, results}.
// NOTE(review): the depth here is LATENCY_IMUL (the multiplier latency macro) -
// confirm that reusing it for the divider is intended.
VX_shift_register #(
.DATAW (1 + TAGW + (NUM_LANES * `XLEN)),
.DEPTH (`LATENCY_IMUL),
.RESETW (1)
) div_shift_reg (
.clk(clk),
.reset (reset),
.enable (div_ready_in),
.data_in ({div_valid_in, execute_if.data.uuid, execute_if.data.wid, execute_if.data.tmask, execute_if.data.PC, execute_if.data.rd, execute_if.data.wb, execute_if.data.pid, execute_if.data.sop, execute_if.data.eop, div_result_in}),
.data_out ({div_valid_out, div_uuid_out, div_wid_out, div_tmask_out, div_PC_out, div_rd_out, div_wb_out, div_pid_out, div_sop_out, div_eop_out, div_result_out})
);
// The delay line advances when its output is being consumed or holds no valid entry.
assign div_ready_in = div_ready_out || ~div_valid_out;
`else
// Hardware variant: multi-lane VX_serial_div driven through VX_elastic_adapter.
wire [NUM_LANES-1:0][`XLEN-1:0] div_quotient, div_remainder;
// Operation flags carried with the tag so the result can be selected at the output.
wire is_rem_op_out;
wire is_div_w_out;
// NOTE(review): "strode" looks like a misspelling of "strobe"; the wire is
// connected to the .strobe ports below.
wire div_strode;
wire div_busy;
// Connects the valid/ready handshake to the strobe/busy pins of the serial divider.
// NOTE(review): exact handshake timing is defined in VX_elastic_adapter (not visible here).
VX_elastic_adapter div_elastic_adapter (
.clk (clk),
.reset (reset),
.valid_in (div_valid_in),
.ready_in (div_ready_in),
.valid_out (div_valid_out),
.ready_out (div_ready_out),
.strobe (div_strode),
.busy (div_busy)
);
VX_serial_div #(
.WIDTHN (`XLEN),
.WIDTHD (`XLEN),
.WIDTHQ (`XLEN),
.WIDTHR (`XLEN),
.LANES (NUM_LANES)
) serial_div (
.clk (clk),
.reset (reset),
.strobe (div_strode),
.busy (div_busy),
.is_signed (is_signed_op),
.numer (div_in1),
.denom (div_in2),
.quotient (div_quotient),
.remainder (div_remainder)
);
// Tag (plus the rem / W flags) is latched when a request is accepted and held
// until the next accepted request. The register has no reset.
reg [TAGW+2-1:0] div_tag_r;
always @(posedge clk) begin
if (div_valid_in && div_ready_in) begin
div_tag_r <= {execute_if.data.uuid, execute_if.data.wid, execute_if.data.tmask, execute_if.data.PC, execute_if.data.rd, execute_if.data.wb, is_rem_op, is_alu_w, execute_if.data.pid, execute_if.data.sop, execute_if.data.eop};
end
end
// Unpack in exactly the order used when the register was written above.
assign {div_uuid_out, div_wid_out, div_tmask_out, div_PC_out, div_rd_out, div_wb_out, is_rem_op_out, is_div_w_out, div_pid_out, div_sop_out, div_eop_out} = div_tag_r;
// Result selection per lane: remainder or quotient; with XLEN_64, W ops return
// the low 32 bits sign-extended to XLEN.
for (genvar i = 0; i < NUM_LANES; ++i) begin
`ifdef XLEN_64
assign div_result_out[i] = is_rem_op_out ? (is_div_w_out ? `XLEN'($signed(div_remainder[i][31:0])) : div_remainder[i]) :
(is_div_w_out ? `XLEN'($signed(div_quotient[i][31:0])) : div_quotient[i]);
`else
assign div_result_out[i] = is_rem_op_out ? div_remainder[i] : div_quotient[i];
`UNUSED_VAR (is_div_w_out)
`endif
end
`endif
// can accept new request?
// Back-pressure comes from whichever datapath the current op decodes to.
assign execute_if.ready = is_mulx_op ? mul_ready_in : div_ready_in;
// Merge the two result streams onto commit_if.
// Input 0 is the multiplier path and input 1 is the divider path (LSB-first in
// the concatenations below). The selected-input indicator (sel_out) is left unconnected.
VX_stream_arb #(
.NUM_INPUTS (2),
.DATAW (TAGW + (NUM_LANES * `XLEN)),
.OUT_REG (1)
) rsp_buf (
.clk (clk),
.reset (reset),
.valid_in ({div_valid_out, mul_valid_out}),
.ready_in ({div_ready_out, mul_ready_out}),
.data_in ({{div_uuid_out, div_wid_out, div_tmask_out, div_PC_out, div_rd_out, div_wb_out, div_pid_out, div_sop_out, div_eop_out, div_result_out},
{mul_uuid_out, mul_wid_out, mul_tmask_out, mul_PC_out, mul_rd_out, mul_wb_out, mul_pid_out, mul_sop_out, mul_eop_out, mul_result_out}}),
.data_out ({commit_if.data.uuid, commit_if.data.wid, commit_if.data.tmask, commit_if.data.PC, commit_if.data.rd, commit_if.data.wb, commit_if.data.pid, commit_if.data.sop, commit_if.data.eop, commit_if.data.data}),
.valid_out (commit_if.valid),
.ready_out (commit_if.ready),
`UNUSED_PIN (sel_out)
);
endmodule