pipeline optimization: fixed GPR fanout delay to execute units

This commit is contained in:
Blaise Tine
2020-11-07 02:01:21 -08:00
parent af2bb3b789
commit b14007f930
13 changed files with 155 additions and 151 deletions

View File

@@ -29,8 +29,8 @@ module VX_alu_unit #(
wire [`NUM_THREADS-1:0][31:0] alu_in1 = alu_req_if.rs1_data;
wire [`NUM_THREADS-1:0][31:0] alu_in2 = alu_req_if.rs2_data;
wire [`NUM_THREADS-1:0][31:0] alu_in1_PC = alu_req_if.rs1_is_PC ? {`NUM_THREADS{alu_req_if.PC}} : alu_in1;
wire [`NUM_THREADS-1:0][31:0] alu_in2_imm = alu_req_if.rs2_is_imm ? {`NUM_THREADS{alu_req_if.imm}} : alu_in2;
wire [`NUM_THREADS-1:0][31:0] alu_in1_PC = alu_req_if.rs1_is_PC ? {`NUM_THREADS{alu_req_if.PC}} : alu_in1;
wire [`NUM_THREADS-1:0][31:0] alu_in2_imm = alu_req_if.rs2_is_imm ? {`NUM_THREADS{alu_req_if.imm}} : alu_in2;
wire [`NUM_THREADS-1:0][31:0] alu_in2_less = (alu_req_if.rs2_is_imm && !is_br_op) ? {`NUM_THREADS{alu_req_if.imm}} : alu_in2;
for (genvar i = 0; i < `NUM_THREADS; i++) begin
@@ -40,7 +40,7 @@ module VX_alu_unit #(
end
for (genvar i = 0; i < `NUM_THREADS; i++) begin
wire [32:0] sub_in1 = {alu_signed & alu_in1[i][31], alu_in1[i]};
wire [32:0] sub_in1 = {alu_signed & alu_in1[i][31], alu_in1[i]};
wire [32:0] sub_in2 = {alu_signed & alu_in2_less[i][31], alu_in2_less[i]};
always @(*) begin
sub_result[i] = $signed(sub_in1) - $signed(sub_in2);

View File

@@ -7,7 +7,7 @@ module VX_csr_data #(
input wire reset,
VX_cmt_to_csr_if cmt_to_csr_if,
VX_csr_to_issue_if csr_to_issue_if,
VX_csr_to_fpu_if csr_to_fpu_if,
input wire read_enable,
input wire[`CSR_ADDR_BITS-1:0] read_addr,
@@ -144,6 +144,6 @@ module VX_csr_data #(
end
assign read_data = read_data_r;
assign csr_to_issue_if.frm = csr_frm[csr_to_issue_if.wid];
assign csr_to_fpu_if.frm = csr_frm[csr_to_fpu_if.wid];
endmodule

View File

@@ -7,7 +7,7 @@ module VX_csr_unit #(
input wire reset,
VX_cmt_to_csr_if cmt_to_csr_if,
VX_csr_to_issue_if csr_to_issue_if,
VX_csr_to_fpu_if csr_to_fpu_if,
VX_csr_io_req_if csr_io_req_if,
VX_csr_io_rsp_if csr_io_rsp_if,
@@ -47,7 +47,7 @@ module VX_csr_unit #(
.clk (clk),
.reset (reset),
.cmt_to_csr_if (cmt_to_csr_if),
.csr_to_issue_if(csr_to_issue_if),
.csr_to_fpu_if (csr_to_fpu_if),
.read_enable (csr_pipe_req_if.valid),
.read_addr (csr_pipe_req_if.csr_addr),
.read_wid (csr_pipe_req_if.wid),

View File

@@ -28,7 +28,6 @@ module VX_execute #(
VX_gpu_req_if gpu_req_if,
// outputs
VX_csr_to_issue_if csr_to_issue_if,
VX_branch_ctl_if branch_ctl_if,
VX_warp_ctl_if warp_ctl_if,
VX_exu_to_cmt_if alu_commit_if,
@@ -41,6 +40,7 @@ module VX_execute #(
input wire busy,
output wire ebreak
);
VX_csr_to_fpu_if csr_to_fpu_if();
VX_alu_unit #(
.CORE_ID(CORE_ID)
@@ -70,7 +70,7 @@ module VX_execute #(
.clk (clk),
.reset (reset),
.cmt_to_csr_if (cmt_to_csr_if),
.csr_to_issue_if(csr_to_issue_if),
.csr_to_fpu_if (csr_to_fpu_if),
.csr_io_req_if (csr_io_req_if),
.csr_io_rsp_if (csr_io_rsp_if),
.csr_req_if (csr_req_if),
@@ -104,7 +104,8 @@ module VX_execute #(
) fpu_unit (
.clk (clk),
.reset (reset),
.fpu_req_if (fpu_req_if),
.fpu_req_if (fpu_req_if),
.csr_to_fpu_if (csr_to_fpu_if),
.fpu_commit_if (fpu_commit_if)
);
`else

View File

@@ -9,6 +9,7 @@ module VX_fpu_unit #(
// inputs
VX_fpu_req_if fpu_req_if,
VX_csr_to_fpu_if csr_to_fpu_if,
// outputs
VX_fpu_to_cmt_if fpu_commit_if
@@ -56,6 +57,10 @@ module VX_fpu_unit #(
wire valid_in = fpu_req_if.valid && ~fpuq_full;
// resolve dynamic FRM
assign csr_to_fpu_if.wid = fpu_req_if.wid;
wire [`FRM_BITS-1:0] fpu_frm = (fpu_req_if.op_mod == `FRM_DYN) ? csr_to_fpu_if.frm : fpu_req_if.op_mod;
`ifdef FPU_FAST
VX_fp_fpga #(
@@ -70,7 +75,7 @@ module VX_fpu_unit #(
.tag_in (tag_in),
.op_type (fpu_req_if.op_type),
.frm (fpu_req_if.frm),
.frm (fpu_frm),
.dataa (fpu_req_if.rs1_data),
.datab (fpu_req_if.rs2_data),
@@ -104,7 +109,7 @@ module VX_fpu_unit #(
.tag_in (tag_in),
.op_type (fpu_req_if.op_type),
.frm (fpu_req_if.frm),
.frm (fpu_frm),
.dataa (fpu_req_if.rs1_data),
.datab (fpu_req_if.rs2_data),

View File

@@ -40,27 +40,25 @@ module VX_gpr_bypass #(
buffer2 <= 0;
end else begin
delayed_push <= push;
assert(!use_buffer2 || use_buffer);
assert(!use_buffer2 || use_buffer);
if (pop) begin
if (use_buffer) begin
buffer <= buffer2;
use_buffer <= use_buffer2;
use_buffer2 <= 0;
end
buffer <= buffer2;
use_buffer <= use_buffer2;
use_buffer2 <= 0;
end
if (delayed_push) begin
if (use_buffer) begin
assert(!use_buffer2); // queue full!
assert(!use_buffer2); // full!
use_buffer <= 1;
if (pop) begin
buffer <= data_in;
end else begin
buffer2 <= data_in;
use_buffer2 <= 1;
end
use_buffer <= 1;
end
end else if (!pop) begin
buffer <= data_in;
use_buffer <= 1;
use_buffer <= 1;
end
end
end

View File

@@ -7,7 +7,6 @@ module VX_instr_demux (
// inputs
VX_decode_if execute_if,
VX_gpr_rsp_if gpr_rsp_if,
VX_csr_to_issue_if csr_to_issue_if,
// outputs
VX_alu_req_if alu_req_if,
@@ -34,78 +33,47 @@ module VX_instr_demux (
wire alu_req_ready;
wire is_br_op = `IS_BR_MOD(execute_if.op_mod);
VX_skid_buffer #(
.DATAW (`NW_BITS + `NUM_THREADS + 32 + 32 + `ALU_BR_BITS + 1 + 32 + 1 + 1 + `NR_BITS + 1 + `NT_BITS)
) alu_reg (
VX_opd_collect #(
.INSTW (`NW_BITS + `NUM_THREADS + 32 + 32 + `ALU_BR_BITS + 1 + 32 + 1 + 1 + `NR_BITS + 1 + `NT_BITS),
.OPDSW (2 * `NUM_THREADS * 32),
.PASSTHRU (1) // ALU has no backpressure
) alu_opc (
.clk (clk),
.reset (reset),
.ready_in (alu_req_ready),
.valid_in (alu_req_valid),
.data_in ({execute_if.wid, execute_if.tmask, execute_if.PC, next_PC, `ALU_BR_OP(execute_if.op_type), is_br_op, execute_if.imm, execute_if.rs1_is_PC, execute_if.rs2_is_imm, execute_if.rd, execute_if.wb, tid}),
.data_out ({alu_req_if.wid, alu_req_if.tmask, alu_req_if.PC, alu_req_if.next_PC, alu_req_if.op_type, alu_req_if.is_br_op, alu_req_if.imm, alu_req_if.rs1_is_PC, alu_req_if.rs2_is_imm, alu_req_if.rd, alu_req_if.wb, alu_req_if.tid}),
.inst_in ({execute_if.wid, execute_if.tmask, execute_if.PC, next_PC, `ALU_BR_OP(execute_if.op_type), is_br_op, execute_if.imm, execute_if.rs1_is_PC, execute_if.rs2_is_imm, execute_if.rd, execute_if.wb, tid}),
.opds_in ({gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data}),
.data_out ({alu_req_if.wid, alu_req_if.tmask, alu_req_if.PC, alu_req_if.next_PC, alu_req_if.op_type, alu_req_if.is_br_op, alu_req_if.imm, alu_req_if.rs1_is_PC, alu_req_if.rs2_is_imm, alu_req_if.rd, alu_req_if.wb, alu_req_if.tid, alu_req_if.rs1_data, alu_req_if.rs2_data}),
.ready_out (alu_req_if.ready),
.valid_out (alu_req_if.valid)
);
VX_gpr_bypass #(
.DATAW (2 * `NUM_THREADS * 32),
.PASSTHRU (1) // ALU has no back-pressure, bypass not needed
) alu_bypass (
.clk (clk),
.reset (reset),
.push (alu_req_valid && alu_req_ready),
.data_in ({gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data}),
.data_out ({alu_req_if.rs1_data, alu_req_if.rs2_data}),
.pop (alu_req_if.valid && alu_req_if.ready)
);
// lsu unit
wire lsu_req_valid = execute_if.valid && (execute_if.ex_type == `EX_LSU);
wire lsu_req_ready;
VX_skid_buffer #(
.DATAW (`NW_BITS + `NUM_THREADS + 32 + 1 + `BYTEEN_BITS + 32 + `NR_BITS + 1)
) lsu_reg (
VX_opd_collect #(
.INSTW (`NW_BITS + `NUM_THREADS + 32 + 1 + `BYTEEN_BITS + 32 + `NR_BITS + 1),
.OPDSW (2 * `NUM_THREADS * 32)
) lsu_opc (
.clk (clk),
.reset (reset),
.ready_in (lsu_req_ready),
.valid_in (lsu_req_valid),
.data_in ({execute_if.wid, execute_if.tmask, execute_if.PC, `LSU_RW(execute_if.op_type), `LSU_BE(execute_if.op_type), execute_if.imm, execute_if.rd, execute_if.wb}),
.data_out ({lsu_req_if.wid, lsu_req_if.tmask, lsu_req_if.PC, lsu_req_if.rw, lsu_req_if.byteen, lsu_req_if.offset, lsu_req_if.rd, lsu_req_if.wb}),
.inst_in ({execute_if.wid, execute_if.tmask, execute_if.PC, `LSU_RW(execute_if.op_type), `LSU_BE(execute_if.op_type), execute_if.imm, execute_if.rd, execute_if.wb}),
.opds_in ({gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data}),
.data_out ({lsu_req_if.wid, lsu_req_if.tmask, lsu_req_if.PC, lsu_req_if.rw, lsu_req_if.byteen, lsu_req_if.offset, lsu_req_if.rd, lsu_req_if.wb, lsu_req_if.base_addr, lsu_req_if.store_data}),
.ready_out (lsu_req_if.ready),
.valid_out (lsu_req_if.valid)
);
VX_gpr_bypass #(
.DATAW ((2 * `NUM_THREADS * 32))
) lsu_bypass (
.clk (clk),
.reset (reset),
.push (lsu_req_valid && lsu_req_ready),
.data_in ({gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data}),
.data_out ({lsu_req_if.base_addr, lsu_req_if.store_data}),
.pop (lsu_req_if.valid && lsu_req_if.ready)
);
// csr unit
wire csr_req_valid = execute_if.valid && (execute_if.ex_type == `EX_CSR);
wire csr_req_ready;
VX_skid_buffer #(
.DATAW (`NW_BITS + `NUM_THREADS + 32 + `CSR_BITS + `CSR_ADDR_BITS + `NR_BITS + 1 + 1)
) csr_reg (
.clk (clk),
.reset (reset),
.ready_in (csr_req_ready),
.valid_in (csr_req_valid),
.data_in ({execute_if.wid, execute_if.tmask, execute_if.PC, `CSR_OP(execute_if.op_type), execute_if.imm[`CSR_ADDR_BITS-1:0], execute_if.rd, execute_if.wb, 1'b0}),
.data_out ({csr_req_if.wid, csr_req_if.tmask, csr_req_if.PC, csr_req_if.op_type, csr_req_if.csr_addr, csr_req_if.rd, csr_req_if.wb, csr_req_if.is_io}),
.ready_out (csr_req_if.ready),
.valid_out (csr_req_if.valid)
);
reg tmp_rs2_is_imm;
reg [`NR_BITS-1:0] tmp_rs1;
@@ -116,15 +84,19 @@ module VX_instr_demux (
wire [31:0] csr_req_mask = tmp_rs2_is_imm ? 32'(tmp_rs1) : gpr_rsp_if.rs1_data[0];
VX_gpr_bypass #(
.DATAW (32)
) csr_bypass (
VX_opd_collect #(
.INSTW (`NW_BITS + `NUM_THREADS + 32 + `CSR_BITS + `CSR_ADDR_BITS + `NR_BITS + 1 + 1),
.OPDSW (32)
) csr_opc (
.clk (clk),
.reset (reset),
.push (csr_req_valid && csr_req_ready),
.data_in (csr_req_mask),
.data_out (csr_req_if.csr_mask),
.pop (csr_req_if.valid && csr_req_if.ready)
.ready_in (csr_req_ready),
.valid_in (csr_req_valid),
.inst_in ({execute_if.wid, execute_if.tmask, execute_if.PC, `CSR_OP(execute_if.op_type), execute_if.imm[`CSR_ADDR_BITS-1:0], execute_if.rd, execute_if.wb, 1'b0}),
.opds_in ({csr_req_mask}),
.data_out ({csr_req_if.wid, csr_req_if.tmask, csr_req_if.PC, csr_req_if.op_type, csr_req_if.csr_addr, csr_req_if.rd, csr_req_if.wb, csr_req_if.is_io, csr_req_if.csr_mask}),
.ready_out (csr_req_if.ready),
.valid_out (csr_req_if.valid)
);
// mul unit
@@ -133,29 +105,20 @@ module VX_instr_demux (
wire mul_req_valid = execute_if.valid && (execute_if.ex_type == `EX_MUL);
wire mul_req_ready;
VX_skid_buffer #(
.DATAW (`NW_BITS + `NUM_THREADS + 32 + `MUL_BITS + `NR_BITS + 1)
) mul_reg (
VX_opd_collect #(
.INSTW (`NW_BITS + `NUM_THREADS + 32 + `MUL_BITS + `NR_BITS + 1),
.OPDSW (2 * `NUM_THREADS * 32)
) mul_opc (
.clk (clk),
.reset (reset),
.ready_in (mul_req_ready),
.valid_in (mul_req_valid),
.data_in ({execute_if.wid, execute_if.tmask, execute_if.PC, `MUL_OP(execute_if.op_type), execute_if.rd, execute_if.wb}),
.data_out ({mul_req_if.wid, mul_req_if.tmask, mul_req_if.PC, mul_req_if.op_type, mul_req_if.rd, mul_req_if.wb}),
.inst_in ({execute_if.wid, execute_if.tmask, execute_if.PC, `MUL_OP(execute_if.op_type), execute_if.rd, execute_if.wb}),
.opds_in ({gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data}),
.data_out ({mul_req_if.wid, mul_req_if.tmask, mul_req_if.PC, mul_req_if.op_type, mul_req_if.rd, mul_req_if.wb, mul_req_if.rs1_data, mul_req_if.rs2_data}),
.ready_out (mul_req_if.ready),
.valid_out (mul_req_if.valid)
);
VX_gpr_bypass #(
.DATAW ((2 * `NUM_THREADS * 32))
) mul_bypass (
.clk (clk),
.reset (reset),
.push (mul_req_valid && mul_req_ready),
.data_in ({gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data}),
.data_out ({mul_req_if.rs1_data, mul_req_if.rs2_data}),
.pop (mul_req_if.valid && mul_req_if.ready)
);
`endif
// fpu unit
@@ -164,33 +127,20 @@ module VX_instr_demux (
wire fpu_req_valid = execute_if.valid && (execute_if.ex_type == `EX_FPU);
wire fpu_req_ready;
// resolve dynamic FRM
assign csr_to_issue_if.wid = execute_if.wid;
wire [`FRM_BITS-1:0] fpu_frm = (execute_if.op_mod == `FRM_DYN) ? csr_to_issue_if.frm : execute_if.op_mod;
VX_skid_buffer #(
.DATAW (`NW_BITS + `NUM_THREADS + 32 + `FPU_BITS + `FRM_BITS + `NR_BITS + 1)
) fpu_reg (
VX_opd_collect #(
.INSTW (`NW_BITS + `NUM_THREADS + 32 + `FPU_BITS + `MOD_BITS + `NR_BITS + 1),
.OPDSW (3 * `NUM_THREADS * 32)
) fpu_opc (
.clk (clk),
.reset (reset),
.ready_in (fpu_req_ready),
.valid_in (fpu_req_valid),
.data_in ({execute_if.wid, execute_if.tmask, execute_if.PC, `FPU_OP(execute_if.op_type), fpu_frm, execute_if.rd, execute_if.wb}),
.data_out ({fpu_req_if.wid, fpu_req_if.tmask, fpu_req_if.PC, fpu_req_if.op_type, fpu_req_if.frm, fpu_req_if.rd, fpu_req_if.wb}),
.inst_in ({execute_if.wid, execute_if.tmask, execute_if.PC, `FPU_OP(execute_if.op_type), execute_if.op_mod, execute_if.rd, execute_if.wb}),
.opds_in ({gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data, gpr_rsp_if.rs3_data}),
.data_out ({fpu_req_if.wid, fpu_req_if.tmask, fpu_req_if.PC, fpu_req_if.op_type, fpu_req_if.op_mod, fpu_req_if.rd, fpu_req_if.wb, fpu_req_if.rs1_data, fpu_req_if.rs2_data, fpu_req_if.rs3_data}),
.ready_out (fpu_req_if.ready),
.valid_out (fpu_req_if.valid)
);
VX_gpr_bypass #(
.DATAW ((3 * `NUM_THREADS * 32))
) fpu_bypass (
.clk (clk),
.reset (reset),
.push (fpu_req_valid && fpu_req_ready),
.data_in ({gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data, gpr_rsp_if.rs3_data}),
.data_out ({fpu_req_if.rs1_data, fpu_req_if.rs2_data, fpu_req_if.rs3_data}),
.pop (fpu_req_if.valid && fpu_req_if.ready)
);
`endif
// gpu unit
@@ -198,30 +148,21 @@ module VX_instr_demux (
wire gpu_req_valid = execute_if.valid && (execute_if.ex_type == `EX_GPU);
wire gpu_req_ready;
VX_skid_buffer #(
.DATAW (`NW_BITS + `NUM_THREADS + 32 + 32 + `GPU_BITS + `NR_BITS + 1)
) gpu_reg (
VX_opd_collect #(
.INSTW (`NW_BITS + `NUM_THREADS + 32 + 32 + `GPU_BITS + `NR_BITS + 1),
.OPDSW (`NUM_THREADS * 32 + 32)
) gpu_opc (
.clk (clk),
.reset (reset),
.ready_in (gpu_req_ready),
.valid_in (gpu_req_valid),
.data_in ({execute_if.wid, execute_if.tmask, execute_if.PC, next_PC, `GPU_OP(execute_if.op_type), execute_if.rd, execute_if.wb}),
.data_out ({gpu_req_if.wid, gpu_req_if.tmask, gpu_req_if.PC, gpu_req_if.next_PC, gpu_req_if.op_type, gpu_req_if.rd, gpu_req_if.wb}),
.inst_in ({execute_if.wid, execute_if.tmask, execute_if.PC, next_PC, `GPU_OP(execute_if.op_type), execute_if.rd, execute_if.wb}),
.opds_in ({gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data[0]}),
.data_out ({gpu_req_if.wid, gpu_req_if.tmask, gpu_req_if.PC, gpu_req_if.next_PC, gpu_req_if.op_type, gpu_req_if.rd, gpu_req_if.wb, gpu_req_if.rs1_data, gpu_req_if.rs2_data}),
.ready_out (gpu_req_if.ready),
.valid_out (gpu_req_if.valid)
);
VX_gpr_bypass #(
.DATAW ((`NUM_THREADS * 32) + 32)
) gpu_bypass (
.clk (clk),
.reset (reset),
.push (gpu_req_valid && gpu_req_ready),
.data_in ({gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data[0]}),
.data_out ({gpu_req_if.rs1_data, gpu_req_if.rs2_data}),
.pop (gpu_req_if.valid && gpu_req_if.ready)
);
// can take next request?
assign execute_if.ready = (alu_req_ready && (execute_if.ex_type == `EX_ALU))
|| (lsu_req_ready && (execute_if.ex_type == `EX_LSU))

View File

@@ -10,7 +10,6 @@ module VX_issue #(
VX_decode_if decode_if,
VX_writeback_if writeback_if,
VX_csr_to_issue_if csr_to_issue_if,
VX_alu_req_if alu_req_if,
VX_lsu_req_if lsu_req_if,
@@ -71,7 +70,7 @@ module VX_issue #(
);
`UNUSED_VAR (gpr_rsp_if.valid);
assign execute_if.valid = ibuf_deq_if.valid && gpr_req_if.ready && ~scoreboard_delay;
assign execute_if.wid = ibuf_deq_if.wid;
assign execute_if.tmask = ibuf_deq_if.tmask;
@@ -91,7 +90,6 @@ module VX_issue #(
.reset (reset),
.execute_if (execute_if),
.gpr_rsp_if (gpr_rsp_if),
.csr_to_issue_if(csr_to_issue_if),
.alu_req_if (alu_req_if),
.lsu_req_if (lsu_req_if),
.csr_req_if (csr_req_if),
@@ -136,22 +134,22 @@ module VX_issue #(
`ifdef DBG_PRINT_PIPELINE
always @(posedge clk) begin
if (alu_req_if.valid && alu_req_if.ready) begin
$display("%t: core%0d-issue: wid=%0d, PC=%0h, ex=ALU, tmask=%b, rs1_data=%0h, rs2_data=%0h", $time, CORE_ID, alu_req_if.wid, alu_req_if.PC, alu_req_if.tmask, alu_req_if.rs1_data, alu_req_if.rs2_data);
$display("%t: core%0d-issue: wid=%0d, PC=%0h, ex=ALU, tmask=%b, rd=%0d, rs1_data=%0h, rs2_data=%0h", $time, CORE_ID, alu_req_if.wid, alu_req_if.PC, alu_req_if.tmask, alu_req_if.rd, alu_req_if.rs1_data, alu_req_if.rs2_data);
end
if (lsu_req_if.valid && lsu_req_if.ready) begin
$display("%t: core%0d-issue: wid=%0d, PC=%0h, ex=LSU, tmask=%b, rw=%b, byteen=%b, baddr=%0h, offset=%0h, data=%0h", $time, CORE_ID, lsu_req_if.wid, lsu_req_if.PC, lsu_req_if.tmask, lsu_req_if.rw, lsu_req_if.byteen, lsu_req_if.base_addr, lsu_req_if.offset, lsu_req_if.store_data);
$display("%t: core%0d-issue: wid=%0d, PC=%0h, ex=LSU, tmask=%b, rd=%0d, rw=%b, byteen=%b, baddr=%0h, offset=%0h, data=%0h", $time, CORE_ID, lsu_req_if.wid, lsu_req_if.PC, lsu_req_if.tmask, lsu_req_if.rd, lsu_req_if.rw, lsu_req_if.byteen, lsu_req_if.base_addr, lsu_req_if.offset, lsu_req_if.store_data);
end
if (csr_req_if.valid && csr_req_if.ready) begin
$display("%t: core%0d-issue: wid=%0d, PC=%0h, ex=CSR, tmask=%b, addr=%0h, mask=%0h", $time, CORE_ID, csr_req_if.wid, csr_req_if.PC, csr_req_if.tmask, csr_req_if.csr_addr, csr_req_if.csr_mask);
$display("%t: core%0d-issue: wid=%0d, PC=%0h, ex=CSR, tmask=%b, rd=%0d, addr=%0h, mask=%0h", $time, CORE_ID, csr_req_if.wid, csr_req_if.PC, csr_req_if.tmask, csr_req_if.rd, csr_req_if.csr_addr, csr_req_if.csr_mask);
end
if (mul_req_if.valid && mul_req_if.ready) begin
$display("%t: core%0d-issue: wid=%0d, PC=%0h, ex=MUL, tmask=%b, rs1_data=%0h, rs2_data=%0h", $time, CORE_ID, mul_req_if.wid, mul_req_if.PC, mul_req_if.tmask, mul_req_if.rs1_data, mul_req_if.rs2_data);
$display("%t: core%0d-issue: wid=%0d, PC=%0h, ex=MUL, tmask=%b, rd=%0d, rs1_data=%0h, rs2_data=%0h", $time, CORE_ID, mul_req_if.wid, mul_req_if.PC, mul_req_if.tmask, mul_req_if.rd, mul_req_if.rs1_data, mul_req_if.rs2_data);
end
if (fpu_req_if.valid && fpu_req_if.ready) begin
$display("%t: core%0d-issue: wid=%0d, PC=%0h, ex=FPU, tmask=%b, rs1_data=%0h, rs2_data=%0h, rs3_data=%0h", $time, CORE_ID, fpu_req_if.wid, fpu_req_if.PC, fpu_req_if.tmask, fpu_req_if.rs1_data, fpu_req_if.rs2_data, fpu_req_if.rs3_data);
$display("%t: core%0d-issue: wid=%0d, PC=%0h, ex=FPU, tmask=%b, rd=%0d, rs1_data=%0h, rs2_data=%0h, rs3_data=%0h", $time, CORE_ID, fpu_req_if.wid, fpu_req_if.PC, fpu_req_if.tmask, fpu_req_if.rd, fpu_req_if.rs1_data, fpu_req_if.rs2_data, fpu_req_if.rs3_data);
end
if (gpu_req_if.valid && gpu_req_if.ready) begin
$display("%t: core%0d-issue: wid=%0d, PC=%0h, ex=GPU, tmask=%b, rs1_data=%0h, rs2_data=%0h", $time, CORE_ID, gpu_req_if.wid, gpu_req_if.PC, gpu_req_if.tmask, gpu_req_if.rs1_data, gpu_req_if.rs2_data);
$display("%t: core%0d-issue: wid=%0d, PC=%0h, ex=GPU, tmask=%b, rd=%0d, rs1_data=%0h, rs2_data=%0h", $time, CORE_ID, gpu_req_if.wid, gpu_req_if.PC, gpu_req_if.tmask, gpu_req_if.rd, gpu_req_if.rs1_data, gpu_req_if.rs2_data);
end
end
`endif

62
hw/rtl/VX_opd_collect.v Normal file
View File

@@ -0,0 +1,62 @@
`include "VX_platform.vh"
// VX_opd_collect
//
// Combines two input words that share one valid/ready handshake and
// presents them, concatenated and registered, on a single valid/ready
// output:
//   - inst_in (INSTW bits) is queued through a VX_skid_buffer.
//   - opds_in (OPDSW bits) is queued through a VX_gpr_bypass that is pushed
//     on the input handshake and popped on the skid buffer's output
//     handshake.
//   - {valid, inst, opds} are then captured by a VX_generic_register that
//     holds its contents while the consumer is not ready.
//
// Parameters:
//   INSTW    - width of inst_in.
//   OPDSW    - width of opds_in.
//   PASSTHRU - forwarded unchanged to VX_gpr_bypass; its exact effect is
//              defined by that module, which is not shown here.
//
// data_out layout: {inst, opds}, with the inst word in the upper INSTW bits.
module VX_opd_collect #(
    parameter INSTW    = 1,
    parameter OPDSW    = 1,
    parameter PASSTHRU = 0
) (
    input  wire                   clk,
    input  wire                   reset,

    // producer side
    input  wire                   valid_in,
    output wire                   ready_in,
    input  wire [INSTW-1:0]       inst_in,
    input  wire [OPDSW-1:0]       opds_in,

    // consumer side
    output wire [INSTW+OPDSW-1:0] data_out,
    output wire                   valid_out,
    input  wire                   ready_out
);
    // Output register width: one valid bit plus both payload words.
    localparam REGW = 1 + INSTW + OPDSW;

    wire [INSTW-1:0] inst_q;      // inst word leaving the skid buffer
    wire [OPDSW-1:0] opds_q;      // opds word leaving the bypass
    wire             skid_valid;  // skid buffer has an entry to hand over
    wire             skid_ready;  // output register can accept that entry
    wire             out_stall;   // output register holds a valid, unaccepted entry
    wire             enq_fire;    // input handshake completes this cycle
    wire             deq_fire;    // skid-buffer output handshake completes this cycle

    // The output stage is blocked only while it holds a valid entry that the
    // consumer has not taken; the upstream stage may advance otherwise.
    assign out_stall  = valid_out && !ready_out;
    assign skid_ready = !out_stall;

    assign enq_fire   = valid_in   && ready_in;
    assign deq_fire   = skid_valid && skid_ready;

    VX_skid_buffer #(
        .DATAW (INSTW)
    ) skid_buffer (
        .clk       (clk),
        .reset     (reset),
        .valid_in  (valid_in),
        .ready_in  (ready_in),
        .data_in   (inst_in),
        .data_out  (inst_q),
        .valid_out (skid_valid),
        .ready_out (skid_ready)
    );

    // Pushed and popped on the same handshakes as the skid buffer above.
    VX_gpr_bypass #(
        .DATAW    (OPDSW),
        .PASSTHRU (PASSTHRU)
    ) gpr_bypass (
        .clk      (clk),
        .reset    (reset),
        .push     (enq_fire),
        .pop      (deq_fire),
        .data_in  (opds_in),
        .data_out (opds_q)
    );

    // Output register: frozen while stalled, never flushed.
    VX_generic_register #(
        .N (REGW)
    ) pipe_reg (
        .clk   (clk),
        .reset (reset),
        .stall (out_stall),
        .flush (1'b0),
        .in    ({skid_valid, inst_q, opds_q}),
        .out   ({valid_out, data_out})
    );

endmodule

View File

@@ -98,7 +98,6 @@ module VX_pipeline #(
assign csr_io_rsp_data = csr_io_rsp_if.data;
assign csr_io_rsp_if.ready = csr_io_rsp_ready;
VX_csr_to_issue_if csr_to_issue_if();
VX_cmt_to_csr_if cmt_to_csr_if();
VX_decode_if decode_if();
VX_branch_ctl_if branch_ctl_if();
@@ -157,7 +156,6 @@ module VX_pipeline #(
.decode_if (decode_if),
.writeback_if (writeback_if),
.csr_to_issue_if(csr_to_issue_if),
.alu_req_if (alu_req_if),
.lsu_req_if (lsu_req_if),
@@ -181,7 +179,6 @@ module VX_pipeline #(
.csr_io_req_if (csr_io_req_if),
.csr_io_rsp_if (csr_io_rsp_if),
.csr_to_issue_if(csr_to_issue_if),
.cmt_to_csr_if (cmt_to_csr_if),
.alu_req_if (alu_req_if),

View File

@@ -1,5 +1,5 @@
`ifndef VX_CSR_TO_ISSUE_IF
`define VX_CSR_TO_ISSUE_IF
`ifndef VX_CSR_TO_FPU_IF
`define VX_CSR_TO_FPU_IF
`include "VX_define.vh"
@@ -7,7 +7,7 @@
`IGNORE_WARNINGS_BEGIN
`endif
interface VX_csr_to_issue_if ();
interface VX_csr_to_fpu_if ();
wire [`NW_BITS-1:0] wid;
wire [`FRM_BITS-1:0] frm;

View File

@@ -15,7 +15,7 @@ interface VX_fpu_req_if ();
wire [`NUM_THREADS-1:0] tmask;
wire [31:0] PC;
wire [`FPU_BITS-1:0] op_type;
wire [`FRM_BITS-1:0] frm;
wire [`MOD_BITS-1:0] op_mod;
wire [`NUM_THREADS-1:0][31:0] rs1_data;
wire [`NUM_THREADS-1:0][31:0] rs2_data;
wire [`NUM_THREADS-1:0][31:0] rs3_data;

View File

@@ -17,24 +17,26 @@ module VX_skid_buffer #(
reg valid_out_r;
reg use_buffer;
wire push = valid_in && ready_in;
always @(posedge clk) begin
if (reset) begin
data_out_r <= 0;
buffer <= 0;
use_buffer <= 0;
valid_out_r <= 0;
end else begin
if (valid_in && ready_in && valid_out && !ready_out) begin
assert(!use_buffer);
use_buffer <= 1;
end
end else begin
if (ready_out) begin
use_buffer <= 0;
end
if (valid_in && ready_in) begin
if (push) begin
buffer <= data_in;
if (valid_out_r && !ready_out) begin
assert(!use_buffer);
use_buffer <= 1;
end
end
if (!valid_out || ready_out) begin
if (!valid_out_r || ready_out) begin
valid_out_r <= valid_in || use_buffer;
data_out_r <= use_buffer ? buffer : data_in;
end