initial tensor core
This commit is contained in:
@@ -40,6 +40,10 @@
|
|||||||
`define EXT_F_ENABLE
|
`define EXT_F_ENABLE
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
|
`ifndef EXT_T_DISABLE
|
||||||
|
`define EXT_T_ENABLE
|
||||||
|
`endif
|
||||||
|
|
||||||
`ifndef XLEN_32
|
`ifndef XLEN_32
|
||||||
`ifndef XLEN_64
|
`ifndef XLEN_64
|
||||||
`define XLEN_32
|
`define XLEN_32
|
||||||
@@ -618,6 +622,12 @@
|
|||||||
`define EXT_F_ENABLED 0
|
`define EXT_F_ENABLED 0
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
|
`ifdef EXT_T_ENABLE
|
||||||
|
`define EXT_T_ENABLED 1
|
||||||
|
`else
|
||||||
|
`define EXT_T_ENABLED 0
|
||||||
|
`endif
|
||||||
|
|
||||||
`ifdef EXT_M_ENABLE
|
`ifdef EXT_M_ENABLE
|
||||||
`define EXT_M_ENABLED 1
|
`define EXT_M_ENABLED 1
|
||||||
`else
|
`else
|
||||||
|
|||||||
@@ -58,8 +58,9 @@
|
|||||||
`define EX_LSU 1
|
`define EX_LSU 1
|
||||||
`define EX_SFU 2
|
`define EX_SFU 2
|
||||||
`define EX_FPU (`EX_SFU + `EXT_F_ENABLED)
|
`define EX_FPU (`EX_SFU + `EXT_F_ENABLED)
|
||||||
|
`define EX_TENSOR (`EX_FPU + `EXT_T_ENABLED)
|
||||||
|
|
||||||
`define NUM_EX_UNITS (3 + `EXT_F_ENABLED)
|
`define NUM_EX_UNITS (3 + `EXT_F_ENABLED + `EXT_T_ENABLED)
|
||||||
`define EX_BITS `CLOG2(`NUM_EX_UNITS)
|
`define EX_BITS `CLOG2(`NUM_EX_UNITS)
|
||||||
`define EX_WIDTH `UP(`EX_BITS)
|
`define EX_WIDTH `UP(`EX_BITS)
|
||||||
|
|
||||||
@@ -253,6 +254,8 @@
|
|||||||
`define INST_SFU_IS_WCTL(op) (op <= 5)
|
`define INST_SFU_IS_WCTL(op) (op <= 5)
|
||||||
`define INST_SFU_IS_CSR(op) (op >= 6 && op <= 8)
|
`define INST_SFU_IS_CSR(op) (op >= 6 && op <= 8)
|
||||||
|
|
||||||
|
`define INST_TENSOR_HMMA 4'b0000
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
// non-cacheable tag bits
|
// non-cacheable tag bits
|
||||||
|
|||||||
@@ -27,6 +27,10 @@ module VX_commit import VX_gpu_pkg::*; #(
|
|||||||
`endif
|
`endif
|
||||||
VX_commit_if.slave sfu_commit_if [`ISSUE_WIDTH],
|
VX_commit_if.slave sfu_commit_if [`ISSUE_WIDTH],
|
||||||
|
|
||||||
|
`ifdef EXT_T_ENABLE
|
||||||
|
VX_commit_if.slave tensor_commit_if [`ISSUE_WIDTH],
|
||||||
|
`endif
|
||||||
|
|
||||||
// outputs
|
// outputs
|
||||||
VX_writeback_if.master writeback_if [`ISSUE_WIDTH],
|
VX_writeback_if.master writeback_if [`ISSUE_WIDTH],
|
||||||
VX_commit_csr_if.master commit_csr_if,
|
VX_commit_csr_if.master commit_csr_if,
|
||||||
@@ -65,6 +69,9 @@ module VX_commit import VX_gpu_pkg::*; #(
|
|||||||
sfu_commit_if[i].valid,
|
sfu_commit_if[i].valid,
|
||||||
`ifdef EXT_F_ENABLE
|
`ifdef EXT_F_ENABLE
|
||||||
fpu_commit_if[i].valid,
|
fpu_commit_if[i].valid,
|
||||||
|
`endif
|
||||||
|
`ifdef EXT_T_ENABLE
|
||||||
|
tensor_commit_if[i].valid,
|
||||||
`endif
|
`endif
|
||||||
alu_commit_if[i].valid,
|
alu_commit_if[i].valid,
|
||||||
lsu_commit_if[i].valid
|
lsu_commit_if[i].valid
|
||||||
@@ -73,6 +80,9 @@ module VX_commit import VX_gpu_pkg::*; #(
|
|||||||
sfu_commit_if[i].ready,
|
sfu_commit_if[i].ready,
|
||||||
`ifdef EXT_F_ENABLE
|
`ifdef EXT_F_ENABLE
|
||||||
fpu_commit_if[i].ready,
|
fpu_commit_if[i].ready,
|
||||||
|
`endif
|
||||||
|
`ifdef EXT_T_ENABLE
|
||||||
|
tensor_commit_if[i].ready,
|
||||||
`endif
|
`endif
|
||||||
alu_commit_if[i].ready,
|
alu_commit_if[i].ready,
|
||||||
lsu_commit_if[i].ready
|
lsu_commit_if[i].ready
|
||||||
@@ -81,6 +91,9 @@ module VX_commit import VX_gpu_pkg::*; #(
|
|||||||
sfu_commit_if[i].data,
|
sfu_commit_if[i].data,
|
||||||
`ifdef EXT_F_ENABLE
|
`ifdef EXT_F_ENABLE
|
||||||
fpu_commit_if[i].data,
|
fpu_commit_if[i].data,
|
||||||
|
`endif
|
||||||
|
`ifdef EXT_T_ENABLE
|
||||||
|
tensor_commit_if[i].data,
|
||||||
`endif
|
`endif
|
||||||
alu_commit_if[i].data,
|
alu_commit_if[i].data,
|
||||||
lsu_commit_if[i].data
|
lsu_commit_if[i].data
|
||||||
@@ -157,7 +170,18 @@ module VX_commit import VX_gpu_pkg::*; #(
|
|||||||
|
|
||||||
// Committed instructions
|
// Committed instructions
|
||||||
|
|
||||||
wire [`ISSUE_WIDTH-1:0] committed = commit_fire & commit_eop;
|
// temporary hack to not underflow the pending instructions buffer
|
||||||
|
wire [`ISSUE_WIDTH-1:0] final_hmma;
|
||||||
|
`ifdef EXT_T_ENABLE
|
||||||
|
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
|
||||||
|
assign final_hmma[i] = ~(tensor_commit_if[i].ready && tensor_commit_if[i].valid) || (tensor_commit_if[i].data.rd == `NR_BITS'(32 + 23));
|
||||||
|
end
|
||||||
|
`else
|
||||||
|
assign final_hmma = '1;
|
||||||
|
`endif
|
||||||
|
|
||||||
|
|
||||||
|
wire [`ISSUE_WIDTH-1:0] committed = (commit_fire & commit_eop) & final_hmma;
|
||||||
|
|
||||||
VX_pipe_register #(
|
VX_pipe_register #(
|
||||||
.DATAW (`ISSUE_WIDTH * (1 + `NW_WIDTH)),
|
.DATAW (`ISSUE_WIDTH * (1 + `NW_WIDTH)),
|
||||||
|
|||||||
@@ -65,6 +65,10 @@ module VX_core import VX_gpu_pkg::*; #(
|
|||||||
`ifdef EXT_F_ENABLE
|
`ifdef EXT_F_ENABLE
|
||||||
VX_dispatch_if fpu_dispatch_if[`ISSUE_WIDTH]();
|
VX_dispatch_if fpu_dispatch_if[`ISSUE_WIDTH]();
|
||||||
VX_commit_if fpu_commit_if[`ISSUE_WIDTH]();
|
VX_commit_if fpu_commit_if[`ISSUE_WIDTH]();
|
||||||
|
`endif
|
||||||
|
`ifdef EXT_T_ENABLE
|
||||||
|
VX_dispatch_if tensor_dispatch_if[`ISSUE_WIDTH]();
|
||||||
|
VX_commit_if tensor_commit_if[`ISSUE_WIDTH]();
|
||||||
`endif
|
`endif
|
||||||
VX_dispatch_if sfu_dispatch_if[`ISSUE_WIDTH]();
|
VX_dispatch_if sfu_dispatch_if[`ISSUE_WIDTH]();
|
||||||
VX_commit_if sfu_commit_if[`ISSUE_WIDTH]();
|
VX_commit_if sfu_commit_if[`ISSUE_WIDTH]();
|
||||||
@@ -172,6 +176,9 @@ module VX_core import VX_gpu_pkg::*; #(
|
|||||||
.lsu_dispatch_if(lsu_dispatch_if),
|
.lsu_dispatch_if(lsu_dispatch_if),
|
||||||
`ifdef EXT_F_ENABLE
|
`ifdef EXT_F_ENABLE
|
||||||
.fpu_dispatch_if(fpu_dispatch_if),
|
.fpu_dispatch_if(fpu_dispatch_if),
|
||||||
|
`endif
|
||||||
|
`ifdef EXT_T_ENABLE
|
||||||
|
.tensor_dispatch_if(tensor_dispatch_if),
|
||||||
`endif
|
`endif
|
||||||
.sfu_dispatch_if(sfu_dispatch_if)
|
.sfu_dispatch_if(sfu_dispatch_if)
|
||||||
);
|
);
|
||||||
@@ -197,6 +204,10 @@ module VX_core import VX_gpu_pkg::*; #(
|
|||||||
.fpu_dispatch_if(fpu_dispatch_if),
|
.fpu_dispatch_if(fpu_dispatch_if),
|
||||||
.fpu_commit_if (fpu_commit_if),
|
.fpu_commit_if (fpu_commit_if),
|
||||||
`endif
|
`endif
|
||||||
|
`ifdef EXT_T_ENABLE
|
||||||
|
.tensor_dispatch_if (tensor_dispatch_if),
|
||||||
|
.tensor_commit_if (tensor_commit_if),
|
||||||
|
`endif
|
||||||
|
|
||||||
.commit_csr_if (commit_csr_if),
|
.commit_csr_if (commit_csr_if),
|
||||||
.sched_csr_if (sched_csr_if),
|
.sched_csr_if (sched_csr_if),
|
||||||
@@ -227,6 +238,9 @@ module VX_core import VX_gpu_pkg::*; #(
|
|||||||
.fpu_commit_if (fpu_commit_if),
|
.fpu_commit_if (fpu_commit_if),
|
||||||
`endif
|
`endif
|
||||||
.sfu_commit_if (sfu_commit_if),
|
.sfu_commit_if (sfu_commit_if),
|
||||||
|
`ifdef EXT_T_ENABLE
|
||||||
|
.tensor_commit_if (tensor_commit_if),
|
||||||
|
`endif
|
||||||
|
|
||||||
.writeback_if (writeback_if),
|
.writeback_if (writeback_if),
|
||||||
|
|
||||||
|
|||||||
@@ -533,6 +533,12 @@ module VX_decode #(
|
|||||||
default:;
|
default:;
|
||||||
endcase
|
endcase
|
||||||
end
|
end
|
||||||
|
`ifdef EXT_T_ENABLE
|
||||||
|
`INST_EXT4: begin
|
||||||
|
ex_type = `EX_TENSOR;
|
||||||
|
op_type = `INST_TENSOR_HMMA;
|
||||||
|
end
|
||||||
|
`endif
|
||||||
default:;
|
default:;
|
||||||
endcase
|
endcase
|
||||||
end
|
end
|
||||||
|
|||||||
@@ -31,6 +31,9 @@ module VX_dispatch import VX_gpu_pkg::*; #(
|
|||||||
VX_dispatch_if.master lsu_dispatch_if [`ISSUE_WIDTH],
|
VX_dispatch_if.master lsu_dispatch_if [`ISSUE_WIDTH],
|
||||||
`ifdef EXT_F_ENABLE
|
`ifdef EXT_F_ENABLE
|
||||||
VX_dispatch_if.master fpu_dispatch_if [`ISSUE_WIDTH],
|
VX_dispatch_if.master fpu_dispatch_if [`ISSUE_WIDTH],
|
||||||
|
`endif
|
||||||
|
`ifdef EXT_T_ENABLE
|
||||||
|
VX_dispatch_if.master tensor_dispatch_if [`ISSUE_WIDTH],
|
||||||
`endif
|
`endif
|
||||||
VX_dispatch_if.master sfu_dispatch_if [`ISSUE_WIDTH]
|
VX_dispatch_if.master sfu_dispatch_if [`ISSUE_WIDTH]
|
||||||
);
|
);
|
||||||
@@ -139,6 +142,35 @@ module VX_dispatch import VX_gpu_pkg::*; #(
|
|||||||
end
|
end
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
|
// Tensor Core dispatch
|
||||||
|
|
||||||
|
`ifdef EXT_T_ENABLE
|
||||||
|
|
||||||
|
VX_operands_if tensor_operands_if[`ISSUE_WIDTH]();
|
||||||
|
|
||||||
|
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
|
||||||
|
assign tensor_operands_if[i].valid = operands_if[i].valid && (operands_if[i].data.ex_type == `EX_TENSOR);
|
||||||
|
assign tensor_operands_if[i].data = operands_if[i].data;
|
||||||
|
|
||||||
|
`RESET_RELAY (tensor_reset, reset);
|
||||||
|
|
||||||
|
VX_elastic_buffer #(
|
||||||
|
.DATAW (DATAW),
|
||||||
|
.SIZE (2),
|
||||||
|
.OUT_REG (2)
|
||||||
|
) tensor_buffer (
|
||||||
|
.clk (clk),
|
||||||
|
.reset (tensor_reset),
|
||||||
|
.valid_in (tensor_operands_if[i].valid),
|
||||||
|
.ready_in (tensor_operands_if[i].ready),
|
||||||
|
.data_in (`TO_DISPATCH_DATA(tensor_operands_if[i].data, last_active_tid[i])),
|
||||||
|
.data_out (tensor_dispatch_if[i].data),
|
||||||
|
.valid_out (tensor_dispatch_if[i].valid),
|
||||||
|
.ready_out (tensor_dispatch_if[i].ready)
|
||||||
|
);
|
||||||
|
end
|
||||||
|
`endif
|
||||||
|
|
||||||
// SFU dispatch
|
// SFU dispatch
|
||||||
|
|
||||||
VX_operands_if sfu_operands_if[`ISSUE_WIDTH]();
|
VX_operands_if sfu_operands_if[`ISSUE_WIDTH]();
|
||||||
@@ -171,6 +203,9 @@ module VX_dispatch import VX_gpu_pkg::*; #(
|
|||||||
|| (lsu_operands_if[i].ready && (operands_if[i].data.ex_type == `EX_LSU))
|
|| (lsu_operands_if[i].ready && (operands_if[i].data.ex_type == `EX_LSU))
|
||||||
`ifdef EXT_F_ENABLE
|
`ifdef EXT_F_ENABLE
|
||||||
|| (fpu_operands_if[i].ready && (operands_if[i].data.ex_type == `EX_FPU))
|
|| (fpu_operands_if[i].ready && (operands_if[i].data.ex_type == `EX_FPU))
|
||||||
|
`endif
|
||||||
|
`ifdef EXT_T_ENABLE
|
||||||
|
|| (tensor_operands_if[i].ready && (operands_if[i].data.ex_type == `EX_TENSOR))
|
||||||
`endif
|
`endif
|
||||||
|| (sfu_operands_if[i].ready && (operands_if[i].data.ex_type == `EX_SFU));
|
|| (sfu_operands_if[i].ready && (operands_if[i].data.ex_type == `EX_SFU));
|
||||||
end
|
end
|
||||||
|
|||||||
@@ -53,6 +53,11 @@ module VX_execute import VX_gpu_pkg::*; #(
|
|||||||
VX_commit_if.master sfu_commit_if [`ISSUE_WIDTH],
|
VX_commit_if.master sfu_commit_if [`ISSUE_WIDTH],
|
||||||
VX_warp_ctl_if.master warp_ctl_if,
|
VX_warp_ctl_if.master warp_ctl_if,
|
||||||
|
|
||||||
|
`ifdef EXT_T_ENABLE
|
||||||
|
VX_dispatch_if.slave tensor_dispatch_if [`ISSUE_WIDTH],
|
||||||
|
VX_commit_if.master tensor_commit_if [`ISSUE_WIDTH],
|
||||||
|
`endif
|
||||||
|
|
||||||
// simulation helper signals
|
// simulation helper signals
|
||||||
output wire sim_ebreak
|
output wire sim_ebreak
|
||||||
);
|
);
|
||||||
@@ -127,6 +132,18 @@ module VX_execute import VX_gpu_pkg::*; #(
|
|||||||
.commit_if (sfu_commit_if)
|
.commit_if (sfu_commit_if)
|
||||||
);
|
);
|
||||||
|
|
||||||
|
`ifdef EXT_T_ENABLE
|
||||||
|
VX_tensor_core #(
|
||||||
|
|
||||||
|
) tensor_core (
|
||||||
|
.clk(clk),
|
||||||
|
.reset(reset),
|
||||||
|
|
||||||
|
.dispatch_if(tensor_dispatch_if),
|
||||||
|
.commit_if(tensor_commit_if)
|
||||||
|
);
|
||||||
|
`endif
|
||||||
|
|
||||||
// simulation helper signal to get RISC-V tests Pass/Fail status
|
// simulation helper signal to get RISC-V tests Pass/Fail status
|
||||||
assign sim_ebreak = alu_dispatch_if[0].valid && alu_dispatch_if[0].ready
|
assign sim_ebreak = alu_dispatch_if[0].valid && alu_dispatch_if[0].ready
|
||||||
&& alu_dispatch_if[0].data.wis == 0
|
&& alu_dispatch_if[0].data.wis == 0
|
||||||
|
|||||||
@@ -36,6 +36,8 @@ module VX_ibuffer import VX_gpu_pkg::*; #(
|
|||||||
|
|
||||||
assign decode_if.ready = ibuf_ready_in[decode_isw];
|
assign decode_if.ready = ibuf_ready_in[decode_isw];
|
||||||
|
|
||||||
|
VX_ibuffer_if uop_sequencer_if [`ISSUE_WIDTH];
|
||||||
|
|
||||||
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
|
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
|
||||||
VX_elastic_buffer #(
|
VX_elastic_buffer #(
|
||||||
.DATAW (DATAW),
|
.DATAW (DATAW),
|
||||||
@@ -62,13 +64,24 @@ module VX_ibuffer import VX_gpu_pkg::*; #(
|
|||||||
decode_if.data.rs1,
|
decode_if.data.rs1,
|
||||||
decode_if.data.rs2,
|
decode_if.data.rs2,
|
||||||
decode_if.data.rs3}),
|
decode_if.data.rs3}),
|
||||||
.data_out(ibuffer_if[i].data),
|
|
||||||
.valid_out (ibuffer_if[i].valid),
|
.data_out (uop_sequencer_if[i].data),
|
||||||
.ready_out(ibuffer_if[i].ready)
|
.valid_out (uop_sequencer_if[i].valid),
|
||||||
|
.ready_out (uop_sequencer_if[i].ready)
|
||||||
);
|
);
|
||||||
|
|
||||||
`ifndef L1_ENABLE
|
`ifndef L1_ENABLE
|
||||||
assign decode_if.ibuf_pop[i] = ibuffer_if[i].valid && ibuffer_if[i].ready;
|
assign decode_if.ibuf_pop[i] = uop_sequencer_if[i].valid && uop_sequencer_if[i].ready;
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
|
VX_uop_sequencer uop_sequencer (
|
||||||
|
.clk(clk),
|
||||||
|
.reset(reset),
|
||||||
|
|
||||||
|
.uop_sequencer_if(uop_sequencer_if[i]),
|
||||||
|
.ibuffer_if(ibuffer_if[i])
|
||||||
|
);
|
||||||
|
|
||||||
end
|
end
|
||||||
|
|
||||||
endmodule
|
endmodule
|
||||||
|
|||||||
@@ -33,6 +33,9 @@ module VX_issue #(
|
|||||||
VX_dispatch_if.master lsu_dispatch_if [`ISSUE_WIDTH],
|
VX_dispatch_if.master lsu_dispatch_if [`ISSUE_WIDTH],
|
||||||
`ifdef EXT_F_ENABLE
|
`ifdef EXT_F_ENABLE
|
||||||
VX_dispatch_if.master fpu_dispatch_if [`ISSUE_WIDTH],
|
VX_dispatch_if.master fpu_dispatch_if [`ISSUE_WIDTH],
|
||||||
|
`endif
|
||||||
|
`ifdef EXT_T_ENABLE
|
||||||
|
VX_dispatch_if.master tensor_dispatch_if [`ISSUE_WIDTH],
|
||||||
`endif
|
`endif
|
||||||
VX_dispatch_if.master sfu_dispatch_if [`ISSUE_WIDTH]
|
VX_dispatch_if.master sfu_dispatch_if [`ISSUE_WIDTH]
|
||||||
);
|
);
|
||||||
@@ -92,6 +95,9 @@ module VX_issue #(
|
|||||||
.lsu_dispatch_if(lsu_dispatch_if),
|
.lsu_dispatch_if(lsu_dispatch_if),
|
||||||
`ifdef EXT_F_ENABLE
|
`ifdef EXT_F_ENABLE
|
||||||
.fpu_dispatch_if(fpu_dispatch_if),
|
.fpu_dispatch_if(fpu_dispatch_if),
|
||||||
|
`endif
|
||||||
|
`ifdef EXT_T_ENABLE
|
||||||
|
.tensor_dispatch_if(tensor_dispatch_if),
|
||||||
`endif
|
`endif
|
||||||
.sfu_dispatch_if(sfu_dispatch_if)
|
.sfu_dispatch_if(sfu_dispatch_if)
|
||||||
);
|
);
|
||||||
|
|||||||
15
hw/rtl/core/VX_tensor_core.sv
Normal file
15
hw/rtl/core/VX_tensor_core.sv
Normal file
@@ -0,0 +1,15 @@
|
|||||||
|
`include "VX_fpu_define.vh"
|
||||||
|
|
||||||
|
// Tensor core execution unit (placeholder).
//
// Ports:
//   clk, reset  - clock and synchronous reset (not used by this stub yet)
//   dispatch_if - per-issue-slot instruction input from the dispatcher
//   commit_if   - per-issue-slot result output towards the commit stage
//
// No datapath is implemented yet. Every output of the two interface arrays
// is tied off explicitly so that nothing is left floating: the unit never
// accepts a dispatched instruction and never presents a commit.
module VX_tensor_core #(

) (
    input clk,
    input reset,

    VX_dispatch_if.slave dispatch_if [`ISSUE_WIDTH],
    VX_commit_if.master commit_if [`ISSUE_WIDTH]
);
    `STATIC_ASSERT(`NUM_THREADS == 32, ("tensor core requires # of threads in a warp to be 32"));
    `UNUSED_VAR(clk);
    `UNUSED_VAR(reset);

    // Explicit tie-offs: without these the handshake/data outputs are
    // undriven (X/Z in 4-state simulation) and leak into the commit logic.
    for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
        assign dispatch_if[i].ready = 1'b0; // never accept work (no datapath yet)
        assign commit_if[i].valid   = 1'b0; // never produce a result
        assign commit_if[i].data    = '0;
    end
endmodule
|
||||||
187
hw/rtl/core/VX_uop_sequencer.sv
Normal file
187
hw/rtl/core/VX_uop_sequencer.sv
Normal file
@@ -0,0 +1,187 @@
|
|||||||
|
`include "VX_define.vh"
|
||||||
|
|
||||||
|
// FREG(x): register specifier for floating-point register f<x>.
// The leading 1'b1 selects the upper half of the register index space and the
// low `NRI_BITS carry the register number itself, so FREG(16) encodes f16 as
// its use sites document. (The previous form encoded `CLOG2(x), which maps
// f16 to f4 and makes f1 alias f0.)
// NOTE(review): assumes 1 + `NRI_BITS == `NR_BITS - confirm against VX_define.vh.
`define FREG(x) {1'b1, `NRI_BITS'(x)}
|
||||||
|
|
||||||
|
// Micro-op sequencer placed between an upstream instruction stream
// (uop_sequencer_if) and the downstream instruction buffer port (ibuffer_if).
//
// Ports:
//   clk, reset        - clock and synchronous reset
//   uop_sequencer_if  - incoming instruction (valid/ready/data)
//   ibuffer_if        - outgoing instruction or micro-op (valid/ready/data)
//
// With EXT_T_ENABLE defined:
//   * An incoming instruction whose ex_type is `EX_TENSOR is expanded into a
//     sequence of micro-ops read from a combinational table indexed by the
//     micro-program counter (upc). The incoming instruction is held (ready is
//     kept low) until the micro-op marked FINISH is accepted downstream.
//   * Any other instruction is passed through unchanged.
// Without EXT_T_ENABLE the module is a pure pass-through.
module VX_uop_sequencer import VX_gpu_pkg::*; (
    input clk,
    input reset,

    VX_ibuffer_if.slave uop_sequencer_if,
    VX_ibuffer_if.master ibuffer_if
);

`ifdef EXT_T_ENABLE
    localparam UOP_TABLE_SIZE = 64;
    localparam UPC_BITS = `CLOG2(UOP_TABLE_SIZE);

    // micro-branch codes (top UBR_BITS of every table entry)
    localparam NEXT = 2'b00;    // more micro-ops follow
    localparam FINISH = 2'b01;  // last micro-op of the sequence

    localparam UBR_BITS = 2;

    // uop metadata (sequencing, next state), execution metadata (EX_TYPE, OP_TYPE, OP_MOD), wb, use pc, use imm, pc, imm, rd, rs1, rs2, rs3
    localparam UOP_TABLE_WIDTH = UBR_BITS + UPC_BITS + `EX_BITS + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + 1 + `XLEN + `XLEN + (`NR_BITS * 4);
    localparam IBUFFER_IF_DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `XLEN + 1 + `EX_BITS + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + `XLEN + (`NR_BITS * 4);

    // reserve space at start of table for more uop sequences
    localparam HMMA_SET0_STEP0_0 = UPC_BITS'(0);
    localparam HMMA_SET0_STEP0_1 = UPC_BITS'(8);
    /*
    localparam HMMA_SET0_STEP1_0 = UPC_BITS'(9);
    localparam HMMA_SET0_STEP1_1 = UPC_BITS'(10);
    localparam HMMA_SET0_STEP2_0 = UPC_BITS'(11);
    localparam HMMA_SET0_STEP2_1 = UPC_BITS'(12);
    localparam HMMA_SET0_STEP3_0 = UPC_BITS'(13);
    localparam HMMA_SET0_STEP3_1 = UPC_BITS'(14);

    localparam HMMA_SET1_STEP0_0 = UPC_BITS'(15);
    localparam HMMA_SET1_STEP0_1 = UPC_BITS'(16);
    localparam HMMA_SET1_STEP1_0 = UPC_BITS'(17);
    localparam HMMA_SET1_STEP1_1 = UPC_BITS'(18);
    localparam HMMA_SET1_STEP2_0 = UPC_BITS'(19);
    localparam HMMA_SET1_STEP2_1 = UPC_BITS'(20);
    localparam HMMA_SET1_STEP3_0 = UPC_BITS'(21);
    localparam HMMA_SET1_STEP3_1 = UPC_BITS'(22);

    localparam HMMA_SET2_STEP0_0 = UPC_BITS'(23);
    localparam HMMA_SET2_STEP0_1 = UPC_BITS'(24);
    localparam HMMA_SET2_STEP1_0 = UPC_BITS'(25);
    localparam HMMA_SET2_STEP1_1 = UPC_BITS'(26);
    localparam HMMA_SET2_STEP2_0 = UPC_BITS'(27);
    localparam HMMA_SET2_STEP2_1 = UPC_BITS'(28);
    localparam HMMA_SET2_STEP3_0 = UPC_BITS'(29);
    localparam HMMA_SET2_STEP3_1 = UPC_BITS'(30);

    localparam HMMA_SET3_STEP0_0 = UPC_BITS'(31);
    localparam HMMA_SET3_STEP0_1 = UPC_BITS'(32);
    localparam HMMA_SET3_STEP1_0 = UPC_BITS'(33);
    localparam HMMA_SET3_STEP1_1 = UPC_BITS'(34);
    localparam HMMA_SET3_STEP2_0 = UPC_BITS'(35);
    localparam HMMA_SET3_STEP2_1 = UPC_BITS'(36);
    localparam HMMA_SET3_STEP3_0 = UPC_BITS'(37);
    localparam HMMA_SET3_STEP3_1 = UPC_BITS'(38);
    */

    // ------------------------------------------------------------------
    // State and control signals. Everything is declared before first use,
    // and all combinational values are continuous assignments: an
    // initialiser on a `logic` declaration is evaluated only once at time
    // zero and would not track its operands.
    // ------------------------------------------------------------------
    logic [UOP_TABLE_WIDTH-1:0] uop;    // current table entry (combinational)
    logic [UPC_BITS-1:0]        upc_r;  // registered micro-program counter
    logic                       use_uop_1d; // use_uop delayed by one cycle

    // incoming instruction is microcoded
    wire use_uop = uop_sequencer_if.valid && (uop_sequencer_if.data.ex_type == `EX_TENSOR);

    // first cycle of a microcoded instruction
    wire uop_start = ~use_uop_1d && use_uop;

    // 1st cycle of microcoded operation: use op_type to determine the entry
    // point into the microcode table; afterwards follow the registered upc
    wire [UPC_BITS-1:0] upc = uop_start ? UPC_BITS'(uop_sequencer_if.data.op_type) : upc_r;

    // sequencing fields of the current table entry
    wire [UBR_BITS-1:0] ubr = uop[UOP_TABLE_WIDTH-1:UOP_TABLE_WIDTH-UBR_BITS];
    wire [UPC_BITS-1:0] next_upc = uop[UOP_TABLE_WIDTH-UBR_BITS-1:UOP_TABLE_WIDTH-UBR_BITS-UPC_BITS];

    // a micro-op is accepted downstream this cycle
    wire uop_fire = use_uop && ibuffer_if.valid && ibuffer_if.ready;

    // the microcoded instruction is consumed upstream this cycle
    wire uop_finish = use_uop && uop_sequencer_if.valid && uop_sequencer_if.ready;

    // Next upc: advance along the table when a micro-op fires, otherwise
    // hold the current upc (which already equals the entry point on the
    // start cycle). The original author notes that folding the upc and
    // upc_n logic into a single procedural block triggers a spurious
    // Verilator UNOPTFLAT lint, so they stay separate assignments.
    wire [UPC_BITS-1:0] upc_n = uop_fire ? next_upc : upc;

    // register layout: f0-f7 used for A, f8-f15 used for B, f16-f23 used for C

    // Microcode table lookup.
    always @(*) begin
        case (upc)
            HMMA_SET0_STEP0_0: begin
                uop = {
                    NEXT,
                    HMMA_SET0_STEP0_1,
                    `EX_BITS'(`EX_TENSOR),
                    `INST_OP_BITS'(0),  // denotes that the first half is being computed
                    `INST_MOD_BITS'(0), // field is unused for HMMA
                    1'b1,               // write back
                    1'b0,               // don't use PC
                    1'b0,               // don't use immediate
                    `XLEN'(0),          // PC is unused - TODO: don't send a bogus PC down the pipeline as it is very confusing in trace
                    `XLEN'(0),          // immediate is unused
                    `FREG(16),          // rd=f16
                    `FREG(0),           // rs1=f0,
                    `FREG(8),           // rs2=f8
                    `FREG(16)           // rs3=f16
                };
            end
            HMMA_SET0_STEP0_1: begin
                uop = {
                    FINISH,
                    HMMA_SET0_STEP0_0,
                    `EX_BITS'(`EX_TENSOR),
                    `INST_OP_BITS'(1),  // denotes that the second half is being computed
                    `INST_MOD_BITS'(0), // field is unused for HMMA
                    1'b1,               // write back
                    1'b0,               // don't use PC
                    1'b0,               // don't use immediate
                    `XLEN'(0),          // PC is unused - TODO: don't send a bogus PC down the pipeline as it is very confusing in trace
                    `XLEN'(0),          // immediate is unused
                    `FREG(17),          // rd=f17
                    `FREG(1),           // rs1=f1,
                    `FREG(9),           // rs2=f9
                    `FREG(17)           // rs3=f17
                };
            end
            default: begin
                uop = '0;
            end
        endcase
    end

    // copy UUID, wis, tmask from microcoded instruction; the remaining
    // fields come from the table entry with its sequencing bits stripped
    wire [IBUFFER_IF_DATAW-1:0] ibuffer_output = {
        uop_sequencer_if.data.uuid,
        uop_sequencer_if.data.wis,
        uop_sequencer_if.data.tmask,
        uop[UOP_TABLE_WIDTH-UBR_BITS-UPC_BITS-1:0]
    };

    assign ibuffer_if.valid = use_uop ? 1'b1 : uop_sequencer_if.valid;
    // hold the microcoded instruction upstream until its FINISH micro-op fires
    assign uop_sequencer_if.ready = use_uop ? (uop_fire && ubr == FINISH) : ibuffer_if.ready;
    assign ibuffer_if.data = use_uop ? ibuffer_output : uop_sequencer_if.data;

    always @(posedge clk) begin
        if (reset) begin
            upc_r <= '0;
            use_uop_1d <= '0;
        end
        else begin
            upc_r <= upc_n;
            if (uop_finish) begin
                use_uop_1d <= 1'b0; // allow microcoded instructions to start immediately after each other
            end
            else begin
                use_uop_1d <= use_uop;
            end
        end
    end
`else
    // No tensor extension: transparent pass-through.
    `UNUSED_VAR(clk);
    `UNUSED_VAR(reset);
    assign ibuffer_if.valid = uop_sequencer_if.valid;
    assign uop_sequencer_if.ready = ibuffer_if.ready;
    assign ibuffer_if.data = uop_sequencer_if.data;
`endif

endmodule
|
||||||
Reference in New Issue
Block a user