Add store fencing, enlarge shared memory, and fix tensor core for FireSim
This commit is contained in:
@@ -180,7 +180,7 @@
|
||||
#endif
|
||||
|
||||
#ifndef SMEM_LOG_SIZE
|
||||
#define SMEM_LOG_SIZE 15
|
||||
#define SMEM_LOG_SIZE 17
|
||||
#endif
|
||||
|
||||
#ifndef IO_BASE_ADDR
|
||||
|
||||
@@ -95,7 +95,7 @@
|
||||
`endif
|
||||
|
||||
`ifndef NUM_BARRIERS
|
||||
`define NUM_BARRIERS 4
|
||||
`define NUM_BARRIERS 8
|
||||
`endif
|
||||
|
||||
`ifndef SOCKET_SIZE
|
||||
@@ -179,7 +179,7 @@
|
||||
`endif
|
||||
|
||||
`ifndef SMEM_LOG_SIZE
|
||||
`define SMEM_LOG_SIZE 15
|
||||
`define SMEM_LOG_SIZE 17
|
||||
`endif
|
||||
|
||||
`ifndef IO_BASE_ADDR
|
||||
|
||||
@@ -131,6 +131,8 @@ module Vortex import VX_gpu_pkg::*; #(
|
||||
output wire [31:0] acc_write_out,
|
||||
output wire acc_write_en,
|
||||
|
||||
input downstream_mem_busy,
|
||||
|
||||
output finished,
|
||||
|
||||
input traceStall,
|
||||
@@ -421,6 +423,7 @@ module Vortex import VX_gpu_pkg::*; #(
|
||||
.sim_ebreak (sim_ebreak),
|
||||
.sim_wb_value (sim_wb_value),
|
||||
.busy (busy),
|
||||
.downstream_mem_busy(downstream_mem_busy),
|
||||
|
||||
.acc_read_in (acc_read_in),
|
||||
.acc_write_out (acc_write_out),
|
||||
|
||||
@@ -35,6 +35,7 @@
|
||||
`define NUM_CORES 2
|
||||
`define NUM_THREADS 8
|
||||
`define NUM_WARPS 8
|
||||
`define EXT_T_DISABLE
|
||||
|
||||
`define FIRESIM
|
||||
`endif // SYNTHESIS
|
||||
|
||||
@@ -48,6 +48,7 @@ module VX_core import VX_gpu_pkg::*; #(
|
||||
|
||||
// Status
|
||||
output wire busy, // stays 1 while busy, 0 when done (termination); detect the negative edge
|
||||
input wire downstream_mem_busy,
|
||||
|
||||
input wire [31:0] acc_read_in,
|
||||
output wire [31:0] acc_write_out,
|
||||
@@ -198,6 +199,7 @@ module VX_core import VX_gpu_pkg::*; #(
|
||||
.reset (execute_reset),
|
||||
|
||||
.base_dcrs (base_dcrs),
|
||||
.downstream_mem_busy(downstream_mem_busy),
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
.mem_perf_if (mem_perf_tmp_if),
|
||||
|
||||
@@ -22,6 +22,7 @@ module VX_execute import VX_gpu_pkg::*; #(
|
||||
input wire reset,
|
||||
|
||||
input base_dcrs_t base_dcrs,
|
||||
input wire downstream_mem_busy,
|
||||
|
||||
// Dcache interface
|
||||
VX_mem_bus_if.master dcache_bus_if [DCACHE_NUM_REQS],
|
||||
@@ -92,6 +93,7 @@ module VX_execute import VX_gpu_pkg::*; #(
|
||||
`SCOPE_IO_BIND (0)
|
||||
.clk (clk),
|
||||
.reset (lsu_reset),
|
||||
.downstream_mem_busy (downstream_mem_busy),
|
||||
.cache_bus_if (dcache_bus_if),
|
||||
.dispatch_if (lsu_dispatch_if),
|
||||
.commit_if (lsu_commit_if)
|
||||
|
||||
@@ -21,6 +21,8 @@ module VX_lsu_unit import VX_gpu_pkg::*; #(
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
input wire downstream_mem_busy,
|
||||
|
||||
// Dcache interface
|
||||
VX_mem_bus_if.master cache_bus_if [DCACHE_NUM_REQS],
|
||||
|
||||
@@ -131,7 +133,21 @@ module VX_lsu_unit import VX_gpu_pkg::*; #(
|
||||
|
||||
// fence: stall the pipeline until all pending requests are sent and downstream memory is no longer busy
|
||||
wire is_fence = `INST_LSU_IS_FENCE(execute_if[0].data.op_type);
|
||||
wire fence_wait = is_fence && ~mem_req_empty;
|
||||
wire start_fence = is_fence && ((~mem_req_empty) || downstream_mem_busy);
|
||||
wire end_fence = mem_req_empty && (!downstream_mem_busy);
|
||||
logic fencing;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
fencing <= 1'b0;
|
||||
end else if (start_fence) begin
|
||||
fencing <= 1'b1;
|
||||
end else if (end_fence) begin
|
||||
fencing <= 1'b0;
|
||||
end
|
||||
end
|
||||
|
||||
wire fence_wait = start_fence || fencing;
|
||||
|
||||
assign lsu_valid = execute_if[0].valid && ~fence_wait;
|
||||
assign execute_if[0].ready = lsu_ready && ~fence_wait;
|
||||
|
||||
@@ -384,7 +384,7 @@ module VX_schedule import VX_gpu_pkg::*; #(
|
||||
.empty (no_pending_instr)
|
||||
);
|
||||
|
||||
`BUFFER_EX(busy, (active_warps != 0 || ~no_pending_instr), 1'b1, 1);
|
||||
`BUFFER_EX(busy, (active_warps != 0 || stalled_warps != 0 || barrier_stalls != 0 || ~no_pending_instr), 1'b1, 1);
|
||||
|
||||
// export CSRs
|
||||
assign sched_csr_if.cycles = cycles;
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
`ifdef EXT_T_ENABLE
|
||||
`include "VX_fpu_define.vh"
|
||||
|
||||
module VX_tensor_core #(
|
||||
@@ -43,7 +44,11 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #(
|
||||
|
||||
assign dispatch_if.ready = &octet_operands_ready;
|
||||
|
||||
`ifdef EXT_T_ENABLE
|
||||
for (genvar i = 0; i < 4/*octets*/; ++i) begin
|
||||
`else
|
||||
for (genvar i = 0; i < 0; ++i) begin
|
||||
`endif
|
||||
// lane-to-octet mapping; see figure 13 of the paper
|
||||
wire [7:0][31:0] octet_A = {
|
||||
dispatch_if.data.rs1_data[16+4*i +: 4], dispatch_if.data.rs1_data[4*i +: 4]
|
||||
@@ -314,3 +319,4 @@ module VX_tensor_octet #(
|
||||
.D_tile(D_out)
|
||||
);
|
||||
endmodule
|
||||
`endif
|
||||
|
||||
97
hw/rtl/core/VX_tensor_ucode.txt
Normal file
97
hw/rtl/core/VX_tensor_ucode.txt
Normal file
@@ -0,0 +1,97 @@
|
||||
// uop field order, MSB first: uop metadata (sequencing, next state), execution metadata (EX_TYPE, OP_TYPE, OP_MOD), wb, use pc, use imm, pc, imm, rd, rs1, rs2, rs3
|
||||
HMMA_SET0_STEP0_0: begin
|
||||
uop = {NEXT, HMMA_SET0_STEP0_1, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(0), `INST_MOD_BITS'(0), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(16), `FREG(0), `FREG(8), `FREG(16)};
|
||||
end
|
||||
HMMA_SET0_STEP0_1: begin
|
||||
uop = {NEXT, HMMA_SET0_STEP1_0, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(0), `INST_MOD_BITS'(1), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(17), `FREG(1), `FREG(9), `FREG(17)};
|
||||
end
|
||||
HMMA_SET0_STEP1_0: begin
|
||||
uop = {NEXT, HMMA_SET0_STEP1_1, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(1), `INST_MOD_BITS'(0), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(18), `FREG(0), `FREG(8), `FREG(18)};
|
||||
end
|
||||
HMMA_SET0_STEP1_1: begin
|
||||
uop = {NEXT, HMMA_SET0_STEP2_0, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(1), `INST_MOD_BITS'(1), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(19), `FREG(1), `FREG(9), `FREG(19)};
|
||||
end
|
||||
HMMA_SET0_STEP2_0: begin
|
||||
uop = {NEXT, HMMA_SET0_STEP2_1, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(2), `INST_MOD_BITS'(0), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(20), `FREG(0), `FREG(8), `FREG(20)};
|
||||
end
|
||||
HMMA_SET0_STEP2_1: begin
|
||||
uop = {NEXT, HMMA_SET0_STEP3_0, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(2), `INST_MOD_BITS'(1), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(21), `FREG(1), `FREG(9), `FREG(21)};
|
||||
end
|
||||
HMMA_SET0_STEP3_0: begin
|
||||
uop = {NEXT, HMMA_SET0_STEP3_1, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(3), `INST_MOD_BITS'(0), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(22), `FREG(0), `FREG(8), `FREG(22)};
|
||||
end
|
||||
HMMA_SET0_STEP3_1: begin
|
||||
uop = {NEXT, HMMA_SET1_STEP0_0, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(3), `INST_MOD_BITS'(1), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(23), `FREG(1), `FREG(9), `FREG(23)};
|
||||
end
|
||||
HMMA_SET1_STEP0_0: begin
|
||||
uop = {NEXT, HMMA_SET1_STEP0_1, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(0), `INST_MOD_BITS'(0), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(16), `FREG(2), `FREG(10), `FREG(16)};
|
||||
end
|
||||
HMMA_SET1_STEP0_1: begin
|
||||
uop = {NEXT, HMMA_SET1_STEP1_0, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(0), `INST_MOD_BITS'(1), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(17), `FREG(3), `FREG(11), `FREG(17)};
|
||||
end
|
||||
HMMA_SET1_STEP1_0: begin
|
||||
uop = {NEXT, HMMA_SET1_STEP1_1, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(1), `INST_MOD_BITS'(0), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(18), `FREG(2), `FREG(10), `FREG(18)};
|
||||
end
|
||||
HMMA_SET1_STEP1_1: begin
|
||||
uop = {NEXT, HMMA_SET1_STEP2_0, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(1), `INST_MOD_BITS'(1), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(19), `FREG(3), `FREG(11), `FREG(19)};
|
||||
end
|
||||
HMMA_SET1_STEP2_0: begin
|
||||
uop = {NEXT, HMMA_SET1_STEP2_1, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(2), `INST_MOD_BITS'(0), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(20), `FREG(2), `FREG(10), `FREG(20)};
|
||||
end
|
||||
HMMA_SET1_STEP2_1: begin
|
||||
uop = {NEXT, HMMA_SET1_STEP3_0, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(2), `INST_MOD_BITS'(1), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(21), `FREG(3), `FREG(11), `FREG(21)};
|
||||
end
|
||||
HMMA_SET1_STEP3_0: begin
|
||||
uop = {NEXT, HMMA_SET1_STEP3_1, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(3), `INST_MOD_BITS'(0), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(22), `FREG(2), `FREG(10), `FREG(22)};
|
||||
end
|
||||
HMMA_SET1_STEP3_1: begin
|
||||
uop = {NEXT, HMMA_SET2_STEP0_0, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(3), `INST_MOD_BITS'(1), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(23), `FREG(3), `FREG(11), `FREG(23)};
|
||||
end
|
||||
HMMA_SET2_STEP0_0: begin
|
||||
uop = {NEXT, HMMA_SET2_STEP0_1, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(0), `INST_MOD_BITS'(0), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(16), `FREG(4), `FREG(12), `FREG(16)};
|
||||
end
|
||||
HMMA_SET2_STEP0_1: begin
|
||||
uop = {NEXT, HMMA_SET2_STEP1_0, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(0), `INST_MOD_BITS'(1), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(17), `FREG(5), `FREG(13), `FREG(17)};
|
||||
end
|
||||
HMMA_SET2_STEP1_0: begin
|
||||
uop = {NEXT, HMMA_SET2_STEP1_1, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(1), `INST_MOD_BITS'(0), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(18), `FREG(4), `FREG(12), `FREG(18)};
|
||||
end
|
||||
HMMA_SET2_STEP1_1: begin
|
||||
uop = {NEXT, HMMA_SET2_STEP2_0, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(1), `INST_MOD_BITS'(1), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(19), `FREG(5), `FREG(13), `FREG(19)};
|
||||
end
|
||||
HMMA_SET2_STEP2_0: begin
|
||||
uop = {NEXT, HMMA_SET2_STEP2_1, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(2), `INST_MOD_BITS'(0), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(20), `FREG(4), `FREG(12), `FREG(20)};
|
||||
end
|
||||
HMMA_SET2_STEP2_1: begin
|
||||
uop = {NEXT, HMMA_SET2_STEP3_0, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(2), `INST_MOD_BITS'(1), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(21), `FREG(5), `FREG(13), `FREG(21)};
|
||||
end
|
||||
HMMA_SET2_STEP3_0: begin
|
||||
uop = {NEXT, HMMA_SET2_STEP3_1, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(3), `INST_MOD_BITS'(0), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(22), `FREG(4), `FREG(12), `FREG(22)};
|
||||
end
|
||||
HMMA_SET2_STEP3_1: begin
|
||||
uop = {NEXT, HMMA_SET3_STEP0_0, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(3), `INST_MOD_BITS'(1), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(23), `FREG(5), `FREG(13), `FREG(23)};
|
||||
end
|
||||
HMMA_SET3_STEP0_0: begin
|
||||
uop = {NEXT, HMMA_SET3_STEP0_1, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(0), `INST_MOD_BITS'(0), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(16), `FREG(6), `FREG(14), `FREG(16)};
|
||||
end
|
||||
HMMA_SET3_STEP0_1: begin
|
||||
uop = {NEXT, HMMA_SET3_STEP1_0, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(0), `INST_MOD_BITS'(1), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(17), `FREG(7), `FREG(15), `FREG(17)};
|
||||
end
|
||||
HMMA_SET3_STEP1_0: begin
|
||||
uop = {NEXT, HMMA_SET3_STEP1_1, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(1), `INST_MOD_BITS'(0), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(18), `FREG(6), `FREG(14), `FREG(18)};
|
||||
end
|
||||
HMMA_SET3_STEP1_1: begin
|
||||
uop = {NEXT, HMMA_SET3_STEP2_0, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(1), `INST_MOD_BITS'(1), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(19), `FREG(7), `FREG(15), `FREG(19)};
|
||||
end
|
||||
HMMA_SET3_STEP2_0: begin
|
||||
uop = {NEXT, HMMA_SET3_STEP2_1, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(2), `INST_MOD_BITS'(0), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(20), `FREG(6), `FREG(14), `FREG(20)};
|
||||
end
|
||||
HMMA_SET3_STEP2_1: begin
|
||||
uop = {NEXT, HMMA_SET3_STEP3_0, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(2), `INST_MOD_BITS'(1), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(21), `FREG(7), `FREG(15), `FREG(21)};
|
||||
end
|
||||
HMMA_SET3_STEP3_0: begin
|
||||
uop = {NEXT, HMMA_SET3_STEP3_1, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(3), `INST_MOD_BITS'(0), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(22), `FREG(6), `FREG(14), `FREG(22)};
|
||||
end
|
||||
HMMA_SET3_STEP3_1: begin
|
||||
uop = {FINISH, HMMA_SET0_STEP0_0, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(3), `INST_MOD_BITS'(1), 1'b1, 1'b0, 1'b0, 32'b1, 32'b1, `FREG(23), `FREG(7), `FREG(15), `FREG(23)};
|
||||
end
|
||||
@@ -1,3 +1,5 @@
|
||||
`ifndef FIRESIM
|
||||
`ifdef EXT_T_ENABLE
|
||||
// uop field order, MSB first: uop metadata (sequencing, next state), execution metadata (EX_TYPE, OP_TYPE, OP_MOD), wb, use pc, use imm, pc, imm, rd, rs1, rs2, rs3
|
||||
HMMA_SET0_STEP0_0: begin
|
||||
uop = {NEXT, HMMA_SET0_STEP0_1, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(0), `INST_MOD_BITS'(0), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(16), `FREG(0), `FREG(8), `FREG(16)};
|
||||
@@ -95,3 +97,5 @@ end
|
||||
HMMA_SET3_STEP3_1: begin
|
||||
uop = {FINISH, HMMA_SET0_STEP0_0, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(3), `INST_MOD_BITS'(1), 1'b1, 1'b0, 1'b0, 32'b1, 32'b1, `FREG(23), `FREG(7), `FREG(15), `FREG(23)};
|
||||
end
|
||||
`endif
|
||||
`endif
|
||||
@@ -1,3 +1,4 @@
|
||||
`ifdef EXT_T_ENABLE
|
||||
`include "VX_fpu_define.vh"
|
||||
|
||||
module VX_tensor_dpu #(
|
||||
@@ -42,3 +43,4 @@ module VX_tensor_dpu #(
|
||||
.data_out ({valid_out, D_tile})
|
||||
);
|
||||
endmodule
|
||||
`endif
|
||||
|
||||
Reference in New Issue
Block a user