store fencing, large smem, fix tensor core for firesim

This commit is contained in:
Richard Yan
2024-05-15 21:45:48 -07:00
parent 0dd5335851
commit d624b3e50a
12 changed files with 140 additions and 7 deletions

View File

@@ -180,7 +180,7 @@
#endif
#ifndef SMEM_LOG_SIZE
#define SMEM_LOG_SIZE 15
#define SMEM_LOG_SIZE 17
#endif
#ifndef IO_BASE_ADDR

View File

@@ -95,7 +95,7 @@
`endif
`ifndef NUM_BARRIERS
`define NUM_BARRIERS 4
`define NUM_BARRIERS 8
`endif
`ifndef SOCKET_SIZE
@@ -179,7 +179,7 @@
`endif
`ifndef SMEM_LOG_SIZE
`define SMEM_LOG_SIZE 15
`define SMEM_LOG_SIZE 17
`endif
`ifndef IO_BASE_ADDR

View File

@@ -131,6 +131,8 @@ module Vortex import VX_gpu_pkg::*; #(
output wire [31:0] acc_write_out,
output wire acc_write_en,
input downstream_mem_busy,
output finished,
input traceStall,
@@ -421,6 +423,7 @@ module Vortex import VX_gpu_pkg::*; #(
.sim_ebreak (sim_ebreak),
.sim_wb_value (sim_wb_value),
.busy (busy),
.downstream_mem_busy(downstream_mem_busy),
.acc_read_in (acc_read_in),
.acc_write_out (acc_write_out),

View File

@@ -35,6 +35,7 @@
`define NUM_CORES 2
`define NUM_THREADS 8
`define NUM_WARPS 8
`define EXT_T_DISABLE
`define FIRESIM
`endif // SYNTHESIS

View File

@@ -48,6 +48,7 @@ module VX_core import VX_gpu_pkg::*; #(
// Status
output wire busy, //stays 1 when busy, 0 when done (termination) detect the negative edge
input wire downstream_mem_busy,
input wire [31:0] acc_read_in,
output wire [31:0] acc_write_out,
@@ -198,6 +199,7 @@ module VX_core import VX_gpu_pkg::*; #(
.reset (execute_reset),
.base_dcrs (base_dcrs),
.downstream_mem_busy(downstream_mem_busy),
`ifdef PERF_ENABLE
.mem_perf_if (mem_perf_tmp_if),

View File

@@ -22,6 +22,7 @@ module VX_execute import VX_gpu_pkg::*; #(
input wire reset,
input base_dcrs_t base_dcrs,
input wire downstream_mem_busy,
// Dcache interface
VX_mem_bus_if.master dcache_bus_if [DCACHE_NUM_REQS],
@@ -92,6 +93,7 @@ module VX_execute import VX_gpu_pkg::*; #(
`SCOPE_IO_BIND (0)
.clk (clk),
.reset (lsu_reset),
.downstream_mem_busy (downstream_mem_busy),
.cache_bus_if (dcache_bus_if),
.dispatch_if (lsu_dispatch_if),
.commit_if (lsu_commit_if)

View File

@@ -21,6 +21,8 @@ module VX_lsu_unit import VX_gpu_pkg::*; #(
input wire clk,
input wire reset,
input wire downstream_mem_busy,
// Dcache interface
VX_mem_bus_if.master cache_bus_if [DCACHE_NUM_REQS],
@@ -131,7 +133,21 @@ module VX_lsu_unit import VX_gpu_pkg::*; #(
// fence: stall the pipeline until all pending requests are sent
wire is_fence = `INST_LSU_IS_FENCE(execute_if[0].data.op_type);
wire fence_wait = is_fence && ~mem_req_empty;
wire start_fence = is_fence && ((~mem_req_empty) || downstream_mem_busy);
wire end_fence = mem_req_empty && (!downstream_mem_busy);
logic fencing;
always @(posedge clk) begin
if (reset) begin
fencing <= 1'b0;
end else if (start_fence) begin
fencing <= 1'b1;
end else if (end_fence) begin
fencing <= 1'b0;
end
end
wire fence_wait = start_fence || fencing;
assign lsu_valid = execute_if[0].valid && ~fence_wait;
assign execute_if[0].ready = lsu_ready && ~fence_wait;

View File

@@ -384,7 +384,7 @@ module VX_schedule import VX_gpu_pkg::*; #(
.empty (no_pending_instr)
);
`BUFFER_EX(busy, (active_warps != 0 || ~no_pending_instr), 1'b1, 1);
`BUFFER_EX(busy, (active_warps != 0 || stalled_warps != 0 || barrier_stalls != 0 || ~no_pending_instr), 1'b1, 1);
// export CSRs
assign sched_csr_if.cycles = cycles;

View File

@@ -1,3 +1,4 @@
`ifdef EXT_T_ENABLE
`include "VX_fpu_define.vh"
module VX_tensor_core #(
@@ -43,7 +44,11 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #(
assign dispatch_if.ready = &octet_operands_ready;
`ifdef EXT_T_ENABLE
for (genvar i = 0; i < 4/*octets*/; ++i) begin
`else
for (genvar i = 0; i < 0; ++i) begin
`endif
// lane-to-octet mapping; see figure 13 of the paper
wire [7:0][31:0] octet_A = {
dispatch_if.data.rs1_data[16+4*i +: 4], dispatch_if.data.rs1_data[4*i +: 4]
@@ -314,3 +319,4 @@ module VX_tensor_octet #(
.D_tile(D_out)
);
endmodule
`endif

View File

@@ -0,0 +1,97 @@
// uop metadata (sequencing, next state), execution metadata (EX_TYPE, OP_TYPE, OP_MOD), wb, use pc, use imm, pc, imm, rd, rs1, rs2, rs3
HMMA_SET0_STEP0_0: begin
uop = {NEXT, HMMA_SET0_STEP0_1, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(0), `INST_MOD_BITS'(0), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(16), `FREG(0), `FREG(8), `FREG(16)};
end
HMMA_SET0_STEP0_1: begin
uop = {NEXT, HMMA_SET0_STEP1_0, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(0), `INST_MOD_BITS'(1), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(17), `FREG(1), `FREG(9), `FREG(17)};
end
HMMA_SET0_STEP1_0: begin
uop = {NEXT, HMMA_SET0_STEP1_1, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(1), `INST_MOD_BITS'(0), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(18), `FREG(0), `FREG(8), `FREG(18)};
end
HMMA_SET0_STEP1_1: begin
uop = {NEXT, HMMA_SET0_STEP2_0, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(1), `INST_MOD_BITS'(1), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(19), `FREG(1), `FREG(9), `FREG(19)};
end
HMMA_SET0_STEP2_0: begin
uop = {NEXT, HMMA_SET0_STEP2_1, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(2), `INST_MOD_BITS'(0), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(20), `FREG(0), `FREG(8), `FREG(20)};
end
HMMA_SET0_STEP2_1: begin
uop = {NEXT, HMMA_SET0_STEP3_0, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(2), `INST_MOD_BITS'(1), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(21), `FREG(1), `FREG(9), `FREG(21)};
end
HMMA_SET0_STEP3_0: begin
uop = {NEXT, HMMA_SET0_STEP3_1, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(3), `INST_MOD_BITS'(0), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(22), `FREG(0), `FREG(8), `FREG(22)};
end
HMMA_SET0_STEP3_1: begin
uop = {NEXT, HMMA_SET1_STEP0_0, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(3), `INST_MOD_BITS'(1), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(23), `FREG(1), `FREG(9), `FREG(23)};
end
HMMA_SET1_STEP0_0: begin
uop = {NEXT, HMMA_SET1_STEP0_1, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(0), `INST_MOD_BITS'(0), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(16), `FREG(2), `FREG(10), `FREG(16)};
end
HMMA_SET1_STEP0_1: begin
uop = {NEXT, HMMA_SET1_STEP1_0, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(0), `INST_MOD_BITS'(1), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(17), `FREG(3), `FREG(11), `FREG(17)};
end
HMMA_SET1_STEP1_0: begin
uop = {NEXT, HMMA_SET1_STEP1_1, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(1), `INST_MOD_BITS'(0), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(18), `FREG(2), `FREG(10), `FREG(18)};
end
HMMA_SET1_STEP1_1: begin
uop = {NEXT, HMMA_SET1_STEP2_0, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(1), `INST_MOD_BITS'(1), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(19), `FREG(3), `FREG(11), `FREG(19)};
end
HMMA_SET1_STEP2_0: begin
uop = {NEXT, HMMA_SET1_STEP2_1, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(2), `INST_MOD_BITS'(0), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(20), `FREG(2), `FREG(10), `FREG(20)};
end
HMMA_SET1_STEP2_1: begin
uop = {NEXT, HMMA_SET1_STEP3_0, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(2), `INST_MOD_BITS'(1), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(21), `FREG(3), `FREG(11), `FREG(21)};
end
HMMA_SET1_STEP3_0: begin
uop = {NEXT, HMMA_SET1_STEP3_1, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(3), `INST_MOD_BITS'(0), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(22), `FREG(2), `FREG(10), `FREG(22)};
end
HMMA_SET1_STEP3_1: begin
uop = {NEXT, HMMA_SET2_STEP0_0, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(3), `INST_MOD_BITS'(1), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(23), `FREG(3), `FREG(11), `FREG(23)};
end
HMMA_SET2_STEP0_0: begin
uop = {NEXT, HMMA_SET2_STEP0_1, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(0), `INST_MOD_BITS'(0), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(16), `FREG(4), `FREG(12), `FREG(16)};
end
HMMA_SET2_STEP0_1: begin
uop = {NEXT, HMMA_SET2_STEP1_0, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(0), `INST_MOD_BITS'(1), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(17), `FREG(5), `FREG(13), `FREG(17)};
end
HMMA_SET2_STEP1_0: begin
uop = {NEXT, HMMA_SET2_STEP1_1, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(1), `INST_MOD_BITS'(0), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(18), `FREG(4), `FREG(12), `FREG(18)};
end
HMMA_SET2_STEP1_1: begin
uop = {NEXT, HMMA_SET2_STEP2_0, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(1), `INST_MOD_BITS'(1), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(19), `FREG(5), `FREG(13), `FREG(19)};
end
HMMA_SET2_STEP2_0: begin
uop = {NEXT, HMMA_SET2_STEP2_1, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(2), `INST_MOD_BITS'(0), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(20), `FREG(4), `FREG(12), `FREG(20)};
end
HMMA_SET2_STEP2_1: begin
uop = {NEXT, HMMA_SET2_STEP3_0, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(2), `INST_MOD_BITS'(1), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(21), `FREG(5), `FREG(13), `FREG(21)};
end
HMMA_SET2_STEP3_0: begin
uop = {NEXT, HMMA_SET2_STEP3_1, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(3), `INST_MOD_BITS'(0), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(22), `FREG(4), `FREG(12), `FREG(22)};
end
HMMA_SET2_STEP3_1: begin
uop = {NEXT, HMMA_SET3_STEP0_0, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(3), `INST_MOD_BITS'(1), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(23), `FREG(5), `FREG(13), `FREG(23)};
end
HMMA_SET3_STEP0_0: begin
uop = {NEXT, HMMA_SET3_STEP0_1, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(0), `INST_MOD_BITS'(0), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(16), `FREG(6), `FREG(14), `FREG(16)};
end
HMMA_SET3_STEP0_1: begin
uop = {NEXT, HMMA_SET3_STEP1_0, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(0), `INST_MOD_BITS'(1), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(17), `FREG(7), `FREG(15), `FREG(17)};
end
HMMA_SET3_STEP1_0: begin
uop = {NEXT, HMMA_SET3_STEP1_1, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(1), `INST_MOD_BITS'(0), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(18), `FREG(6), `FREG(14), `FREG(18)};
end
HMMA_SET3_STEP1_1: begin
uop = {NEXT, HMMA_SET3_STEP2_0, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(1), `INST_MOD_BITS'(1), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(19), `FREG(7), `FREG(15), `FREG(19)};
end
HMMA_SET3_STEP2_0: begin
uop = {NEXT, HMMA_SET3_STEP2_1, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(2), `INST_MOD_BITS'(0), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(20), `FREG(6), `FREG(14), `FREG(20)};
end
HMMA_SET3_STEP2_1: begin
uop = {NEXT, HMMA_SET3_STEP3_0, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(2), `INST_MOD_BITS'(1), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(21), `FREG(7), `FREG(15), `FREG(21)};
end
HMMA_SET3_STEP3_0: begin
uop = {NEXT, HMMA_SET3_STEP3_1, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(3), `INST_MOD_BITS'(0), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(22), `FREG(6), `FREG(14), `FREG(22)};
end
HMMA_SET3_STEP3_1: begin
uop = {FINISH, HMMA_SET0_STEP0_0, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(3), `INST_MOD_BITS'(1), 1'b1, 1'b0, 1'b0, 32'b1, 32'b1, `FREG(23), `FREG(7), `FREG(15), `FREG(23)};
end

View File

@@ -1,3 +1,5 @@
`ifndef FIRESIM
`ifdef EXT_T_ENABLE
// uop metadata (sequencing, next state), execution metadata (EX_TYPE, OP_TYPE, OP_MOD), wb, use pc, use imm, pc, imm, rd, rs1, rs2, rs3
HMMA_SET0_STEP0_0: begin
uop = {NEXT, HMMA_SET0_STEP0_1, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(0), `INST_MOD_BITS'(0), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(16), `FREG(0), `FREG(8), `FREG(16)};
@@ -95,3 +97,5 @@ end
HMMA_SET3_STEP3_1: begin
uop = {FINISH, HMMA_SET0_STEP0_0, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(3), `INST_MOD_BITS'(1), 1'b1, 1'b0, 1'b0, 32'b1, 32'b1, `FREG(23), `FREG(7), `FREG(15), `FREG(23)};
end
`endif
`endif

View File

@@ -1,3 +1,4 @@
`ifdef EXT_T_ENABLE
`include "VX_fpu_define.vh"
module VX_tensor_dpu #(
@@ -42,3 +43,4 @@ module VX_tensor_dpu #(
.data_out ({valid_out, D_tile})
);
endmodule
`endif