diff --git a/hw/VX_config.h b/hw/VX_config.h index 3db93559..c89f5b4e 100644 --- a/hw/VX_config.h +++ b/hw/VX_config.h @@ -180,7 +180,7 @@ #endif #ifndef SMEM_LOG_SIZE -#define SMEM_LOG_SIZE 15 +#define SMEM_LOG_SIZE 17 #endif #ifndef IO_BASE_ADDR diff --git a/hw/rtl/VX_config.vh b/hw/rtl/VX_config.vh index f0910b2b..65d56e8a 100644 --- a/hw/rtl/VX_config.vh +++ b/hw/rtl/VX_config.vh @@ -95,7 +95,7 @@ `endif `ifndef NUM_BARRIERS -`define NUM_BARRIERS 4 +`define NUM_BARRIERS 8 `endif `ifndef SOCKET_SIZE @@ -179,7 +179,7 @@ `endif `ifndef SMEM_LOG_SIZE -`define SMEM_LOG_SIZE 15 +`define SMEM_LOG_SIZE 17 `endif `ifndef IO_BASE_ADDR diff --git a/hw/rtl/VX_core_wrapper.sv b/hw/rtl/VX_core_wrapper.sv index 2115d050..79507dd9 100644 --- a/hw/rtl/VX_core_wrapper.sv +++ b/hw/rtl/VX_core_wrapper.sv @@ -131,6 +131,8 @@ module Vortex import VX_gpu_pkg::*; #( output wire [31:0] acc_write_out, output wire acc_write_en, + input downstream_mem_busy, + output finished, input traceStall, @@ -421,6 +423,7 @@ module Vortex import VX_gpu_pkg::*; #( .sim_ebreak (sim_ebreak), .sim_wb_value (sim_wb_value), .busy (busy), + .downstream_mem_busy(downstream_mem_busy), .acc_read_in (acc_read_in), .acc_write_out (acc_write_out), diff --git a/hw/rtl/VX_platform.vh b/hw/rtl/VX_platform.vh index 091e5d14..65cbd0bf 100644 --- a/hw/rtl/VX_platform.vh +++ b/hw/rtl/VX_platform.vh @@ -35,6 +35,7 @@ `define NUM_CORES 2 `define NUM_THREADS 8 `define NUM_WARPS 8 +`define EXT_T_DISABLE `define FIRESIM `endif // SYNTHESIS diff --git a/hw/rtl/core/VX_core.sv b/hw/rtl/core/VX_core.sv index 9b3df41e..6f8d9778 100644 --- a/hw/rtl/core/VX_core.sv +++ b/hw/rtl/core/VX_core.sv @@ -48,6 +48,7 @@ module VX_core import VX_gpu_pkg::*; #( // Status output wire busy, //stays 1 when busy, 0 when done (termination) detect the negative edge + input wire downstream_mem_busy, input wire [31:0] acc_read_in, output wire [31:0] acc_write_out, @@ -198,6 +199,7 @@ module VX_core import VX_gpu_pkg::*; #( .reset (execute_reset), .base_dcrs (base_dcrs), + .downstream_mem_busy(downstream_mem_busy), `ifdef PERF_ENABLE .mem_perf_if (mem_perf_tmp_if), diff --git a/hw/rtl/core/VX_execute.sv b/hw/rtl/core/VX_execute.sv index ba6c5e7d..58aa5e04 100644 --- a/hw/rtl/core/VX_execute.sv +++ b/hw/rtl/core/VX_execute.sv @@ -22,6 +22,7 @@ module VX_execute import VX_gpu_pkg::*; #( input wire reset, input base_dcrs_t base_dcrs, + input wire downstream_mem_busy, // Dcache interface VX_mem_bus_if.master dcache_bus_if [DCACHE_NUM_REQS], @@ -92,6 +93,7 @@ module VX_execute import VX_gpu_pkg::*; #( `SCOPE_IO_BIND (0) .clk (clk), .reset (lsu_reset), + .downstream_mem_busy (downstream_mem_busy), .cache_bus_if (dcache_bus_if), .dispatch_if (lsu_dispatch_if), .commit_if (lsu_commit_if) diff --git a/hw/rtl/core/VX_lsu_unit.sv b/hw/rtl/core/VX_lsu_unit.sv index d6798138..b4fd6ee1 100644 --- a/hw/rtl/core/VX_lsu_unit.sv +++ b/hw/rtl/core/VX_lsu_unit.sv @@ -21,6 +21,8 @@ module VX_lsu_unit import VX_gpu_pkg::*; #( input wire clk, input wire reset, + input wire downstream_mem_busy, + // Dcache interface VX_mem_bus_if.master cache_bus_if [DCACHE_NUM_REQS], @@ -131,7 +133,21 @@ module VX_lsu_unit import VX_gpu_pkg::*; #( // fence: stall the pipeline until all pending requests are sent wire is_fence = `INST_LSU_IS_FENCE(execute_if[0].data.op_type); - wire fence_wait = is_fence && ~mem_req_empty; + wire start_fence = is_fence && ((~mem_req_empty) || downstream_mem_busy); + wire end_fence = mem_req_empty && (!downstream_mem_busy); + logic fencing; + + always @(posedge clk) begin + if (reset) begin + fencing <= 1'b0; + end else if (start_fence) begin + fencing <= 1'b1; + end else if (end_fence) begin + fencing <= 1'b0; + end + end + + wire fence_wait = start_fence || fencing; assign lsu_valid = execute_if[0].valid && ~fence_wait; assign execute_if[0].ready = lsu_ready && ~fence_wait; @@ -264,7 +280,7 @@ module VX_lsu_unit import VX_gpu_pkg::*; #( wire rd_during_wr = mem_req_rd_fire && mem_rsp_eop_fire && (pkt_raddr == pkt_waddr); always @(posedge clk) begin - if (reset) begin + if (reset) begin pkt_ctr <= '0; pkt_sop <= '0; pkt_eop <= '0; diff --git a/hw/rtl/core/VX_schedule.sv b/hw/rtl/core/VX_schedule.sv index 9f6672a5..53a11ffb 100644 --- a/hw/rtl/core/VX_schedule.sv +++ b/hw/rtl/core/VX_schedule.sv @@ -384,7 +384,7 @@ module VX_schedule import VX_gpu_pkg::*; #( .empty (no_pending_instr) ); - `BUFFER_EX(busy, (active_warps != 0 || ~no_pending_instr), 1'b1, 1); + `BUFFER_EX(busy, (active_warps != 0 || stalled_warps != 0 || barrier_stalls != 0 || ~no_pending_instr), 1'b1, 1); // export CSRs assign sched_csr_if.cycles = cycles; diff --git a/hw/rtl/core/VX_tensor_core.sv b/hw/rtl/core/VX_tensor_core.sv index 546a1da1..9971d619 100644 --- a/hw/rtl/core/VX_tensor_core.sv +++ b/hw/rtl/core/VX_tensor_core.sv @@ -1,3 +1,4 @@ +`ifdef EXT_T_ENABLE `include "VX_fpu_define.vh" module VX_tensor_core #( @@ -43,7 +44,11 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #( assign dispatch_if.ready = &octet_operands_ready; +`ifdef EXT_T_ENABLE for (genvar i = 0; i < 4/*octets*/; ++i) begin +`else + for (genvar i = 0; i < 0; ++i) begin +`endif // lane-to-octet mapping; see figure 13 of the paper wire [7:0][31:0] octet_A = { dispatch_if.data.rs1_data[16+4*i +: 4], dispatch_if.data.rs1_data[4*i +: 4] @@ -314,3 +319,4 @@ module VX_tensor_octet #( .D_tile(D_out) ); endmodule +`endif diff --git a/hw/rtl/core/VX_tensor_ucode.txt b/hw/rtl/core/VX_tensor_ucode.txt new file mode 100644 index 00000000..5776aaa8 --- /dev/null +++ b/hw/rtl/core/VX_tensor_ucode.txt @@ -0,0 +1,97 @@ +// uop metadata (sequencing, next state), execution metadata (EX_TYPE, OP_TYPE, OP_MOD), wb, use pc, use imm, pc, imm, rd, rs1, rs2, rs3 +HMMA_SET0_STEP0_0: begin + uop = {NEXT, HMMA_SET0_STEP0_1, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(0), `INST_MOD_BITS'(0), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(16), `FREG(0), `FREG(8), `FREG(16)}; +end +HMMA_SET0_STEP0_1: begin + uop = {NEXT, HMMA_SET0_STEP1_0, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(0), `INST_MOD_BITS'(1), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(17), `FREG(1), `FREG(9), `FREG(17)}; +end +HMMA_SET0_STEP1_0: begin + uop = {NEXT, HMMA_SET0_STEP1_1, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(1), `INST_MOD_BITS'(0), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(18), `FREG(0), `FREG(8), `FREG(18)}; +end +HMMA_SET0_STEP1_1: begin + uop = {NEXT, HMMA_SET0_STEP2_0, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(1), `INST_MOD_BITS'(1), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(19), `FREG(1), `FREG(9), `FREG(19)}; +end +HMMA_SET0_STEP2_0: begin + uop = {NEXT, HMMA_SET0_STEP2_1, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(2), `INST_MOD_BITS'(0), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(20), `FREG(0), `FREG(8), `FREG(20)}; +end +HMMA_SET0_STEP2_1: begin + uop = {NEXT, HMMA_SET0_STEP3_0, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(2), `INST_MOD_BITS'(1), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(21), `FREG(1), `FREG(9), `FREG(21)}; +end +HMMA_SET0_STEP3_0: begin + uop = {NEXT, HMMA_SET0_STEP3_1, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(3), `INST_MOD_BITS'(0), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(22), `FREG(0), `FREG(8), `FREG(22)}; +end +HMMA_SET0_STEP3_1: begin + uop = {NEXT, HMMA_SET1_STEP0_0, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(3), `INST_MOD_BITS'(1), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(23), `FREG(1), `FREG(9), `FREG(23)}; +end +HMMA_SET1_STEP0_0: begin + uop = {NEXT, HMMA_SET1_STEP0_1, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(0), `INST_MOD_BITS'(0), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(16), `FREG(2), `FREG(10), `FREG(16)}; +end +HMMA_SET1_STEP0_1: begin + uop = {NEXT, HMMA_SET1_STEP1_0, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(0), `INST_MOD_BITS'(1), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(17), `FREG(3), `FREG(11), `FREG(17)}; +end +HMMA_SET1_STEP1_0: begin + uop = {NEXT, HMMA_SET1_STEP1_1, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(1), `INST_MOD_BITS'(0), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(18), `FREG(2), `FREG(10), `FREG(18)}; +end +HMMA_SET1_STEP1_1: begin + uop = {NEXT, HMMA_SET1_STEP2_0, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(1), `INST_MOD_BITS'(1), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(19), `FREG(3), `FREG(11), `FREG(19)}; +end +HMMA_SET1_STEP2_0: begin + uop = {NEXT, HMMA_SET1_STEP2_1, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(2), `INST_MOD_BITS'(0), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(20), `FREG(2), `FREG(10), `FREG(20)}; +end +HMMA_SET1_STEP2_1: begin + uop = {NEXT, HMMA_SET1_STEP3_0, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(2), `INST_MOD_BITS'(1), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(21), `FREG(3), `FREG(11), `FREG(21)}; +end +HMMA_SET1_STEP3_0: begin + uop = {NEXT, HMMA_SET1_STEP3_1, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(3), `INST_MOD_BITS'(0), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(22), `FREG(2), `FREG(10), `FREG(22)}; +end +HMMA_SET1_STEP3_1: begin + uop = {NEXT, HMMA_SET2_STEP0_0, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(3), `INST_MOD_BITS'(1), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(23), `FREG(3), `FREG(11), `FREG(23)}; +end +HMMA_SET2_STEP0_0: begin + uop = {NEXT, HMMA_SET2_STEP0_1, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(0), `INST_MOD_BITS'(0), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(16), `FREG(4), `FREG(12), `FREG(16)}; +end +HMMA_SET2_STEP0_1: begin + uop = {NEXT, HMMA_SET2_STEP1_0, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(0), `INST_MOD_BITS'(1), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(17), `FREG(5), `FREG(13), `FREG(17)}; +end +HMMA_SET2_STEP1_0: begin + uop = {NEXT, HMMA_SET2_STEP1_1, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(1), `INST_MOD_BITS'(0), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(18), `FREG(4), `FREG(12), `FREG(18)}; +end +HMMA_SET2_STEP1_1: begin + uop = {NEXT, HMMA_SET2_STEP2_0, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(1), `INST_MOD_BITS'(1), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(19), `FREG(5), `FREG(13), `FREG(19)}; +end +HMMA_SET2_STEP2_0: begin + uop = {NEXT, HMMA_SET2_STEP2_1, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(2), `INST_MOD_BITS'(0), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(20), `FREG(4), `FREG(12), `FREG(20)}; +end +HMMA_SET2_STEP2_1: begin + uop = {NEXT, HMMA_SET2_STEP3_0, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(2), `INST_MOD_BITS'(1), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(21), `FREG(5), `FREG(13), `FREG(21)}; +end +HMMA_SET2_STEP3_0: begin + uop = {NEXT, HMMA_SET2_STEP3_1, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(3), `INST_MOD_BITS'(0), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(22), `FREG(4), `FREG(12), `FREG(22)}; +end +HMMA_SET2_STEP3_1: begin + uop = {NEXT, HMMA_SET3_STEP0_0, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(3), `INST_MOD_BITS'(1), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(23), `FREG(5), `FREG(13), `FREG(23)}; +end +HMMA_SET3_STEP0_0: begin + uop = {NEXT, HMMA_SET3_STEP0_1, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(0), `INST_MOD_BITS'(0), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(16), `FREG(6), `FREG(14), `FREG(16)}; +end +HMMA_SET3_STEP0_1: begin + uop = {NEXT, HMMA_SET3_STEP1_0, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(0), `INST_MOD_BITS'(1), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(17), `FREG(7), `FREG(15), `FREG(17)}; +end +HMMA_SET3_STEP1_0: begin + uop = {NEXT, HMMA_SET3_STEP1_1, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(1), `INST_MOD_BITS'(0), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(18), `FREG(6), `FREG(14), `FREG(18)}; +end +HMMA_SET3_STEP1_1: begin + uop = {NEXT, HMMA_SET3_STEP2_0, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(1), `INST_MOD_BITS'(1), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(19), `FREG(7), `FREG(15), `FREG(19)}; +end +HMMA_SET3_STEP2_0: begin + uop = {NEXT, HMMA_SET3_STEP2_1, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(2), `INST_MOD_BITS'(0), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(20), `FREG(6), `FREG(14), `FREG(20)}; +end +HMMA_SET3_STEP2_1: begin + uop = {NEXT, HMMA_SET3_STEP3_0, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(2), `INST_MOD_BITS'(1), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(21), `FREG(7), `FREG(15), `FREG(21)}; +end +HMMA_SET3_STEP3_0: begin + uop = {NEXT, HMMA_SET3_STEP3_1, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(3), `INST_MOD_BITS'(0), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(22), `FREG(6), `FREG(14), `FREG(22)}; +end +HMMA_SET3_STEP3_1: begin + uop = {FINISH, HMMA_SET0_STEP0_0, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(3), `INST_MOD_BITS'(1), 1'b1, 1'b0, 1'b0, 32'b1, 32'b1, `FREG(23), `FREG(7), `FREG(15), `FREG(23)}; +end diff --git a/hw/rtl/core/VX_tensor_ucode.vh b/hw/rtl/core/VX_tensor_ucode.vh index 5776aaa8..0105187b 100644 --- a/hw/rtl/core/VX_tensor_ucode.vh +++ b/hw/rtl/core/VX_tensor_ucode.vh @@ -1,3 +1,5 @@ +`ifndef FIRESIM +`ifdef EXT_T_ENABLE // uop metadata (sequencing, next state), execution metadata (EX_TYPE, OP_TYPE, OP_MOD), wb, use pc, use imm, pc, imm, rd, rs1, rs2, rs3 HMMA_SET0_STEP0_0: begin uop = {NEXT, HMMA_SET0_STEP0_1, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(0), `INST_MOD_BITS'(0), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(16), `FREG(0), `FREG(8), `FREG(16)}; @@ -94,4 +96,6 @@ HMMA_SET3_STEP3_0: begin end HMMA_SET3_STEP3_1: begin uop = {FINISH, HMMA_SET0_STEP0_0, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(3), `INST_MOD_BITS'(1), 1'b1, 1'b0, 1'b0, 32'b1, 32'b1, `FREG(23), `FREG(7), `FREG(15), `FREG(23)}; -end +end +`endif +`endif \ No newline at end of file diff --git a/hw/rtl/fpu/VX_tensor_dpu.sv b/hw/rtl/fpu/VX_tensor_dpu.sv index 3927e1e3..cfc5f507 100644 --- a/hw/rtl/fpu/VX_tensor_dpu.sv +++ b/hw/rtl/fpu/VX_tensor_dpu.sv @@ -1,3 +1,4 @@ +`ifdef EXT_T_ENABLE `include "VX_fpu_define.vh" module VX_tensor_dpu #( @@ -42,3 +43,4 @@ module VX_tensor_dpu #( .data_out ({valid_out, D_tile}) ); endmodule +`endif