From c7a81d1493b5e0420546b25c9465a64321418d20 Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Wed, 20 Dec 2023 11:57:44 -0800 Subject: [PATCH] adding sockets support to simx and cache subsystem refactoring minor update minor update minor updates --- hw/rtl/VX_cluster.sv | 66 +++++++++- hw/rtl/VX_config.vh | 9 +- hw/rtl/VX_define.vh | 18 ++- hw/rtl/VX_gpu_pkg.sv | 9 +- hw/rtl/VX_socket.sv | 45 +------ hw/rtl/VX_types.vh | 44 +++---- hw/rtl/core/VX_core.sv | 18 +-- hw/rtl/core/VX_csr_data.sv | 15 +-- hw/rtl/core/VX_issue.sv | 2 +- hw/rtl/core/VX_lsu_unit.sv | 8 +- hw/rtl/interfaces/VX_pipeline_perf_if.sv | 25 ++-- runtime/common/utils.cpp | 34 +---- runtime/simx/vortex.cpp | 2 +- sim/simx/Makefile | 2 +- sim/simx/arch.h | 10 +- sim/simx/cluster.cpp | 158 ++++++++--------------- sim/simx/cluster.h | 41 +++--- sim/simx/core.cpp | 111 ++++++++-------- sim/simx/core.h | 26 ++-- sim/simx/exe_unit.cpp | 38 +++--- sim/simx/main.cpp | 8 +- sim/simx/socket.cpp | 146 +++++++++++++++++++++ sim/simx/socket.h | 87 +++++++++++++ sim/simx/types.h | 7 + 24 files changed, 541 insertions(+), 388 deletions(-) create mode 100644 sim/simx/socket.cpp create mode 100644 sim/simx/socket.h diff --git a/hw/rtl/VX_cluster.sv b/hw/rtl/VX_cluster.sv index 90076673..6de47c5f 100644 --- a/hw/rtl/VX_cluster.sv +++ b/hw/rtl/VX_cluster.sv @@ -85,8 +85,8 @@ module VX_cluster import VX_gpu_pkg::*; #( VX_mem_bus_if #( .DATA_SIZE (`L1_LINE_SIZE), - .TAG_WIDTH (L1_MEM_ARB_TAG_WIDTH) - ) per_socket_mem_bus_if[`NUM_SOCKETS](); + .TAG_WIDTH (L1_MEM_TAG_WIDTH) + ) l1_mem_bus_if[2](); `RESET_RELAY (l2_reset, reset); @@ -102,7 +102,7 @@ module VX_cluster import VX_gpu_pkg::*; #( .MSHR_SIZE (`L2_MSHR_SIZE), .MRSQ_SIZE (`L2_MRSQ_SIZE), .MREQ_SIZE (`L2_MREQ_SIZE), - .TAG_WIDTH (L1_MEM_ARB_TAG_WIDTH), + .TAG_WIDTH (L1_MEM_TAG_WIDTH), .WRITE_ENABLE (1), .UUID_WIDTH (`UUID_WIDTH), .CORE_OUT_REG (2), @@ -115,10 +115,65 @@ module VX_cluster import VX_gpu_pkg::*; #( `ifdef PERF_ENABLE .cache_perf (perf_l2cache), `endif - .core_bus_if (per_socket_mem_bus_if), + .core_bus_if (l1_mem_bus_if), .mem_bus_if (mem_bus_if) ); + VX_mem_bus_if #( + .DATA_SIZE (`L1_LINE_SIZE), + .TAG_WIDTH (ICACHE_MEM_TAG_WIDTH) + ) per_socket_icache_mem_bus_if[`NUM_SOCKETS](); + + VX_mem_bus_if #( + .DATA_SIZE (`L1_LINE_SIZE), + .TAG_WIDTH (DCACHE_MEM_TAG_WIDTH) + ) per_socket_dcache_mem_bus_if[`NUM_SOCKETS](); + + VX_mem_bus_if #( + .DATA_SIZE (ICACHE_LINE_SIZE), + .TAG_WIDTH (ICACHE_MEM_ARB_TAG_WIDTH) + ) icache_mem_bus_if[1](); + + VX_mem_bus_if #( + .DATA_SIZE (DCACHE_LINE_SIZE), + .TAG_WIDTH (DCACHE_MEM_ARB_TAG_WIDTH) + ) dcache_mem_bus_if[1](); + + `RESET_RELAY (l1_mem_arb_reset, reset); + + VX_mem_arb #( + .NUM_INPUTS (`NUM_SOCKETS), + .DATA_SIZE (`L1_LINE_SIZE), + .TAG_WIDTH (ICACHE_MEM_TAG_WIDTH), + .TAG_SEL_IDX (1), // Skip 0 for NC flag + .ARBITER ("R"), + .OUT_REG_REQ (2), + .OUT_REG_RSP (2) + ) icache_mem_arb ( + .clk (clk), + .reset (l1_mem_arb_reset), + .bus_in_if (per_socket_icache_mem_bus_if), + .bus_out_if (icache_mem_bus_if) + ); + + VX_mem_arb #( + .NUM_INPUTS (`NUM_SOCKETS), + .DATA_SIZE (`L1_LINE_SIZE), + .TAG_WIDTH (DCACHE_MEM_TAG_WIDTH), + .TAG_SEL_IDX (1), // Skip 0 for NC flag + .ARBITER ("R"), + .OUT_REG_REQ (2), + .OUT_REG_RSP (2) + ) dcache_mem_arb ( + .clk (clk), + .reset (l1_mem_arb_reset), + .bus_in_if (per_socket_dcache_mem_bus_if), + .bus_out_if (dcache_mem_bus_if) + ); + + `ASSIGN_VX_MEM_BUS_IF_X (l1_mem_bus_if[0], icache_mem_bus_if[0], L1_MEM_TAG_WIDTH, ICACHE_MEM_ARB_TAG_WIDTH); + `ASSIGN_VX_MEM_BUS_IF_X (l1_mem_bus_if[1], dcache_mem_bus_if[0], L1_MEM_TAG_WIDTH, DCACHE_MEM_ARB_TAG_WIDTH); + /////////////////////////////////////////////////////////////////////////// wire [`NUM_SOCKETS-1:0] per_socket_sim_ebreak; @@ -155,7 +210,8 @@ module VX_cluster import VX_gpu_pkg::*; #( .dcr_bus_if (socket_dcr_bus_if), - .mem_bus_if (per_socket_mem_bus_if[i]), + .icache_mem_bus_if (per_socket_icache_mem_bus_if[i]), + .dcache_mem_bus_if (per_socket_dcache_mem_bus_if[i]), `ifdef GBAR_ENABLE .gbar_bus_if (per_socket_gbar_bus_if[i]), diff --git a/hw/rtl/VX_config.vh b/hw/rtl/VX_config.vh index 3af544c6..d35d906b 100644 --- a/hw/rtl/VX_config.vh +++ b/hw/rtl/VX_config.vh @@ -262,7 +262,10 @@ `endif // LSU Duplicate Address Check -`ifdef LSU_DUP +`ifndef LSU_DUP_DISABLE +`define LSU_DUP_ENABLE +`endif +`ifdef LSU_DUP_ENABLE `define LSU_DUP_ENABLED 1 `else `define LSU_DUP_ENABLED 0 @@ -381,7 +384,7 @@ // Number of Cache Units `ifndef NUM_ICACHES -`define NUM_ICACHES `UP(`NUM_CORES / 4) +`define NUM_ICACHES `UP(`SOCKET_SIZE / 4) `endif // Cache Size @@ -430,7 +433,7 @@ // Number of Cache Units `ifndef NUM_DCACHES -`define NUM_DCACHES `UP(`NUM_CORES / 4) +`define NUM_DCACHES `UP(`SOCKET_SIZE / 4) `endif // Cache Size diff --git a/hw/rtl/VX_define.vh b/hw/rtl/VX_define.vh index 95d206ce..f39e7fea 100644 --- a/hw/rtl/VX_define.vh +++ b/hw/rtl/VX_define.vh @@ -410,8 +410,22 @@ assign dst = src; \ end -`define TO_DISPATCH_DATA(data, tid) \ - {data.uuid, data.wis, data.tmask, data.op_type, data.op_mod, data.wb, data.use_PC, data.use_imm, data.PC, data.imm, data.rd, tid, data.rs1_data, data.rs2_data, data.rs3_data} +`define TO_DISPATCH_DATA(data, tid) { \ + data.uuid, \ + data.wis, \ + data.tmask, \ + data.op_type, \ + data.op_mod, \ + data.wb, \ + data.use_PC, \ + data.use_imm, \ + data.PC, \ + data.imm, \ + data.rd, \ + tid, \ + data.rs1_data, \ + data.rs2_data, \ + data.rs3_data} /////////////////////////////////////////////////////////////////////////////// diff --git a/hw/rtl/VX_gpu_pkg.sv b/hw/rtl/VX_gpu_pkg.sv index 668b53ee..b32b9600 100644 --- a/hw/rtl/VX_gpu_pkg.sv +++ b/hw/rtl/VX_gpu_pkg.sv @@ -141,8 +141,9 @@ package VX_gpu_pkg; /////////////////////////////// L1 Parameters ///////////////////////////// - localparam L1_MEM_TAG_WIDTH = `MAX(ICACHE_MEM_TAG_WIDTH, DCACHE_MEM_TAG_WIDTH); - localparam L1_MEM_ARB_TAG_WIDTH = (L1_MEM_TAG_WIDTH + `CLOG2(2)); + localparam ICACHE_MEM_ARB_TAG_WIDTH = (ICACHE_MEM_TAG_WIDTH + `CLOG2(`NUM_SOCKETS)); + localparam DCACHE_MEM_ARB_TAG_WIDTH = (DCACHE_MEM_TAG_WIDTH + `CLOG2(`NUM_SOCKETS)); + localparam L1_MEM_TAG_WIDTH = `MAX(ICACHE_MEM_ARB_TAG_WIDTH, DCACHE_MEM_ARB_TAG_WIDTH); /////////////////////////////// L2 Parameters ///////////////////////////// @@ -150,10 +151,10 @@ package VX_gpu_pkg; localparam L2_WORD_SIZE = `L1_LINE_SIZE; // Input request size - localparam L2_NUM_REQS = `NUM_SOCKETS; + localparam L2_NUM_REQS = 2; // Core request tag bits - localparam L2_TAG_WIDTH = L1_MEM_ARB_TAG_WIDTH; + localparam L2_TAG_WIDTH = L1_MEM_TAG_WIDTH; // Memory request data bits localparam L2_MEM_DATA_WIDTH = (`L2_LINE_SIZE * 8); diff --git a/hw/rtl/VX_socket.sv b/hw/rtl/VX_socket.sv index 139598d9..74a074d1 100644 --- a/hw/rtl/VX_socket.sv +++ b/hw/rtl/VX_socket.sv @@ -30,7 +30,8 @@ module VX_socket import VX_gpu_pkg::*; #( VX_dcr_bus_if.slave dcr_bus_if, // Memory - VX_mem_bus_if.master mem_bus_if, + VX_mem_bus_if.master icache_mem_bus_if, + VX_mem_bus_if.master dcache_mem_bus_if, `ifdef GBAR_ENABLE // Barrier @@ -76,47 +77,7 @@ module VX_socket import VX_gpu_pkg::*; #( assign mem_perf_tmp_if.mem = mem_perf_if.mem; `endif - VX_mem_bus_if #( - .DATA_SIZE (ICACHE_LINE_SIZE), - .TAG_WIDTH (ICACHE_MEM_TAG_WIDTH) - ) icache_mem_bus_if(); - - VX_mem_bus_if #( - .DATA_SIZE (DCACHE_LINE_SIZE), - .TAG_WIDTH (DCACHE_MEM_TAG_WIDTH) - ) dcache_mem_bus_if(); - - VX_mem_bus_if #( - .DATA_SIZE (`L1_LINE_SIZE), - .TAG_WIDTH (L1_MEM_TAG_WIDTH) - ) cache_mem_bus_if[2](); - - VX_mem_bus_if #( - .DATA_SIZE (`L1_LINE_SIZE), - .TAG_WIDTH (L1_MEM_ARB_TAG_WIDTH) - ) mem_bus_tmp_if[1](); - - `ASSIGN_VX_MEM_BUS_IF_X (cache_mem_bus_if[0], icache_mem_bus_if, L1_MEM_TAG_WIDTH, ICACHE_MEM_TAG_WIDTH); - `ASSIGN_VX_MEM_BUS_IF_X (cache_mem_bus_if[1], dcache_mem_bus_if, L1_MEM_TAG_WIDTH, DCACHE_MEM_TAG_WIDTH); - - `RESET_RELAY (mem_arb_reset, reset); - - VX_mem_arb #( - .NUM_INPUTS (2), - .DATA_SIZE (`L1_LINE_SIZE), - .TAG_WIDTH (L1_MEM_TAG_WIDTH), - .TAG_SEL_IDX (1), // Skip 0 for NC flag - .ARBITER ("R"), - .OUT_REG_REQ (2), - .OUT_REG_RSP (2) - ) mem_arb ( - .clk (clk), - .reset (mem_arb_reset), - .bus_in_if (cache_mem_bus_if), - .bus_out_if (mem_bus_tmp_if) - ); - - `ASSIGN_VX_MEM_BUS_IF (mem_bus_if, mem_bus_tmp_if[0]); + /////////////////////////////////////////////////////////////////////////// diff --git a/hw/rtl/VX_types.vh b/hw/rtl/VX_types.vh index 4fb03783..a5044ccf 100644 --- a/hw/rtl/VX_types.vh +++ b/hw/rtl/VX_types.vh @@ -78,33 +78,25 @@ `define VX_CSR_MPM_IBUF_ST_H 12'hB85 `define VX_CSR_MPM_SCRB_ST 12'hB06 `define VX_CSR_MPM_SCRB_ST_H 12'hB86 -`define VX_CSR_MPM_ALU_ST 12'hB07 -`define VX_CSR_MPM_ALU_ST_H 12'hB87 -`define VX_CSR_MPM_LSU_ST 12'hB08 -`define VX_CSR_MPM_LSU_ST_H 12'hB88 -`define VX_CSR_MPM_FPU_ST 12'hB09 -`define VX_CSR_MPM_FPU_ST_H 12'hB89 -`define VX_CSR_MPM_SFU_ST 12'hB0A -`define VX_CSR_MPM_SFU_ST_H 12'hB8A -`define VX_CSR_MPM_SCRB_ALU 12'hB0B -`define VX_CSR_MPM_SCRB_ALU_H 12'hB8B -`define VX_CSR_MPM_SCRB_FPU 12'hB0C -`define VX_CSR_MPM_SCRB_FPU_H 12'hB8C -`define VX_CSR_MPM_SCRB_LSU 12'hB0D -`define VX_CSR_MPM_SCRB_LSU_H 12'hB8D -`define VX_CSR_MPM_SCRB_SFU 12'hB0E -`define VX_CSR_MPM_SCRB_SFU_H 12'hB8E +`define VX_CSR_MPM_SCRB_ALU 12'hB07 +`define VX_CSR_MPM_SCRB_ALU_H 12'hB87 +`define VX_CSR_MPM_SCRB_FPU 12'hB08 +`define VX_CSR_MPM_SCRB_FPU_H 12'hB88 +`define VX_CSR_MPM_SCRB_LSU 12'hB09 +`define VX_CSR_MPM_SCRB_LSU_H 12'hB89 +`define VX_CSR_MPM_SCRB_SFU 12'hB0A +`define VX_CSR_MPM_SCRB_SFU_H 12'hB8A // PERF: memory -`define VX_CSR_MPM_IFETCHES 12'hB0F -`define VX_CSR_MPM_IFETCHES_H 12'hB8F -`define VX_CSR_MPM_LOADS 12'hB10 -`define VX_CSR_MPM_LOADS_H 12'hB90 -`define VX_CSR_MPM_STORES 12'hB11 -`define VX_CSR_MPM_STORES_H 12'hB91 -`define VX_CSR_MPM_IFETCH_LT 12'hB12 -`define VX_CSR_MPM_IFETCH_LT_H 12'hB92 -`define VX_CSR_MPM_LOAD_LT 12'hB13 -`define VX_CSR_MPM_LOAD_LT_H 12'hB93 +`define VX_CSR_MPM_IFETCHES 12'hB0B +`define VX_CSR_MPM_IFETCHES_H 12'hB8B +`define VX_CSR_MPM_LOADS 12'hB0C +`define VX_CSR_MPM_LOADS_H 12'hB8C +`define VX_CSR_MPM_STORES 12'hB0D +`define VX_CSR_MPM_STORES_H 12'hB8D +`define VX_CSR_MPM_IFETCH_LT 12'hB0E +`define VX_CSR_MPM_IFETCH_LT_H 12'hB8E +`define VX_CSR_MPM_LOAD_LT 12'hB0F +`define VX_CSR_MPM_LOAD_LT_H 12'hB8F // Machine Performance-monitoring memory counters // PERF: icache diff --git a/hw/rtl/core/VX_core.sv b/hw/rtl/core/VX_core.sv index 5aba3075..4d3ce297 100644 --- a/hw/rtl/core/VX_core.sv +++ b/hw/rtl/core/VX_core.sv @@ -273,23 +273,23 @@ module VX_core import VX_gpu_pkg::*; #( wire [1:0] perf_icache_pending_read_cycle; wire [`CLOG2(DCACHE_NUM_REQS+1)+1-1:0] perf_dcache_pending_read_cycle; - reg [`PERF_CTR_BITS-1:0] perf_icache_pending_reads; - reg [`PERF_CTR_BITS-1:0] perf_dcache_pending_reads; + reg [`PERF_CTR_BITS-1:0] perf_icache_pending_reads; + reg [`PERF_CTR_BITS-1:0] perf_dcache_pending_reads; - reg [`PERF_CTR_BITS-1:0] perf_ifetches; - reg [`PERF_CTR_BITS-1:0] perf_loads; - reg [`PERF_CTR_BITS-1:0] perf_stores; + reg [`PERF_CTR_BITS-1:0] perf_ifetches; + reg [`PERF_CTR_BITS-1:0] perf_loads; + reg [`PERF_CTR_BITS-1:0] perf_stores; - wire perf_icache_req_fire = icache_bus_if.req_valid & icache_bus_if.req_ready; - wire perf_icache_rsp_fire = icache_bus_if.rsp_valid & icache_bus_if.rsp_ready; + wire perf_icache_req_fire = icache_bus_if.req_valid && icache_bus_if.req_ready; + wire perf_icache_rsp_fire = icache_bus_if.rsp_valid && icache_bus_if.rsp_ready; wire [DCACHE_NUM_REQS-1:0] perf_dcache_rd_req_fire, perf_dcache_rd_req_fire_r; wire [DCACHE_NUM_REQS-1:0] perf_dcache_wr_req_fire, perf_dcache_wr_req_fire_r; wire [DCACHE_NUM_REQS-1:0] perf_dcache_rsp_fire; for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin - assign perf_dcache_rd_req_fire[i] = dcache_bus_if[i].req_valid && ~dcache_bus_if[i].req_data.rw && dcache_bus_if[i].req_ready; - assign perf_dcache_wr_req_fire[i] = dcache_bus_if[i].req_valid && dcache_bus_if[i].req_data.rw && dcache_bus_if[i].req_ready; + assign perf_dcache_rd_req_fire[i] = dcache_bus_if[i].req_valid && dcache_bus_if[i].req_ready && ~dcache_bus_if[i].req_data.rw; + assign perf_dcache_wr_req_fire[i] = dcache_bus_if[i].req_valid && dcache_bus_if[i].req_ready && dcache_bus_if[i].req_data.rw; assign perf_dcache_rsp_fire[i] = dcache_bus_if[i].rsp_valid && dcache_bus_if[i].rsp_ready; end diff --git a/hw/rtl/core/VX_csr_data.sv b/hw/rtl/core/VX_csr_data.sv index 6d7c41f8..1b370260 100644 --- a/hw/rtl/core/VX_csr_data.sv +++ b/hw/rtl/core/VX_csr_data.sv @@ -195,19 +195,6 @@ import VX_fpu_pkg::*; `VX_CSR_MPM_IBUF_ST_H : read_data_ro_r = 32'(pipeline_perf_if.ibf_stalls[`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_SCRB_ST : read_data_ro_r = pipeline_perf_if.scb_stalls[31:0]; `VX_CSR_MPM_SCRB_ST_H : read_data_ro_r = 32'(pipeline_perf_if.scb_stalls[`PERF_CTR_BITS-1:32]); - `VX_CSR_MPM_ALU_ST : read_data_ro_r = pipeline_perf_if.dsp_stalls[`EX_ALU][31:0]; - `VX_CSR_MPM_ALU_ST_H : read_data_ro_r = 32'(pipeline_perf_if.dsp_stalls[`EX_ALU][`PERF_CTR_BITS-1:32]); - `VX_CSR_MPM_LSU_ST : read_data_ro_r = pipeline_perf_if.dsp_stalls[`EX_LSU][31:0]; - `VX_CSR_MPM_LSU_ST_H : read_data_ro_r = 32'(pipeline_perf_if.dsp_stalls[`EX_LSU][`PERF_CTR_BITS-1:32]); - `ifdef EXT_F_ENABLE - `VX_CSR_MPM_FPU_ST : read_data_ro_r = pipeline_perf_if.dsp_stalls[`EX_FPU][31:0]; - `VX_CSR_MPM_FPU_ST_H : read_data_ro_r = 32'(pipeline_perf_if.dsp_stalls[`EX_FPU][`PERF_CTR_BITS-1:32]); - `else - `VX_CSR_MPM_FPU_ST : read_data_ro_r = '0; - `VX_CSR_MPM_FPU_ST_H : read_data_ro_r = '0; - `endif - `VX_CSR_MPM_SFU_ST : read_data_ro_r = pipeline_perf_if.dsp_stalls[`EX_SFU][31:0]; - `VX_CSR_MPM_SFU_ST_H : read_data_ro_r = 32'(pipeline_perf_if.dsp_stalls[`EX_SFU][`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_SCRB_ALU : read_data_ro_r = 32'(pipeline_perf_if.scb_uses[`EX_ALU][`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_SCRB_ALU_H : read_data_ro_r = pipeline_perf_if.scb_uses[`EX_ALU][31:0]; `ifdef EXT_F_ENABLE @@ -220,7 +207,7 @@ import VX_fpu_pkg::*; `VX_CSR_MPM_SCRB_LSU : read_data_ro_r = 32'(pipeline_perf_if.scb_uses[`EX_LSU][`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_SCRB_LSU_H : read_data_ro_r = pipeline_perf_if.scb_uses[`EX_LSU][31:0]; `VX_CSR_MPM_SCRB_SFU : read_data_ro_r = 32'(pipeline_perf_if.scb_uses[`EX_SFU][`PERF_CTR_BITS-1:32]); - `VX_CSR_MPM_SCRB_SFU_H : read_data_ro_r = pipeline_perf_if.scb_uses[`EX_SFU][31:0]; + `VX_CSR_MPM_SCRB_SFU_H : read_data_ro_r = pipeline_perf_if.scb_uses[`EX_SFU][31:0]; // PERF: memory `VX_CSR_MPM_IFETCHES : read_data_ro_r = pipeline_perf_if.ifetches[31:0]; `VX_CSR_MPM_IFETCHES_H : read_data_ro_r = 32'(pipeline_perf_if.ifetches[`PERF_CTR_BITS-1:32]); diff --git a/hw/rtl/core/VX_issue.sv b/hw/rtl/core/VX_issue.sv index 8d0eaff6..912abc97 100644 --- a/hw/rtl/core/VX_issue.sv +++ b/hw/rtl/core/VX_issue.sv @@ -84,7 +84,7 @@ module VX_issue #( .clk (clk), .reset (dispatch_reset), `ifdef PERF_ENABLE - .perf_stalls (perf_issue_if.dsp_stalls), + `UNUSED_PIN (perf_stalls), `endif .operands_if (operands_if), .alu_dispatch_if(alu_dispatch_if), diff --git a/hw/rtl/core/VX_lsu_unit.sv b/hw/rtl/core/VX_lsu_unit.sv index 1e0a09b8..5a57db4c 100644 --- a/hw/rtl/core/VX_lsu_unit.sv +++ b/hw/rtl/core/VX_lsu_unit.sv @@ -96,7 +96,7 @@ module VX_lsu_unit import VX_gpu_pkg::*; #( // detect duplicate addresses wire lsu_is_dup; -`ifdef LSU_DUP +`ifdef LSU_DUP_ENABLE if (NUM_LANES > 1) begin wire [NUM_LANES-2:0] addr_matches; for (genvar i = 0; i < (NUM_LANES-1); ++i) begin @@ -304,7 +304,7 @@ module VX_lsu_unit import VX_gpu_pkg::*; #( assign mem_req_tag = { execute_if[0].data.uuid, lsu_addr_type, execute_if[0].data.wid, execute_if[0].data.tmask, execute_if[0].data.PC, execute_if[0].data.rd, execute_if[0].data.op_type, req_align, execute_if[0].data.pid, pkt_waddr - `ifdef LSU_DUP + `ifdef LSU_DUP_ENABLE , lsu_is_dup `endif }; @@ -448,13 +448,13 @@ module VX_lsu_unit import VX_gpu_pkg::*; #( wire [PID_WIDTH-1:0] rsp_pid; wire rsp_is_dup; -`ifndef LSU_DUP +`ifndef LSU_DUP_ENABLE assign rsp_is_dup = 0; `endif assign { rsp_uuid, rsp_addr_type, rsp_wid, rsp_tmask_uq, rsp_pc, rsp_rd, rsp_op_type, rsp_align, rsp_pid, pkt_raddr - `ifdef LSU_DUP + `ifdef LSU_DUP_ENABLE , rsp_is_dup `endif } = mem_rsp_tag; diff --git a/hw/rtl/interfaces/VX_pipeline_perf_if.sv b/hw/rtl/interfaces/VX_pipeline_perf_if.sv index 66225336..2ae0f678 100644 --- a/hw/rtl/interfaces/VX_pipeline_perf_if.sv +++ b/hw/rtl/interfaces/VX_pipeline_perf_if.sv @@ -14,18 +14,17 @@ `include "VX_define.vh" interface VX_pipeline_perf_if (); - wire [`PERF_CTR_BITS-1:0] sched_idles; - wire [`PERF_CTR_BITS-1:0] sched_stalls; - wire [`PERF_CTR_BITS-1:0] ibf_stalls; - wire [`PERF_CTR_BITS-1:0] scb_stalls; - wire [`PERF_CTR_BITS-1:0] scb_uses [`NUM_EX_UNITS]; - wire [`PERF_CTR_BITS-1:0] dsp_stalls [`NUM_EX_UNITS]; + wire [`PERF_CTR_BITS-1:0] sched_idles; + wire [`PERF_CTR_BITS-1:0] sched_stalls; + wire [`PERF_CTR_BITS-1:0] ibf_stalls; + wire [`PERF_CTR_BITS-1:0] scb_stalls; + wire [`PERF_CTR_BITS-1:0] scb_uses [`NUM_EX_UNITS]; - wire [`PERF_CTR_BITS-1:0] ifetches; - wire [`PERF_CTR_BITS-1:0] loads; - wire [`PERF_CTR_BITS-1:0] stores; - wire [`PERF_CTR_BITS-1:0] ifetch_latency; - wire [`PERF_CTR_BITS-1:0] load_latency; + wire [`PERF_CTR_BITS-1:0] ifetches; + wire [`PERF_CTR_BITS-1:0] loads; + wire [`PERF_CTR_BITS-1:0] stores; + wire [`PERF_CTR_BITS-1:0] ifetch_latency; + wire [`PERF_CTR_BITS-1:0] load_latency; modport schedule ( output sched_idles, @@ -35,8 +34,7 @@ interface VX_pipeline_perf_if (); modport issue ( output ibf_stalls, output scb_stalls, - output scb_uses, - output dsp_stalls + output scb_uses ); modport slave ( @@ -45,7 +43,6 @@ interface VX_pipeline_perf_if (); input ibf_stalls, input scb_stalls, input scb_uses, - input dsp_stalls, input ifetches, input loads, input stores, diff --git a/runtime/common/utils.cpp b/runtime/common/utils.cpp index c0199a86..5f472c84 100644 --- a/runtime/common/utils.cpp +++ b/runtime/common/utils.cpp @@ -204,10 +204,6 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) { uint64_t sched_stalls = 0; uint64_t ibuffer_stalls = 0; uint64_t scrb_stalls = 0; - uint64_t lsu_stalls = 0; - uint64_t fpu_stalls = 0; - uint64_t alu_stalls = 0; - uint64_t sfu_stalls = 0; uint64_t scrb_alu = 0; uint64_t scrb_fpu = 0; uint64_t scrb_lsu = 0; @@ -310,34 +306,10 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) { calcAvgPercent(scrb_sfu_per_core, scrb_total)); scrb_stalls += scrb_stalls_per_core; } - // alu_stalls - { - uint64_t alu_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_ALU_ST); - if (num_cores > 1) fprintf(stream, "PERF: core%d: alu unit stalls=%ld\n", core_id, alu_stalls_per_core); - alu_stalls += alu_stalls_per_core; - } - // lsu_stalls - { - uint64_t lsu_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_LSU_ST); - if (num_cores > 1) fprintf(stream, "PERF: core%d: lsu unit stalls=%ld\n", core_id, lsu_stalls_per_core); - lsu_stalls += lsu_stalls_per_core; - } - // fpu_stalls - { - uint64_t fpu_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_FPU_ST); - if (num_cores > 1) fprintf(stream, "PERF: core%d: fpu unit stalls=%ld\n", core_id, fpu_stalls_per_core); - fpu_stalls += fpu_stalls_per_core; - } - // sfu_stalls - { - uint64_t sfu_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SFU_ST); - if (num_cores > 1) fprintf(stream, "PERF: core%d: sfu unit stalls=%ld\n", core_id, sfu_stalls_per_core); - sfu_stalls += sfu_stalls_per_core; - } // PERF: memory // ifetches { - uint64_t ifetches_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_LOADS); + uint64_t ifetches_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_IFETCHES); if (num_cores > 1) fprintf(stream, "PERF: core%d: ifetches=%ld\n", core_id, ifetches_per_core); ifetches += ifetches_per_core; @@ -464,10 +436,6 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) { calcAvgPercent(scrb_fpu, scrb_total), calcAvgPercent(scrb_lsu, scrb_total), calcAvgPercent(scrb_sfu, scrb_total)); - fprintf(stream, "PERF: alu unit stalls=%ld\n", alu_stalls); - fprintf(stream, "PERF: lsu unit stalls=%ld\n", lsu_stalls); - fprintf(stream, "PERF: fpu unit stalls=%ld\n", fpu_stalls); - fprintf(stream, "PERF: sfu unit stalls=%ld\n", sfu_stalls); fprintf(stream, "PERF: ifetches=%ld\n", ifetches); fprintf(stream, "PERF: loads=%ld\n", loads); fprintf(stream, "PERF: stores=%ld\n", stores); diff --git a/runtime/simx/vortex.cpp b/runtime/simx/vortex.cpp index 3b4cb171..b7b9cdcb 100644 --- a/runtime/simx/vortex.cpp +++ b/runtime/simx/vortex.cpp @@ -87,7 +87,7 @@ private: class vx_device { public: vx_device() - : arch_(NUM_THREADS, NUM_WARPS, NUM_CORES, NUM_CLUSTERS) + : arch_(NUM_THREADS, NUM_WARPS, NUM_CORES) , ram_(RAM_PAGE_SIZE) , processor_(arch_) , global_mem_( diff --git a/sim/simx/Makefile b/sim/simx/Makefile index 42823205..bb67dbb5 100644 --- a/sim/simx/Makefile +++ b/sim/simx/Makefile @@ -15,7 +15,7 @@ LDFLAGS += $(THIRD_PARTY_DIR)/softfloat/build/Linux-x86_64-GCC/softfloat.a LDFLAGS += -L$(THIRD_PARTY_DIR)/ramulator -lramulator SRCS = ../common/util.cpp ../common/mem.cpp ../common/rvfloats.cpp -SRCS += processor.cpp cluster.cpp core.cpp warp.cpp decode.cpp execute.cpp exe_unit.cpp cache_sim.cpp mem_sim.cpp shared_mem.cpp dcrs.cpp +SRCS += processor.cpp cluster.cpp socket.cpp core.cpp warp.cpp decode.cpp execute.cpp exe_unit.cpp cache_sim.cpp mem_sim.cpp shared_mem.cpp dcrs.cpp # Debugigng ifdef DEBUG diff --git a/sim/simx/arch.h b/sim/simx/arch.h index ab6ac4a3..099fbedd 100644 --- a/sim/simx/arch.h +++ b/sim/simx/arch.h @@ -28,6 +28,7 @@ private: uint16_t num_warps_; uint16_t num_cores_; uint16_t num_clusters_; + uint16_t socket_size_; uint16_t vsize_; uint16_t num_regs_; uint16_t num_csrs_; @@ -35,11 +36,12 @@ private: uint16_t ipdom_size_; public: - Arch(uint16_t num_threads, uint16_t num_warps, uint16_t num_cores, uint16_t num_clusters) + Arch(uint16_t num_threads, uint16_t num_warps, uint16_t num_cores) : num_threads_(num_threads) , num_warps_(num_warps) , num_cores_(num_cores) - , num_clusters_(num_clusters) + , num_clusters_(NUM_CLUSTERS) + , socket_size_(SOCKET_SIZE) , vsize_(16) , num_regs_(32) , num_csrs_(4096) @@ -82,6 +84,10 @@ public: uint16_t num_clusters() const { return num_clusters_; } + + uint16_t socket_size() const { + return socket_size_; + } }; } \ No newline at end of file diff --git a/sim/simx/cluster.cpp b/sim/simx/cluster.cpp index d7104915..7f690fb6 100644 --- a/sim/simx/cluster.cpp +++ b/sim/simx/cluster.cpp @@ -24,14 +24,38 @@ Cluster::Cluster(const SimContext& ctx, , mem_req_port(this) , mem_rsp_port(this) , cluster_id_(cluster_id) - , cores_(arch.num_cores()) + , sockets_(NUM_SOCKETS) , barriers_(arch.num_barriers(), 0) - , sharedmems_(arch.num_cores()) , processor_(processor) + , cores_per_socket_(arch.socket_size()) { - auto num_cores = arch.num_cores(); - char sname[100]; + + auto sockets_per_cluster = sockets_.size(); + + // create sockets + + snprintf(sname, 100, "cluster%d-icache-arb", cluster_id); + auto icache_switch = MemSwitch::Create(sname, ArbiterType::RoundRobin, sockets_per_cluster); + + snprintf(sname, 100, "cluster%d-dcache-arb", cluster_id); + auto dcache_switch = MemSwitch::Create(sname, ArbiterType::RoundRobin, sockets_per_cluster); + + for (uint32_t i = 0; i < sockets_per_cluster; ++i) { + uint32_t socket_id = cluster_id * sockets_per_cluster + i; + auto socket = Socket::Create(socket_id, this, arch, dcrs); + + socket->icache_mem_req_port.bind(&icache_switch->ReqIn.at(i)); + icache_switch->RspIn.at(i).bind(&socket->icache_mem_rsp_port); + + socket->dcache_mem_req_port.bind(&dcache_switch->ReqIn.at(i)); + dcache_switch->RspIn.at(i).bind(&socket->dcache_mem_rsp_port); + + sockets_.at(i) = socket; + } + + // Create l2cache + snprintf(sname, 100, "cluster%d-l2cache", cluster_id); l2cache_ = CacheSim::Create(sname, CacheSim::Config{ !L2_ENABLED, @@ -42,7 +66,7 @@ Cluster::Cluster(const SimContext& ctx, log2ceil(L2_NUM_BANKS), // B XLEN, // address bits 1, // number of ports - 5, // request size + 2, // request size true, // write-through false, // write response L2_MSHR_SIZE, // mshr @@ -52,87 +76,11 @@ Cluster::Cluster(const SimContext& ctx, l2cache_->MemReqPort.bind(&this->mem_req_port); this->mem_rsp_port.bind(&l2cache_->MemRspPort); - snprintf(sname, 100, "cluster%d-icaches", cluster_id); - icaches_ = CacheCluster::Create(sname, num_cores, NUM_ICACHES, 1, CacheSim::Config{ - !ICACHE_ENABLED, - log2ceil(ICACHE_SIZE), // C - log2ceil(L1_LINE_SIZE), // L - log2ceil(sizeof(uint32_t)), // W - log2ceil(ICACHE_NUM_WAYS),// A - 1, // B - XLEN, // address bits - 1, // number of ports - 1, // number of inputs - true, // write-through - false, // write response - (uint8_t)arch.num_warps(), // mshr - 2, // pipeline latency - }); + icache_switch->ReqOut.at(0).bind(&l2cache_->CoreReqPorts.at(0)); + l2cache_->CoreRspPorts.at(0).bind(&icache_switch->RspOut.at(0)); - icaches_->MemReqPort.bind(&l2cache_->CoreReqPorts.at(0)); - l2cache_->CoreRspPorts.at(0).bind(&icaches_->MemRspPort); - - snprintf(sname, 100, "cluster%d-dcaches", cluster_id); - dcaches_ = CacheCluster::Create(sname, num_cores, NUM_DCACHES, NUM_LSU_LANES, CacheSim::Config{ - !DCACHE_ENABLED, - log2ceil(DCACHE_SIZE), // C - log2ceil(L1_LINE_SIZE), // L - log2ceil(sizeof(Word)), // W - log2ceil(DCACHE_NUM_WAYS),// A - log2ceil(DCACHE_NUM_BANKS), // B - XLEN, // address bits - 1, // number of ports - DCACHE_NUM_BANKS, // number of inputs - true, // write-through - false, // write response - DCACHE_MSHR_SIZE, // mshr - 4, // pipeline latency - }); - - dcaches_->MemReqPort.bind(&l2cache_->CoreReqPorts.at(1)); - l2cache_->CoreRspPorts.at(1).bind(&dcaches_->MemRspPort); - - /////////////////////////////////////////////////////////////////////////// - - // create shared memory blocks - for (uint32_t i = 0; i < num_cores; ++i) { - snprintf(sname, 100, "cluster%d-shared_mem%d", cluster_id, i); - sharedmems_.at(i) = SharedMem::Create(sname, SharedMem::Config{ - (1 << SMEM_LOG_SIZE), - sizeof(Word), - NUM_LSU_LANES, - NUM_LSU_LANES, - false - }); - } - - // create cores - - for (uint32_t i = 0; i < num_cores; ++i) { - uint32_t core_id = cluster_id * num_cores + i; - cores_.at(i) = Core::Create(core_id, - this, - arch, - dcrs, - sharedmems_.at(i)); - - cores_.at(i)->icache_req_ports.at(0).bind(&icaches_->CoreReqPorts.at(i).at(0)); - icaches_->CoreRspPorts.at(i).at(0).bind(&cores_.at(i)->icache_rsp_ports.at(0)); - - for (uint32_t j = 0; j < NUM_LSU_LANES; ++j) { - snprintf(sname, 100, "cluster%d-smem_demux%d_%d", cluster_id, i, j); - auto smem_demux = SMemDemux::Create(sname); - - cores_.at(i)->dcache_req_ports.at(j).bind(&smem_demux->ReqIn); - smem_demux->RspIn.bind(&cores_.at(i)->dcache_rsp_ports.at(j)); - - smem_demux->ReqDC.bind(&dcaches_->CoreReqPorts.at(i).at(j)); - dcaches_->CoreRspPorts.at(i).at(j).bind(&smem_demux->RspDC); - - smem_demux->ReqSM.bind(&sharedmems_.at(i)->Inputs.at(j)); - sharedmems_.at(i)->Outputs.at(j).bind(&smem_demux->RspSM); - } - } + dcache_switch->ReqOut.at(0).bind(&l2cache_->CoreReqPorts.at(1)); + l2cache_->CoreRspPorts.at(1).bind(&dcache_switch->RspOut.at(0)); } Cluster::~Cluster() { @@ -150,14 +98,14 @@ void Cluster::tick() { } void Cluster::attach_ram(RAM* ram) { - for (auto core : cores_) { - core->attach_ram(ram); + for (auto& socket : sockets_) { + socket->attach_ram(ram); } } bool Cluster::running() const { - for (auto& core : cores_) { - if (core->running()) + for (auto& socket : sockets_) { + if (socket->running()) return true; } return false; @@ -166,9 +114,9 @@ bool Cluster::running() const { bool Cluster::check_exit(Word* exitcode, bool riscv_test) const { bool done = true; Word exitcode_ = 0; - for (auto& core : cores_) { + for (auto& socket : sockets_) { Word ec; - if (core->check_exit(&ec, riscv_test)) { + if (socket->check_exit(&ec, riscv_test)) { exitcode_ |= ec; } else { done = false; @@ -181,36 +129,32 @@ bool Cluster::check_exit(Word* exitcode, bool riscv_test) const { void Cluster::barrier(uint32_t bar_id, uint32_t count, uint32_t core_id) { auto& barrier = barriers_.at(bar_id); - uint32_t local_core_id = core_id % cores_.size(); + auto sockets_per_cluster = sockets_.size(); + auto cores_per_socket = cores_per_socket_; + + uint32_t cores_per_cluster = sockets_per_cluster * cores_per_socket; + uint32_t local_core_id = core_id % cores_per_cluster; barrier.set(local_core_id); DP(3, "*** Suspend core #" << core_id << " at barrier #" << bar_id); if (barrier.count() == (size_t)count) { // resume all suspended cores - for (uint32_t i = 0; i < cores_.size(); ++i) { - if (barrier.test(i)) { - DP(3, "*** Resume core #" << i << " at barrier #" << bar_id); - cores_.at(i)->resume(); + for (uint32_t s = 0; s < sockets_per_cluster; ++s) { + for (uint32_t c = 0; c < cores_per_socket; ++c) { + uint32_t i = s * cores_per_socket + c; + if (barrier.test(i)) { + DP(3, "*** Resume core #" << i << " at barrier #" << bar_id); + sockets_.at(s)->resume(c); + } } } barrier.reset(); } } -ProcessorImpl* Cluster::processor() const { - return processor_; -} - Cluster::PerfStats Cluster::perf_stats() const { Cluster::PerfStats perf; - perf.icache = icaches_->perf_stats(); - perf.dcache = dcaches_->perf_stats(); perf.l2cache = l2cache_->perf_stats(); - - for (auto sharedmem : sharedmems_) { - perf.sharedmem += sharedmem->perf_stats(); - } - return perf; } \ No newline at end of file diff --git a/sim/simx/cluster.h b/sim/simx/cluster.h index f91241e9..2547486d 100644 --- a/sim/simx/cluster.h +++ b/sim/simx/cluster.h @@ -17,8 +17,8 @@ #include "dcrs.h" #include "arch.h" #include "cache_cluster.h" -#include "shared_mem.h" #include "core.h" +#include "socket.h" #include "constants.h" namespace vortex { @@ -27,17 +27,11 @@ class ProcessorImpl; class Cluster : public SimObject { public: - struct PerfStats { - CacheSim::PerfStats icache; - CacheSim::PerfStats dcache; - SharedMem::PerfStats sharedmem; - CacheSim::PerfStats l2cache; + struct PerfStats { + CacheSim::PerfStats l2cache; PerfStats& operator+=(const PerfStats& rhs) { - this->icache += rhs.icache; - this->dcache += rhs.dcache; - this->sharedmem += rhs.sharedmem; - this->l2cache += rhs.l2cache; + this->l2cache += rhs.l2cache; return *this; } }; @@ -53,6 +47,14 @@ public: ~Cluster(); + uint32_t id() const { + return cluster_id_; + } + + ProcessorImpl* processor() const { + return processor_; + } + void reset(); void tick(); @@ -65,22 +67,15 @@ public: void barrier(uint32_t bar_id, uint32_t count, uint32_t core_id); - ProcessorImpl* processor() const; - Cluster::PerfStats perf_stats() const; private: - uint32_t cluster_id_; - std::vector cores_; - std::vector barriers_; - CacheSim::Ptr l2cache_; - CacheCluster::Ptr icaches_; - CacheCluster::Ptr dcaches_; - std::vector sharedmems_; - CacheCluster::Ptr tcaches_; - CacheCluster::Ptr ocaches_; - CacheCluster::Ptr rcaches_; - ProcessorImpl* processor_; + uint32_t cluster_id_; + std::vector sockets_; + std::vector barriers_; + CacheSim::Ptr l2cache_; + ProcessorImpl* processor_; + uint32_t cores_per_socket_; }; } // namespace vortex \ No newline at end of file diff --git a/sim/simx/core.cpp b/sim/simx/core.cpp index 49c2ec35..7a549ebd 100644 --- a/sim/simx/core.cpp +++ b/sim/simx/core.cpp @@ -21,18 +21,14 @@ #include "mem.h" #include "decode.h" #include "core.h" +#include "socket.h" #include "debug.h" #include "constants.h" #include "processor_impl.h" using namespace vortex; -Core::Core(const SimContext& ctx, - uint32_t core_id, - Cluster* cluster, - const Arch &arch, - const DCRS &dcrs, - SharedMem::Ptr sharedmem) +Core::Core(const SimContext& ctx, uint32_t core_id, Socket* socket, const Arch &arch, const DCRS &dcrs) : SimObject(ctx, "core") , icache_req_ports(1, this) , icache_rsp_ports(1, this) @@ -50,12 +46,12 @@ Core::Core(const SimContext& ctx, , operands_(ISSUE_WIDTH) , dispatchers_((uint32_t)ExeType::ExeTypeCount) , exe_units_((uint32_t)ExeType::ExeTypeCount) - , sharedmem_(sharedmem) + , smem_demuxs_(NUM_LSU_LANES) , fetch_latch_("fetch") , decode_latch_("decode") , pending_icache_(arch_.num_warps()) , csrs_(arch.num_warps()) - , cluster_(cluster) + , socket_(socket) , commit_arbs_(ISSUE_WIDTH) { char sname[100]; @@ -72,6 +68,27 @@ Core::Core(const SimContext& ctx, operands_.at(i) = SimPlatform::instance().create_object(); } + // initialize shared memory + shared_mem_ = SharedMem::Create(sname, SharedMem::Config{ + (1 << SMEM_LOG_SIZE), + sizeof(Word), + NUM_LSU_LANES, + NUM_LSU_LANES, + false + }); + for (uint32_t i = 0; i < NUM_LSU_LANES; ++i) { + snprintf(sname, 100, "smem_demux%d_%d", core_id, i); + auto smem_demux = SMemDemux::Create(sname); + + smem_demux->ReqDC.bind(&dcache_req_ports.at(i)); + dcache_rsp_ports.at(i).bind(&smem_demux->RspDC); + + smem_demux->ReqSM.bind(&shared_mem_->Inputs.at(i)); + shared_mem_->Outputs.at(i).bind(&smem_demux->RspSM); + + smem_demuxs_.at(i) = smem_demux; + } + // initialize dispatchers dispatchers_.at((int)ExeType::ALU) = SimPlatform::instance().create_object(arch, 2, NUM_ALU_BLOCKS, NUM_ALU_LANES); dispatchers_.at((int)ExeType::FPU) = SimPlatform::instance().create_object(arch, 2, NUM_FPU_BLOCKS, NUM_FPU_LANES); @@ -241,13 +258,6 @@ void Core::decode() { stalled_warps_.reset(trace->wid); } - // update perf counters - uint32_t active_threads = trace->tmask.count(); - if (trace->exe_type == ExeType::LSU && trace->lsu_type == LsuType::LOAD) - perf_stats_.loads += active_threads; - if (trace->exe_type == ExeType::LSU && trace->lsu_type == LsuType::STORE) - perf_stats_.stores += active_threads; - DT(3, "pipeline-decode: " << *trace); // insert to ibuffer @@ -394,7 +404,7 @@ void Core::barrier(uint32_t bar_id, uint32_t count, uint32_t warp_id) { if (is_global) { // global barrier handling if (barrier.count() == active_warps_.count()) { - cluster_->barrier(bar_idx, count, core_id_); + socket_->barrier(bar_idx, count, core_id_); barrier.reset(); } } else { @@ -431,7 +441,7 @@ AddrType Core::get_addr_type(uint64_t addr) { void Core::dcache_read(void *data, uint64_t addr, uint32_t size) { auto type = this->get_addr_type(addr); if (type == AddrType::Shared) { - sharedmem_->read(data, addr, size); + shared_mem_->read(data, addr, size); } else { mmu_.read(data, addr, size, 0); } @@ -446,7 +456,7 @@ void Core::dcache_write(const void* data, uint64_t addr, uint32_t size) { this->writeToStdOut(data, addr, size); } else { if (type == AddrType::Shared) { - sharedmem_->write(data, addr, size); + shared_mem_->write(data, addr, size); } else { mmu_.write(data, addr, size, 0); } @@ -554,16 +564,8 @@ uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) { case VX_CSR_MPM_SCHED_ST_H:return perf_stats_.sched_stalls >> 32; case VX_CSR_MPM_IBUF_ST: return perf_stats_.ibuf_stalls & 0xffffffff; case VX_CSR_MPM_IBUF_ST_H: return perf_stats_.ibuf_stalls >> 32; - case VX_CSR_MPM_SCRB_ST: return perf_stats_.scrb_stalls & 0xffffffff; - case VX_CSR_MPM_SCRB_ST_H: return perf_stats_.scrb_stalls >> 32; - case VX_CSR_MPM_ALU_ST: return perf_stats_.alu_stalls & 0xffffffff; - case VX_CSR_MPM_ALU_ST_H: return perf_stats_.alu_stalls >> 32; - case VX_CSR_MPM_LSU_ST: return perf_stats_.lsu_stalls & 0xffffffff; - case VX_CSR_MPM_LSU_ST_H: return perf_stats_.lsu_stalls >> 32; - case VX_CSR_MPM_FPU_ST: return perf_stats_.fpu_stalls & 0xffffffff; - case VX_CSR_MPM_FPU_ST_H: return perf_stats_.fpu_stalls >> 32; - case VX_CSR_MPM_SFU_ST: return perf_stats_.sfu_stalls & 0xffffffff; - case VX_CSR_MPM_SFU_ST_H: return perf_stats_.sfu_stalls >> 32; + case VX_CSR_MPM_SCRB_ST: return perf_stats_.scrb_stalls & 0xffffffff; + case VX_CSR_MPM_SCRB_ST_H: return perf_stats_.scrb_stalls >> 32; case VX_CSR_MPM_SCRB_ALU: return perf_stats_.scrb_alu & 0xffffffff; case VX_CSR_MPM_SCRB_ALU_H:return perf_stats_.scrb_alu >> 32; case VX_CSR_MPM_SCRB_FPU: return perf_stats_.scrb_fpu & 0xffffffff; @@ -572,7 +574,6 @@ uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) { case VX_CSR_MPM_SCRB_LSU_H:return perf_stats_.scrb_lsu >> 32; case VX_CSR_MPM_SCRB_SFU: return perf_stats_.scrb_sfu & 0xffffffff; case VX_CSR_MPM_SCRB_SFU_H:return perf_stats_.scrb_sfu >> 32; - case VX_CSR_MPM_IFETCHES: return perf_stats_.ifetches & 0xffffffff; case VX_CSR_MPM_IFETCHES_H: return perf_stats_.ifetches >> 32; case VX_CSR_MPM_LOADS: return perf_stats_.loads & 0xffffffff; @@ -586,27 +587,29 @@ uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) { } } break; case VX_DCR_MPM_CLASS_MEM: { - auto proc_perf = cluster_->processor()->perf_stats(); + auto proc_perf = socket_->cluster()->processor()->perf_stats(); + auto socket_perf = socket_->perf_stats(); + auto smem_perf = shared_mem_->perf_stats(); switch (addr) { - case VX_CSR_MPM_ICACHE_READS: return proc_perf.clusters.icache.reads & 0xffffffff; - case VX_CSR_MPM_ICACHE_READS_H: return proc_perf.clusters.icache.reads >> 32; - case VX_CSR_MPM_ICACHE_MISS_R: return proc_perf.clusters.icache.read_misses & 0xffffffff; - case VX_CSR_MPM_ICACHE_MISS_R_H: return proc_perf.clusters.icache.read_misses >> 32; - case VX_CSR_MPM_ICACHE_MSHR_ST: return proc_perf.clusters.icache.mshr_stalls & 0xffffffff; - case VX_CSR_MPM_ICACHE_MSHR_ST_H: return proc_perf.clusters.icache.mshr_stalls >> 32; + case VX_CSR_MPM_ICACHE_READS: return socket_perf.icache.reads & 0xffffffff; + case VX_CSR_MPM_ICACHE_READS_H: return socket_perf.icache.reads >> 32; + case VX_CSR_MPM_ICACHE_MISS_R: return socket_perf.icache.read_misses & 0xffffffff; + case VX_CSR_MPM_ICACHE_MISS_R_H: return socket_perf.icache.read_misses >> 32; + case VX_CSR_MPM_ICACHE_MSHR_ST: return socket_perf.icache.mshr_stalls & 0xffffffff; + case VX_CSR_MPM_ICACHE_MSHR_ST_H: return socket_perf.icache.mshr_stalls >> 32; - case VX_CSR_MPM_DCACHE_READS: return proc_perf.clusters.dcache.reads & 0xffffffff; - case VX_CSR_MPM_DCACHE_READS_H: return proc_perf.clusters.dcache.reads >> 32; - case VX_CSR_MPM_DCACHE_WRITES: return proc_perf.clusters.dcache.writes & 0xffffffff; - case VX_CSR_MPM_DCACHE_WRITES_H: return proc_perf.clusters.dcache.writes >> 32; - case VX_CSR_MPM_DCACHE_MISS_R: return proc_perf.clusters.dcache.read_misses & 0xffffffff; - case VX_CSR_MPM_DCACHE_MISS_R_H: return proc_perf.clusters.dcache.read_misses >> 32; - case VX_CSR_MPM_DCACHE_MISS_W: return proc_perf.clusters.dcache.write_misses & 0xffffffff; - case VX_CSR_MPM_DCACHE_MISS_W_H: return proc_perf.clusters.dcache.write_misses >> 32; - case VX_CSR_MPM_DCACHE_BANK_ST: return proc_perf.clusters.dcache.bank_stalls & 0xffffffff; - case VX_CSR_MPM_DCACHE_BANK_ST_H: return proc_perf.clusters.dcache.bank_stalls >> 32; - case VX_CSR_MPM_DCACHE_MSHR_ST: return proc_perf.clusters.dcache.mshr_stalls & 0xffffffff; - case VX_CSR_MPM_DCACHE_MSHR_ST_H: return proc_perf.clusters.dcache.mshr_stalls >> 32; + case VX_CSR_MPM_DCACHE_READS: return socket_perf.dcache.reads & 0xffffffff; + case VX_CSR_MPM_DCACHE_READS_H: return socket_perf.dcache.reads >> 32; + case VX_CSR_MPM_DCACHE_WRITES: return socket_perf.dcache.writes & 0xffffffff; + case VX_CSR_MPM_DCACHE_WRITES_H: return socket_perf.dcache.writes >> 32; + case VX_CSR_MPM_DCACHE_MISS_R: return socket_perf.dcache.read_misses & 0xffffffff; + case VX_CSR_MPM_DCACHE_MISS_R_H: return socket_perf.dcache.read_misses >> 32; + case VX_CSR_MPM_DCACHE_MISS_W: return socket_perf.dcache.write_misses & 0xffffffff; + case VX_CSR_MPM_DCACHE_MISS_W_H: return socket_perf.dcache.write_misses >> 32; + case VX_CSR_MPM_DCACHE_BANK_ST: return socket_perf.dcache.bank_stalls & 0xffffffff; + case VX_CSR_MPM_DCACHE_BANK_ST_H: return socket_perf.dcache.bank_stalls >> 32; + case VX_CSR_MPM_DCACHE_MSHR_ST: return socket_perf.dcache.mshr_stalls & 0xffffffff; + case VX_CSR_MPM_DCACHE_MSHR_ST_H: return socket_perf.dcache.mshr_stalls >> 32; case VX_CSR_MPM_L2CACHE_READS: return proc_perf.clusters.l2cache.reads & 0xffffffff; case VX_CSR_MPM_L2CACHE_READS_H: return proc_perf.clusters.l2cache.reads >> 32; @@ -641,12 +644,12 @@ uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) { case VX_CSR_MPM_MEM_LT: return proc_perf.mem_latency & 0xffffffff; case VX_CSR_MPM_MEM_LT_H : return proc_perf.mem_latency >> 32; - case VX_CSR_MPM_SMEM_READS: return proc_perf.clusters.sharedmem.reads & 0xffffffff; - case VX_CSR_MPM_SMEM_READS_H: return proc_perf.clusters.sharedmem.reads >> 32; - case VX_CSR_MPM_SMEM_WRITES: return proc_perf.clusters.sharedmem.writes & 0xffffffff; - case VX_CSR_MPM_SMEM_WRITES_H: return proc_perf.clusters.sharedmem.writes >> 32; - case VX_CSR_MPM_SMEM_BANK_ST: return proc_perf.clusters.sharedmem.bank_stalls & 0xffffffff; - case VX_CSR_MPM_SMEM_BANK_ST_H: return proc_perf.clusters.sharedmem.bank_stalls >> 32; + case VX_CSR_MPM_SMEM_READS: return smem_perf.reads & 0xffffffff; + case VX_CSR_MPM_SMEM_READS_H: return smem_perf.reads >> 32; + case VX_CSR_MPM_SMEM_WRITES: return smem_perf.writes & 0xffffffff; + case VX_CSR_MPM_SMEM_WRITES_H: return smem_perf.writes >> 32; + case VX_CSR_MPM_SMEM_BANK_ST: return smem_perf.bank_stalls & 0xffffffff; + case VX_CSR_MPM_SMEM_BANK_ST_H: return smem_perf.bank_stalls >> 32; } } break; } diff --git a/sim/simx/core.h b/sim/simx/core.h index cef60e81..343fdb31 100644 --- a/sim/simx/core.h +++ b/sim/simx/core.h @@ -40,7 +40,7 @@ namespace vortex { -class Cluster; +class Socket; using TraceSwitch = Mux; @@ -53,10 +53,6 @@ public: uint64_t sched_stalls; uint64_t ibuf_stalls; uint64_t scrb_stalls; - uint64_t alu_stalls; - uint64_t lsu_stalls; - uint64_t fpu_stalls; - uint64_t sfu_stalls; uint64_t scrb_alu; uint64_t scrb_fpu; uint64_t scrb_lsu; @@ -74,10 +70,6 @@ public: , sched_stalls(0) , ibuf_stalls(0) , scrb_stalls(0) - , alu_stalls(0) - , lsu_stalls(0) - , fpu_stalls(0) - , sfu_stalls(0) , scrb_alu(0) , scrb_fpu(0) , scrb_lsu(0) @@ -96,12 +88,7 @@ public: std::vector> dcache_req_ports; std::vector> dcache_rsp_ports; - Core(const SimContext& ctx, - uint32_t core_id, - Cluster* cluster, - const Arch &arch, - const DCRS &dcrs, - SharedMem::Ptr sharedmem); + Core(const SimContext& ctx, uint32_t core_id, Socket* socket, const Arch &arch, const DCRS &dcrs); ~Core(); @@ -119,6 +106,10 @@ public: return core_id_; } + Socket* socket() const { + return socket_; + } + const Arch& arch() const { return arch_; } @@ -181,7 +172,8 @@ private: std::vector operands_; std::vector dispatchers_; std::vector exe_units_; - SharedMem::Ptr sharedmem_; + SharedMem::Ptr shared_mem_; + std::vector smem_demuxs_; PipelineLatch fetch_latch_; PipelineLatch decode_latch_; @@ -201,7 +193,7 @@ private: PerfStats perf_stats_; - Cluster* cluster_; + Socket* socket_; std::vector commit_arbs_; diff --git a/sim/simx/exe_unit.cpp b/sim/simx/exe_unit.cpp index 2f3e79e3..4b5cb356 100644 --- a/sim/simx/exe_unit.cpp +++ b/sim/simx/exe_unit.cpp @@ -51,8 +51,7 @@ void AluUnit::tick() { assert(core_->stalled_warps_.test(trace->wid)); core_->stalled_warps_.reset(trace->wid); } - auto time = input.pop(); - core_->perf_stats_.alu_stalls += (SimPlatform::instance().cycles() - time); + input.pop(); } } @@ -87,8 +86,7 @@ void FpuUnit::tick() { std::abort(); } DT(3, "pipeline-execute: op=" << trace->fpu_type << ", " << *trace); - auto time = input.pop(); - core_->perf_stats_.fpu_stalls += (SimPlatform::instance().cycles() - time); + input.pop(); } } @@ -114,7 +112,7 @@ void LsuUnit::tick() { // handle dcache response for (uint32_t t = 0; t < num_lanes_; ++t) { - auto& dcache_rsp_port = core_->dcache_rsp_ports.at(t); + auto& dcache_rsp_port = core_->smem_demuxs_.at(t)->RspIn; if (dcache_rsp_port.empty()) continue; auto& mem_rsp = dcache_rsp_port.front(); @@ -136,7 +134,7 @@ void LsuUnit::tick() { // handle shared memory response for (uint32_t t = 0; t < num_lanes_; ++t) { - auto& smem_rsp_port = core_->sharedmem_->Outputs.at(t); + auto& smem_rsp_port = core_->shared_mem_->Outputs.at(t); if (smem_rsp_port.empty()) continue; auto& mem_rsp = smem_rsp_port.front(); @@ -184,8 +182,7 @@ void LsuUnit::tick() { fence_lock_ = true; DT(3, "fence-lock: " << *trace); // remove input - auto time = input.pop(); - core_->perf_stats_.lsu_stalls += (SimPlatform::instance().cycles() - time); + input.pop(); break; } @@ -213,7 +210,9 @@ void LsuUnit::tick() { auto mem_addr = trace_data->mem_addrs.at(t).addr & ~addr_mask; matches += (addr0 == mem_addr); } + #ifdef LSU_DUP_ENABLE is_dup = (matches == trace->tmask.count()); + #endif } uint32_t addr_count; @@ -229,7 +228,7 @@ void LsuUnit::tick() { if (!trace->tmask.test(t0 + t)) continue; - auto& dcache_req_port = core_->dcache_req_ports.at(t); + auto& dcache_req_port = core_->smem_demuxs_.at(t)->ReqIn; auto mem_addr = trace_data->mem_addrs.at(t); auto type = core_->get_addr_type(mem_addr.addr); @@ -241,12 +240,16 @@ void LsuUnit::tick() { mem_req.cid = trace->cid; mem_req.uuid = trace->uuid; - dcache_req_port.send(mem_req, 2); + dcache_req_port.send(mem_req, 1); DT(3, "dcache-req: addr=0x" << std::hex << mem_req.addr << ", tag=" << tag << ", lsu_type=" << trace->lsu_type << ", tid=" << t << ", addr_type=" << mem_req.type << ", " << *trace); - ++pending_loads_; - ++core_->perf_stats_.loads; + if (is_write) { + ++core_->perf_stats_.stores; + } else { + ++core_->perf_stats_.loads; + ++pending_loads_; + } if (is_dup) break; } @@ -254,13 +257,11 @@ void LsuUnit::tick() { // do not wait on writes if (is_write) { pending_rd_reqs_.release(tag); - output.send(trace, 1); - ++core_->perf_stats_.stores; + output.send(trace, 1); } // remove input - auto time = input.pop(); - core_->perf_stats_.lsu_stalls += (SimPlatform::instance().cycles() - time); + input.pop(); break; // single block } @@ -318,10 +319,7 @@ void SfuUnit::tick() { core_->stalled_warps_.reset(trace->wid); } - auto time = input.pop(); - auto stalls = (SimPlatform::instance().cycles() - time); - - core_->perf_stats_.sfu_stalls += stalls; + input.pop(); break; // single block } diff --git a/sim/simx/main.cpp b/sim/simx/main.cpp index 22d9c880..64031bb8 100644 --- a/sim/simx/main.cpp +++ b/sim/simx/main.cpp @@ -34,14 +34,13 @@ static void show_usage() { uint32_t num_threads = NUM_THREADS; uint32_t num_warps = NUM_WARPS; uint32_t num_cores = NUM_CORES; -uint32_t num_clusters = NUM_CLUSTERS; bool showStats = false;; bool riscv_test = false; const char* program = nullptr; static void parse_args(int argc, char **argv) { int c; - while ((c = getopt(argc, argv, "t:w:c:g:rsh?")) != -1) { + while ((c = getopt(argc, argv, "t:w:c:rsh?")) != -1) { switch (c) { case 't': num_threads = atoi(optarg); @@ -51,9 +50,6 @@ static void parse_args(int argc, char **argv) { break; case 'c': num_cores = atoi(optarg); - break; - case 'g': - num_clusters = atoi(optarg); break; case 'r': riscv_test = true; @@ -88,7 +84,7 @@ int main(int argc, char **argv) { { // create processor configuation - Arch arch(num_threads, num_warps, num_cores, num_clusters); + Arch arch(num_threads, num_warps, num_cores); // create memory module RAM ram(RAM_PAGE_SIZE); diff --git a/sim/simx/socket.cpp b/sim/simx/socket.cpp new file mode 100644 index 00000000..fb620d62 --- /dev/null +++ b/sim/simx/socket.cpp @@ -0,0 +1,146 @@ +// Copyright © 2019-2023 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "socket.h" +#include "cluster.h" + +using namespace vortex; + +Socket::Socket(const SimContext& ctx, + uint32_t socket_id, + Cluster* cluster, + const Arch &arch, const + DCRS &dcrs) + : SimObject(ctx, "socket") + , icache_mem_req_port(this) + , icache_mem_rsp_port(this) + , dcache_mem_req_port(this) + , dcache_mem_rsp_port(this) + , socket_id_(socket_id) + , cores_(arch.socket_size()) + , cluster_(cluster) +{ + auto cores_per_socket = cores_.size(); + + char sname[100]; + snprintf(sname, 100, "socket%d-icaches", socket_id); + icaches_ = CacheCluster::Create(sname, cores_per_socket, NUM_ICACHES, 1, CacheSim::Config{ + !ICACHE_ENABLED, + log2ceil(ICACHE_SIZE), // C + log2ceil(L1_LINE_SIZE), // L + log2ceil(sizeof(uint32_t)), // W + log2ceil(ICACHE_NUM_WAYS),// A + 1, // B + XLEN, // address bits + 1, // number of ports + 1, // number of inputs + true, // write-through + false, // write response + (uint8_t)arch.num_warps(), // mshr + 2, // pipeline latency + }); + + icaches_->MemReqPort.bind(&icache_mem_req_port); + icache_mem_rsp_port.bind(&icaches_->MemRspPort); + + snprintf(sname, 100, "socket%d-dcaches", socket_id); + dcaches_ = CacheCluster::Create(sname, cores_per_socket, NUM_DCACHES, NUM_LSU_LANES, CacheSim::Config{ + !DCACHE_ENABLED, + log2ceil(DCACHE_SIZE), // C + log2ceil(L1_LINE_SIZE), // L + log2ceil(sizeof(Word)), // W + log2ceil(DCACHE_NUM_WAYS),// A + log2ceil(DCACHE_NUM_BANKS), // B + XLEN, // address bits + 1, // number of ports + DCACHE_NUM_BANKS, // number of inputs + true, // write-through + false, // write response + DCACHE_MSHR_SIZE, // mshr + 2, // pipeline latency + }); + + dcaches_->MemReqPort.bind(&dcache_mem_req_port); + dcache_mem_rsp_port.bind(&dcaches_->MemRspPort); + + // create cores + + for (uint32_t i = 0; i < cores_per_socket; ++i) { + uint32_t core_id = socket_id * cores_per_socket + i; + cores_.at(i) = Core::Create(core_id, this, arch, dcrs); + + cores_.at(i)->icache_req_ports.at(0).bind(&icaches_->CoreReqPorts.at(i).at(0)); + icaches_->CoreRspPorts.at(i).at(0).bind(&cores_.at(i)->icache_rsp_ports.at(0)); + + for (uint32_t j = 0; j < NUM_LSU_LANES; ++j) { + cores_.at(i)->dcache_req_ports.at(j).bind(&dcaches_->CoreReqPorts.at(i).at(j)); + dcaches_->CoreRspPorts.at(i).at(j).bind(&cores_.at(i)->dcache_rsp_ports.at(j)); + } + } +} + +Socket::~Socket() { + //-- +} + +void Socket::reset() { + //-- +} + +void Socket::tick() { + //-- +} + +void Socket::attach_ram(RAM* ram) { + for (auto core : cores_) { + core->attach_ram(ram); + } +} + +bool Socket::running() const { + for (auto& core : cores_) { + if (core->running()) + return true; + } + return false; +} + +bool Socket::check_exit(Word* exitcode, bool riscv_test) const { + bool done = true; + Word exitcode_ = 0; + for (auto& core : cores_) { + Word ec; + if (core->check_exit(&ec, riscv_test)) { + exitcode_ |= ec; + } else { + done = false; + } + } + *exitcode = exitcode_; + return done; +} + +void Socket::barrier(uint32_t bar_id, uint32_t count, uint32_t core_id) { + cluster_->barrier(bar_id, count, socket_id_ * cores_.size() + core_id); +} + +void Socket::resume(uint32_t core_index) { + cores_.at(core_index)->resume(); +} + +Socket::PerfStats Socket::perf_stats() const { + Socket::PerfStats perf; + perf.icache = icaches_->perf_stats(); + perf.dcache = dcaches_->perf_stats(); + return perf; +} \ No newline at end of file diff --git a/sim/simx/socket.h b/sim/simx/socket.h new file mode 100644 index 00000000..5c94c31f --- /dev/null +++ b/sim/simx/socket.h @@ -0,0 +1,87 @@ +// Copyright © 2019-2023 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "dcrs.h" +#include "arch.h" +#include "cache_cluster.h" +#include "shared_mem.h" +#include "core.h" +#include "constants.h" + +namespace vortex { + +class Cluster; + +class Socket : public SimObject { +public: + struct PerfStats { + CacheSim::PerfStats icache; + CacheSim::PerfStats dcache; + + PerfStats& operator+=(const PerfStats& rhs) { + this->icache += rhs.icache; + this->dcache += rhs.dcache; + return *this; + } + }; + + SimPort icache_mem_req_port; + SimPort icache_mem_rsp_port; + + SimPort dcache_mem_req_port; + SimPort dcache_mem_rsp_port; + + Socket(const SimContext& ctx, + uint32_t socket_id, + Cluster* cluster, + const Arch &arch, + const DCRS &dcrs); + + ~Socket(); + + uint32_t id() const { + return socket_id_; + } + + Cluster* cluster() const { + return cluster_; + } + + void reset(); + + void tick(); + + void attach_ram(RAM* ram); + + bool running() const; + + bool check_exit(Word* exitcode, bool riscv_test) const; + + void barrier(uint32_t bar_id, uint32_t count, uint32_t core_id); + + void resume(uint32_t core_id); + + Socket::PerfStats perf_stats() const; + +private: + uint32_t socket_id_; + std::vector cores_; + CacheCluster::Ptr icaches_; + CacheCluster::Ptr dcaches_; + Cluster* cluster_; +}; + +} // namespace vortex \ No newline at end of file diff --git a/sim/simx/types.h b/sim/simx/types.h index 6bba7f9c..d3fcfa1a 100644 --- a/sim/simx/types.h +++ b/sim/simx/types.h @@ -70,6 +70,7 @@ inline std::ostream &operator<<(std::ostream &os, const RegType& type) { case RegType::Integer: os << "x"; break; case RegType::Float: os << "f"; break; case RegType::Vector: os << "v"; break; + default: assert(false); } return os; } @@ -112,6 +113,7 @@ inline std::ostream &operator<<(std::ostream &os, const AluType& type) { case AluType::SYSCALL: os << "SYSCALL"; break; case AluType::IMUL: os << "IMUL"; break; case AluType::IDIV: os << "IDIV"; break; + default: assert(false); } return os; } @@ -129,6 +131,7 @@ inline std::ostream &operator<<(std::ostream &os, const LsuType& type) { case LsuType::LOAD: os << "LOAD"; break; case LsuType::STORE: os << "STORE"; break; case LsuType::FENCE: os << "FENCE"; break; + default: assert(false); } return os; } @@ -146,6 +149,7 @@ inline std::ostream &operator<<(std::ostream &os, const AddrType& type) { case AddrType::Global: os << "Global"; break; case AddrType::Shared: os << "Shared"; break; case AddrType::IO: os << "IO"; break; + default: assert(false); } return os; } @@ -174,6 +178,7 @@ inline std::ostream &operator<<(std::ostream &os, const FpuType& type) { case FpuType::FDIV: os << "FDIV"; break; case FpuType::FSQRT: os << "FSQRT"; break; case FpuType::FCVT: os << "FCVT"; break; + default: assert(false); } return os; } @@ -205,6 +210,7 @@ inline std::ostream &operator<<(std::ostream &os, const SfuType& type) { case SfuType::CSRRS: os << "CSRRS"; break; case SfuType::CSRRC: os << "CSRRC"; break; case SfuType::CMOV: os << "CMOV"; break; + default: assert(false); } return os; } @@ -220,6 +226,7 @@ inline std::ostream &operator<<(std::ostream &os, const ArbiterType& type) { switch (type) { case ArbiterType::Priority: os << "Priority"; break; case ArbiterType::RoundRobin: os << "RoundRobin"; break; + default: assert(false); } return os; }