adding tracking for SFU stalls
This commit is contained in:
@@ -45,6 +45,15 @@ module VX_cluster import VX_gpu_pkg::*; #(
|
|||||||
`SCOPE_IO_SWITCH (scope_socket + `NUM_SOCKETS);
|
`SCOPE_IO_SWITCH (scope_socket + `NUM_SOCKETS);
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
|
`ifdef PERF_ENABLE
|
||||||
|
VX_mem_perf_if mem_perf_tmp_if();
|
||||||
|
assign mem_perf_tmp_if.icache = 'x;
|
||||||
|
assign mem_perf_tmp_if.dcache = 'x;
|
||||||
|
assign mem_perf_tmp_if.l3cache = mem_perf_if.l3cache;
|
||||||
|
assign mem_perf_tmp_if.smem = 'x;
|
||||||
|
assign mem_perf_tmp_if.mem = mem_perf_if.mem;
|
||||||
|
`endif
|
||||||
|
|
||||||
`ifdef GBAR_ENABLE
|
`ifdef GBAR_ENABLE
|
||||||
|
|
||||||
VX_gbar_bus_if per_socket_gbar_bus_if[`NUM_SOCKETS]();
|
VX_gbar_bus_if per_socket_gbar_bus_if[`NUM_SOCKETS]();
|
||||||
@@ -69,24 +78,68 @@ module VX_cluster import VX_gpu_pkg::*; #(
|
|||||||
.reset (gbar_reset),
|
.reset (gbar_reset),
|
||||||
.gbar_bus_if (gbar_bus_if)
|
.gbar_bus_if (gbar_bus_if)
|
||||||
);
|
);
|
||||||
`endif
|
|
||||||
|
|
||||||
`ifdef PERF_ENABLE
|
|
||||||
VX_mem_perf_if mem_perf_tmp_if();
|
|
||||||
cache_perf_t perf_l2cache;
|
|
||||||
|
|
||||||
assign mem_perf_tmp_if.icache = 'x;
|
|
||||||
assign mem_perf_tmp_if.dcache = 'x;
|
|
||||||
assign mem_perf_tmp_if.l2cache = perf_l2cache;
|
|
||||||
assign mem_perf_tmp_if.l3cache = mem_perf_if.l3cache;
|
|
||||||
assign mem_perf_tmp_if.smem = 'x;
|
|
||||||
assign mem_perf_tmp_if.mem = mem_perf_if.mem;
|
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
VX_mem_bus_if #(
|
VX_mem_bus_if #(
|
||||||
.DATA_SIZE (`L1_LINE_SIZE),
|
.DATA_SIZE (L2_WORD_SIZE),
|
||||||
.TAG_WIDTH (L1_MEM_TAG_WIDTH)
|
.TAG_WIDTH (L2_TAG_WIDTH)
|
||||||
) l1_mem_bus_if[2]();
|
) l2_mem_bus_if[L2_NUM_REQS]();
|
||||||
|
|
||||||
|
VX_mem_bus_if #(
|
||||||
|
.DATA_SIZE (ICACHE_LINE_SIZE),
|
||||||
|
.TAG_WIDTH (ICACHE_MEM_TAG_WIDTH)
|
||||||
|
) per_socket_icache_mem_bus_if[`NUM_SOCKETS]();
|
||||||
|
|
||||||
|
VX_mem_bus_if #(
|
||||||
|
.DATA_SIZE (DCACHE_LINE_SIZE),
|
||||||
|
.TAG_WIDTH (DCACHE_MEM_TAG_WIDTH)
|
||||||
|
) per_socket_dcache_mem_bus_if[`NUM_SOCKETS]();
|
||||||
|
|
||||||
|
VX_mem_bus_if #(
|
||||||
|
.DATA_SIZE (ICACHE_LINE_SIZE),
|
||||||
|
.TAG_WIDTH (ICACHE_MEM_ARB_TAG_WIDTH)
|
||||||
|
) icache_mem_bus_if[1]();
|
||||||
|
|
||||||
|
VX_mem_bus_if #(
|
||||||
|
.DATA_SIZE (DCACHE_LINE_SIZE),
|
||||||
|
.TAG_WIDTH (DCACHE_MEM_ARB_TAG_WIDTH)
|
||||||
|
) dcache_mem_bus_if[1]();
|
||||||
|
|
||||||
|
`RESET_RELAY (l1_mem_arb_reset, reset);
|
||||||
|
|
||||||
|
VX_mem_arb #(
|
||||||
|
.NUM_INPUTS (`NUM_SOCKETS),
|
||||||
|
.DATA_SIZE (ICACHE_LINE_SIZE),
|
||||||
|
.TAG_WIDTH (ICACHE_MEM_TAG_WIDTH),
|
||||||
|
.TAG_SEL_IDX (1), // Skip 0 for NC flag
|
||||||
|
.ARBITER ("R"),
|
||||||
|
.OUT_REG_REQ (2),
|
||||||
|
.OUT_REG_RSP (2)
|
||||||
|
) icache_mem_arb (
|
||||||
|
.clk (clk),
|
||||||
|
.reset (l1_mem_arb_reset),
|
||||||
|
.bus_in_if (per_socket_icache_mem_bus_if),
|
||||||
|
.bus_out_if (icache_mem_bus_if)
|
||||||
|
);
|
||||||
|
|
||||||
|
VX_mem_arb #(
|
||||||
|
.NUM_INPUTS (`NUM_SOCKETS),
|
||||||
|
.DATA_SIZE (DCACHE_LINE_SIZE),
|
||||||
|
.TAG_WIDTH (DCACHE_MEM_TAG_WIDTH),
|
||||||
|
.TAG_SEL_IDX (1), // Skip 0 for NC flag
|
||||||
|
.ARBITER ("R"),
|
||||||
|
.OUT_REG_REQ (2),
|
||||||
|
.OUT_REG_RSP (2)
|
||||||
|
) dcache_mem_arb (
|
||||||
|
.clk (clk),
|
||||||
|
.reset (l1_mem_arb_reset),
|
||||||
|
.bus_in_if (per_socket_dcache_mem_bus_if),
|
||||||
|
.bus_out_if (dcache_mem_bus_if)
|
||||||
|
);
|
||||||
|
|
||||||
|
`ASSIGN_VX_MEM_BUS_IF_X (l2_mem_bus_if[ICACHE_MEM_ARB_IDX], icache_mem_bus_if[0], L1_MEM_TAG_WIDTH, ICACHE_MEM_ARB_TAG_WIDTH);
|
||||||
|
`ASSIGN_VX_MEM_BUS_IF_X (l2_mem_bus_if[DCACHE_MEM_ARB_IDX], dcache_mem_bus_if[0], L1_MEM_TAG_WIDTH, DCACHE_MEM_ARB_TAG_WIDTH);
|
||||||
|
|
||||||
`RESET_RELAY (l2_reset, reset);
|
`RESET_RELAY (l2_reset, reset);
|
||||||
|
|
||||||
@@ -113,67 +166,12 @@ module VX_cluster import VX_gpu_pkg::*; #(
|
|||||||
.clk (clk),
|
.clk (clk),
|
||||||
.reset (l2_reset),
|
.reset (l2_reset),
|
||||||
`ifdef PERF_ENABLE
|
`ifdef PERF_ENABLE
|
||||||
.cache_perf (perf_l2cache),
|
.cache_perf (mem_perf_tmp_if.l2cache),
|
||||||
`endif
|
`endif
|
||||||
.core_bus_if (l1_mem_bus_if),
|
.core_bus_if (l2_mem_bus_if),
|
||||||
.mem_bus_if (mem_bus_if)
|
.mem_bus_if (mem_bus_if)
|
||||||
);
|
);
|
||||||
|
|
||||||
VX_mem_bus_if #(
|
|
||||||
.DATA_SIZE (`L1_LINE_SIZE),
|
|
||||||
.TAG_WIDTH (ICACHE_MEM_TAG_WIDTH)
|
|
||||||
) per_socket_icache_mem_bus_if[`NUM_SOCKETS]();
|
|
||||||
|
|
||||||
VX_mem_bus_if #(
|
|
||||||
.DATA_SIZE (`L1_LINE_SIZE),
|
|
||||||
.TAG_WIDTH (DCACHE_MEM_TAG_WIDTH)
|
|
||||||
) per_socket_dcache_mem_bus_if[`NUM_SOCKETS]();
|
|
||||||
|
|
||||||
VX_mem_bus_if #(
|
|
||||||
.DATA_SIZE (ICACHE_LINE_SIZE),
|
|
||||||
.TAG_WIDTH (ICACHE_MEM_ARB_TAG_WIDTH)
|
|
||||||
) icache_mem_bus_if[1]();
|
|
||||||
|
|
||||||
VX_mem_bus_if #(
|
|
||||||
.DATA_SIZE (DCACHE_LINE_SIZE),
|
|
||||||
.TAG_WIDTH (DCACHE_MEM_ARB_TAG_WIDTH)
|
|
||||||
) dcache_mem_bus_if[1]();
|
|
||||||
|
|
||||||
`RESET_RELAY (l1_mem_arb_reset, reset);
|
|
||||||
|
|
||||||
VX_mem_arb #(
|
|
||||||
.NUM_INPUTS (`NUM_SOCKETS),
|
|
||||||
.DATA_SIZE (`L1_LINE_SIZE),
|
|
||||||
.TAG_WIDTH (ICACHE_MEM_TAG_WIDTH),
|
|
||||||
.TAG_SEL_IDX (1), // Skip 0 for NC flag
|
|
||||||
.ARBITER ("R"),
|
|
||||||
.OUT_REG_REQ (2),
|
|
||||||
.OUT_REG_RSP (2)
|
|
||||||
) icache_mem_arb (
|
|
||||||
.clk (clk),
|
|
||||||
.reset (l1_mem_arb_reset),
|
|
||||||
.bus_in_if (per_socket_icache_mem_bus_if),
|
|
||||||
.bus_out_if (icache_mem_bus_if)
|
|
||||||
);
|
|
||||||
|
|
||||||
VX_mem_arb #(
|
|
||||||
.NUM_INPUTS (`NUM_SOCKETS),
|
|
||||||
.DATA_SIZE (`L1_LINE_SIZE),
|
|
||||||
.TAG_WIDTH (DCACHE_MEM_TAG_WIDTH),
|
|
||||||
.TAG_SEL_IDX (1), // Skip 0 for NC flag
|
|
||||||
.ARBITER ("R"),
|
|
||||||
.OUT_REG_REQ (2),
|
|
||||||
.OUT_REG_RSP (2)
|
|
||||||
) dcache_mem_arb (
|
|
||||||
.clk (clk),
|
|
||||||
.reset (l1_mem_arb_reset),
|
|
||||||
.bus_in_if (per_socket_dcache_mem_bus_if),
|
|
||||||
.bus_out_if (dcache_mem_bus_if)
|
|
||||||
);
|
|
||||||
|
|
||||||
`ASSIGN_VX_MEM_BUS_IF_X (l1_mem_bus_if[0], icache_mem_bus_if[0], L1_MEM_TAG_WIDTH, ICACHE_MEM_ARB_TAG_WIDTH);
|
|
||||||
`ASSIGN_VX_MEM_BUS_IF_X (l1_mem_bus_if[1], dcache_mem_bus_if[0], L1_MEM_TAG_WIDTH, DCACHE_MEM_ARB_TAG_WIDTH);
|
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
wire [`NUM_SOCKETS-1:0] per_socket_sim_ebreak;
|
wire [`NUM_SOCKETS-1:0] per_socket_sim_ebreak;
|
||||||
@@ -201,6 +199,7 @@ module VX_cluster import VX_gpu_pkg::*; #(
|
|||||||
.SOCKET_ID ((CLUSTER_ID * `NUM_SOCKETS) + i)
|
.SOCKET_ID ((CLUSTER_ID * `NUM_SOCKETS) + i)
|
||||||
) socket (
|
) socket (
|
||||||
`SCOPE_IO_BIND (scope_socket+i)
|
`SCOPE_IO_BIND (scope_socket+i)
|
||||||
|
|
||||||
.clk (clk),
|
.clk (clk),
|
||||||
.reset (socket_reset),
|
.reset (socket_reset),
|
||||||
|
|
||||||
|
|||||||
@@ -57,10 +57,18 @@
|
|||||||
`define EX_ALU 0
|
`define EX_ALU 0
|
||||||
`define EX_LSU 1
|
`define EX_LSU 1
|
||||||
`define EX_SFU 2
|
`define EX_SFU 2
|
||||||
`define EX_FPU 3
|
`define EX_FPU (`EX_SFU + `EXT_F_ENABLED)
|
||||||
|
|
||||||
`define NUM_EX_UNITS (3 + `EXT_F_ENABLED)
|
`define NUM_EX_UNITS (3 + `EXT_F_ENABLED)
|
||||||
`define EX_BITS `CLOG2(`NUM_EX_UNITS)
|
`define EX_BITS `CLOG2(`NUM_EX_UNITS)
|
||||||
|
`define EX_WIDTH `UP(`EX_BITS)
|
||||||
|
|
||||||
|
`define SFU_CSRS 0
|
||||||
|
`define SFU_WCTL 1
|
||||||
|
|
||||||
|
`define NUM_SFU_UNITS (2)
|
||||||
|
`define SFU_BITS `CLOG2(`NUM_SFU_UNITS)
|
||||||
|
`define SFU_WIDTH `UP(`SFU_BITS)
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
@@ -369,35 +377,32 @@
|
|||||||
VX_dcr_bus_if dst(); \
|
VX_dcr_bus_if dst(); \
|
||||||
assign {dst.write_valid, dst.write_addr, dst.write_data} = __``dst
|
assign {dst.write_valid, dst.write_addr, dst.write_data} = __``dst
|
||||||
|
|
||||||
`define PERF_REDUCE(dst, src, field, width, count) \
|
`define PERF_COUNTER_ADD(dst, src, field, width, dst_count, src_count, reg_enable) \
|
||||||
wire [count-1:0][width-1:0] __reduce_add_i_``src``field; \
|
for (genvar __d = 0; __d < dst_count; ++__d) begin \
|
||||||
wire [width-1:0] __reduce_add_o_``dst``field; \
|
localparam __count = ((src_count > dst_count) ? ((src_count + dst_count - 1) / dst_count) : 1); \
|
||||||
reg [width-1:0] __reduce_add_r_``dst``field; \
|
wire [__count-1:0][width-1:0] __reduce_add_i_``src``field; \
|
||||||
for (genvar __i = 0; __i < count; ++__i) begin \
|
wire [width-1:0] __reduce_add_o_``dst``field; \
|
||||||
assign __reduce_add_i_``src``field[__i] = ``src[__i].``field; \
|
for (genvar __i = 0; __i < __count; ++__i) begin \
|
||||||
end \
|
assign __reduce_add_i_``src``field[__i] = ``src[__d * __count + __i].``field; \
|
||||||
VX_reduce #(.DATAW_IN(width), .N(count), .OP("+")) __reduce_add_``dst``field ( \
|
end \
|
||||||
__reduce_add_i_``src``field, \
|
VX_reduce #(.DATAW_IN(width), .N(__count), .OP("+")) __reduce_add_``dst``field ( \
|
||||||
__reduce_add_o_``dst``field \
|
__reduce_add_i_``src``field, \
|
||||||
); \
|
__reduce_add_o_``dst``field \
|
||||||
always @(posedge clk) begin \
|
); \
|
||||||
if (reset) begin \
|
if (reg_enable) begin \
|
||||||
__reduce_add_r_``dst``field <= '0; \
|
reg [width-1:0] __reduce_add_r_``dst``field; \
|
||||||
end else begin \
|
always @(posedge clk) begin \
|
||||||
__reduce_add_r_``dst``field <= __reduce_add_o_``dst``field; \
|
if (reset) begin \
|
||||||
end \
|
__reduce_add_r_``dst``field <= '0; \
|
||||||
end \
|
end else begin \
|
||||||
assign ``dst.``field = __reduce_add_r_``dst``field
|
__reduce_add_r_``dst``field <= __reduce_add_o_``dst``field; \
|
||||||
|
end \
|
||||||
`define PERF_CACHE_REDUCE(dst, src, count) \
|
end \
|
||||||
`PERF_REDUCE (dst, src, reads, `PERF_CTR_BITS, count); \
|
assign ``dst[__d].``field = __reduce_add_r_``dst``field; \
|
||||||
`PERF_REDUCE (dst, src, writes, `PERF_CTR_BITS, count); \
|
end else begin \
|
||||||
`PERF_REDUCE (dst, src, read_misses, `PERF_CTR_BITS, count); \
|
assign ``dst[__d].``field = __reduce_add_o_``dst``field; \
|
||||||
`PERF_REDUCE (dst, src, write_misses, `PERF_CTR_BITS, count); \
|
end \
|
||||||
`PERF_REDUCE (dst, src, bank_stalls, `PERF_CTR_BITS, count); \
|
end
|
||||||
`PERF_REDUCE (dst, src, mshr_stalls, `PERF_CTR_BITS, count); \
|
|
||||||
`PERF_REDUCE (dst, src, mem_stalls, `PERF_CTR_BITS, count); \
|
|
||||||
`PERF_REDUCE (dst, src, crsp_stalls, `PERF_CTR_BITS, count)
|
|
||||||
|
|
||||||
`define ASSIGN_BLOCKED_WID(dst, src, block_idx, block_size) \
|
`define ASSIGN_BLOCKED_WID(dst, src, block_idx, block_size) \
|
||||||
if (block_size != 1) begin \
|
if (block_size != 1) begin \
|
||||||
|
|||||||
@@ -99,7 +99,7 @@ package VX_gpu_pkg;
|
|||||||
`ifdef ICACHE_ENABLE
|
`ifdef ICACHE_ENABLE
|
||||||
localparam ICACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_MEM_TAG_WIDTH(`ICACHE_MSHR_SIZE, 1, `NUM_ICACHES);
|
localparam ICACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_MEM_TAG_WIDTH(`ICACHE_MSHR_SIZE, 1, `NUM_ICACHES);
|
||||||
`else
|
`else
|
||||||
localparam ICACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_BYPASS_TAG_WIDTH(1, ICACHE_LINE_SIZE, ICACHE_WORD_SIZE, ICACHE_TAG_WIDTH, `NUM_SOCKETS, `NUM_ICACHES);
|
localparam ICACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_BYPASS_TAG_WIDTH(1, ICACHE_LINE_SIZE, ICACHE_WORD_SIZE, ICACHE_TAG_WIDTH, `SOCKET_SIZE, `NUM_ICACHES);
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
////////////////////////// Dcache Parameters //////////////////////////////
|
////////////////////////// Dcache Parameters //////////////////////////////
|
||||||
@@ -147,6 +147,9 @@ package VX_gpu_pkg;
|
|||||||
|
|
||||||
/////////////////////////////// L2 Parameters /////////////////////////////
|
/////////////////////////////// L2 Parameters /////////////////////////////
|
||||||
|
|
||||||
|
localparam ICACHE_MEM_ARB_IDX = 0;
|
||||||
|
localparam DCACHE_MEM_ARB_IDX = ICACHE_MEM_ARB_IDX + 1;
|
||||||
|
|
||||||
// Word size in bytes
|
// Word size in bytes
|
||||||
localparam L2_WORD_SIZE = `L1_LINE_SIZE;
|
localparam L2_WORD_SIZE = `L1_LINE_SIZE;
|
||||||
|
|
||||||
|
|||||||
@@ -66,19 +66,12 @@ module VX_socket import VX_gpu_pkg::*; #(
|
|||||||
|
|
||||||
`ifdef PERF_ENABLE
|
`ifdef PERF_ENABLE
|
||||||
VX_mem_perf_if mem_perf_tmp_if();
|
VX_mem_perf_if mem_perf_tmp_if();
|
||||||
cache_perf_t perf_icache;
|
|
||||||
cache_perf_t perf_dcache;
|
|
||||||
|
|
||||||
assign mem_perf_tmp_if.icache = perf_icache;
|
|
||||||
assign mem_perf_tmp_if.dcache = perf_dcache;
|
|
||||||
assign mem_perf_tmp_if.l2cache = mem_perf_if.l2cache;
|
assign mem_perf_tmp_if.l2cache = mem_perf_if.l2cache;
|
||||||
assign mem_perf_tmp_if.l3cache = mem_perf_if.l3cache;
|
assign mem_perf_tmp_if.l3cache = mem_perf_if.l3cache;
|
||||||
assign mem_perf_tmp_if.smem = 'x;
|
assign mem_perf_tmp_if.smem = 'x;
|
||||||
assign mem_perf_tmp_if.mem = mem_perf_if.mem;
|
assign mem_perf_tmp_if.mem = mem_perf_if.mem;
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
VX_mem_bus_if #(
|
VX_mem_bus_if #(
|
||||||
@@ -110,7 +103,7 @@ module VX_socket import VX_gpu_pkg::*; #(
|
|||||||
.MEM_OUT_REG (2)
|
.MEM_OUT_REG (2)
|
||||||
) icache (
|
) icache (
|
||||||
`ifdef PERF_ENABLE
|
`ifdef PERF_ENABLE
|
||||||
.cache_perf (perf_icache),
|
.cache_perf (mem_perf_tmp_if.icache),
|
||||||
`endif
|
`endif
|
||||||
.clk (clk),
|
.clk (clk),
|
||||||
.reset (icache_reset),
|
.reset (icache_reset),
|
||||||
@@ -150,7 +143,7 @@ module VX_socket import VX_gpu_pkg::*; #(
|
|||||||
.MEM_OUT_REG (2)
|
.MEM_OUT_REG (2)
|
||||||
) dcache (
|
) dcache (
|
||||||
`ifdef PERF_ENABLE
|
`ifdef PERF_ENABLE
|
||||||
.cache_perf (perf_dcache),
|
.cache_perf (mem_perf_tmp_if.dcache),
|
||||||
`endif
|
`endif
|
||||||
.clk (clk),
|
.clk (clk),
|
||||||
.reset (dcache_reset),
|
.reset (dcache_reset),
|
||||||
|
|||||||
@@ -97,6 +97,11 @@
|
|||||||
`define VX_CSR_MPM_IFETCH_LT_H 12'hB8E
|
`define VX_CSR_MPM_IFETCH_LT_H 12'hB8E
|
||||||
`define VX_CSR_MPM_LOAD_LT 12'hB0F
|
`define VX_CSR_MPM_LOAD_LT 12'hB0F
|
||||||
`define VX_CSR_MPM_LOAD_LT_H 12'hB8F
|
`define VX_CSR_MPM_LOAD_LT_H 12'hB8F
|
||||||
|
// SFU: scoreboard
|
||||||
|
`define VX_CSR_MPM_SCRB_WCTL 12'hB10
|
||||||
|
`define VX_CSR_MPM_SCRB_WCTL_H 12'hB90
|
||||||
|
`define VX_CSR_MPM_SCRB_CSRS 12'hB11
|
||||||
|
`define VX_CSR_MPM_SCRB_CSRS_H 12'hB91
|
||||||
|
|
||||||
// Machine Performance-monitoring memory counters
|
// Machine Performance-monitoring memory counters
|
||||||
// PERF: icache
|
// PERF: icache
|
||||||
|
|||||||
@@ -46,15 +46,9 @@ module Vortex import VX_gpu_pkg::*; (
|
|||||||
|
|
||||||
`ifdef PERF_ENABLE
|
`ifdef PERF_ENABLE
|
||||||
VX_mem_perf_if mem_perf_if();
|
VX_mem_perf_if mem_perf_if();
|
||||||
cache_perf_t perf_l3cache;
|
|
||||||
mem_perf_t mem_perf;
|
|
||||||
|
|
||||||
assign mem_perf_if.smem = 'x;
|
|
||||||
assign mem_perf_if.icache = 'x;
|
assign mem_perf_if.icache = 'x;
|
||||||
assign mem_perf_if.dcache = 'x;
|
assign mem_perf_if.dcache = 'x;
|
||||||
assign mem_perf_if.l2cache = 'x;
|
assign mem_perf_if.l2cache = 'x;
|
||||||
assign mem_perf_if.l3cache = perf_l3cache;
|
|
||||||
assign mem_perf_if.mem = mem_perf;
|
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
VX_mem_bus_if #(
|
VX_mem_bus_if #(
|
||||||
@@ -93,7 +87,7 @@ module Vortex import VX_gpu_pkg::*; (
|
|||||||
.reset (l3_reset),
|
.reset (l3_reset),
|
||||||
|
|
||||||
`ifdef PERF_ENABLE
|
`ifdef PERF_ENABLE
|
||||||
.cache_perf (perf_l3cache),
|
.cache_perf (mem_perf_if.l3cache),
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
.core_bus_if (per_cluster_mem_bus_if),
|
.core_bus_if (per_cluster_mem_bus_if),
|
||||||
@@ -171,6 +165,7 @@ module Vortex import VX_gpu_pkg::*; (
|
|||||||
`ifdef PERF_ENABLE
|
`ifdef PERF_ENABLE
|
||||||
|
|
||||||
reg [`PERF_CTR_BITS-1:0] perf_mem_pending_reads;
|
reg [`PERF_CTR_BITS-1:0] perf_mem_pending_reads;
|
||||||
|
mem_perf_t mem_perf;
|
||||||
|
|
||||||
always @(posedge clk) begin
|
always @(posedge clk) begin
|
||||||
if (reset) begin
|
if (reset) begin
|
||||||
@@ -193,6 +188,7 @@ module Vortex import VX_gpu_pkg::*; (
|
|||||||
mem_perf.latency <= mem_perf.latency + perf_mem_pending_reads;
|
mem_perf.latency <= mem_perf.latency + perf_mem_pending_reads;
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
assign mem_perf_if.mem = mem_perf;
|
||||||
|
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
|
|||||||
5
hw/rtl/cache/VX_cache_cluster.sv
vendored
5
hw/rtl/cache/VX_cache_cluster.sv
vendored
@@ -83,8 +83,9 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
|
|||||||
`STATIC_ASSERT(NUM_INPUTS >= NUM_CACHES, ("invalid parameter"))
|
`STATIC_ASSERT(NUM_INPUTS >= NUM_CACHES, ("invalid parameter"))
|
||||||
|
|
||||||
`ifdef PERF_ENABLE
|
`ifdef PERF_ENABLE
|
||||||
cache_perf_t perf_cache_unit[NUM_CACHES];
|
cache_perf_t perf_cache_tmp[1], perf_cache_unit[NUM_CACHES];
|
||||||
`PERF_CACHE_REDUCE (cache_perf, perf_cache_unit, NUM_CACHES);
|
`PERF_CACHE_ADD (perf_cache_tmp, perf_cache_unit, 1, NUM_CACHES)
|
||||||
|
assign cache_perf = perf_cache_tmp[0];
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
VX_mem_bus_if #(
|
VX_mem_bus_if #(
|
||||||
|
|||||||
12
hw/rtl/cache/VX_cache_define.vh
vendored
12
hw/rtl/cache/VX_cache_define.vh
vendored
@@ -62,4 +62,16 @@
|
|||||||
`define CS_LINE_TO_FULL_ADDR(x, i) {x, (`XLEN-$bits(x))'(i << (`XLEN-$bits(x)-`CS_BANK_SEL_BITS))}
|
`define CS_LINE_TO_FULL_ADDR(x, i) {x, (`XLEN-$bits(x))'(i << (`XLEN-$bits(x)-`CS_BANK_SEL_BITS))}
|
||||||
`define CS_MEM_TO_FULL_ADDR(x) {x, (`XLEN-$bits(x))'(0)}
|
`define CS_MEM_TO_FULL_ADDR(x) {x, (`XLEN-$bits(x))'(0)}
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
`define PERF_CACHE_ADD(dst, src, dcount, scount) \
|
||||||
|
`PERF_COUNTER_ADD (dst, src, reads, `PERF_CTR_BITS, dcount, scount, (((scount + dcount - 1) / dcount) > 1)) \
|
||||||
|
`PERF_COUNTER_ADD (dst, src, writes, `PERF_CTR_BITS, dcount, scount, (((scount + dcount - 1) / dcount) > 1)) \
|
||||||
|
`PERF_COUNTER_ADD (dst, src, read_misses, `PERF_CTR_BITS, dcount, scount, (((scount + dcount - 1) / dcount) > 1)) \
|
||||||
|
`PERF_COUNTER_ADD (dst, src, write_misses, `PERF_CTR_BITS, dcount, scount, (((scount + dcount - 1) / dcount) > 1)) \
|
||||||
|
`PERF_COUNTER_ADD (dst, src, bank_stalls, `PERF_CTR_BITS, dcount, scount, (((scount + dcount - 1) / dcount) > 1)) \
|
||||||
|
`PERF_COUNTER_ADD (dst, src, mshr_stalls, `PERF_CTR_BITS, dcount, scount, (((scount + dcount - 1) / dcount) > 1)) \
|
||||||
|
`PERF_COUNTER_ADD (dst, src, mem_stalls, `PERF_CTR_BITS, dcount, scount, (((scount + dcount - 1) / dcount) > 1)) \
|
||||||
|
`PERF_COUNTER_ADD (dst, src, crsp_stalls, `PERF_CTR_BITS, dcount, scount, (((scount + dcount - 1) / dcount) > 1))
|
||||||
|
|
||||||
`endif // VX_CACHE_DEFINE_VH
|
`endif // VX_CACHE_DEFINE_VH
|
||||||
|
|||||||
@@ -77,20 +77,14 @@ module VX_core import VX_gpu_pkg::*; #(
|
|||||||
) dcache_bus_tmp_if[DCACHE_NUM_REQS]();
|
) dcache_bus_tmp_if[DCACHE_NUM_REQS]();
|
||||||
|
|
||||||
`ifdef PERF_ENABLE
|
`ifdef PERF_ENABLE
|
||||||
VX_pipeline_perf_if pipeline_perf_if();
|
|
||||||
VX_mem_perf_if mem_perf_tmp_if();
|
VX_mem_perf_if mem_perf_tmp_if();
|
||||||
|
VX_pipeline_perf_if pipeline_perf_if();
|
||||||
|
|
||||||
assign mem_perf_tmp_if.icache = mem_perf_if.icache;
|
assign mem_perf_tmp_if.icache = mem_perf_if.icache;
|
||||||
assign mem_perf_tmp_if.dcache = mem_perf_if.dcache;
|
assign mem_perf_tmp_if.dcache = mem_perf_if.dcache;
|
||||||
assign mem_perf_tmp_if.l2cache = mem_perf_if.l2cache;
|
assign mem_perf_tmp_if.l2cache = mem_perf_if.l2cache;
|
||||||
assign mem_perf_tmp_if.l3cache = mem_perf_if.l3cache;
|
assign mem_perf_tmp_if.l3cache = mem_perf_if.l3cache;
|
||||||
`ifdef SM_ENABLE
|
assign mem_perf_tmp_if.mem = mem_perf_if.mem;
|
||||||
cache_perf_t smem_perf;
|
|
||||||
assign mem_perf_tmp_if.smem = smem_perf;
|
|
||||||
`else
|
|
||||||
assign mem_perf_tmp_if.smem = '0;
|
|
||||||
`endif
|
|
||||||
assign mem_perf_tmp_if.mem = mem_perf_if.mem;
|
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
`RESET_RELAY (dcr_data_reset, reset);
|
`RESET_RELAY (dcr_data_reset, reset);
|
||||||
@@ -250,7 +244,7 @@ module VX_core import VX_gpu_pkg::*; #(
|
|||||||
.clk (clk),
|
.clk (clk),
|
||||||
.reset (reset),
|
.reset (reset),
|
||||||
`ifdef PERF_ENABLE
|
`ifdef PERF_ENABLE
|
||||||
.cache_perf (smem_perf),
|
.cache_perf (mem_perf_tmp_if.smem),
|
||||||
`endif
|
`endif
|
||||||
.dcache_bus_in_if (dcache_bus_tmp_if),
|
.dcache_bus_in_if (dcache_bus_tmp_if),
|
||||||
.dcache_bus_out_if (dcache_bus_if)
|
.dcache_bus_out_if (dcache_bus_if)
|
||||||
|
|||||||
@@ -130,11 +130,11 @@ module VX_core_top import VX_gpu_pkg::*; #(
|
|||||||
|
|
||||||
`ifdef PERF_ENABLE
|
`ifdef PERF_ENABLE
|
||||||
VX_mem_perf_if mem_perf_if();
|
VX_mem_perf_if mem_perf_if();
|
||||||
assign mem_perf_if.smem = '0;
|
|
||||||
assign mem_perf_if.icache = '0;
|
assign mem_perf_if.icache = '0;
|
||||||
assign mem_perf_if.dcache = '0;
|
assign mem_perf_if.dcache = '0;
|
||||||
assign mem_perf_if.l2cache = '0;
|
assign mem_perf_if.l2cache = '0;
|
||||||
assign mem_perf_if.l3cache = '0;
|
assign mem_perf_if.l3cache = '0;
|
||||||
|
assign mem_perf_if.smem = '0;
|
||||||
assign mem_perf_if.mem = '0;
|
assign mem_perf_if.mem = '0;
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
|
|||||||
@@ -33,7 +33,6 @@ import VX_fpu_pkg::*;
|
|||||||
`ifdef PERF_ENABLE
|
`ifdef PERF_ENABLE
|
||||||
VX_mem_perf_if.slave mem_perf_if,
|
VX_mem_perf_if.slave mem_perf_if,
|
||||||
VX_pipeline_perf_if.slave pipeline_perf_if,
|
VX_pipeline_perf_if.slave pipeline_perf_if,
|
||||||
VX_sfu_perf_if.slave sfu_perf_if,
|
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
VX_commit_csr_if.slave commit_csr_if,
|
VX_commit_csr_if.slave commit_csr_if,
|
||||||
@@ -187,103 +186,107 @@ import VX_fpu_pkg::*;
|
|||||||
`VX_DCR_MPM_CLASS_CORE: begin
|
`VX_DCR_MPM_CLASS_CORE: begin
|
||||||
case (read_addr)
|
case (read_addr)
|
||||||
// PERF: pipeline
|
// PERF: pipeline
|
||||||
`VX_CSR_MPM_SCHED_ID : read_data_ro_r = pipeline_perf_if.sched_idles[31:0];
|
`VX_CSR_MPM_SCHED_ID : read_data_ro_r = pipeline_perf_if.sched_idles[31:0];
|
||||||
`VX_CSR_MPM_SCHED_ID_H : read_data_ro_r = 32'(pipeline_perf_if.sched_idles[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_SCHED_ID_H : read_data_ro_r = 32'(pipeline_perf_if.sched_idles[`PERF_CTR_BITS-1:32]);
|
||||||
`VX_CSR_MPM_SCHED_ST : read_data_ro_r = pipeline_perf_if.sched_stalls[31:0];
|
`VX_CSR_MPM_SCHED_ST : read_data_ro_r = pipeline_perf_if.sched_stalls[31:0];
|
||||||
`VX_CSR_MPM_SCHED_ST_H : read_data_ro_r = 32'(pipeline_perf_if.sched_stalls[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_SCHED_ST_H : read_data_ro_r = 32'(pipeline_perf_if.sched_stalls[`PERF_CTR_BITS-1:32]);
|
||||||
`VX_CSR_MPM_IBUF_ST : read_data_ro_r = pipeline_perf_if.ibf_stalls[31:0];
|
`VX_CSR_MPM_IBUF_ST : read_data_ro_r = pipeline_perf_if.ibf_stalls[31:0];
|
||||||
`VX_CSR_MPM_IBUF_ST_H : read_data_ro_r = 32'(pipeline_perf_if.ibf_stalls[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_IBUF_ST_H : read_data_ro_r = 32'(pipeline_perf_if.ibf_stalls[`PERF_CTR_BITS-1:32]);
|
||||||
`VX_CSR_MPM_SCRB_ST : read_data_ro_r = pipeline_perf_if.scb_stalls[31:0];
|
`VX_CSR_MPM_SCRB_ST : read_data_ro_r = pipeline_perf_if.scb_stalls[31:0];
|
||||||
`VX_CSR_MPM_SCRB_ST_H : read_data_ro_r = 32'(pipeline_perf_if.scb_stalls[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_SCRB_ST_H : read_data_ro_r = 32'(pipeline_perf_if.scb_stalls[`PERF_CTR_BITS-1:32]);
|
||||||
`VX_CSR_MPM_SCRB_ALU : read_data_ro_r = 32'(pipeline_perf_if.scb_uses[`EX_ALU][`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_SCRB_ALU : read_data_ro_r = pipeline_perf_if.units_uses[`EX_ALU][31:0];
|
||||||
`VX_CSR_MPM_SCRB_ALU_H : read_data_ro_r = pipeline_perf_if.scb_uses[`EX_ALU][31:0];
|
`VX_CSR_MPM_SCRB_ALU_H : read_data_ro_r = 32'(pipeline_perf_if.units_uses[`EX_ALU][`PERF_CTR_BITS-1:32]);
|
||||||
`ifdef EXT_F_ENABLE
|
`ifdef EXT_F_ENABLE
|
||||||
`VX_CSR_MPM_SCRB_FPU : read_data_ro_r = 32'(pipeline_perf_if.scb_uses[`EX_FPU][`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_SCRB_FPU : read_data_ro_r = pipeline_perf_if.units_uses[`EX_FPU][31:0];
|
||||||
`VX_CSR_MPM_SCRB_FPU_H : read_data_ro_r = pipeline_perf_if.scb_uses[`EX_FPU][31:0];
|
`VX_CSR_MPM_SCRB_FPU_H : read_data_ro_r = 32'(pipeline_perf_if.units_uses[`EX_FPU][`PERF_CTR_BITS-1:32]);
|
||||||
`else
|
`else
|
||||||
`VX_CSR_MPM_SCRB_FPU : read_data_ro_r = '0;
|
`VX_CSR_MPM_SCRB_FPU : read_data_ro_r = '0;
|
||||||
`VX_CSR_MPM_SCRB_FPU_H : read_data_ro_r = '0;
|
`VX_CSR_MPM_SCRB_FPU_H : read_data_ro_r = '0;
|
||||||
`endif
|
`endif
|
||||||
`VX_CSR_MPM_SCRB_LSU : read_data_ro_r = 32'(pipeline_perf_if.scb_uses[`EX_LSU][`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_SCRB_LSU : read_data_ro_r = pipeline_perf_if.units_uses[`EX_LSU][31:0];
|
||||||
`VX_CSR_MPM_SCRB_LSU_H : read_data_ro_r = pipeline_perf_if.scb_uses[`EX_LSU][31:0];
|
`VX_CSR_MPM_SCRB_LSU_H : read_data_ro_r = 32'(pipeline_perf_if.units_uses[`EX_LSU][`PERF_CTR_BITS-1:32]);
|
||||||
`VX_CSR_MPM_SCRB_SFU : read_data_ro_r = 32'(pipeline_perf_if.scb_uses[`EX_SFU][`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_SCRB_SFU : read_data_ro_r = pipeline_perf_if.units_uses[`EX_SFU][31:0];
|
||||||
`VX_CSR_MPM_SCRB_SFU_H : read_data_ro_r = pipeline_perf_if.scb_uses[`EX_SFU][31:0];
|
`VX_CSR_MPM_SCRB_SFU_H : read_data_ro_r = 32'(pipeline_perf_if.units_uses[`EX_SFU][`PERF_CTR_BITS-1:32]);
|
||||||
|
`VX_CSR_MPM_SCRB_CSRS : read_data_ro_r = pipeline_perf_if.sfu_uses[`SFU_CSRS][31:0];
|
||||||
|
`VX_CSR_MPM_SCRB_CSRS_H : read_data_ro_r = 32'(pipeline_perf_if.sfu_uses[`SFU_CSRS][`PERF_CTR_BITS-1:32]);
|
||||||
|
`VX_CSR_MPM_SCRB_WCTL : read_data_ro_r = pipeline_perf_if.sfu_uses[`SFU_WCTL][31:0];
|
||||||
|
`VX_CSR_MPM_SCRB_WCTL_H : read_data_ro_r = 32'(pipeline_perf_if.sfu_uses[`SFU_WCTL][`PERF_CTR_BITS-1:32]);
|
||||||
// PERF: memory
|
// PERF: memory
|
||||||
`VX_CSR_MPM_IFETCHES : read_data_ro_r = pipeline_perf_if.ifetches[31:0];
|
`VX_CSR_MPM_IFETCHES : read_data_ro_r = pipeline_perf_if.ifetches[31:0];
|
||||||
`VX_CSR_MPM_IFETCHES_H : read_data_ro_r = 32'(pipeline_perf_if.ifetches[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_IFETCHES_H : read_data_ro_r = 32'(pipeline_perf_if.ifetches[`PERF_CTR_BITS-1:32]);
|
||||||
`VX_CSR_MPM_LOADS : read_data_ro_r = pipeline_perf_if.loads[31:0];
|
`VX_CSR_MPM_LOADS : read_data_ro_r = pipeline_perf_if.loads[31:0];
|
||||||
`VX_CSR_MPM_LOADS_H : read_data_ro_r = 32'(pipeline_perf_if.loads[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_LOADS_H : read_data_ro_r = 32'(pipeline_perf_if.loads[`PERF_CTR_BITS-1:32]);
|
||||||
`VX_CSR_MPM_STORES : read_data_ro_r = pipeline_perf_if.stores[31:0];
|
`VX_CSR_MPM_STORES : read_data_ro_r = pipeline_perf_if.stores[31:0];
|
||||||
`VX_CSR_MPM_STORES_H : read_data_ro_r = 32'(pipeline_perf_if.stores[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_STORES_H : read_data_ro_r = 32'(pipeline_perf_if.stores[`PERF_CTR_BITS-1:32]);
|
||||||
`VX_CSR_MPM_IFETCH_LT : read_data_ro_r = pipeline_perf_if.ifetch_latency[31:0];
|
`VX_CSR_MPM_IFETCH_LT : read_data_ro_r = pipeline_perf_if.ifetch_latency[31:0];
|
||||||
`VX_CSR_MPM_IFETCH_LT_H : read_data_ro_r = 32'(pipeline_perf_if.ifetch_latency[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_IFETCH_LT_H : read_data_ro_r = 32'(pipeline_perf_if.ifetch_latency[`PERF_CTR_BITS-1:32]);
|
||||||
`VX_CSR_MPM_LOAD_LT : read_data_ro_r = pipeline_perf_if.load_latency[31:0];
|
`VX_CSR_MPM_LOAD_LT : read_data_ro_r = pipeline_perf_if.load_latency[31:0];
|
||||||
`VX_CSR_MPM_LOAD_LT_H : read_data_ro_r = 32'(pipeline_perf_if.load_latency[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_LOAD_LT_H : read_data_ro_r = 32'(pipeline_perf_if.load_latency[`PERF_CTR_BITS-1:32]);
|
||||||
default:;
|
default:;
|
||||||
endcase
|
endcase
|
||||||
end
|
end
|
||||||
`VX_DCR_MPM_CLASS_MEM: begin
|
`VX_DCR_MPM_CLASS_MEM: begin
|
||||||
case (read_addr)
|
case (read_addr)
|
||||||
// PERF: icache
|
// PERF: icache
|
||||||
`VX_CSR_MPM_ICACHE_READS : read_data_ro_r = mem_perf_if.icache.reads[31:0];
|
`VX_CSR_MPM_ICACHE_READS : read_data_ro_r = mem_perf_if.icache.reads[31:0];
|
||||||
`VX_CSR_MPM_ICACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.icache.reads[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_ICACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.icache.reads[`PERF_CTR_BITS-1:32]);
|
||||||
`VX_CSR_MPM_ICACHE_MISS_R : read_data_ro_r = mem_perf_if.icache.read_misses[31:0];
|
`VX_CSR_MPM_ICACHE_MISS_R : read_data_ro_r = mem_perf_if.icache.read_misses[31:0];
|
||||||
`VX_CSR_MPM_ICACHE_MISS_R_H : read_data_ro_r = 32'(mem_perf_if.icache.read_misses[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_ICACHE_MISS_R_H : read_data_ro_r = 32'(mem_perf_if.icache.read_misses[`PERF_CTR_BITS-1:32]);
|
||||||
`VX_CSR_MPM_ICACHE_MSHR_ST : read_data_ro_r = mem_perf_if.icache.mshr_stalls[31:0];
|
`VX_CSR_MPM_ICACHE_MSHR_ST : read_data_ro_r = mem_perf_if.icache.mshr_stalls[31:0];
|
||||||
`VX_CSR_MPM_ICACHE_MSHR_ST_H : read_data_ro_r = 32'(mem_perf_if.icache.mshr_stalls[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_ICACHE_MSHR_ST_H: read_data_ro_r = 32'(mem_perf_if.icache.mshr_stalls[`PERF_CTR_BITS-1:32]);
|
||||||
// PERF: dcache
|
// PERF: dcache
|
||||||
`VX_CSR_MPM_DCACHE_READS : read_data_ro_r = mem_perf_if.dcache.reads[31:0];
|
`VX_CSR_MPM_DCACHE_READS : read_data_ro_r = mem_perf_if.dcache.reads[31:0];
|
||||||
`VX_CSR_MPM_DCACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.dcache.reads[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_DCACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.dcache.reads[`PERF_CTR_BITS-1:32]);
|
||||||
`VX_CSR_MPM_DCACHE_WRITES : read_data_ro_r = mem_perf_if.dcache.writes[31:0];
|
`VX_CSR_MPM_DCACHE_WRITES : read_data_ro_r = mem_perf_if.dcache.writes[31:0];
|
||||||
`VX_CSR_MPM_DCACHE_WRITES_H : read_data_ro_r = 32'(mem_perf_if.dcache.writes[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_DCACHE_WRITES_H : read_data_ro_r = 32'(mem_perf_if.dcache.writes[`PERF_CTR_BITS-1:32]);
|
||||||
`VX_CSR_MPM_DCACHE_MISS_R : read_data_ro_r = mem_perf_if.dcache.read_misses[31:0];
|
`VX_CSR_MPM_DCACHE_MISS_R : read_data_ro_r = mem_perf_if.dcache.read_misses[31:0];
|
||||||
`VX_CSR_MPM_DCACHE_MISS_R_H : read_data_ro_r = 32'(mem_perf_if.dcache.read_misses[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_DCACHE_MISS_R_H : read_data_ro_r = 32'(mem_perf_if.dcache.read_misses[`PERF_CTR_BITS-1:32]);
|
||||||
`VX_CSR_MPM_DCACHE_MISS_W : read_data_ro_r = mem_perf_if.dcache.write_misses[31:0];
|
`VX_CSR_MPM_DCACHE_MISS_W : read_data_ro_r = mem_perf_if.dcache.write_misses[31:0];
|
||||||
`VX_CSR_MPM_DCACHE_MISS_W_H : read_data_ro_r = 32'(mem_perf_if.dcache.write_misses[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_DCACHE_MISS_W_H : read_data_ro_r = 32'(mem_perf_if.dcache.write_misses[`PERF_CTR_BITS-1:32]);
|
||||||
`VX_CSR_MPM_DCACHE_BANK_ST : read_data_ro_r = mem_perf_if.dcache.bank_stalls[31:0];
|
`VX_CSR_MPM_DCACHE_BANK_ST : read_data_ro_r = mem_perf_if.dcache.bank_stalls[31:0];
|
||||||
`VX_CSR_MPM_DCACHE_BANK_ST_H : read_data_ro_r = 32'(mem_perf_if.dcache.bank_stalls[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_DCACHE_BANK_ST_H: read_data_ro_r = 32'(mem_perf_if.dcache.bank_stalls[`PERF_CTR_BITS-1:32]);
|
||||||
`VX_CSR_MPM_DCACHE_MSHR_ST : read_data_ro_r = mem_perf_if.dcache.mshr_stalls[31:0];
|
`VX_CSR_MPM_DCACHE_MSHR_ST : read_data_ro_r = mem_perf_if.dcache.mshr_stalls[31:0];
|
||||||
`VX_CSR_MPM_DCACHE_MSHR_ST_H : read_data_ro_r = 32'(mem_perf_if.dcache.mshr_stalls[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_DCACHE_MSHR_ST_H: read_data_ro_r = 32'(mem_perf_if.dcache.mshr_stalls[`PERF_CTR_BITS-1:32]);
|
||||||
// PERF: smem
|
// PERF: smem
|
||||||
`VX_CSR_MPM_SMEM_READS : read_data_ro_r = mem_perf_if.smem.reads[31:0];
|
`VX_CSR_MPM_SMEM_READS : read_data_ro_r = mem_perf_if.smem.reads[31:0];
|
||||||
`VX_CSR_MPM_SMEM_READS_H : read_data_ro_r = 32'(mem_perf_if.smem.reads[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_SMEM_READS_H : read_data_ro_r = 32'(mem_perf_if.smem.reads[`PERF_CTR_BITS-1:32]);
|
||||||
`VX_CSR_MPM_SMEM_WRITES : read_data_ro_r = mem_perf_if.smem.writes[31:0];
|
`VX_CSR_MPM_SMEM_WRITES : read_data_ro_r = mem_perf_if.smem.writes[31:0];
|
||||||
`VX_CSR_MPM_SMEM_WRITES_H : read_data_ro_r = 32'(mem_perf_if.smem.writes[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_SMEM_WRITES_H : read_data_ro_r = 32'(mem_perf_if.smem.writes[`PERF_CTR_BITS-1:32]);
|
||||||
`VX_CSR_MPM_SMEM_BANK_ST : read_data_ro_r = mem_perf_if.smem.bank_stalls[31:0];
|
`VX_CSR_MPM_SMEM_BANK_ST : read_data_ro_r = mem_perf_if.smem.bank_stalls[31:0];
|
||||||
`VX_CSR_MPM_SMEM_BANK_ST_H : read_data_ro_r = 32'(mem_perf_if.smem.bank_stalls[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_SMEM_BANK_ST_H : read_data_ro_r = 32'(mem_perf_if.smem.bank_stalls[`PERF_CTR_BITS-1:32]);
|
||||||
// PERF: l2cache
|
// PERF: l2cache
|
||||||
`VX_CSR_MPM_L2CACHE_READS : read_data_ro_r = mem_perf_if.l2cache.reads[31:0];
|
`VX_CSR_MPM_L2CACHE_READS : read_data_ro_r = mem_perf_if.l2cache.reads[31:0];
|
||||||
`VX_CSR_MPM_L2CACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.l2cache.reads[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_L2CACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.l2cache.reads[`PERF_CTR_BITS-1:32]);
|
||||||
`VX_CSR_MPM_L2CACHE_WRITES : read_data_ro_r = mem_perf_if.l2cache.writes[31:0];
|
`VX_CSR_MPM_L2CACHE_WRITES : read_data_ro_r = mem_perf_if.l2cache.writes[31:0];
|
||||||
`VX_CSR_MPM_L2CACHE_WRITES_H : read_data_ro_r = 32'(mem_perf_if.l2cache.writes[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_L2CACHE_WRITES_H: read_data_ro_r = 32'(mem_perf_if.l2cache.writes[`PERF_CTR_BITS-1:32]);
|
||||||
`VX_CSR_MPM_L2CACHE_MISS_R : read_data_ro_r = mem_perf_if.l2cache.read_misses[31:0];
|
`VX_CSR_MPM_L2CACHE_MISS_R : read_data_ro_r = mem_perf_if.l2cache.read_misses[31:0];
|
||||||
`VX_CSR_MPM_L2CACHE_MISS_R_H : read_data_ro_r = 32'(mem_perf_if.l2cache.read_misses[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_L2CACHE_MISS_R_H: read_data_ro_r = 32'(mem_perf_if.l2cache.read_misses[`PERF_CTR_BITS-1:32]);
|
||||||
`VX_CSR_MPM_L2CACHE_MISS_W : read_data_ro_r = mem_perf_if.l2cache.write_misses[31:0];
|
`VX_CSR_MPM_L2CACHE_MISS_W : read_data_ro_r = mem_perf_if.l2cache.write_misses[31:0];
|
||||||
`VX_CSR_MPM_L2CACHE_MISS_W_H : read_data_ro_r = 32'(mem_perf_if.l2cache.write_misses[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_L2CACHE_MISS_W_H: read_data_ro_r = 32'(mem_perf_if.l2cache.write_misses[`PERF_CTR_BITS-1:32]);
|
||||||
`VX_CSR_MPM_L2CACHE_BANK_ST : read_data_ro_r = mem_perf_if.l2cache.bank_stalls[31:0];
|
`VX_CSR_MPM_L2CACHE_BANK_ST : read_data_ro_r = mem_perf_if.l2cache.bank_stalls[31:0];
|
||||||
`VX_CSR_MPM_L2CACHE_BANK_ST_H : read_data_ro_r = 32'(mem_perf_if.l2cache.bank_stalls[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_L2CACHE_BANK_ST_H: read_data_ro_r = 32'(mem_perf_if.l2cache.bank_stalls[`PERF_CTR_BITS-1:32]);
|
||||||
`VX_CSR_MPM_L2CACHE_MSHR_ST : read_data_ro_r = mem_perf_if.l2cache.mshr_stalls[31:0];
|
`VX_CSR_MPM_L2CACHE_MSHR_ST : read_data_ro_r = mem_perf_if.l2cache.mshr_stalls[31:0];
|
||||||
`VX_CSR_MPM_L2CACHE_MSHR_ST_H : read_data_ro_r = 32'(mem_perf_if.l2cache.mshr_stalls[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_L2CACHE_MSHR_ST_H: read_data_ro_r = 32'(mem_perf_if.l2cache.mshr_stalls[`PERF_CTR_BITS-1:32]);
|
||||||
// PERF: l3cache
|
// PERF: l3cache
|
||||||
`VX_CSR_MPM_L3CACHE_READS : read_data_ro_r = mem_perf_if.l3cache.reads[31:0];
|
`VX_CSR_MPM_L3CACHE_READS : read_data_ro_r = mem_perf_if.l3cache.reads[31:0];
|
||||||
`VX_CSR_MPM_L3CACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.l3cache.reads[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_L3CACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.l3cache.reads[`PERF_CTR_BITS-1:32]);
|
||||||
`VX_CSR_MPM_L3CACHE_WRITES : read_data_ro_r = mem_perf_if.l3cache.writes[31:0];
|
`VX_CSR_MPM_L3CACHE_WRITES : read_data_ro_r = mem_perf_if.l3cache.writes[31:0];
|
||||||
`VX_CSR_MPM_L3CACHE_WRITES_H : read_data_ro_r = 32'(mem_perf_if.l3cache.writes[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_L3CACHE_WRITES_H: read_data_ro_r = 32'(mem_perf_if.l3cache.writes[`PERF_CTR_BITS-1:32]);
|
||||||
`VX_CSR_MPM_L3CACHE_MISS_R : read_data_ro_r = mem_perf_if.l3cache.read_misses[31:0];
|
`VX_CSR_MPM_L3CACHE_MISS_R : read_data_ro_r = mem_perf_if.l3cache.read_misses[31:0];
|
||||||
`VX_CSR_MPM_L3CACHE_MISS_R_H : read_data_ro_r = 32'(mem_perf_if.l3cache.read_misses[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_L3CACHE_MISS_R_H: read_data_ro_r = 32'(mem_perf_if.l3cache.read_misses[`PERF_CTR_BITS-1:32]);
|
||||||
`VX_CSR_MPM_L3CACHE_MISS_W : read_data_ro_r = mem_perf_if.l3cache.write_misses[31:0];
|
`VX_CSR_MPM_L3CACHE_MISS_W : read_data_ro_r = mem_perf_if.l3cache.write_misses[31:0];
|
||||||
`VX_CSR_MPM_L3CACHE_MISS_W_H : read_data_ro_r = 32'(mem_perf_if.l3cache.write_misses[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_L3CACHE_MISS_W_H: read_data_ro_r = 32'(mem_perf_if.l3cache.write_misses[`PERF_CTR_BITS-1:32]);
|
||||||
`VX_CSR_MPM_L3CACHE_BANK_ST : read_data_ro_r = mem_perf_if.l3cache.bank_stalls[31:0];
|
`VX_CSR_MPM_L3CACHE_BANK_ST : read_data_ro_r = mem_perf_if.l3cache.bank_stalls[31:0];
|
||||||
`VX_CSR_MPM_L3CACHE_BANK_ST_H : read_data_ro_r = 32'(mem_perf_if.l3cache.bank_stalls[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_L3CACHE_BANK_ST_H: read_data_ro_r = 32'(mem_perf_if.l3cache.bank_stalls[`PERF_CTR_BITS-1:32]);
|
||||||
`VX_CSR_MPM_L3CACHE_MSHR_ST : read_data_ro_r = mem_perf_if.l3cache.mshr_stalls[31:0];
|
`VX_CSR_MPM_L3CACHE_MSHR_ST : read_data_ro_r = mem_perf_if.l3cache.mshr_stalls[31:0];
|
||||||
`VX_CSR_MPM_L3CACHE_MSHR_ST_H : read_data_ro_r = 32'(mem_perf_if.l3cache.mshr_stalls[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_L3CACHE_MSHR_ST_H: read_data_ro_r = 32'(mem_perf_if.l3cache.mshr_stalls[`PERF_CTR_BITS-1:32]);
|
||||||
// PERF: memory
|
// PERF: memory
|
||||||
`VX_CSR_MPM_MEM_READS : read_data_ro_r = mem_perf_if.mem.reads[31:0];
|
`VX_CSR_MPM_MEM_READS : read_data_ro_r = mem_perf_if.mem.reads[31:0];
|
||||||
`VX_CSR_MPM_MEM_READS_H : read_data_ro_r = 32'(mem_perf_if.mem.reads[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_MEM_READS_H : read_data_ro_r = 32'(mem_perf_if.mem.reads[`PERF_CTR_BITS-1:32]);
|
||||||
`VX_CSR_MPM_MEM_WRITES : read_data_ro_r = mem_perf_if.mem.writes[31:0];
|
`VX_CSR_MPM_MEM_WRITES : read_data_ro_r = mem_perf_if.mem.writes[31:0];
|
||||||
`VX_CSR_MPM_MEM_WRITES_H : read_data_ro_r = 32'(mem_perf_if.mem.writes[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_MEM_WRITES_H : read_data_ro_r = 32'(mem_perf_if.mem.writes[`PERF_CTR_BITS-1:32]);
|
||||||
`VX_CSR_MPM_MEM_LT : read_data_ro_r = mem_perf_if.mem.latency[31:0];
|
`VX_CSR_MPM_MEM_LT : read_data_ro_r = mem_perf_if.mem.latency[31:0];
|
||||||
`VX_CSR_MPM_MEM_LT_H : read_data_ro_r = 32'(mem_perf_if.mem.latency[`PERF_CTR_BITS-1:32]);
|
`VX_CSR_MPM_MEM_LT_H : read_data_ro_r = 32'(mem_perf_if.mem.latency[`PERF_CTR_BITS-1:32]);
|
||||||
default:;
|
default:;
|
||||||
endcase
|
endcase
|
||||||
end
|
end
|
||||||
@@ -303,8 +306,6 @@ import VX_fpu_pkg::*;
|
|||||||
`RUNTIME_ASSERT(~read_enable || read_addr_valid_r, ("%t: *** invalid CSR read address: 0x%0h (#%0d)", $time, read_addr, read_uuid))
|
`RUNTIME_ASSERT(~read_enable || read_addr_valid_r, ("%t: *** invalid CSR read address: 0x%0h (#%0d)", $time, read_addr, read_uuid))
|
||||||
|
|
||||||
`ifdef PERF_ENABLE
|
`ifdef PERF_ENABLE
|
||||||
wire [`PERF_CTR_BITS-1:0] perf_wctl_stalls = sfu_perf_if.wctl_stalls;
|
|
||||||
`UNUSED_VAR (perf_wctl_stalls);
|
|
||||||
`UNUSED_VAR (mem_perf_if.icache);
|
`UNUSED_VAR (mem_perf_if.icache);
|
||||||
`UNUSED_VAR (mem_perf_if.smem);
|
`UNUSED_VAR (mem_perf_if.smem);
|
||||||
`endif
|
`endif
|
||||||
|
|||||||
@@ -25,7 +25,6 @@ module VX_csr_unit import VX_gpu_pkg::*; #(
|
|||||||
`ifdef PERF_ENABLE
|
`ifdef PERF_ENABLE
|
||||||
VX_mem_perf_if.slave mem_perf_if,
|
VX_mem_perf_if.slave mem_perf_if,
|
||||||
VX_pipeline_perf_if.slave pipeline_perf_if,
|
VX_pipeline_perf_if.slave pipeline_perf_if,
|
||||||
VX_sfu_perf_if.slave sfu_perf_if,
|
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
`ifdef EXT_F_ENABLE
|
`ifdef EXT_F_ENABLE
|
||||||
@@ -81,7 +80,6 @@ module VX_csr_unit import VX_gpu_pkg::*; #(
|
|||||||
`ifdef PERF_ENABLE
|
`ifdef PERF_ENABLE
|
||||||
.mem_perf_if (mem_perf_if),
|
.mem_perf_if (mem_perf_if),
|
||||||
.pipeline_perf_if(pipeline_perf_if),
|
.pipeline_perf_if(pipeline_perf_if),
|
||||||
.sfu_perf_if (sfu_perf_if),
|
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
.commit_csr_if (commit_csr_if),
|
.commit_csr_if (commit_csr_if),
|
||||||
|
|||||||
@@ -61,7 +61,8 @@ module VX_issue #(
|
|||||||
.reset (scoreboard_reset),
|
.reset (scoreboard_reset),
|
||||||
`ifdef PERF_ENABLE
|
`ifdef PERF_ENABLE
|
||||||
.perf_scb_stalls(perf_issue_if.scb_stalls),
|
.perf_scb_stalls(perf_issue_if.scb_stalls),
|
||||||
.perf_scb_uses (perf_issue_if.scb_uses),
|
.perf_units_uses(perf_issue_if.units_uses),
|
||||||
|
.perf_sfu_uses (perf_issue_if.sfu_uses),
|
||||||
`endif
|
`endif
|
||||||
.writeback_if (writeback_if),
|
.writeback_if (writeback_if),
|
||||||
.ibuffer_if (ibuffer_if),
|
.ibuffer_if (ibuffer_if),
|
||||||
|
|||||||
@@ -21,7 +21,8 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
|||||||
|
|
||||||
`ifdef PERF_ENABLE
|
`ifdef PERF_ENABLE
|
||||||
output reg [`PERF_CTR_BITS-1:0] perf_scb_stalls,
|
output reg [`PERF_CTR_BITS-1:0] perf_scb_stalls,
|
||||||
output reg [`PERF_CTR_BITS-1:0] perf_scb_uses [`NUM_EX_UNITS],
|
output reg [`PERF_CTR_BITS-1:0] perf_units_uses [`NUM_EX_UNITS],
|
||||||
|
output reg [`PERF_CTR_BITS-1:0] perf_sfu_uses [`NUM_SFU_UNITS],
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
VX_writeback_if.slave writeback_if [`ISSUE_WIDTH],
|
VX_writeback_if.slave writeback_if [`ISSUE_WIDTH],
|
||||||
@@ -32,10 +33,14 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
|||||||
localparam DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `XLEN + `EX_BITS + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + `XLEN + (`NR_BITS * 4) + 1;
|
localparam DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `XLEN + `EX_BITS + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + `XLEN + (`NR_BITS * 4) + 1;
|
||||||
|
|
||||||
`ifdef PERF_ENABLE
|
`ifdef PERF_ENABLE
|
||||||
wire [`NUM_EX_UNITS-1:0] perf_uses_per_cycle;
|
reg [`ISSUE_WIDTH-1:0][`NUM_EX_UNITS-1:0] perf_issue_units_per_cycle;
|
||||||
wire [`CLOG2(`ISSUE_WIDTH+1)-1:0] perf_stalls_per_cycle;
|
wire [`NUM_EX_UNITS-1:0] perf_units_per_cycle, perf_units_per_cycle_r;
|
||||||
reg [`ISSUE_WIDTH-1:0][`NUM_EX_UNITS-1:0] perf_issue_uses_per_cycle;
|
|
||||||
|
reg [`ISSUE_WIDTH-1:0][`NUM_SFU_UNITS-1:0] perf_issue_sfu_per_cycle;
|
||||||
|
wire [`NUM_SFU_UNITS-1:0] perf_sfu_per_cycle, perf_sfu_per_cycle_r;
|
||||||
|
|
||||||
wire [`ISSUE_WIDTH-1:0] perf_issue_stalls_per_cycle;
|
wire [`ISSUE_WIDTH-1:0] perf_issue_stalls_per_cycle;
|
||||||
|
wire [`CLOG2(`ISSUE_WIDTH+1)-1:0] perf_stalls_per_cycle, perf_stalls_per_cycle_r;
|
||||||
|
|
||||||
`POP_COUNT(perf_stalls_per_cycle, perf_issue_stalls_per_cycle);
|
`POP_COUNT(perf_stalls_per_cycle, perf_issue_stalls_per_cycle);
|
||||||
|
|
||||||
@@ -43,10 +48,51 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
|||||||
.DATAW_IN (`NUM_EX_UNITS),
|
.DATAW_IN (`NUM_EX_UNITS),
|
||||||
.N (`ISSUE_WIDTH),
|
.N (`ISSUE_WIDTH),
|
||||||
.OP ("|")
|
.OP ("|")
|
||||||
) reduce (
|
) perf_units_reduce (
|
||||||
.data_in (perf_issue_uses_per_cycle),
|
.data_in (perf_issue_units_per_cycle),
|
||||||
.data_out (perf_uses_per_cycle)
|
.data_out (perf_units_per_cycle)
|
||||||
);
|
);
|
||||||
|
|
||||||
|
VX_reduce #(
|
||||||
|
.DATAW_IN (`NUM_SFU_UNITS),
|
||||||
|
.N (`ISSUE_WIDTH),
|
||||||
|
.OP ("|")
|
||||||
|
) perf_sfu_reduce (
|
||||||
|
.data_in (perf_issue_sfu_per_cycle),
|
||||||
|
.data_out (perf_sfu_per_cycle)
|
||||||
|
);
|
||||||
|
|
||||||
|
`BUFFER(perf_stalls_per_cycle_r, perf_stalls_per_cycle);
|
||||||
|
`BUFFER(perf_units_per_cycle_r, perf_units_per_cycle);
|
||||||
|
`BUFFER(perf_sfu_per_cycle_r, perf_sfu_per_cycle);
|
||||||
|
|
||||||
|
always @(posedge clk) begin
|
||||||
|
if (reset) begin
|
||||||
|
perf_scb_stalls <= '0;
|
||||||
|
end else begin
|
||||||
|
perf_scb_stalls <= perf_scb_stalls + `PERF_CTR_BITS'(perf_stalls_per_cycle_r);
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
for (genvar i = 0; i < `NUM_EX_UNITS; ++i) begin
|
||||||
|
always @(posedge clk) begin
|
||||||
|
if (reset) begin
|
||||||
|
perf_units_uses[i] <= '0;
|
||||||
|
end else begin
|
||||||
|
perf_units_uses[i] <= perf_units_uses[i] + `PERF_CTR_BITS'(perf_units_per_cycle_r[i]);
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
for (genvar i = 0; i < `NUM_SFU_UNITS; ++i) begin
|
||||||
|
always @(posedge clk) begin
|
||||||
|
if (reset) begin
|
||||||
|
perf_sfu_uses[i] <= '0;
|
||||||
|
end else begin
|
||||||
|
perf_sfu_uses[i] <= perf_sfu_uses[i] + `PERF_CTR_BITS'(perf_sfu_per_cycle_r[i]);
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
|
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
|
||||||
@@ -60,21 +106,46 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
|||||||
wire inuse_rs3 = inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3];
|
wire inuse_rs3 = inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3];
|
||||||
|
|
||||||
`ifdef PERF_ENABLE
|
`ifdef PERF_ENABLE
|
||||||
reg [`UP(ISSUE_RATIO)-1:0][`NUM_REGS-1:0][`EX_BITS-1:0] inuse_units;
|
reg [`UP(ISSUE_RATIO)-1:0][`NUM_REGS-1:0][`EX_WIDTH-1:0] inuse_units;
|
||||||
|
reg [`UP(ISSUE_RATIO)-1:0][`NUM_REGS-1:0][`SFU_WIDTH-1:0] inuse_sfu;
|
||||||
|
|
||||||
|
reg [`SFU_WIDTH-1:0] sfu_type;
|
||||||
always @(*) begin
|
always @(*) begin
|
||||||
perf_issue_uses_per_cycle[i] = '0;
|
case (scoreboard_if[i].data.op_type)
|
||||||
|
`INST_SFU_CSRRW,
|
||||||
|
`INST_SFU_CSRRS,
|
||||||
|
`INST_SFU_CSRRC: sfu_type = `SFU_CSRS;
|
||||||
|
default: sfu_type = `SFU_WCTL;
|
||||||
|
endcase
|
||||||
|
end
|
||||||
|
|
||||||
|
always @(*) begin
|
||||||
|
perf_issue_units_per_cycle[i] = '0;
|
||||||
|
perf_issue_sfu_per_cycle[i] = '0;
|
||||||
if (ibuffer_if[i].valid) begin
|
if (ibuffer_if[i].valid) begin
|
||||||
if (inuse_rd) begin
|
if (inuse_rd) begin
|
||||||
perf_issue_uses_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd]] = 1;
|
perf_issue_units_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd]] = 1;
|
||||||
|
if (inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd] == `EX_SFU) begin
|
||||||
|
perf_issue_sfu_per_cycle[i][inuse_sfu[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd]] = 1;
|
||||||
|
end
|
||||||
end
|
end
|
||||||
if (inuse_rs1) begin
|
if (inuse_rs1) begin
|
||||||
perf_issue_uses_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs1]] = 1;
|
perf_issue_units_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs1]] = 1;
|
||||||
|
if (inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs1] == `EX_SFU) begin
|
||||||
|
perf_issue_sfu_per_cycle[i][inuse_sfu[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs1]] = 1;
|
||||||
|
end
|
||||||
end
|
end
|
||||||
if (inuse_rs2) begin
|
if (inuse_rs2) begin
|
||||||
perf_issue_uses_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs2]] = 1;
|
perf_issue_units_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs2]] = 1;
|
||||||
|
if (inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs2] == `EX_SFU) begin
|
||||||
|
perf_issue_sfu_per_cycle[i][inuse_sfu[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs2]] = 1;
|
||||||
|
end
|
||||||
end
|
end
|
||||||
if (inuse_rs3) begin
|
if (inuse_rs3) begin
|
||||||
perf_issue_uses_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3]] = 1;
|
perf_issue_units_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3]] = 1;
|
||||||
|
if (inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3] == `EX_SFU) begin
|
||||||
|
perf_issue_sfu_per_cycle[i][inuse_sfu[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3]] = 1;
|
||||||
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
@@ -109,6 +180,9 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
|||||||
inuse_regs[scoreboard_if[i].data.wis][scoreboard_if[i].data.rd] <= 1;
|
inuse_regs[scoreboard_if[i].data.wis][scoreboard_if[i].data.rd] <= 1;
|
||||||
`ifdef PERF_ENABLE
|
`ifdef PERF_ENABLE
|
||||||
inuse_units[scoreboard_if[i].data.wis][scoreboard_if[i].data.rd] <= scoreboard_if[i].data.ex_type;
|
inuse_units[scoreboard_if[i].data.wis][scoreboard_if[i].data.rd] <= scoreboard_if[i].data.ex_type;
|
||||||
|
if (scoreboard_if[i].data.ex_type == `EX_SFU) begin
|
||||||
|
inuse_sfu[scoreboard_if[i].data.wis][scoreboard_if[i].data.rd] <= sfu_type;
|
||||||
|
end
|
||||||
`endif
|
`endif
|
||||||
end
|
end
|
||||||
valid_out_r <= 0;
|
valid_out_r <= 0;
|
||||||
@@ -155,30 +229,4 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
|||||||
|
|
||||||
end
|
end
|
||||||
|
|
||||||
`ifdef PERF_ENABLE
|
|
||||||
wire [`CLOG2(`ISSUE_WIDTH+1)-1:0] perf_stalls_per_cycle_r;
|
|
||||||
wire [`NUM_EX_UNITS-1:0] perf_uses_per_cycle_r;
|
|
||||||
|
|
||||||
`BUFFER(perf_stalls_per_cycle_r, perf_stalls_per_cycle);
|
|
||||||
`BUFFER(perf_uses_per_cycle_r, perf_uses_per_cycle);
|
|
||||||
|
|
||||||
always @(posedge clk) begin
|
|
||||||
if (reset) begin
|
|
||||||
perf_scb_stalls <= '0;
|
|
||||||
end else begin
|
|
||||||
perf_scb_stalls <= perf_scb_stalls + `PERF_CTR_BITS'(perf_stalls_per_cycle_r);
|
|
||||||
end
|
|
||||||
end
|
|
||||||
|
|
||||||
for (genvar i = 0; i < `NUM_EX_UNITS; ++i) begin
|
|
||||||
always @(posedge clk) begin
|
|
||||||
if (reset) begin
|
|
||||||
perf_scb_uses[i] <= '0;
|
|
||||||
end else begin
|
|
||||||
perf_scb_uses[i] <= perf_scb_uses[i] + `PERF_CTR_BITS'(perf_uses_per_cycle_r[i]);
|
|
||||||
end
|
|
||||||
end
|
|
||||||
end
|
|
||||||
`endif
|
|
||||||
|
|
||||||
endmodule
|
endmodule
|
||||||
|
|||||||
@@ -48,7 +48,7 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
|
|||||||
localparam RSP_ARB_DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + (NUM_LANES * `XLEN) + `NR_BITS + 1 + `XLEN + PID_WIDTH + 1 + 1;
|
localparam RSP_ARB_DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + (NUM_LANES * `XLEN) + `NR_BITS + 1 + `XLEN + PID_WIDTH + 1 + 1;
|
||||||
localparam RSP_ARB_SIZE = 1 + 1;
|
localparam RSP_ARB_SIZE = 1 + 1;
|
||||||
localparam RSP_ARB_IDX_WCTL = 0;
|
localparam RSP_ARB_IDX_WCTL = 0;
|
||||||
localparam RSP_ARB_IDX_CSR = 1;
|
localparam RSP_ARB_IDX_CSRS = 1;
|
||||||
|
|
||||||
VX_execute_if #(
|
VX_execute_if #(
|
||||||
.NUM_LANES (NUM_LANES)
|
.NUM_LANES (NUM_LANES)
|
||||||
@@ -71,9 +71,6 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
|
|||||||
wire [RSP_ARB_SIZE-1:0] rsp_arb_ready_in;
|
wire [RSP_ARB_SIZE-1:0] rsp_arb_ready_in;
|
||||||
wire [RSP_ARB_SIZE-1:0][RSP_ARB_DATAW-1:0] rsp_arb_data_in;
|
wire [RSP_ARB_SIZE-1:0][RSP_ARB_DATAW-1:0] rsp_arb_data_in;
|
||||||
|
|
||||||
`ifdef PERF_ENABLE
|
|
||||||
VX_sfu_perf_if sfu_perf_if();
|
|
||||||
`endif
|
|
||||||
|
|
||||||
// Warp control block
|
// Warp control block
|
||||||
VX_execute_if #(
|
VX_execute_if #(
|
||||||
@@ -129,7 +126,6 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
|
|||||||
`ifdef PERF_ENABLE
|
`ifdef PERF_ENABLE
|
||||||
.mem_perf_if (mem_perf_if),
|
.mem_perf_if (mem_perf_if),
|
||||||
.pipeline_perf_if(pipeline_perf_if),
|
.pipeline_perf_if(pipeline_perf_if),
|
||||||
.sfu_perf_if (sfu_perf_if),
|
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
`ifdef EXT_F_ENABLE
|
`ifdef EXT_F_ENABLE
|
||||||
@@ -141,18 +137,18 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
|
|||||||
.commit_if (csr_commit_if)
|
.commit_if (csr_commit_if)
|
||||||
);
|
);
|
||||||
|
|
||||||
assign rsp_arb_valid_in[RSP_ARB_IDX_CSR] = csr_commit_if.valid;
|
assign rsp_arb_valid_in[RSP_ARB_IDX_CSRS] = csr_commit_if.valid;
|
||||||
assign rsp_arb_data_in[RSP_ARB_IDX_CSR] = csr_commit_if.data;
|
assign rsp_arb_data_in[RSP_ARB_IDX_CSRS] = csr_commit_if.data;
|
||||||
assign csr_commit_if.ready = rsp_arb_ready_in[RSP_ARB_IDX_CSR];
|
assign csr_commit_if.ready = rsp_arb_ready_in[RSP_ARB_IDX_CSRS];
|
||||||
|
|
||||||
// can accept new request?
|
// can accept new request?
|
||||||
|
|
||||||
reg sfu_req_ready;
|
reg sfu_req_ready;
|
||||||
always @(*) begin
|
always @(*) begin
|
||||||
case (execute_if[0].data.op_type)
|
case (execute_if[0].data.op_type)
|
||||||
`INST_SFU_CSRRW,
|
`INST_SFU_CSRRW,
|
||||||
`INST_SFU_CSRRS,
|
`INST_SFU_CSRRS,
|
||||||
`INST_SFU_CSRRC: sfu_req_ready = csr_execute_if.ready;
|
`INST_SFU_CSRRC: sfu_req_ready = csr_execute_if.ready;
|
||||||
default: sfu_req_ready = wctl_execute_if.ready;
|
default: sfu_req_ready = wctl_execute_if.ready;
|
||||||
endcase
|
endcase
|
||||||
end
|
end
|
||||||
@@ -194,19 +190,4 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
|
|||||||
.commit_out_if (commit_if)
|
.commit_out_if (commit_if)
|
||||||
);
|
);
|
||||||
|
|
||||||
`ifdef PERF_ENABLE
|
|
||||||
reg [`PERF_CTR_BITS-1:0] perf_wctl_stalls;
|
|
||||||
|
|
||||||
wire wctl_execute_stall = wctl_execute_if.valid && ~wctl_execute_if.ready;
|
|
||||||
|
|
||||||
always @(posedge clk) begin
|
|
||||||
if (reset) begin
|
|
||||||
perf_wctl_stalls <= '0;
|
|
||||||
end else begin
|
|
||||||
perf_wctl_stalls <= perf_wctl_stalls + `PERF_CTR_BITS'(wctl_execute_stall);
|
|
||||||
end
|
|
||||||
end
|
|
||||||
assign sfu_perf_if.wctl_stalls = perf_wctl_stalls;
|
|
||||||
`endif
|
|
||||||
|
|
||||||
endmodule
|
endmodule
|
||||||
|
|||||||
@@ -18,7 +18,8 @@ interface VX_pipeline_perf_if ();
|
|||||||
wire [`PERF_CTR_BITS-1:0] sched_stalls;
|
wire [`PERF_CTR_BITS-1:0] sched_stalls;
|
||||||
wire [`PERF_CTR_BITS-1:0] ibf_stalls;
|
wire [`PERF_CTR_BITS-1:0] ibf_stalls;
|
||||||
wire [`PERF_CTR_BITS-1:0] scb_stalls;
|
wire [`PERF_CTR_BITS-1:0] scb_stalls;
|
||||||
wire [`PERF_CTR_BITS-1:0] scb_uses [`NUM_EX_UNITS];
|
wire [`PERF_CTR_BITS-1:0] units_uses [`NUM_EX_UNITS];
|
||||||
|
wire [`PERF_CTR_BITS-1:0] sfu_uses [`NUM_SFU_UNITS];
|
||||||
|
|
||||||
wire [`PERF_CTR_BITS-1:0] ifetches;
|
wire [`PERF_CTR_BITS-1:0] ifetches;
|
||||||
wire [`PERF_CTR_BITS-1:0] loads;
|
wire [`PERF_CTR_BITS-1:0] loads;
|
||||||
@@ -34,7 +35,8 @@ interface VX_pipeline_perf_if ();
|
|||||||
modport issue (
|
modport issue (
|
||||||
output ibf_stalls,
|
output ibf_stalls,
|
||||||
output scb_stalls,
|
output scb_stalls,
|
||||||
output scb_uses
|
output units_uses,
|
||||||
|
output sfu_uses
|
||||||
);
|
);
|
||||||
|
|
||||||
modport slave (
|
modport slave (
|
||||||
@@ -42,7 +44,8 @@ interface VX_pipeline_perf_if ();
|
|||||||
input sched_stalls,
|
input sched_stalls,
|
||||||
input ibf_stalls,
|
input ibf_stalls,
|
||||||
input scb_stalls,
|
input scb_stalls,
|
||||||
input scb_uses,
|
input units_uses,
|
||||||
|
input sfu_uses,
|
||||||
input ifetches,
|
input ifetches,
|
||||||
input loads,
|
input loads,
|
||||||
input stores,
|
input stores,
|
||||||
|
|||||||
@@ -208,6 +208,8 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||||||
uint64_t scrb_fpu = 0;
|
uint64_t scrb_fpu = 0;
|
||||||
uint64_t scrb_lsu = 0;
|
uint64_t scrb_lsu = 0;
|
||||||
uint64_t scrb_sfu = 0;
|
uint64_t scrb_sfu = 0;
|
||||||
|
uint64_t scrb_wctl = 0;
|
||||||
|
uint64_t scrb_csrs = 0;
|
||||||
uint64_t ifetches = 0;
|
uint64_t ifetches = 0;
|
||||||
uint64_t loads = 0;
|
uint64_t loads = 0;
|
||||||
uint64_t stores = 0;
|
uint64_t stores = 0;
|
||||||
@@ -269,43 +271,68 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||||||
// scheduler idles
|
// scheduler idles
|
||||||
{
|
{
|
||||||
uint64_t sched_idles_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCHED_ID);
|
uint64_t sched_idles_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCHED_ID);
|
||||||
int idles_percent_per_core = calcAvgPercent(sched_idles_per_core, cycles_per_core);
|
if (num_cores > 1) {
|
||||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: scheduler idles=%ld (%d%%)\n", core_id, sched_idles_per_core, idles_percent_per_core);
|
int idles_percent_per_core = calcAvgPercent(sched_idles_per_core, cycles_per_core);
|
||||||
|
fprintf(stream, "PERF: core%d: scheduler idle=%ld (%d%%)\n", core_id, sched_idles_per_core, idles_percent_per_core);
|
||||||
|
}
|
||||||
sched_idles += sched_idles_per_core;
|
sched_idles += sched_idles_per_core;
|
||||||
}
|
}
|
||||||
// scheduler stalls
|
// scheduler stalls
|
||||||
{
|
{
|
||||||
uint64_t sched_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCHED_ST);
|
uint64_t sched_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCHED_ST);
|
||||||
int stalls_percent_per_core = calcAvgPercent(sched_stalls_per_core, cycles_per_core);
|
if (num_cores > 1) {
|
||||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: scheduler stalls=%ld (%d%%)\n", core_id, sched_stalls_per_core, stalls_percent_per_core);
|
int stalls_percent_per_core = calcAvgPercent(sched_stalls_per_core, cycles_per_core);
|
||||||
|
fprintf(stream, "PERF: core%d: scheduler stalls=%ld (%d%%)\n", core_id, sched_stalls_per_core, stalls_percent_per_core);
|
||||||
|
}
|
||||||
sched_stalls += sched_stalls_per_core;
|
sched_stalls += sched_stalls_per_core;
|
||||||
}
|
}
|
||||||
// ibuffer_stalls
|
// ibuffer_stalls
|
||||||
{
|
{
|
||||||
uint64_t ibuffer_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_IBUF_ST);
|
uint64_t ibuffer_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_IBUF_ST);
|
||||||
int ibuffer_percent_per_core = calcAvgPercent(ibuffer_stalls_per_core, cycles_per_core);
|
if (num_cores > 1) {
|
||||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: ibuffer stalls=%ld (%d%%)\n", core_id, ibuffer_stalls_per_core, ibuffer_percent_per_core);
|
int ibuffer_percent_per_core = calcAvgPercent(ibuffer_stalls_per_core, cycles_per_core);
|
||||||
|
fprintf(stream, "PERF: core%d: ibuffer stalls=%ld (%d%%)\n", core_id, ibuffer_stalls_per_core, ibuffer_percent_per_core);
|
||||||
|
}
|
||||||
ibuffer_stalls += ibuffer_stalls_per_core;
|
ibuffer_stalls += ibuffer_stalls_per_core;
|
||||||
}
|
}
|
||||||
// scrb_stalls
|
// issue_stalls
|
||||||
{
|
{
|
||||||
uint64_t scrb_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_ST);
|
uint64_t scrb_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_ST);
|
||||||
uint64_t scrb_alu_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_ALU);
|
uint64_t scrb_alu_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_ALU);
|
||||||
uint64_t scrb_fpu_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_FPU);
|
uint64_t scrb_fpu_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_FPU);
|
||||||
uint64_t scrb_lsu_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_LSU);
|
uint64_t scrb_lsu_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_LSU);
|
||||||
uint64_t scrb_sfu_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_SFU);
|
uint64_t scrb_sfu_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_SFU);
|
||||||
uint64_t scrb_total = scrb_alu_per_core + scrb_fpu_per_core + scrb_lsu_per_core + scrb_sfu_per_core;
|
|
||||||
scrb_alu += scrb_alu_per_core;
|
scrb_alu += scrb_alu_per_core;
|
||||||
scrb_fpu += scrb_fpu_per_core;
|
scrb_fpu += scrb_fpu_per_core;
|
||||||
scrb_lsu += scrb_lsu_per_core;
|
scrb_lsu += scrb_lsu_per_core;
|
||||||
scrb_sfu += scrb_sfu_per_core;
|
scrb_sfu += scrb_sfu_per_core;
|
||||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: scoreboard stalls=%ld (alu=%d%%, fpu=%d%%, lsu=%d%%, sfu=%d%%)\n", core_id, scrb_stalls_per_core,
|
if (num_cores > 1) {
|
||||||
|
uint64_t scrb_total = scrb_alu_per_core + scrb_fpu_per_core + scrb_lsu_per_core + scrb_sfu_per_core;
|
||||||
|
fprintf(stream, "PERF: core%d: issue stalls=%ld (alu=%d%%, fpu=%d%%, lsu=%d%%, sfu=%d%%)\n", core_id, scrb_stalls_per_core,
|
||||||
calcAvgPercent(scrb_alu_per_core, scrb_total),
|
calcAvgPercent(scrb_alu_per_core, scrb_total),
|
||||||
calcAvgPercent(scrb_fpu_per_core, scrb_total),
|
calcAvgPercent(scrb_fpu_per_core, scrb_total),
|
||||||
calcAvgPercent(scrb_lsu_per_core, scrb_total),
|
calcAvgPercent(scrb_lsu_per_core, scrb_total),
|
||||||
calcAvgPercent(scrb_sfu_per_core, scrb_total));
|
calcAvgPercent(scrb_sfu_per_core, scrb_total));
|
||||||
|
}
|
||||||
scrb_stalls += scrb_stalls_per_core;
|
scrb_stalls += scrb_stalls_per_core;
|
||||||
}
|
}
|
||||||
|
// sfu_stalls
|
||||||
|
{
|
||||||
|
uint64_t scrb_sfu_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_SFU);
|
||||||
|
uint64_t scrb_wctl_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_WCTL);
|
||||||
|
uint64_t scrb_csrs_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_CSRS);
|
||||||
|
if (num_cores > 1) {
|
||||||
|
uint64_t sfu_total = scrb_wctl_per_core + scrb_csrs_per_core + scrb_tex_per_core + scrb_raster_per_core + scrb_om_per_core;
|
||||||
|
fprintf(stream, "PERF: core%d: sfu stalls=%ld (scrs=%d%%, wctl=%d%%)\n"
|
||||||
|
, core_id
|
||||||
|
, scrb_sfu_per_core
|
||||||
|
, calcAvgPercent(scrb_csrs_per_core, sfu_total)
|
||||||
|
, calcAvgPercent(scrb_wctl_per_core, sfu_total)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
scrb_wctl += scrb_wctl_per_core;
|
||||||
|
scrb_csrs += scrb_csrs_per_core;
|
||||||
|
}
|
||||||
// PERF: memory
|
// PERF: memory
|
||||||
// ifetches
|
// ifetches
|
||||||
{
|
{
|
||||||
@@ -314,8 +341,10 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||||||
ifetches += ifetches_per_core;
|
ifetches += ifetches_per_core;
|
||||||
|
|
||||||
uint64_t ifetch_lat_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_IFETCH_LT);
|
uint64_t ifetch_lat_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_IFETCH_LT);
|
||||||
int mem_avg_lat = caclAverage(ifetch_lat_per_core, ifetches_per_core);
|
if (num_cores > 1) {
|
||||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: ifetch latency=%d cycles\n", core_id, mem_avg_lat);
|
int mem_avg_lat = caclAverage(ifetch_lat_per_core, ifetches_per_core);
|
||||||
|
fprintf(stream, "PERF: core%d: ifetch latency=%d cycles\n", core_id, mem_avg_lat);
|
||||||
|
}
|
||||||
ifetch_lat += ifetch_lat_per_core;
|
ifetch_lat += ifetch_lat_per_core;
|
||||||
}
|
}
|
||||||
// loads
|
// loads
|
||||||
@@ -325,8 +354,10 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||||||
loads += loads_per_core;
|
loads += loads_per_core;
|
||||||
|
|
||||||
uint64_t load_lat_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_LOAD_LT);
|
uint64_t load_lat_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_LOAD_LT);
|
||||||
int mem_avg_lat = caclAverage(load_lat_per_core, loads_per_core);
|
if (num_cores > 1) {
|
||||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: load latency=%d cycles\n", core_id, mem_avg_lat);
|
int mem_avg_lat = caclAverage(load_lat_per_core, loads_per_core);
|
||||||
|
fprintf(stream, "PERF: core%d: load latency=%d cycles\n", core_id, mem_avg_lat);
|
||||||
|
}
|
||||||
load_lat += load_lat_per_core;
|
load_lat += load_lat_per_core;
|
||||||
}
|
}
|
||||||
// stores
|
// stores
|
||||||
@@ -428,14 +459,20 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||||||
int ifetch_avg_lat = (int)(double(ifetch_lat) / double(ifetches));
|
int ifetch_avg_lat = (int)(double(ifetch_lat) / double(ifetches));
|
||||||
int load_avg_lat = (int)(double(load_lat) / double(loads));
|
int load_avg_lat = (int)(double(load_lat) / double(loads));
|
||||||
uint64_t scrb_total = scrb_alu + scrb_fpu + scrb_lsu + scrb_sfu;
|
uint64_t scrb_total = scrb_alu + scrb_fpu + scrb_lsu + scrb_sfu;
|
||||||
fprintf(stream, "PERF: scheduler idles=%ld (%d%%)\n", sched_idles, sched_idles_percent);
|
uint64_t sfu_total = scrb_wctl + scrb_csrs;
|
||||||
|
fprintf(stream, "PERF: scheduler idle=%ld (%d%%)\n", sched_idles, sched_idles_percent);
|
||||||
fprintf(stream, "PERF: scheduler stalls=%ld (%d%%)\n", sched_stalls, sched_stalls_percent);
|
fprintf(stream, "PERF: scheduler stalls=%ld (%d%%)\n", sched_stalls, sched_stalls_percent);
|
||||||
fprintf(stream, "PERF: ibuffer stalls=%ld (%d%%)\n", ibuffer_stalls, ibuffer_percent);
|
fprintf(stream, "PERF: ibuffer stalls=%ld (%d%%)\n", ibuffer_stalls, ibuffer_percent);
|
||||||
fprintf(stream, "PERF: scoreboard stalls=%ld (alu=%d%%, fpu=%d%%, lsu=%d%%, sfu=%d%%)\n", scrb_stalls,
|
fprintf(stream, "PERF: issue stalls=%ld (alu=%d%%, fpu=%d%%, lsu=%d%%, sfu=%d%%)\n", scrb_stalls,
|
||||||
calcAvgPercent(scrb_alu, scrb_total),
|
calcAvgPercent(scrb_alu, scrb_total),
|
||||||
calcAvgPercent(scrb_fpu, scrb_total),
|
calcAvgPercent(scrb_fpu, scrb_total),
|
||||||
calcAvgPercent(scrb_lsu, scrb_total),
|
calcAvgPercent(scrb_lsu, scrb_total),
|
||||||
calcAvgPercent(scrb_sfu, scrb_total));
|
calcAvgPercent(scrb_sfu, scrb_total));
|
||||||
|
fprintf(stream, "PERF: sfu stalls=%ld (scrs=%d%%, wctl=%d%%)\n"
|
||||||
|
, scrb_sfu
|
||||||
|
, calcAvgPercent(scrb_csrs, sfu_total)
|
||||||
|
, calcAvgPercent(scrb_wctl, sfu_total)
|
||||||
|
);
|
||||||
fprintf(stream, "PERF: ifetches=%ld\n", ifetches);
|
fprintf(stream, "PERF: ifetches=%ld\n", ifetches);
|
||||||
fprintf(stream, "PERF: loads=%ld\n", loads);
|
fprintf(stream, "PERF: loads=%ld\n", loads);
|
||||||
fprintf(stream, "PERF: stores=%ld\n", stores);
|
fprintf(stream, "PERF: stores=%ld\n", stores);
|
||||||
|
|||||||
@@ -18,20 +18,20 @@ using namespace vortex;
|
|||||||
Cluster::Cluster(const SimContext& ctx,
|
Cluster::Cluster(const SimContext& ctx,
|
||||||
uint32_t cluster_id,
|
uint32_t cluster_id,
|
||||||
ProcessorImpl* processor,
|
ProcessorImpl* processor,
|
||||||
const Arch &arch, const
|
const Arch &arch,
|
||||||
DCRS &dcrs)
|
const DCRS &dcrs)
|
||||||
: SimObject(ctx, "cluster")
|
: SimObject(ctx, "cluster")
|
||||||
, mem_req_port(this)
|
, mem_req_port(this)
|
||||||
, mem_rsp_port(this)
|
, mem_rsp_port(this)
|
||||||
, cluster_id_(cluster_id)
|
, cluster_id_(cluster_id)
|
||||||
|
, processor_(processor)
|
||||||
, sockets_(NUM_SOCKETS)
|
, sockets_(NUM_SOCKETS)
|
||||||
, barriers_(arch.num_barriers(), 0)
|
, barriers_(arch.num_barriers(), 0)
|
||||||
, processor_(processor)
|
|
||||||
, cores_per_socket_(arch.socket_size())
|
, cores_per_socket_(arch.socket_size())
|
||||||
{
|
{
|
||||||
char sname[100];
|
char sname[100];
|
||||||
|
|
||||||
auto sockets_per_cluster = sockets_.size();
|
uint32_t sockets_per_cluster = sockets_.size();
|
||||||
|
|
||||||
// create sockets
|
// create sockets
|
||||||
|
|
||||||
@@ -43,7 +43,10 @@ Cluster::Cluster(const SimContext& ctx,
|
|||||||
|
|
||||||
for (uint32_t i = 0; i < sockets_per_cluster; ++i) {
|
for (uint32_t i = 0; i < sockets_per_cluster; ++i) {
|
||||||
uint32_t socket_id = cluster_id * sockets_per_cluster + i;
|
uint32_t socket_id = cluster_id * sockets_per_cluster + i;
|
||||||
auto socket = Socket::Create(socket_id, this, arch, dcrs);
|
auto socket = Socket::Create(socket_id,
|
||||||
|
this,
|
||||||
|
arch,
|
||||||
|
dcrs);
|
||||||
|
|
||||||
socket->icache_mem_req_port.bind(&icache_switch->ReqIn.at(i));
|
socket->icache_mem_req_port.bind(&icache_switch->ReqIn.at(i));
|
||||||
icache_switch->RspIn.at(i).bind(&socket->icache_mem_rsp_port);
|
icache_switch->RspIn.at(i).bind(&socket->icache_mem_rsp_port);
|
||||||
@@ -154,7 +157,7 @@ void Cluster::barrier(uint32_t bar_id, uint32_t count, uint32_t core_id) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
Cluster::PerfStats Cluster::perf_stats() const {
|
Cluster::PerfStats Cluster::perf_stats() const {
|
||||||
Cluster::PerfStats perf;
|
PerfStats perf_stats;
|
||||||
perf.l2cache = l2cache_->perf_stats();
|
perf_stats.l2cache = l2cache_->perf_stats();
|
||||||
return perf;
|
return perf_stats;
|
||||||
}
|
}
|
||||||
@@ -17,6 +17,7 @@
|
|||||||
#include "dcrs.h"
|
#include "dcrs.h"
|
||||||
#include "arch.h"
|
#include "arch.h"
|
||||||
#include "cache_cluster.h"
|
#include "cache_cluster.h"
|
||||||
|
#include "shared_mem.h"
|
||||||
#include "core.h"
|
#include "core.h"
|
||||||
#include "socket.h"
|
#include "socket.h"
|
||||||
#include "constants.h"
|
#include "constants.h"
|
||||||
@@ -29,11 +30,6 @@ class Cluster : public SimObject<Cluster> {
|
|||||||
public:
|
public:
|
||||||
struct PerfStats {
|
struct PerfStats {
|
||||||
CacheSim::PerfStats l2cache;
|
CacheSim::PerfStats l2cache;
|
||||||
|
|
||||||
PerfStats& operator+=(const PerfStats& rhs) {
|
|
||||||
this->l2cache += rhs.l2cache;
|
|
||||||
return *this;
|
|
||||||
}
|
|
||||||
};
|
};
|
||||||
|
|
||||||
SimPort<MemReq> mem_req_port;
|
SimPort<MemReq> mem_req_port;
|
||||||
@@ -67,15 +63,15 @@ public:
|
|||||||
|
|
||||||
void barrier(uint32_t bar_id, uint32_t count, uint32_t core_id);
|
void barrier(uint32_t bar_id, uint32_t count, uint32_t core_id);
|
||||||
|
|
||||||
Cluster::PerfStats perf_stats() const;
|
PerfStats perf_stats() const;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
uint32_t cluster_id_;
|
uint32_t cluster_id_;
|
||||||
std::vector<Socket::Ptr> sockets_;
|
ProcessorImpl* processor_;
|
||||||
std::vector<CoreMask> barriers_;
|
std::vector<Socket::Ptr> sockets_;
|
||||||
CacheSim::Ptr l2cache_;
|
std::vector<CoreMask> barriers_;
|
||||||
ProcessorImpl* processor_;
|
CacheSim::Ptr l2cache_;
|
||||||
uint32_t cores_per_socket_;
|
uint32_t cores_per_socket_;
|
||||||
};
|
};
|
||||||
|
|
||||||
} // namespace vortex
|
} // namespace vortex
|
||||||
@@ -28,13 +28,18 @@
|
|||||||
|
|
||||||
using namespace vortex;
|
using namespace vortex;
|
||||||
|
|
||||||
Core::Core(const SimContext& ctx, uint32_t core_id, Socket* socket, const Arch &arch, const DCRS &dcrs)
|
Core::Core(const SimContext& ctx,
|
||||||
|
uint32_t core_id,
|
||||||
|
Socket* socket,
|
||||||
|
const Arch &arch,
|
||||||
|
const DCRS &dcrs)
|
||||||
: SimObject(ctx, "core")
|
: SimObject(ctx, "core")
|
||||||
, icache_req_ports(1, this)
|
, icache_req_ports(1, this)
|
||||||
, icache_rsp_ports(1, this)
|
, icache_rsp_ports(1, this)
|
||||||
, dcache_req_ports(NUM_LSU_LANES, this)
|
, dcache_req_ports(NUM_LSU_LANES, this)
|
||||||
, dcache_rsp_ports(NUM_LSU_LANES, this)
|
, dcache_rsp_ports(NUM_LSU_LANES, this)
|
||||||
, core_id_(core_id)
|
, core_id_(core_id)
|
||||||
|
, socket_(socket)
|
||||||
, arch_(arch)
|
, arch_(arch)
|
||||||
, dcrs_(dcrs)
|
, dcrs_(dcrs)
|
||||||
, decoder_(arch)
|
, decoder_(arch)
|
||||||
@@ -51,7 +56,6 @@ Core::Core(const SimContext& ctx, uint32_t core_id, Socket* socket, const Arch &
|
|||||||
, decode_latch_("decode")
|
, decode_latch_("decode")
|
||||||
, pending_icache_(arch_.num_warps())
|
, pending_icache_(arch_.num_warps())
|
||||||
, csrs_(arch.num_warps())
|
, csrs_(arch.num_warps())
|
||||||
, socket_(socket)
|
|
||||||
, commit_arbs_(ISSUE_WIDTH)
|
, commit_arbs_(ISSUE_WIDTH)
|
||||||
{
|
{
|
||||||
char sname[100];
|
char sname[100];
|
||||||
@@ -69,6 +73,7 @@ Core::Core(const SimContext& ctx, uint32_t core_id, Socket* socket, const Arch &
|
|||||||
}
|
}
|
||||||
|
|
||||||
// initialize shared memory
|
// initialize shared memory
|
||||||
|
snprintf(sname, 100, "core%d-shared_mem", core_id);
|
||||||
shared_mem_ = SharedMem::Create(sname, SharedMem::Config{
|
shared_mem_ = SharedMem::Create(sname, SharedMem::Config{
|
||||||
(1 << SMEM_LOG_SIZE),
|
(1 << SMEM_LOG_SIZE),
|
||||||
sizeof(Word),
|
sizeof(Word),
|
||||||
@@ -77,17 +82,17 @@ Core::Core(const SimContext& ctx, uint32_t core_id, Socket* socket, const Arch &
|
|||||||
false
|
false
|
||||||
});
|
});
|
||||||
for (uint32_t i = 0; i < NUM_LSU_LANES; ++i) {
|
for (uint32_t i = 0; i < NUM_LSU_LANES; ++i) {
|
||||||
snprintf(sname, 100, "smem_demux%d_%d", core_id, i);
|
snprintf(sname, 100, "core%d-smem_demux%d", core_id, i);
|
||||||
auto smem_demux = SMemDemux::Create(sname);
|
auto smem_demux = SMemDemux::Create(sname);
|
||||||
|
|
||||||
smem_demux->ReqDC.bind(&dcache_req_ports.at(i));
|
smem_demux->ReqDC.bind(&dcache_req_ports.at(i));
|
||||||
dcache_rsp_ports.at(i).bind(&smem_demux->RspDC);
|
dcache_rsp_ports.at(i).bind(&smem_demux->RspDC);
|
||||||
|
|
||||||
smem_demux->ReqSM.bind(&shared_mem_->Inputs.at(i));
|
smem_demux->ReqSM.bind(&shared_mem_->Inputs.at(i));
|
||||||
shared_mem_->Outputs.at(i).bind(&smem_demux->RspSM);
|
shared_mem_->Outputs.at(i).bind(&smem_demux->RspSM);
|
||||||
|
|
||||||
smem_demuxs_.at(i) = smem_demux;
|
smem_demuxs_.at(i) = smem_demux;
|
||||||
}
|
}
|
||||||
|
|
||||||
// initialize dispatchers
|
// initialize dispatchers
|
||||||
dispatchers_.at((int)ExeType::ALU) = SimPlatform::instance().create_object<Dispatcher>(arch, 2, NUM_ALU_BLOCKS, NUM_ALU_LANES);
|
dispatchers_.at((int)ExeType::ALU) = SimPlatform::instance().create_object<Dispatcher>(arch, 2, NUM_ALU_BLOCKS, NUM_ALU_LANES);
|
||||||
@@ -103,7 +108,7 @@ Core::Core(const SimContext& ctx, uint32_t core_id, Socket* socket, const Arch &
|
|||||||
|
|
||||||
// bind commit arbiters
|
// bind commit arbiters
|
||||||
for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
|
for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
|
||||||
snprintf(sname, 100, "commit-arb%d", i);
|
snprintf(sname, 100, "core%d-commit-arb%d", core_id, i);
|
||||||
auto arbiter = TraceSwitch::Create(sname, ArbiterType::RoundRobin, (uint32_t)ExeType::ExeTypeCount, 1);
|
auto arbiter = TraceSwitch::Create(sname, ArbiterType::RoundRobin, (uint32_t)ExeType::ExeTypeCount, 1);
|
||||||
for (uint32_t j = 0; j < (uint32_t)ExeType::ExeTypeCount; ++j) {
|
for (uint32_t j = 0; j < (uint32_t)ExeType::ExeTypeCount; ++j) {
|
||||||
exe_units_.at(j)->Outputs.at(i).bind(&arbiter->Inputs.at(j));
|
exe_units_.at(j)->Outputs.at(i).bind(&arbiter->Inputs.at(j));
|
||||||
@@ -184,7 +189,7 @@ void Core::schedule() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (scheduled_warp == -1) {
|
if (scheduled_warp == -1) {
|
||||||
++perf_stats_.sched_idles;
|
++perf_stats_.sched_idle;
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -311,7 +316,21 @@ void Core::issue() {
|
|||||||
case ExeType::ALU: ++perf_stats_.scrb_alu; break;
|
case ExeType::ALU: ++perf_stats_.scrb_alu; break;
|
||||||
case ExeType::FPU: ++perf_stats_.scrb_fpu; break;
|
case ExeType::FPU: ++perf_stats_.scrb_fpu; break;
|
||||||
case ExeType::LSU: ++perf_stats_.scrb_lsu; break;
|
case ExeType::LSU: ++perf_stats_.scrb_lsu; break;
|
||||||
case ExeType::SFU: ++perf_stats_.scrb_sfu; break;
|
case ExeType::SFU: {
|
||||||
|
++perf_stats_.scrb_sfu;
|
||||||
|
switch (use.sfu_type) {
|
||||||
|
case SfuType::TMC:
|
||||||
|
case SfuType::WSPAWN:
|
||||||
|
case SfuType::SPLIT:
|
||||||
|
case SfuType::JOIN:
|
||||||
|
case SfuType::BAR:
|
||||||
|
case SfuType::PRED: ++perf_stats_.scrb_wctl; break;
|
||||||
|
case SfuType::CSRRW:
|
||||||
|
case SfuType::CSRRS:
|
||||||
|
case SfuType::CSRRC: ++perf_stats_.scrb_csrs; break;
|
||||||
|
default: assert(false);
|
||||||
|
}
|
||||||
|
} break;
|
||||||
default: assert(false);
|
default: assert(false);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -356,7 +375,6 @@ void Core::commit() {
|
|||||||
auto& commit_arb = commit_arbs_.at(i);
|
auto& commit_arb = commit_arbs_.at(i);
|
||||||
if (commit_arb->Outputs.at(0).empty())
|
if (commit_arb->Outputs.at(0).empty())
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
auto trace = commit_arb->Outputs.at(0).front();
|
auto trace = commit_arb->Outputs.at(0).front();
|
||||||
|
|
||||||
// advance to commit stage
|
// advance to commit stage
|
||||||
@@ -558,8 +576,8 @@ uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
|
|||||||
break;
|
break;
|
||||||
case VX_DCR_MPM_CLASS_CORE: {
|
case VX_DCR_MPM_CLASS_CORE: {
|
||||||
switch (addr) {
|
switch (addr) {
|
||||||
case VX_CSR_MPM_SCHED_ID: return perf_stats_.sched_idles & 0xffffffff;
|
case VX_CSR_MPM_SCHED_ID: return perf_stats_.sched_idle & 0xffffffff;
|
||||||
case VX_CSR_MPM_SCHED_ID_H:return perf_stats_.sched_idles >> 32;
|
case VX_CSR_MPM_SCHED_ID_H:return perf_stats_.sched_idle >> 32;
|
||||||
case VX_CSR_MPM_SCHED_ST: return perf_stats_.sched_stalls & 0xffffffff;
|
case VX_CSR_MPM_SCHED_ST: return perf_stats_.sched_stalls & 0xffffffff;
|
||||||
case VX_CSR_MPM_SCHED_ST_H:return perf_stats_.sched_stalls >> 32;
|
case VX_CSR_MPM_SCHED_ST_H:return perf_stats_.sched_stalls >> 32;
|
||||||
case VX_CSR_MPM_IBUF_ST: return perf_stats_.ibuf_stalls & 0xffffffff;
|
case VX_CSR_MPM_IBUF_ST: return perf_stats_.ibuf_stalls & 0xffffffff;
|
||||||
@@ -574,6 +592,10 @@ uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
|
|||||||
case VX_CSR_MPM_SCRB_LSU_H:return perf_stats_.scrb_lsu >> 32;
|
case VX_CSR_MPM_SCRB_LSU_H:return perf_stats_.scrb_lsu >> 32;
|
||||||
case VX_CSR_MPM_SCRB_SFU: return perf_stats_.scrb_sfu & 0xffffffff;
|
case VX_CSR_MPM_SCRB_SFU: return perf_stats_.scrb_sfu & 0xffffffff;
|
||||||
case VX_CSR_MPM_SCRB_SFU_H:return perf_stats_.scrb_sfu >> 32;
|
case VX_CSR_MPM_SCRB_SFU_H:return perf_stats_.scrb_sfu >> 32;
|
||||||
|
case VX_CSR_MPM_SCRB_WCTL: return perf_stats_.scrb_wctl & 0xffffffff;
|
||||||
|
case VX_CSR_MPM_SCRB_WCTL_H: return perf_stats_.scrb_wctl >> 32;
|
||||||
|
case VX_CSR_MPM_SCRB_CSRS: return perf_stats_.scrb_csrs & 0xffffffff;
|
||||||
|
case VX_CSR_MPM_SCRB_CSRS_H: return perf_stats_.scrb_csrs >> 32;
|
||||||
case VX_CSR_MPM_IFETCHES: return perf_stats_.ifetches & 0xffffffff;
|
case VX_CSR_MPM_IFETCHES: return perf_stats_.ifetches & 0xffffffff;
|
||||||
case VX_CSR_MPM_IFETCHES_H: return perf_stats_.ifetches >> 32;
|
case VX_CSR_MPM_IFETCHES_H: return perf_stats_.ifetches >> 32;
|
||||||
case VX_CSR_MPM_LOADS: return perf_stats_.loads & 0xffffffff;
|
case VX_CSR_MPM_LOADS: return perf_stats_.loads & 0xffffffff;
|
||||||
@@ -588,6 +610,7 @@ uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
|
|||||||
} break;
|
} break;
|
||||||
case VX_DCR_MPM_CLASS_MEM: {
|
case VX_DCR_MPM_CLASS_MEM: {
|
||||||
auto proc_perf = socket_->cluster()->processor()->perf_stats();
|
auto proc_perf = socket_->cluster()->processor()->perf_stats();
|
||||||
|
auto cluster_perf = socket_->cluster()->perf_stats();
|
||||||
auto socket_perf = socket_->perf_stats();
|
auto socket_perf = socket_->perf_stats();
|
||||||
auto smem_perf = shared_mem_->perf_stats();
|
auto smem_perf = shared_mem_->perf_stats();
|
||||||
switch (addr) {
|
switch (addr) {
|
||||||
@@ -611,18 +634,18 @@ uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
|
|||||||
case VX_CSR_MPM_DCACHE_MSHR_ST: return socket_perf.dcache.mshr_stalls & 0xffffffff;
|
case VX_CSR_MPM_DCACHE_MSHR_ST: return socket_perf.dcache.mshr_stalls & 0xffffffff;
|
||||||
case VX_CSR_MPM_DCACHE_MSHR_ST_H: return socket_perf.dcache.mshr_stalls >> 32;
|
case VX_CSR_MPM_DCACHE_MSHR_ST_H: return socket_perf.dcache.mshr_stalls >> 32;
|
||||||
|
|
||||||
case VX_CSR_MPM_L2CACHE_READS: return proc_perf.clusters.l2cache.reads & 0xffffffff;
|
case VX_CSR_MPM_L2CACHE_READS: return cluster_perf.l2cache.reads & 0xffffffff;
|
||||||
case VX_CSR_MPM_L2CACHE_READS_H: return proc_perf.clusters.l2cache.reads >> 32;
|
case VX_CSR_MPM_L2CACHE_READS_H: return cluster_perf.l2cache.reads >> 32;
|
||||||
case VX_CSR_MPM_L2CACHE_WRITES: return proc_perf.clusters.l2cache.writes & 0xffffffff;
|
case VX_CSR_MPM_L2CACHE_WRITES: return cluster_perf.l2cache.writes & 0xffffffff;
|
||||||
case VX_CSR_MPM_L2CACHE_WRITES_H: return proc_perf.clusters.l2cache.writes >> 32;
|
case VX_CSR_MPM_L2CACHE_WRITES_H: return cluster_perf.l2cache.writes >> 32;
|
||||||
case VX_CSR_MPM_L2CACHE_MISS_R: return proc_perf.clusters.l2cache.read_misses & 0xffffffff;
|
case VX_CSR_MPM_L2CACHE_MISS_R: return cluster_perf.l2cache.read_misses & 0xffffffff;
|
||||||
case VX_CSR_MPM_L2CACHE_MISS_R_H: return proc_perf.clusters.l2cache.read_misses >> 32;
|
case VX_CSR_MPM_L2CACHE_MISS_R_H: return cluster_perf.l2cache.read_misses >> 32;
|
||||||
case VX_CSR_MPM_L2CACHE_MISS_W: return proc_perf.clusters.l2cache.write_misses & 0xffffffff;
|
case VX_CSR_MPM_L2CACHE_MISS_W: return cluster_perf.l2cache.write_misses & 0xffffffff;
|
||||||
case VX_CSR_MPM_L2CACHE_MISS_W_H: return proc_perf.clusters.l2cache.write_misses >> 32;
|
case VX_CSR_MPM_L2CACHE_MISS_W_H: return cluster_perf.l2cache.write_misses >> 32;
|
||||||
case VX_CSR_MPM_L2CACHE_BANK_ST: return proc_perf.clusters.l2cache.bank_stalls & 0xffffffff;
|
case VX_CSR_MPM_L2CACHE_BANK_ST: return cluster_perf.l2cache.bank_stalls & 0xffffffff;
|
||||||
case VX_CSR_MPM_L2CACHE_BANK_ST_H:return proc_perf.clusters.l2cache.bank_stalls >> 32;
|
case VX_CSR_MPM_L2CACHE_BANK_ST_H:return cluster_perf.l2cache.bank_stalls >> 32;
|
||||||
case VX_CSR_MPM_L2CACHE_MSHR_ST: return proc_perf.clusters.l2cache.mshr_stalls & 0xffffffff;
|
case VX_CSR_MPM_L2CACHE_MSHR_ST: return cluster_perf.l2cache.mshr_stalls & 0xffffffff;
|
||||||
case VX_CSR_MPM_L2CACHE_MSHR_ST_H:return proc_perf.clusters.l2cache.mshr_stalls >> 32;
|
case VX_CSR_MPM_L2CACHE_MSHR_ST_H:return cluster_perf.l2cache.mshr_stalls >> 32;
|
||||||
|
|
||||||
case VX_CSR_MPM_L3CACHE_READS: return proc_perf.l3cache.reads & 0xffffffff;
|
case VX_CSR_MPM_L3CACHE_READS: return proc_perf.l3cache.reads & 0xffffffff;
|
||||||
case VX_CSR_MPM_L3CACHE_READS_H: return proc_perf.l3cache.reads >> 32;
|
case VX_CSR_MPM_L3CACHE_READS_H: return proc_perf.l3cache.reads >> 32;
|
||||||
@@ -652,6 +675,10 @@ uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
|
|||||||
case VX_CSR_MPM_SMEM_BANK_ST_H: return smem_perf.bank_stalls >> 32;
|
case VX_CSR_MPM_SMEM_BANK_ST_H: return smem_perf.bank_stalls >> 32;
|
||||||
}
|
}
|
||||||
} break;
|
} break;
|
||||||
|
default: {
|
||||||
|
std::cout << std::dec << "Error: invalid MPM CLASS: value=" << perf_class << std::endl;
|
||||||
|
std::abort();
|
||||||
|
} break;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
std::cout << std::hex << "Error: invalid CSR read addr=0x" << addr << std::endl;
|
std::cout << std::hex << "Error: invalid CSR read addr=0x" << addr << std::endl;
|
||||||
|
|||||||
@@ -49,7 +49,7 @@ public:
|
|||||||
struct PerfStats {
|
struct PerfStats {
|
||||||
uint64_t cycles;
|
uint64_t cycles;
|
||||||
uint64_t instrs;
|
uint64_t instrs;
|
||||||
uint64_t sched_idles;
|
uint64_t sched_idle;
|
||||||
uint64_t sched_stalls;
|
uint64_t sched_stalls;
|
||||||
uint64_t ibuf_stalls;
|
uint64_t ibuf_stalls;
|
||||||
uint64_t scrb_stalls;
|
uint64_t scrb_stalls;
|
||||||
@@ -57,6 +57,8 @@ public:
|
|||||||
uint64_t scrb_fpu;
|
uint64_t scrb_fpu;
|
||||||
uint64_t scrb_lsu;
|
uint64_t scrb_lsu;
|
||||||
uint64_t scrb_sfu;
|
uint64_t scrb_sfu;
|
||||||
|
uint64_t scrb_wctl;
|
||||||
|
uint64_t scrb_csrs;
|
||||||
uint64_t ifetches;
|
uint64_t ifetches;
|
||||||
uint64_t loads;
|
uint64_t loads;
|
||||||
uint64_t stores;
|
uint64_t stores;
|
||||||
@@ -66,7 +68,7 @@ public:
|
|||||||
PerfStats()
|
PerfStats()
|
||||||
: cycles(0)
|
: cycles(0)
|
||||||
, instrs(0)
|
, instrs(0)
|
||||||
, sched_idles(0)
|
, sched_idle(0)
|
||||||
, sched_stalls(0)
|
, sched_stalls(0)
|
||||||
, ibuf_stalls(0)
|
, ibuf_stalls(0)
|
||||||
, scrb_stalls(0)
|
, scrb_stalls(0)
|
||||||
@@ -74,6 +76,8 @@ public:
|
|||||||
, scrb_fpu(0)
|
, scrb_fpu(0)
|
||||||
, scrb_lsu(0)
|
, scrb_lsu(0)
|
||||||
, scrb_sfu(0)
|
, scrb_sfu(0)
|
||||||
|
, scrb_wctl(0)
|
||||||
|
, scrb_csrs(0)
|
||||||
, ifetches(0)
|
, ifetches(0)
|
||||||
, loads(0)
|
, loads(0)
|
||||||
, stores(0)
|
, stores(0)
|
||||||
@@ -88,7 +92,11 @@ public:
|
|||||||
std::vector<SimPort<MemReq>> dcache_req_ports;
|
std::vector<SimPort<MemReq>> dcache_req_ports;
|
||||||
std::vector<SimPort<MemRsp>> dcache_rsp_ports;
|
std::vector<SimPort<MemRsp>> dcache_rsp_ports;
|
||||||
|
|
||||||
Core(const SimContext& ctx, uint32_t core_id, Socket* socket, const Arch &arch, const DCRS &dcrs);
|
Core(const SimContext& ctx,
|
||||||
|
uint32_t core_id,
|
||||||
|
Socket* socket,
|
||||||
|
const Arch &arch,
|
||||||
|
const DCRS &dcrs);
|
||||||
|
|
||||||
~Core();
|
~Core();
|
||||||
|
|
||||||
@@ -158,6 +166,7 @@ private:
|
|||||||
void cout_flush();
|
void cout_flush();
|
||||||
|
|
||||||
uint32_t core_id_;
|
uint32_t core_id_;
|
||||||
|
Socket* socket_;
|
||||||
const Arch& arch_;
|
const Arch& arch_;
|
||||||
const DCRS &dcrs_;
|
const DCRS &dcrs_;
|
||||||
|
|
||||||
@@ -193,10 +202,9 @@ private:
|
|||||||
|
|
||||||
PerfStats perf_stats_;
|
PerfStats perf_stats_;
|
||||||
|
|
||||||
Socket* socket_;
|
|
||||||
|
|
||||||
std::vector<TraceSwitch::Ptr> commit_arbs_;
|
std::vector<TraceSwitch::Ptr> commit_arbs_;
|
||||||
|
|
||||||
|
uint32_t commit_exe_;
|
||||||
uint32_t ibuffer_idx_;
|
uint32_t ibuffer_idx_;
|
||||||
|
|
||||||
friend class Warp;
|
friend class Warp;
|
||||||
|
|||||||
@@ -113,6 +113,7 @@ void ProcessorImpl::reset() {
|
|||||||
perf_mem_writes_ = 0;
|
perf_mem_writes_ = 0;
|
||||||
perf_mem_latency_ = 0;
|
perf_mem_latency_ = 0;
|
||||||
perf_mem_pending_reads_ = 0;
|
perf_mem_pending_reads_ = 0;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void ProcessorImpl::write_dcr(uint32_t addr, uint32_t value) {
|
void ProcessorImpl::write_dcr(uint32_t addr, uint32_t value) {
|
||||||
@@ -125,9 +126,6 @@ ProcessorImpl::PerfStats ProcessorImpl::perf_stats() const {
|
|||||||
perf.mem_writes = perf_mem_writes_;
|
perf.mem_writes = perf_mem_writes_;
|
||||||
perf.mem_latency = perf_mem_latency_;
|
perf.mem_latency = perf_mem_latency_;
|
||||||
perf.l3cache = l3cache_->perf_stats();
|
perf.l3cache = l3cache_->perf_stats();
|
||||||
for (auto cluster : clusters_) {
|
|
||||||
perf.clusters += cluster->perf_stats();
|
|
||||||
}
|
|
||||||
return perf;
|
return perf;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -24,17 +24,10 @@ namespace vortex {
|
|||||||
class ProcessorImpl {
|
class ProcessorImpl {
|
||||||
public:
|
public:
|
||||||
struct PerfStats {
|
struct PerfStats {
|
||||||
|
CacheSim::PerfStats l3cache;
|
||||||
uint64_t mem_reads;
|
uint64_t mem_reads;
|
||||||
uint64_t mem_writes;
|
uint64_t mem_writes;
|
||||||
uint64_t mem_latency;
|
uint64_t mem_latency;
|
||||||
CacheSim::PerfStats l3cache;
|
|
||||||
Cluster::PerfStats clusters;
|
|
||||||
|
|
||||||
PerfStats()
|
|
||||||
: mem_reads(0)
|
|
||||||
, mem_writes(0)
|
|
||||||
, mem_latency(0)
|
|
||||||
{}
|
|
||||||
};
|
};
|
||||||
|
|
||||||
ProcessorImpl(const Arch& arch);
|
ProcessorImpl(const Arch& arch);
|
||||||
@@ -46,7 +39,7 @@ public:
|
|||||||
|
|
||||||
void write_dcr(uint32_t addr, uint32_t value);
|
void write_dcr(uint32_t addr, uint32_t value);
|
||||||
|
|
||||||
ProcessorImpl::PerfStats perf_stats() const;
|
PerfStats perf_stats() const;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
|
||||||
@@ -55,7 +48,7 @@ private:
|
|||||||
const Arch& arch_;
|
const Arch& arch_;
|
||||||
std::vector<std::shared_ptr<Cluster>> clusters_;
|
std::vector<std::shared_ptr<Cluster>> clusters_;
|
||||||
DCRS dcrs_;
|
DCRS dcrs_;
|
||||||
MemSim::Ptr memsim_;
|
MemSim::Ptr memsim_;
|
||||||
CacheSim::Ptr l3cache_;
|
CacheSim::Ptr l3cache_;
|
||||||
uint64_t perf_mem_reads_;
|
uint64_t perf_mem_reads_;
|
||||||
uint64_t perf_mem_writes_;
|
uint64_t perf_mem_writes_;
|
||||||
|
|||||||
@@ -25,6 +25,7 @@ public:
|
|||||||
RegType reg_type;
|
RegType reg_type;
|
||||||
uint32_t reg_id;
|
uint32_t reg_id;
|
||||||
ExeType exe_type;
|
ExeType exe_type;
|
||||||
|
SfuType sfu_type;
|
||||||
uint64_t uuid;
|
uint64_t uuid;
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -62,7 +63,7 @@ public:
|
|||||||
if (used_iregs.test(r)) {
|
if (used_iregs.test(r)) {
|
||||||
uint32_t tag = (r << 16) | (trace->wid << 4) | (int)RegType::Integer;
|
uint32_t tag = (r << 16) | (trace->wid << 4) | (int)RegType::Integer;
|
||||||
auto owner = owners_.at(tag);
|
auto owner = owners_.at(tag);
|
||||||
out.push_back({RegType::Integer, r, owner->exe_type, owner->uuid});
|
out.push_back({RegType::Integer, r, owner->exe_type, owner->sfu_type, owner->uuid});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -70,7 +71,7 @@ public:
|
|||||||
if (used_fregs.test(r)) {
|
if (used_fregs.test(r)) {
|
||||||
uint32_t tag = (r << 16) | (trace->wid << 4) | (int)RegType::Float;
|
uint32_t tag = (r << 16) | (trace->wid << 4) | (int)RegType::Float;
|
||||||
auto owner = owners_.at(tag);
|
auto owner = owners_.at(tag);
|
||||||
out.push_back({RegType::Float, r, owner->exe_type, owner->uuid});
|
out.push_back({RegType::Float, r, owner->exe_type, owner->sfu_type, owner->uuid});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -78,7 +79,7 @@ public:
|
|||||||
if (used_vregs.test(r)) {
|
if (used_vregs.test(r)) {
|
||||||
uint32_t tag = (r << 16) | (trace->wid << 4) | (int)RegType::Vector;
|
uint32_t tag = (r << 16) | (trace->wid << 4) | (int)RegType::Vector;
|
||||||
auto owner = owners_.at(tag);
|
auto owner = owners_.at(tag);
|
||||||
out.push_back({RegType::Vector, r, owner->exe_type, owner->uuid});
|
out.push_back({RegType::Vector, r, owner->exe_type, owner->sfu_type, owner->uuid});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -19,16 +19,16 @@ using namespace vortex;
|
|||||||
Socket::Socket(const SimContext& ctx,
|
Socket::Socket(const SimContext& ctx,
|
||||||
uint32_t socket_id,
|
uint32_t socket_id,
|
||||||
Cluster* cluster,
|
Cluster* cluster,
|
||||||
const Arch &arch, const
|
const Arch &arch,
|
||||||
DCRS &dcrs)
|
const DCRS &dcrs)
|
||||||
: SimObject(ctx, "socket")
|
: SimObject(ctx, "socket")
|
||||||
, icache_mem_req_port(this)
|
, icache_mem_req_port(this)
|
||||||
, icache_mem_rsp_port(this)
|
, icache_mem_rsp_port(this)
|
||||||
, dcache_mem_req_port(this)
|
, dcache_mem_req_port(this)
|
||||||
, dcache_mem_rsp_port(this)
|
, dcache_mem_rsp_port(this)
|
||||||
, socket_id_(socket_id)
|
, socket_id_(socket_id)
|
||||||
, cores_(arch.socket_size())
|
|
||||||
, cluster_(cluster)
|
, cluster_(cluster)
|
||||||
|
, cores_(arch.socket_size())
|
||||||
{
|
{
|
||||||
auto cores_per_socket = cores_.size();
|
auto cores_per_socket = cores_.size();
|
||||||
|
|
||||||
@@ -77,7 +77,10 @@ Socket::Socket(const SimContext& ctx,
|
|||||||
|
|
||||||
for (uint32_t i = 0; i < cores_per_socket; ++i) {
|
for (uint32_t i = 0; i < cores_per_socket; ++i) {
|
||||||
uint32_t core_id = socket_id * cores_per_socket + i;
|
uint32_t core_id = socket_id * cores_per_socket + i;
|
||||||
cores_.at(i) = Core::Create(core_id, this, arch, dcrs);
|
cores_.at(i) = Core::Create(core_id,
|
||||||
|
this,
|
||||||
|
arch,
|
||||||
|
dcrs);
|
||||||
|
|
||||||
cores_.at(i)->icache_req_ports.at(0).bind(&icaches_->CoreReqPorts.at(i).at(0));
|
cores_.at(i)->icache_req_ports.at(0).bind(&icaches_->CoreReqPorts.at(i).at(0));
|
||||||
icaches_->CoreRspPorts.at(i).at(0).bind(&cores_.at(i)->icache_rsp_ports.at(0));
|
icaches_->CoreRspPorts.at(i).at(0).bind(&cores_.at(i)->icache_rsp_ports.at(0));
|
||||||
@@ -139,8 +142,8 @@ void Socket::resume(uint32_t core_index) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
Socket::PerfStats Socket::perf_stats() const {
|
Socket::PerfStats Socket::perf_stats() const {
|
||||||
Socket::PerfStats perf;
|
PerfStats perf_stats;
|
||||||
perf.icache = icaches_->perf_stats();
|
perf_stats.icache = icaches_->perf_stats();
|
||||||
perf.dcache = dcaches_->perf_stats();
|
perf_stats.dcache = dcaches_->perf_stats();
|
||||||
return perf;
|
return perf_stats;
|
||||||
}
|
}
|
||||||
@@ -30,12 +30,6 @@ public:
|
|||||||
struct PerfStats {
|
struct PerfStats {
|
||||||
CacheSim::PerfStats icache;
|
CacheSim::PerfStats icache;
|
||||||
CacheSim::PerfStats dcache;
|
CacheSim::PerfStats dcache;
|
||||||
|
|
||||||
PerfStats& operator+=(const PerfStats& rhs) {
|
|
||||||
this->icache += rhs.icache;
|
|
||||||
this->dcache += rhs.dcache;
|
|
||||||
return *this;
|
|
||||||
}
|
|
||||||
};
|
};
|
||||||
|
|
||||||
SimPort<MemReq> icache_mem_req_port;
|
SimPort<MemReq> icache_mem_req_port;
|
||||||
@@ -74,14 +68,14 @@ public:
|
|||||||
|
|
||||||
void resume(uint32_t core_id);
|
void resume(uint32_t core_id);
|
||||||
|
|
||||||
Socket::PerfStats perf_stats() const;
|
PerfStats perf_stats() const;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
uint32_t socket_id_;
|
uint32_t socket_id_;
|
||||||
|
Cluster* cluster_;
|
||||||
std::vector<Core::Ptr> cores_;
|
std::vector<Core::Ptr> cores_;
|
||||||
CacheCluster::Ptr icaches_;
|
CacheCluster::Ptr icaches_;
|
||||||
CacheCluster::Ptr dcaches_;
|
CacheCluster::Ptr dcaches_;
|
||||||
Cluster* cluster_;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
} // namespace vortex
|
} // namespace vortex
|
||||||
@@ -15,10 +15,10 @@ all:
|
|||||||
$(MAKE) -C blackscholes
|
$(MAKE) -C blackscholes
|
||||||
$(MAKE) -C transpose
|
$(MAKE) -C transpose
|
||||||
$(MAKE) -C convolution
|
$(MAKE) -C convolution
|
||||||
# $(MAKE) -C cutcp
|
$(MAKE) -C cutcp
|
||||||
# $(MAKE) -C sgemm2
|
$(MAKE) -C sgemm2
|
||||||
# $(MAKE) -C vectorhypot
|
$(MAKE) -C vectorhypot
|
||||||
# $(MAKE) -C mri-q run-simx
|
$(MAKE) -C mri-q run-simx
|
||||||
|
|
||||||
run-simx:
|
run-simx:
|
||||||
$(MAKE) -C vecadd run-simx
|
$(MAKE) -C vecadd run-simx
|
||||||
@@ -125,7 +125,7 @@ clean-all:
|
|||||||
$(MAKE) -C oclprintf clean-all
|
$(MAKE) -C oclprintf clean-all
|
||||||
$(MAKE) -C blackscholes clean-all
|
$(MAKE) -C blackscholes clean-all
|
||||||
$(MAKE) -C convolution clean-all
|
$(MAKE) -C convolution clean-all
|
||||||
# $(MAKE) -C cutcp clean-all
|
$(MAKE) -C cutcp clean-all
|
||||||
# $(MAKE) -C sgemm2 clean-all
|
$(MAKE) -C sgemm2 clean-all
|
||||||
# $(MAKE) -C vectorhypot clean-all
|
$(MAKE) -C vectorhypot clean-all
|
||||||
# $(MAKE) -C mri-q clean-all
|
$(MAKE) -C mri-q clean-all
|
||||||
|
|||||||
Reference in New Issue
Block a user