diff --git a/Makefile b/Makefile index 410dc008..58c66a48 100644 --- a/Makefile +++ b/Makefile @@ -6,6 +6,11 @@ all: $(MAKE) -C simX $(MAKE) -C benchmarks/opencl +perf-demo: + $(MAKE) -C hw + $(MAKE) -C driver rtlsim + $(MAKE) -C driver/tests/demo/ run-rtlsim + clean: $(MAKE) -C hw clean $(MAKE) -C driver clean diff --git a/ci/blackbox.sh b/ci/blackbox.sh index ef7a8c39..4bc40722 100755 --- a/ci/blackbox.sh +++ b/ci/blackbox.sh @@ -6,7 +6,7 @@ set -e show_usage() { echo "Vortex BlackBox Test Driver v1.0" - echo "Usage: [[--clusters=#n] [--cores=#n] [--warps=#n] [--threads=#n] [--l2cache] [--l3cache] [[--driver=rtlsim|vlsim] [--debug] [--scope] [--app=vecadd|sgemm|basic|demo|dogfood] [--args=] [--help]]" + echo "Usage: [[--clusters=#n] [--cores=#n] [--warps=#n] [--threads=#n] [--l2cache] [--l3cache] [[--driver=rtlsim|vlsim] [--debug] [--scope] [--perf] [--app=vecadd|sgemm|basic|demo|dogfood] [--args=] [--help]]" } DRIVER=vlsim @@ -64,6 +64,10 @@ case $i in SCOPE=1 shift ;; + --perf) + PERF=-DPERF_ENABLE + shift + ;; --args=*) ARGS=${i#*=} HAS_ARGS=1 @@ -125,7 +129,7 @@ case $APP in ;; esac -CONFIGS="-DNUM_CLUSTERS=$CLUSTERS -DNUM_CORES=$CORES -DNUM_WARPS=$WARPS -DNUM_THREADS=$THREADS -DL2_ENABLE=$L2 -DL3_ENABLE=$L3" +CONFIGS="-DNUM_CLUSTERS=$CLUSTERS -DNUM_CORES=$CORES -DNUM_WARPS=$WARPS -DNUM_THREADS=$THREADS -DL2_ENABLE=$L2 -DL3_ENABLE=$L3 $PERF" echo "CONFIGS=$CONFIGS" diff --git a/driver/common/vx_utils.cpp b/driver/common/vx_utils.cpp index dbe1d3e1..174b601b 100644 --- a/driver/common/vx_utils.cpp +++ b/driver/common/vx_utils.cpp @@ -91,24 +91,240 @@ extern int vx_upload_kernel_file(vx_device_h device, const char* filename) { return err; } -extern int vx_get_perf(vx_device_h device, int core_id, size_t* instrs, size_t* cycles) { +int vx_csr_get_l(vx_device_h device, int core_id, int addr, int addr_h, uint64_t* value) { + int ret = 0; + unsigned value_lo, value_hi; + ret |= vx_csr_get(device, core_id, addr, &value_lo); + ret |= vx_csr_get(device, core_id, 
addr_h, &value_hi); + *value = (uint64_t(value_hi) << 32) | value_lo; + return ret; +} + +extern int vx_dump_perf(vx_device_h device, FILE* stream) { int ret = 0; - unsigned value; - - if (instrs) { - ret |= vx_csr_get(device, core_id, CSR_INSTRET_H, &value); - *instrs = value; - ret |= vx_csr_get(device, core_id, CSR_INSTRET, &value); - *instrs = (*instrs << 32) | value; - } + unsigned num_cores; + vx_csr_get(device, 0, CSR_NC, &num_cores); - if (cycles) { - ret |= vx_csr_get(device, core_id, CSR_CYCLE_H, &value); - *cycles = value; - ret |= vx_csr_get(device, core_id, CSR_CYCLE, &value); - *cycles = (*cycles << 32) | value; - } + uint64_t instrs = 0; + uint64_t cycles = 0; +#ifdef PERF_ENABLE + // PERF: pipeline stalls + uint64_t lsu_stalls = 0; + uint64_t fpu_stalls = 0; + uint64_t mul_stalls = 0; + uint64_t csr_stalls = 0; + uint64_t alu_stalls = 0; + uint64_t gpu_stalls = 0; + uint64_t ibuffer_stalls = 0; + uint64_t scoreboard_stalls = 0; + uint64_t icache_stalls = 0; + // PERF: Icache + uint64_t icache_reads = 0; + uint64_t icache_read_misses = 0; + uint64_t icache_pipe_stalls = 0; + uint64_t icache_dram_stalls = 0; + uint64_t icache_mshr_stalls = 0; + uint64_t icache_rsp_stalls = 0; + // PERF: Dcache + uint64_t dcache_reads = 0; + uint64_t dcache_writes = 0; + uint64_t dcache_read_misses = 0; + uint64_t dcache_write_misses = 0; + uint64_t dcache_pipe_stalls = 0; + uint64_t dcache_dram_stalls = 0; + uint64_t dcache_mshr_stalls = 0; + uint64_t dcache_rsp_stalls = 0; + uint64_t dcache_evictions = 0; + // PERF: memory + uint64_t dram_req = 0; + uint64_t dram_rsp = 0; + uint64_t dram_lat = 0; +#endif + + for (unsigned core_id = 0; core_id < num_cores; ++core_id) { + + uint64_t instrs_per_core, cycles_per_core; + ret |= vx_csr_get_l(device, core_id, CSR_MINSTRET, CSR_MINSTRET_H, &instrs_per_core); + ret |= vx_csr_get_l(device, core_id, CSR_MCYCLE, CSR_MCYCLE_H, &cycles_per_core); + float IPC = (float)(double(instrs_per_core) / double(cycles_per_core)); + if 
(num_cores > 1) fprintf(stream, "PERF: core%d: instrs=%ld, cycles=%ld, IPC=%f\n", core_id, instrs_per_core, cycles_per_core, IPC); + instrs += instrs_per_core; + cycles = std::max(cycles_per_core, cycles); + + #ifdef PERF_ENABLE + // PERF: pipeline + // icache_stall + uint64_t icache_stalls_per_core; + ret |= vx_csr_get_l(device, core_id, CSR_MPM_ICACHE_ST, CSR_MPM_ICACHE_ST_H, &icache_stalls_per_core); + if (num_cores > 1) fprintf(stream, "PERF: core%d: icache stalls=%ld\n", core_id, icache_stalls_per_core); + icache_stalls += icache_stalls_per_core; + // ibuffer_stall + uint64_t ibuffer_stalls_per_core; + ret |= vx_csr_get_l(device, core_id, CSR_MPM_IBUF_ST, CSR_MPM_IBUF_ST_H, &ibuffer_stalls_per_core); + if (num_cores > 1) fprintf(stream, "PERF: core%d: ibuffer stalls=%ld\n", core_id, ibuffer_stalls_per_core); + ibuffer_stalls += ibuffer_stalls_per_core; + // scoreboard_stall + uint64_t scoreboard_stalls_per_core; + ret |= vx_csr_get_l(device, core_id, CSR_MPM_SCRB_ST, CSR_MPM_SCRB_ST_H, &scoreboard_stalls_per_core); + if (num_cores > 1) fprintf(stream, "PERF: core%d: scoreboard stalls=%ld\n", core_id, scoreboard_stalls_per_core); + scoreboard_stalls += scoreboard_stalls_per_core; + // alu_stall + uint64_t alu_stalls_per_core; + ret |= vx_csr_get_l(device, core_id, CSR_MPM_ALU_ST, CSR_MPM_ALU_ST_H, &alu_stalls_per_core); + if (num_cores > 1) fprintf(stream, "PERF: core%d: alu stalls=%ld\n", core_id, alu_stalls_per_core); + alu_stalls += alu_stalls_per_core; + // lsu_stall + uint64_t lsu_stalls_per_core; + ret |= vx_csr_get_l(device, core_id, CSR_MPM_LSU_ST, CSR_MPM_LSU_ST_H, &lsu_stalls_per_core); + if (num_cores > 1) fprintf(stream, "PERF: core%d: lsu stalls=%ld\n", core_id, lsu_stalls_per_core); + lsu_stalls += lsu_stalls_per_core; + // csr_stall + uint64_t csr_stalls_per_core; + ret |= vx_csr_get_l(device, core_id, CSR_MPM_CSR_ST, CSR_MPM_CSR_ST_H, &csr_stalls_per_core); + if (num_cores > 1) fprintf(stream, "PERF: core%d: csr stalls=%ld\n", core_id, 
csr_stalls_per_core); + csr_stalls += csr_stalls_per_core; + // mul_stall + uint64_t mul_stalls_per_core; + ret |= vx_csr_get_l(device, core_id, CSR_MPM_MUL_ST, CSR_MPM_MUL_ST_H, &mul_stalls_per_core); + if (num_cores > 1) fprintf(stream, "PERF: core%d: mul stalls=%ld\n", core_id, mul_stalls_per_core); + mul_stalls += mul_stalls_per_core; + // fpu_stall + uint64_t fpu_stalls_per_core; + ret |= vx_csr_get_l(device, core_id, CSR_MPM_FPU_ST, CSR_MPM_FPU_ST_H, &fpu_stalls_per_core); + if (num_cores > 1) fprintf(stream, "PERF: core%d: fpu stalls=%ld\n", core_id, fpu_stalls_per_core); + fpu_stalls += fpu_stalls_per_core; + // gpu_stall + uint64_t gpu_stalls_per_core; + ret |= vx_csr_get_l(device, core_id, CSR_MPM_GPU_ST, CSR_MPM_GPU_ST_H, &gpu_stalls_per_core); + if (num_cores > 1) fprintf(stream, "PERF: core%d: gpu stalls=%ld\n", core_id, gpu_stalls_per_core); + gpu_stalls += gpu_stalls_per_core; + + // PERF: Icache + // total reads + uint64_t icache_reads_per_core; + ret |= vx_csr_get_l(device, core_id, CSR_MPM_ICACHE_READS, CSR_MPM_ICACHE_READS_H, &icache_reads_per_core); + if (num_cores > 1) fprintf(stream, "PERF: core%d: icache reads=%ld\n", core_id, icache_reads_per_core); + icache_reads += icache_reads_per_core; + // read misses + uint64_t icache_miss_r_per_core; + ret |= vx_csr_get_l(device, core_id, CSR_MPM_ICACHE_MISS_R, CSR_MPM_ICACHE_MISS_R_H, &icache_miss_r_per_core); + if (num_cores > 1) fprintf(stream, "PERF: core%d: icache read misses=%ld\n", core_id, icache_miss_r_per_core); + icache_read_misses += icache_miss_r_per_core; + // pipeline stalls + uint64_t icache_pipe_st_per_core; + ret |= vx_csr_get_l(device, core_id, CSR_MPM_ICACHE_PIPE_ST, CSR_MPM_ICACHE_PIPE_ST_H, &icache_pipe_st_per_core); + if (num_cores > 1) fprintf(stream, "PERF: core%d: icache pipeline stalls=%ld\n", core_id, icache_pipe_st_per_core); + icache_pipe_stalls += icache_pipe_st_per_core; + // response stalls + uint64_t icache_crsp_st_per_core; + ret |= vx_csr_get_l(device, core_id, 
CSR_MPM_ICACHE_CRSP_ST, CSR_MPM_ICACHE_CRSP_ST_H, &icache_crsp_st_per_core); + if (num_cores > 1) fprintf(stream, "PERF: core%d: icache reponse stalls=%ld\n", core_id, icache_crsp_st_per_core); + icache_rsp_stalls += icache_crsp_st_per_core; + // dram_stalls + uint64_t icache_dram_st_per_core; + ret |= vx_csr_get_l(device, core_id, CSR_MPM_ICACHE_DREQ_ST, CSR_MPM_ICACHE_DREQ_ST_H, &icache_dram_st_per_core); + if (num_cores > 1) fprintf(stream, "PERF: core%d: icache dram stalls=%ld\n", core_id, icache_dram_st_per_core); + icache_dram_stalls += icache_dram_st_per_core; + // mshr_stalls + uint64_t icache_mshr_st_per_core; + ret |= vx_csr_get_l(device, core_id, CSR_MPM_ICACHE_MSHR_ST, CSR_MPM_ICACHE_MSHR_ST_H, &icache_mshr_st_per_core); + if (num_cores > 1) fprintf(stream, "PERF: core%d: icache mshr stalls=%ld\n", core_id, icache_mshr_st_per_core); + icache_mshr_stalls += icache_mshr_st_per_core; + + // PERF: Dcache + // total reads + uint64_t dcache_reads_per_core; + ret |= vx_csr_get_l(device, core_id, CSR_MPM_DCACHE_READS, CSR_MPM_DCACHE_READS_H, &dcache_reads_per_core); + if (num_cores > 1) fprintf(stream, "PERF: core%d: dcache reads=%ld\n", core_id, dcache_reads_per_core); + dcache_reads += dcache_reads_per_core; + // total write + uint64_t dcache_writes_per_core; + ret |= vx_csr_get_l(device, core_id, CSR_MPM_DCACHE_WRITES, CSR_MPM_DCACHE_WRITES_H, &dcache_writes_per_core); + if (num_cores > 1) fprintf(stream, "PERF: core%d: dcache writes=%ld\n", core_id, dcache_writes_per_core); + dcache_writes += dcache_writes_per_core; + // read misses + uint64_t dcache_miss_r_per_core; + ret |= vx_csr_get_l(device, core_id, CSR_MPM_DCACHE_MISS_R, CSR_MPM_DCACHE_MISS_R_H, &dcache_miss_r_per_core); + if (num_cores > 1) fprintf(stream, "PERF: core%d: dcache read misses=%ld\n", core_id, dcache_miss_r_per_core); + dcache_read_misses += dcache_miss_r_per_core; + // read misses + uint64_t dcache_miss_w_per_core; + ret |= vx_csr_get_l(device, core_id, CSR_MPM_DCACHE_MISS_W, 
CSR_MPM_DCACHE_MISS_W_H, &dcache_miss_w_per_core); + if (num_cores > 1) fprintf(stream, "PERF: core%d: dcache wrire misses=%ld\n", core_id, dcache_miss_w_per_core); + dcache_write_misses += dcache_miss_w_per_core; + // total_evictions + uint64_t dcache_evictions_per_core; + ret |= vx_csr_get_l(device, core_id, CSR_MPM_DCACHE_EVICTS, CSR_MPM_DCACHE_EVICTS_H, &dcache_evictions_per_core); + if (num_cores > 1) fprintf(stream, "PERF: core%d: dcache evictions_per_core=%ld\n", core_id, dcache_evictions_per_core); + dcache_evictions += dcache_evictions_per_core; + // pipeline stalls + uint64_t dcache_pipe_st_per_core; + ret |= vx_csr_get_l(device, core_id, CSR_MPM_DCACHE_PIPE_ST, CSR_MPM_DCACHE_PIPE_ST_H, &dcache_pipe_st_per_core); + if (num_cores > 1) fprintf(stream, "PERF: core%d: dcache pipeline stalls=%ld\n", core_id, dcache_pipe_st_per_core); + dcache_pipe_stalls += dcache_pipe_st_per_core; + // response stalls + uint64_t dcache_crsp_st_per_core; + ret |= vx_csr_get_l(device, core_id, CSR_MPM_DCACHE_CRSP_ST, CSR_MPM_DCACHE_CRSP_ST_H, &dcache_crsp_st_per_core); + if (num_cores > 1) fprintf(stream, "PERF: core%d: dcache reponse stalls=%ld\n", core_id, dcache_crsp_st_per_core); + dcache_rsp_stalls += dcache_crsp_st_per_core; + // dram_stalls + uint64_t dcache_dram_st_per_core; + ret |= vx_csr_get_l(device, core_id, CSR_MPM_DCACHE_DREQ_ST, CSR_MPM_DCACHE_DREQ_ST_H, &dcache_dram_st_per_core); + if (num_cores > 1) fprintf(stream, "PERF: core%d: dcache dram stalls=%ld\n", core_id, dcache_dram_st_per_core); + dcache_dram_stalls += dcache_dram_st_per_core; + // mshr_stalls + uint64_t dcache_mshr_st_per_core; + ret |= vx_csr_get_l(device, core_id, CSR_MPM_DCACHE_MSHR_ST, CSR_MPM_DCACHE_MSHR_ST_H, &dcache_mshr_st_per_core); + if (num_cores > 1) fprintf(stream, "PERF: core%d: dcache mshr stalls=%ld\n", core_id, dcache_mshr_st_per_core); + dcache_mshr_stalls += dcache_mshr_st_per_core; + + // PERF: dram_latency + uint64_t dram_req_per_core, dram_rsp_per_core, dram_lat_per_core; + 
ret |= vx_csr_get_l(device, core_id, CSR_MPM_DRAM_REQ, CSR_MPM_DRAM_REQ_H, &dram_req_per_core); + ret |= vx_csr_get_l(device, core_id, CSR_MPM_DRAM_RSP, CSR_MPM_DRAM_RSP_H, &dram_rsp_per_core); + ret |= vx_csr_get_l(device, core_id, CSR_MPM_DRAM_LAT, CSR_MPM_DRAM_LAT_H, &dram_lat_per_core); + int avg_dram_lat_per_core = (int)(double(dram_lat_per_core) / double(dram_rsp_per_core)); + if (num_cores > 1) fprintf(stream, "PERF: core%d: dram requests=%ld (reads=%ld, writes=%ld)\n", core_id, dram_req_per_core, dram_rsp_per_core, dram_req_per_core - dram_rsp_per_core); + if (num_cores > 1) fprintf(stream, "PERF: core%d: average dram latency=%d cycles\n", core_id, avg_dram_lat_per_core); + dram_req += dram_req_per_core; + dram_rsp += dram_rsp_per_core; + dram_lat += dram_lat_per_core; + #endif + } + + float IPC = (float)(double(instrs) / double(cycles)); + fprintf(stream, "PERF: instrs=%ld, cycles=%ld, IPC=%f\n", instrs, cycles, IPC); + +#ifdef PERF_ENABLE + fprintf(stream, "PERF: icache stalls=%ld\n", icache_stalls); + fprintf(stream, "PERF: ibuffer stalls=%ld\n", ibuffer_stalls); + fprintf(stream, "PERF: scoreboard stalls=%ld\n", scoreboard_stalls); + fprintf(stream, "PERF: alu stalls=%ld\n", alu_stalls); + fprintf(stream, "PERF: lsu stalls=%ld\n", lsu_stalls); + fprintf(stream, "PERF: csr stalls=%ld\n", csr_stalls); + fprintf(stream, "PERF: mul stalls=%ld\n", mul_stalls); + fprintf(stream, "PERF: fpu stalls=%ld\n", fpu_stalls); + fprintf(stream, "PERF: gpu stalls=%ld\n", gpu_stalls); + fprintf(stream, "PERF: icache reads=%ld\n", icache_reads); + fprintf(stream, "PERF: icache read misses=%ld\n", icache_read_misses); + fprintf(stream, "PERF: icache reponse stalls=%ld\n", icache_rsp_stalls); + fprintf(stream, "PERF: icache pipeline stalls=%ld\n", icache_pipe_stalls); + fprintf(stream, "PERF: icache dram stalls=%ld\n", icache_dram_stalls); + fprintf(stream, "PERF: icache mshr stalls=%ld\n", icache_mshr_stalls); + fprintf(stream, "PERF: dcache reads=%ld\n", dcache_reads); + 
fprintf(stream, "PERF: dcache writes=%ld\n", dcache_writes); + fprintf(stream, "PERF: dcache read misses=%ld\n", dcache_read_misses); + fprintf(stream, "PERF: dcache wrire misses=%ld\n", dcache_write_misses); + fprintf(stream, "PERF: dcache evictions=%ld\n", dcache_evictions); + fprintf(stream, "PERF: dcache pipeline stalls=%ld\n", dcache_pipe_stalls); + fprintf(stream, "PERF: dcache reponse stalls=%ld\n", dcache_rsp_stalls); + fprintf(stream, "PERF: dcache dram stalls=%ld\n", dcache_dram_stalls); + fprintf(stream, "PERF: dcache mshr stalls=%ld\n", dcache_mshr_stalls); + fprintf(stream, "PERF: dram requests=%ld (reads=%ld, writes=%ld)\n", dram_req, dram_rsp, dram_req - dram_rsp); + int avg_dram_lat = (int)(double(dram_lat) / double(dram_rsp)); + fprintf(stream, "PERF: average dram latency=%d cycles\n", avg_dram_lat); +#endif return ret; } \ No newline at end of file diff --git a/driver/include/vortex.h b/driver/include/vortex.h index e115834d..e53e8431 100644 --- a/driver/include/vortex.h +++ b/driver/include/vortex.h @@ -2,6 +2,7 @@ #define __VX_DRIVER_H__ #include +#include #ifdef __cplusplus extern "C" { @@ -71,8 +72,8 @@ int vx_upload_kernel_bytes(vx_device_h device, const void* content, size_t size) // upload kernel file to device int vx_upload_kernel_file(vx_device_h device, const char* filename); -// get performance counters -int vx_get_perf(vx_device_h device, int core_id, size_t* instrs, size_t* cycles); +// dump performance counters +int vx_dump_perf(vx_device_h device, FILE* stream); #ifdef __cplusplus } diff --git a/driver/opae/Makefile b/driver/opae/Makefile index ac43161c..fb6471e8 100644 --- a/driver/opae/Makefile +++ b/driver/opae/Makefile @@ -58,6 +58,12 @@ ifdef SCOPE SCOPE_H = scope-defs.h endif +# Enable perf counters +ifdef PERF + CXXFLAGS += -DPERF_ENABLE + PERF_ENABLE = PERF=1 +endif + all: vlsim # AFU info from JSON file, including AFU UUID @@ -71,7 +77,7 @@ scope-defs.h: $(SCRIPT_DIR)/scope.json scope: scope-defs.h vlsim-hw: $(SCOPE_H) - 
$(SCOPE_ENABLE) $(MAKE) -C vlsim + $(SCOPE_ENABLE) $(PERF_ENABLE) $(MAKE) -C vlsim fpga: $(SRCS) $(SCOPE_H) $(CXX) $(CXXFLAGS) -DUSE_FPGA $^ $(LDFLAGS) $(FPGA_LIBS) -o $(PROJECT) @@ -94,7 +100,6 @@ $(ASE_DIR): clean: rm -rf $(PROJECT) $(PROJECT_ASE) $(PROJECT_VLSIM) *.o .depend $(MAKE) -C vlsim clean - $(MAKE) -C ase clean ifneq ($(MAKECMDGOALS),clean) -include .depend diff --git a/driver/opae/vlsim/Makefile b/driver/opae/vlsim/Makefile index 07e31069..709cc17d 100644 --- a/driver/opae/vlsim/Makefile +++ b/driver/opae/vlsim/Makefile @@ -43,8 +43,9 @@ RTL_DIR=../../../hw/rtl SRCS = fpga.cpp opae_sim.cpp SRCS += $(RTL_DIR)/fp_cores/svdpi/float_dpi.cpp -FPU_INCLUDE = -I$(RTL_DIR)/fp_cores -I$(RTL_DIR)/fp_cores/svdpi -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src -I$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl -I$(RTL_DIR)/fp_cores/fpnew/src +FPU_INCLUDE = -I$(RTL_DIR)/fp_cores -I$(RTL_DIR)/fp_cores/svdpi -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src -I$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl -I$(RTL_DIR)/fp_cores/fpnew/src RTL_INCLUDE = -I$(RTL_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/cache $(FPU_INCLUDE) +RTL_INCLUDE += -I$(RTL_DIR)/afu -I$(RTL_DIR)/afu/ccip VL_FLAGS += -O2 --language 1800-2009 --assert -Wall -Wpedantic $(CONFIGS) VL_FLAGS += -Wno-DECLFILENAME @@ -70,6 +71,12 @@ ifdef SCOPE CFLAGS += -DSCOPE endif +# Enable perf counters +ifdef PERF + VL_FLAGS += -DPERF_ENABLE + CFLAGS += -DPERF_ENABLE +endif + # use our OPAE shim VL_FLAGS += -DNOPAE CFLAGS += -DNOPAE @@ -77,8 +84,6 @@ CFLAGS += -DNOPAE # use DPI FPU VL_FLAGS += -DFPU_FAST -RTL_INCLUDE += -I../../../hw/opae -I../../../hw/opae/ccip - PROJECT = libopae-c-vlsim.so all: $(PROJECT) diff --git a/driver/opae/vortex.cpp b/driver/opae/vortex.cpp index 318c6988..1a5c0ce6 100755 --- a/driver/opae/vortex.cpp +++ b/driver/opae/vortex.cpp @@ -244,27 +244,7 @@ extern int 
vx_dev_close(vx_device_h hdevice) { #endif #ifdef DUMP_PERF_STATS - // Dump perf stats - if (device->num_cores > 1) { - uint64_t total_instrs = 0, total_cycles = 0; - for (unsigned core_id = 0; core_id < device->num_cores; ++core_id) { - uint64_t instrs, cycles; - int ret = vx_get_perf(hdevice, core_id, &instrs, &cycles); - assert(ret == 0); - float IPC = (float)(double(instrs) / double(cycles)); - fprintf(stdout, "[VXDRV] PERF: core%d: instrs=%ld, cycles=%ld, IPC=%f\n", core_id, instrs, cycles, IPC); - total_instrs += instrs; - total_cycles = std::max(total_cycles, cycles); - } - float IPC = (float)(double(total_instrs) / double(total_cycles)); - fprintf(stdout, "[VXDRV] PERF: instrs=%ld, cycles=%ld, IPC=%f\n", total_instrs, total_cycles, IPC); - } else { - uint64_t instrs, cycles; - int ret = vx_get_perf(hdevice, 0, &instrs, &cycles); - float IPC = (float)(double(instrs) / double(cycles)); - assert(ret == 0); - fprintf(stdout, "[VXDRV] PERF: instrs=%ld, cycles=%ld, IPC=%f\n", instrs, cycles, IPC); - } + vx_dump_perf(device, stdout); #endif fpgaClose(device->fpga); diff --git a/driver/rtlsim/Makefile b/driver/rtlsim/Makefile index 4ed67b5e..50bd7a4b 100644 --- a/driver/rtlsim/Makefile +++ b/driver/rtlsim/Makefile @@ -64,6 +64,12 @@ else CFLAGS += -DNDEBUG endif +# Enable perf counters +ifdef PERF + VL_FLAGS += -DPERF_ENABLE + CFLAGS += -DPERF_ENABLE +endif + # use DPI FPU VL_FLAGS += -DFPU_FAST diff --git a/driver/rtlsim/vortex.cpp b/driver/rtlsim/vortex.cpp index 2420fd03..e6667e29 100644 --- a/driver/rtlsim/vortex.cpp +++ b/driver/rtlsim/vortex.cpp @@ -239,26 +239,7 @@ extern int vx_dev_close(vx_device_h hdevice) { vx_device *device = ((vx_device*)hdevice); #ifdef DUMP_PERF_STATS - unsigned num_cores; - vx_csr_get(hdevice, 0, CSR_NC, &num_cores); - if (num_cores > 1) { - uint64_t total_instrs = 0, total_cycles = 0; - for (unsigned core_id = 0; core_id < num_cores; ++core_id) { - uint64_t instrs, cycles; - vx_get_perf(hdevice, core_id, &instrs, &cycles); - float 
IPC = (float)(double(instrs) / double(cycles)); - fprintf(stdout, "PERF: core%d: instrs=%ld, cycles=%ld, IPC=%f\n", core_id, instrs, cycles, IPC); - total_instrs += instrs; - total_cycles = std::max(total_cycles, cycles); - } - float IPC = (float)(double(total_instrs) / double(total_cycles)); - fprintf(stdout, "PERF: instrs=%ld, cycles=%ld, IPC=%f\n", total_instrs, total_cycles, IPC); - } else { - uint64_t instrs, cycles; - vx_get_perf(hdevice, 0, &instrs, &cycles); - float IPC = (float)(double(instrs) / double(cycles)); - fprintf(stdout, "PERF: instrs=%ld, cycles=%ld, IPC=%f\n", instrs, cycles, IPC); - } + vx_dump_perf(device, stdout); #endif delete device; diff --git a/hw/opae/sources_1c.txt b/hw/opae/sources_1c.txt index 46b3521f..4671c87a 100644 --- a/hw/opae/sources_1c.txt +++ b/hw/opae/sources_1c.txt @@ -4,6 +4,7 @@ +define+QUARTUS +define+FPU_FAST #+define+SCOPE +#+define+PERF_ENABLE #+define+DBG_PRINT_CORE_ICACHE #+define+DBG_PRINT_CORE_DCACHE diff --git a/hw/rtl/VX_cluster.v b/hw/rtl/VX_cluster.v index 5df7e9dc..cf40c94c 100644 --- a/hw/rtl/VX_cluster.v +++ b/hw/rtl/VX_cluster.v @@ -297,6 +297,9 @@ module VX_cluster #( ); if (`L2_ENABLE) begin + `ifdef PERF_ENABLE + VX_perf_cache_if perf_l2cache_if(); + `endif wire [`NUM_CORES-1:0] per_core_dram_req_valid_qual; wire [`NUM_CORES-1:0] per_core_dram_req_rw_qual; @@ -345,10 +348,14 @@ module VX_cluster #( .SNP_TAG_WIDTH (`L2SNP_TAG_WIDTH) ) l2cache ( `SCOPE_BIND_VX_cluster_l2cache - + .clk (clk), .reset (reset), + `ifdef PERF_ENABLE + .perf_cache_if (perf_l2cache_if), + `endif + // Core request .core_req_valid (per_core_dram_req_valid_qual), .core_req_rw (per_core_dram_req_rw_qual), diff --git a/hw/rtl/VX_config.vh b/hw/rtl/VX_config.vh index d4932f85..984b0bc0 100644 --- a/hw/rtl/VX_config.vh +++ b/hw/rtl/VX_config.vh @@ -126,10 +126,12 @@ // CSR Addresses ////////////////////////////////////////////////////////////// +// User Floating-Point CSRs `define CSR_FFLAGS 12'h001 `define CSR_FRM 12'h002 `define 
CSR_FCSR 12'h003 +// SIMT CSRs `define CSR_LTID 12'h020 `define CSR_LWID 12'h021 `define CSR_GTID 12'h022 @@ -153,11 +155,73 @@ `define CSR_MEPC 12'h341 -`define CSR_CYCLE 12'hC00 -`define CSR_CYCLE_H 12'hC80 -`define CSR_INSTRET 12'hC02 -`define CSR_INSTRET_H 12'hC82 +// Machine Counter/Timers +`define CSR_MCYCLE 12'hB00 +`define CSR_MCYCLE_H 12'hB80 +`define CSR_MINSTRET 12'hB02 +`define CSR_MINSTRET_H 12'hB82 +// Machine Performance-monitoring counters +// PERF: pipeline +`define CSR_MPM_ICACHE_ST 12'hB03 +`define CSR_MPM_ICACHE_ST_H 12'hB83 +`define CSR_MPM_IBUF_ST 12'hB04 +`define CSR_MPM_IBUF_ST_H 12'hB84 +`define CSR_MPM_SCRB_ST 12'hB05 +`define CSR_MPM_SCRB_ST_H 12'hB85 +`define CSR_MPM_ALU_ST 12'hB06 +`define CSR_MPM_ALU_ST_H 12'hB86 +`define CSR_MPM_LSU_ST 12'hB07 +`define CSR_MPM_LSU_ST_H 12'hB87 +`define CSR_MPM_CSR_ST 12'hB08 +`define CSR_MPM_CSR_ST_H 12'hB88 +`define CSR_MPM_MUL_ST 12'hB09 +`define CSR_MPM_MUL_ST_H 12'hB89 +`define CSR_MPM_FPU_ST 12'hB0A +`define CSR_MPM_FPU_ST_H 12'hB8A +`define CSR_MPM_GPU_ST 12'hB0B +`define CSR_MPM_GPU_ST_H 12'hB8B +// PERF: icache +`define CSR_MPM_ICACHE_MISS_R 12'hB0C // read misses +`define CSR_MPM_ICACHE_MISS_R_H 12'hB8C +`define CSR_MPM_ICACHE_DREQ_ST 12'hB0D // dram request stalls +`define CSR_MPM_ICACHE_DREQ_ST_H 12'hB8D +`define CSR_MPM_ICACHE_CRSP_ST 12'hB0E // core response stalls +`define CSR_MPM_ICACHE_CRSP_ST_H 12'hB8E +`define CSR_MPM_ICACHE_MSHR_ST 12'hB0F // MSHR stalls +`define CSR_MPM_ICACHE_MSHR_ST_H 12'hB8F +`define CSR_MPM_ICACHE_PIPE_ST 12'hB10 // pipeline stalls +`define CSR_MPM_ICACHE_PIPE_ST_H 12'hB90 +`define CSR_MPM_ICACHE_READS 12'hB11 // total reads +`define CSR_MPM_ICACHE_READS_H 12'hB91 +// PERF: dcache +`define CSR_MPM_DCACHE_MISS_R 12'hB12 // read misses +`define CSR_MPM_DCACHE_MISS_R_H 12'hB92 +`define CSR_MPM_DCACHE_MISS_W 12'hB13 // write misses +`define CSR_MPM_DCACHE_MISS_W_H 12'hB93 +`define CSR_MPM_DCACHE_DREQ_ST 12'hB14 // dram request stalls +`define 
CSR_MPM_DCACHE_DREQ_ST_H 12'hB94 +`define CSR_MPM_DCACHE_CRSP_ST 12'hB15 // core response stalls +`define CSR_MPM_DCACHE_CRSP_ST_H 12'hB95 +`define CSR_MPM_DCACHE_MSHR_ST 12'hB16 // MSHR stalls +`define CSR_MPM_DCACHE_MSHR_ST_H 12'hB96 +`define CSR_MPM_DCACHE_PIPE_ST 12'hB17 // pipeline stalls +`define CSR_MPM_DCACHE_PIPE_ST_H 12'hB97 +`define CSR_MPM_DCACHE_READS 12'hB18 // total reads +`define CSR_MPM_DCACHE_READS_H 12'hB98 +`define CSR_MPM_DCACHE_WRITES 12'hB19 // total writes +`define CSR_MPM_DCACHE_WRITES_H 12'hB99 +`define CSR_MPM_DCACHE_EVICTS 12'hB1A // total evictions +`define CSR_MPM_DCACHE_EVICTS_H 12'hB9A +// PERF: memory +`define CSR_MPM_DRAM_LAT 12'hB1B // dram latency (total) +`define CSR_MPM_DRAM_LAT_H 12'hB9B +`define CSR_MPM_DRAM_REQ 12'hB1C // dram requests +`define CSR_MPM_DRAM_REQ_H 12'hB9C +`define CSR_MPM_DRAM_RSP 12'hB1D // dram responses +`define CSR_MPM_DRAM_RSP_H 12'hB9D + +// Machine Information Registers `define CSR_MVENDORID 12'hF11 `define CSR_MARCHID 12'hF12 `define CSR_MIMPID 12'hF13 @@ -185,6 +249,38 @@ `define FPUQ_SIZE 4 `endif +// Icache Configurable Knobs ////////////////////////////////////////////////// + +// Size of cache in bytes +`ifndef ICACHE_SIZE +`define ICACHE_SIZE 4096 +`endif + +// Core Request Queue Size +`ifndef ICREQ_SIZE +`define ICREQ_SIZE 4 +`endif + +// Core Response Queue Size +`ifndef ICRSQ_SIZE +`define ICRSQ_SIZE 4 +`endif + +// Miss Handling Register Size +`ifndef IMSHR_SIZE +`define IMSHR_SIZE `NUM_WARPS +`endif + +// DRAM Request Queue Size +`ifndef IDREQ_SIZE +`define IDREQ_SIZE 4 +`endif + +// DRAM Response Queue Size +`ifndef IDRSQ_SIZE +`define IDRSQ_SIZE 4 +`endif + // Dcache Configurable Knobs ////////////////////////////////////////////////// // Size of cache in bytes @@ -232,38 +328,6 @@ `define DSRSQ_SIZE 4 `endif -// Icache Configurable Knobs ////////////////////////////////////////////////// - -// Size of cache in bytes -`ifndef ICACHE_SIZE -`define ICACHE_SIZE 4096 -`endif - -// Core 
Request Queue Size -`ifndef ICREQ_SIZE -`define ICREQ_SIZE 4 -`endif - -// Core Response Queue Size -`ifndef ICRSQ_SIZE -`define ICRSQ_SIZE 4 -`endif - -// Miss Handling Register Size -`ifndef IMSHR_SIZE -`define IMSHR_SIZE `NUM_WARPS -`endif - -// DRAM Request Queue Size -`ifndef IDREQ_SIZE -`define IDREQ_SIZE 4 -`endif - -// DRAM Response Queue Size -`ifndef IDRSQ_SIZE -`define IDRSQ_SIZE 4 -`endif - // SM Configurable Knobs ////////////////////////////////////////////////////// // Size of cache in bytes diff --git a/hw/rtl/VX_core.v b/hw/rtl/VX_core.v index 08f05413..79d5ce9d 100644 --- a/hw/rtl/VX_core.v +++ b/hw/rtl/VX_core.v @@ -66,6 +66,10 @@ module VX_core #( output wire busy, output wire ebreak ); +`ifdef PERF_ENABLE + VX_perf_memsys_if perf_memsys_if(); +`endif + VX_cache_dram_req_if #( .DRAM_LINE_WIDTH(`DDRAM_LINE_WIDTH), .DRAM_ADDR_WIDTH(`DDRAM_ADDR_WIDTH), @@ -174,6 +178,9 @@ module VX_core #( .CORE_ID(CORE_ID) ) pipeline ( `SCOPE_BIND_VX_core_pipeline + `ifdef PERF_ENABLE + .perf_memsys_if (perf_memsys_if), + `endif .clk(clk), .reset(reset), @@ -231,6 +238,9 @@ module VX_core #( .CORE_ID(CORE_ID) ) mem_unit ( `SCOPE_BIND_VX_core_mem_unit + `ifdef PERF_ENABLE + .perf_memsys_if (perf_memsys_if), + `endif .clk (clk), .reset (reset), @@ -238,7 +248,7 @@ module VX_core #( // Core <-> Dcache .core_dcache_req_if (core_dcache_req_if), .core_dcache_rsp_if (core_dcache_rsp_if), - + // Core <-> Icache .core_icache_req_if (core_icache_req_if), .core_icache_rsp_if (core_icache_rsp_if), diff --git a/hw/rtl/VX_csr_data.v b/hw/rtl/VX_csr_data.v index b150f281..a821e33e 100644 --- a/hw/rtl/VX_csr_data.v +++ b/hw/rtl/VX_csr_data.v @@ -6,6 +6,11 @@ module VX_csr_data #( input wire clk, input wire reset, +`ifdef PERF_ENABLE + VX_perf_memsys_if perf_memsys_if, + VX_perf_pipeline_if perf_pipeline_if, +`endif + VX_cmt_to_csr_if cmt_to_csr_if, VX_fpu_to_csr_if fpu_to_csr_if, @@ -114,6 +119,67 @@ module VX_csr_data #( `CSR_NW : read_data_r = `NUM_WARPS; `CSR_NC : read_data_r 
= `NUM_CORES * `NUM_CLUSTERS; + `ifdef PERF_ENABLE + // PERF: pipeline + `CSR_MPM_ICACHE_ST : read_data_r = perf_pipeline_if.icache_stalls[31:0]; + `CSR_MPM_ICACHE_ST_H : read_data_r = perf_pipeline_if.icache_stalls[63:32]; + `CSR_MPM_IBUF_ST : read_data_r = perf_pipeline_if.ibuffer_stalls[31:0]; + `CSR_MPM_IBUF_ST_H : read_data_r = perf_pipeline_if.ibuffer_stalls[63:32]; + `CSR_MPM_SCRB_ST : read_data_r = perf_pipeline_if.scoreboard_stalls[31:0]; + `CSR_MPM_SCRB_ST_H : read_data_r = perf_pipeline_if.scoreboard_stalls[63:32]; + `CSR_MPM_ALU_ST : read_data_r = perf_pipeline_if.alu_stalls[31:0]; + `CSR_MPM_ALU_ST_H : read_data_r = perf_pipeline_if.alu_stalls[63:32]; + `CSR_MPM_LSU_ST : read_data_r = perf_pipeline_if.lsu_stalls[31:0]; + `CSR_MPM_LSU_ST_H : read_data_r = perf_pipeline_if.lsu_stalls[63:32]; + `CSR_MPM_CSR_ST : read_data_r = perf_pipeline_if.csr_stalls[31:0]; + `CSR_MPM_CSR_ST_H : read_data_r = perf_pipeline_if.csr_stalls[63:32]; + `CSR_MPM_MUL_ST : read_data_r = perf_pipeline_if.mul_stalls[31:0]; + `CSR_MPM_MUL_ST_H : read_data_r = perf_pipeline_if.mul_stalls[63:32]; + `CSR_MPM_FPU_ST : read_data_r = perf_pipeline_if.fpu_stalls[31:0]; + `CSR_MPM_FPU_ST_H : read_data_r = perf_pipeline_if.fpu_stalls[63:32]; + `CSR_MPM_GPU_ST : read_data_r = perf_pipeline_if.gpu_stalls[31:0]; + `CSR_MPM_GPU_ST_H : read_data_r = perf_pipeline_if.gpu_stalls[63:32]; + // PERF: icache + `CSR_MPM_ICACHE_MISS_R : read_data_r = perf_memsys_if.icache_if.read_misses[31:0]; + `CSR_MPM_ICACHE_MISS_R_H : read_data_r = perf_memsys_if.icache_if.read_misses[63:32]; + `CSR_MPM_ICACHE_DREQ_ST : read_data_r = perf_memsys_if.icache_if.dreq_stalls[31:0]; + `CSR_MPM_ICACHE_DREQ_ST_H : read_data_r = perf_memsys_if.icache_if.dreq_stalls[63:32]; + `CSR_MPM_ICACHE_CRSP_ST : read_data_r = perf_memsys_if.icache_if.crsp_stalls[31:0]; + `CSR_MPM_ICACHE_CRSP_ST_H : read_data_r = perf_memsys_if.icache_if.crsp_stalls[63:32]; + `CSR_MPM_ICACHE_MSHR_ST : read_data_r = 
perf_memsys_if.icache_if.mshr_stalls[31:0]; + `CSR_MPM_ICACHE_MSHR_ST_H : read_data_r = perf_memsys_if.icache_if.mshr_stalls[63:32]; + `CSR_MPM_ICACHE_PIPE_ST : read_data_r = perf_memsys_if.icache_if.pipe_stalls[31:0]; + `CSR_MPM_ICACHE_PIPE_ST_H : read_data_r = perf_memsys_if.icache_if.pipe_stalls[63:32]; + `CSR_MPM_ICACHE_READS : read_data_r = perf_memsys_if.icache_if.reads[31:0]; + `CSR_MPM_ICACHE_READS_H : read_data_r = perf_memsys_if.icache_if.reads[63:32]; + // PERF: dcache + `CSR_MPM_DCACHE_MISS_R : read_data_r = perf_memsys_if.dcache_if.read_misses[31:0]; + `CSR_MPM_DCACHE_MISS_R_H : read_data_r = perf_memsys_if.dcache_if.read_misses[63:32]; + `CSR_MPM_DCACHE_MISS_W : read_data_r = perf_memsys_if.dcache_if.write_misses[31:0]; + `CSR_MPM_DCACHE_MISS_W_H : read_data_r = perf_memsys_if.dcache_if.write_misses[63:32]; + `CSR_MPM_DCACHE_DREQ_ST : read_data_r = perf_memsys_if.dcache_if.dreq_stalls[31:0]; + `CSR_MPM_DCACHE_DREQ_ST_H : read_data_r = perf_memsys_if.dcache_if.dreq_stalls[63:32]; + `CSR_MPM_DCACHE_CRSP_ST : read_data_r = perf_memsys_if.dcache_if.crsp_stalls[31:0]; + `CSR_MPM_DCACHE_CRSP_ST_H : read_data_r = perf_memsys_if.dcache_if.crsp_stalls[63:32]; + `CSR_MPM_DCACHE_MSHR_ST : read_data_r = perf_memsys_if.dcache_if.mshr_stalls[31:0]; + `CSR_MPM_DCACHE_MSHR_ST_H : read_data_r = perf_memsys_if.dcache_if.mshr_stalls[63:32]; + `CSR_MPM_DCACHE_PIPE_ST : read_data_r = perf_memsys_if.dcache_if.pipe_stalls[31:0]; + `CSR_MPM_DCACHE_PIPE_ST_H : read_data_r = perf_memsys_if.dcache_if.pipe_stalls[63:32]; + `CSR_MPM_DCACHE_READS : read_data_r = perf_memsys_if.dcache_if.reads[31:0]; + `CSR_MPM_DCACHE_READS_H : read_data_r = perf_memsys_if.dcache_if.reads[63:32]; + `CSR_MPM_DCACHE_WRITES : read_data_r = perf_memsys_if.dcache_if.writes[31:0]; + `CSR_MPM_DCACHE_WRITES_H : read_data_r = perf_memsys_if.dcache_if.writes[63:32]; + `CSR_MPM_DCACHE_EVICTS : read_data_r = perf_memsys_if.dcache_if.evictions[31:0]; + `CSR_MPM_DCACHE_EVICTS_H : read_data_r = 
perf_memsys_if.dcache_if.evictions[63:32]; + // PERF: memory + `CSR_MPM_DRAM_LAT : read_data_r = perf_memsys_if.dram_latency[31:0]; + `CSR_MPM_DRAM_LAT_H : read_data_r = perf_memsys_if.dram_latency[63:32]; + `CSR_MPM_DRAM_REQ : read_data_r = perf_memsys_if.dram_requests[31:0]; + `CSR_MPM_DRAM_REQ_H : read_data_r = perf_memsys_if.dram_requests[63:32]; + `CSR_MPM_DRAM_RSP : read_data_r = perf_memsys_if.dram_responses[31:0]; + `CSR_MPM_DRAM_RSP_H : read_data_r = perf_memsys_if.dram_responses[63:32]; + `endif + `CSR_SATP : read_data_r = 32'(csr_satp); `CSR_MSTATUS : read_data_r = 32'(csr_mstatus); @@ -128,10 +194,10 @@ module VX_csr_data #( `CSR_PMPCFG0 : read_data_r = 32'(csr_pmpcfg[0]); `CSR_PMPADDR0 : read_data_r = 32'(csr_pmpaddr[0]); - `CSR_CYCLE : read_data_r = csr_cycle[31:0]; - `CSR_CYCLE_H : read_data_r = csr_cycle[63:32]; - `CSR_INSTRET : read_data_r = csr_instret[31:0]; - `CSR_INSTRET_H : read_data_r = csr_instret[63:32]; + `CSR_MCYCLE : read_data_r = csr_cycle[31:0]; + `CSR_MCYCLE_H : read_data_r = csr_cycle[63:32]; + `CSR_MINSTRET : read_data_r = csr_instret[31:0]; + `CSR_MINSTRET_H: read_data_r = csr_instret[63:32]; `CSR_MVENDORID : read_data_r = `VENDOR_ID; `CSR_MARCHID : read_data_r = `ARCHITECTURE_ID; diff --git a/hw/rtl/VX_csr_unit.v b/hw/rtl/VX_csr_unit.v index b2fcc54c..111a45d9 100644 --- a/hw/rtl/VX_csr_unit.v +++ b/hw/rtl/VX_csr_unit.v @@ -6,6 +6,11 @@ module VX_csr_unit #( input wire clk, input wire reset, +`ifdef PERF_ENABLE + VX_perf_memsys_if perf_memsys_if, + VX_perf_pipeline_if perf_pipeline_if, +`endif + VX_cmt_to_csr_if cmt_to_csr_if, VX_fpu_to_csr_if fpu_to_csr_if, @@ -51,6 +56,10 @@ module VX_csr_unit #( ) csr_data ( .clk (clk), .reset (reset), + `ifdef PERF_ENABLE + .perf_memsys_if (perf_memsys_if), + .perf_pipeline_if (perf_pipeline_if), + `endif .cmt_to_csr_if (cmt_to_csr_if), .fpu_to_csr_if (fpu_to_csr_if), .read_enable (csr_pipe_req_if.valid), diff --git a/hw/rtl/VX_define.vh b/hw/rtl/VX_define.vh index 2bdf55ec..d00c8e2c 100644 
--- a/hw/rtl/VX_define.vh +++ b/hw/rtl/VX_define.vh @@ -240,45 +240,10 @@ `define DBG_CACHE_REQ_MDATAW 0 `endif -////////////////////////// Dcache Configurable Knobs ////////////////////////// - -// Cache ID -`define DCACHE_ID (32'(`L3_ENABLE) + 32'(`L2_ENABLE) * `NUM_CLUSTERS + CORE_ID * 3 + 0) - -// Block size in bytes -`define DBANK_LINE_SIZE (`L2_ENABLE ? `L1_BLOCK_SIZE : `GLOBAL_BLOCK_SIZE) - -// Word size in bytes -`define DWORD_SIZE 4 - -// TAG sharing enable -`define DCORE_TAG_ID_BITS `LOG2UP(`LSUQ_SIZE) - -// Core request tag bits -`define DCORE_TAG_WIDTH (`DBG_CACHE_REQ_MDATAW + `DCORE_TAG_ID_BITS) - -// DRAM request data bits -`define DDRAM_LINE_WIDTH (`DBANK_LINE_SIZE * 8) - -// DRAM request address bits -`define DDRAM_ADDR_WIDTH (32 - `CLOG2(`DBANK_LINE_SIZE)) - -// DRAM byte enable bits -`define DDRAM_BYTEEN_WIDTH `DBANK_LINE_SIZE - -// DRAM request tag bits -`define DDRAM_TAG_WIDTH `DDRAM_ADDR_WIDTH - -// Core request size -`define DNUM_REQUESTS `NUM_THREADS - -// Snoop request tag bits -`define DSNP_TAG_WIDTH ((`NUM_CORES > 1) ? `LOG2UP(`L2SREQ_SIZE) : `L2SNP_TAG_WIDTH) - ////////////////////////// Icache Configurable Knobs ////////////////////////// // Cache ID -`define ICACHE_ID (32'(`L3_ENABLE) + 32'(`L2_ENABLE) * `NUM_CLUSTERS + CORE_ID * 3 + 1) +`define ICACHE_ID (32'(`L3_ENABLE) + 32'(`L2_ENABLE) * `NUM_CLUSTERS + CORE_ID * 3 + 0) // Block size in bytes `define IBANK_LINE_SIZE (`L2_ENABLE ? `L1_BLOCK_SIZE : `GLOBAL_BLOCK_SIZE) @@ -316,6 +281,41 @@ // Core request size `define INUM_REQUESTS 1 +////////////////////////// Dcache Configurable Knobs ////////////////////////// + +// Cache ID +`define DCACHE_ID (32'(`L3_ENABLE) + 32'(`L2_ENABLE) * `NUM_CLUSTERS + CORE_ID * 3 + 1) + +// Block size in bytes +`define DBANK_LINE_SIZE (`L2_ENABLE ? 
`L1_BLOCK_SIZE : `GLOBAL_BLOCK_SIZE) + +// Word size in bytes +`define DWORD_SIZE 4 + +// TAG sharing enable +`define DCORE_TAG_ID_BITS `LOG2UP(`LSUQ_SIZE) + +// Core request tag bits +`define DCORE_TAG_WIDTH (`DBG_CACHE_REQ_MDATAW + `DCORE_TAG_ID_BITS) + +// DRAM request data bits +`define DDRAM_LINE_WIDTH (`DBANK_LINE_SIZE * 8) + +// DRAM request address bits +`define DDRAM_ADDR_WIDTH (32 - `CLOG2(`DBANK_LINE_SIZE)) + +// DRAM byte enable bits +`define DDRAM_BYTEEN_WIDTH `DBANK_LINE_SIZE + +// DRAM request tag bits +`define DDRAM_TAG_WIDTH `DDRAM_ADDR_WIDTH + +// Core request size +`define DNUM_REQUESTS `NUM_THREADS + +// Snoop request tag bits +`define DSNP_TAG_WIDTH ((`NUM_CORES > 1) ? `LOG2UP(`L2SREQ_SIZE) : `L2SNP_TAG_WIDTH) + ////////////////////////// SM Configurable Knobs ////////////////////////////// // Cache ID diff --git a/hw/rtl/VX_execute.v b/hw/rtl/VX_execute.v index 2d9f98f2..f5be2d57 100644 --- a/hw/rtl/VX_execute.v +++ b/hw/rtl/VX_execute.v @@ -16,8 +16,13 @@ module VX_execute #( VX_cache_core_req_if dcache_req_if, VX_cache_core_rsp_if dcache_rsp_if, - // perf + // commit status VX_cmt_to_csr_if cmt_to_csr_if, + +`ifdef PERF_ENABLE + VX_perf_memsys_if perf_memsys_if, + VX_perf_pipeline_if perf_pipeline_if, + `endif // inputs VX_alu_req_if alu_req_if, @@ -72,7 +77,11 @@ module VX_execute #( .CORE_ID(CORE_ID) ) csr_unit ( .clk (clk), - .reset (reset), + .reset (reset), + `ifdef PERF_ENABLE + .perf_memsys_if (perf_memsys_if), + .perf_pipeline_if (perf_pipeline_if), + `endif .cmt_to_csr_if (cmt_to_csr_if), .fpu_to_csr_if (fpu_to_csr_if), .csr_io_req_if (csr_io_req_if), diff --git a/hw/rtl/VX_issue.v b/hw/rtl/VX_issue.v index eab416d8..8cb02fe8 100644 --- a/hw/rtl/VX_issue.v +++ b/hw/rtl/VX_issue.v @@ -8,6 +8,10 @@ module VX_issue #( input wire clk, input wire reset, +`ifdef PERF_ENABLE + VX_perf_pipeline_if perf_pipeline_if, +`endif + VX_decode_if decode_if, VX_writeback_if writeback_if, @@ -120,6 +124,21 @@ module VX_issue #( `SCOPE_ASSIGN 
(writeback_rd, writeback_if.rd); `SCOPE_ASSIGN (writeback_data, writeback_if.data); +`ifdef PERF_ENABLE + reg [63:0] perf_scoreboard_stalls; + always @(posedge clk) begin + if (reset) begin + perf_scoreboard_stalls <= 0; + end else begin + // scoreboard_stall + if (ibuf_deq_if.valid & scoreboard_delay) begin + perf_scoreboard_stalls <= perf_scoreboard_stalls + 64'd1; + end + end + end + assign perf_pipeline_if.scoreboard_stalls = perf_scoreboard_stalls; +`endif + `ifdef DBG_PRINT_PIPELINE always @(posedge clk) begin if (alu_req_if.valid && alu_req_if.ready) begin diff --git a/hw/rtl/VX_mem_unit.v b/hw/rtl/VX_mem_unit.v index eb4e3dc3..17a9f794 100644 --- a/hw/rtl/VX_mem_unit.v +++ b/hw/rtl/VX_mem_unit.v @@ -7,6 +7,10 @@ module VX_mem_unit # ( input wire clk, input wire reset, + +`ifdef PERF_ENABLE + VX_perf_memsys_if perf_memsys_if, +`endif // Core <-> Dcache VX_cache_core_req_if core_dcache_req_if, @@ -28,6 +32,11 @@ module VX_mem_unit # ( VX_cache_core_req_if io_req_if, VX_cache_core_rsp_if io_rsp_if ); + +`ifdef PERF_ENABLE + VX_perf_cache_if perf_icache_if(), perf_dcache_if(), perf_smem_if(); +`endif + VX_cache_dram_req_if #( .DRAM_LINE_WIDTH (`DDRAM_LINE_WIDTH), .DRAM_ADDR_WIDTH (`DDRAM_ADDR_WIDTH), @@ -80,6 +89,82 @@ module VX_mem_unit # ( .smem_rsp_if (smem_rsp_if), .io_rsp_if (io_rsp_if), .core_rsp_if (core_dcache_rsp_if) + ); + + VX_cache #( + .CACHE_ID (`ICACHE_ID), + .CACHE_SIZE (`ICACHE_SIZE), + .BANK_LINE_SIZE (`IBANK_LINE_SIZE), + .NUM_BANKS (`INUM_BANKS), + .WORD_SIZE (`IWORD_SIZE), + .NUM_REQS (`INUM_REQUESTS), + .CREQ_SIZE (`ICREQ_SIZE), + .MSHR_SIZE (`IMSHR_SIZE), + .DRSQ_SIZE (`IDRSQ_SIZE), + .SREQ_SIZE (1), + .CRSQ_SIZE (`ICRSQ_SIZE), + .DREQ_SIZE (`IDREQ_SIZE), + .SRSQ_SIZE (1), + .DRAM_ENABLE (1), + .FLUSH_ENABLE (0), + .WRITE_ENABLE (0), + .CORE_TAG_WIDTH (`ICORE_TAG_WIDTH), + .CORE_TAG_ID_BITS (`ICORE_TAG_ID_BITS), + .DRAM_TAG_WIDTH (`IDRAM_TAG_WIDTH) + ) icache ( + `SCOPE_BIND_VX_mem_unit_icache + + .clk (clk), + .reset (reset), + + // Core 
request + .core_req_valid (core_icache_req_if.valid), + .core_req_rw (core_icache_req_if.rw), + .core_req_byteen (core_icache_req_if.byteen), + .core_req_addr (core_icache_req_if.addr), + .core_req_data (core_icache_req_if.data), + .core_req_tag (core_icache_req_if.tag), + .core_req_ready (core_icache_req_if.ready), + + // Core response + .core_rsp_valid (core_icache_rsp_if.valid), + .core_rsp_data (core_icache_rsp_if.data), + .core_rsp_tag (core_icache_rsp_if.tag), + .core_rsp_ready (core_icache_rsp_if.ready), + + `ifdef PERF_ENABLE + .perf_cache_if (perf_icache_if), + `endif + + // DRAM Req + .dram_req_valid (icache_dram_req_if.valid), + .dram_req_rw (icache_dram_req_if.rw), + .dram_req_byteen (icache_dram_req_if.byteen), + .dram_req_addr (icache_dram_req_if.addr), + .dram_req_data (icache_dram_req_if.data), + .dram_req_tag (icache_dram_req_if.tag), + .dram_req_ready (icache_dram_req_if.ready), + + // DRAM response + .dram_rsp_valid (icache_dram_rsp_if.valid), + .dram_rsp_data (icache_dram_rsp_if.data), + .dram_rsp_tag (icache_dram_rsp_if.tag), + .dram_rsp_ready (icache_dram_rsp_if.ready), + + // Snoop request + .snp_req_valid (1'b0), + .snp_req_addr (0), + .snp_req_inv (1'b0), + .snp_req_tag (0), + `UNUSED_PIN (snp_req_ready), + + // Snoop response + `UNUSED_PIN (snp_rsp_valid), + `UNUSED_PIN (snp_rsp_tag), + .snp_rsp_ready (1'b0), + + // Miss status + `UNUSED_PIN (miss_vec) ); VX_cache #( @@ -124,6 +209,10 @@ module VX_mem_unit # ( .core_rsp_tag (dcache_rsp_if.tag), .core_rsp_ready (dcache_rsp_if.ready), + `ifdef PERF_ENABLE + .perf_cache_if (perf_dcache_if), + `endif + // DRAM request .dram_req_valid (dcache_dram_req_if.valid), .dram_req_rw (dcache_dram_req_if.rw), @@ -151,78 +240,6 @@ module VX_mem_unit # ( .snp_rsp_tag (dcache_snp_rsp_if.tag), .snp_rsp_ready (dcache_snp_rsp_if.ready), - // Miss status - `UNUSED_PIN (miss_vec) - ); - - VX_cache #( - .CACHE_ID (`ICACHE_ID), - .CACHE_SIZE (`ICACHE_SIZE), - .BANK_LINE_SIZE (`IBANK_LINE_SIZE), - .NUM_BANKS 
(`INUM_BANKS), - .WORD_SIZE (`IWORD_SIZE), - .NUM_REQS (`INUM_REQUESTS), - .CREQ_SIZE (`ICREQ_SIZE), - .MSHR_SIZE (`IMSHR_SIZE), - .DRSQ_SIZE (`IDRSQ_SIZE), - .SREQ_SIZE (1), - .CRSQ_SIZE (`ICRSQ_SIZE), - .DREQ_SIZE (`IDREQ_SIZE), - .SRSQ_SIZE (1), - .DRAM_ENABLE (1), - .FLUSH_ENABLE (0), - .WRITE_ENABLE (0), - .CORE_TAG_WIDTH (`ICORE_TAG_WIDTH), - .CORE_TAG_ID_BITS (`ICORE_TAG_ID_BITS), - .DRAM_TAG_WIDTH (`IDRAM_TAG_WIDTH) - ) icache ( - `SCOPE_BIND_VX_mem_unit_icache - - .clk (clk), - .reset (reset), - - // Core request - .core_req_valid (core_icache_req_if.valid), - .core_req_rw (core_icache_req_if.rw), - .core_req_byteen (core_icache_req_if.byteen), - .core_req_addr (core_icache_req_if.addr), - .core_req_data (core_icache_req_if.data), - .core_req_tag (core_icache_req_if.tag), - .core_req_ready (core_icache_req_if.ready), - - // Core response - .core_rsp_valid (core_icache_rsp_if.valid), - .core_rsp_data (core_icache_rsp_if.data), - .core_rsp_tag (core_icache_rsp_if.tag), - .core_rsp_ready (core_icache_rsp_if.ready), - - // DRAM Req - .dram_req_valid (icache_dram_req_if.valid), - .dram_req_rw (icache_dram_req_if.rw), - .dram_req_byteen (icache_dram_req_if.byteen), - .dram_req_addr (icache_dram_req_if.addr), - .dram_req_data (icache_dram_req_if.data), - .dram_req_tag (icache_dram_req_if.tag), - .dram_req_ready (icache_dram_req_if.ready), - - // DRAM response - .dram_rsp_valid (icache_dram_rsp_if.valid), - .dram_rsp_data (icache_dram_rsp_if.data), - .dram_rsp_tag (icache_dram_rsp_if.tag), - .dram_rsp_ready (icache_dram_rsp_if.ready), - - // Snoop request - .snp_req_valid (1'b0), - .snp_req_addr (0), - .snp_req_inv (1'b0), - .snp_req_tag (0), - `UNUSED_PIN (snp_req_ready), - - // Snoop response - `UNUSED_PIN (snp_rsp_valid), - `UNUSED_PIN (snp_rsp_tag), - .snp_rsp_ready (1'b0), - // Miss status `UNUSED_PIN (miss_vec) ); @@ -268,6 +285,10 @@ module VX_mem_unit # ( .core_rsp_tag (smem_rsp_if.tag), .core_rsp_ready (smem_rsp_if.ready), + `ifdef PERF_ENABLE + 
.perf_cache_if (perf_smem_if), + `endif + // DRAM request `UNUSED_PIN (dram_req_valid), `UNUSED_PIN (dram_req_rw), @@ -340,4 +361,65 @@ module VX_mem_unit # ( .rsp_ready_in (dram_rsp_if.ready) ); +`ifdef PERF_ENABLE + + assign perf_memsys_if.icache_if.read_misses = perf_icache_if.read_misses; + assign perf_memsys_if.icache_if.write_misses = perf_icache_if.write_misses; + assign perf_memsys_if.icache_if.mshr_stalls = perf_icache_if.mshr_stalls; + assign perf_memsys_if.icache_if.crsp_stalls = perf_icache_if.crsp_stalls; + assign perf_memsys_if.icache_if.dreq_stalls = perf_icache_if.dreq_stalls; + assign perf_memsys_if.icache_if.pipe_stalls = perf_icache_if.pipe_stalls; + assign perf_memsys_if.icache_if.reads = perf_icache_if.reads; + assign perf_memsys_if.icache_if.writes = perf_icache_if.writes; + assign perf_memsys_if.icache_if.evictions = perf_icache_if.evictions; + + assign perf_memsys_if.dcache_if.read_misses = perf_dcache_if.read_misses; + assign perf_memsys_if.dcache_if.write_misses = perf_dcache_if.write_misses; + assign perf_memsys_if.dcache_if.mshr_stalls = perf_dcache_if.mshr_stalls; + assign perf_memsys_if.dcache_if.crsp_stalls = perf_dcache_if.crsp_stalls; + assign perf_memsys_if.dcache_if.dreq_stalls = perf_dcache_if.dreq_stalls; + assign perf_memsys_if.dcache_if.pipe_stalls = perf_dcache_if.pipe_stalls; + assign perf_memsys_if.dcache_if.reads = perf_dcache_if.reads; + assign perf_memsys_if.dcache_if.writes = perf_dcache_if.writes; + assign perf_memsys_if.dcache_if.evictions = perf_dcache_if.evictions; + + reg [63:0] perf_dram_lat_per_cycle; + + always @(posedge clk) begin + if (reset) begin + perf_dram_lat_per_cycle <= 0; + end else begin + if (dram_req_if.valid & (~dram_req_if.rw) & dram_req_if.ready & dram_rsp_if.valid & dram_rsp_if.ready) begin + perf_dram_lat_per_cycle <= perf_dram_lat_per_cycle; + end else if (dram_req_if.valid & (~dram_req_if.rw) & dram_req_if.ready) begin + perf_dram_lat_per_cycle <= perf_dram_lat_per_cycle + 64'd1; + end else 
if (dram_rsp_if.valid & dram_rsp_if.ready) begin + perf_dram_lat_per_cycle <= perf_dram_lat_per_cycle - 64'd1; + end + end + end + + reg [63:0] perf_dram_req, perf_dram_rsp, perf_dram_lat; + + always @(posedge clk) begin + if (reset) begin + perf_dram_req <= 0; + perf_dram_rsp <= 0; + perf_dram_lat <= 0; + end else begin + if (dram_req_if.valid & dram_req_if.ready) begin + perf_dram_req <= perf_dram_req + 64'd1; + end + if (dram_rsp_if.valid & dram_rsp_if.ready) begin + perf_dram_rsp <= perf_dram_rsp + 64'd1; + end + perf_dram_lat <= perf_dram_lat + perf_dram_lat_per_cycle; + end + end + + assign perf_memsys_if.dram_requests = perf_dram_req; + assign perf_memsys_if.dram_responses = perf_dram_rsp; + assign perf_memsys_if.dram_latency = perf_dram_lat; +`endif + endmodule diff --git a/hw/rtl/VX_pipeline.v b/hw/rtl/VX_pipeline.v index e0f0a294..b7072e7c 100644 --- a/hw/rtl/VX_pipeline.v +++ b/hw/rtl/VX_pipeline.v @@ -51,6 +51,10 @@ module VX_pipeline #( output wire[31:0] csr_io_rsp_data, input wire csr_io_rsp_ready, +`ifdef PERF_ENABLE + VX_perf_memsys_if perf_memsys_if, +`endif + // Status output wire busy, output wire ebreak @@ -171,6 +175,10 @@ module VX_pipeline #( VX_commit_if fpu_commit_if(); VX_commit_if gpu_commit_if(); +`ifdef PERF_ENABLE + VX_perf_pipeline_if perf_pipeline_if(); +`endif + VX_fetch #( .CORE_ID(CORE_ID) ) fetch ( @@ -206,6 +214,10 @@ module VX_pipeline #( .clk (clk), .reset (reset), + `ifdef PERF_ENABLE + .perf_pipeline_if (perf_pipeline_if), + `endif + .decode_if (decode_if), .writeback_if (writeback_if), @@ -224,7 +236,12 @@ module VX_pipeline #( .clk (clk), .reset (reset), - + + `ifdef PERF_ENABLE + .perf_memsys_if (perf_memsys_if), + .perf_pipeline_if (perf_pipeline_if), + `endif + .dcache_req_if (core_dcache_req_if), .dcache_rsp_if (core_dcache_rsp_if), @@ -272,4 +289,78 @@ module VX_pipeline #( .cmt_to_csr_if (cmt_to_csr_if) ); +`ifdef PERF_ENABLE + reg [63:0] perf_icache_stalls; + reg [63:0] perf_ibuffer_stalls; + reg [63:0] 
perf_alu_stalls; + reg [63:0] perf_lsu_stalls; + reg [63:0] perf_csr_stalls; + reg [63:0] perf_gpu_stalls; +`ifdef EXT_M_ENABLE + reg [63:0] perf_mul_stalls; +`endif +`ifdef EXT_F_ENABLE + reg [63:0] perf_fpu_stalls; +`endif + + always @(posedge clk) begin + if (reset) begin + perf_icache_stalls <= 0; + perf_ibuffer_stalls <= 0; + perf_alu_stalls <= 0; + perf_lsu_stalls <= 0; + perf_csr_stalls <= 0; + perf_gpu_stalls <= 0; + `ifdef EXT_M_ENABLE + perf_mul_stalls <= 0; + `endif + `ifdef EXT_F_ENABLE + perf_fpu_stalls <= 0; + `endif + end else begin + if (core_icache_req_if.valid & !core_icache_req_if.ready) begin + perf_icache_stalls <= perf_icache_stalls + 64'd1; + end + if (decode_if.valid & !decode_if.ready) begin + perf_ibuffer_stalls <= perf_ibuffer_stalls + 64'd1; + end + if (alu_req_if.valid & !alu_req_if.ready) begin + perf_alu_stalls <= perf_alu_stalls + 64'd1; + end + if (lsu_req_if.valid & !lsu_req_if.ready) begin + perf_lsu_stalls <= perf_lsu_stalls + 64'd1; + end + if (csr_req_if.valid & !csr_req_if.ready) begin + perf_csr_stalls <= perf_csr_stalls + 64'd1; + end + if (gpu_req_if.valid & !gpu_req_if.ready) begin + perf_gpu_stalls <= perf_gpu_stalls + 64'd1; + end + `ifdef EXT_M_ENABLE + if (mul_req_if.valid & !mul_req_if.ready) begin + perf_mul_stalls <= perf_mul_stalls + 64'd1; + end + `endif + `ifdef EXT_F_ENABLE + if (fpu_req_if.valid & !fpu_req_if.ready) begin + perf_fpu_stalls <= perf_fpu_stalls + 64'd1; + end + `endif + end + end + + assign perf_pipeline_if.icache_stalls = perf_icache_stalls; + assign perf_pipeline_if.ibuffer_stalls = perf_ibuffer_stalls; + assign perf_pipeline_if.alu_stalls = perf_alu_stalls; + assign perf_pipeline_if.lsu_stalls = perf_lsu_stalls; + assign perf_pipeline_if.csr_stalls = perf_csr_stalls; + assign perf_pipeline_if.gpu_stalls = perf_gpu_stalls; +`ifdef EXT_M_ENABLE + assign perf_pipeline_if.mul_stalls = perf_mul_stalls; +`endif +`ifdef EXT_F_ENABLE + assign perf_pipeline_if.fpu_stalls = perf_fpu_stalls; +`endif 
+`endif + endmodule diff --git a/hw/rtl/Vortex.v b/hw/rtl/Vortex.v index c7be43cb..d9625e25 100644 --- a/hw/rtl/Vortex.v +++ b/hw/rtl/Vortex.v @@ -299,6 +299,9 @@ module Vortex ( ); if (`L3_ENABLE) begin + `ifdef PERF_ENABLE + VX_perf_cache_if perf_l3cache_if(); + `endif wire [`NUM_CLUSTERS-1:0] per_cluster_dram_req_valid_qual; wire [`NUM_CLUSTERS-1:0] per_cluster_dram_req_rw_qual; @@ -347,10 +350,14 @@ module Vortex ( .SNP_TAG_WIDTH (`L3SNP_TAG_WIDTH) ) l3cache ( `SCOPE_BIND_Vortex_l3cache - + .clk (clk), .reset (reset), + `ifdef PERF_ENABLE + .perf_cache_if (perf_l3cache_if), + `endif + // Core request .core_req_valid (per_cluster_dram_req_valid_qual), .core_req_rw (per_cluster_dram_req_rw_qual), diff --git a/hw/rtl/cache/VX_bank.v b/hw/rtl/cache/VX_bank.v index b5758eef..43232d90 100644 --- a/hw/rtl/cache/VX_bank.v +++ b/hw/rtl/cache/VX_bank.v @@ -96,6 +96,14 @@ module VX_bank #( output wire [SNP_TAG_WIDTH-1:0] snp_rsp_tag, input wire snp_rsp_ready, +`ifdef PERF_ENABLE + output wire perf_mshr_stall, + output wire perf_pipe_stall, + output wire perf_evict, + output wire perf_read_miss, + output wire perf_write_miss, +`endif + // Misses output wire misses ); @@ -567,7 +575,6 @@ end else begin assign incoming_fill_st2 = 0; assign misses = 0; - end `ifdef DBG_CACHE_REQ_INFO @@ -951,6 +958,18 @@ end `SCOPE_ASSIGN (addr_st2, `LINE_TO_BYTE_ADDR(addr_st2, BANK_ID)); `SCOPE_ASSIGN (addr_st3, `LINE_TO_BYTE_ADDR(addr_st3, BANK_ID)); +`ifdef PERF_ENABLE + assign perf_pipe_stall = pipeline_stall; + assign perf_mshr_stall = mshr_going_full; + assign perf_read_miss = !pipeline_stall & miss_st1 & !is_mshr_st1 & !mem_rw_st1; + assign perf_write_miss = !pipeline_stall & miss_st1 & !is_mshr_st1 & mem_rw_st1; + if (DRAM_ENABLE) begin + assign perf_evict = dreq_push & do_writeback_st3 & !is_snp_st3; + end else begin + assign perf_evict = 0; + end +`endif + `ifdef DBG_PRINT_CACHE_BANK wire incoming_fill_dfp_st3 = drsq_push && (addr_st3 == dram_rsp_addr); always @(posedge clk) begin 
diff --git a/hw/rtl/cache/VX_cache.v b/hw/rtl/cache/VX_cache.v index 5dc12063..3127420e 100644 --- a/hw/rtl/cache/VX_cache.v +++ b/hw/rtl/cache/VX_cache.v @@ -70,7 +70,12 @@ module VX_cache #( output wire [NUM_REQS-1:0][`WORD_WIDTH-1:0] core_rsp_data, output wire [`CORE_REQ_TAG_COUNT-1:0][CORE_TAG_WIDTH-1:0] core_rsp_tag, input wire [`CORE_REQ_TAG_COUNT-1:0] core_rsp_ready, - + + // PERF +`ifdef PERF_ENABLE + VX_perf_cache_if perf_cache_if, +`endif + // DRAM request output wire dram_req_valid, output wire dram_req_rw, @@ -130,7 +135,16 @@ module VX_cache #( wire [NUM_BANKS-1:0] per_bank_miss; assign miss_vec = per_bank_miss; - + + +`ifdef PERF_ENABLE + wire [NUM_BANKS-1:0] perf_mshr_stall_per_bank; + wire [NUM_BANKS-1:0] perf_pipe_stall_per_bank; + wire [NUM_BANKS-1:0] perf_evict_per_bank; + wire [NUM_BANKS-1:0] perf_read_miss_per_bank; + wire [NUM_BANKS-1:0] perf_write_miss_per_bank; +`endif + if (NUM_BANKS == 1) begin assign snp_req_ready = per_bank_snp_req_ready; end else begin @@ -139,9 +153,9 @@ module VX_cache #( VX_cache_core_req_bank_sel #( .BANK_LINE_SIZE (BANK_LINE_SIZE), - .NUM_BANKS (NUM_BANKS), - .WORD_SIZE (WORD_SIZE), - .NUM_REQS (NUM_REQS), + .NUM_BANKS (NUM_BANKS), + .WORD_SIZE (WORD_SIZE), + .NUM_REQS (NUM_REQS), .CORE_TAG_ID_BITS (CORE_TAG_ID_BITS) ) cache_core_req_bank_sel ( .core_req_valid (core_req_valid), @@ -312,6 +326,14 @@ module VX_cache #( .dram_rsp_addr (curr_bank_dram_rsp_addr), .dram_rsp_ready (curr_bank_dram_rsp_ready), + `ifdef PERF_ENABLE + .perf_mshr_stall (perf_mshr_stall_per_bank[i]), + .perf_pipe_stall (perf_pipe_stall_per_bank[i]), + .perf_evict (perf_evict_per_bank[i]), + .perf_read_miss (perf_read_miss_per_bank[i]), + .perf_write_miss (perf_write_miss_per_bank[i]), + `endif + // Snoop request .snp_req_valid (curr_bank_snp_req_valid), .snp_req_addr (curr_bank_snp_req_addr), @@ -408,4 +430,150 @@ module VX_cache #( `UNUSED_VAR (snp_rsp_ready) end +`ifdef PERF_ENABLE + // per cycle: core_req_r, core_req_w + reg 
[($clog2(NUM_REQS+1)-1):0] perf_core_req_r_per_cycle, perf_core_req_w_per_cycle; + reg [($clog2(NUM_REQS+1)-1):0] perf_crsp_stall_per_cycle; + + if (CORE_TAG_ID_BITS != 0) begin + VX_countones #( // core_req_r + .N(NUM_REQS) + ) perf_countones_core_req_r_count ( + .valids (core_req_valid & {NUM_REQS{core_req_ready & ~core_req_rw}}), + .count (perf_core_req_r_per_cycle) + ); + + VX_countones #( // core_req_w + .N(NUM_REQS) + ) perf_countones_core_req_w_count ( + .valids (core_req_valid & {NUM_REQS{core_req_ready & core_req_rw}}), + .count (perf_core_req_w_per_cycle) + ); + + VX_countones #( // core_rsp stalls (valid but not ready) + .N(NUM_REQS) + ) perf_countones_core_rsp_count ( + .valids (core_rsp_valid & {NUM_REQS{!core_rsp_ready}}), + .count (perf_crsp_stall_per_cycle) + ); + end else begin + VX_countones #( // core_req_r + .N(NUM_REQS) + ) perf_countones_core_req_r_count ( + .valids (core_req_valid & core_req_ready & ~core_req_rw), + .count (perf_core_req_r_per_cycle) + ); + + VX_countones #( // core_req_w + .N(NUM_REQS) + ) perf_countones_core_req_w_count ( + .valids (core_req_valid & core_req_ready & core_req_rw), + .count (perf_core_req_w_per_cycle) + ); + + VX_countones #( // core_rsp stalls (valid but not ready) + .N(NUM_REQS) + ) perf_countones_core_rsp_count ( + .valids (core_rsp_valid & ~core_rsp_ready), + .count (perf_crsp_stall_per_cycle) + ); + end + + // per cycle, summed over banks: mshr stalls, pipeline stalls, evictions, read misses, write misses + reg [($clog2(NUM_BANKS+1)-1):0] perf_mshr_stall_per_cycle; + reg [($clog2(NUM_BANKS+1)-1):0] perf_pipe_stall_per_cycle; + reg [($clog2(NUM_BANKS+1)-1):0] perf_evictions_per_cycle; + reg [($clog2(NUM_BANKS+1)-1):0] perf_read_miss_per_cycle; + reg [($clog2(NUM_BANKS+1)-1):0] perf_write_miss_per_cycle; + + VX_countones #( + .N(NUM_BANKS) + ) perf_countones_mshr_stall_count ( + .valids (perf_mshr_stall_per_bank), + .count (perf_mshr_stall_per_cycle) + ); + + VX_countones #( + .N(NUM_BANKS) + ) perf_countones_total_stall_count ( + .valids (perf_pipe_stall_per_bank), + .count
(perf_pipe_stall_per_cycle) + ); + + VX_countones #( // evictions + .N(NUM_BANKS) + ) perf_countones_evict_count ( + .valids (perf_evict_per_bank), + .count (perf_evictions_per_cycle) + ); + + VX_countones #( // read misses + .N(NUM_BANKS) + ) perf_countones_read_miss_count ( + .valids (perf_read_miss_per_bank), + .count (perf_read_miss_per_cycle) + ); + + VX_countones #( // write misses + .N(NUM_BANKS) + ) perf_countones_write_miss_count ( + .valids (perf_write_miss_per_bank), + .count (perf_write_miss_per_cycle) + ); + + reg [63:0] perf_core_req_r; + reg [63:0] perf_core_req_w; + reg [63:0] perf_mshr_stall; + reg [63:0] perf_pipe_stall; + reg [63:0] perf_evictions; + reg [63:0] perf_read_miss; + reg [63:0] perf_write_miss; + reg [63:0] perf_crsp_stall; + reg [63:0] perf_dreq_stall; + + always @(posedge clk) begin + if (reset) begin + perf_core_req_r <= 0; + perf_core_req_w <= 0; + perf_crsp_stall <= 0; + perf_mshr_stall <= 0; + perf_pipe_stall <= 0; + perf_evictions <= 0; + perf_read_miss <= 0; + perf_write_miss <= 0; + perf_dreq_stall <= 0; + end else begin + // core requests (per-cycle counts zero-extended to the 64-bit accumulator width) + perf_core_req_r <= perf_core_req_r + $bits(perf_core_req_r)'(perf_core_req_r_per_cycle); + perf_core_req_w <= perf_core_req_w + $bits(perf_core_req_w)'(perf_core_req_w_per_cycle); + // core response stalls + perf_crsp_stall <= perf_crsp_stall + $bits(perf_crsp_stall)'(perf_crsp_stall_per_cycle); + // mshr (miss reserve queue) stalls + perf_mshr_stall <= perf_mshr_stall + $bits(perf_mshr_stall)'(perf_mshr_stall_per_cycle); + // pipeline stalls + perf_pipe_stall <= perf_pipe_stall + $bits(perf_pipe_stall)'(perf_pipe_stall_per_cycle); + // total evictions + perf_evictions <= perf_evictions + $bits(perf_evictions)'(perf_evictions_per_cycle); + // read misses + perf_read_miss <= perf_read_miss + $bits(perf_read_miss)'(perf_read_miss_per_cycle); + // write misses + perf_write_miss <= perf_write_miss + $bits(perf_write_miss)'(perf_write_miss_per_cycle); + // dram request stalls + if (dram_req_valid & !dram_req_ready) begin + perf_dreq_stall
<= perf_dreq_stall + 64'd1; + end + end + end + + assign perf_cache_if.reads = perf_core_req_r; + assign perf_cache_if.writes = perf_core_req_w; + assign perf_cache_if.read_misses = perf_read_miss; + assign perf_cache_if.write_misses = perf_write_miss; + assign perf_cache_if.evictions = perf_evictions; + assign perf_cache_if.mshr_stalls = perf_mshr_stall; + assign perf_cache_if.pipe_stalls = perf_pipe_stall; + assign perf_cache_if.crsp_stalls = perf_crsp_stall; + assign perf_cache_if.dreq_stalls = perf_dreq_stall; +`endif + endmodule diff --git a/hw/rtl/fp_cores/VX_fpnew.v b/hw/rtl/fp_cores/VX_fpnew.v index e664a880..8fa35215 100644 --- a/hw/rtl/fp_cores/VX_fpnew.v +++ b/hw/rtl/fp_cores/VX_fpnew.v @@ -159,7 +159,7 @@ module VX_fpnew .tag_o ({fpu_tag_out, fpu_has_fflags_out}), .out_valid_o (fpu_valid_out), .out_ready_i (fpu_ready_out), - `UNUSED_PIN (busy_o) + `UNUSED_PIN (busy_o) ); end else begin fpnew_top #( @@ -179,14 +179,14 @@ module VX_fpnew .vectorial_op_i (1'b0), .tag_i (1'b0), .in_valid_i (fpu_valid_in), - `UNUSED_PIN (in_ready_o), + `UNUSED_PIN (in_ready_o), .flush_i (reset), .result_o (fpu_result[i]), .status_o (fpu_status[i]), - `UNUSED_PIN (tag_o), - `UNUSED_PIN (out_valid_o), + `UNUSED_PIN (tag_o), + `UNUSED_PIN (out_valid_o), .out_ready_i (fpu_ready_out), - `UNUSED_PIN (busy_o) + `UNUSED_PIN (busy_o) ); end end diff --git a/hw/rtl/interfaces/VX_perf_cache_if.v b/hw/rtl/interfaces/VX_perf_cache_if.v new file mode 100644 index 00000000..9fcde291 --- /dev/null +++ b/hw/rtl/interfaces/VX_perf_cache_if.v @@ -0,0 +1,20 @@ +`ifndef VX_PERF_CACHE_IF +`define VX_PERF_CACHE_IF + +`include "VX_define.vh" + +interface VX_perf_cache_if (); + + wire [63:0] reads; + wire [63:0] writes; + wire [63:0] read_misses; + wire [63:0] write_misses; + wire [63:0] evictions; + wire [63:0] mshr_stalls; + wire [63:0] crsp_stalls; + wire [63:0] dreq_stalls; + wire [63:0] pipe_stalls; + +endinterface + +`endif \ No newline at end of file diff --git 
a/hw/rtl/interfaces/VX_perf_memsys_if.v b/hw/rtl/interfaces/VX_perf_memsys_if.v new file mode 100644 index 00000000..11ee96fb --- /dev/null +++ b/hw/rtl/interfaces/VX_perf_memsys_if.v @@ -0,0 +1,17 @@ +`ifndef VX_PERF_MEMSYS_IF +`define VX_PERF_MEMSYS_IF + +`include "VX_define.vh" + +interface VX_perf_memsys_if (); + + VX_perf_cache_if dcache_if(); // data cache counters (driven from VX_mem_unit) + VX_perf_cache_if icache_if(); // instruction cache counters (driven from VX_mem_unit) + + wire [63:0] dram_latency; + wire [63:0] dram_requests; + wire [63:0] dram_responses; + +endinterface + +`endif \ No newline at end of file diff --git a/hw/rtl/interfaces/VX_perf_pipeline_if.v b/hw/rtl/interfaces/VX_perf_pipeline_if.v new file mode 100644 index 00000000..a1adb7fd --- /dev/null +++ b/hw/rtl/interfaces/VX_perf_pipeline_if.v @@ -0,0 +1,25 @@ +`ifndef VX_PERF_PIPELINE_IF +`define VX_PERF_PIPELINE_IF + +`include "VX_define.vh" + +interface VX_perf_pipeline_if (); + // from pipeline + wire [63:0] icache_stalls; + wire [63:0] ibuffer_stalls; + // from issue + wire [63:0] scoreboard_stalls; + // execute-unit request stalls (valid & !ready), counted in VX_pipeline + wire [63:0] lsu_stalls; + wire [63:0] csr_stalls; + wire [63:0] alu_stalls; + wire [63:0] gpu_stalls; +`ifdef EXT_M_ENABLE + wire [63:0] mul_stalls; +`endif +`ifdef EXT_F_ENABLE + wire [63:0] fpu_stalls; +`endif +endinterface + +`endif \ No newline at end of file