merging perf counters

This commit is contained in:
Blaise Tine
2020-12-08 21:02:39 -08:00
27 changed files with 1047 additions and 230 deletions

View File

@@ -91,24 +91,240 @@ extern int vx_upload_kernel_file(vx_device_h device, const char* filename) {
return err;
}
extern int vx_get_perf(vx_device_h device, int core_id, size_t* instrs, size_t* cycles) {
// Reads a 64-bit value that the device exposes as two 32-bit CSRs.
//
// device : device handle forwarded to vx_csr_get
// core_id: core whose CSRs are read
// addr   : CSR address holding the low 32 bits
// addr_h : CSR address holding the high 32 bits
// value  : receives (high << 32) | low; always written
//
// Returns the bitwise OR of the two vx_csr_get status codes, so the result
// is 0 only when both reads returned 0.
//
// NOTE(review): the two halves are fetched by separate calls; if the counter
// can still advance between them the combined value may be torn - confirm the
// device is idle whenever this is called.
int vx_csr_get_l(vx_device_h device, int core_id, int addr, int addr_h, uint64_t* value) {
  // Zero-initialized so *value stays well defined even if a read fails
  // without storing to its output argument.
  unsigned value_lo = 0;
  unsigned value_hi = 0;
  int ret = vx_csr_get(device, core_id, addr, &value_lo);
  ret |= vx_csr_get(device, core_id, addr_h, &value_hi);
  // Widen before shifting so the high half is not shifted out of 32 bits.
  *value = (uint64_t(value_hi) << 32) | value_lo;
  return ret;
}
// Returns num / den as a double, or 0 when den is 0. Keeps the derived
// metrics (IPC, average latency) well defined when a counter never advanced,
// instead of dividing by zero and converting inf/NaN to an integer.
static double vx_perf_ratio(uint64_t num, uint64_t den) {
  return (den != 0) ? (double(num) / double(den)) : 0.0;
}

#ifdef PERF_ENABLE
// Describes one 64-bit performance counter exposed as a low/high CSR pair.
struct vx_perf_counter_t {
  const char* label; // text printed after the "PERF: " prefix
  int addr;          // CSR address of the low 32 bits
  int addr_h;        // CSR address of the high 32 bits
};

// Counters that are read per core, optionally printed per core, and summed
// over all cores. Listed in the order in which the summary is printed.
static const vx_perf_counter_t vx_perf_counters[] = {
  // pipeline stalls
  {"icache stalls",          CSR_MPM_ICACHE_ST,      CSR_MPM_ICACHE_ST_H},
  {"ibuffer stalls",         CSR_MPM_IBUF_ST,        CSR_MPM_IBUF_ST_H},
  {"scoreboard stalls",      CSR_MPM_SCRB_ST,        CSR_MPM_SCRB_ST_H},
  {"alu stalls",             CSR_MPM_ALU_ST,         CSR_MPM_ALU_ST_H},
  {"lsu stalls",             CSR_MPM_LSU_ST,         CSR_MPM_LSU_ST_H},
  {"csr stalls",             CSR_MPM_CSR_ST,         CSR_MPM_CSR_ST_H},
  {"mul stalls",             CSR_MPM_MUL_ST,         CSR_MPM_MUL_ST_H},
  {"fpu stalls",             CSR_MPM_FPU_ST,         CSR_MPM_FPU_ST_H},
  {"gpu stalls",             CSR_MPM_GPU_ST,         CSR_MPM_GPU_ST_H},
  // Icache
  {"icache reads",           CSR_MPM_ICACHE_READS,   CSR_MPM_ICACHE_READS_H},
  {"icache read misses",     CSR_MPM_ICACHE_MISS_R,  CSR_MPM_ICACHE_MISS_R_H},
  {"icache response stalls", CSR_MPM_ICACHE_CRSP_ST, CSR_MPM_ICACHE_CRSP_ST_H},
  {"icache pipeline stalls", CSR_MPM_ICACHE_PIPE_ST, CSR_MPM_ICACHE_PIPE_ST_H},
  {"icache dram stalls",     CSR_MPM_ICACHE_DREQ_ST, CSR_MPM_ICACHE_DREQ_ST_H},
  {"icache mshr stalls",     CSR_MPM_ICACHE_MSHR_ST, CSR_MPM_ICACHE_MSHR_ST_H},
  // Dcache
  {"dcache reads",           CSR_MPM_DCACHE_READS,   CSR_MPM_DCACHE_READS_H},
  {"dcache writes",          CSR_MPM_DCACHE_WRITES,  CSR_MPM_DCACHE_WRITES_H},
  {"dcache read misses",     CSR_MPM_DCACHE_MISS_R,  CSR_MPM_DCACHE_MISS_R_H},
  {"dcache write misses",    CSR_MPM_DCACHE_MISS_W,  CSR_MPM_DCACHE_MISS_W_H},
  {"dcache evictions",       CSR_MPM_DCACHE_EVICTS,  CSR_MPM_DCACHE_EVICTS_H},
  {"dcache pipeline stalls", CSR_MPM_DCACHE_PIPE_ST, CSR_MPM_DCACHE_PIPE_ST_H},
  {"dcache response stalls", CSR_MPM_DCACHE_CRSP_ST, CSR_MPM_DCACHE_CRSP_ST_H},
  {"dcache dram stalls",     CSR_MPM_DCACHE_DREQ_ST, CSR_MPM_DCACHE_DREQ_ST_H},
  {"dcache mshr stalls",     CSR_MPM_DCACHE_MSHR_ST, CSR_MPM_DCACHE_MSHR_ST_H},
};

static const unsigned VX_NUM_PERF_COUNTERS =
    static_cast<unsigned>(sizeof(vx_perf_counters) / sizeof(vx_perf_counters[0]));
#endif

// Dumps the performance counters of every core to `stream`.
//
// The core count is read from CSR_NC on core 0. For each core the retired
// instruction and cycle counters are read; when PERF_ENABLE is defined the
// stall, cache and DRAM counters are read as well. Per-core lines are only
// printed when there is more than one core; the summary is always printed.
// In the summary, instructions and the extra counters are summed over the
// cores while cycles is the maximum over the cores.
//
// device: device handle forwarded to vx_csr_get / vx_csr_get_l
// stream: destination of the report
//
// Returns the bitwise OR of the status codes of all CSR reads. If the core
// count cannot be read, that status is returned and nothing is printed.
extern int vx_dump_perf(vx_device_h device, FILE* stream) {
  unsigned num_cores = 0;
  int ret = vx_csr_get(device, 0, CSR_NC, &num_cores);
  if (ret != 0) {
    // Without a valid core count there is nothing meaningful to report.
    return ret;
  }
  const bool per_core = (num_cores > 1);

  uint64_t instrs = 0;
  uint64_t cycles = 0;
#ifdef PERF_ENABLE
  uint64_t totals[VX_NUM_PERF_COUNTERS] = {0};
  uint64_t dram_req = 0;
  uint64_t dram_rsp = 0;
  uint64_t dram_lat = 0;
#endif

  for (unsigned core_id = 0; core_id < num_cores; ++core_id) {
    uint64_t instrs_per_core = 0;
    uint64_t cycles_per_core = 0;
    ret |= vx_csr_get_l(device, core_id, CSR_MINSTRET, CSR_MINSTRET_H, &instrs_per_core);
    ret |= vx_csr_get_l(device, core_id, CSR_MCYCLE, CSR_MCYCLE_H, &cycles_per_core);
    if (per_core) {
      const float IPC = static_cast<float>(vx_perf_ratio(instrs_per_core, cycles_per_core));
      fprintf(stream, "PERF: core%u: instrs=%llu, cycles=%llu, IPC=%f\n", core_id,
              static_cast<unsigned long long>(instrs_per_core),
              static_cast<unsigned long long>(cycles_per_core), IPC);
    }
    instrs += instrs_per_core;
    // The summary reports the cycle count of the slowest core rather than a sum.
    cycles = std::max<uint64_t>(cycles_per_core, cycles);

#ifdef PERF_ENABLE
    for (unsigned i = 0; i < VX_NUM_PERF_COUNTERS; ++i) {
      const vx_perf_counter_t& counter = vx_perf_counters[i];
      uint64_t count = 0;
      ret |= vx_csr_get_l(device, core_id, counter.addr, counter.addr_h, &count);
      if (per_core) {
        fprintf(stream, "PERF: core%u: %s=%llu\n", core_id, counter.label,
                static_cast<unsigned long long>(count));
      }
      totals[i] += count;
    }

    // DRAM traffic and latency
    uint64_t dram_req_per_core = 0;
    uint64_t dram_rsp_per_core = 0;
    uint64_t dram_lat_per_core = 0;
    ret |= vx_csr_get_l(device, core_id, CSR_MPM_DRAM_REQ, CSR_MPM_DRAM_REQ_H, &dram_req_per_core);
    ret |= vx_csr_get_l(device, core_id, CSR_MPM_DRAM_RSP, CSR_MPM_DRAM_RSP_H, &dram_rsp_per_core);
    ret |= vx_csr_get_l(device, core_id, CSR_MPM_DRAM_LAT, CSR_MPM_DRAM_LAT_H, &dram_lat_per_core);
    if (per_core) {
      // Responses are reported as reads and the remaining requests as writes.
      fprintf(stream, "PERF: core%u: dram requests=%llu (reads=%llu, writes=%llu)\n", core_id,
              static_cast<unsigned long long>(dram_req_per_core),
              static_cast<unsigned long long>(dram_rsp_per_core),
              static_cast<unsigned long long>(dram_req_per_core - dram_rsp_per_core));
      const int avg_dram_lat_per_core =
          static_cast<int>(vx_perf_ratio(dram_lat_per_core, dram_rsp_per_core));
      fprintf(stream, "PERF: core%u: average dram latency=%d cycles\n", core_id,
              avg_dram_lat_per_core);
    }
    dram_req += dram_req_per_core;
    dram_rsp += dram_rsp_per_core;
    dram_lat += dram_lat_per_core;
#endif
  }

  const float IPC = static_cast<float>(vx_perf_ratio(instrs, cycles));
  fprintf(stream, "PERF: instrs=%llu, cycles=%llu, IPC=%f\n",
          static_cast<unsigned long long>(instrs),
          static_cast<unsigned long long>(cycles), IPC);

#ifdef PERF_ENABLE
  for (unsigned i = 0; i < VX_NUM_PERF_COUNTERS; ++i) {
    fprintf(stream, "PERF: %s=%llu\n", vx_perf_counters[i].label,
            static_cast<unsigned long long>(totals[i]));
  }
  fprintf(stream, "PERF: dram requests=%llu (reads=%llu, writes=%llu)\n",
          static_cast<unsigned long long>(dram_req),
          static_cast<unsigned long long>(dram_rsp),
          static_cast<unsigned long long>(dram_req - dram_rsp));
  const int avg_dram_lat = static_cast<int>(vx_perf_ratio(dram_lat, dram_rsp));
  fprintf(stream, "PERF: average dram latency=%d cycles\n", avg_dram_lat);
#endif

  return ret;
}

View File

@@ -2,6 +2,7 @@
#define __VX_DRIVER_H__
#include <stddef.h>
#include <stdio.h>
#ifdef __cplusplus
extern "C" {
@@ -71,8 +72,8 @@ int vx_upload_kernel_bytes(vx_device_h device, const void* content, size_t size)
// upload kernel file to device
int vx_upload_kernel_file(vx_device_h device, const char* filename);
// get performance counters
int vx_get_perf(vx_device_h device, int core_id, size_t* instrs, size_t* cycles);
// dump performance counters
int vx_dump_perf(vx_device_h device, FILE* stream);
#ifdef __cplusplus
}

View File

@@ -58,6 +58,12 @@ ifdef SCOPE
SCOPE_H = scope-defs.h
endif
# Enable perf counters
ifdef PERF
CXXFLAGS += -DPERF_ENABLE
PERF_ENABLE = PERF=1
endif
all: vlsim
# AFU info from JSON file, including AFU UUID
@@ -71,7 +77,7 @@ scope-defs.h: $(SCRIPT_DIR)/scope.json
scope: scope-defs.h
vlsim-hw: $(SCOPE_H)
$(SCOPE_ENABLE) $(MAKE) -C vlsim
$(SCOPE_ENABLE) $(PERF_ENABLE) $(MAKE) -C vlsim
fpga: $(SRCS) $(SCOPE_H)
$(CXX) $(CXXFLAGS) -DUSE_FPGA $^ $(LDFLAGS) $(FPGA_LIBS) -o $(PROJECT)
@@ -94,7 +100,6 @@ $(ASE_DIR):
clean:
rm -rf $(PROJECT) $(PROJECT_ASE) $(PROJECT_VLSIM) *.o .depend
$(MAKE) -C vlsim clean
$(MAKE) -C ase clean
ifneq ($(MAKECMDGOALS),clean)
-include .depend

View File

@@ -43,8 +43,9 @@ RTL_DIR=../../../hw/rtl
SRCS = fpga.cpp opae_sim.cpp
SRCS += $(RTL_DIR)/fp_cores/svdpi/float_dpi.cpp
FPU_INCLUDE = -I$(RTL_DIR)/fp_cores -I$(RTL_DIR)/fp_cores/svdpi -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src -I$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl -I$(RTL_DIR)/fp_cores/fpnew/src
FPU_INCLUDE = -I$(RTL_DIR)/fp_cores -I$(RTL_DIR)/fp_cores/svdpi -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src -I$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl -I$(RTL_DIR)/fp_cores/fpnew/src
RTL_INCLUDE = -I$(RTL_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/cache $(FPU_INCLUDE)
RTL_INCLUDE += -I$(RTL_DIR)/afu -I$(RTL_DIR)/afu/ccip
VL_FLAGS += -O2 --language 1800-2009 --assert -Wall -Wpedantic $(CONFIGS)
VL_FLAGS += -Wno-DECLFILENAME
@@ -70,6 +71,12 @@ ifdef SCOPE
CFLAGS += -DSCOPE
endif
# Enable perf counters
ifdef PERF
VL_FLAGS += -DPERF_ENABLE
CFLAGS += -DPERF_ENABLE
endif
# use our OPAE shim
VL_FLAGS += -DNOPAE
CFLAGS += -DNOPAE
@@ -77,8 +84,6 @@ CFLAGS += -DNOPAE
# use DPI FPU
VL_FLAGS += -DFPU_FAST
RTL_INCLUDE += -I../../../hw/opae -I../../../hw/opae/ccip
PROJECT = libopae-c-vlsim.so
all: $(PROJECT)

View File

@@ -244,27 +244,7 @@ extern int vx_dev_close(vx_device_h hdevice) {
#endif
#ifdef DUMP_PERF_STATS
// Dump perf stats
if (device->num_cores > 1) {
uint64_t total_instrs = 0, total_cycles = 0;
for (unsigned core_id = 0; core_id < device->num_cores; ++core_id) {
uint64_t instrs, cycles;
int ret = vx_get_perf(hdevice, core_id, &instrs, &cycles);
assert(ret == 0);
float IPC = (float)(double(instrs) / double(cycles));
fprintf(stdout, "[VXDRV] PERF: core%d: instrs=%ld, cycles=%ld, IPC=%f\n", core_id, instrs, cycles, IPC);
total_instrs += instrs;
total_cycles = std::max<uint64_t>(total_cycles, cycles);
}
float IPC = (float)(double(total_instrs) / double(total_cycles));
fprintf(stdout, "[VXDRV] PERF: instrs=%ld, cycles=%ld, IPC=%f\n", total_instrs, total_cycles, IPC);
} else {
uint64_t instrs, cycles;
int ret = vx_get_perf(hdevice, 0, &instrs, &cycles);
float IPC = (float)(double(instrs) / double(cycles));
assert(ret == 0);
fprintf(stdout, "[VXDRV] PERF: instrs=%ld, cycles=%ld, IPC=%f\n", instrs, cycles, IPC);
}
vx_dump_perf(device, stdout);
#endif
fpgaClose(device->fpga);

View File

@@ -64,6 +64,12 @@ else
CFLAGS += -DNDEBUG
endif
# Enable perf counters
ifdef PERF
VL_FLAGS += -DPERF_ENABLE
CFLAGS += -DPERF_ENABLE
endif
# use DPI FPU
VL_FLAGS += -DFPU_FAST

View File

@@ -239,26 +239,7 @@ extern int vx_dev_close(vx_device_h hdevice) {
vx_device *device = ((vx_device*)hdevice);
#ifdef DUMP_PERF_STATS
unsigned num_cores;
vx_csr_get(hdevice, 0, CSR_NC, &num_cores);
if (num_cores > 1) {
uint64_t total_instrs = 0, total_cycles = 0;
for (unsigned core_id = 0; core_id < num_cores; ++core_id) {
uint64_t instrs, cycles;
vx_get_perf(hdevice, core_id, &instrs, &cycles);
float IPC = (float)(double(instrs) / double(cycles));
fprintf(stdout, "PERF: core%d: instrs=%ld, cycles=%ld, IPC=%f\n", core_id, instrs, cycles, IPC);
total_instrs += instrs;
total_cycles = std::max<uint64_t>(total_cycles, cycles);
}
float IPC = (float)(double(total_instrs) / double(total_cycles));
fprintf(stdout, "PERF: instrs=%ld, cycles=%ld, IPC=%f\n", total_instrs, total_cycles, IPC);
} else {
uint64_t instrs, cycles;
vx_get_perf(hdevice, 0, &instrs, &cycles);
float IPC = (float)(double(instrs) / double(cycles));
fprintf(stdout, "PERF: instrs=%ld, cycles=%ld, IPC=%f\n", instrs, cycles, IPC);
}
vx_dump_perf(device, stdout);
#endif
delete device;