code refactoring: DRAM => MEM renaming
This commit is contained in:
@@ -115,10 +115,10 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) {
|
||||
uint64_t smem_writes = 0;
|
||||
uint64_t smem_bank_stalls = 0;
|
||||
// PERF: memory
|
||||
uint64_t dram_reads = 0;
|
||||
uint64_t dram_writes = 0;
|
||||
uint64_t dram_stalls = 0;
|
||||
uint64_t dram_lat = 0;
|
||||
uint64_t mem_reads = 0;
|
||||
uint64_t mem_writes = 0;
|
||||
uint64_t mem_stalls = 0;
|
||||
uint64_t mem_lat = 0;
|
||||
#endif
|
||||
|
||||
for (unsigned core_id = 0; core_id < num_cores; ++core_id) {
|
||||
@@ -255,21 +255,21 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) {
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: smem bank stalls=%ld (utilization=%d%%)\n", core_id, smem_bank_st_per_core, smem_bank_utilization);
|
||||
smem_bank_stalls += smem_bank_st_per_core;
|
||||
|
||||
// PERF: DRAM
|
||||
uint64_t dram_reads_per_core, dram_writes_per_core, dram_stalls_per_core, dram_lat_per_core;
|
||||
ret |= vx_csr_get_l(device, core_id, CSR_MPM_DRAM_READS, CSR_MPM_DRAM_READS_H, &dram_reads_per_core);
|
||||
ret |= vx_csr_get_l(device, core_id, CSR_MPM_DRAM_WRITES, CSR_MPM_DRAM_WRITES_H, &dram_writes_per_core);
|
||||
ret |= vx_csr_get_l(device, core_id, CSR_MPM_DRAM_ST, CSR_MPM_DRAM_ST_H, &dram_stalls_per_core);
|
||||
ret |= vx_csr_get_l(device, core_id, CSR_MPM_DRAM_LAT, CSR_MPM_DRAM_LAT_H, &dram_lat_per_core);
|
||||
int dram_utilization = (int)((double(dram_reads_per_core + dram_writes_per_core) / double(dram_reads_per_core + dram_writes_per_core + dram_stalls_per_core)) * 100);
|
||||
int dram_avg_lat = (int)(double(dram_lat_per_core) / double(dram_reads_per_core));
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: dram requests=%ld (reads=%ld, writes=%ld)\n", core_id, (dram_reads_per_core + dram_writes_per_core), dram_reads_per_core, dram_writes_per_core);
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: dram stalls=%ld (utilization=%d%%)\n", core_id, dram_stalls_per_core, dram_utilization);
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: dram average latency=%d cycles\n", core_id, dram_avg_lat);
|
||||
dram_reads += dram_reads_per_core;
|
||||
dram_writes += dram_writes_per_core;
|
||||
dram_stalls += dram_stalls_per_core;
|
||||
dram_lat += dram_lat_per_core;
|
||||
// PERF: memory
|
||||
uint64_t mem_reads_per_core, mem_writes_per_core, mem_stalls_per_core, mem_lat_per_core;
|
||||
ret |= vx_csr_get_l(device, core_id, CSR_MPM_MEM_READS, CSR_MPM_MEM_READS_H, &mem_reads_per_core);
|
||||
ret |= vx_csr_get_l(device, core_id, CSR_MPM_MEM_WRITES, CSR_MPM_MEM_WRITES_H, &mem_writes_per_core);
|
||||
ret |= vx_csr_get_l(device, core_id, CSR_MPM_MEM_ST, CSR_MPM_MEM_ST_H, &mem_stalls_per_core);
|
||||
ret |= vx_csr_get_l(device, core_id, CSR_MPM_MEM_LAT, CSR_MPM_MEM_LAT_H, &mem_lat_per_core);
|
||||
int mem_utilization = (int)((double(mem_reads_per_core + mem_writes_per_core) / double(mem_reads_per_core + mem_writes_per_core + mem_stalls_per_core)) * 100);
|
||||
int mem_avg_lat = (int)(double(mem_lat_per_core) / double(mem_reads_per_core));
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: memory requests=%ld (reads=%ld, writes=%ld)\n", core_id, (mem_reads_per_core + mem_writes_per_core), mem_reads_per_core, mem_writes_per_core);
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: memory stalls=%ld (utilization=%d%%)\n", core_id, mem_stalls_per_core, mem_utilization);
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: memory average latency=%d cycles\n", core_id, mem_avg_lat);
|
||||
mem_reads += mem_reads_per_core;
|
||||
mem_writes += mem_writes_per_core;
|
||||
mem_stalls += mem_stalls_per_core;
|
||||
mem_lat += mem_lat_per_core;
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -282,8 +282,8 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) {
|
||||
int dcache_write_hit_ratio = (int)((1.0 - (double(dcache_write_misses) / double(dcache_writes))) * 100);
|
||||
int dcache_bank_utilization = (int)((double(dcache_reads + dcache_writes) / double(dcache_reads + dcache_writes + dcache_bank_stalls)) * 100);
|
||||
int smem_bank_utilization = (int)((double(smem_reads + smem_writes) / double(smem_reads + smem_writes + smem_bank_stalls)) * 100);
|
||||
int dram_utilization = (int)((double(dram_reads + dram_writes) / double(dram_reads + dram_writes + dram_stalls)) * 100);
|
||||
int dram_avg_lat = (int)(double(dram_lat) / double(dram_reads));
|
||||
int mem_utilization = (int)((double(mem_reads + mem_writes) / double(mem_reads + mem_writes + mem_stalls)) * 100);
|
||||
int mem_avg_lat = (int)(double(mem_lat) / double(mem_reads));
|
||||
fprintf(stream, "PERF: ibuffer stalls=%ld\n", ibuffer_stalls);
|
||||
fprintf(stream, "PERF: scoreboard stalls=%ld\n", scoreboard_stalls);
|
||||
fprintf(stream, "PERF: alu unit stalls=%ld\n", alu_stalls);
|
||||
@@ -306,9 +306,9 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) {
|
||||
fprintf(stream, "PERF: smem reads=%ld\n", smem_reads);
|
||||
fprintf(stream, "PERF: smem writes=%ld\n", smem_writes);
|
||||
fprintf(stream, "PERF: smem bank stalls=%ld (utilization=%d%%)\n", smem_bank_stalls, smem_bank_utilization);
|
||||
fprintf(stream, "PERF: dram requests=%ld (reads=%ld, writes=%ld)\n", (dram_reads + dram_writes), dram_reads, dram_writes);
|
||||
fprintf(stream, "PERF: dram stalls=%ld (utilization=%d%%)\n", dram_stalls, dram_utilization);
|
||||
fprintf(stream, "PERF: dram average latency=%d cycles\n", dram_avg_lat);
|
||||
fprintf(stream, "PERF: memory requests=%ld (reads=%ld, writes=%ld)\n", (mem_reads + mem_writes), mem_reads, mem_writes);
|
||||
fprintf(stream, "PERF: memory stalls=%ld (utilization=%d%%)\n", mem_stalls, mem_utilization);
|
||||
fprintf(stream, "PERF: memory average latency=%d cycles\n", mem_avg_lat);
|
||||
#endif
|
||||
|
||||
return ret;
|
||||
|
||||
@@ -13,7 +13,7 @@ DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_BANK
|
||||
DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_MSHR
|
||||
DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_TAG
|
||||
DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_DATA
|
||||
DBG_PRINT_FLAGS += -DDBG_PRINT_DRAM
|
||||
DBG_PRINT_FLAGS += -DDBG_PRINT_MEM
|
||||
DBG_PRINT_FLAGS += -DDBG_PRINT_OPAE
|
||||
DBG_PRINT_FLAGS += -DDBG_PRINT_AVS
|
||||
DBG_PRINT_FLAGS += -DDBG_PRINT_SCOPE
|
||||
|
||||
@@ -10,10 +10,10 @@
|
||||
|
||||
#define RESET_DELAY 4
|
||||
|
||||
// Tuning knobs for the simulated memory bus handled in opae_sim::avs_bus().
// NOTE(review): this listing is a diff view, so every knob appears twice:
// first under its old DRAM_ name, then under its new MEM_ name.
#define ENABLE_DRAM_STALLS
|
||||
#define DRAM_LATENCY 24
|
||||
#define DRAM_RQ_SIZE 16
|
||||
#define DRAM_STALLS_MODULO 16
|
||||
// When defined, avs_bus() evaluates the two stall conditions below
// (the stall logic sits inside an #ifdef on this macro).
#define ENABLE_MEM_STALLS
|
||||
// Initial cycles_left given to a newly accepted read request.
#define MEM_LATENCY 24
|
||||
// avs_bus() stalls once the number of pending reads reaches this value.
#define MEM_RQ_SIZE 16
|
||||
// avs_bus() stalls whenever (timestamp/2) % this value is 0.
#define MEM_STALLS_MODULO 16
|
||||
|
||||
uint64_t timestamp = 0;
|
||||
|
||||
@@ -138,7 +138,7 @@ void opae_sim::flush() {
|
||||
void opae_sim::reset() {
|
||||
|
||||
host_buffers_.clear();
|
||||
dram_reads_.clear();
|
||||
mem_reads_.clear();
|
||||
cci_reads_.clear();
|
||||
cci_writes_.clear();
|
||||
vortex_afu_->vcp2af_sRxPort_c0_rspValid = 0;
|
||||
@@ -268,87 +268,87 @@ void opae_sim::sTxPort_bus() {
|
||||
}
|
||||
|
||||
// Services the avs_* signals of vortex_afu_ for one call:
//   1. ages every pending read by one cycle (cycles_left stops at 0);
//   2. if the oldest pending read is ready, returns its data on
//      avs_readdata, raises avs_readdatavalid and removes it from the list;
//   3. decides whether to stall (only when the stall macro is defined);
//   4. when not stalled, either applies a byte-enabled write to ram_ or
//      queues a new read whose data is captured from ram_ right away;
//   5. publishes the stall decision on avs_waitrequest.
// NOTE(review): this listing is a diff view, so each renamed statement
// appears twice: first with the old dram_/DRAM_ names, then with the new
// mem_/MEM_ names. Only one of the two exists in either real file version.
void opae_sim::avs_bus() {
|
||||
// update DRAM responses schedule
|
||||
for (auto& rsp : dram_reads_) {
|
||||
// update memory responses schedule
|
||||
for (auto& rsp : mem_reads_) {
|
||||
if (rsp.cycles_left > 0)
|
||||
rsp.cycles_left -= 1;
|
||||
}
|
||||
|
||||
// schedule DRAM responses in FIFO order
|
||||
std::list<dram_rd_req_t>::iterator dram_rd_it(dram_reads_.end());
|
||||
if (!dram_reads_.empty()
|
||||
&& (0 == dram_reads_.begin()->cycles_left)) {
|
||||
dram_rd_it = dram_reads_.begin();
|
||||
// schedule memory responses in FIFO order
|
||||
// only the head of the list is examined, so a ready entry queued behind
// a head that is still counting down has to wait for it
|
||||
std::list<mem_rd_req_t>::iterator mem_rd_it(mem_reads_.end());
|
||||
if (!mem_reads_.empty()
|
||||
&& (0 == mem_reads_.begin()->cycles_left)) {
|
||||
mem_rd_it = mem_reads_.begin();
|
||||
}
|
||||
|
||||
// send DRAM response
|
||||
// send memory response
|
||||
// readdatavalid is cleared by default and raised only when a response is sent
|
||||
vortex_afu_->avs_readdatavalid = 0;
|
||||
if (dram_rd_it != dram_reads_.end()) {
|
||||
if (mem_rd_it != mem_reads_.end()) {
|
||||
vortex_afu_->avs_readdatavalid = 1;
|
||||
memcpy(vortex_afu_->avs_readdata, dram_rd_it->data.data(), DRAM_BLOCK_SIZE);
|
||||
uint32_t addr = dram_rd_it->addr;
|
||||
dram_reads_.erase(dram_rd_it);
|
||||
/*printf("%0ld: [sim] DRAM Rd Rsp: addr=%x, pending={", timestamp, addr * DRAM_BLOCK_SIZE);
|
||||
for (auto& req : dram_reads_) {
|
||||
memcpy(vortex_afu_->avs_readdata, mem_rd_it->data.data(), MEM_BLOCK_SIZE);
|
||||
uint32_t addr = mem_rd_it->addr;
|
||||
mem_reads_.erase(mem_rd_it);
|
||||
/*printf("%0ld: [sim] MEM Rd Rsp: addr=%x, pending={", timestamp, addr * MEM_BLOCK_SIZE);
|
||||
for (auto& req : mem_reads_) {
|
||||
if (req.cycles_left != 0)
|
||||
printf(" !%0x", req.addr * DRAM_BLOCK_SIZE);
|
||||
printf(" !%0x", req.addr * MEM_BLOCK_SIZE);
|
||||
else
|
||||
printf(" %0x", req.addr * DRAM_BLOCK_SIZE);
|
||||
printf(" %0x", req.addr * MEM_BLOCK_SIZE);
|
||||
}
|
||||
printf("}\n");*/
|
||||
}
|
||||
|
||||
// handle DRAM stalls
|
||||
bool dram_stalled = false;
|
||||
#ifdef ENABLE_DRAM_STALLS
|
||||
if (0 == ((timestamp/2) % DRAM_STALLS_MODULO)) {
|
||||
dram_stalled = true;
|
||||
// handle memory stalls
|
||||
bool mem_stalled = false;
|
||||
#ifdef ENABLE_MEM_STALLS
|
||||
// periodic stall: taken whenever (timestamp/2) is a multiple of the modulo
|
||||
if (0 == ((timestamp/2) % MEM_STALLS_MODULO)) {
|
||||
mem_stalled = true;
|
||||
} else
|
||||
if (dram_reads_.size() >= DRAM_RQ_SIZE) {
|
||||
dram_stalled = true;
|
||||
// back-pressure: stall once the pending-read list reaches the queue size
|
||||
if (mem_reads_.size() >= MEM_RQ_SIZE) {
|
||||
mem_stalled = true;
|
||||
}
|
||||
#endif
|
||||
|
||||
// process DRAM requests
|
||||
if (!dram_stalled) {
|
||||
// process memory requests
|
||||
if (!mem_stalled) {
|
||||
// a read and a write must never be asserted in the same call
|
||||
assert(!vortex_afu_->avs_read || !vortex_afu_->avs_write);
|
||||
if (vortex_afu_->avs_write) {
|
||||
uint64_t byteen = vortex_afu_->avs_byteenable;
|
||||
unsigned base_addr = vortex_afu_->avs_address * DRAM_BLOCK_SIZE;
|
||||
unsigned base_addr = vortex_afu_->avs_address * MEM_BLOCK_SIZE;
|
||||
uint8_t* data = (uint8_t*)(vortex_afu_->avs_writedata);
|
||||
for (int i = 0; i < DRAM_BLOCK_SIZE; i++) {
|
||||
for (int i = 0; i < MEM_BLOCK_SIZE; i++) {
|
||||
// bit i of byteenable selects whether byte i of writedata is stored
|
||||
if ((byteen >> i) & 0x1) {
|
||||
ram_[base_addr + i] = data[i];
|
||||
}
|
||||
}
|
||||
/*printf("%0ld: [sim] DRAM Wr Req: addr=%x, data=", timestamp, base_addr);
|
||||
for (int i = 0; i < DRAM_BLOCK_SIZE; i++) {
|
||||
printf("%0x", data[(DRAM_BLOCK_SIZE-1)-i]);
|
||||
/*printf("%0ld: [sim] MEM Wr Req: addr=%x, data=", timestamp, base_addr);
|
||||
for (int i = 0; i < MEM_BLOCK_SIZE; i++) {
|
||||
printf("%0x", data[(MEM_BLOCK_SIZE-1)-i]);
|
||||
}
|
||||
printf("\n");*/
|
||||
}
|
||||
if (vortex_afu_->avs_read) {
|
||||
dram_rd_req_t dram_req;
|
||||
dram_req.addr = vortex_afu_->avs_address;
|
||||
ram_.read(vortex_afu_->avs_address * DRAM_BLOCK_SIZE, DRAM_BLOCK_SIZE, dram_req.data.data());
|
||||
dram_req.cycles_left = DRAM_LATENCY;
|
||||
for (auto& rsp : dram_reads_) {
|
||||
if (dram_req.addr == rsp.addr) {
|
||||
dram_req.cycles_left = rsp.cycles_left;
|
||||
mem_rd_req_t mem_req;
|
||||
mem_req.addr = vortex_afu_->avs_address;
|
||||
ram_.read(vortex_afu_->avs_address * MEM_BLOCK_SIZE, MEM_BLOCK_SIZE, mem_req.data.data());
|
||||
mem_req.cycles_left = MEM_LATENCY;
|
||||
// a request for an address that is already pending takes over that
// entry's remaining cycles instead of the full latency (first match wins)
|
||||
for (auto& rsp : mem_reads_) {
|
||||
if (mem_req.addr == rsp.addr) {
|
||||
mem_req.cycles_left = rsp.cycles_left;
|
||||
break;
|
||||
}
|
||||
}
|
||||
dram_reads_.emplace_back(dram_req);
|
||||
/*printf("%0ld: [sim] DRAM Rd Req: addr=%x, pending={", timestamp, dram_req.addr * DRAM_BLOCK_SIZE);
|
||||
for (auto& req : dram_reads_) {
|
||||
mem_reads_.emplace_back(mem_req);
|
||||
/*printf("%0ld: [sim] MEM Rd Req: addr=%x, pending={", timestamp, mem_req.addr * MEM_BLOCK_SIZE);
|
||||
for (auto& req : mem_reads_) {
|
||||
if (req.cycles_left != 0)
|
||||
printf(" !%0x", req.addr * DRAM_BLOCK_SIZE);
|
||||
printf(" !%0x", req.addr * MEM_BLOCK_SIZE);
|
||||
else
|
||||
printf(" %0x", req.addr * DRAM_BLOCK_SIZE);
|
||||
printf(" %0x", req.addr * MEM_BLOCK_SIZE);
|
||||
}
|
||||
printf("}\n");*/
|
||||
}
|
||||
}
|
||||
|
||||
vortex_afu_->avs_waitrequest = dram_stalled;
|
||||
// waitrequest mirrors the stall decision computed above
|
||||
vortex_afu_->avs_waitrequest = mem_stalled;
|
||||
}
|
||||
@@ -17,7 +17,7 @@
|
||||
#include <list>
|
||||
#include <unordered_map>
|
||||
|
||||
// Number of bytes moved per avs_readdata / avs_writedata transfer in
// opae_sim::avs_bus(), and the factor that turns avs_address into a byte
// address. NOTE(review): the "/ 8" presumably converts a width in bits to
// bytes; confirm against the platform parameter definition.
// (Diff view: old DRAM_ name first, new MEM_ name second.)
#define DRAM_BLOCK_SIZE (PLATFORM_PARAM_LOCAL_MEMORY_DATA_WIDTH / 8)
|
||||
#define MEM_BLOCK_SIZE (PLATFORM_PARAM_LOCAL_MEMORY_DATA_WIDTH / 8)
|
||||
|
||||
#define CACHE_BLOCK_SIZE 64
|
||||
|
||||
@@ -43,9 +43,9 @@ private:
|
||||
|
||||
// One pending memory read, queued by opae_sim::avs_bus() when avs_read is
// accepted and removed when its response is sent.
// (Diff view: the data member and the type name each appear twice, old
// DRAM/dram spelling first, new MEM/mem spelling second.)
typedef struct {
|
||||
// calls of avs_bus() remaining before the response may be sent;
// decremented once per call while it is above 0
int cycles_left;
|
||||
std::array<uint8_t, DRAM_BLOCK_SIZE> data;
|
||||
// block contents, read from ram_ at the time the request is accepted
std::array<uint8_t, MEM_BLOCK_SIZE> data;
|
||||
// avs_address of the request (a block index: avs_bus() multiplies it by
// the block size to form the byte address)
uint32_t addr;
|
||||
} dram_rd_req_t;
|
||||
} mem_rd_req_t;
|
||||
|
||||
typedef struct {
|
||||
int cycles_left;
|
||||
@@ -80,7 +80,7 @@ private:
|
||||
|
||||
std::unordered_map<int64_t, host_buffer_t> host_buffers_;
|
||||
|
||||
std::list<dram_rd_req_t> dram_reads_;
|
||||
std::list<mem_rd_req_t> mem_reads_;
|
||||
|
||||
std::list<cci_rd_req_t> cci_reads_;
|
||||
|
||||
|
||||
@@ -12,7 +12,7 @@ DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_BANK
|
||||
DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_MSHR
|
||||
DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_TAG
|
||||
DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_DATA
|
||||
DBG_PRINT_FLAGS += -DDBG_PRINT_DRAM
|
||||
DBG_PRINT_FLAGS += -DDBG_PRINT_MEM
|
||||
DBG_PRINT_FLAGS += -DDBG_PRINT_OPAE
|
||||
DBG_PRINT_FLAGS += -DDBG_PRINT_AVS
|
||||
DBG_PRINT_FLAGS += -DDBG_PRINT_SCOPE
|
||||
|
||||
Reference in New Issue
Block a user