minor updates

This commit is contained in:
Blaise Tine
2021-01-12 15:19:38 -08:00
parent adcd3ad521
commit b4b5d6f0ab
8 changed files with 45 additions and 121 deletions

View File

@@ -246,7 +246,7 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) {
// bank_stalls
uint64_t dcache_bank_st_per_core;
ret |= vx_csr_get_l(device, core_id, CSR_MPM_DCACHE_BANK_ST, CSR_MPM_DCACHE_BANK_ST_H, &dcache_bank_st_per_core);
int dcache_bank_utilization = (int)((1.0 - (double(dcache_reads_per_core + dcache_writes_per_core) / double(dcache_reads_per_core + dcache_writes_per_core + dcache_bank_st_per_core))) * 100);
int dcache_bank_utilization = (int)((double(dcache_reads_per_core + dcache_writes_per_core) / double(dcache_reads_per_core + dcache_writes_per_core + dcache_bank_st_per_core)) * 100);
if (num_cores > 1) fprintf(stream, "PERF: core%d: dcache bank stalls=%ld (utilization=%d%%)\n", core_id, dcache_bank_st_per_core, dcache_bank_utilization);
dcache_bank_stalls += dcache_bank_st_per_core;
// mshr_stalls
@@ -279,7 +279,7 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) {
// bank_stalls
uint64_t smem_bank_st_per_core;
ret |= vx_csr_get_l(device, core_id, CSR_MPM_SMEM_BANK_ST, CSR_MPM_SMEM_BANK_ST_H, &smem_bank_st_per_core);
int smem_bank_utilization = (int)((1.0 - (double(smem_reads_per_core + smem_writes_per_core) / double(smem_reads_per_core + smem_writes_per_core + smem_bank_st_per_core))) * 100);
int smem_bank_utilization = (int)((double(smem_reads_per_core + smem_writes_per_core) / double(smem_reads_per_core + smem_writes_per_core + smem_bank_st_per_core)) * 100);
if (num_cores > 1) fprintf(stream, "PERF: core%d: smem bank stalls=%ld (utilization=%d%%)\n", core_id, smem_bank_st_per_core, smem_bank_utilization);
smem_bank_stalls += smem_bank_st_per_core;
@@ -289,11 +289,11 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) {
ret |= vx_csr_get_l(device, core_id, CSR_MPM_DRAM_WRITES, CSR_MPM_DRAM_WRITES_H, &dram_writes_per_core);
ret |= vx_csr_get_l(device, core_id, CSR_MPM_DRAM_ST, CSR_MPM_DRAM_ST_H, &dram_stalls_per_core);
ret |= vx_csr_get_l(device, core_id, CSR_MPM_DRAM_LAT, CSR_MPM_DRAM_LAT_H, &dram_lat_per_core);
int avg_dram_lat = (int)(double(dram_lat_per_core) / double(dram_reads_per_core));
int dram_utilization = (int)((1.0 - (double(dram_reads_per_core + dram_writes_per_core) / double(dram_reads_per_core + dram_writes_per_core + dram_stalls_per_core))) * 100);
int dram_utilization = (int)((double(dram_reads_per_core + dram_writes_per_core) / double(dram_reads_per_core + dram_writes_per_core + dram_stalls_per_core)) * 100);
int dram_avg_lat = (int)(double(dram_lat_per_core) / double(dram_reads_per_core));
if (num_cores > 1) fprintf(stream, "PERF: core%d: dram requests=%ld (reads=%ld, writes=%ld)\n", core_id, (dram_reads_per_core + dram_writes_per_core), dram_reads_per_core, dram_writes_per_core);
if (num_cores > 1) fprintf(stream, "PERF: core%d: dram stalls=%ld (utilization=%d%%)\n", core_id, dram_stalls_per_core, dram_utilization);
if (num_cores > 1) fprintf(stream, "PERF: core%d: average dram latency=%d cycles\n", core_id, avg_dram_lat);
if (num_cores > 1) fprintf(stream, "PERF: core%d: dram average latency=%d cycles\n", core_id, dram_avg_lat);
dram_reads += dram_reads_per_core;
dram_writes += dram_writes_per_core;
dram_stalls += dram_stalls_per_core;
@@ -308,10 +308,10 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) {
int icache_read_hit_ratio = (int)((1.0 - (double(icache_read_misses) / double(icache_reads))) * 100);
int dcache_read_hit_ratio = (int)((1.0 - (double(dcache_read_misses) / double(dcache_reads))) * 100);
int dcache_write_hit_ratio = (int)((1.0 - (double(dcache_write_misses) / double(dcache_writes))) * 100);
int dcache_bank_utilization = (int)((1.0 - (double(dcache_reads + dcache_writes) / double(dcache_reads + dcache_writes + dcache_bank_stalls))) * 100);
int smem_bank_utilization = (int)((1.0 - (double(smem_reads + smem_writes) / double(smem_reads + smem_writes + smem_bank_stalls))) * 100);
int dram_utilization = (int)((1.0 - (double(dram_reads + dram_writes) / double(dram_reads + dram_writes + dram_stalls))) * 100);
int avg_dram_lat = (int)(double(dram_lat) / double(dram_reads));
int dcache_bank_utilization = (int)((double(dcache_reads + dcache_writes) / double(dcache_reads + dcache_writes + dcache_bank_stalls)) * 100);
int smem_bank_utilization = (int)((double(smem_reads + smem_writes) / double(smem_reads + smem_writes + smem_bank_stalls)) * 100);
int dram_utilization = (int)((double(dram_reads + dram_writes) / double(dram_reads + dram_writes + dram_stalls)) * 100);
int dram_avg_lat = (int)(double(dram_lat) / double(dram_reads));
fprintf(stream, "PERF: ibuffer stalls=%ld\n", ibuffer_stalls);
fprintf(stream, "PERF: scoreboard stalls=%ld\n", scoreboard_stalls);
fprintf(stream, "PERF: alu unit stalls=%ld\n", alu_stalls);
@@ -337,7 +337,8 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) {
fprintf(stream, "PERF: smem bank stalls=%ld (utilization=%d%%)\n", smem_bank_stalls, smem_bank_utilization);
fprintf(stream, "PERF: dram requests=%ld (reads=%ld, writes=%ld)\n", (dram_reads + dram_writes), dram_reads, dram_writes);
fprintf(stream, "PERF: dram stalls=%ld (utilization=%d%%)\n", dram_stalls, dram_utilization);
fprintf(stream, "PERF: average dram latency=%d cycles\n", avg_dram_lat);
fprintf(stream, "PERF: dram average latency=%d cycles\n", dram_avg_lat);
fprintf(stream, "PERF: dram bandwith=%d cycles\n", dram_avg_lat);
#endif
return ret;

View File

@@ -52,13 +52,7 @@ module VX_commit #(
assign commit_tmask3 = gpu_commit_fire ? gpu_commit_if.tmask : 0;
wire [CMTW-1:0] commit_size;
VX_countones #(
.N(3*`NUM_THREADS)
) commit_ctr1 (
.valids({commit_tmask3, commit_tmask2, commit_tmask1}),
.count (commit_size)
);
assign commit_size = $countones({commit_tmask3, commit_tmask2, commit_tmask1});
VX_pipe_register #(
.DATAW (1 + CMTW),

View File

@@ -152,10 +152,10 @@ module VX_issue #(
`endif
end else begin
if (decode_if.valid & !decode_if.ready) begin
perf_ibf_stalls <= perf_ibf_stalls + 64'd1;
perf_ibf_stalls <= perf_ibf_stalls + 64'd1;
end
if (ibuf_deq_if.valid & scoreboard_delay) begin
perf_scb_stalls <= perf_scb_stalls + 64'd1;
perf_scb_stalls <= perf_scb_stalls + 64'd1;
end
if (alu_req_if.valid & !alu_req_if.ready) begin
perf_alu_stalls <= perf_alu_stalls + 64'd1;

View File

@@ -352,13 +352,9 @@ end
if (reset) begin
perf_dram_lat_per_cycle <= 0;
end else begin
if (dram_req_if.valid && !dram_req_if.rw && dram_req_if.ready && dram_rsp_if.valid && dram_rsp_if.ready) begin
perf_dram_lat_per_cycle <= perf_dram_lat_per_cycle;
end else if (dram_req_if.valid && !dram_req_if.rw && dram_req_if.ready) begin
perf_dram_lat_per_cycle <= perf_dram_lat_per_cycle + 64'd1;
end else if (dram_rsp_if.valid && dram_rsp_if.ready) begin
perf_dram_lat_per_cycle <= perf_dram_lat_per_cycle - 64'd1;
end
perf_dram_lat_per_cycle <= perf_dram_lat_per_cycle +
64'($signed(2'((dram_req_if.valid && !dram_req_if.rw && dram_req_if.ready) && !(dram_rsp_if.valid && dram_rsp_if.ready)) -
2'((dram_rsp_if.valid && dram_rsp_if.ready) && !(dram_req_if.valid && !dram_req_if.rw && dram_req_if.ready))));
end
end

View File

@@ -161,12 +161,7 @@ module VX_warp_sched #(
`IGNORE_WARNINGS_BEGIN
wire [`NW_BITS:0] active_barrier_count;
`IGNORE_WARNINGS_END
VX_countones #(
.N(`NUM_WARPS)
) barrier_count (
.valids(barrier_stall_mask[warp_ctl_if.barrier.id]),
.count (active_barrier_count)
);
assign active_barrier_count = $countones(barrier_stall_mask[warp_ctl_if.barrier.id]);
assign reached_barrier_limit = (active_barrier_count[`NW_BITS-1:0] == warp_ctl_if.barrier.size_m1);

View File

@@ -356,34 +356,13 @@ module VX_cache #(
reg [($clog2(NUM_REQS+1)-1):0] perf_core_reads_per_cycle, perf_core_writes_per_cycle;
reg [($clog2(NUM_REQS+1)-1):0] perf_crsp_stall_per_cycle;
VX_countones #(
.N(NUM_REQS)
) perf_countones_core_reads_count (
.valids (core_req_valid & core_req_ready & ~core_req_rw),
.count (perf_core_reads_per_cycle)
);
VX_countones #(
.N(NUM_REQS)
) perf_countones_core_writes_count (
.valids (core_req_valid & core_req_ready & core_req_rw),
.count (perf_core_writes_per_cycle)
);
assign perf_core_reads_per_cycle = $countones(core_req_valid & core_req_ready & ~core_req_rw);
assign perf_core_writes_per_cycle = $countones(core_req_valid & core_req_ready & core_req_rw);
if (CORE_TAG_ID_BITS != 0) begin
VX_countones #(
.N(NUM_REQS)
) perf_countones_core_rsp_count (
.valids (core_rsp_valid & {NUM_REQS{!core_rsp_ready}}),
.count (perf_crsp_stall_per_cycle)
);
assign perf_crsp_stall_per_cycle = $countones(core_rsp_valid & {NUM_REQS{!core_rsp_ready}});
end else begin
VX_countones #(
.N(NUM_REQS)
) perf_countones_core_rsp_count (
.valids (core_rsp_valid & ~core_rsp_ready),
.count (perf_crsp_stall_per_cycle)
);
assign perf_crsp_stall_per_cycle = $countones(core_rsp_valid & ~core_rsp_ready);
end
// per cycle: read misses, write misses, msrq stalls, pipeline stalls
@@ -392,33 +371,10 @@ module VX_cache #(
reg [($clog2(NUM_BANKS+1)-1):0] perf_mshr_stall_per_cycle;
reg [($clog2(NUM_BANKS+1)-1):0] perf_pipe_stall_per_cycle;
VX_countones #(
.N(NUM_BANKS)
) perf_countones_read_miss_count (
.valids (perf_read_miss_per_bank),
.count (perf_read_miss_per_cycle)
);
VX_countones #(
.N(NUM_BANKS)
) perf_countones_write_miss_count (
.valids (perf_write_miss_per_bank),
.count (perf_write_miss_per_cycle)
);
VX_countones #(
.N(NUM_BANKS)
) perf_countones_mshr_stall_count (
.valids (perf_mshr_stall_per_bank),
.count (perf_mshr_stall_per_cycle)
);
VX_countones #(
.N(NUM_BANKS)
) perf_countones_total_stall_count (
.valids (perf_pipe_stall_per_bank),
.count (perf_pipe_stall_per_cycle)
);
assign perf_read_miss_per_cycle = $countones(perf_read_miss_per_bank);
assign perf_write_miss_per_cycle = $countones(perf_write_miss_per_bank);
assign perf_mshr_stall_per_cycle = $countones(perf_mshr_stall_per_bank);
assign perf_pipe_stall_per_cycle = $countones(perf_pipe_stall_per_bank);
reg [63:0] perf_core_reads;
reg [63:0] perf_core_writes;

View File

@@ -50,7 +50,7 @@ module VX_cache_core_req_bank_sel #(
reg [NUM_BANKS-1:0] per_bank_core_req_stall;
reg [NUM_REQS-1:0] core_req_ready_r;
reg [NUM_BANKS-1:0] core_req_sel_r;
reg [NUM_REQS-1:0] core_req_sel_r;
wire [NUM_REQS-1:0][`BANK_SELECT_BITS-1:0] core_req_bid;
for (genvar i = 0; i < NUM_REQS; ++i) begin
@@ -81,25 +81,33 @@ module VX_cache_core_req_bank_sel #(
always @(*) begin
core_req_ready_r = 0;
core_req_sel_r = 0;
for (integer j = 0; j < NUM_BANKS; ++j) begin
for (integer i = 0; i < NUM_REQS; ++i) begin
if (core_req_valid[i] && (core_req_bid[i] == `BANK_SELECT_BITS'(j))) begin
core_req_ready_r[i] = ~per_bank_core_req_stall[j];
core_req_sel_r[i] = 1;
break;
end
end
end
end
always @(*) begin
core_req_sel_r = 0;
for (integer j = 0; j < NUM_BANKS; ++j) begin
for (integer i = 0; i < NUM_REQS; ++i) begin
if (core_req_valid[i] && (core_req_bid[i] == `BANK_SELECT_BITS'(j))) begin
core_req_sel_r[i] = ~per_bank_core_req_stall[j];
end
end
end
end
reg [63:0] bank_stalls_r;
always @(posedge clk) begin
if (reset) begin
bank_stalls_r <= 0;
end else begin
bank_stalls_r <= bank_stalls_r + 64'($countones(core_req_valid & ~core_req_sel_r));
bank_stalls_r <= bank_stalls_r + 64'($countones(core_req_sel_r & ~core_req_ready_r));
end
end

View File

@@ -1,26 +0,0 @@
`include "VX_platform.vh"
// Population count: reports how many bits of `valids` are set.
//
// Parameters:
//   N      - width of the input vector (default 10).
//   N_BITS - width of the result; defaults to $clog2(N+1).
// Ports:
//   valids - bit vector to be counted.
//   count  - number of 1 bits in `valids`, narrowed to N_BITS bits.
module VX_countones #(
    parameter N      = 10,
    parameter N_BITS = $clog2(N+1)
) (
    input  wire [N-1:0]      valids,
    output wire [N_BITS-1:0] count
);

    // $countones returns an int; narrow it explicitly to the port width
    // (same value the implicit assignment truncation would produce).
    wire [N_BITS-1:0] ones_total = N_BITS'($countones(valids));

    assign count = ones_total;

endmodule