diff --git a/ci/toolchain_install.sh b/ci/toolchain_install.sh index 9ff1585b..0fa38c36 100755 --- a/ci/toolchain_install.sh +++ b/ci/toolchain_install.sh @@ -20,6 +20,8 @@ REPOSITORY=https://github.com/vortexgpgpu/vortex-toolchain-prebuilt/raw/master TOOLDIR=${TOOLDIR:=/opt} OSDIR=${OSDIR:=ubuntu/bionic} +OS="${OS:=ubuntu/bionic}" + riscv() { case $OSDIR in diff --git a/hw/rtl/VX_types.vh b/hw/rtl/VX_types.vh index 388dc258..80a4a7d7 100644 --- a/hw/rtl/VX_types.vh +++ b/hw/rtl/VX_types.vh @@ -58,6 +58,8 @@ `define VX_CSR_MPM_BASE 12'hB00 `define VX_CSR_MPM_BASE_H 12'hB80 +`define VX_CSR_MPM_USER 12'hB03 +`define VX_CSR_MPM_USER_H 12'hB83 // Machine Performance-monitoring core counters // PERF: Standard @@ -68,29 +70,41 @@ `define VX_CSR_MINSTRET 12'hB02 `define VX_CSR_MINSTRET_H 12'hB82 // PERF: pipeline -`define VX_CSR_MPM_IBUF_ST 12'hB03 -`define VX_CSR_MPM_IBUF_ST_H 12'hB83 -`define VX_CSR_MPM_SCRB_ST 12'hB04 -`define VX_CSR_MPM_SCRB_ST_H 12'hB84 -`define VX_CSR_MPM_ALU_ST 12'hB05 -`define VX_CSR_MPM_ALU_ST_H 12'hB85 -`define VX_CSR_MPM_LSU_ST 12'hB06 -`define VX_CSR_MPM_LSU_ST_H 12'hB86 -`define VX_CSR_MPM_FPU_ST 12'hB07 -`define VX_CSR_MPM_FPU_ST_H 12'hB87 -`define VX_CSR_MPM_SFU_ST 12'hB08 -`define VX_CSR_MPM_SFU_ST_H 12'hB88 +`define VX_CSR_MPM_SCHED_ST 12'hB03 +`define VX_CSR_MPM_SCHED_ST_H 12'hB83 +`define VX_CSR_MPM_FETCH_ST 12'hB04 +`define VX_CSR_MPM_FETCH_ST_H 12'hB84 +`define VX_CSR_MPM_IBUF_ST 12'hB05 +`define VX_CSR_MPM_IBUF_ST_H 12'hB85 +`define VX_CSR_MPM_SCRB_ST 12'hB06 +`define VX_CSR_MPM_SCRB_ST_H 12'hB86 +`define VX_CSR_MPM_ALU_ST 12'hB07 +`define VX_CSR_MPM_ALU_ST_H 12'hB87 +`define VX_CSR_MPM_LSU_ST 12'hB08 +`define VX_CSR_MPM_LSU_ST_H 12'hB88 +`define VX_CSR_MPM_FPU_ST 12'hB09 +`define VX_CSR_MPM_FPU_ST_H 12'hB89 +`define VX_CSR_MPM_SFU_ST 12'hB0A +`define VX_CSR_MPM_SFU_ST_H 12'hB8A +`define VX_CSR_MPM_SCRB_ALU 12'hB0B +`define VX_CSR_MPM_SCRB_ALU_H 12'hB8B +`define VX_CSR_MPM_SCRB_FPU 12'hB0C +`define VX_CSR_MPM_SCRB_FPU_H 12'hB8C +`define VX_CSR_MPM_SCRB_LSU 12'hB0D +`define VX_CSR_MPM_SCRB_LSU_H 12'hB8D +`define VX_CSR_MPM_SCRB_SFU 12'hB0E +`define VX_CSR_MPM_SCRB_SFU_H 12'hB8E // PERF: memory -`define VX_CSR_MPM_IFETCHES 12'hB0A -`define VX_CSR_MPM_IFETCHES_H 12'hB8A -`define VX_CSR_MPM_LOADS 12'hB0B -`define VX_CSR_MPM_LOADS_H 12'hB8B -`define VX_CSR_MPM_STORES 12'hB0C -`define VX_CSR_MPM_STORES_H 12'hB8C -`define VX_CSR_MPM_IFETCH_LAT 12'hB0D -`define VX_CSR_MPM_IFETCH_LAT_H 12'hB8D -`define VX_CSR_MPM_LOAD_LAT 12'hB0E -`define VX_CSR_MPM_LOAD_LAT_H 12'hB8E +`define VX_CSR_MPM_IFETCHES 12'hB0F +`define VX_CSR_MPM_IFETCHES_H 12'hB8F +`define VX_CSR_MPM_LOADS 12'hB10 +`define VX_CSR_MPM_LOADS_H 12'hB90 +`define VX_CSR_MPM_STORES 12'hB11 +`define VX_CSR_MPM_STORES_H 12'hB91 +`define VX_CSR_MPM_IFETCH_LAT 12'hB12 +`define VX_CSR_MPM_IFETCH_LAT_H 12'hB92 +`define VX_CSR_MPM_LOAD_LAT 12'hB13 +`define VX_CSR_MPM_LOAD_LAT_H 12'hB93 // Machine Performance-monitoring memory counters // PERF: icache @@ -98,59 +112,61 @@ `define VX_CSR_MPM_ICACHE_READS_H 12'hB83 `define VX_CSR_MPM_ICACHE_MISS_R 12'hB04 // read misses `define VX_CSR_MPM_ICACHE_MISS_R_H 12'hB84 +`define VX_CSR_MPM_ICACHE_MSHR_ST 12'hB05 // MSHR stalls +`define VX_CSR_MPM_ICACHE_MSHR_ST_H 12'hB85 // PERF: dcache -`define VX_CSR_MPM_DCACHE_READS 12'hB05 // total reads -`define VX_CSR_MPM_DCACHE_READS_H 12'hB85 -`define VX_CSR_MPM_DCACHE_WRITES 12'hB06 // total writes -`define VX_CSR_MPM_DCACHE_WRITES_H 12'hB86 -`define VX_CSR_MPM_DCACHE_MISS_R 12'hB07 // read misses -`define VX_CSR_MPM_DCACHE_MISS_R_H 12'hB87 -`define VX_CSR_MPM_DCACHE_MISS_W 12'hB08 // write misses -`define VX_CSR_MPM_DCACHE_MISS_W_H 12'hB88 -`define VX_CSR_MPM_DCACHE_BANK_ST 12'hB09 // bank conflicts -`define VX_CSR_MPM_DCACHE_BANK_ST_H 12'hB89 -`define VX_CSR_MPM_DCACHE_MSHR_ST 12'hB0A // MSHR stalls -`define VX_CSR_MPM_DCACHE_MSHR_ST_H 12'hB8A -// PERF: smem -`define VX_CSR_MPM_SMEM_READS 12'hB0B // memory reads -`define VX_CSR_MPM_SMEM_READS_H 12'hB8B -`define VX_CSR_MPM_SMEM_WRITES 12'hB0C // memory writes -`define VX_CSR_MPM_SMEM_WRITES_H 12'hB8C -`define VX_CSR_MPM_SMEM_BANK_ST 12'hB0D // bank conflicts -`define VX_CSR_MPM_SMEM_BANK_ST_H 12'hB8D +`define VX_CSR_MPM_DCACHE_READS 12'hB06 // total reads +`define VX_CSR_MPM_DCACHE_READS_H 12'hB86 +`define VX_CSR_MPM_DCACHE_WRITES 12'hB07 // total writes +`define VX_CSR_MPM_DCACHE_WRITES_H 12'hB87 +`define VX_CSR_MPM_DCACHE_MISS_R 12'hB08 // read misses +`define VX_CSR_MPM_DCACHE_MISS_R_H 12'hB88 +`define VX_CSR_MPM_DCACHE_MISS_W 12'hB09 // write misses +`define VX_CSR_MPM_DCACHE_MISS_W_H 12'hB89 +`define VX_CSR_MPM_DCACHE_BANK_ST 12'hB0A // bank conflicts +`define VX_CSR_MPM_DCACHE_BANK_ST_H 12'hB8A +`define VX_CSR_MPM_DCACHE_MSHR_ST 12'hB0B // MSHR stalls +`define VX_CSR_MPM_DCACHE_MSHR_ST_H 12'hB8B // PERF: l2cache -`define VX_CSR_MPM_L2CACHE_READS 12'hB0E // total reads -`define VX_CSR_MPM_L2CACHE_READS_H 12'hB8E -`define VX_CSR_MPM_L2CACHE_WRITES 12'hB0F // total writes -`define VX_CSR_MPM_L2CACHE_WRITES_H 12'hB8F -`define VX_CSR_MPM_L2CACHE_MISS_R 12'hB10 // read misses -`define VX_CSR_MPM_L2CACHE_MISS_R_H 12'hB90 -`define VX_CSR_MPM_L2CACHE_MISS_W 12'hB11 // write misses -`define VX_CSR_MPM_L2CACHE_MISS_W_H 12'hB91 -`define VX_CSR_MPM_L2CACHE_BANK_ST 12'hB12 // bank conflicts -`define VX_CSR_MPM_L2CACHE_BANK_ST_H 12'hB92 -`define VX_CSR_MPM_L2CACHE_MSHR_ST 12'hB13 // MSHR stalls -`define VX_CSR_MPM_L2CACHE_MSHR_ST_H 12'hB93 +`define VX_CSR_MPM_L2CACHE_READS 12'hB0C // total reads +`define VX_CSR_MPM_L2CACHE_READS_H 12'hB8C +`define VX_CSR_MPM_L2CACHE_WRITES 12'hB0D // total writes +`define VX_CSR_MPM_L2CACHE_WRITES_H 12'hB8D +`define VX_CSR_MPM_L2CACHE_MISS_R 12'hB0E // read misses +`define VX_CSR_MPM_L2CACHE_MISS_R_H 12'hB8E +`define VX_CSR_MPM_L2CACHE_MISS_W 12'hB0F // write misses +`define VX_CSR_MPM_L2CACHE_MISS_W_H 12'hB8F +`define VX_CSR_MPM_L2CACHE_BANK_ST 12'hB10 // bank conflicts +`define VX_CSR_MPM_L2CACHE_BANK_ST_H 12'hB90 +`define VX_CSR_MPM_L2CACHE_MSHR_ST 12'hB11 // MSHR stalls +`define VX_CSR_MPM_L2CACHE_MSHR_ST_H 12'hB91 // PERF: l3cache -`define VX_CSR_MPM_L3CACHE_READS 12'hB14 // total reads -`define VX_CSR_MPM_L3CACHE_READS_H 12'hB94 -`define VX_CSR_MPM_L3CACHE_WRITES 12'hB15 // total writes -`define VX_CSR_MPM_L3CACHE_WRITES_H 12'hB95 -`define VX_CSR_MPM_L3CACHE_MISS_R 12'hB16 // read misses -`define VX_CSR_MPM_L3CACHE_MISS_R_H 12'hB96 -`define VX_CSR_MPM_L3CACHE_MISS_W 12'hB17 // write misses -`define VX_CSR_MPM_L3CACHE_MISS_W_H 12'hB97 -`define VX_CSR_MPM_L3CACHE_BANK_ST 12'hB18 // bank conflicts -`define VX_CSR_MPM_L3CACHE_BANK_ST_H 12'hB98 -`define VX_CSR_MPM_L3CACHE_MSHR_ST 12'hB19 // MSHR stalls -`define VX_CSR_MPM_L3CACHE_MSHR_ST_H 12'hB99 +`define VX_CSR_MPM_L3CACHE_READS 12'hB12 // total reads +`define VX_CSR_MPM_L3CACHE_READS_H 12'hB92 +`define VX_CSR_MPM_L3CACHE_WRITES 12'hB13 // total writes +`define VX_CSR_MPM_L3CACHE_WRITES_H 12'hB93 +`define VX_CSR_MPM_L3CACHE_MISS_R 12'hB14 // read misses +`define VX_CSR_MPM_L3CACHE_MISS_R_H 12'hB94 +`define VX_CSR_MPM_L3CACHE_MISS_W 12'hB15 // write misses +`define VX_CSR_MPM_L3CACHE_MISS_W_H 12'hB95 +`define VX_CSR_MPM_L3CACHE_BANK_ST 12'hB16 // bank conflicts +`define VX_CSR_MPM_L3CACHE_BANK_ST_H 12'hB96 +`define VX_CSR_MPM_L3CACHE_MSHR_ST 12'hB17 // MSHR stalls +`define VX_CSR_MPM_L3CACHE_MSHR_ST_H 12'hB97 // PERF: memory -`define VX_CSR_MPM_MEM_READS 12'hB1A // total reads -`define VX_CSR_MPM_MEM_READS_H 12'hB9A -`define VX_CSR_MPM_MEM_WRITES 12'hB1B // total writes -`define VX_CSR_MPM_MEM_WRITES_H 12'hB9B -`define VX_CSR_MPM_MEM_LAT 12'hB1C // memory latency -`define VX_CSR_MPM_MEM_LAT_H 12'hB9C +`define VX_CSR_MPM_MEM_READS 12'hB18 // total reads +`define VX_CSR_MPM_MEM_READS_H 12'hB98 +`define VX_CSR_MPM_MEM_WRITES 12'hB19 // total writes +`define VX_CSR_MPM_MEM_WRITES_H 12'hB99 +`define VX_CSR_MPM_MEM_LAT 12'hB1A // memory latency +`define VX_CSR_MPM_MEM_LAT_H 12'hB9A +// PERF: smem +`define VX_CSR_MPM_SMEM_READS 12'hB1B // memory reads +`define VX_CSR_MPM_SMEM_READS_H 12'hB9B +`define VX_CSR_MPM_SMEM_WRITES 12'hB1C // memory writes +`define VX_CSR_MPM_SMEM_WRITES_H 12'hB9C +`define VX_CSR_MPM_SMEM_BANK_ST 12'hB1D // bank conflicts +`define VX_CSR_MPM_SMEM_BANK_ST_H 12'hB9D // Machine Information Registers diff --git a/hw/rtl/core/VX_commit.sv b/hw/rtl/core/VX_commit.sv index e5dbe97c..09667d11 100644 --- a/hw/rtl/core/VX_commit.sv +++ b/hw/rtl/core/VX_commit.sv @@ -44,7 +44,7 @@ module VX_commit import VX_gpu_pkg::*; #( VX_commit_if commit_if[`ISSUE_WIDTH](); - wire [`ISSUE_WIDTH-1:0] commit_fire; + wire [`ISSUE_WIDTH-1:0] commit_fire; wire [`ISSUE_WIDTH-1:0][`NW_WIDTH-1:0] commit_wid; wire [`ISSUE_WIDTH-1:0][`NUM_THREADS-1:0] commit_tmask; wire [`ISSUE_WIDTH-1:0] commit_eop; @@ -91,24 +91,24 @@ module VX_commit import VX_gpu_pkg::*; #( `UNUSED_PIN (sel_out) ); - assign commit_fire[i] = commit_if[i].valid && commit_if[i].ready; - assign commit_tmask[i] = {`NUM_THREADS{commit_fire[i]}} & commit_if[i].data.tmask; - assign commit_wid[i] = commit_if[i].data.wid; - assign commit_eop[i] = commit_if[i].data.eop; + assign commit_fire[i] = commit_if[i].valid && commit_if[i].ready; + assign commit_tmask[i]= {`NUM_THREADS{commit_fire[i]}} & commit_if[i].data.tmask; + assign commit_wid[i] = commit_if[i].data.wid; + assign commit_eop[i] = commit_if[i].data.eop; end // CSRs update wire [`ISSUE_WIDTH-1:0][COMMIT_SIZEW-1:0] commit_size, commit_size_r; - wire [COMMIT_ALL_SIZEW-1:0] commit_size_all, commit_size_all_r; + wire [COMMIT_ALL_SIZEW-1:0] commit_size_all_r, commit_size_all_rr; wire commit_fire_any, commit_fire_any_r, commit_fire_any_rr; assign commit_fire_any = (| commit_fire); for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin - wire [COMMIT_SIZEW-1:0] pop_count; - `POP_COUNT(pop_count, commit_tmask[i]); - assign commit_size[i] = pop_count; + wire [COMMIT_SIZEW-1:0] count; + `POP_COUNT(count, commit_tmask[i]); + assign commit_size[i] = count; end VX_pipe_register #( @@ -129,7 +129,7 @@ module VX_commit import VX_gpu_pkg::*; #( .OP ("+") ) commit_size_reduce ( .data_in (commit_size_r), - .data_out (commit_size_all) + .data_out (commit_size_all_r) ); VX_pipe_register #( @@ -139,26 +139,26 @@ module VX_commit import VX_gpu_pkg::*; #( .clk (clk), .reset (reset), .enable (1'b1), - .data_in ({commit_fire_any_r, commit_size_all}), - .data_out ({commit_fire_any_rr, commit_size_all_r}) + .data_in ({commit_fire_any_r, commit_size_all_r}), + .data_out ({commit_fire_any_rr, commit_size_all_rr}) ); reg [`PERF_CTR_BITS-1:0] instret; - always @(posedge clk) begin if (reset) begin instret <= '0; end else begin if (commit_fire_any_rr) begin - instret <= instret + `PERF_CTR_BITS'(commit_size_all_r); + instret <= instret + `PERF_CTR_BITS'(commit_size_all_rr); end end end - assign commit_csr_if.instret = instret; // Committed instructions + wire [`ISSUE_WIDTH-1:0] committed = commit_fire & commit_eop; + VX_pipe_register #( .DATAW (`ISSUE_WIDTH * (1 + `NW_WIDTH)), .RESETW (`ISSUE_WIDTH) @@ -166,23 +166,23 @@ module VX_commit import VX_gpu_pkg::*; #( .clk (clk), .reset (reset), .enable (1'b1), - .data_in ({(commit_fire & commit_eop), commit_wid}), + .data_in ({committed, commit_wid}), .data_out ({commit_sched_if.committed, commit_sched_if.committed_wid}) ); // Writeback for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin - assign writeback_if[i].valid = commit_if[i].valid && commit_if[i].data.wb; + assign writeback_if[i].valid = commit_if[i].valid && commit_if[i].data.wb; assign writeback_if[i].data.uuid = commit_if[i].data.uuid; - assign writeback_if[i].data.wis = wid_to_wis(commit_if[i].data.wid); - assign writeback_if[i].data.PC = commit_if[i].data.PC; - assign writeback_if[i].data.tmask = commit_if[i].data.tmask; - assign writeback_if[i].data.rd = commit_if[i].data.rd; + assign writeback_if[i].data.wis = wid_to_wis(commit_if[i].data.wid); + assign writeback_if[i].data.PC = commit_if[i].data.PC; + assign writeback_if[i].data.tmask= commit_if[i].data.tmask; + assign writeback_if[i].data.rd = commit_if[i].data.rd; assign writeback_if[i].data.data = commit_if[i].data.data; - assign writeback_if[i].data.sop = commit_if[i].data.sop; - assign writeback_if[i].data.eop = commit_if[i].data.eop; - assign commit_if[i].ready = 1'b1; + assign writeback_if[i].data.sop = commit_if[i].data.sop; + assign writeback_if[i].data.eop = commit_if[i].data.eop; + assign commit_if[i].ready = 1'b1; // writeback has no backpressure end // simulation helper signal to get RISC-V tests Pass/Fail status diff --git a/hw/rtl/core/VX_core.sv b/hw/rtl/core/VX_core.sv index 8aaea911..684a9b84 100644 --- a/hw/rtl/core/VX_core.sv +++ b/hw/rtl/core/VX_core.sv @@ -116,7 +116,11 @@ module VX_core import VX_gpu_pkg::*; #( .CORE_ID (CORE_ID) ) schedule ( .clk (clk), - .reset (schedule_reset), + .reset (schedule_reset), + + `ifdef PERF_ENABLE + .perf_schedule_if (pipeline_perf_if.schedule), + `endif .base_dcrs (base_dcrs), diff --git a/hw/rtl/core/VX_csr_data.sv b/hw/rtl/core/VX_csr_data.sv index 9ba0ffd0..44e997ff 100644 --- a/hw/rtl/core/VX_csr_data.sv +++ b/hw/rtl/core/VX_csr_data.sv @@ -179,14 +179,18 @@ import VX_fpu_pkg::*; default: begin read_addr_valid_r = 0; - if ((read_addr >= `VX_CSR_MPM_BASE && read_addr < (`VX_CSR_MPM_BASE + 32)) - || (read_addr >= `VX_CSR_MPM_BASE_H && read_addr < (`VX_CSR_MPM_BASE_H + 32))) begin + if ((read_addr >= `VX_CSR_MPM_USER && read_addr < (`VX_CSR_MPM_USER + 32)) + || (read_addr >= `VX_CSR_MPM_USER_H && read_addr < (`VX_CSR_MPM_USER_H + 32))) begin read_addr_valid_r = 1; `ifdef PERF_ENABLE case (base_dcrs.mpm_class) `VX_DCR_MPM_CLASS_CORE: begin case (read_addr) - // PERF: pipeline + // PERF: pipeline + `VX_CSR_MPM_SCHED_ST : read_data_ro_r = pipeline_perf_if.sched_stalls[31:0]; + `VX_CSR_MPM_SCHED_ST_H : read_data_ro_r = 32'(pipeline_perf_if.sched_stalls[`PERF_CTR_BITS-1:32]); + `VX_CSR_MPM_FETCH_ST : read_data_ro_r = pipeline_perf_if.fetch_stalls[31:0]; + `VX_CSR_MPM_FETCH_ST_H : read_data_ro_r = 32'(pipeline_perf_if.fetch_stalls[`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_IBUF_ST : read_data_ro_r = pipeline_perf_if.ibf_stalls[31:0]; `VX_CSR_MPM_IBUF_ST_H : read_data_ro_r = 32'(pipeline_perf_if.ibf_stalls[`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_SCRB_ST : read_data_ro_r = pipeline_perf_if.scb_stalls[31:0]; @@ -204,6 +208,19 @@ import VX_fpu_pkg::*; `endif `VX_CSR_MPM_SFU_ST : read_data_ro_r = pipeline_perf_if.dsp_stalls[`EX_SFU][31:0]; `VX_CSR_MPM_SFU_ST_H : read_data_ro_r = 32'(pipeline_perf_if.dsp_stalls[`EX_SFU][`PERF_CTR_BITS-1:32]); + `VX_CSR_MPM_SCRB_ALU : read_data_ro_r = 32'(pipeline_perf_if.scb_uses[`EX_ALU][`PERF_CTR_BITS-1:32]); + `VX_CSR_MPM_SCRB_ALU_H : read_data_ro_r = pipeline_perf_if.scb_uses[`EX_ALU][31:0]; + `ifdef EXT_F_ENABLE + `VX_CSR_MPM_SCRB_FPU : read_data_ro_r = 32'(pipeline_perf_if.scb_uses[`EX_FPU][`PERF_CTR_BITS-1:32]); + `VX_CSR_MPM_SCRB_FPU_H : read_data_ro_r = pipeline_perf_if.scb_uses[`EX_FPU][31:0]; + `else + `VX_CSR_MPM_SCRB_FPU : read_data_ro_r = '0; + `VX_CSR_MPM_SCRB_FPU_H : read_data_ro_r = '0; + `endif + `VX_CSR_MPM_SCRB_LSU : read_data_ro_r = 32'(pipeline_perf_if.scb_uses[`EX_LSU][`PERF_CTR_BITS-1:32]); + `VX_CSR_MPM_SCRB_LSU_H : read_data_ro_r = pipeline_perf_if.scb_uses[`EX_LSU][31:0]; + `VX_CSR_MPM_SCRB_SFU : read_data_ro_r = 32'(pipeline_perf_if.scb_uses[`EX_SFU][`PERF_CTR_BITS-1:32]); + `VX_CSR_MPM_SCRB_SFU_H : read_data_ro_r = pipeline_perf_if.scb_uses[`EX_SFU][31:0]; // PERF: memory `VX_CSR_MPM_IFETCHES : read_data_ro_r = pipeline_perf_if.ifetches[31:0]; `VX_CSR_MPM_IFETCHES_H : read_data_ro_r = 32'(pipeline_perf_if.ifetches[`PERF_CTR_BITS-1:32]); @@ -214,7 +231,7 @@ import VX_fpu_pkg::*; `VX_CSR_MPM_IFETCH_LAT : read_data_ro_r = pipeline_perf_if.ifetch_latency[31:0]; `VX_CSR_MPM_IFETCH_LAT_H : read_data_ro_r = 32'(pipeline_perf_if.ifetch_latency[`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_LOAD_LAT : read_data_ro_r = pipeline_perf_if.load_latency[31:0]; - `VX_CSR_MPM_LOAD_LAT_H : read_data_ro_r = 32'(pipeline_perf_if.load_latency[`PERF_CTR_BITS-1:32]); + `VX_CSR_MPM_LOAD_LAT_H : read_data_ro_r = 32'(pipeline_perf_if.load_latency[`PERF_CTR_BITS-1:32]); default:; endcase end @@ -225,6 +242,8 @@ import VX_fpu_pkg::*; `VX_CSR_MPM_ICACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.icache.reads[`PERF_CTR_BITS-1:32]); `VX_CSR_MPM_ICACHE_MISS_R : read_data_ro_r = mem_perf_if.icache.read_misses[31:0]; `VX_CSR_MPM_ICACHE_MISS_R_H : read_data_ro_r = 32'(mem_perf_if.icache.read_misses[`PERF_CTR_BITS-1:32]); + `VX_CSR_MPM_ICACHE_MSHR_ST : read_data_ro_r = mem_perf_if.icache.mshr_stalls[31:0]; + `VX_CSR_MPM_ICACHE_MSHR_ST_H : read_data_ro_r = 32'(mem_perf_if.icache.mshr_stalls[`PERF_CTR_BITS-1:32]); // PERF: dcache `VX_CSR_MPM_DCACHE_READS : read_data_ro_r = mem_perf_if.dcache.reads[31:0]; `VX_CSR_MPM_DCACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.dcache.reads[`PERF_CTR_BITS-1:32]); diff --git a/hw/rtl/core/VX_ipdom_stack.sv b/hw/rtl/core/VX_ipdom_stack.sv index a6524b2d..b6763f7e 100644 --- a/hw/rtl/core/VX_ipdom_stack.sv +++ b/hw/rtl/core/VX_ipdom_stack.sv @@ -14,10 +14,10 @@ `include "VX_platform.vh" module VX_ipdom_stack #( - parameter WIDTH = 1, - parameter DEPTH = 1, + parameter WIDTH = 1, + parameter DEPTH = 1, parameter OUT_REG = 0, - parameter ADDRW = `LOG2UP(DEPTH) + parameter ADDRW = `LOG2UP(DEPTH) ) ( input wire clk, input wire reset, diff --git a/hw/rtl/core/VX_issue.sv b/hw/rtl/core/VX_issue.sv index af00014e..53701cc8 100644 --- a/hw/rtl/core/VX_issue.sv +++ b/hw/rtl/core/VX_issue.sv @@ -59,6 +59,10 @@ module VX_issue #( ) scoreboard ( .clk (clk), .reset (scoreboard_reset), + `ifdef PERF_ENABLE + .perf_scb_stalls(perf_issue_if.scb_stalls), + .perf_scb_uses (perf_issue_if.scb_uses), + `endif .writeback_if (writeback_if), .ibuffer_if (ibuffer_if), .scoreboard_if (scoreboard_if) @@ -152,29 +156,17 @@ module VX_issue #( `ifdef PERF_ENABLE reg [`PERF_CTR_BITS-1:0] perf_ibf_stalls; - reg [`PERF_CTR_BITS-1:0] perf_scb_stalls; - - wire [`CLOG2(`ISSUE_WIDTH+1)-1:0] scoreboard_stalls_per_cycle; - reg [`ISSUE_WIDTH-1:0] scoreboard_stalls; - for (genvar i=0; i < `ISSUE_WIDTH; ++i) begin - assign scoreboard_stalls[i] = ibuffer_if[i].valid && ~ibuffer_if[i].ready; - end - `POP_COUNT(scoreboard_stalls_per_cycle, scoreboard_stalls); - always @(posedge clk) begin if (reset) begin perf_ibf_stalls <= '0; - perf_scb_stalls <= '0; end else begin if (decode_if.valid && ~decode_if.ready) begin perf_ibf_stalls <= perf_ibf_stalls + `PERF_CTR_BITS'(1); end - perf_scb_stalls <= perf_scb_stalls + `PERF_CTR_BITS'(scoreboard_stalls_per_cycle); end end assign perf_issue_if.ibf_stalls = perf_ibf_stalls; - assign perf_issue_if.scb_stalls = perf_scb_stalls; `endif endmodule diff --git a/hw/rtl/core/VX_schedule.sv b/hw/rtl/core/VX_schedule.sv index ea96178e..0ffeafc2 100644 --- a/hw/rtl/core/VX_schedule.sv +++ b/hw/rtl/core/VX_schedule.sv @@ -19,6 +19,10 @@ module VX_schedule import VX_gpu_pkg::*; #( input wire clk, input wire reset, +`ifdef PERF_ENABLE + VX_pipeline_perf_if.schedule perf_schedule_if, +`endif + // configuration input base_dcrs_t base_dcrs, @@ -376,4 +380,21 @@ module VX_schedule import VX_gpu_pkg::*; #( end `RUNTIME_ASSERT(timeout_ctr < `STALL_TIMEOUT, ("%t: *** core%0d-scheduler-timeout: stalled_warps=%b", $time, CORE_ID, stalled_warps)); +`ifdef PERF_ENABLE + reg [`PERF_CTR_BITS-1:0] perf_sched_stalls; + reg [`PERF_CTR_BITS-1:0] perf_fetch_stalls; + always @(posedge clk) begin + if (reset) begin + perf_sched_stalls <= '0; + perf_fetch_stalls <= '0; + end else begin + perf_sched_stalls <= perf_sched_stalls + `PERF_CTR_BITS'(!schedule_valid); + perf_fetch_stalls <= perf_fetch_stalls + `PERF_CTR_BITS'(schedule_if.valid && !schedule_if.ready); + end + end + + assign perf_schedule_if.sched_stalls = perf_sched_stalls; + assign perf_schedule_if.fetch_stalls = perf_fetch_stalls; +`endif + endmodule diff --git a/hw/rtl/core/VX_scoreboard.sv b/hw/rtl/core/VX_scoreboard.sv index 90a58134..c1d09c07 100644 --- a/hw/rtl/core/VX_scoreboard.sv +++ b/hw/rtl/core/VX_scoreboard.sv @@ -19,6 +19,11 @@ module VX_scoreboard import VX_gpu_pkg::*; #( input wire clk, input wire reset, +`ifdef PERF_ENABLE + output reg [`PERF_CTR_BITS-1:0] perf_scb_stalls, + output reg [`PERF_CTR_BITS-1:0] perf_scb_uses [`NUM_EX_UNITS], +`endif + VX_writeback_if.slave writeback_if [`ISSUE_WIDTH], VX_ibuffer_if.slave ibuffer_if [`ISSUE_WIDTH], VX_ibuffer_if.master scoreboard_if [`ISSUE_WIDTH] @@ -26,81 +31,100 @@ module VX_scoreboard import VX_gpu_pkg::*; #( `UNUSED_PARAM (CORE_ID) localparam DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `XLEN + `EX_BITS + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + `XLEN + (`NR_BITS * 4) + 1; +`ifdef PERF_ENABLE + wire [`NUM_EX_UNITS-1:0] scoreboard_uses_per_cycle; + wire [`CLOG2(`ISSUE_WIDTH+1)-1:0] scoreboard_stalls_per_cycle; + reg [`ISSUE_WIDTH-1:0][`NUM_EX_UNITS-1:0] scoreboard_uses; + wire [`ISSUE_WIDTH-1:0] scoreboard_stalls; + + `POP_COUNT(scoreboard_stalls_per_cycle, scoreboard_stalls); + VX_reduce #( + .DATAW_IN (`NUM_EX_UNITS), + .N (`ISSUE_WIDTH), + .OP ("|") + ) reduce ( + .data_in (scoreboard_uses), + .data_out (scoreboard_uses_per_cycle) + ); +`endif + for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin - reg [`UP(ISSUE_RATIO)-1:0][`NUM_REGS-1:0] inuse_regs, inuse_regs_n; - reg [3:0] ready_masks, ready_masks_n; + reg [`UP(ISSUE_RATIO)-1:0][`NUM_REGS-1:0] inuse_regs; VX_ibuffer_if staging_if(); - + wire writeback_fire = writeback_if[i].valid && writeback_if[i].data.eop; + wire inuse_rd = inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd]; + wire inuse_rs1 = inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs1]; + wire inuse_rs2 = inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs2]; + wire inuse_rs3 = inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3]; + + `ifdef PERF_ENABLE + reg [`UP(ISSUE_RATIO)-1:0][`NUM_REGS-1:0][`EX_BITS-1:0] inuse_units; always @(*) begin - inuse_regs_n = inuse_regs; - ready_masks_n = ready_masks; - if (writeback_fire) begin - inuse_regs_n[writeback_if[i].data.wis][writeback_if[i].data.rd] = 0; - ready_masks_n |= {4{(ISSUE_RATIO == 0) || writeback_if[i].data.wis == staging_if.data.wis}} - & {(writeback_if[i].data.rd == staging_if.data.rd), - (writeback_if[i].data.rd == staging_if.data.rs1), - (writeback_if[i].data.rd == staging_if.data.rs2), - (writeback_if[i].data.rd == staging_if.data.rs3)}; - end - if (staging_if.valid && staging_if.ready && staging_if.data.wb) begin - inuse_regs_n[staging_if.data.wis][staging_if.data.rd] = 1; - ready_masks_n = '0; + scoreboard_uses[i] = '0; + if (ibuffer_if[i].valid) begin + if (inuse_rd) begin + scoreboard_uses[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd]] = 1; + end + if (inuse_rs1) begin + scoreboard_uses[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs1]] = 1; + end + if (inuse_rs2) begin + scoreboard_uses[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs2]] = 1; + end + if (inuse_rs3) begin + scoreboard_uses[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3]] = 1; + end end - if (ibuffer_if[i].valid && ibuffer_if[i].ready) begin - ready_masks_n = ~{inuse_regs_n[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd], - inuse_regs_n[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs1], - inuse_regs_n[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs2], - inuse_regs_n[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3]}; - end - end + end + assign scoreboard_stalls[i] = ibuffer_if[i].valid && ~ibuffer_if[i].ready; + `endif + + reg [DATAW-1:0] data_out_r; + reg valid_out_r; + + wire [3:0] ready_masks = ~{inuse_rd, inuse_rs1, inuse_rs2, inuse_rs3}; + wire deps_ready = (& ready_masks); always @(posedge clk) begin if (reset) begin - inuse_regs <= '0; - ready_masks <= '0; - end else begin - inuse_regs <= inuse_regs_n; - ready_masks <= ready_masks_n; + valid_out_r <= 0; + inuse_regs <= '0; + end else begin + if (writeback_fire) begin + inuse_regs[writeback_if[i].data.wis][writeback_if[i].data.rd] <= 0; + end + if (~valid_out_r) begin + valid_out_r <= ibuffer_if[i].valid && deps_ready; + end else if (staging_if.ready) begin + if (staging_if.data.wb) begin + inuse_regs[staging_if.data.wis][staging_if.data.rd] <= 1; + `ifdef PERF_ENABLE + inuse_units[staging_if.data.wis][staging_if.data.rd] <= staging_if.data.ex_type; + `endif + end + valid_out_r <= 0; + end + end + if (~valid_out_r) begin + data_out_r <= ibuffer_if[i].data; end end - // staging buffer - - `RESET_RELAY (stg_buf_reset, reset); - - VX_elastic_buffer #( - .DATAW (DATAW) - ) stg_buf ( - .clk (clk), - .reset (stg_buf_reset), - .valid_in (ibuffer_if[i].valid), - .ready_in (ibuffer_if[i].ready), - .data_in (ibuffer_if[i].data), - .data_out (staging_if.data), - .valid_out (staging_if.valid), - .ready_out (staging_if.ready) - ); - - // output buffer - - wire valid_stg, ready_stg; - wire regs_ready = (& ready_masks); - assign valid_stg = staging_if.valid && regs_ready; - assign staging_if.ready = ready_stg && regs_ready; - - `RESET_RELAY (out_buf_reset, reset); + assign ibuffer_if[i].ready = ~valid_out_r && deps_ready; + assign staging_if.valid = valid_out_r; + assign staging_if.data = data_out_r; VX_elastic_buffer #( .DATAW (DATAW), - .SIZE (2), + .SIZE (0), .OUT_REG (2) ) out_buf ( .clk (clk), - .reset (out_buf_reset), - .valid_in (valid_stg), - .ready_in (ready_stg), + .reset (reset), + .valid_in (staging_if.valid), + .ready_in (staging_if.ready), .data_in (staging_if.data), .data_out (scoreboard_if[i].data), .valid_out (scoreboard_if[i].valid), @@ -108,29 +132,29 @@ module VX_scoreboard import VX_gpu_pkg::*; #( ); `ifdef SIMULATION - reg [31:0] timeout_ctr; - + reg [31:0] timeout_ctr; + always @(posedge clk) begin if (reset) begin timeout_ctr <= '0; end else begin - if (staging_if.valid && ~regs_ready) begin + if (ibuffer_if[i].valid && ~ibuffer_if[i].ready) begin `ifdef DBG_TRACE_CORE_PIPELINE `TRACE(3, ("%d: *** core%0d-scoreboard-stall: wid=%0d, PC=0x%0h, tmask=%b, cycles=%0d, inuse=%b (#%0d)\n", - $time, CORE_ID, wis_to_wid(staging_if.data.wis, i), staging_if.data.PC, staging_if.data.tmask, timeout_ctr, - ~ready_masks, staging_if.data.uuid)); + $time, CORE_ID, wis_to_wid(ibuffer_if[i].data.wis, i), ibuffer_if[i].data.PC, ibuffer_if[i].data.tmask, timeout_ctr, + ~ready_masks, ibuffer_if[i].data.uuid)); `endif timeout_ctr <= timeout_ctr + 1; - end else if (staging_if.valid && staging_if.ready) begin + end else if (ibuffer_if[i].valid && ibuffer_if[i].ready) begin timeout_ctr <= '0; end end - end - + end + `RUNTIME_ASSERT((timeout_ctr < `STALL_TIMEOUT), ("%t: *** core%0d-scoreboard-timeout: wid=%0d, PC=0x%0h, tmask=%b, cycles=%0d, inuse=%b (#%0d)", - $time, CORE_ID, wis_to_wid(staging_if.data.wis, i), staging_if.data.PC, staging_if.data.tmask, timeout_ctr, - ~ready_masks, staging_if.data.uuid)); + $time, CORE_ID, wis_to_wid(ibuffer_if[i].data.wis, i), ibuffer_if[i].data.PC, ibuffer_if[i].data.tmask, timeout_ctr, + ~ready_masks, ibuffer_if[i].data.uuid)); `RUNTIME_ASSERT(~writeback_fire || inuse_regs[writeback_if[i].data.wis][writeback_if[i].data.rd] != 0, ("%t: *** core%0d: invalid writeback register: wid=%0d, PC=0x%0h, tmask=%b, rd=%0d (#%0d)", @@ -139,4 +163,23 @@ module VX_scoreboard import VX_gpu_pkg::*; #( end +`ifdef PERF_ENABLE + always @(posedge clk) begin + if (reset) begin + perf_scb_stalls <= '0; + end else begin + perf_scb_stalls <= perf_scb_stalls + `PERF_CTR_BITS'(scoreboard_stalls_per_cycle); + end + end + for (genvar i = 0; i < `NUM_EX_UNITS; ++i) begin + always @(posedge clk) begin + if (reset) begin + perf_scb_uses[i] <= '0; + end else begin + perf_scb_uses[i] <= perf_scb_uses[i] + `PERF_CTR_BITS'(scoreboard_uses_per_cycle[i]); + end + end + end +`endif + endmodule diff --git a/hw/rtl/interfaces/VX_pipeline_perf_if.sv b/hw/rtl/interfaces/VX_pipeline_perf_if.sv index b6123b7f..4f6ffb5d 100644 --- a/hw/rtl/interfaces/VX_pipeline_perf_if.sv +++ b/hw/rtl/interfaces/VX_pipeline_perf_if.sv @@ -14,8 +14,11 @@ `include "VX_define.vh" interface VX_pipeline_perf_if (); + wire [`PERF_CTR_BITS-1:0] sched_stalls; + wire [`PERF_CTR_BITS-1:0] fetch_stalls; wire [`PERF_CTR_BITS-1:0] ibf_stalls; wire [`PERF_CTR_BITS-1:0] scb_stalls; + wire [`PERF_CTR_BITS-1:0] scb_uses [`NUM_EX_UNITS]; wire [`PERF_CTR_BITS-1:0] dsp_stalls [`NUM_EX_UNITS]; wire [`PERF_CTR_BITS-1:0] ifetches; @@ -24,15 +27,24 @@ interface VX_pipeline_perf_if (); wire [`PERF_CTR_BITS-1:0] ifetch_latency; wire [`PERF_CTR_BITS-1:0] load_latency; + modport schedule ( + output sched_stalls, + output fetch_stalls + ); + modport issue ( output ibf_stalls, output scb_stalls, + output scb_uses, output dsp_stalls - ); + ); modport slave ( + input sched_stalls, + input fetch_stalls, input ibf_stalls, input scb_stalls, + input scb_uses, input dsp_stalls, input ifetches, input loads, diff --git a/hw/rtl/libs/VX_avs_adapter.sv b/hw/rtl/libs/VX_avs_adapter.sv index 779eb45e..4ea53757 100644 --- a/hw/rtl/libs/VX_avs_adapter.sv +++ b/hw/rtl/libs/VX_avs_adapter.sv @@ -21,8 +21,8 @@ module VX_avs_adapter #( parameter NUM_BANKS = 1, parameter TAG_WIDTH = 1, parameter RD_QUEUE_SIZE = 1, - parameter OUT_REG_REQ = 0, - parameter OUT_REG_RSP = 0 + parameter OUT_REG_REQ = 0, + parameter OUT_REG_RSP = 0 ) ( input wire clk, input wire reset, diff --git a/hw/rtl/libs/VX_axi_adapter.sv b/hw/rtl/libs/VX_axi_adapter.sv index 967c3af1..c5919b7a 100644 --- a/hw/rtl/libs/VX_axi_adapter.sv +++ b/hw/rtl/libs/VX_axi_adapter.sv @@ -20,7 +20,7 @@ module VX_axi_adapter #( parameter TAG_WIDTH = 8, parameter NUM_BANKS = 1, parameter AVS_ADDR_WIDTH = (ADDR_WIDTH - `CLOG2(DATA_WIDTH/8)), - parameter OUT_REG_RSP = 0 + parameter OUT_REG_RSP = 0 ) ( input wire clk, input wire reset, diff --git a/hw/rtl/libs/VX_mem_adapter.sv b/hw/rtl/libs/VX_mem_adapter.sv index 19d65240..ed983836 100644 --- a/hw/rtl/libs/VX_mem_adapter.sv +++ b/hw/rtl/libs/VX_mem_adapter.sv @@ -21,8 +21,8 @@ module VX_mem_adapter #( parameter DST_ADDR_WIDTH = 1, parameter SRC_TAG_WIDTH = 1, parameter DST_TAG_WIDTH = 1, - parameter OUT_REG_REQ = 0, - parameter OUT_REG_RSP = 0 + parameter OUT_REG_REQ = 0, + parameter OUT_REG_RSP = 0 ) ( input wire clk, input wire reset, diff --git a/hw/rtl/libs/VX_stream_arb.sv b/hw/rtl/libs/VX_stream_arb.sv index 58da0b25..a81be3ef 100644 --- a/hw/rtl/libs/VX_stream_arb.sv +++ b/hw/rtl/libs/VX_stream_arb.sv @@ -21,7 +21,7 @@ module VX_stream_arb #( parameter `STRING ARBITER = "P", parameter LOCK_ENABLE = 1, parameter MAX_FANOUT = `MAX_FANOUT, - parameter OUT_REG = 0 , + parameter OUT_REG = 0 , parameter NUM_REQS = (NUM_INPUTS + NUM_OUTPUTS - 1) / NUM_OUTPUTS, parameter LOG_NUM_REQS = `CLOG2(NUM_REQS), parameter NUM_REQS_W = `UP(LOG_NUM_REQS) diff --git a/hw/rtl/libs/VX_stream_xbar.sv b/hw/rtl/libs/VX_stream_xbar.sv index db92cfd0..7c1f0f7a 100644 --- a/hw/rtl/libs/VX_stream_xbar.sv +++ b/hw/rtl/libs/VX_stream_xbar.sv @@ -173,8 +173,8 @@ module VX_stream_xbar #( end // compute inputs collision - // we have a collision when there exists a valid transfer with mutiple input candicates - // we caount the unique duplicates each cycle. + // we have a collision when there exists a valid transfer with multiple input candicates + // we count the unique duplicates each cycle. reg [PERF_CTR_BITS-1:0] collisions_r; reg [NUM_INPUTS-1:0] per_cycle_collision; diff --git a/hw/rtl/mem/VX_gbar_arb.sv b/hw/rtl/mem/VX_gbar_arb.sv index 6aa93510..a4cc07c3 100644 --- a/hw/rtl/mem/VX_gbar_arb.sv +++ b/hw/rtl/mem/VX_gbar_arb.sv @@ -15,7 +15,7 @@ module VX_gbar_arb #( parameter NUM_REQS = 1, - parameter OUT_REG = 0, + parameter OUT_REG = 0, parameter `STRING ARBITER = "R" ) ( input wire clk, diff --git a/hw/rtl/mem/VX_mem_arb.sv b/hw/rtl/mem/VX_mem_arb.sv index 939dd6ba..2588a9ea 100644 --- a/hw/rtl/mem/VX_mem_arb.sv +++ b/hw/rtl/mem/VX_mem_arb.sv @@ -21,8 +21,8 @@ module VX_mem_arb #( parameter ADDR_WIDTH = (MEM_ADDR_WIDTH-`CLOG2(DATA_SIZE)), parameter TAG_WIDTH = 1, parameter TAG_SEL_IDX = 0, - parameter OUT_REG_REQ = 0, - parameter OUT_REG_RSP = 0, + parameter OUT_REG_REQ = 0, + parameter OUT_REG_RSP = 0, parameter `STRING ARBITER = "R" ) ( input wire clk, diff --git a/hw/rtl/mem/VX_smem_switch.sv b/hw/rtl/mem/VX_smem_switch.sv index 7dc410a9..5fb92915 100644 --- a/hw/rtl/mem/VX_smem_switch.sv +++ b/hw/rtl/mem/VX_smem_switch.sv @@ -19,8 +19,8 @@ module VX_smem_switch #( parameter TAG_WIDTH = 1, parameter MEM_ADDR_WIDTH = `MEM_ADDR_WIDTH, parameter TAG_SEL_IDX = 0, - parameter OUT_REG_REQ = 0, - parameter OUT_REG_RSP = 0, + parameter OUT_REG_REQ = 0, + parameter OUT_REG_RSP = 0, parameter `STRING ARBITER = "R" ) ( input wire clk, diff --git a/runtime/common/utils.cpp b/runtime/common/utils.cpp index 72c2b80c..574f64a7 100644 --- a/runtime/common/utils.cpp +++ b/runtime/common/utils.cpp @@ -186,27 +186,31 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) { return int((1.0 - (double(part) / double(total))) * 100); }; - auto caclAvgLatency = [&](uint64_t sum, uint64_t requests)->int { - if (requests == 0) + auto caclAverage = [&](uint64_t part, uint64_t total)->double { + if (total == 0) return 0; - return int(double(sum) / double(requests)); + return double(part) / double(total); }; - auto calcUtilization = [&](uint64_t count, uint64_t stalls)->int { - if (count == 0) - return 0; - return int((double(count) / double(count + stalls)) * 100); + auto calcAvgPercent = [&](uint64_t part, uint64_t total)->int { + return int(caclAverage(part, total) * 100); }; auto perf_class = gAutoPerfDump.get_perf_class(); // PERF: pipeline stalls + uint64_t scheduler_stalls = 0; + uint64_t fetch_stalls = 0; uint64_t ibuffer_stalls = 0; - uint64_t scoreboard_stalls = 0; + uint64_t scrb_stalls = 0; uint64_t lsu_stalls = 0; uint64_t fpu_stalls = 0; uint64_t alu_stalls = 0; - uint64_t sfu_stalls = 0; + uint64_t sfu_stalls = 0; + uint64_t scrb_alu = 0; + uint64_t scrb_fpu = 0; + uint64_t scrb_lsu = 0; + uint64_t scrb_sfu = 0; uint64_t ifetches = 0; uint64_t loads = 0; uint64_t stores = 0; @@ -251,76 +255,121 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) { #endif std::vector staging_buf(64* sizeof(uint32_t)); - - for (unsigned core_id = 0; core_id < num_cores; ++core_id) { + + for (unsigned core_id = 0; core_id < num_cores; ++core_id) { uint64_t mpm_mem_addr = IO_CSR_ADDR + core_id * staging_buf.size(); ret = vx_copy_from_dev(hdevice, staging_buf.data(), mpm_mem_addr, staging_buf.size()); if (ret != 0) return ret; + uint64_t cycles_per_core = get_csr_64(staging_buf.data(), VX_CSR_MCYCLE); + uint64_t instrs_per_core = get_csr_64(staging_buf.data(), VX_CSR_MINSTRET); + #ifdef PERF_ENABLE switch (perf_class) { case VX_DCR_MPM_CLASS_CORE: { // PERF: pipeline - // ibuffer_stall - uint64_t ibuffer_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_IBUF_ST); - if (num_cores > 1) fprintf(stream, "PERF: core%d: ibuffer stalls=%ld\n", core_id, ibuffer_stalls_per_core); - ibuffer_stalls += ibuffer_stalls_per_core; - // scoreboard_stall - uint64_t scoreboard_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_ST); - if (num_cores > 1) fprintf(stream, "PERF: core%d: scoreboard stalls=%ld\n", core_id, scoreboard_stalls_per_core); - scoreboard_stalls += scoreboard_stalls_per_core; - // alu_stall - uint64_t alu_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_ALU_ST); - if (num_cores > 1) fprintf(stream, "PERF: core%d: alu unit stalls=%ld\n", core_id, alu_stalls_per_core); - alu_stalls += alu_stalls_per_core; - // lsu_stall - uint64_t lsu_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_LSU_ST); - if (num_cores > 1) fprintf(stream, "PERF: core%d: lsu unit stalls=%ld\n", core_id, lsu_stalls_per_core); - lsu_stalls += lsu_stalls_per_core; - // fpu_stall - uint64_t fpu_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_FPU_ST); - if (num_cores > 1) fprintf(stream, "PERF: core%d: fpu unit stalls=%ld\n", core_id, fpu_stalls_per_core); - fpu_stalls += fpu_stalls_per_core; - // sfu_stall - uint64_t sfu_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SFU_ST); - if (num_cores > 1) fprintf(stream, "PERF: core%d: sfu unit stalls=%ld\n", core_id, sfu_stalls_per_core); - sfu_stalls += sfu_stalls_per_core; + // schedule stalls + { + uint64_t scheduler_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCHED_ST); + int scheduler_percent_per_core = calcAvgPercent(scheduler_stalls_per_core, cycles_per_core); + if (num_cores > 1) fprintf(stream, "PERF: core%d: schedule stalls=%ld (%d%%)\n", core_id, scheduler_stalls_per_core, scheduler_percent_per_core); + scheduler_stalls += scheduler_stalls_per_core; + } + // fetch stalls + { + uint64_t fetch_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_FETCH_ST); + int fetch_percent_per_core = calcAvgPercent(fetch_stalls_per_core, cycles_per_core); + if (num_cores > 1) fprintf(stream, "PERF: core%d: ifetch stalls=%ld (%d%%)\n", core_id, fetch_stalls_per_core, fetch_percent_per_core); + fetch_stalls += fetch_stalls_per_core; + } + // ibuffer_stalls + { + uint64_t ibuffer_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_IBUF_ST); + int ibuffer_percent_per_core = calcAvgPercent(ibuffer_stalls_per_core, cycles_per_core); + if (num_cores > 1) fprintf(stream, "PERF: core%d: ibuffer stalls=%ld (%d%%)\n", core_id, ibuffer_stalls_per_core, ibuffer_percent_per_core); + ibuffer_stalls += ibuffer_stalls_per_core; + } + // scrb_stalls + { + uint64_t scrb_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_ST); + uint64_t scrb_alu_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_ALU); + uint64_t scrb_fpu_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_FPU); + uint64_t scrb_lsu_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_LSU); + uint64_t scrb_sfu_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_SFU); + uint64_t scrb_total = scrb_alu_per_core + scrb_fpu_per_core + scrb_lsu_per_core + scrb_sfu_per_core; + scrb_alu += scrb_alu_per_core; + scrb_fpu += scrb_fpu_per_core; + scrb_lsu += scrb_lsu_per_core; + scrb_sfu += scrb_sfu_per_core; + if (num_cores > 1) fprintf(stream, "PERF: core%d: scoreboard stalls=%ld (alu=%d%%, fpu=%d%%, lsu=%d%%, sfu=%d%%)\n", core_id, scrb_stalls_per_core, + calcAvgPercent(scrb_alu_per_core, scrb_total), + calcAvgPercent(scrb_fpu_per_core, scrb_total), + calcAvgPercent(scrb_lsu_per_core, scrb_total), + calcAvgPercent(scrb_sfu_per_core, scrb_total)); + scrb_stalls += scrb_stalls_per_core; + } + // alu_stalls + { + uint64_t alu_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_ALU_ST); + if (num_cores > 1) fprintf(stream, "PERF: core%d: alu unit stalls=%ld\n", core_id, alu_stalls_per_core); + alu_stalls += alu_stalls_per_core; + } + // lsu_stalls + { + uint64_t lsu_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_LSU_ST); + if (num_cores > 1) fprintf(stream, "PERF: core%d: lsu unit stalls=%ld\n", core_id, lsu_stalls_per_core); + lsu_stalls += lsu_stalls_per_core; + } + // fpu_stalls + { + uint64_t fpu_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_FPU_ST); + if (num_cores > 1) fprintf(stream, "PERF: core%d: fpu unit stalls=%ld\n", core_id, fpu_stalls_per_core); + fpu_stalls += fpu_stalls_per_core; + } + // sfu_stalls + { + uint64_t sfu_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SFU_ST); + if (num_cores > 1) fprintf(stream, "PERF: core%d: sfu unit stalls=%ld\n", core_id, sfu_stalls_per_core); + sfu_stalls += sfu_stalls_per_core; + } // PERF: memory // ifetches - uint64_t ifetches_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_LOADS); - if (num_cores > 1) fprintf(stream, "PERF: core%d: ifetches=%ld\n", core_id, ifetches_per_core); - ifetches += ifetches_per_core; + { + uint64_t ifetches_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_LOADS); + if (num_cores > 1) fprintf(stream, "PERF: core%d: ifetches=%ld\n", core_id, ifetches_per_core); + ifetches += ifetches_per_core; + + uint64_t ifetch_lat_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_IFETCH_LAT); + int mem_avg_lat = caclAverage(ifetch_lat_per_core, ifetches_per_core); + if (num_cores > 1) fprintf(stream, "PERF: core%d: ifetch latency=%d cycles\n", core_id, mem_avg_lat); + ifetch_lat += ifetch_lat_per_core; + } // loads - uint64_t loads_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_LOADS); - if (num_cores > 1) fprintf(stream, "PERF: core%d: loads=%ld\n", core_id, loads_per_core); - loads += loads_per_core; + { + uint64_t loads_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_LOADS); + if (num_cores > 1) fprintf(stream, "PERF: core%d: loads=%ld\n", core_id, loads_per_core); + loads += loads_per_core; + + uint64_t load_lat_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_LOAD_LAT); + int mem_avg_lat = caclAverage(load_lat_per_core, loads_per_core); + if (num_cores > 1) fprintf(stream, "PERF: core%d: load latency=%d cycles\n", core_id, mem_avg_lat); + load_lat += load_lat_per_core; + } // stores - uint64_t stores_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_STORES); - if (num_cores > 1) fprintf(stream, "PERF: core%d: stores=%ld\n", core_id, stores_per_core); - stores += stores_per_core; - // ifetch latency - uint64_t ifetch_lat_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_IFETCH_LAT); - if (num_cores > 1) { - int mem_avg_lat = caclAvgLatency(ifetch_lat_per_core, ifetches_per_core); - fprintf(stream, "PERF: core%d: ifetch latency=%d cycles\n", core_id, mem_avg_lat); + { + uint64_t stores_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_STORES); + if (num_cores > 1) fprintf(stream, "PERF: core%d: stores=%ld\n", core_id, stores_per_core); + stores += stores_per_core; } - ifetch_lat += ifetch_lat_per_core; - // load latency - uint64_t load_lat_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_LOAD_LAT); - if (num_cores > 1) { - int mem_avg_lat = caclAvgLatency(load_lat_per_core, loads_per_core); - fprintf(stream, "PERF: core%d: load latency=%d cycles\n", core_id, mem_avg_lat); - } - load_lat += load_lat_per_core; } break; - case VX_DCR_MPM_CLASS_MEM: { + case VX_DCR_MPM_CLASS_MEM: { if (smem_enable) { // PERF: smem uint64_t smem_reads = get_csr_64(staging_buf.data(), VX_CSR_MPM_SMEM_READS); uint64_t smem_writes = get_csr_64(staging_buf.data(), VX_CSR_MPM_SMEM_WRITES); uint64_t smem_bank_stalls = get_csr_64(staging_buf.data(), VX_CSR_MPM_SMEM_BANK_ST); - int smem_bank_utilization = calcUtilization(smem_reads + smem_writes, smem_bank_stalls); + int smem_bank_utilization = calcAvgPercent(smem_reads + smem_writes, smem_reads + smem_writes + smem_bank_stalls); fprintf(stream, "PERF: core%d: smem reads=%ld\n", core_id, smem_reads); fprintf(stream, "PERF: core%d: smem writes=%ld\n", core_id, smem_writes); fprintf(stream, "PERF: core%d: smem bank stalls=%ld (utilization=%d%%)\n", core_id, smem_bank_stalls, smem_bank_utilization); @@ -330,9 +379,12 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) { // PERF: Icache uint64_t icache_reads = get_csr_64(staging_buf.data(), VX_CSR_MPM_ICACHE_READS); uint64_t icache_read_misses = get_csr_64(staging_buf.data(), VX_CSR_MPM_ICACHE_MISS_R); - int icache_read_hit_ratio = calcRatio(icache_read_misses, icache_reads); + uint64_t icache_mshr_stalls = get_csr_64(staging_buf.data(), VX_CSR_MPM_ICACHE_MSHR_ST); + int icache_read_hit_ratio = calcRatio(icache_read_misses, icache_reads); + int mshr_utilization = calcAvgPercent(icache_read_misses, icache_read_misses + icache_mshr_stalls); fprintf(stream, "PERF: core%d: icache reads=%ld\n", core_id, icache_reads); fprintf(stream, "PERF: core%d: icache read misses=%ld (hit ratio=%d%%)\n", core_id, icache_read_misses, icache_read_hit_ratio); + fprintf(stream, "PERF: core%d: icache mshr stalls=%ld (utilization=%d%%)\n", core_id, icache_mshr_stalls, mshr_utilization); } if (dcache_enable) { @@ -345,13 +397,14 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) { uint64_t dcache_mshr_stalls = get_csr_64(staging_buf.data(), VX_CSR_MPM_DCACHE_MSHR_ST); int dcache_read_hit_ratio = calcRatio(dcache_read_misses, dcache_reads); int dcache_write_hit_ratio = calcRatio(dcache_write_misses, dcache_writes); - int dcache_bank_utilization = calcUtilization(dcache_reads + dcache_writes, dcache_bank_stalls); + int dcache_bank_utilization = calcAvgPercent(dcache_reads + dcache_writes, dcache_reads + dcache_writes + dcache_bank_stalls); + int mshr_utilization = calcAvgPercent(dcache_read_misses + dcache_write_misses, dcache_read_misses + dcache_write_misses + dcache_mshr_stalls); fprintf(stream, "PERF: core%d: dcache reads=%ld\n", core_id, dcache_reads); fprintf(stream, "PERF: core%d: dcache writes=%ld\n", core_id, dcache_writes); fprintf(stream, "PERF: core%d: dcache read misses=%ld (hit ratio=%d%%)\n", core_id, dcache_read_misses, dcache_read_hit_ratio); fprintf(stream, "PERF: core%d: dcache write misses=%ld (hit ratio=%d%%)\n", core_id, dcache_write_misses, dcache_write_hit_ratio); fprintf(stream, "PERF: core%d: dcache bank stalls=%ld (utilization=%d%%)\n", core_id, dcache_bank_stalls, dcache_bank_utilization); - fprintf(stream, "PERF: core%d: dcache mshr stalls=%ld\n", core_id, dcache_mshr_stalls); + fprintf(stream, "PERF: core%d: dcache mshr stalls=%ld (utilization=%d%%)\n", core_id, dcache_mshr_stalls, mshr_utilization); } if (l2cache_enable) { @@ -386,8 +439,6 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) { } #endif - uint64_t instrs_per_core = get_csr_64(staging_buf.data(), VX_CSR_MINSTRET); - uint64_t cycles_per_core = get_csr_64(staging_buf.data(), VX_CSR_MCYCLE); float IPC = (float)(double(instrs_per_core) / double(cycles_per_core)); if (num_cores > 1) fprintf(stream, "PERF: core%d: instrs=%ld, cycles=%ld, IPC=%f\n", core_id, instrs_per_core, cycles_per_core, IPC); instrs += instrs_per_core; @@ -397,10 +448,20 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) { #ifdef PERF_ENABLE switch (perf_class) { case VX_DCR_MPM_CLASS_CORE: { + int scheduler_percent = calcAvgPercent(scheduler_stalls, cycles); + int fetch_percent = calcAvgPercent(fetch_stalls, cycles); + int ibuffer_percent = calcAvgPercent(ibuffer_stalls, cycles); int ifetch_avg_lat = (int)(double(ifetch_lat) / double(ifetches)); int load_avg_lat = (int)(double(load_lat) / double(loads)); - fprintf(stream, "PERF: ibuffer stalls=%ld\n", ibuffer_stalls); - fprintf(stream, "PERF: scoreboard stalls=%ld\n", scoreboard_stalls); + uint64_t scrb_total = scrb_alu + scrb_fpu + scrb_lsu + scrb_sfu; + fprintf(stream, "PERF: scheduler stalls=%ld (%d%%)\n", scheduler_stalls, scheduler_percent); + fprintf(stream, "PERF: fetch stalls=%ld (%d%%)\n", fetch_stalls, fetch_percent); + fprintf(stream, "PERF: ibuffer stalls=%ld (%d%%)\n", ibuffer_stalls, ibuffer_percent); + fprintf(stream, "PERF: scoreboard stalls=%ld (alu=%d%%, fpu=%d%%, lsu=%d%%, sfu=%d%%)\n", scrb_stalls, + calcAvgPercent(scrb_alu, scrb_total), + calcAvgPercent(scrb_fpu, scrb_total), + calcAvgPercent(scrb_lsu, scrb_total), + calcAvgPercent(scrb_sfu, scrb_total)); fprintf(stream, "PERF: alu unit stalls=%ld\n", alu_stalls); fprintf(stream, "PERF: lsu unit stalls=%ld\n", lsu_stalls); fprintf(stream, "PERF: fpu unit stalls=%ld\n", fpu_stalls); @@ -419,31 +480,32 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) { l2cache_write_misses /= num_cores; l2cache_bank_stalls /= num_cores; l2cache_mshr_stalls /= num_cores; - int l2cache_read_hit_ratio = calcRatio(l2cache_read_misses, l2cache_reads); - int l2cache_write_hit_ratio = calcRatio(l2cache_write_misses, l2cache_writes); - int l2cache_bank_utilization = calcUtilization(l2cache_reads + l2cache_writes, l2cache_bank_stalls); - + int read_hit_ratio = calcRatio(l2cache_read_misses, l2cache_reads); + int write_hit_ratio = calcRatio(l2cache_write_misses, l2cache_writes); + int bank_utilization = calcAvgPercent(l2cache_reads + l2cache_writes, l2cache_reads + l2cache_writes + l2cache_bank_stalls); + int mshr_utilization = calcAvgPercent(l2cache_read_misses + l2cache_write_misses, l2cache_read_misses + l2cache_write_misses + l2cache_mshr_stalls); fprintf(stream, "PERF: l2cache reads=%ld\n", l2cache_reads); fprintf(stream, "PERF: l2cache writes=%ld\n", l2cache_writes); - fprintf(stream, "PERF: l2cache read misses=%ld (hit ratio=%d%%)\n", l2cache_read_misses, l2cache_read_hit_ratio); - fprintf(stream, "PERF: l2cache write misses=%ld (hit ratio=%d%%)\n", l2cache_write_misses, l2cache_write_hit_ratio); - fprintf(stream, "PERF: l2cache bank stalls=%ld (utilization=%d%%)\n", l2cache_bank_stalls, l2cache_bank_utilization); - fprintf(stream, "PERF: l2cache mshr stalls=%ld\n", l2cache_mshr_stalls); + fprintf(stream, "PERF: l2cache read misses=%ld (hit ratio=%d%%)\n", l2cache_read_misses, read_hit_ratio); + fprintf(stream, "PERF: l2cache write misses=%ld (hit ratio=%d%%)\n", l2cache_write_misses, write_hit_ratio); + fprintf(stream, "PERF: l2cache bank stalls=%ld (utilization=%d%%)\n", l2cache_bank_stalls, bank_utilization); + fprintf(stream, "PERF: l2cache mshr stalls=%ld (utilization=%d%%)\n", l2cache_mshr_stalls, mshr_utilization); } if (l3cache_enable) { - int l3cache_read_hit_ratio = calcRatio(l3cache_read_misses, l3cache_reads); - int l3cache_write_hit_ratio = calcRatio(l3cache_write_misses, l3cache_writes); - int l3cache_bank_utilization = calcUtilization(l3cache_reads + l3cache_writes, l3cache_bank_stalls); + int read_hit_ratio = calcRatio(l3cache_read_misses, l3cache_reads); + int write_hit_ratio = calcRatio(l3cache_write_misses, l3cache_writes); + int bank_utilization = calcAvgPercent(l3cache_reads + l3cache_writes, l3cache_reads + l3cache_writes + l3cache_bank_stalls); + int mshr_utilization = calcAvgPercent(l3cache_read_misses + l3cache_write_misses, l3cache_read_misses + l3cache_write_misses + l3cache_mshr_stalls); fprintf(stream, "PERF: l3cache reads=%ld\n", l3cache_reads); fprintf(stream, "PERF: l3cache writes=%ld\n", l3cache_writes); - fprintf(stream, "PERF: l3cache read misses=%ld (hit ratio=%d%%)\n", l3cache_read_misses, l3cache_read_hit_ratio); - fprintf(stream, "PERF: l3cache write misses=%ld (hit ratio=%d%%)\n", l3cache_write_misses, l3cache_write_hit_ratio); - fprintf(stream, "PERF: l3cache bank stalls=%ld (utilization=%d%%)\n", l3cache_bank_stalls, l3cache_bank_utilization); - fprintf(stream, "PERF: l3cache mshr stalls=%ld\n", l3cache_mshr_stalls); + fprintf(stream, "PERF: l3cache read misses=%ld (hit ratio=%d%%)\n", l3cache_read_misses, read_hit_ratio); + fprintf(stream, "PERF: l3cache write misses=%ld (hit ratio=%d%%)\n", l3cache_write_misses, write_hit_ratio); + fprintf(stream, "PERF: l3cache bank stalls=%ld (utilization=%d%%)\n", l3cache_bank_stalls, bank_utilization); + fprintf(stream, "PERF: l3cache mshr stalls=%ld (utilization=%d%%)\n", l3cache_mshr_stalls, mshr_utilization); } - int mem_avg_lat = caclAvgLatency(mem_lat, mem_reads); + int mem_avg_lat = caclAverage(mem_lat, mem_reads); fprintf(stream, "PERF: memory requests=%ld (reads=%ld, writes=%ld)\n", (mem_reads + mem_writes), mem_reads, mem_writes); fprintf(stream, "PERF: memory latency=%d cycles\n", mem_avg_lat); } break; diff --git a/sim/simx/cache_cluster.h b/sim/simx/cache_cluster.h index be3146d3..aef28d1a 100644 --- a/sim/simx/cache_cluster.h +++ b/sim/simx/cache_cluster.h @@ -45,20 +45,20 @@ public: char sname[100]; - std::vector::Ptr> unit_arbs(num_units); + std::vector unit_arbs(num_units); for (uint32_t u = 0; u < num_units; ++u) { snprintf(sname, 100, "%s-unit-arb-%d", name, u); - unit_arbs.at(u) = Switch::Create(sname, ArbiterType::RoundRobin, num_requests, config.num_inputs); + unit_arbs.at(u) = MemSwitch::Create(sname, ArbiterType::RoundRobin, num_requests, config.num_inputs); for (uint32_t i = 0; i < num_requests; ++i) { this->CoreReqPorts.at(u).at(i).bind(&unit_arbs.at(u)->ReqIn.at(i)); unit_arbs.at(u)->RspIn.at(i).bind(&this->CoreRspPorts.at(u).at(i)); } } - std::vector::Ptr> mem_arbs(config.num_inputs); + std::vector mem_arbs(config.num_inputs); for (uint32_t i = 0; i < config.num_inputs; ++i) { snprintf(sname, 100, "%s-mem-arb-%d", name, i); - mem_arbs.at(i) = Switch::Create(sname, ArbiterType::RoundRobin, num_units, num_caches); + mem_arbs.at(i) = MemSwitch::Create(sname, ArbiterType::RoundRobin, num_units, num_caches); for (uint32_t u = 0; u < num_units; ++u) { unit_arbs.at(u)->ReqOut.at(i).bind(&mem_arbs.at(i)->ReqIn.at(u)); mem_arbs.at(i)->RspIn.at(u).bind(&unit_arbs.at(u)->RspOut.at(i)); @@ -66,7 +66,7 @@ public: } snprintf(sname, 100, "%s-cache-arb", name); - auto cache_arb = Switch::Create(sname, ArbiterType::RoundRobin, num_caches, 1); + auto cache_arb = MemSwitch::Create(sname, ArbiterType::RoundRobin, num_caches, 1); for (uint32_t i = 0; i < num_caches; ++i) { snprintf(sname, 100, "%s-cache%d", name, i); diff --git a/sim/simx/cache_sim.cpp b/sim/simx/cache_sim.cpp index 5a6906a9..2c9410f4 100644 --- a/sim/simx/cache_sim.cpp +++ b/sim/simx/cache_sim.cpp @@ -41,19 +41,16 @@ struct params_t { uint32_t tag_select_addr_end; params_t(const CacheSim::Config& config) { - int32_t bank_bits = log2ceil(config.num_banks); - int32_t offset_bits = config.B - config.W; - int32_t log2_bank_size = config.C - bank_bits; - int32_t index_bits = log2_bank_size - (config.B + config.A); - assert(log2_bank_size > 0); + int32_t offset_bits = config.L - config.W; + int32_t index_bits = config.C - (config.L + config.A + config.B); assert(offset_bits >= 0); assert(index_bits >= 0); this->log2_num_inputs = log2ceil(config.num_inputs); - this->words_per_line = 1 << offset_bits; + this->sets_per_bank = 1 << index_bits; this->lines_per_set = 1 << config.A; - this->sets_per_bank = 1 << index_bits; + this->words_per_line = 1 << offset_bits; assert(config.ports_per_bank <= this->words_per_line); @@ -63,7 +60,7 @@ struct params_t { // Bank select this->bank_select_addr_start = (1+this->word_select_addr_end); - this->bank_select_addr_end = (this->bank_select_addr_start+bank_bits-1); + this->bank_select_addr_end = (this->bank_select_addr_start+config.B-1); // Set select this->set_select_addr_start = (1+this->bank_select_addr_end); @@ -74,23 +71,23 @@ struct params_t { this->tag_select_addr_end = (config.addr_width-1); } - uint32_t addr_bank_id(uint64_t word_addr) const { + uint32_t addr_bank_id(uint64_t addr) const { if (bank_select_addr_end >= bank_select_addr_start) - return (uint32_t)bit_getw(word_addr, bank_select_addr_start, bank_select_addr_end); + return (uint32_t)bit_getw(addr, bank_select_addr_start, bank_select_addr_end); else return 0; } - uint32_t addr_set_id(uint64_t word_addr) const { + uint32_t addr_set_id(uint64_t addr) const { if (set_select_addr_end >= set_select_addr_start) - return (uint32_t)bit_getw(word_addr, set_select_addr_start, set_select_addr_end); + return (uint32_t)bit_getw(addr, set_select_addr_start, set_select_addr_end); else return 0; } - uint64_t addr_tag(uint64_t word_addr) const { + uint64_t addr_tag(uint64_t addr) const { if (tag_select_addr_end >= tag_select_addr_start) - return bit_getw(word_addr, tag_select_addr_start, tag_select_addr_end); + return bit_getw(addr, tag_select_addr_start, tag_select_addr_end); else return 0; } @@ -288,8 +285,8 @@ private: Config config_; params_t params_; std::vector banks_; - Switch::Ptr bank_switch_; - Switch::Ptr bypass_switch_; + MemSwitch::Ptr bank_switch_; + MemSwitch::Ptr bypass_switch_; std::vector> mem_req_ports_; std::vector> mem_rsp_ports_; std::vector pipeline_reqs_; @@ -304,16 +301,16 @@ public: : simobject_(simobject) , config_(config) , params_(config) - , banks_(config.num_banks, {config, params_}) - , mem_req_ports_(config.num_banks, simobject) - , mem_rsp_ports_(config.num_banks, simobject) - , pipeline_reqs_(config.num_banks, config.ports_per_bank) + , banks_((1 << config.B), {config, params_}) + , mem_req_ports_((1 << config.B), simobject) + , mem_rsp_ports_((1 << config.B), simobject) + , pipeline_reqs_((1 << config.B), config.ports_per_bank) { char sname[100]; snprintf(sname, 100, "%s-bypass-arb", simobject->name().c_str()); if (config_.bypass) { - bypass_switch_ = Switch::Create(sname, ArbiterType::RoundRobin, config_.num_inputs); + bypass_switch_ = MemSwitch::Create(sname, ArbiterType::RoundRobin, config_.num_inputs); for (uint32_t i = 0; i < config_.num_inputs; ++i) { simobject->CoreReqPorts.at(i).bind(&bypass_switch_->ReqIn.at(i)); bypass_switch_->RspIn.at(i).bind(&simobject->CoreRspPorts.at(i)); @@ -323,14 +320,14 @@ public: return; } - bypass_switch_ = Switch::Create(sname, ArbiterType::Priority, 2); + bypass_switch_ = MemSwitch::Create(sname, ArbiterType::Priority, 2); bypass_switch_->ReqOut.at(0).bind(&simobject->MemReqPort); simobject->MemRspPort.bind(&bypass_switch_->RspOut.at(0)); - if (config.num_banks > 1) { + if (config.B != 0) { snprintf(sname, 100, "%s-bank-arb", simobject->name().c_str()); - bank_switch_ = Switch::Create(sname, ArbiterType::RoundRobin, config.num_banks); - for (uint32_t i = 0, n = config.num_banks; i < n; ++i) { + bank_switch_ = MemSwitch::Create(sname, ArbiterType::RoundRobin, (1 << config.B)); + for (uint32_t i = 0, n = (1 << config.B); i < n; ++i) { mem_req_ports_.at(i).bind(&bank_switch_->ReqIn.at(i)); bank_switch_->RspIn.at(i).bind(&mem_rsp_ports_.at(i)); } @@ -383,20 +380,22 @@ public: pipeline_req.clear(); } - // schedule MSHR replay - for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) { + // first: schedule MSHR replay (flush MSHR queue) + for (uint32_t bank_id = 0, n = (1 << config_.B); bank_id < n; ++bank_id) { auto& bank = banks_.at(bank_id); auto& pipeline_req = pipeline_reqs_.at(bank_id); bank.mshr.pop(&pipeline_req); } - // schedule memory fill - for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) { + // second: schedule memory fill (flush memory queue) + for (uint32_t bank_id = 0, n = (1 << config_.B); bank_id < n; ++bank_id) { auto& mem_rsp_port = mem_rsp_ports_.at(bank_id); if (mem_rsp_port.empty()) continue; auto& pipeline_req = pipeline_reqs_.at(bank_id); + + // skip if bank already busy if (pipeline_req.type != bank_req_t::None) continue; @@ -407,7 +406,7 @@ public: mem_rsp_port.pop(); } - // schedule core requests + // last: schedule core requests (flush core queue) for (uint32_t req_id = 0, n = config_.num_inputs; req_id < n; ++req_id) { auto& core_req_port = simobject_->CoreReqPorts.at(req_id); if (core_req_port.empty()) @@ -425,18 +424,21 @@ public: } auto bank_id = params_.addr_bank_id(core_req.addr); - auto set_id = params_.addr_set_id(core_req.addr); - auto tag = params_.addr_tag(core_req.addr); - auto port_id = req_id % config_.ports_per_bank; - auto& bank = banks_.at(bank_id); auto& pipeline_req = pipeline_reqs_.at(bank_id); + // skip if bank already busy + if (pipeline_req.type != bank_req_t::None) + continue; + + auto set_id = params_.addr_set_id(core_req.addr); + auto tag = params_.addr_tag(core_req.addr); + auto port_id = req_id % config_.ports_per_bank; + // check MSHR capacity if ((!core_req.write || !config_.write_through) && bank.mshr.full()) { ++perf_stats_.mshr_stalls; - ++perf_stats_.bank_stalls; continue; } @@ -452,7 +454,7 @@ public: } // extend request ports pipeline_req.ports.at(port_id) = bank_req_port_t{req_id, core_req.tag, true}; - } else if (pipeline_req.type == bank_req_t::None) { + } else { // schedule new request bank_req_t bank_req(config_.ports_per_bank); bank_req.ports.at(port_id) = bank_req_port_t{req_id, core_req.tag, true}; @@ -463,10 +465,6 @@ public: bank_req.type = bank_req_t::Core; bank_req.write = core_req.write; pipeline_req = bank_req; - } else { - // bank in use - ++perf_stats_.bank_stalls; - continue; } if (core_req.write) @@ -516,7 +514,7 @@ private: } void processBankRequests() { - for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) { + for (uint32_t bank_id = 0, n = (1 << config_.B); bank_id < n; ++bank_id) { auto& bank = banks_.at(bank_id); auto pipeline_req = pipeline_reqs_.at(bank_id); @@ -545,11 +543,10 @@ private: } } } break; - case bank_req_t::Core: { - bool hit = false; - bool found_free_line = false; - uint32_t hit_line_id = 0; - uint32_t repl_line_id = 0; + case bank_req_t::Core: { + int32_t hit_line_id = -1; + int32_t free_line_id = -1; + int32_t repl_line_id = 0; uint32_t max_cnt = 0; auto& set = bank.sets.at(pipeline_req.set_id); @@ -557,38 +554,34 @@ private: // tag lookup for (uint32_t i = 0, n = set.lines.size(); i < n; ++i) { auto& line = set.lines.at(i); + if (max_cnt < line.lru_ctr) { + max_cnt = line.lru_ctr; + repl_line_id = i; + } if (line.valid) { - if (line.tag == pipeline_req.tag) { - line.lru_ctr = 0; + if (line.tag == pipeline_req.tag) { hit_line_id = i; - hit = true; + line.lru_ctr = 0; } else { ++line.lru_ctr; } - if (max_cnt < line.lru_ctr) { - max_cnt = line.lru_ctr; - repl_line_id = i; - } } else { - found_free_line = true; - repl_line_id = i; + free_line_id = i; } } - if (hit) { - // - // Hit handling - // + if (hit_line_id != -1) { + // Hit handling if (pipeline_req.write) { - // handle write hit + // handle write has_hit auto& hit_line = set.lines.at(hit_line_id); if (config_.write_through) { // forward write request to memory MemReq mem_req; - mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, hit_line.tag); + mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, pipeline_req.tag); mem_req.write = true; - mem_req.cid = pipeline_req.cid; - mem_req.uuid = pipeline_req.uuid; + mem_req.cid = pipeline_req.cid; + mem_req.uuid = pipeline_req.uuid; mem_req_ports_.at(bank_id).send(mem_req, 1); DT(3, simobject_->name() << "-dram-" << mem_req); } else { @@ -606,23 +599,21 @@ private: DT(3, simobject_->name() << "-core-" << core_rsp); } } - } else { - // - // Miss handling - // + } else { + // Miss handling if (pipeline_req.write) ++perf_stats_.write_misses; else ++perf_stats_.read_misses; - if (!found_free_line && !config_.write_through) { + if (free_line_id == -1 && !config_.write_through) { // write back dirty line auto& repl_line = set.lines.at(repl_line_id); if (repl_line.dirty) { MemReq mem_req; mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, repl_line.tag); mem_req.write = true; - mem_req.cid = pipeline_req.cid; + mem_req.cid = pipeline_req.cid; mem_req_ports_.at(bank_id).send(mem_req, 1); DT(3, simobject_->name() << "-dram-" << mem_req); ++perf_stats_.evictions; @@ -635,8 +626,8 @@ private: MemReq mem_req; mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, pipeline_req.tag); mem_req.write = true; - mem_req.cid = pipeline_req.cid; - mem_req.uuid = pipeline_req.uuid; + mem_req.cid = pipeline_req.cid; + mem_req.uuid = pipeline_req.uuid; mem_req_ports_.at(bank_id).send(mem_req, 1); DT(3, simobject_->name() << "-dram-" << mem_req); } @@ -655,7 +646,7 @@ private: auto mshr_pending = bank.mshr.lookup(pipeline_req); // allocate MSHR - auto mshr_id = bank.mshr.allocate(pipeline_req, repl_line_id); + auto mshr_id = bank.mshr.allocate(pipeline_req, (free_line_id != -1) ? free_line_id : repl_line_id); // send fill request if (!mshr_pending) { @@ -663,8 +654,8 @@ private: mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, pipeline_req.tag); mem_req.write = false; mem_req.tag = mshr_id; - mem_req.cid = pipeline_req.cid; - mem_req.uuid = pipeline_req.uuid; + mem_req.cid = pipeline_req.cid; + mem_req.uuid = pipeline_req.uuid; mem_req_ports_.at(bank_id).send(mem_req, 1); DT(3, simobject_->name() << "-dram-" << mem_req); ++pending_fill_reqs_; diff --git a/sim/simx/cache_sim.h b/sim/simx/cache_sim.h index 498fb73f..2faea91d 100644 --- a/sim/simx/cache_sim.h +++ b/sim/simx/cache_sim.h @@ -23,16 +23,15 @@ public: struct Config { bool bypass; // cache bypass uint8_t C; // log2 cache size - uint8_t B; // log2 block size + uint8_t L; // log2 line size uint8_t W; // log2 word size uint8_t A; // log2 associativity - uint8_t addr_width; // word address bits - uint8_t num_banks; // number of banks + uint8_t B; // log2 number of banks + uint8_t addr_width; // word address bits uint8_t ports_per_bank; // number of ports per bank uint8_t num_inputs; // number of inputs bool write_through; // is write-through bool write_reponse; // enable write response - uint16_t victim_size; // victim cache size uint16_t mshr_size; // MSHR buffer size uint8_t latency; // pipeline latency }; diff --git a/sim/simx/cluster.cpp b/sim/simx/cluster.cpp index a129ddaa..d7104915 100644 --- a/sim/simx/cluster.cpp +++ b/sim/simx/cluster.cpp @@ -36,16 +36,15 @@ Cluster::Cluster(const SimContext& ctx, l2cache_ = CacheSim::Create(sname, CacheSim::Config{ !L2_ENABLED, log2ceil(L2_CACHE_SIZE), // C - log2ceil(MEM_BLOCK_SIZE), // B + log2ceil(MEM_BLOCK_SIZE), // L log2ceil(L2_NUM_WAYS), // W 0, // A + log2ceil(L2_NUM_BANKS), // B XLEN, // address bits - L2_NUM_BANKS, // number of banks 1, // number of ports 5, // request size true, // write-through false, // write response - 0, // victim size L2_MSHR_SIZE, // mshr 2, // pipeline latency }); @@ -57,16 +56,15 @@ Cluster::Cluster(const SimContext& ctx, icaches_ = CacheCluster::Create(sname, num_cores, NUM_ICACHES, 1, CacheSim::Config{ !ICACHE_ENABLED, log2ceil(ICACHE_SIZE), // C - log2ceil(L1_LINE_SIZE), // B + log2ceil(L1_LINE_SIZE), // L log2ceil(sizeof(uint32_t)), // W log2ceil(ICACHE_NUM_WAYS),// A - XLEN, // address bits - 1, // number of banks + 1, // B + XLEN, // address bits 1, // number of ports 1, // number of inputs true, // write-through false, // write response - 0, // victim size (uint8_t)arch.num_warps(), // mshr 2, // pipeline latency }); @@ -78,16 +76,15 @@ Cluster::Cluster(const SimContext& ctx, dcaches_ = CacheCluster::Create(sname, num_cores, NUM_DCACHES, NUM_LSU_LANES, CacheSim::Config{ !DCACHE_ENABLED, log2ceil(DCACHE_SIZE), // C - log2ceil(L1_LINE_SIZE), // B + log2ceil(L1_LINE_SIZE), // L log2ceil(sizeof(Word)), // W log2ceil(DCACHE_NUM_WAYS),// A - XLEN, // address bits - DCACHE_NUM_BANKS, // number of banks + log2ceil(DCACHE_NUM_BANKS), // B + XLEN, // address bits 1, // number of ports DCACHE_NUM_BANKS, // number of inputs true, // write-through false, // write response - 0, // victim size DCACHE_MSHR_SIZE, // mshr 4, // pipeline latency }); @@ -129,11 +126,11 @@ Cluster::Cluster(const SimContext& ctx, cores_.at(i)->dcache_req_ports.at(j).bind(&smem_demux->ReqIn); smem_demux->RspIn.bind(&cores_.at(i)->dcache_rsp_ports.at(j)); - smem_demux->ReqDc.bind(&dcaches_->CoreReqPorts.at(i).at(j)); - dcaches_->CoreRspPorts.at(i).at(j).bind(&smem_demux->RspDc); + smem_demux->ReqDC.bind(&dcaches_->CoreReqPorts.at(i).at(j)); + dcaches_->CoreRspPorts.at(i).at(j).bind(&smem_demux->RspDC); - smem_demux->ReqSm.bind(&sharedmems_.at(i)->Inputs.at(j)); - sharedmems_.at(i)->Outputs.at(j).bind(&smem_demux->RspSm); + smem_demux->ReqSM.bind(&sharedmems_.at(i)->Inputs.at(j)); + sharedmems_.at(i)->Outputs.at(j).bind(&smem_demux->RspSM); } } } diff --git a/sim/simx/core.cpp b/sim/simx/core.cpp index f2931324..b2fe7ea2 100644 --- a/sim/simx/core.cpp +++ b/sim/simx/core.cpp @@ -45,19 +45,21 @@ Core::Core(const SimContext& ctx, , warps_(arch.num_warps()) , barriers_(arch.num_barriers(), 0) , fcsrs_(arch.num_warps(), 0) - , ibuffers_(ISSUE_WIDTH, IBUF_SIZE) + , ibuffers_(arch.num_warps(), IBUF_SIZE) , scoreboard_(arch_) , operands_(ISSUE_WIDTH) - , dispatchers_((uint32_t)ExeType::MAX) - , exe_units_((uint32_t)ExeType::MAX) + , dispatchers_((uint32_t)ExeType::ExeTypeCount) + , exe_units_((uint32_t)ExeType::ExeTypeCount) , sharedmem_(sharedmem) , fetch_latch_("fetch") , decode_latch_("decode") , pending_icache_(arch_.num_warps()) - , committed_traces_(ISSUE_WIDTH, nullptr) , csrs_(arch.num_warps()) , cluster_(cluster) -{ + , commit_arbs_(ISSUE_WIDTH) +{ + char sname[100]; + for (uint32_t i = 0; i < arch_.num_warps(); ++i) { csrs_.at(i).resize(arch.num_threads()); } @@ -82,6 +84,16 @@ Core::Core(const SimContext& ctx, exe_units_.at((int)ExeType::LSU) = SimPlatform::instance().create_object(this); exe_units_.at((int)ExeType::SFU) = SimPlatform::instance().create_object(this); + // bind commit arbiters + for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) { + snprintf(sname, 100, "commit-arb%d", i); + auto arbiter = TraceSwitch::Create(sname, ArbiterType::RoundRobin, (uint32_t)ExeType::ExeTypeCount, 1); + for (uint32_t j = 0; j < (uint32_t)ExeType::ExeTypeCount; ++j) { + exe_units_.at(j)->Outputs.at(i).bind(&arbiter->Inputs.at(j)); + } + commit_arbs_.at(i) = arbiter; + } + this->reset(); } @@ -99,8 +111,12 @@ void Core::reset() { for (auto& exe_unit : exe_units_) { exe_unit->reset(); } + + for (auto& commit_arb : commit_arbs_) { + commit_arb->reset(); + } - for ( auto& barrier : barriers_) { + for (auto& barrier : barriers_) { barrier.reset(); } @@ -112,7 +128,7 @@ void Core::reset() { ibuf.clear(); } - commit_exe_= 0; + ibuffer_idx_ = 0; scoreboard_.clear(); fetch_latch_.clear(); @@ -150,8 +166,10 @@ void Core::schedule() { break; } } - if (scheduled_warp == -1) + if (scheduled_warp == -1) { + ++perf_stats_.sched_stalls; return; + } // suspend warp until decode stalled_warps_.set(scheduled_warp); @@ -192,11 +210,11 @@ void Core::fetch() { mem_req.tag = pending_icache_.allocate(trace); mem_req.cid = trace->cid; mem_req.uuid = trace->uuid; - icache_req_ports.at(0).send(mem_req, 1); + icache_req_ports.at(0).send(mem_req, 2); DT(3, "icache-req: addr=0x" << std::hex << mem_req.addr << ", tag=" << mem_req.tag << ", " << *trace); - fetch_latch_.pop(); - ++pending_ifetches_; + fetch_latch_.pop(); ++perf_stats_.ifetches; + ++pending_ifetches_; } void Core::decode() { @@ -206,7 +224,7 @@ void Core::decode() { auto trace = decode_latch_.front(); // check ibuffer capacity - auto& ibuffer = ibuffers_.at(trace->wid % ISSUE_WIDTH); + auto& ibuffer = ibuffers_.at(trace->wid); if (ibuffer.full()) { if (!trace->log_once(true)) { DT(3, "*** ibuffer-stall: " << *trace); @@ -239,7 +257,7 @@ void Core::decode() { } void Core::issue() { - // operands to dispatch + // operands to dispatchers for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) { auto& operand = operands_.at(i); if (operand->Output.empty()) @@ -257,7 +275,8 @@ void Core::issue() { // issue ibuffer instructions for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) { - auto& ibuffer = ibuffers_.at(i); + uint32_t ii = (ibuffer_idx_ + i) % ibuffers_.size(); + auto& ibuffer = ibuffers_.at(ii); if (ibuffer.empty()) continue; @@ -265,17 +284,27 @@ void Core::issue() { // check scoreboard if (scoreboard_.in_use(trace)) { + auto uses = scoreboard_.get_uses(trace); if (!trace->log_once(true)) { - DTH(3, "*** scoreboard-stall: dependents={"); - auto uses = scoreboard_.get_uses(trace); + DTH(3, "*** scoreboard-stall: dependents={"); for (uint32_t j = 0, n = uses.size(); j < n; ++j) { auto& use = uses.at(j); __unused (use); if (j) DTN(3, ", "); - DTN(3, use.type << use.reg << "(#" << use.owner << ")"); + DTN(3, use.reg_type << use.reg_id << "(#" << use.uuid << ")"); } DTN(3, "}, " << *trace << std::endl); } + for (uint32_t j = 0, n = uses.size(); j < n; ++j) { + auto& use = uses.at(j); + switch (use.exe_type) { + case ExeType::ALU: ++perf_stats_.scrb_alu; break; + case ExeType::FPU: ++perf_stats_.scrb_fpu; break; + case ExeType::LSU: ++perf_stats_.scrb_lsu; break; + case ExeType::SFU: ++perf_stats_.scrb_sfu; break; + default: assert(false); + } + } ++perf_stats_.scrb_stalls; continue; } else { @@ -294,10 +323,11 @@ void Core::issue() { ibuffer.pop(); } + ibuffer_idx_ += ISSUE_WIDTH; } void Core::execute() { - for (uint32_t i = 0; i < (uint32_t)ExeType::MAX; ++i) { + for (uint32_t i = 0; i < (uint32_t)ExeType::ExeTypeCount; ++i) { auto& dispatch = dispatchers_.at(i); auto& exe_unit = exe_units_.at(i); for (uint32_t j = 0; j < ISSUE_WIDTH; ++j) { @@ -313,10 +343,11 @@ void Core::execute() { void Core::commit() { // process completed instructions for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) { - auto trace = committed_traces_.at(i); - if (!trace) + auto& commit_arb = commit_arbs_.at(i); + if (commit_arb->Outputs.at(0).empty()) continue; - committed_traces_.at(i) = nullptr; + + auto trace = commit_arb->Outputs.at(0).front(); // advance to commit stage DT(3, "pipeline-commit: " << *trace); @@ -334,27 +365,11 @@ void Core::commit() { perf_stats_.instrs += trace->tmask.count(); } + commit_arb->Outputs.at(0).pop(); + // delete the trace delete trace; } - - // select completed instructions - for (uint32_t i = 0; i < (uint32_t)ExeType::MAX; ++i) { - uint32_t ii = (commit_exe_ + i) % (uint32_t)ExeType::MAX; - auto& exe_unit = exe_units_.at(ii); - for (uint32_t j = 0; j < ISSUE_WIDTH; ++j) { - auto committed_trace = committed_traces_.at(j); - if (committed_trace) - continue; - auto& output = exe_unit->Outputs.at(j); - if (output.empty()) - continue; - auto trace = output.front(); - committed_traces_.at(j) = trace; - output.pop(); - } - } - ++commit_exe_; } void Core::wspawn(uint32_t num_warps, Word nextPC) { @@ -533,6 +548,10 @@ uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) { break; case VX_DCR_MPM_CLASS_CORE: { switch (addr) { + case VX_CSR_MPM_SCHED_ST: return perf_stats_.sched_stalls & 0xffffffff; + case VX_CSR_MPM_SCHED_ST_H:return perf_stats_.sched_stalls >> 32; + case VX_CSR_MPM_FETCH_ST: return perf_stats_.fetch_stalls & 0xffffffff; + case VX_CSR_MPM_FETCH_ST_H:return perf_stats_.fetch_stalls >> 32; case VX_CSR_MPM_IBUF_ST: return perf_stats_.ibuf_stalls & 0xffffffff; case VX_CSR_MPM_IBUF_ST_H: return perf_stats_.ibuf_stalls >> 32; case VX_CSR_MPM_SCRB_ST: return perf_stats_.scrb_stalls & 0xffffffff; @@ -545,6 +564,14 @@ uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) { case VX_CSR_MPM_FPU_ST_H: return perf_stats_.fpu_stalls >> 32; case VX_CSR_MPM_SFU_ST: return perf_stats_.sfu_stalls & 0xffffffff; case VX_CSR_MPM_SFU_ST_H: return perf_stats_.sfu_stalls >> 32; + case VX_CSR_MPM_SCRB_ALU: return perf_stats_.scrb_alu & 0xffffffff; + case VX_CSR_MPM_SCRB_ALU_H:return perf_stats_.scrb_alu >> 32; + case VX_CSR_MPM_SCRB_FPU: return perf_stats_.scrb_fpu & 0xffffffff; + case VX_CSR_MPM_SCRB_FPU_H:return perf_stats_.scrb_fpu >> 32; + case VX_CSR_MPM_SCRB_LSU: return perf_stats_.scrb_lsu & 0xffffffff; + case VX_CSR_MPM_SCRB_LSU_H:return perf_stats_.scrb_lsu >> 32; + case VX_CSR_MPM_SCRB_SFU: return perf_stats_.scrb_sfu & 0xffffffff; + case VX_CSR_MPM_SCRB_SFU_H:return perf_stats_.scrb_sfu >> 32; case VX_CSR_MPM_IFETCHES: return perf_stats_.ifetches & 0xffffffff; case VX_CSR_MPM_IFETCHES_H: return perf_stats_.ifetches >> 32; @@ -561,30 +588,25 @@ uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) { case VX_DCR_MPM_CLASS_MEM: { auto proc_perf = cluster_->processor()->perf_stats(); switch (addr) { - case VX_CSR_MPM_ICACHE_READS: return proc_perf.clusters.icache.reads & 0xffffffff; - case VX_CSR_MPM_ICACHE_READS_H: return proc_perf.clusters.icache.reads >> 32; - case VX_CSR_MPM_ICACHE_MISS_R: return proc_perf.clusters.icache.read_misses & 0xffffffff; - case VX_CSR_MPM_ICACHE_MISS_R_H: return proc_perf.clusters.icache.read_misses >> 32; + case VX_CSR_MPM_ICACHE_READS: return proc_perf.clusters.icache.reads & 0xffffffff; + case VX_CSR_MPM_ICACHE_READS_H: return proc_perf.clusters.icache.reads >> 32; + case VX_CSR_MPM_ICACHE_MISS_R: return proc_perf.clusters.icache.read_misses & 0xffffffff; + case VX_CSR_MPM_ICACHE_MISS_R_H: return proc_perf.clusters.icache.read_misses >> 32; + case VX_CSR_MPM_ICACHE_MSHR_ST: return proc_perf.clusters.icache.mshr_stalls & 0xffffffff; + case VX_CSR_MPM_ICACHE_MSHR_ST_H: return proc_perf.clusters.icache.mshr_stalls >> 32; - case VX_CSR_MPM_DCACHE_READS: return proc_perf.clusters.dcache.reads & 0xffffffff; - case VX_CSR_MPM_DCACHE_READS_H: return proc_perf.clusters.dcache.reads >> 32; - case VX_CSR_MPM_DCACHE_WRITES: return proc_perf.clusters.dcache.writes & 0xffffffff; - case VX_CSR_MPM_DCACHE_WRITES_H: return proc_perf.clusters.dcache.writes >> 32; - case VX_CSR_MPM_DCACHE_MISS_R: return proc_perf.clusters.dcache.read_misses & 0xffffffff; - case VX_CSR_MPM_DCACHE_MISS_R_H: return proc_perf.clusters.dcache.read_misses >> 32; - case VX_CSR_MPM_DCACHE_MISS_W: return proc_perf.clusters.dcache.write_misses & 0xffffffff; - case VX_CSR_MPM_DCACHE_MISS_W_H: return proc_perf.clusters.dcache.write_misses >> 32; - case VX_CSR_MPM_DCACHE_BANK_ST: return proc_perf.clusters.dcache.bank_stalls & 0xffffffff; - case VX_CSR_MPM_DCACHE_BANK_ST_H:return proc_perf.clusters.dcache.bank_stalls >> 32; - case VX_CSR_MPM_DCACHE_MSHR_ST: return proc_perf.clusters.dcache.mshr_stalls & 0xffffffff; - case VX_CSR_MPM_DCACHE_MSHR_ST_H:return proc_perf.clusters.dcache.mshr_stalls >> 32; - - case VX_CSR_MPM_SMEM_READS: return proc_perf.clusters.sharedmem.reads & 0xffffffff; - case VX_CSR_MPM_SMEM_READS_H: return proc_perf.clusters.sharedmem.reads >> 32; - case VX_CSR_MPM_SMEM_WRITES: return proc_perf.clusters.sharedmem.writes & 0xffffffff; - case VX_CSR_MPM_SMEM_WRITES_H: return proc_perf.clusters.sharedmem.writes >> 32; - case VX_CSR_MPM_SMEM_BANK_ST: return proc_perf.clusters.sharedmem.bank_stalls & 0xffffffff; - case VX_CSR_MPM_SMEM_BANK_ST_H:return proc_perf.clusters.sharedmem.bank_stalls >> 32; + case VX_CSR_MPM_DCACHE_READS: return proc_perf.clusters.dcache.reads & 0xffffffff; + case VX_CSR_MPM_DCACHE_READS_H: return proc_perf.clusters.dcache.reads >> 32; + case VX_CSR_MPM_DCACHE_WRITES: return proc_perf.clusters.dcache.writes & 0xffffffff; + case VX_CSR_MPM_DCACHE_WRITES_H: return proc_perf.clusters.dcache.writes >> 32; + case VX_CSR_MPM_DCACHE_MISS_R: return proc_perf.clusters.dcache.read_misses & 0xffffffff; + case VX_CSR_MPM_DCACHE_MISS_R_H: return proc_perf.clusters.dcache.read_misses >> 32; + case VX_CSR_MPM_DCACHE_MISS_W: return proc_perf.clusters.dcache.write_misses & 0xffffffff; + case VX_CSR_MPM_DCACHE_MISS_W_H: return proc_perf.clusters.dcache.write_misses >> 32; + case VX_CSR_MPM_DCACHE_BANK_ST: return proc_perf.clusters.dcache.bank_stalls & 0xffffffff; + case VX_CSR_MPM_DCACHE_BANK_ST_H: return proc_perf.clusters.dcache.bank_stalls >> 32; + case VX_CSR_MPM_DCACHE_MSHR_ST: return proc_perf.clusters.dcache.mshr_stalls & 0xffffffff; + case VX_CSR_MPM_DCACHE_MSHR_ST_H: return proc_perf.clusters.dcache.mshr_stalls >> 32; case VX_CSR_MPM_L2CACHE_READS: return proc_perf.clusters.l2cache.reads & 0xffffffff; case VX_CSR_MPM_L2CACHE_READS_H: return proc_perf.clusters.l2cache.reads >> 32; @@ -612,12 +634,19 @@ uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) { case VX_CSR_MPM_L3CACHE_MSHR_ST: return proc_perf.l3cache.mshr_stalls & 0xffffffff; case VX_CSR_MPM_L3CACHE_MSHR_ST_H:return proc_perf.l3cache.mshr_stalls >> 32; - case VX_CSR_MPM_MEM_READS: return proc_perf.mem_reads & 0xffffffff; - case VX_CSR_MPM_MEM_READS_H: return proc_perf.mem_reads >> 32; - case VX_CSR_MPM_MEM_WRITES: return proc_perf.mem_writes & 0xffffffff; - case VX_CSR_MPM_MEM_WRITES_H:return proc_perf.mem_writes >> 32; - case VX_CSR_MPM_MEM_LAT: return proc_perf.mem_latency & 0xffffffff; - case VX_CSR_MPM_MEM_LAT_H: return proc_perf.mem_latency >> 32; + case VX_CSR_MPM_MEM_READS: return proc_perf.mem_reads & 0xffffffff; + case VX_CSR_MPM_MEM_READS_H: return proc_perf.mem_reads >> 32; + case VX_CSR_MPM_MEM_WRITES: return proc_perf.mem_writes & 0xffffffff; + case VX_CSR_MPM_MEM_WRITES_H: return proc_perf.mem_writes >> 32; + case VX_CSR_MPM_MEM_LAT: return proc_perf.mem_latency & 0xffffffff; + case VX_CSR_MPM_MEM_LAT_H: return proc_perf.mem_latency >> 32; + + case VX_CSR_MPM_SMEM_READS: return proc_perf.clusters.sharedmem.reads & 0xffffffff; + case VX_CSR_MPM_SMEM_READS_H: return proc_perf.clusters.sharedmem.reads >> 32; + case VX_CSR_MPM_SMEM_WRITES: return proc_perf.clusters.sharedmem.writes & 0xffffffff; + case VX_CSR_MPM_SMEM_WRITES_H: return proc_perf.clusters.sharedmem.writes >> 32; + case VX_CSR_MPM_SMEM_BANK_ST: return proc_perf.clusters.sharedmem.bank_stalls & 0xffffffff; + case VX_CSR_MPM_SMEM_BANK_ST_H: return proc_perf.clusters.sharedmem.bank_stalls >> 32; } } break; } diff --git a/sim/simx/core.h b/sim/simx/core.h index ed06574d..60290bef 100644 --- a/sim/simx/core.h +++ b/sim/simx/core.h @@ -22,11 +22,11 @@ #include #include #include +#include #include "debug.h" #include "types.h" #include "arch.h" #include "decode.h" -#include "mem.h" #include "warp.h" #include "pipeline.h" #include "cache_sim.h" @@ -42,17 +42,25 @@ namespace vortex { class Cluster; +using TraceSwitch = Mux; + class Core : public SimObject { public: struct PerfStats { uint64_t cycles; uint64_t instrs; + uint64_t sched_stalls; + uint64_t fetch_stalls; uint64_t ibuf_stalls; uint64_t scrb_stalls; uint64_t alu_stalls; uint64_t lsu_stalls; uint64_t fpu_stalls; uint64_t sfu_stalls; + uint64_t scrb_alu; + uint64_t scrb_fpu; + uint64_t scrb_lsu; + uint64_t scrb_sfu; uint64_t ifetches; uint64_t loads; uint64_t stores; @@ -62,12 +70,18 @@ public: PerfStats() : cycles(0) , instrs(0) + , sched_stalls(0) + , fetch_stalls(0) , ibuf_stalls(0) , scrb_stalls(0) , alu_stalls(0) , lsu_stalls(0) , fpu_stalls(0) , sfu_stalls(0) + , scrb_alu(0) + , scrb_fpu(0) + , scrb_lsu(0) + , scrb_sfu(0) , ifetches(0) , loads(0) , stores(0) @@ -173,7 +187,6 @@ private: PipelineLatch decode_latch_; HashTable pending_icache_; - std::vector committed_traces_; WarpMask active_warps_; WarpMask stalled_warps_; uint64_t issued_instrs_; @@ -190,7 +203,9 @@ private: Cluster* cluster_; - uint32_t commit_exe_; + std::vector commit_arbs_; + + uint32_t ibuffer_idx_; friend class Warp; friend class LsuUnit; diff --git a/sim/simx/dispatcher.h b/sim/simx/dispatcher.h index a5c44b64..fe83e9de 100644 --- a/sim/simx/dispatcher.h +++ b/sim/simx/dispatcher.h @@ -66,6 +66,7 @@ public: } auto& output = Outputs.at(i); auto trace = input.front(); + auto new_trace = trace; if (pid_count_ != 1) { auto start_p = start_p_.at(b); if (start_p == -1) { @@ -81,33 +82,30 @@ public: end = j; } start /= num_lanes_; - end /= num_lanes_; - auto new_trace = new pipeline_trace_t(*trace); - new_trace->tmask.reset(); - for (int j = start * num_lanes_, n = j + num_lanes_; j < n; ++j) { - new_trace->tmask[j] = trace->tmask[j]; - } - new_trace->pid = start; - new_trace->sop = (start_p == 0); - if (start == end) { - new_trace->eop = 1; + end /= num_lanes_; + if (start != end) { + new_trace = new pipeline_trace_t(*trace); + new_trace->eop = false; + start_p_.at(b) = start + 1; + } else { start_p_.at(b) = -1; input.pop(); ++block_sent; - delete trace; - } else { - new_trace->eop = 0; - start_p_.at(b) = start + 1; - } - output.send(new_trace, 1); - DT(3, "pipeline-dispatch: " << *new_trace); + } + new_trace->pid = start; + new_trace->sop = (0 == start_p); + ThreadMask tmask; + for (int j = start * num_lanes_, n = j + num_lanes_; j < n; ++j) { + tmask[j] = trace->tmask[j]; + } + new_trace->tmask = tmask; } else { - trace->pid = 0; + new_trace->pid = 0; input.pop(); - output.send(trace, 1); - DT(3, "pipeline-dispatch: " << *trace); ++block_sent; - } + } + DT(3, "pipeline-dispatch: " << *new_trace); + output.send(new_trace, 1); } if (block_sent == block_size_) { batch_idx_ = (batch_idx_ + 1) % batch_count_; @@ -138,4 +136,4 @@ private: std::vector start_p_; }; -} \ No newline at end of file +} diff --git a/sim/simx/processor.cpp b/sim/simx/processor.cpp index da151b62..77021dbd 100644 --- a/sim/simx/processor.cpp +++ b/sim/simx/processor.cpp @@ -32,18 +32,17 @@ ProcessorImpl::ProcessorImpl(const Arch& arch) l3cache_ = CacheSim::Create("l3cache", CacheSim::Config{ !L3_ENABLED, log2ceil(L3_CACHE_SIZE), // C - log2ceil(MEM_BLOCK_SIZE), // B - log2ceil(L3_NUM_WAYS), // W - 0, // A - XLEN, // address bits - L3_NUM_BANKS, // number of banks - 1, // number of ports + log2ceil(MEM_BLOCK_SIZE), // L + log2ceil(L3_NUM_WAYS), // W + 0, // A + log2ceil(L3_NUM_BANKS), // B + XLEN, // address bits + 1, // number of ports uint8_t(arch.num_clusters()), // request size - true, // write-through - false, // write response - 0, // victim size - L3_MSHR_SIZE, // mshr - 2, // pipeline latency + true, // write-through + false, // write response + L3_MSHR_SIZE, // mshr + 2, // pipeline latency } ); diff --git a/sim/simx/scoreboard.h b/sim/simx/scoreboard.h index 4d311846..5c247b73 100644 --- a/sim/simx/scoreboard.h +++ b/sim/simx/scoreboard.h @@ -22,9 +22,10 @@ class Scoreboard { public: struct reg_use_t { - RegType type; - uint32_t reg; - uint64_t owner; + RegType reg_type; + uint32_t reg_id; + ExeType exe_type; + uint64_t uuid; }; Scoreboard(const Arch &arch) @@ -44,89 +45,81 @@ public: owners_.clear(); } - bool in_use(pipeline_trace_t* state) const { - return (state->used_iregs & in_use_iregs_.at(state->wid)) != 0 - || (state->used_fregs & in_use_fregs_.at(state->wid)) != 0 - || (state->used_vregs & in_use_vregs_.at(state->wid)) != 0; + bool in_use(pipeline_trace_t* trace) const { + return (trace->used_iregs & in_use_iregs_.at(trace->wid)) != 0 + || (trace->used_fregs & in_use_fregs_.at(trace->wid)) != 0 + || (trace->used_vregs & in_use_vregs_.at(trace->wid)) != 0; } - std::vector get_uses(pipeline_trace_t* state) const { - std::vector out; - { - uint32_t r = 0; - auto used_iregs = state->used_iregs & in_use_iregs_.at(state->wid); - while (used_iregs.any()) { - if (used_iregs.test(0)) { - uint32_t tag = (r << 16) | (state->wid << 4) | (int)RegType::Integer; - out.push_back({RegType::Integer, r, owners_.at(tag)}); - } - used_iregs >>= 1; - ++r; + std::vector get_uses(pipeline_trace_t* trace) const { + std::vector out; + + auto used_iregs = trace->used_iregs & in_use_iregs_.at(trace->wid); + auto used_fregs = trace->used_fregs & in_use_fregs_.at(trace->wid); + auto used_vregs = trace->used_vregs & in_use_vregs_.at(trace->wid); + + for (uint32_t r = 0; r < MAX_NUM_REGS; ++r) { + if (used_iregs.test(r)) { + uint32_t tag = (r << 16) | (trace->wid << 4) | (int)RegType::Integer; + auto owner = owners_.at(tag); + out.push_back({RegType::Integer, r, owner->exe_type, owner->uuid}); } } - { - uint32_t r = 0; - auto used_fregs = state->used_fregs & in_use_fregs_.at(state->wid); - while (used_fregs.any()) { - if (used_fregs.test(0)) { - uint32_t tag = (r << 16) | (state->wid << 4) | (int)RegType::Float; - out.push_back({RegType::Float, r, owners_.at(tag)}); - } - used_fregs >>= 1; - ++r; + + for (uint32_t r = 0; r < MAX_NUM_REGS; ++r) { + if (used_fregs.test(r)) { + uint32_t tag = (r << 16) | (trace->wid << 4) | (int)RegType::Float; + auto owner = owners_.at(tag); + out.push_back({RegType::Float, r, owner->exe_type, owner->uuid}); } } - { - uint32_t r = 0; - auto used_vregs = state->used_vregs & in_use_vregs_.at(state->wid); - while (used_vregs.any()) { - if (used_vregs.test(0)) { - uint32_t tag = (r << 16) | (state->wid << 4) | (int)RegType::Vector; - out.push_back({RegType::Vector, r, owners_.at(tag)}); - } - used_vregs >>= 1; - ++r; + + for (uint32_t r = 0; r < MAX_NUM_REGS; ++r) { + if (used_vregs.test(r)) { + uint32_t tag = (r << 16) | (trace->wid << 4) | (int)RegType::Vector; + auto owner = owners_.at(tag); + out.push_back({RegType::Vector, r, owner->exe_type, owner->uuid}); } } + return out; } - void reserve(pipeline_trace_t* state) { - assert(state->wb); - switch (state->rdest_type) { + void reserve(pipeline_trace_t* trace) { + assert(trace->wb); + switch (trace->rdest_type) { case RegType::Integer: - in_use_iregs_.at(state->wid).set(state->rdest); + in_use_iregs_.at(trace->wid).set(trace->rdest); break; case RegType::Float: - in_use_fregs_.at(state->wid).set(state->rdest); + in_use_fregs_.at(trace->wid).set(trace->rdest); break; case RegType::Vector: - in_use_vregs_.at(state->wid).set(state->rdest); - break; - default: + in_use_vregs_.at(trace->wid).set(trace->rdest); break; + default: assert(false); } - uint32_t tag = (state->rdest << 16) | (state->wid << 4) | (int)state->rdest_type; + uint32_t tag = (trace->rdest << 16) | (trace->wid << 4) | (int)trace->rdest_type; assert(owners_.count(tag) == 0); - owners_[tag] = state->uuid; + owners_[tag] = trace; + assert((int)trace->exe_type < 5); } - void release(pipeline_trace_t* state) { - assert(state->wb); - switch (state->rdest_type) { + void release(pipeline_trace_t* trace) { + assert(trace->wb); + switch (trace->rdest_type) { case RegType::Integer: - in_use_iregs_.at(state->wid).reset(state->rdest); + in_use_iregs_.at(trace->wid).reset(trace->rdest); break; case RegType::Float: - in_use_fregs_.at(state->wid).reset(state->rdest); + in_use_fregs_.at(trace->wid).reset(trace->rdest); break; case RegType::Vector: - in_use_vregs_.at(state->wid).reset(state->rdest); - break; - default: + in_use_vregs_.at(trace->wid).reset(trace->rdest); break; + default: assert(false); } - uint32_t tag = (state->rdest << 16) | (state->wid << 4) | (int)state->rdest_type; + uint32_t tag = (trace->rdest << 16) | (trace->wid << 4) | (int)trace->rdest_type; owners_.erase(tag); } @@ -135,7 +128,7 @@ private: std::vector in_use_iregs_; std::vector in_use_fregs_; std::vector in_use_vregs_; - std::unordered_map owners_; + std::unordered_map owners_; }; } \ No newline at end of file diff --git a/sim/simx/types.h b/sim/simx/types.h index 88b3ce0e..6bba7f9c 100644 --- a/sim/simx/types.h +++ b/sim/simx/types.h @@ -81,7 +81,7 @@ enum class ExeType { LSU, FPU, SFU, - MAX, + ExeTypeCount }; inline std::ostream &operator<<(std::ostream &os, const ExeType& type) { @@ -90,7 +90,7 @@ inline std::ostream &operator<<(std::ostream &os, const ExeType& type) { case ExeType::LSU: os << "LSU"; break; case ExeType::FPU: os << "FPU"; break; case ExeType::SFU: os << "SFU"; break; - case ExeType::MAX: break; + default: assert(false); } return os; } @@ -138,7 +138,7 @@ inline std::ostream &operator<<(std::ostream &os, const LsuType& type) { enum class AddrType { Global, Shared, - IO, + IO }; inline std::ostream &operator<<(std::ostream &os, const AddrType& type) { @@ -164,7 +164,7 @@ enum class FpuType { FMA, FDIV, FSQRT, - FCVT, + FCVT }; inline std::ostream &operator<<(std::ostream &os, const FpuType& type) { @@ -190,7 +190,7 @@ enum class SfuType { CSRRW, CSRRS, CSRRC, - CMOV + CMOV }; inline std::ostream &operator<<(std::ostream &os, const SfuType& type) { @@ -351,6 +351,92 @@ private: /////////////////////////////////////////////////////////////////////////////// +template +class Mux : public SimObject> { +public: + std::vector> Inputs; + std::vector> Outputs; + + Mux( + const SimContext& ctx, + const char* name, + ArbiterType type, + uint32_t num_inputs, + uint32_t num_outputs = 1, + uint32_t delay = 1 + ) : SimObject>(ctx, name) + , Inputs(num_inputs, this) + , Outputs(num_outputs, this) + , type_(type) + , delay_(delay) + , cursors_(num_outputs, 0) + , num_reqs_(num_inputs / num_outputs) + { + assert(delay != 0); + assert(num_inputs <= 32); + assert(num_outputs <= 32); + assert(num_inputs >= num_outputs); + + // bypass mode + if (num_inputs == num_outputs) { + for (uint32_t i = 0; i < num_inputs; ++i) { + Inputs.at(i).bind(&Outputs.at(i)); + } + } + } + + void reset() { + for (auto& cursor : cursors_) { + cursor = 0; + } + } + + void tick() { + uint32_t I = Inputs.size(); + uint32_t O = Outputs.size(); + uint32_t R = num_reqs_; + + // skip bypass mode + if (I == O) + return; + + // process inputs + for (uint32_t o = 0; o < O; ++o) { + for (uint32_t r = 0; r < R; ++r) { + uint32_t i = (cursors_.at(o) + r) & (R-1); + uint32_t j = o * R + i; + if (j >= I) + continue; + + auto& req_in = Inputs.at(j); + if (!req_in.empty()) { + auto& req = req_in.front(); + DT(4, this->name() << "-" << req); + Outputs.at(o).send(req, delay_); + req_in.pop(); + this->update_cursor(o, i); + break; + } + } + } + } + +private: + + void update_cursor(uint32_t index, uint32_t grant) { + if (type_ == ArbiterType::RoundRobin) { + cursors_.at(index) = grant + 1; + } + } + + ArbiterType type_; + uint32_t delay_; + std::vector cursors_; + uint32_t num_reqs_; +}; + +/////////////////////////////////////////////////////////////////////////////// + template class Switch : public SimObject> { public: @@ -364,13 +450,13 @@ public: const SimContext& ctx, const char* name, ArbiterType type, - uint32_t num_inputs = 1, + uint32_t num_inputs, uint32_t num_outputs = 1, uint32_t delay = 1 ) : SimObject>(ctx, name) - , ReqIn(num_inputs, this) - , RspIn(num_inputs, this) + , ReqIn(num_inputs, this) + , RspIn(num_inputs, this) , ReqOut(num_outputs, this) , RspOut(num_outputs, this) , type_(type) @@ -383,8 +469,8 @@ public: assert(num_outputs <= 32); assert(num_inputs >= num_outputs); + // bypass mode if (num_inputs == num_outputs) { - // bypass mode for (uint32_t i = 0; i < num_inputs; ++i) { ReqIn.at(i).bind(&ReqOut.at(i)); RspOut.at(i).bind(&RspIn.at(i)); @@ -462,14 +548,14 @@ private: class SMemDemux : public SimObject { public: - SimPort ReqIn; - SimPort RspIn; + SimPort ReqIn; + SimPort RspIn; - SimPort ReqSm; - SimPort RspSm; + SimPort ReqSM; + SimPort RspSM; - SimPort ReqDc; - SimPort RspDc; + SimPort ReqDC; + SimPort RspDC; SMemDemux( const SimContext& ctx, @@ -478,45 +564,49 @@ public: ) : SimObject(ctx, name) , ReqIn(this) , RspIn(this) - , ReqSm(this) - , RspSm(this) - , ReqDc(this) - , RspDc(this) + , ReqSM(this) + , RspSM(this) + , ReqDC(this) + , RspDC(this) , delay_(delay) {} void reset() {} - void tick() { + void tick() { + // process incoming reponses + if (!RspSM.empty()) { + auto& rsp = RspSM.front(); + DT(4, this->name() << "-" << rsp); + RspIn.send(rsp, 1); + RspSM.pop(); + } + if (!RspDC.empty()) { + auto& rsp = RspDC.front(); + DT(4, this->name() << "-" << rsp); + RspIn.send(rsp, 1); + RspDC + .pop(); + } // process incomming requests if (!ReqIn.empty()) { auto& req = ReqIn.front(); DT(4, this->name() << "-" << req); if (req.type == AddrType::Shared) { - ReqSm.send(req, delay_); + ReqSM.send(req, delay_); } else { - ReqDc.send(req, delay_); + ReqDC.send(req, delay_); } ReqIn.pop(); } - - // process incoming reponses - if (!RspSm.empty()) { - auto& rsp = RspSm.front(); - DT(4, this->name() << "-" << rsp); - RspIn.send(rsp, 1); - RspSm.pop(); - } - if (!RspDc.empty()) { - auto& rsp = RspDc.front(); - DT(4, this->name() << "-" << rsp); - RspIn.send(rsp, 1); - RspDc.pop(); - } } private: uint32_t delay_; }; -} \ No newline at end of file +/////////////////////////////////////////////////////////////////////////////// + +using MemSwitch = Switch; + +} diff --git a/tests/opencl/Makefile b/tests/opencl/Makefile index 88236559..27ef6f38 100644 --- a/tests/opencl/Makefile +++ b/tests/opencl/Makefile @@ -17,7 +17,7 @@ all: $(MAKE) -C lbm $(MAKE) -C oclprintf $(MAKE) -C blackscholes - $(MAKE) -C matmul + $(MAKE) -C sgemm2 $(MAKE) -C convolution run-simx: @@ -38,7 +38,7 @@ run-simx: $(MAKE) -C transpose run-simx $(MAKE) -C convolution run-simx $(MAKE) -C cutcp run-simx - $(MAKE) -C matmul run-simx + $(MAKE) -C sgemm2 run-simx $(MAKE) -C vectorhypot run-simx $(MAKE) -C mri-q run-simx @@ -60,7 +60,7 @@ run-rtlsim: $(MAKE) -C blackscholes run-rtlsim $(MAKE) -C convolution run-rtlsim # $(MAKE) -C cutcp run-rtlsim -# $(MAKE) -C matmul run-rtlsim +# $(MAKE) -C sgemm2 run-rtlsim # $(MAKE) -C vectorhypot run-rtlsim # $(MAKE) -C mri-q run-rtlsim @@ -82,7 +82,7 @@ run-opae: $(MAKE) -C blackscholes run-opae $(MAKE) -C convolution run-opae # $(MAKE) -C cutcp run-opae -# $(MAKE) -C matmul run-opae +# $(MAKE) -C sgemm2 run-opae # $(MAKE) -C vectorhypot run-opae # $(MAKE) -C mri-q run-opae @@ -105,7 +105,7 @@ clean: $(MAKE) -C lbm clean $(MAKE) -C oclprintf clean $(MAKE) -C blackscholes clean - $(MAKE) -C matmul clean + $(MAKE) -C sgemm2 clean $(MAKE) -C convolution clean clean-all: @@ -128,5 +128,5 @@ clean-all: $(MAKE) -C lbm clean-all $(MAKE) -C oclprintf clean-all $(MAKE) -C blackscholes clean-all - $(MAKE) -C matmul clean-all + $(MAKE) -C sgemm2 clean-all $(MAKE) -C convolution clean-all diff --git a/tests/opencl/common.mk b/tests/opencl/common.mk index 762712b4..cfb436a6 100644 --- a/tests/opencl/common.mk +++ b/tests/opencl/common.mk @@ -73,7 +73,7 @@ OBJS := $(addsuffix .o, $(notdir $(SRCS))) all: $(PROJECT) kernel.pocl kernel.pocl: kernel.cl - LLVM_PREFIX=$(LLVM_VORTEX) POCL_DEBUG=all LD_LIBRARY_PATH=$(LLVM_POCL)/lib:$(POCL_CC_PATH)/lib:$(LLVM_VORTEX)/lib POCL_VORTEX_CFLAGS="$(K_CFLAGS)" POCL_VORTEX_LDFLAGS="$(K_LDFLAGS)" $(POCL_CC_PATH)/bin/poclcc -o kernel.pocl kernel.cl + LD_LIBRARY_PATH=$(LLVM_POCL)/lib:$(POCL_CC_PATH)/lib:$(LLVM_VORTEX)/lib:$(LD_LIBRARY_PATH) LLVM_PREFIX=$(LLVM_VORTEX) POCL_DEBUG=all POCL_VORTEX_CFLAGS="$(K_CFLAGS)" POCL_VORTEX_LDFLAGS="$(K_LDFLAGS)" $(POCL_CC_PATH)/bin/poclcc -o kernel.pocl kernel.cl %.cc.o: %.cc $(CXX) $(CXXFLAGS) -c $< -o $@ @@ -87,6 +87,9 @@ kernel.pocl: kernel.cl $(PROJECT): $(OBJS) $(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@ +run-hostgpu: $(PROJECT) kernel.pocl + ./$(PROJECT) $(OPTS) + run-simx: $(PROJECT) kernel.pocl LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_RT_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS) diff --git a/tests/opencl/matmul/Makefile b/tests/opencl/sgemm2/Makefile similarity index 75% rename from tests/opencl/matmul/Makefile rename to tests/opencl/sgemm2/Makefile index 39b92b36..f507d9ed 100644 --- a/tests/opencl/matmul/Makefile +++ b/tests/opencl/sgemm2/Makefile @@ -1,4 +1,4 @@ -PROJECT = matmul +PROJECT = sgemm2 SRCS = main.cc diff --git a/tests/opencl/matmul/kernel.cl b/tests/opencl/sgemm2/kernel.cl similarity index 92% rename from tests/opencl/matmul/kernel.cl rename to tests/opencl/sgemm2/kernel.cl index 02aa074c..6a764820 100644 --- a/tests/opencl/matmul/kernel.cl +++ b/tests/opencl/sgemm2/kernel.cl @@ -1,4 +1,4 @@ -__kernel void matmul(__global float *A, +__kernel void sgemm2(__global float *A, __global float *B, __global float *C, const unsigned int N, @@ -28,12 +28,15 @@ __kernel void matmul(__global float *A, for (int j = 0; j < localSize; j++) { sum += localA[localRow * localSize + j] * localB[j * localSize + localCol]; } + + // Ensure computation is done before loading next block + barrier(CLK_LOCAL_MEM_FENCE); } C[globalRow * N + globalCol] = sum; } -/*__kernel void matmul(__global float *A, +/*__kernel void sgemm2(__global float *A, __global float *B, __global float *C, const unsigned int N) diff --git a/tests/opencl/matmul/main.cc b/tests/opencl/sgemm2/main.cc similarity index 99% rename from tests/opencl/matmul/main.cc rename to tests/opencl/sgemm2/main.cc index 3d26ff0c..21ec3a54 100644 --- a/tests/opencl/matmul/main.cc +++ b/tests/opencl/sgemm2/main.cc @@ -12,7 +12,7 @@ #define FLOAT_ULP 6 -#define KERNEL_NAME "matmul" +#define KERNEL_NAME "sgemm2" #define CL_CHECK(_expr) \ do { \ diff --git a/tests/regression/Makefile b/tests/regression/Makefile index 89fa25af..d44c82c4 100644 --- a/tests/regression/Makefile +++ b/tests/regression/Makefile @@ -10,7 +10,8 @@ all: $(MAKE) -C fence $(MAKE) -C no_mf_ext $(MAKE) -C no_smem - $(MAKE) -C tensor + $(MAKE) -C vecaddx + $(MAKE) -C sgemmx run-simx: $(MAKE) -C basic run-simx @@ -24,7 +25,8 @@ run-simx: $(MAKE) -C fence run-simx $(MAKE) -C no_mf_ext run-simx $(MAKE) -C no_smem run-simx - $(MAKE) -C tensor run-simx + $(MAKE) -C vecaddx run-simx + $(MAKE) -C sgemmx run-simx run-rtlsim: $(MAKE) -C basic run-rtlsim @@ -38,7 +40,8 @@ run-rtlsim: $(MAKE) -C fence run-rtlsim $(MAKE) -C no_mf_ext run-rtlsim $(MAKE) -C no_smem run-rtlsim - $(MAKE) -C tensor run-rtlsim + $(MAKE) -C vecaddx run-rtlsim + $(MAKE) -C sgemmx run-rtlsim run-opae: $(MAKE) -C basic run-opae @@ -52,7 +55,8 @@ run-opae: $(MAKE) -C fence run-opae $(MAKE) -C no_mf_ext run-opae $(MAKE) -C no_smem run-opae - $(MAKE) -C tensor run-opae + $(MAKE) -C vecaddx run-opae + $(MAKE) -C sgemmx run-opae clean: $(MAKE) -C basic clean @@ -66,7 +70,8 @@ clean: $(MAKE) -C fence clean $(MAKE) -C no_mf_ext clean $(MAKE) -C no_smem clean - $(MAKE) -C tensor clean + $(MAKE) -C vecaddx clean + $(MAKE) -C sgemmx clean clean-all: $(MAKE) -C basic clean-all @@ -80,4 +85,5 @@ clean-all: $(MAKE) -C fence clean-all $(MAKE) -C no_mf_ext clean-all $(MAKE) -C no_smem clean-all - $(MAKE) -C tensor clean-all + $(MAKE) -C vecaddx clean-all + $(MAKE) -C sgemmx clean-all diff --git a/tests/regression/tensor/Makefile b/tests/regression/sgemmx/Makefile similarity index 81% rename from tests/regression/tensor/Makefile rename to tests/regression/sgemmx/Makefile index dbb70c3b..2e72b32e 100644 --- a/tests/regression/tensor/Makefile +++ b/tests/regression/sgemmx/Makefile @@ -1,4 +1,4 @@ -PROJECT = tensor +PROJECT = sgemmx SRCS = main.cpp diff --git a/tests/regression/tensor/common.h b/tests/regression/sgemmx/common.h similarity index 100% rename from tests/regression/tensor/common.h rename to tests/regression/sgemmx/common.h diff --git a/tests/regression/tensor/kernel.cpp b/tests/regression/sgemmx/kernel.cpp similarity index 100% rename from tests/regression/tensor/kernel.cpp rename to tests/regression/sgemmx/kernel.cpp diff --git a/tests/regression/tensor/main.cpp b/tests/regression/sgemmx/main.cpp similarity index 95% rename from tests/regression/tensor/main.cpp rename to tests/regression/sgemmx/main.cpp index 81103c10..23008011 100644 --- a/tests/regression/tensor/main.cpp +++ b/tests/regression/sgemmx/main.cpp @@ -2,6 +2,7 @@ #include #include #include +#include #include #include "common.h" @@ -122,43 +123,6 @@ void cleanup() { } } -int run_test(const kernel_arg_t& kernel_arg, - uint32_t buf_size, - const std::vector& refs) { - // start device - std::cout << "start device" << std::endl; - RT_CHECK(vx_start(device)); - - // wait for completion - std::cout << "wait for completion" << std::endl; - RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT)); - - // download destination buffer - std::cout << "download destination buffer" << std::endl; - RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.C_addr, buf_size)); - - // verify result - std::cout << "verify result" << std::endl; - { - int errors = 0; - auto buf_ptr = (TYPE*)staging_buf.data(); - for (uint32_t i = 0; i < refs.size(); ++i) { - auto ref = refs[i]; - auto cur = buf_ptr[i]; - if (!Comparator::compare(cur, ref, i, errors)) { - ++errors; - } - } - if (errors != 0) { - std::cout << "Found " << std::dec << errors << " errors!" << std::endl; - std::cout << "FAILED!" << std::endl; - return 1; - } - } - - return 0; -} - int main(int argc, char *argv[]) { // parse command arguments parse_args(argc, argv); @@ -239,10 +203,43 @@ int main(int argc, char *argv[]) { std::cout << "clear destination buffer" << std::endl; memset(staging_buf.data(), 0, num_points * sizeof(TYPE)); RT_CHECK(vx_copy_to_dev(device, kernel_arg.C_addr, staging_buf.data(), buf_size)); + + auto time_start = std::chrono::high_resolution_clock::now(); - // run tests - std::cout << "run tests" << std::endl; - RT_CHECK(run_test(kernel_arg, buf_size, refs)); + // start device + std::cout << "start device" << std::endl; + RT_CHECK(vx_start(device)); + + // wait for completion + std::cout << "wait for completion" << std::endl; + RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT)); + + auto time_end = std::chrono::high_resolution_clock::now(); + double elapsed = std::chrono::duration_cast(time_end - time_start).count(); + printf("Elapsed time: %lg ms\n", elapsed); + + // download destination buffer + std::cout << "download destination buffer" << std::endl; + RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.C_addr, buf_size)); + + // verify result + std::cout << "verify result" << std::endl; + { + int errors = 0; + auto buf_ptr = (TYPE*)staging_buf.data(); + for (uint32_t i = 0; i < refs.size(); ++i) { + auto ref = refs[i]; + auto cur = buf_ptr[i]; + if (!Comparator::compare(cur, ref, i, errors)) { + ++errors; + } + } + if (errors != 0) { + std::cout << "Found " << std::dec << errors << " errors!" << std::endl; + std::cout << "FAILED!" << std::endl; + return 1; + } + } // cleanup std::cout << "cleanup" << std::endl; diff --git a/tests/regression/vecaddx/Makefile b/tests/regression/vecaddx/Makefile new file mode 100644 index 00000000..af43d3c7 --- /dev/null +++ b/tests/regression/vecaddx/Makefile @@ -0,0 +1,9 @@ +PROJECT = vecaddx + +SRCS = main.cpp + +VX_SRCS = kernel.cpp + +OPTS ?= -n64 + +include ../common.mk \ No newline at end of file diff --git a/tests/regression/vecaddx/common.h b/tests/regression/vecaddx/common.h new file mode 100644 index 00000000..2b8f164a --- /dev/null +++ b/tests/regression/vecaddx/common.h @@ -0,0 +1,17 @@ +#ifndef _COMMON_H_ +#define _COMMON_H_ + +#define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000 + +#ifndef TYPE +#define TYPE float +#endif + +typedef struct { + uint32_t num_points; + uint64_t src0_addr; + uint64_t src1_addr; + uint64_t dst_addr; +} kernel_arg_t; + +#endif diff --git a/tests/regression/vecaddx/kernel.cpp b/tests/regression/vecaddx/kernel.cpp new file mode 100644 index 00000000..6ed42164 --- /dev/null +++ b/tests/regression/vecaddx/kernel.cpp @@ -0,0 +1,18 @@ +#include +#include +#include +#include "common.h" + +void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) { + auto src0_ptr = reinterpret_cast(arg->src0_addr); + auto src1_ptr = reinterpret_cast(arg->src1_addr); + auto dst_ptr = reinterpret_cast(arg->dst_addr); + + dst_ptr[task_id] = src0_ptr[task_id] + src1_ptr[task_id]; +} + +int main() { + kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR; + vx_spawn_tasks(arg->num_points, (vx_spawn_tasks_cb)kernel_body, arg); + return 0; +} diff --git a/tests/regression/vecaddx/main.cpp b/tests/regression/vecaddx/main.cpp new file mode 100644 index 00000000..117f3470 --- /dev/null +++ b/tests/regression/vecaddx/main.cpp @@ -0,0 +1,246 @@ +#include +#include +#include +#include +#include +#include "common.h" + +#define FLOAT_ULP 6 + +#define RT_CHECK(_expr) \ + do { \ + int _ret = _expr; \ + if (0 == _ret) \ + break; \ + printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \ + cleanup(); \ + exit(-1); \ + } while (false) + +/////////////////////////////////////////////////////////////////////////////// + +template +class Comparator {}; + +template <> +class Comparator { +public: + static const char* type_str() { + return "integer"; + } + static int generate() { + return rand(); + } + static bool compare(int a, int b, int index, int errors) { + if (a != b) { + if (errors < 100) { + printf("*** error: [%d] expected=%d, actual=%d\n", index, a, b); + } + return false; + } + return true; + } +}; + +template <> +class Comparator { +private: + union Float_t { float f; int i; }; +public: + static const char* type_str() { + return "float"; + } + static int generate() { + return static_cast(rand()) / RAND_MAX; + } + static bool compare(float a, float b, int index, int errors) { + union fi_t { float f; int32_t i; }; + fi_t fa, fb; + fa.f = a; + fb.f = b; + auto d = std::abs(fa.i - fb.i); + if (d > FLOAT_ULP) { + if (errors < 100) { + printf("*** error: [%d] expected=%f, actual=%f\n", index, a, b); + } + return false; + } + return true; + } +}; + +const char* kernel_file = "kernel.bin"; +uint32_t size = 16; + +vx_device_h device = nullptr; +std::vector source_data; +std::vector staging_buf; +kernel_arg_t kernel_arg = {}; + +static void show_usage() { + std::cout << "Vortex Test." << std::endl; + std::cout << "Usage: [-k: kernel] [-n words] [-h: help]" << std::endl; +} + +static void parse_args(int argc, char **argv) { + int c; + while ((c = getopt(argc, argv, "n:k:h?")) != -1) { + switch (c) { + case 'n': + size = atoi(optarg); + break; + case 'k': + kernel_file = optarg; + break; + case 'h': + case '?': { + show_usage(); + exit(0); + } break; + default: + show_usage(); + exit(-1); + } + } +} + +void cleanup() { + if (device) { + vx_mem_free(device, kernel_arg.src0_addr); + vx_mem_free(device, kernel_arg.src1_addr); + vx_mem_free(device, kernel_arg.dst_addr); + vx_dev_close(device); + } +} + +int run_test(const kernel_arg_t& kernel_arg, + uint32_t buf_size, + uint32_t num_points) { + // start device + std::cout << "start device" << std::endl; + RT_CHECK(vx_start(device)); + + // wait for completion + std::cout << "wait for completion" << std::endl; + RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT)); + + // download destination buffer + std::cout << "download destination buffer" << std::endl; + RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.dst_addr, buf_size)); + + // verify result + std::cout << "verify result" << std::endl; + { + int errors = 0; + auto buf_ptr = (TYPE*)staging_buf.data(); + for (uint32_t i = 0; i < num_points; ++i) { + auto ref = source_data[2 * i + 0] + source_data[2 * i + 1]; + auto cur = buf_ptr[i]; + if (!Comparator::compare(cur, ref, i, errors)) { + ++errors; + } + } + if (errors != 0) { + std::cout << "Found " << std::dec << errors << " errors!" << std::endl; + std::cout << "FAILED!" << std::endl; + return 1; + } + } + + return 0; +} + +int main(int argc, char *argv[]) { + // parse command arguments + parse_args(argc, argv); + + std::srand(50); + + // open device connection + std::cout << "open device connection" << std::endl; + RT_CHECK(vx_dev_open(&device)); + + uint64_t num_cores, num_warps, num_threads; + RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_CORES, &num_cores)); + RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_WARPS, &num_warps)); + RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_THREADS, &num_threads)); + std::cout << "number of cores: " << num_cores << std::endl; + std::cout << "number of warps: " << num_warps << std::endl; + std::cout << "number of threads: " << num_threads << std::endl; + + uint32_t num_points = size; + uint32_t buf_size = num_points * sizeof(TYPE); + + std::cout << "number of points: " << num_points << std::endl; + std::cout << "data type: " << Comparator::type_str() << std::endl; + std::cout << "buffer size: " << buf_size << " bytes" << std::endl; + + // upload program + std::cout << "upload program" << std::endl; + RT_CHECK(vx_upload_kernel_file(device, kernel_file)); + + // allocate device memory + std::cout << "allocate device memory" << std::endl; + RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.src0_addr)); + RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.src1_addr)); + RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.dst_addr)); + + kernel_arg.num_points = num_points; + + std::cout << "dev_src0=0x" << std::hex << kernel_arg.src0_addr << std::endl; + std::cout << "dev_src1=0x" << std::hex << kernel_arg.src1_addr << std::endl; + std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::endl; + + // allocate staging buffer + std::cout << "allocate staging buffer" << std::endl; + uint32_t alloc_size = std::max(buf_size, sizeof(kernel_arg_t)); + staging_buf.resize(alloc_size); + + // upload kernel argument + std::cout << "upload kernel argument" << std::endl; + memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t)); + RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t))); + + // generate source data + source_data.resize(2 * num_points); + for (uint32_t i = 0; i < source_data.size(); ++i) { + source_data[i] = Comparator::generate(); + } + + // upload source buffer0 + { + std::cout << "upload source buffer0" << std::endl; + auto buf_ptr = (TYPE*)staging_buf.data(); + for (uint32_t i = 0; i < num_points; ++i) { + buf_ptr[i] = source_data[2 * i + 0]; + } + RT_CHECK(vx_copy_to_dev(device, kernel_arg.src0_addr, staging_buf.data(), buf_size)); + } + + // upload source buffer1 + { + std::cout << "upload source buffer1" << std::endl; + auto buf_ptr = (TYPE*)staging_buf.data(); + for (uint32_t i = 0; i < num_points; ++i) { + buf_ptr[i] = source_data[2 * i + 1]; + } + RT_CHECK(vx_copy_to_dev(device, kernel_arg.src1_addr, staging_buf.data(), buf_size)); + } + + // clear destination buffer + std::cout << "clear destination buffer" << std::endl; + memset(staging_buf.data(), 0, num_points * sizeof(TYPE)); + RT_CHECK(vx_copy_to_dev(device, kernel_arg.dst_addr, staging_buf.data(), buf_size)); + + // run tests + std::cout << "run tests" << std::endl; + RT_CHECK(run_test(kernel_arg, buf_size, num_points)); + + // cleanup + std::cout << "cleanup" << std::endl; + cleanup(); + + std::cout << "PASSED!" << std::endl; + + return 0; +} \ No newline at end of file