From 5d088d67c8686a9bba763936f8777cbd1d06087f Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Wed, 1 Jul 2020 09:30:12 -0700 Subject: [PATCH] Gather FPGA perf stats --- benchmarks/opencl/vecadd/main.cc | 22 ----------------- driver/include/vortex.h | 4 ++-- driver/opae/vortex.cpp | 40 +++++++++++++++++++++++++------ driver/tests/demo/kernel.bin | Bin 6812 -> 6852 bytes hw/opae/vortex_afu.sv | 7 +++++- hw/rtl/VX_alu_unit.v | 2 +- hw/rtl/VX_csr_arb.v | 21 +++++++--------- hw/rtl/VX_csr_pipe.v | 1 - hw/rtl/VX_decode.v | 2 +- hw/rtl/VX_gpr_stage.v | 1 - hw/rtl/libs/VX_divide.v | 2 +- hw/rtl/libs/VX_mult.v | 4 ++-- 12 files changed, 55 insertions(+), 51 deletions(-) diff --git a/benchmarks/opencl/vecadd/main.cc b/benchmarks/opencl/vecadd/main.cc index 237924ba..68c9675a 100644 --- a/benchmarks/opencl/vecadd/main.cc +++ b/benchmarks/opencl/vecadd/main.cc @@ -30,28 +30,6 @@ _ret; \ }) -/*#include -#ifdef __cplusplus -extern "C" { -#endif -int _pocl_register_kernel(const char* name, const void* pfn, uint32_t num_args, uint32_t num_locals, const uint8_t* arg_types, const uint32_t* local_sizes); -void _pocl_kernel_vecadd_workgroup(uint8_t* args, uint8_t*, uint32_t, uint32_t, uint32_t); -#ifdef __cplusplus -} -#endif - -namespace { -class auto_register_kernel_t { -public: - auto_register_kernel_t() { - static uint8_t arg_types[] = {1, 1, 1}; - static uint32_t local_sizes[] = {}; - _pocl_register_kernel("vecadd", (void*)_pocl_kernel_vecadd_workgroup, 3, 0, arg_types, local_sizes); - } -}; -static auto_register_kernel_t __x__; -}*/ - int exitcode = 0; cl_context context = NULL; cl_command_queue commandQueue = NULL; diff --git a/driver/include/vortex.h b/driver/include/vortex.h index d2a00a3f..a9597253 100644 --- a/driver/include/vortex.h +++ b/driver/include/vortex.h @@ -58,10 +58,10 @@ int vx_start(vx_device_h hdevice); int vx_ready_wait(vx_device_h hdevice, long long timeout); // set device constant registers -int vx_csr_set(vx_device_h hdevice, int core, int address, int value); +int vx_csr_set(vx_device_h hdevice, int core, int address, unsigned value); // get device constant registers -int vx_csr_get(vx_device_h hdevice, int core, int address, int* value); +int vx_csr_get(vx_device_h hdevice, int core, int address, unsigned* value); ////////////////////////////// UTILITY FUNCIONS /////////////////////////////// diff --git a/driver/opae/vortex.cpp b/driver/opae/vortex.cpp index baac1204..07e0385c 100755 --- a/driver/opae/vortex.cpp +++ b/driver/opae/vortex.cpp @@ -53,10 +53,10 @@ typedef struct vx_device_ { fpga_handle fpga; size_t mem_allocation; - int implementation_id; - int num_cores; - int num_warps; - int num_threads; + unsigned implementation_id; + unsigned num_cores; + unsigned num_warps; + unsigned num_threads; } vx_device_t; typedef struct vx_buffer_ { @@ -181,6 +181,9 @@ extern int vx_dev_open(vx_device_h* hdevice) { fpgaClose(accel_handle); return ret; } + + fprintf(stdout, "DEVCAPS: version=%d, num_cores=%d, num_warps=%d, num_threads=%d\n", + device->implementation_id, device->num_cores, device->num_warps, device->num_threads); } #ifdef SCOPE @@ -208,6 +211,29 @@ extern int vx_dev_close(vx_device_h hdevice) { vx_scope_stop(device->fpga, 0); #endif + { + // Dump performance stats + uint64_t instrs, cycles; + unsigned value; + + int ret = 0; + ret |= vx_csr_get(hdevice, 0, CSR_INSTR_H, &value); + instrs = value; + ret |= vx_csr_get(hdevice, 0, CSR_INSTR_L, &value); + instrs = (instrs << 32) | value; + + ret |= vx_csr_get(hdevice, 0, CSR_CYCLE_H, &value); + cycles = value; + ret |= vx_csr_get(hdevice, 0, CSR_CYCLE_L, &value); + cycles = (cycles << 32) | value; + + float IPC = (float)(double(instrs) / double(cycles)); + + fprintf(stdout, "PERF: instrs=%ld, cycles=%ld, IPC=%f\n", instrs, cycles, IPC); + + assert(ret == 0); + } + fpgaClose(device->fpga); free(device); @@ -468,7 +494,7 @@ extern int vx_start(vx_device_h hdevice) { } // set device constant registers -extern int vx_csr_set(vx_device_h hdevice, int core, int address, int value) { +extern int vx_csr_set(vx_device_h hdevice, int core, int address, unsigned value) { if (nullptr == hdevice) return -1; @@ -488,7 +514,7 @@ extern int vx_csr_set(vx_device_h hdevice, int core, int address, int value) { } // get device constant registers -extern int vx_csr_get(vx_device_h hdevice, int core, int address, int* value) { +extern int vx_csr_get(vx_device_h hdevice, int core, int address, unsigned* value) { if (nullptr == hdevice || nullptr == value) return -1; @@ -510,7 +536,7 @@ extern int vx_csr_get(vx_device_h hdevice, int core, int address, int* value) { uint64_t value64; CHECK_RES(fpgaReadMMIO64(device->fpga, 0, MMIO_CSR_READ, &value64)); - *value = (int)value64; + *value = (unsigned)value64; return 0; } \ No newline at end of file diff --git a/driver/tests/demo/kernel.bin b/driver/tests/demo/kernel.bin index aea7c0615c62b4a6f8e72b230346e4d7ff1681dd..06939d4c46190f07fb214341beca87cb3a6ec952 100644 GIT binary patch delta 337 zcmZ9GF-yZx6h^-!@1+C;+diArQbn37W>CaXItdYs;x33T6*`3qMg0S>v{BdaL_0Y9 z2ZHF}(5l^BI}5rBT|{tEUphKm&T!A=p6kkUrFKOD5&G<8{y=S{Z!vwv1l)6&~|*Z%ONypkUW?alq8&qUXIoNF!mHvQ3hgHU7usROiFUUa;SH7 zxO4z9%z>FE-Y@BoxKK9>uW#cLS#f89SEuAdvJr^0tw4OL`?Dlqh7q{Nw>OS0MPPf5 tSBIf8LIQE9mCQuh9xMM=`S?M*&rWov!9V2e6u`xi-i`AX*GSg{{{yW|SF8X4 delta 300 zcmX?NI>$7AIx7Ri|`gjod`vKd$zgj)|D5^iHWY|OUIVKPwN zdxi!paUdpJuzNctU+or zI|D;<2Rnl~D?JL&s6` diff --git a/hw/opae/vortex_afu.sv b/hw/opae/vortex_afu.sv index 89ee968a..5f13fd77 100644 --- a/hw/opae/vortex_afu.sv +++ b/hw/opae/vortex_afu.sv @@ -881,7 +881,6 @@ assign vx_csr_io_req_rw = (STATE_CSR_WRITE == state); assign vx_csr_io_req_addr = cmd_csr_addr; assign vx_csr_io_req_data = cmd_csr_wdata; -assign cmd_csr_rdata = vx_csr_io_rsp_data; assign vx_csr_io_rsp_ready = 1; assign cmd_csr_done = (STATE_CSR_WRITE == state) ? vx_csr_io_req_ready : vx_csr_io_rsp_valid; @@ -890,6 +889,7 @@ always_ff @(posedge clk) begin if (SoftReset) begin csr_io_req_sent <= 0; + cmd_csr_rdata <= 0; end else begin if (vx_csr_io_req_valid && vx_csr_io_req_ready) begin @@ -898,6 +898,11 @@ begin if (cmd_csr_done) begin csr_io_req_sent <= 0; end + if ((STATE_CSR_READ == state) + && vx_csr_io_rsp_ready + && vx_csr_io_rsp_valid) begin + cmd_csr_rdata <= vx_csr_io_rsp_data; + end end end diff --git a/hw/rtl/VX_alu_unit.v b/hw/rtl/VX_alu_unit.v index b0d0c20f..39c1334f 100644 --- a/hw/rtl/VX_alu_unit.v +++ b/hw/rtl/VX_alu_unit.v @@ -119,7 +119,7 @@ module VX_alu_unit ( VX_mult #( .WIDTHA(33), .WIDTHB(33), - .WIDTHP(64), + .WIDTHP(66), .SIGNED(1), .PIPELINE(`MUL_LATENCY) ) multiplier ( diff --git a/hw/rtl/VX_csr_arb.v b/hw/rtl/VX_csr_arb.v index 5703d9ac..84dafa8c 100644 --- a/hw/rtl/VX_csr_arb.v +++ b/hw/rtl/VX_csr_arb.v @@ -18,24 +18,21 @@ module VX_csr_arb ( `UNUSED_VAR (clk) `UNUSED_VAR (reset) - wire pick_core = (| csr_core_req_if.valid); - - // Which request to pick - assign issued_csr_req_if.is_io = !pick_core; + wire pick_core = (| csr_core_req_if.valid); // Mux between core and io assign issued_csr_req_if.valid = pick_core ? csr_core_req_if.valid : {`NUM_THREADS{csr_io_req_if.valid}}; assign issued_csr_req_if.is_csr = pick_core ? csr_core_req_if.is_csr : 1'b1; assign issued_csr_req_if.alu_op = pick_core ? csr_core_req_if.alu_op : (csr_io_req_if.rw ? `ALU_CSR_RW : `ALU_CSR_RS); - assign issued_csr_req_if.csr_address = pick_core ? csr_core_req_if.csr_address : csr_io_req_if.addr; - assign issued_csr_req_if.csr_mask = pick_core ? csr_core_req_if.csr_mask : (csr_io_req_if.rw ? csr_io_req_if.data : 32'b0); - - assign csr_io_req_if.ready = !(csr_pipe_stall || pick_core); - - // Core arguments - assign issued_csr_req_if.warp_num = csr_core_req_if.warp_num; + assign issued_csr_req_if.csr_address = pick_core ? csr_core_req_if.csr_address : csr_io_req_if.addr; + assign issued_csr_req_if.csr_immed = pick_core ? csr_core_req_if.csr_immed : 0; + assign issued_csr_req_if.csr_mask = pick_core ? csr_core_req_if.csr_mask : (csr_io_req_if.rw ? csr_io_req_if.data : 32'b0); + assign issued_csr_req_if.is_io = !pick_core; + assign issued_csr_req_if.warp_num = csr_core_req_if.warp_num; assign issued_csr_req_if.rd = csr_core_req_if.rd; - assign issued_csr_req_if.wb = csr_core_req_if.wb; + assign issued_csr_req_if.wb = csr_core_req_if.wb; + + assign csr_io_req_if.ready = !(csr_pipe_stall || pick_core); // Core Writeback assign csr_wb_if.valid = csr_pipe_rsp_if.valid & {`NUM_THREADS{~csr_pipe_rsp_if.is_io}}; diff --git a/hw/rtl/VX_csr_pipe.v b/hw/rtl/VX_csr_pipe.v index dd80a811..04185d40 100644 --- a/hw/rtl/VX_csr_pipe.v +++ b/hw/rtl/VX_csr_pipe.v @@ -38,7 +38,6 @@ module VX_csr_pipe #( .wb_valid (| writeback_if.valid) ); - // wire hazard = (csr_address_s2 == csr_req_if.csr_address) & (warp_num_s2 == csr_req_if.warp_num) & |(valid_s2) & is_csr_s2; wire car_hazard = (csr_address_s2 == csr_req_if.csr_address) & (warp_num_s2 == csr_req_if.warp_num) & |(valid_s2) & is_csr_s2; assign csr_read_data = car_hazard ? csr_updated_data_s2 : csr_read_data_unqual; diff --git a/hw/rtl/VX_decode.v b/hw/rtl/VX_decode.v index 0c8d4410..1c001482 100644 --- a/hw/rtl/VX_decode.v +++ b/hw/rtl/VX_decode.v @@ -104,7 +104,7 @@ module VX_decode( assign is_lui = (curr_opcode == `INST_LUI); assign is_auipc = (curr_opcode == `INST_AUIPC); assign is_csr = (curr_opcode == `INST_SYS) && (func3 != 0); - assign is_csr_immed = (is_csr) && (func3[2] == 1); + assign is_csr_immed = is_csr && (func3[2] == 1); assign is_gpgpu = (curr_opcode == `INST_GPGPU); diff --git a/hw/rtl/VX_gpr_stage.v b/hw/rtl/VX_gpr_stage.v index 5c7c628d..aed5b7e5 100644 --- a/hw/rtl/VX_gpr_stage.v +++ b/hw/rtl/VX_gpr_stage.v @@ -30,7 +30,6 @@ module VX_gpr_stage ( wire is_jal = bckE_req_if.is_jal; `DEBUG_END - assign csr_req_if.is_io = 1'b0; // GPR only issues csr requests coming from core VX_gpr_read_if gpr_read_if(); diff --git a/hw/rtl/libs/VX_divide.v b/hw/rtl/libs/VX_divide.v index 09da8dd8..f32aeb13 100644 --- a/hw/rtl/libs/VX_divide.v +++ b/hw/rtl/libs/VX_divide.v @@ -35,7 +35,7 @@ module VX_divide #( quartus_div.lpm_widthd = WIDTHD, quartus_div.lpm_nrepresentation = NSIGNED ? "SIGNED" : "UNSIGNED", quartus_div.lpm_drepresentation = DSIGNED ? "SIGNED" : "UNSIGNED", - quartus_div.lpm_hint = "LPM_REMAINDERPOSITIVE=FALSE,MAXIMIZE_SPEED=9", + quartus_div.lpm_hint = "MAXIMIZE_SPEED=6,LPM_REMAINDERPOSITIVE=FALSE", quartus_div.lpm_pipeline = PIPELINE; `else diff --git a/hw/rtl/libs/VX_mult.v b/hw/rtl/libs/VX_mult.v index 466f4213..16d47d6d 100644 --- a/hw/rtl/libs/VX_mult.v +++ b/hw/rtl/libs/VX_mult.v @@ -23,9 +23,9 @@ module VX_mult #( .dataa (dataa), .datab (datab), .result (result), + .sclr (reset), .aclr (1'b0), .clken (1'b1), - .sclr (1'b0), .sum (1'b0) ); @@ -35,7 +35,7 @@ module VX_mult #( quartus_mult.lpm_widthp = WIDTHP, quartus_mult.lpm_representation = SIGNED ? "SIGNED" : "UNSIGNED", quartus_mult.lpm_pipeline = PIPELINE, - quartus_mult.lpm_hint = "MAXIMIZE_SPEED=9"; + quartus_mult.lpm_hint = "DEDICATED_MULTIPLIER_CIRCUITRY=YES,MAXIMIZE_SPEED=9"; `else wire [WIDTHP-1:0] result_unqual;