This commit is contained in:
felsabbagh3
2020-06-29 23:00:53 -07:00
18 changed files with 160 additions and 47 deletions

View File

@@ -1,23 +1,34 @@
# Vortex RISC-V GPGPU # Vortex RISC-V GPGPU
Vortex currently supports the RISC-V RV32I ISA Vortex is a full-system RISC-V-based GPGPU processor.
/benchmarks contains test benchmarks Specifications
--------------
/docs contains documentation. - Support RISC-V RV32I ISA
- Fully scalable: 1 to 16 cores with optional L2 and L3 caches
- OpenCL 1.2 Support
- FPGA target: Intel Arria 10 @ 200 MHz peak Freq
/hw contains hardware sources. Directory structure
-------------------
/driver contains the driver software. - benchmarks: OpenCL and RISC-V benchmarks
/runtime contains the kernel runtime software. - docs: documentation.
/SimX contains a cycle-approximate simulator for Vortex. - hw: hardware sources.
/evaluation contains the synthesis/runtime reports. - driver: driver software.
Basic Instructions to run OpenCL Benchmarks on Vortex - runtime: runtime software for kernels.
-----------------------------------------------------
- simX: Vortex cycle-approximate simulator.
- evaluation: synthesis and performance data.
Basic Installation
------------------
Install development tools Install development tools
@@ -55,8 +66,8 @@ Install Vortex
$ cd Vortex $ cd Vortex
$ make $ make
Run SGEMM OpenCL Benchmark Quick Test running SGEMM kernel
$ cd Vortex/benchmarks/opencl/sgemm $ cd /Vortex/benchmarks/opencl/sgemm
$ make $ make
$ make run $ make run

View File

@@ -13,11 +13,11 @@ DBG_PRINT_FLAGS += -DDBG_PRINT_DRAM
DBG_PRINT_FLAGS += -DDBG_PRINT_PIPELINE DBG_PRINT_FLAGS += -DDBG_PRINT_PIPELINE
DBG_PRINT_FLAGS += -DDBG_PRINT_OPAE DBG_PRINT_FLAGS += -DDBG_PRINT_OPAE
DBG_FLAGS += $(DBG_PRINT_FLAGS) #DBG_FLAGS += $(DBG_PRINT_FLAGS)
DBG_FLAGS += -DDBG_CORE_REQ_INFO DBG_FLAGS += -DDBG_CORE_REQ_INFO
#CONFIGS += -DNUM_CLUSTERS=2 -DNUM_CORES=4 #CONFIGS += -DNUM_CLUSTERS=2 -DNUM_CORES=4 -DL2_ENABLE=1
#CONFIGS += -DNUM_CLUSTERS=1 -DNUM_CORES=4 #CONFIGS += -DNUM_CLUSTERS=1 -DNUM_CORES=4 -DL2_ENABLE=1
#CONFIGS += -DNUM_CLUSTERS=1 -DNUM_CORES=2 #CONFIGS += -DNUM_CLUSTERS=1 -DNUM_CORES=2
#DEBUG=1 #DEBUG=1
@@ -38,8 +38,7 @@ RTL_INCLUDE = -I../../hw/rtl -I../../hw/rtl/libs -I../../hw/rtl/interfaces -I../
VL_FLAGS += --language 1800-2009 --assert -Wall -Wpedantic $(CONFIGS) VL_FLAGS += --language 1800-2009 --assert -Wall -Wpedantic $(CONFIGS)
VL_FLAGS += -Wno-DECLFILENAME VL_FLAGS += -Wno-DECLFILENAME
VL_FLAGS += --x-initial unique VL_FLAGS += --x-initial unique --x-assign unique
VL_FLAGS += --x-assign unique
# Enable Verilator multithreaded simulation # Enable Verilator multithreaded simulation
#THREADS ?= $(shell python3 -c 'import multiprocessing as mp; print(max(1, mp.cpu_count() // 2))') #THREADS ?= $(shell python3 -c 'import multiprocessing as mp; print(max(1, mp.cpu_count() // 2))')

View File

@@ -2,7 +2,7 @@ ASE_BUILD_DIR=build_ase
FPGA_BUILD_DIR=build_fpga FPGA_BUILD_DIR=build_fpga
all: ase-2c all: ase-1c
ase-1c: setup-ase-1c ase-1c: setup-ase-1c
make -C $(ASE_BUILD_DIR)_1c make -C $(ASE_BUILD_DIR)_1c
@@ -10,47 +10,72 @@ ase-1c: setup-ase-1c
ase-2c: setup-ase-2c ase-2c: setup-ase-2c
make -C $(ASE_BUILD_DIR)_2c make -C $(ASE_BUILD_DIR)_2c
ase-4c: setup-ase-4c
make -C $(ASE_BUILD_DIR)_4c
setup-ase-1c: $(ASE_BUILD_DIR)_1c/Makefile setup-ase-1c: $(ASE_BUILD_DIR)_1c/Makefile
setup-ase-2c: $(ASE_BUILD_DIR)_2c/Makefile setup-ase-2c: $(ASE_BUILD_DIR)_2c/Makefile
setup-ase-4c: $(ASE_BUILD_DIR)_4c/Makefile
$(ASE_BUILD_DIR)_1c/Makefile: $(ASE_BUILD_DIR)_1c/Makefile:
afu_sim_setup -s sources_1c.txt $(ASE_BUILD_DIR)_1c afu_sim_setup -s sources_1c.txt $(ASE_BUILD_DIR)_1c
$(ASE_BUILD_DIR)_2c/Makefile: $(ASE_BUILD_DIR)_2c/Makefile:
afu_sim_setup -s sources_2c.txt $(ASE_BUILD_DIR)_2c afu_sim_setup -s sources_2c.txt $(ASE_BUILD_DIR)_2c
$(ASE_BUILD_DIR)_4c/Makefile:
afu_sim_setup -s sources_4c.txt $(ASE_BUILD_DIR)_4c
fpga-1c: setup-fpga-1c fpga-1c: setup-fpga-1c
cd $(FPGA_BUILD_DIR)_1c && qsub-synth cd $(FPGA_BUILD_DIR)_1c && qsub-synth
fpga-2c: setup-fpga-2c fpga-2c: setup-fpga-2c
cd $(FPGA_BUILD_DIR)_2c && qsub-synth cd $(FPGA_BUILD_DIR)_2c && qsub-synth
fpga-4c: setup-fpga-4c
cd $(FPGA_BUILD_DIR)_4c && qsub-synth
setup-fpga-1c: $(FPGA_BUILD_DIR)_1c/build/dcp.qpf setup-fpga-1c: $(FPGA_BUILD_DIR)_1c/build/dcp.qpf
setup-fpga-2c: $(FPGA_BUILD_DIR)_2c/build/dcp.qpf setup-fpga-2c: $(FPGA_BUILD_DIR)_2c/build/dcp.qpf
setup-fpga-4c: $(FPGA_BUILD_DIR)_4c/build/dcp.qpf
$(FPGA_BUILD_DIR)_1c/build/dcp.qpf: $(FPGA_BUILD_DIR)_1c/build/dcp.qpf:
afu_synth_setup -s sources_1c.txt $(FPGA_BUILD_DIR)_1c afu_synth_setup -s sources_1c.txt $(FPGA_BUILD_DIR)_1c
$(FPGA_BUILD_DIR)_2c/build/dcp.qpf: $(FPGA_BUILD_DIR)_2c/build/dcp.qpf:
afu_synth_setup -s sources_2c.txt $(FPGA_BUILD_DIR)_2c afu_synth_setup -s sources_2c.txt $(FPGA_BUILD_DIR)_2c
$(FPGA_BUILD_DIR)_4c/build/dcp.qpf:
afu_synth_setup -s sources_4c.txt $(FPGA_BUILD_DIR)_4c
run-ase-1c: run-ase-1c:
cd $(ASE_BUILD_DIR)_1c && make sim cd $(ASE_BUILD_DIR)_1c && make sim
run-ase-2c: run-ase-2c:
cd $(ASE_BUILD_DIR)_2c && make sim cd $(ASE_BUILD_DIR)_2c && make sim
run-ase-4c:
cd $(ASE_BUILD_DIR)_4c && make sim
clean-ase-1c: clean-ase-1c:
rm -rf $(ASE_BUILD_DIR)_1c rm -rf $(ASE_BUILD_DIR)_1c
clean-ase-2c: clean-ase-2c:
rm -rf $(ASE_BUILD_DIR)_2c rm -rf $(ASE_BUILD_DIR)_2c
clean-ase-4c:
rm -rf $(ASE_BUILD_DIR)_4c
clean-fpga-1c: clean-fpga-1c:
rm -rf $(FPGA_BUILD_DIR)_1c rm -rf $(FPGA_BUILD_DIR)_1c
clean-fpga-2c: clean-fpga-2c:
rm -rf $(FPGA_BUILD_DIR)_2c rm -rf $(FPGA_BUILD_DIR)_2c
clean-fpga-4c:
rm -rf $(FPGA_BUILD_DIR)_4c

View File

@@ -62,6 +62,7 @@ make ase
# tests # tests
./run_ase.sh build_ase_1c ../../driver/tests/basic/basic ./run_ase.sh build_ase_1c ../../driver/tests/basic/basic
./run_ase.sh build_ase_1c ../../driver/tests/demo/demo ./run_ase.sh build_ase_1c ../../driver/tests/demo/demo
./run_ase.sh build_ase_1c ../../benchmarks/opencl/vecadd/vecadd
# modify "vsim_run.tcl" to dump VCD trace # modify "vsim_run.tcl" to dump VCD trace
vcd file vortex.vcd vcd file vortex.vcd
@@ -90,5 +91,6 @@ lsof +D build_ase_1c
# kick off cache synthesis # kick off cache synthesis
make -C pipeline > pipeline/build.log 2>&1 & make -C pipeline > pipeline/build.log 2>&1 &
make -C cache > cache/build.log 2>&1 & make -C cache > cache/build.log 2>&1 &
make -C core > core/build.log 2>&1 &
make -C vortex > vortex/build.log 2>&1 & make -C vortex > vortex/build.log 2>&1 &
make -C top > top/build.log 2>&1 & make -C top > top/build.log 2>&1 &

View File

@@ -7,6 +7,9 @@ BUILD_DIR=$1
PROGRAM=$(basename "$2") PROGRAM=$(basename "$2")
PROGRAM_DIR=`dirname $2` PROGRAM_DIR=`dirname $2`
POCL_RT_PATH=$SCRIPT_DIR/../../benchmarks/opencl/runtime/lib
VORTEX_DRV_PATH=$SCRIPT_DIR/../../driver/opae/ase
# Export ASE_WORKDIR variable # Export ASE_WORKDIR variable
export ASE_WORKDIR=$SCRIPT_DIR/$BUILD_DIR/work export ASE_WORKDIR=$SCRIPT_DIR/$BUILD_DIR/work
@@ -33,5 +36,5 @@ done
# run application # run application
pushd $PROGRAM_DIR pushd $PROGRAM_DIR
echo " [DBG] running ./$PROGRAM $*" echo " [DBG] running ./$PROGRAM $*"
ASE_LOG=0 LD_LIBRARY_PATH=../../opae/ase:$LD_LIBRARY_PATH ./$PROGRAM $* ASE_LOG=0 LD_LIBRARY_PATH=$POCL_RT_PATH:$VORTEX_DRV_PATH:$LD_LIBRARY_PATH ./$PROGRAM $*
popd popd

View File

@@ -67,6 +67,7 @@ QI:vortex_afu.qsf
../rtl/libs/VX_priority_encoder.v ../rtl/libs/VX_priority_encoder.v
../rtl/libs/VX_generic_queue.v ../rtl/libs/VX_generic_queue.v
../rtl/libs/VX_indexable_queue.v ../rtl/libs/VX_indexable_queue.v
../rtl/libs/VX_fair_arbiter.v
../rtl/libs/VX_fixed_arbiter.v ../rtl/libs/VX_fixed_arbiter.v
../rtl/libs/VX_rr_arbiter.v ../rtl/libs/VX_rr_arbiter.v
../rtl/libs/VX_countones.v ../rtl/libs/VX_countones.v

4
hw/opae/sources_4c.txt Normal file
View File

@@ -0,0 +1,4 @@
+define+NUM_CORES=4
+define+L2_ENABLE=0
C:sources.txt

View File

@@ -13,9 +13,6 @@ module VX_alu_unit (
output reg [31:0] alu_result, output reg [31:0] alu_result,
output reg alu_stall output reg alu_stall
); );
localparam DIV_PIPELINE_LEN = 20;
localparam MUL_PIPELINE_LEN = 8;
wire[31:0] div_result_unsigned; wire[31:0] div_result_unsigned;
wire[31:0] div_result_signed; wire[31:0] div_result_signed;
@@ -37,11 +34,11 @@ module VX_alu_unit (
`ALU_DIV, `ALU_DIV,
`ALU_DIVU, `ALU_DIVU,
`ALU_REM, `ALU_REM,
`ALU_REMU: inst_delay = DIV_PIPELINE_LEN; `ALU_REMU: inst_delay = `DIV_LATENCY;
`ALU_MUL, `ALU_MUL,
`ALU_MULH, `ALU_MULH,
`ALU_MULHSU, `ALU_MULHSU,
`ALU_MULHU: inst_delay = MUL_PIPELINE_LEN; `ALU_MULHU: inst_delay = `MUL_LATENCY;
default: inst_delay = 0; default: inst_delay = 0;
endcase endcase
end end
@@ -91,7 +88,7 @@ module VX_alu_unit (
.WIDTHD(32), .WIDTHD(32),
.NSIGNED(0), .NSIGNED(0),
.DSIGNED(0), .DSIGNED(0),
.PIPELINE(DIV_PIPELINE_LEN) .PIPELINE(`DIV_LATENCY)
) udiv ( ) udiv (
.clk(clk), .clk(clk),
.reset(reset), .reset(reset),
@@ -106,7 +103,7 @@ module VX_alu_unit (
.WIDTHD(32), .WIDTHD(32),
.NSIGNED(1), .NSIGNED(1),
.DSIGNED(1), .DSIGNED(1),
.PIPELINE(DIV_PIPELINE_LEN) .PIPELINE(`DIV_LATENCY)
) sdiv ( ) sdiv (
.clk(clk), .clk(clk),
.reset(reset), .reset(reset),
@@ -124,7 +121,7 @@ module VX_alu_unit (
.WIDTHB(33), .WIDTHB(33),
.WIDTHP(64), .WIDTHP(64),
.SIGNED(1), .SIGNED(1),
.PIPELINE(MUL_PIPELINE_LEN) .PIPELINE(`MUL_LATENCY)
) multiplier ( ) multiplier (
.clk(clk), .clk(clk),
.reset(reset), .reset(reset),

View File

@@ -8,7 +8,7 @@
`endif `endif
`ifndef NUM_CORES `ifndef NUM_CORES
`define NUM_CORES 1 `define NUM_CORES 4
`endif `endif
`ifndef NUM_WARPS `ifndef NUM_WARPS
@@ -52,7 +52,7 @@
`endif `endif
`ifndef L2_ENABLE `ifndef L2_ENABLE
`define L2_ENABLE (`NUM_CORES > 2) `define L2_ENABLE 0
`endif `endif
`ifndef L3_ENABLE `ifndef L3_ENABLE

View File

@@ -72,6 +72,10 @@
`define CSR_WIDTH 12 `define CSR_WIDTH 12
`define DIV_LATENCY 18
`define MUL_LATENCY 2
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
`define BYTE_EN_NO 3'h7 `define BYTE_EN_NO 3'h7

View File

@@ -116,7 +116,7 @@ module VX_lsu_unit #(
end end
if (mrq_pop_part) begin if (mrq_pop_part) begin
mem_rsp_mask[mrq_read_addr] <= mem_rsp_mask_upd; mem_rsp_mask[mrq_read_addr] <= mem_rsp_mask_upd;
assert(mrq_read_addr == dbg_mrq_write_addr); assert(($time < 2) || mrq_read_addr == dbg_mrq_write_addr);
end end
end end

View File

@@ -11,7 +11,6 @@ module VX_fair_arbiter #(
output wire grant_valid output wire grant_valid
); );
if (N == 1) begin if (N == 1) begin
`UNUSED_VAR (clk) `UNUSED_VAR (clk)
@@ -22,7 +21,6 @@ module VX_fair_arbiter #(
end else begin end else begin
reg [N-1:0] requests_use; reg [N-1:0] requests_use;
wire [N-1:0] update_value; wire [N-1:0] update_value;
wire [N-1:0] late_value; wire [N-1:0] late_value;
@@ -48,7 +46,7 @@ module VX_fair_arbiter #(
reg [N-1:0] grant_onehot_r; reg [N-1:0] grant_onehot_r;
VX_priority_encoder # ( VX_priority_encoder #(
.N(N) .N(N)
) priority_encoder ( ) priority_encoder (
.data_in (requests_use), .data_in (requests_use),
@@ -61,7 +59,7 @@ module VX_fair_arbiter #(
grant_onehot_r[grant_index] = 1; grant_onehot_r[grant_index] = 1;
end end
assign grant_onehot = grant_onehot_r; assign grant_onehot = grant_onehot_r;
assign late_value = ((refill_original ^ requests) & ~refill_original); assign late_value = ((refill_original ^ requests) & ~refill_original);
assign update_value = (requests_use & ~grant_onehot_r) | late_value; assign update_value = (requests_use & ~grant_onehot_r) | late_value;
end end

View File

@@ -3,7 +3,7 @@
module VX_generic_queue #( module VX_generic_queue #(
parameter DATAW, parameter DATAW,
parameter SIZE = 16, parameter SIZE = 16,
parameter BUFFERED_OUTPUT = (SIZE > 8) parameter BUFFERED_OUTPUT = 1
) ( ) (
input wire clk, input wire clk,
input wire reset, input wire reset,

View File

@@ -1,5 +1,5 @@
#MULTICORE += -DNUM_CLUSTERS=2 -DNUM_CORES=4 #MULTICORE += -DNUM_CLUSTERS=2 -DNUM_CORES=4 -DL2_ENABLE=1
#MULTICORE += -DNUM_CLUSTERS=1 -DNUM_CORES=4 #MULTICORE += -DNUM_CLUSTERS=1 -DNUM_CORES=4 -DL2_ENABLE=1
MULTICORE += -DNUM_CLUSTERS=1 -DNUM_CORES=2 MULTICORE += -DNUM_CLUSTERS=1 -DNUM_CORES=2
# control RTL debug print states # control RTL debug print states
@@ -12,7 +12,7 @@ DBG_PRINT_FLAGS += -DDBG_PRINT_DRAM
DBG_PRINT_FLAGS += -DDBG_PRINT_PIPELINE DBG_PRINT_FLAGS += -DDBG_PRINT_PIPELINE
DBG_PRINT_FLAGS += -DDBG_PRINT_OPAE DBG_PRINT_FLAGS += -DDBG_PRINT_OPAE
DBG_FLAGS += $(DBG_PRINT_FLAGS) #DBG_FLAGS += $(DBG_PRINT_FLAGS)
DBG_FLAGS += -DDBG_CORE_REQ_INFO DBG_FLAGS += -DDBG_CORE_REQ_INFO
INCLUDE = -I../rtl/ -I../rtl/libs -I../rtl/interfaces -I../rtl/pipe_regs -I../rtl/cache -I../rtl/simulate INCLUDE = -I../rtl/ -I../rtl/libs -I../rtl/interfaces -I../rtl/pipe_regs -I../rtl/cache -I../rtl/simulate
@@ -25,7 +25,7 @@ CF += -std=c++11 -fms-extensions -I../..
VF += --language 1800-2009 --assert -Wall -Wpedantic VF += --language 1800-2009 --assert -Wall -Wpedantic
VF += -Wno-DECLFILENAME VF += -Wno-DECLFILENAME
VF += --x-initial unique VF += --x-initial unique --x-assign unique
VF += -exe $(SRCS) $(INCLUDE) VF += -exe $(SRCS) $(INCLUDE)
DBG += -DVCD_OUTPUT $(DBG_FLAGS) DBG += -DVCD_OUTPUT $(DBG_FLAGS)

View File

@@ -10,12 +10,8 @@ double sc_time_stamp() {
} }
Simulator::Simulator() { Simulator::Simulator() {
#ifdef NDEBUG
// force random values for uninitialized signals
Verilated::randReset(2); Verilated::randReset(2);
Verilated::assertOn(false);
#endif
ram_ = nullptr; ram_ = nullptr;
vortex_ = new VVortex(); vortex_ = new VVortex();

View File

@@ -9,3 +9,6 @@
/pipeline/* /pipeline/*
!/pipeline/Makefile !/pipeline/Makefile
/core/*
!/core/Makefile

View File

@@ -0,0 +1,70 @@
# Quartus build flow for the standalone VX_core project.
#
# Stages are chained through report files (*.rpt) and change stamps (*.chg):
# each stage, once finished, rewrites the stamp of the stage(s) that follow it
# so that they are considered out of date on the next invocation.

PROJECT = Core
TOP_LEVEL_ENTITY = VX_core
SRC_FILE = VX_core.v
PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf

# RTL inputs of the synthesis step. The directories mirror the -inc search
# path handed to project.tcl below. Without this definition the
# $(SOURCE_FILES) prerequisite of the synthesis rule expands to nothing and
# RTL edits never mark the synthesis report as stale.
RTL_DIRS := ../../../rtl ../../../rtl/libs ../../../rtl/interfaces ../../../rtl/pipe_regs ../../../rtl/cache
SOURCE_FILES := $(wildcard $(addsuffix /*.v,$(RTL_DIRS)) $(addsuffix /*.vh,$(RTL_DIRS)))

# Part, Family
FAMILY = "Arria 10"
DEVICE = 10AX115N3F40E2SG

# Executable Configuration
SYN_ARGS = --parallel --read_settings_files=on
FIT_ARGS = --part=$(DEVICE) --read_settings_files=on
ASM_ARGS =
STA_ARGS = --do_report_timing

# These names are commands, not files; keep a stray file called e.g. "clean"
# or "all" from silently disabling them.
.PHONY: all syn fit asm sta smart program clean

# Build targets
all: $(PROJECT).sta.rpt

syn: $(PROJECT).syn.rpt

fit: $(PROJECT).fit.rpt

asm: $(PROJECT).asm.rpt

sta: $(PROJECT).sta.rpt

smart: smart.log

# Target implementations

# "$(STAMP) file" writes the word "done" into file, refreshing its timestamp.
STAMP = echo done >

# Synthesis; on success, flag the fitter stage as needing a re-run.
$(PROJECT).syn.rpt: smart.log syn.chg $(SOURCE_FILES)
	quartus_syn $(PROJECT) $(SYN_ARGS)
	$(STAMP) fit.chg

# Fitter; on success, flag both the assembler and timing stages.
$(PROJECT).fit.rpt: smart.log fit.chg $(PROJECT).syn.rpt
	quartus_fit $(PROJECT) $(FIT_ARGS)
	$(STAMP) asm.chg
	$(STAMP) sta.chg

# Assembler.
$(PROJECT).asm.rpt: smart.log asm.chg $(PROJECT).fit.rpt
	quartus_asm $(PROJECT) $(ASM_ARGS)

# Static timing analysis.
$(PROJECT).sta.rpt: smart.log sta.chg $(PROJECT).fit.rpt
	quartus_sta $(PROJECT) $(STA_ARGS)

# Capture the output of Quartus' smart-action query for the project.
smart.log: $(PROJECT_FILES)
	quartus_sh --determine_smart_action $(PROJECT) > smart.log

# Project initialization
$(PROJECT_FILES):
	quartus_sh -t ../project.tcl -project $(PROJECT) -family $(FAMILY) -device $(DEVICE) -top $(TOP_LEVEL_ENTITY) -src $(SRC_FILE) -sdc ../project.sdc -inc "../../../rtl;../../../rtl/libs;../../../rtl/interfaces;../../../rtl/pipe_regs;../../../rtl/cache"

# Create any stage stamp that does not exist yet.
syn.chg:
	$(STAMP) syn.chg

fit.chg:
	$(STAMP) fit.chg

sta.chg:
	$(STAMP) sta.chg

asm.chg:
	$(STAMP) asm.chg

# Download the bitstream over JTAG.
# NOTE(review): no rule in this file names $(PROJECT).sof as its target;
# presumably it is produced by the assembler stage - confirm.
program: $(PROJECT).sof
	quartus_pgm --no_banner --mode=jtag -o "$(PROJECT).sof"

# Remove generated project files, reports, stamps, logs and tool databases.
clean:
	rm -rf bin *.rpt *.chg *.qsf *.qpf *.qws *.log *.htm *.eqn *.pin *.sof *.pof qdb incremental_db tmp-clearbox

View File

@@ -1,6 +1,6 @@
set_time_format -unit ns -decimal_places 3 set_time_format -unit ns -decimal_places 3
create_clock -name {clk} -period "250 MHz" -waveform { 0.0 1.0 } [get_ports {clk}] create_clock -name {clk} -period "200 MHz" -waveform { 0.0 1.0 } [get_ports {clk}]
derive_pll_clocks -create_base_clocks derive_pll_clocks -create_base_clocks
derive_clock_uncertainty derive_clock_uncertainty