Merge branch 'fpga_synthesis' of github.gatech.edu:casl/Vortex into fpga_synthesis

This commit is contained in:
codetector
2020-04-05 16:38:47 -04:00
43 changed files with 349 additions and 168 deletions

View File

@@ -1,35 +1,36 @@
BUILD_DIR=build_ase
ASE_BUILD_DIR=build_ase
FPGA_BUILD_DIR=build_fpga
all: ase fpga
ase: setup-ase
make -C $(BUILD_DIR)
make -C $(ASE_BUILD_DIR)
fpga: setup-fpga
cd build_fpga && qsub-synth
cd $(FPGA_BUILD_DIR) && qsub-synth
setup-ase: build_ase/Makefile
setup-ase: $(ASE_BUILD_DIR)/Makefile
setup-fpga: build_fpga/build/dcp.qpf
setup-fpga: $(FPGA_BUILD_DIR)/build/dcp.qpf
build_ase/Makefile:
afu_sim_setup --s sources.txt build_ase
$(ASE_BUILD_DIR)/Makefile:
afu_sim_setup --s sources.txt $(ASE_BUILD_DIR)
build_fpga/build/dcp.qpf:
afu_synth_setup -s sources.txt build_fpga
$(FPGA_BUILD_DIR)/build/dcp.qpf:
afu_synth_setup -s sources.txt $(FPGA_BUILD_DIR)
run-ase:
cd build_ase && make sim
cd $(ASE_BUILD_DIR) && make sim
wave:
vsim -view build_ase/work/vsim.wlf -do wave.do
vsim -view $(ASE_BUILD_DIR)/work/vsim.wlf -do wave.do
run-fpga:
# TODO
clean-ase:
rm -rf build_ase
rm -rf $(ASE_BUILD_DIR)
clean-fpga:
rm -rf build_fpga
rm -rf $(FPGA_BUILD_DIR)

View File

@@ -68,6 +68,7 @@ vortex_afu.json
../../rtl/VX_cache/VX_cache_miss_resrv.v
../../rtl/VX_cache/VX_fill_invalidator.v
../../rtl/VX_cache/VX_tag_data_structure.v
../../rtl/VX_cache/VX_prefetcher.v
../../rtl/cache/VX_generic_pe.v
../../rtl/cache/cache_set.v
../../rtl/cache/VX_d_cache.v

View File

@@ -14,7 +14,7 @@
"cmd-type-read": 1,
"cmd-type-write": 2,
"cmd-type-run": 3,
"cmd-type-snoop": 4,
"cmd-type-clflush": 4,
"afu-top-interface":
{

View File

@@ -34,7 +34,9 @@ module vortex_afu #(
);
localparam AVS_RD_QUEUE_SIZE = 16;
localparam VX_SNOOPING_DELAY = 300;
localparam VX_SNOOP_DELAY = 300;
localparam VX_SNOOP_LEVELS = 2;
localparam AFU_ID_L = 16'h0002; // AFU ID Lower
localparam AFU_ID_H = 16'h0004; // AFU ID Higher
@@ -42,7 +44,7 @@ localparam AFU_ID_H = 16'h0004; // AFU ID Higher
localparam CMD_TYPE_READ = `AFU_IMAGE_CMD_TYPE_READ;
localparam CMD_TYPE_WRITE = `AFU_IMAGE_CMD_TYPE_WRITE;
localparam CMD_TYPE_RUN = `AFU_IMAGE_CMD_TYPE_RUN;
localparam CMD_TYPE_SNOOP = `AFU_IMAGE_CMD_TYPE_SNOOP;
localparam CMD_TYPE_CLFLUSH = `AFU_IMAGE_CMD_TYPE_CLFLUSH;
localparam MMIO_CSR_CMD = `AFU_IMAGE_MMIO_CSR_CMD;
localparam MMIO_CSR_STATUS = `AFU_IMAGE_MMIO_CSR_STATUS;
@@ -52,13 +54,12 @@ localparam MMIO_CSR_DATA_SIZE = `AFU_IMAGE_MMIO_CSR_DATA_SIZE;
logic [127:0] afu_id = `AFU_ACCEL_UUID;
typedef enum logic[2:0] {
typedef enum logic[3:0] {
STATE_IDLE,
STATE_READ,
STATE_WRITE,
STATE_RUN,
STATE_SNOOP1,
STATE_SNOOP2
STATE_CLFLUSH
} state_t;
state_t state;
@@ -192,7 +193,7 @@ logic [31:0] cci_write_ctr;
logic [31:0] avs_read_ctr;
logic [31:0] avs_write_ctr;
logic [31:0] vx_snoop_ctr;
logic [31:0] vx_snoop_delay;
logic [9:0] vx_snoop_delay;
logic vx_reset;
always_ff @(posedge clk)
@@ -210,21 +211,21 @@ begin
STATE_IDLE: begin
case (csr_cmd)
CMD_TYPE_READ: begin
$display("%t: CMD READ: ia=%h da=%h sz=%0d", $time, csr_io_addr, csr_mem_addr, csr_data_size);
$display("%t: STATE READ: ia=%h da=%h sz=%0d", $time, csr_io_addr, csr_mem_addr, csr_data_size);
state <= STATE_READ;
end
CMD_TYPE_WRITE: begin
$display("%t: CMD WRITE: ia=%h da=%h sz=%0d", $time, csr_io_addr, csr_mem_addr, csr_data_size);
$display("%t: STATE WRITE: ia=%h da=%h sz=%0d", $time, csr_io_addr, csr_mem_addr, csr_data_size);
state <= STATE_WRITE;
end
CMD_TYPE_RUN: begin
$display("%t: CMD START", $time);
$display("%t: STATE START", $time);
vx_reset <= 1;
state <= STATE_RUN;
end
CMD_TYPE_SNOOP: begin
$display("%t: CMD SNOOP: da=%h sz=%0d", $time, csr_mem_addr, csr_data_size);
state <= STATE_SNOOP1;
CMD_TYPE_CLFLUSH: begin
$display("%t: STATE CFLUSH: da=%h sz=%0d", $time, csr_mem_addr, csr_data_size);
state <= STATE_CLFLUSH;
end
endcase
end
@@ -250,15 +251,8 @@ begin
end
end
STATE_SNOOP1: begin
if (vx_snoop_delay >= VX_SNOOPING_DELAY)
begin
state <= STATE_SNOOP2;
end
end
STATE_SNOOP2: begin
if (vx_snoop_delay >= VX_SNOOPING_DELAY)
STATE_CLFLUSH: begin
if (vx_snoop_delay >= VX_SNOOP_DELAY)
begin
state <= STATE_IDLE;
end
@@ -320,7 +314,7 @@ begin
end
end
STATE_RUN: begin
STATE_RUN, STATE_CLFLUSH: begin
if (vx_dram_req_read
&& !vx_dram_req_delay)
begin
@@ -348,15 +342,20 @@ begin
end
// Vortex DRAM requests stalling
assign vx_dram_req_delay = !((STATE_RUN == state)
&& !avs_waitrequest
&& !avs_raq_full
&& !avs_rdq_full);
// Vortex DRAM fill response
logic vortex_enabled;
always_comb
begin
vx_dram_fill_rsp = (STATE_RUN == state) && !avs_rdq_empty && vx_dram_fill_accept;
vortex_enabled = (STATE_RUN == state) || (STATE_CLFLUSH == state);
vx_dram_req_delay = !vortex_enabled || avs_waitrequest || avs_raq_full || avs_rdq_full;
end
// Vortex DRAM fill response
always_comb
begin
vx_dram_fill_rsp = vortex_enabled && !avs_rdq_empty && vx_dram_fill_accept;
vx_dram_fill_rsp_addr = (avs_raq_dout << 6);
{>>{vx_dram_fill_rsp_data}} = avs_rdq_dout;
end
@@ -524,32 +523,25 @@ begin
else begin
if (STATE_IDLE == state)
begin
vx_snoop_ctr <= 0;
vx_snoop_ctr <= 0;
vx_snoop_delay <= 0;
end
vx_snp_req <= 0;
if ((STATE_SNOOP1 == state
|| STATE_SNOOP2 == state)
if ((STATE_CLFLUSH == state)
&& vx_snoop_ctr < csr_data_size
&& vx_snp_req_delay)
&& !vx_snp_req_delay)
begin
vx_snp_req <= 1;
vx_snoop_ctr <= vx_snoop_ctr + 1;
vx_snp_req_addr <= (csr_mem_addr + vx_snoop_ctr) << 6;
vx_snp_req <= 1;
vx_snoop_ctr <= vx_snoop_ctr + 1;
end
if ((vx_snoop_ctr >= csr_data_size)
&& (vx_snoop_delay < VX_SNOOPING_DELAY))
if (vx_snoop_ctr == csr_data_size)
begin
vx_snoop_delay <= vx_snoop_delay + 1;
end
if (vx_snoop_delay >= VX_SNOOPING_DELAY)
begin
vx_snoop_ctr <= 0;
vx_snoop_delay <= 0;
end
end
end

View File

@@ -27,12 +27,17 @@ add wave -noupdate -label avs_raq_full /ase_top/ase_top_generic/platform_shim_cc
add wave -noupdate -label avs_rdq_full /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/avs_rdq_full
add wave -noupdate -label avs_raq_empty /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/avs_raq_empty
add wave -noupdate -label avs_rdq_empty /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/avs_rdq_empty
add wave -noupdate -label vortex_enabled /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/vortex_enabled
add wave -noupdate -label vx_reset /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/vx_soc/reset
add wave -noupdate -label vx_dram_req_read /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/vx_dram_req_read
add wave -noupdate -label vx_dram_req_write /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/vx_dram_req_write
add wave -noupdate -label vx_dram_req_delay /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/vx_dram_req_delay
add wave -noupdate -label vx_dram_req_read /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/vx_dram_req_read
add wave -noupdate -label vx_reset /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/vx_soc/reset
add wave -noupdate -label vx_dram_req_addr -radix hexadecimal /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/vx_dram_req_addr
add wave -noupdate -label vx_draw_req_data -radix hexadecimal /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/vx_dram_req_data
add wave -noupdate -label out_dram_fill_rsp /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/vx_soc/out_dram_fill_rsp
add wave -noupdate -label out_dram_fill_accept /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/vx_soc/out_dram_fill_accept
add wave -noupdate -label vx_draw_fill_rsp_data -radix hexadecimal /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/vx_dram_fill_rsp_data
add wave -noupdate -label vx_dram_fill_rsp_addr -radix hexadecimal /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/vx_dram_fill_rsp_addr
add wave -noupdate -label llc_snp_req /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/vx_soc/llc_snp_req
add wave -noupdate -label llc_snp_req_delay /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/vx_soc/llc_snp_req_delay
add wave -noupdate -label out_break /ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/vx_soc/out_ebreak
@@ -45,7 +50,7 @@ add wave -noupdate -label warp_stalled {/ase_top/ase_top_generic/platform_shim_c
add wave -noupdate -label warp_lock {/ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/vx_soc/genblk1/Vortex_Cluster/genblk1[0]/vortex_core/vx_front_end/vx_fetch/warp_scheduler/warp_lock}
add wave -noupdate -label use_active {/ase_top/ase_top_generic/platform_shim_ccip_std_afu/ccip_std_afu/vortex_afu_inst/vx_soc/genblk1/Vortex_Cluster/genblk1[0]/vortex_core/vx_front_end/vx_fetch/warp_scheduler/use_active}
TreeUpdate [SetDefaultTree]
WaveRestoreCursors {{Cursor 2} {66234495 ps} 0}
WaveRestoreCursors {{Cursor 2} {360293 ps} 0}
quietly wave cursor active 1
configure wave -namecolwidth 195
configure wave -valuecolwidth 100
@@ -61,4 +66,4 @@ configure wave -griddelta 40
configure wave -timeline 0
configure wave -timelineunits ps
update
WaveRestoreZoom {66041656 ps} {66406344 ps}
WaveRestoreZoom {346453 ps} {711141 ps}

View File

@@ -65,4 +65,4 @@ clean:
ifneq ($(MAKECMDGOALS),clean)
-include .depend
endif
endif

View File

@@ -22,8 +22,8 @@
#define CMD_TYPE_READ AFU_IMAGE_CMD_TYPE_READ
#define CMD_TYPE_WRITE AFU_IMAGE_CMD_TYPE_WRITE
#define CMD_TYPE_RUN AFU_IMAGE_CMD_TYPE_RUN
#define CMD_TYPE_SNOOP AFU_IMAGE_CMD_TYPE_SNOOP
#define CMD_TYPE_RUN AFU_IMAGE_CMD_TYPE_RUN
#define CMD_TYPE_CLFLUSH AFU_IMAGE_CMD_TYPE_CLFLUSH
#define MMIO_CSR_CMD (AFU_IMAGE_MMIO_CSR_CMD * 4)
#define MMIO_CSR_STATUS (AFU_IMAGE_MMIO_CSR_STATUS * 4)
@@ -313,7 +313,7 @@ extern int vx_flush_caches(vx_device_h hdevice, size_t dev_maddr, size_t size) {
CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_MEM_ADDR, dev_maddr));
CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_DATA_SIZE, size));
CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_CMD, CMD_TYPE_SNOOP));
CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_CMD, CMD_TYPE_CLFLUSH));
// Wait for the write operation to finish
if (vx_ready_wait(hdevice, -1) != 0)

View File

@@ -1,3 +1,21 @@
# Root of the RISC-V GNU toolchain install. `?=` only assigns when the variable
# is not already set, so it can be overridden from the environment or command
# line. $(wildcard ...) expands to nothing when the path does not exist.
RISCV_TOOL_PATH ?= $(wildcard ~/dev/riscv-gnu-toolchain/drops)
# Root of the runtime sources, relative to this directory; overridable the
# same way as RISCV_TOOL_PATH.
VX_RT_PATH ?= $(wildcard ../../../runtime)
# Cross tools taken from $(RISCV_TOOL_PATH)/bin (riscv32-unknown-elf- prefix).
VX_CC = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gcc
VX_CXX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-g++
VX_DMP = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objdump
VX_CPY = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objcopy
# Runtime source files under $(VX_RT_PATH) that are passed to the compiler
# together with the kernel sources when linking kernel.elf.
VX_NEWLIB = $(VX_RT_PATH)/newlib/newlib.c
VX_STR = $(VX_RT_PATH)/startup/vx_start.S
VX_INT = $(VX_RT_PATH)/intrinsics/vx_intrinsics.s
VX_IO = $(VX_RT_PATH)/io/vx_io.s $(VX_RT_PATH)/io/vx_io.c
VX_API = $(VX_RT_PATH)/vx_api/vx_api.c
VX_FIO = $(VX_RT_PATH)/fileio/fileio.s
# Kernel compile/link flags: rv32im architecture with the ilp32 ABI, -O3,
# static link using the vortex_link.ld linker script, freestanding build with
# no default startup files, and unused sections discarded at link time.
VX_CFLAGS = -march=rv32im -mabi=ilp32 -O3 -Wl,-Bstatic,-T,$(VX_RT_PATH)/mains/vortex_link.ld -ffreestanding -nostartfiles -Wl,--gc-sections
# Device-side kernel source(s) compiled with $(VX_CC).
VX_SRCS = kernel.c
CXXFLAGS += -std=c++11 -O0 -g -Wall -Wextra -pedantic -Wfatal-errors
@@ -11,6 +29,18 @@ SRCS = basic.cpp
all: $(PROJECT)
# Device-side kernel artifacts. Every derived image is produced from
# kernel.elf, so each rule names the ELF as its only prerequisite and uses
# $< (the ELF) and $@ (the file being built).

# Disassembly listing of all sections of the kernel ELF (objdump -D).
kernel.dump: kernel.elf
	$(VX_DMP) -D $< > $@

# Kernel image converted to ihex output format by objcopy.
kernel.hex: kernel.elf
	$(VX_CPY) -O ihex $< $@

# Kernel image converted to raw binary output format by objcopy.
kernel.bin: kernel.elf
	$(VX_CPY) -O binary $< $@

# Cross-compile and link the kernel together with the runtime sources.
# The prerequisite is $(VX_SRCS) -- the kernel sources this recipe actually
# compiles -- rather than $(SRCS), which lists the host program sources:
# with $(SRCS) an edit to the kernel never triggered a rebuild, while an
# edit to the host program rebuilt the kernel needlessly.
# NOTE(review): the runtime sources ($(VX_STR), $(VX_NEWLIB), ...) are still
# not tracked as prerequisites; add them if runtime edits should rebuild.
kernel.elf: $(VX_SRCS)
	$(VX_CC) $(VX_CFLAGS) $(VX_STR) $(VX_FIO) $(VX_NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_SRCS) -I$(VX_RT_PATH) -o $@
$(PROJECT): $(SRCS)
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -L../../sw/dummy -lvortex -o $@
@@ -18,7 +48,7 @@ run-fpga: $(PROJECT)
LD_LIBRARY_PATH=../../sw/opae:$(LD_LIBRARY_PATH) ./$(PROJECT)
run-ase: $(PROJECT)
ASE_LOG=0 LD_LIBRARY_PATH=../../sw/opae/ase:$(LD_LIBRARY_PATH) ./$(PROJECT) -t 1
ASE_LOG=0 LD_LIBRARY_PATH=../../sw/opae/ase:$(LD_LIBRARY_PATH) ./$(PROJECT)
run-rtlsim: $(PROJECT)
LD_LIBRARY_PATH=../../sw/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT)

Binary file not shown.

View File

@@ -27,11 +27,11 @@ uint64_t shuffle(int i, uint64_t value) {
return (value << i) | (value & ((1 << i)-1));;
}
int run_test_0(vx_buffer_h sbuf,
vx_buffer_h dbuf,
uint32_t address,
uint64_t value,
int num_blocks) {
int run_memcopy_test(vx_buffer_h sbuf,
vx_buffer_h dbuf,
uint32_t address,
uint64_t value,
int num_blocks) {
int ret;
int errors = 0;
@@ -73,8 +73,29 @@ int run_test_0(vx_buffer_h sbuf,
return 0;
}
int run_test_1(vx_device_h device, const char* program) {
int run_kernel_test(vx_device_h device,
vx_buffer_h sbuf,
vx_buffer_h dbuf,
const char* program) {
int ret;
int errors = 0;
uint64_t seed = 0x0badf00d40ff40ff;
int num_blocks = 4;
unsigned src_dev_addr = 0x10000000;
unsigned dest_dev_addr = 0x20000000;
// write sbuf data
for (int i = 0; i < 8 * num_blocks; ++i) {
((uint64_t*)vx_host_ptr(sbuf))[i] = shuffle(i, seed);
}
// write buffer to local memory
std::cout << "write buffer to local memory" << std::endl;
ret = vx_copy_to_dev(sbuf, src_dev_addr, 64 * num_blocks, 0);
if (ret != 0)
return ret;
// upload program
std::cout << "upload program" << std::endl;
@@ -97,6 +118,37 @@ int run_test_1(vx_device_h device, const char* program) {
return ret;
}
// flush the caches
std::cout << "flush the caches" << std::endl;
ret = vx_flush_caches(device, dest_dev_addr, 64 * num_blocks);
if (ret != 0) {
return ret;
}
// read buffer from local memory
std::cout << "read buffer from local memory" << std::endl;
ret = vx_copy_from_dev(dbuf, dest_dev_addr, 64 * num_blocks, 0);
if (ret != 0)
return ret;
// verify result
std::cout << "verify result" << std::endl;
for (int i = 0; i < 8 * num_blocks; ++i) {
auto curr = ((uint64_t*)vx_host_ptr(dbuf))[i];
auto ref = shuffle(i, seed);
if (curr != ref) {
std::cout << "error @ " << std::hex << (dest_dev_addr + 64 * i)
<< ": actual " << curr << ", expected " << ref << std::endl;
++errors;
}
}
if (errors != 0) {
std::cout << "Found " << errors << " errors!" << std::endl;
std::cout << "FAILED!" << std::endl;
return 1;
}
return 0;
}
@@ -147,27 +199,15 @@ int main(int argc, char *argv[]) {
// run tests
if (0 == test || -1 == test) {
std::cout << "run test suite 0" << std::endl;
std::cout << "run memcopy test" << std::endl;
ret = run_test_0(sbuf, dbuf, 0x10000000, 0x0badf00d00ff00ff, 1);
ret = run_memcopy_test(sbuf, dbuf, 0x10000000, 0x0badf00d00ff00ff, 1);
if (ret != 0) {
cleanup();
return ret;
}
ret = run_test_0(sbuf, dbuf, 0x10000000, 0x0badf00d00ff00ff, 2);
if (ret != 0) {
cleanup();
return ret;
}
ret = run_test_0(sbuf, dbuf, 0x20000000, 0xff00ff00ff00ff00, 4);
if (ret != 0) {
cleanup();
return ret;
}
ret = run_test_0(sbuf, dbuf, 0x20000000, 0x0badf00d40ff40ff, 8);
ret = run_memcopy_test(sbuf, dbuf, 0x20000000, 0x0badf00d40ff40ff, 8);
if (ret != 0) {
cleanup();
return ret;
@@ -175,17 +215,8 @@ int main(int argc, char *argv[]) {
}
if (1 == test || -1 == test) {
std::cout << "run test suite 1" << std::endl;
ret = run_test_1(device, "rv32ui-p-lw.bin");
if (ret != 0) {
cleanup();
return ret;
}
}
if (2 == test || -1 == test) {
std::cout << "run test suite 1" << std::endl;
ret = run_test_1(device, "rv32ui-p-sw.bin");
std::cout << "run kernel test" << std::endl;
ret = run_kernel_test(device, sbuf, dbuf, "kernel.bin");
if (ret != 0) {
cleanup();
return ret;

Binary file not shown.

View File

@@ -0,0 +1,9 @@
#include <stdint.h>
/*
 * Kernel entry point: copies 8 * 4 (= 32) 64-bit words from the buffer at
 * address 0x10000000 to the buffer at address 0x20000000, in ascending order.
 * NOTE(review): the word count presumably mirrors the size of the buffer the
 * host test uploads and reads back -- confirm against the host program.
 */
void main() {
  const int64_t* const src = (const int64_t*)0x10000000;
  int64_t* const dst = (int64_t*)0x20000000;
  const int word_count = 8 * 4;
  int idx = 0;
  while (idx < word_count) {
    dst[idx] = src[idx];
    ++idx;
  }
}

Binary file not shown.

Binary file not shown.

View File

@@ -80,7 +80,7 @@ int run_test(vx_device_h device,
int errors = 0;
auto buf_ptr = (int*)vx_host_ptr(buffer);
for (uint32_t i = 0; i < num_points; ++i) {
int ref = i * i;
int ref = i + i;
int cur = buf_ptr[i];
if (cur != ref) {
++errors;

Binary file not shown.

View File

@@ -16,7 +16,7 @@ void kernel_body(void* arg) {
unsigned i = ((wNo * _arg->num_threads) + tid) * _arg->stride;
for (unsigned j = 0; j < _arg->stride; ++j) {
z[i+j] = x[i+j] * y[i+j];
z[i+j] = x[i+j] + y[i+j];
}
}

BIN
driver/tests/demo/kernel.elf Executable file

Binary file not shown.