merging changes from OPAE branch making this branch

This commit is contained in:
Blaise Tine
2020-03-27 20:19:16 -04:00
parent 39516a6f98
commit 9b1b8789ac
267 changed files with 498191 additions and 166 deletions

19
driver/sw/Makefile Normal file
View File

@@ -0,0 +1,19 @@
# Top-level driver build: every backend lives in its own sub-directory
# and is built by recursing into that directory's Makefile.
SUBDIRS := opae rtlsim simx

all: $(SUBDIRS)

# One rule serves all backends; $@ is the directory being built.
$(SUBDIRS):
	$(MAKE) -C $@

# Clean each backend in turn (make stops at the first directory that fails).
clean:
	$(MAKE) clean -C opae
	$(MAKE) clean -C rtlsim
	$(MAKE) clean -C simx

.PHONY: all clean $(SUBDIRS)

View File

@@ -0,0 +1,67 @@
// Public C interface of the Vortex driver. The same declarations are
// implemented by each backend (OPAE, rtlsim, simX) that provides libvortex.so.
// Unless noted otherwise, functions return 0 on success and -1 on failure.
#ifndef __VX_DRIVER_H__
#define __VX_DRIVER_H__
#include <stddef.h>
#ifdef __cplusplus
extern "C" {
#endif
// Opaque handles; the concrete object behind each one is backend specific.
typedef void* vx_device_h;
typedef void* vx_buffer_h;
// Upper bound used when handing out device memory addresses
#define VX_LOCAL_MEM_SIZE 0xffffffff
// First address handed out by vx_alloc_dev_mem
#define VX_ALLOC_BASE_ADDR 0x10000000
// Device address at which kernel images are uploaded
#define VX_KERNEL_BASE_ADDR 0x80000000
// Allocation and transfer sizes are rounded up to a multiple of this value
#define VX_CACHE_LINESIZE 64
// open the device and connect to it
int vx_dev_open(vx_device_h* hdevice);
// Close the device when all the operations are done
int vx_dev_close(vx_device_h hdevice);
// Allocate shared buffer with device
int vx_alloc_shared_mem(vx_device_h hdevice, size_t size, vx_buffer_h* hbuffer);
// Get host pointer address (NULL when the buffer handle is NULL)
volatile void* vx_host_ptr(vx_buffer_h hbuffer);
// release buffer
int vx_buf_release(vx_buffer_h hbuffer);
// allocate device memory and return address
int vx_alloc_dev_mem(vx_device_h hdevice, size_t size, size_t* dev_maddr);
// Flush the device caches for the given device memory range
int vx_flush_caches(vx_device_h hdevice, size_t dev_maddr, size_t size);
// Copy bytes from buffer to device local memory
int vx_copy_to_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, size_t src_offset);
// Copy bytes from device local memory to buffer
int vx_copy_from_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, size_t dst_offset);
// Start device execution
int vx_start(vx_device_h hdevice);
// Wait for device ready with milliseconds timeout
int vx_ready_wait(vx_device_h hdevice, long long timeout);
////////////////////////////// UTILITY FUNCTIONS //////////////////////////////
// upload kernel bytes to device
int vx_upload_kernel_bytes(vx_device_h device, const void* content, size_t size);
// upload kernel file to device
int vx_upload_kernel_file(vx_device_h device, const char* filename);
#ifdef __cplusplus
}
#endif
#endif // __VX_DRIVER_H__

66
driver/sw/opae/Makefile Normal file
View File

@@ -0,0 +1,66 @@
# Builds the OPAE flavour of the driver library (libvortex.so) and, under
# $(ASE_DIR)/, a second copy compiled with -DUSE_ASE and linked against the
# ASE library.

CXXFLAGS += -std=c++11 -O0 -g -Wall -Wextra -pedantic -Wfatal-errors
CXXFLAGS += -I../include -I/tools/opae/1.4.0/include

LDFLAGS += -L/tools/opae/1.4.0/lib

# stack execution protection
LDFLAGS +=-z noexecstack

# data relocation and protection
LDFLAGS +=-z relro -z now

# stack buffer overrun detection
CXXFLAGS +=-fstack-protector

# Position independent code
CXXFLAGS += -fPIC

LDFLAGS += -luuid
LDFLAGS += -shared

FPGA_LIBS += -lopae-c
ASE_LIBS += -lopae-c-ase

LIB_DIR=../lib
ASE_DIR = ase

PROJECT = libvortex.so
PROJECT_ASE = $(ASE_DIR)/libvortex.so

AFU_JSON_INFO = vortex_afu.h

SRCS = vortex.cpp ../vx_utils.cpp

all: $(PROJECT) $(PROJECT_ASE)

# AFU info from JSON file, including AFU UUID
$(AFU_JSON_INFO): ../../hw/vortex_afu.json
	afu_json_mgr json-info --afu-json=$^ --c-hdr=$@

# vortex.cpp includes the generated header, so it is listed as a prerequisite;
# the recipe names $(SRCS) explicitly so the header is not passed to the compiler.
$(PROJECT): $(SRCS) $(AFU_JSON_INFO)
	$(CXX) $(CXXFLAGS) $(SRCS) $(LDFLAGS) $(FPGA_LIBS) -o $@

# The output directory is an order-only prerequisite: it has to exist, but its
# timestamp (which changes whenever a file is written into it) must not force
# a rebuild.
$(PROJECT_ASE): $(SRCS) $(AFU_JSON_INFO) | $(ASE_DIR)
	$(CXX) $(CXXFLAGS) -DUSE_ASE $(SRCS) $(LDFLAGS) $(ASE_LIBS) -o $@

# C++ source, so use the C++ compiler together with CXXFLAGS
vortex.o: vortex.cpp $(AFU_JSON_INFO)
	$(CXX) $(CXXFLAGS) -c vortex.cpp -o $@

$(ASE_DIR):
	mkdir -p $(ASE_DIR)

.depend: $(SRCS) $(AFU_JSON_INFO)
	$(CXX) $(CXXFLAGS) -MM $(SRCS) > .depend

clean:
	rm -rf $(PROJECT) $(PROJECT_ASE) $(AFU_JSON_INFO) *.o .depend

.PHONY: all clean

ifneq ($(MAKECMDGOALS),clean)
-include .depend
endif

349
driver/sw/opae/vortex.cpp Executable file
View File

@@ -0,0 +1,349 @@
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <assert.h>
#include <uuid/uuid.h>
#include <opae/fpga.h>
#include <vortex.h>
#include "vortex_afu.h"
// MMIO Address Mappings
// register receiving the host buffer IO address, expressed in cache lines
#define MMIO_COPY_IO_ADDRESS 0X120
// register receiving the device memory address of a transfer
#define MMIO_COPY_AVM_ADDRESS 0x100
// register receiving the transfer size
#define MMIO_COPY_DATA_SIZE 0X118
// register receiving one of the MMIO_CMD_TYPE_* values below
#define MMIO_CMD_TYPE 0X110
// register polled by the driver; a value of 1 means a new command is accepted
#define MMIO_READY_FOR_CMD 0X198
// Command codes written to MMIO_CMD_TYPE
// READ is issued by vx_copy_from_dev, WRITE by vx_copy_to_dev,
// START by vx_start and SNOOP by vx_flush_caches.
#define MMIO_CMD_TYPE_READ 0
#define MMIO_CMD_TYPE_WRITE 1
#define MMIO_CMD_TYPE_START 2
#define MMIO_CMD_TYPE_SNOOP 3
// Evaluates an OPAE call once. If the result is not FPGA_OK, prints the
// stringified expression together with the result code and makes the
// ENCLOSING function return -1; only usable in functions returning int.
#define CHECK_RES(_expr) \
do { \
fpga_result res = _expr; \
if (res == FPGA_OK) \
break; \
printf("OPAE Error: '%s' returned %d!\n", #_expr, (int)res); \
return -1; \
} while (false)
///////////////////////////////////////////////////////////////////////////////
// Driver-side state behind a vx_device_h handle.
typedef struct vx_device_ {
// accelerator handle obtained from fpgaOpen
fpga_handle fpga;
// next device address handed out by vx_alloc_dev_mem (starts at VX_ALLOC_BASE_ADDR)
size_t mem_allocation;
} vx_device_t;
// Driver-side state behind a vx_buffer_h handle.
typedef struct vx_buffer_ {
// workspace id returned by fpgaPrepareBuffer; needed to release the buffer
uint64_t wsid;
// host address of the buffer
volatile void* host_ptr;
// address of the buffer as returned by fpgaGetIOAddress
uint64_t io_addr;
// handle of the accelerator the buffer was prepared on
fpga_handle fpga;
// size requested by the caller (before cache-line rounding); used for bound checks
size_t size;
} vx_buffer_t;
// Round size up to the nearest multiple of VX_CACHE_LINESIZE.
static size_t align_size(size_t size) {
    const size_t line_count = (size + VX_CACHE_LINESIZE - 1) / VX_CACHE_LINESIZE;
    return VX_CACHE_LINESIZE * line_count;
}
///////////////////////////////////////////////////////////////////////////////
// Search for an accelerator matching the AFU UUID and connect to it.
//
// hdevice: receives the handle of the newly created device object.
// Returns 0 on success. Returns -1 when hdevice is NULL, when no matching
// accelerator is found, or when opening it or allocating the device object
// fails; *hdevice is left untouched in every failure case.
extern int vx_dev_open(vx_device_h* hdevice) {
    fpga_properties filter = NULL;
    fpga_result res;
    fpga_guid guid;
    fpga_token accel_token;
    uint32_t num_matches = 0;
    fpga_handle accel_handle;
    vx_device_t* device;

    if (NULL == hdevice)
        return -1;

    // Set up a filter that will search for an accelerator
    res = fpgaGetProperties(NULL, &filter);
    if (FPGA_OK != res)
        return -1;

    // Restrict the filter to accelerators carrying the desired UUID, then
    // search across the available FPGA contexts. Any failing step leaves
    // `found` false so that the filter is still destroyed below.
    bool found = false;
    if (FPGA_OK == fpgaPropertiesSetObjectType(filter, FPGA_ACCELERATOR)
     && 0 == uuid_parse(AFU_ACCEL_UUID, guid)
     && FPGA_OK == fpgaPropertiesSetGUID(filter, guid)) {
        res = fpgaEnumerate(&filter, 1, &accel_token, 1, &num_matches);
        found = (FPGA_OK == res) && (num_matches >= 1);
    }

    // Not needed anymore
    fpgaDestroyProperties(&filter);

    if (!found) {
        fprintf(stderr, "Accelerator %s not found!\n", AFU_ACCEL_UUID);
        return -1;
    }

    // Open accelerator
    res = fpgaOpen(accel_token, &accel_handle, 0);

    // Done with token, whether or not the open succeeded
    fpgaDestroyToken(&accel_token);

    if (FPGA_OK != res)
        return -1;

    // allocate device object
    device = (vx_device_t*)malloc(sizeof(vx_device_t));
    if (NULL == device) {
        fpgaClose(accel_handle);
        return -1;
    }

    device->fpga = accel_handle;
    device->mem_allocation = VX_ALLOC_BASE_ADDR;

    *hdevice = device;
    return 0;
}
// Close the accelerator and free the device object once all operations are done.
// Returns 0 on success, -1 when the handle is NULL.
extern int vx_dev_close(vx_device_h hdevice) {
    vx_device_t* const dev = (vx_device_t*)hdevice;
    if (dev != NULL) {
        fpgaClose(dev->fpga);
        free(dev);
        return 0;
    }
    return -1;
}
// Reserve `size` bytes of device memory, rounded up to a whole number of
// cache lines, and store the start address in *dev_maddr.
//
// Addresses are handed out sequentially starting at VX_ALLOC_BASE_ADDR and
// are never reclaimed. Returns 0 on success, -1 on a NULL handle/pointer, a
// zero size, or when the request does not fit below VX_LOCAL_MEM_SIZE.
extern int vx_alloc_dev_mem(vx_device_h hdevice, size_t size, size_t* dev_maddr) {
    if (NULL == hdevice
     || NULL == dev_maddr
     || 0 == size)
        return -1;

    vx_device_t *device = ((vx_device_t*)hdevice);

    size_t asize = align_size(size);

    // asize < size means the rounding wrapped around. The remaining-space
    // comparison is written as a subtraction so that it cannot overflow.
    if (asize < size
     || device->mem_allocation > VX_LOCAL_MEM_SIZE
     || asize > VX_LOCAL_MEM_SIZE - device->mem_allocation)
        return -1;

    *dev_maddr = device->mem_allocation;
    device->mem_allocation += asize;

    return 0;
}
// Prepare a host buffer shared with the accelerator and return its handle.
// The buffer is prepared with the cache-line aligned size, while the handle
// remembers the size the caller asked for. Returns 0 on success, -1 otherwise.
extern int vx_alloc_shared_mem(vx_device_h hdevice, size_t size, vx_buffer_h* hbuffer) {
    if (NULL == hdevice || 0 == size || NULL == hbuffer)
        return -1;

    vx_device_t* const dev = (vx_device_t*)hdevice;

    void* mapped;
    uint64_t workspace_id;
    if (FPGA_OK != fpgaPrepareBuffer(dev->fpga, align_size(size), &mapped, &workspace_id, 0))
        return -1;

    // Get the physical address of the buffer in the accelerator
    uint64_t device_io_addr;
    if (FPGA_OK != fpgaGetIOAddress(dev->fpga, workspace_id, &device_io_addr)) {
        fpgaReleaseBuffer(dev->fpga, workspace_id);
        return -1;
    }

    // allocate the bookkeeping object behind the handle
    vx_buffer_t* const record = (vx_buffer_t*)malloc(sizeof(vx_buffer_t));
    if (NULL == record) {
        fpgaReleaseBuffer(dev->fpga, workspace_id);
        return -1;
    }

    record->wsid = workspace_id;
    record->host_ptr = mapped;
    record->io_addr = device_io_addr;
    record->fpga = dev->fpga;
    record->size = size;

    *hbuffer = record;
    return 0;
}
// Return the host address of a shared buffer, or NULL for a NULL handle.
extern volatile void* vx_host_ptr(vx_buffer_h hbuffer) {
    if (NULL != hbuffer) {
        vx_buffer_t* const record = (vx_buffer_t*)hbuffer;
        return record->host_ptr;
    }
    return NULL;
}
// Release the shared buffer and free its handle object.
// Returns 0 on success, -1 when the handle is NULL.
extern int vx_buf_release(vx_buffer_h hbuffer) {
    if (NULL == hbuffer)
        return -1;

    vx_buffer_t* const record = (vx_buffer_t*)hbuffer;
    fpgaReleaseBuffer(record->fpga, record->wsid);
    free(record);
    return 0;
}
// Poll MMIO_READY_FOR_CMD until the hardware reports that it accepts a command.
// Returns 0 once the register reads 1, or -1 (through CHECK_RES) when an MMIO
// read fails. There is no timeout: the call blocks until the device is ready.
static int ready_for_sw(fpga_handle hdevice) {
    // polling interval: one second under ASE, one millisecond otherwise
    struct timespec sleep_time;
#ifdef USE_ASE
    sleep_time.tv_nsec = 0;
    sleep_time.tv_sec = 1;
#else
    sleep_time.tv_nsec = 1000000;
    sleep_time.tv_sec = 0;
#endif

    uint64_t data = 0;
    for (;;) {
        CHECK_RES(fpgaReadMMIO64(hdevice, 0, MMIO_READY_FOR_CMD, &data));
        // the pause follows every read, including the one that sees "ready"
        nanosleep(&sleep_time, NULL);
        if (0x1 == data)
            break;
    }
    return 0;
}
// Copy `size` bytes, starting `src_offset` bytes into the shared buffer, to
// device address `dev_maddr`. Returns 0 on success and -1 on a NULL handle, a
// zero size, a range that exceeds the buffer, or an MMIO failure.
extern int vx_copy_to_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, size_t src_offset) {
    vx_buffer_t* const buffer = (vx_buffer_t*)hbuffer;
    if (NULL == buffer || 0 == size)
        return -1;

    // the source range has to lie inside the buffer
    const size_t range_end = size + src_offset;
    if (range_end > buffer->size)
        return -1;

    // the device must be able to take a command before it is programmed
    const int busy = ready_for_sw(buffer->fpga);
    if (0 != busy)
        return -1;

    // program the transfer, then trigger it by writing the command type
    CHECK_RES(fpgaWriteMMIO64(buffer->fpga, 0, MMIO_COPY_AVM_ADDRESS, dev_maddr));
    CHECK_RES(fpgaWriteMMIO64(buffer->fpga, 0, MMIO_COPY_IO_ADDRESS, (buffer->io_addr + src_offset)/VX_CACHE_LINESIZE));
    CHECK_RES(fpgaWriteMMIO64(buffer->fpga, 0, MMIO_COPY_DATA_SIZE, size));
    CHECK_RES(fpgaWriteMMIO64(buffer->fpga, 0, MMIO_CMD_TYPE, MMIO_CMD_TYPE_WRITE));

    // block until the device is ready again, i.e. the write has completed
    return ready_for_sw(buffer->fpga);
}
// Copy `size` bytes from device address `dev_maddr` into the shared buffer,
// starting `dest_offset` bytes into it. Returns 0 on success and -1 on a NULL
// handle, a zero size, a range that exceeds the buffer, or an MMIO failure.
extern int vx_copy_from_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, size_t dest_offset) {
    vx_buffer_t* const buffer = (vx_buffer_t*)hbuffer;
    if (NULL == buffer || 0 == size)
        return -1;

    // the destination range has to lie inside the buffer
    const size_t range_end = size + dest_offset;
    if (range_end > buffer->size)
        return -1;

    // the device must be able to take a command before it is programmed
    const int busy = ready_for_sw(buffer->fpga);
    if (0 != busy)
        return -1;

    // program the transfer, then trigger it by writing the command type
    CHECK_RES(fpgaWriteMMIO64(buffer->fpga, 0, MMIO_COPY_AVM_ADDRESS, dev_maddr));
    CHECK_RES(fpgaWriteMMIO64(buffer->fpga, 0, MMIO_COPY_IO_ADDRESS, (buffer->io_addr + dest_offset)/VX_CACHE_LINESIZE));
    CHECK_RES(fpgaWriteMMIO64(buffer->fpga, 0, MMIO_COPY_DATA_SIZE, size));
    CHECK_RES(fpgaWriteMMIO64(buffer->fpga, 0, MMIO_CMD_TYPE, MMIO_CMD_TYPE_READ));

    // block until the device is ready again, i.e. the read has completed
    return ready_for_sw(buffer->fpga);
}
// Ask the device to flush its caches for the range [dev_maddr, dev_maddr + size).
//
// Returns 0 on success and -1 on a NULL device handle, a zero size, or an
// MMIO failure. The call blocks until the device accepts commands again.
extern int vx_flush_caches(vx_device_h hdevice, size_t dev_maddr, size_t size) {
    if (NULL == hdevice
     || 0 == size)
        return -1;

    vx_device_t *device = ((vx_device_t*)hdevice);

    // Ensure ready for new command
    if (ready_for_sw(device->fpga) != 0)
        return -1;

    // No host buffer takes part in a flush, so only the device address and the
    // size are programmed before the command is issued.
    // NOTE(review): assumes the SNOOP command ignores MMIO_COPY_IO_ADDRESS -
    // confirm against the AFU implementation.
    CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_COPY_AVM_ADDRESS, dev_maddr));
    CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_COPY_DATA_SIZE, size));
    CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_TYPE, MMIO_CMD_TYPE_SNOOP));

    // Wait for the flush operation to finish
    return ready_for_sw(device->fpga);
}
// Issue the START command once the device accepts commands.
// Returns 0 on success, -1 on a NULL handle or an MMIO failure.
extern int vx_start(vx_device_h hdevice) {
    vx_device_t* const device = (vx_device_t*)hdevice;

    // short-circuit: the readiness poll only runs for a non-NULL handle
    if (NULL == device || 0 != ready_for_sw(device->fpga))
        return -1;

    CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_TYPE, MMIO_CMD_TYPE_START));
    return 0;
}
// Wait until the device reports ready or the timeout expires.
//
// timeout: maximum time to wait in milliseconds; 0 polls exactly once and a
//          negative value waits indefinitely (as the simulator backends do).
// Returns -1 on a NULL handle or an MMIO read failure, 0 otherwise. As
// before, 0 is also returned when the timeout expires before the device
// becomes ready.
extern int vx_ready_wait(vx_device_h hdevice, long long timeout) {
    if (NULL == hdevice)
        return -1;

    vx_device_t *device = ((vx_device_t*)hdevice);

    uint64_t data = 0;
    struct timespec sleep_time;

#ifdef USE_ASE
    sleep_time.tv_sec = 1;
    sleep_time.tv_nsec = 0;
#else
    sleep_time.tv_sec = 0;
    sleep_time.tv_nsec = 1000000;
#endif

    // polling interval expressed in milliseconds
    const long long sleep_time_ms = (sleep_time.tv_sec * 1000) + (sleep_time.tv_nsec / 1000000);

    for (;;) {
        CHECK_RES(fpgaReadMMIO64(device->fpga, 0, MMIO_READY_FOR_CMD, &data));
        // stop as soon as the device is ready or the time budget is used up
        if (0x1 == data || 0 == timeout)
            break;
        nanosleep(&sleep_time, NULL);
        // only a positive budget is consumed; a negative one never reaches 0
        if (timeout > 0)
            timeout = (timeout > sleep_time_ms) ? (timeout - sleep_time_ms) : 0;
    }

    return 0;
}

49
driver/sw/rtlsim/Makefile Normal file
View File

@@ -0,0 +1,49 @@
# Builds libvortex.so on top of a Verilator model of the Vortex RTL.

#CFLAGS += -std=c++11 -O3 -Wall -Wextra -pedantic -Wfatal-errors
CFLAGS += -std=c++11 -g -O0 -Wall -Wextra -pedantic -Wfatal-errors

#USE_MULTICORE=1

CFLAGS += -I../../include -I../../../../rtl/simulate

CFLAGS += -fPIC

CFLAGS += -DUSE_RTLSIM

LDFLAGS += -shared -pthread

# The multi-core build simulates the Vortex_SOC top level instead of Vortex
ifdef USE_MULTICORE
CFLAGS += -DUSE_MULTICORE
RTL_TOP = Vortex_SOC
else
RTL_TOP = Vortex
endif

SRCS = vortex.cpp ../vx_utils.cpp ../../../rtl/simulate/$(RTL_TOP).cpp

RTL_INCLUDE = -I../../../rtl -I../../../rtl/interfaces -I../../../rtl/cache -I../../../rtl/VX_cache -I../../../rtl/shared_memory -I../../../rtl/pipe_regs -I../../../rtl/compat

# Default to half of the available CPUs (at least one) for the Verilated model
THREADS ?= $(shell python3 -c 'import multiprocessing as mp; print(max(1, mp.cpu_count() // 2))')
VL_FLAGS += --threads $(THREADS)

VL_FLAGS += -Wno-UNOPTFLAT -Wno-WIDTH
VL_FLAGS += -Wno-UNDRIVEN --Wno-PINMISSING -Wno-STMTDLY -Wno-WIDTH -Wno-UNSIGNED -Wno-UNOPTFLAT -Wno-LITENDIAN

# Debugging
VL_FLAGS += --trace -DVL_DEBUG=1
CFLAGS += -DVCD_OUTPUT

PROJECT = libvortex.so

all: $(PROJECT)

.PHONY: all clean build_config

build_config:
	(cd ../../../rtl && ./gen_config.py --rtl_locations)

# build_config is phony, so the library is regenerated on every invocation.
# $(MAKE) is used for the recursive step so that command-line flags and
# variables are passed on to the sub-make.
$(PROJECT): $(SRCS) build_config
	verilator --exe --cc $(RTL_TOP).v $(RTL_INCLUDE) $(VL_FLAGS) $(SRCS) -CFLAGS '$(CFLAGS)' -LDFLAGS '$(LDFLAGS)' -o ../$(PROJECT)
	$(MAKE) -j -C obj_dir -f V$(RTL_TOP).mk

clean:
	rm -rf $(PROJECT) obj_dir

327
driver/sw/rtlsim/vortex.cpp Normal file
View File

@@ -0,0 +1,327 @@
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <iostream>
#include <thread>
#include <mutex>
#include <chrono>
#include <vortex.h>
#include <ram.h>
#ifdef USE_MULTICORE
#include <Vortex_SOC.h>
#else
#include <Vortex.h>
#endif
#define PAGE_SIZE 4096
// Evaluates _expr once; unless the result is FPGA_OK, prints the stringified
// expression with the result code and makes the enclosing function return -1.
// NOTE(review): fpga_result / FPGA_OK are OPAE names and no OPAE header is
// included in the visible part of this file; the macro is not invoked in the
// visible code either - it looks like a leftover from the OPAE driver, confirm.
#define CHECK_RES(_expr) \
do { \
fpga_result res = _expr; \
if (res == FPGA_OK) \
break; \
printf("OPAE Error: '%s' returned %d!\n", #_expr, (int)res); \
return -1; \
} while (false)
///////////////////////////////////////////////////////////////////////////////
// Round size up to a whole number of cache lines and return the size in bytes.
static size_t align_size(size_t size) {
    const size_t lines = (size + VX_CACHE_LINESIZE - 1) / VX_CACHE_LINESIZE;
    return VX_CACHE_LINESIZE * lines;
}
///////////////////////////////////////////////////////////////////////////////
class vx_device;
class vx_buffer {
public:
vx_buffer(size_t size, vx_device* device)
: size_(size)
, device_(device) {
auto aligned_asize = align_size(size);
data_ = malloc(aligned_asize);
}
~vx_buffer() {
if (data_) {
free(data_);
}
}
void* data() const {
return data_;
}
size_t size() const {
return size_;
}
vx_device* device() const {
return device_;
}
private:
size_t size_;
vx_device* device_;
void* data_;
};
///////////////////////////////////////////////////////////////////////////////
// Simulated device for the RTL backend.
// Owns the RAM model and the Verilated processor, and steps the processor on
// a background thread from construction until destruction. Every access to
// vortex_ and is_done_ is made while holding mutex_.
class vx_device {
public:
    vx_device()
        : is_done_(false)
        , mem_allocation_(VX_ALLOC_BASE_ADDR)
        , vortex_(&ram_) {
        // started from the constructor body so that every member, including
        // mutex_, is fully constructed before the worker can touch it
        thread_ = std::thread(__thread_proc__, this);
    }

    ~vx_device() {
        {
            std::lock_guard<std::mutex> guard(mutex_);
            is_done_ = true;
        }
        if (thread_.joinable())
            thread_.join();
    }

    // Hand out `size` bytes (rounded up to whole cache lines) of device
    // address space. Returns 0 and stores the address in *dev_maddr, or -1
    // when the request does not fit below VX_LOCAL_MEM_SIZE.
    int alloc_local_mem(size_t size, size_t* dev_maddr) {
        size_t asize = align_size(size);
        // written as a subtraction so the comparison itself cannot overflow
        if (asize < size
         || mem_allocation_ > VX_LOCAL_MEM_SIZE
         || asize > VX_LOCAL_MEM_SIZE - mem_allocation_)
            return -1;
        *dev_maddr = mem_allocation_;
        mem_allocation_ += asize;
        return 0;
    }

    // Copy from host memory at src + src_offset into simulated RAM at dest_addr.
    // NOTE(review): the cache-line aligned size is transferred, which can read
    // past the `size` bytes the caller validated when src_offset is non-zero -
    // confirm whether RAM::write requires aligned sizes.
    int upload(void* src, size_t dest_addr, size_t size, size_t src_offset) {
        size_t asize = align_size(size);
        if (dest_addr + asize > ram_.size())
            return -1;

        /*printf("VXDRV: upload %d bytes to 0x%x\n", size, dest_addr);
        for (int i = 0; i < size; i += 4) {
            printf("mem-write: 0x%x <- 0x%x\n", dest_addr + i, *(uint32_t*)((uint8_t*)src + src_offset + i));
        }*/

        ram_.write(dest_addr, asize, (uint8_t*)src + src_offset);
        return 0;
    }

    // Copy from simulated RAM at src_addr into host memory at dest + dest_offset.
    // NOTE(review): same aligned-size caveat as upload(); dest is written to
    // despite being declared const void*.
    int download(const void* dest, size_t src_addr, size_t size, size_t dest_offset) {
        size_t asize = align_size(size);
        if (src_addr + asize > ram_.size())
            return -1;

        ram_.read(src_addr, asize, (uint8_t*)dest + dest_offset);

        /*printf("VXDRV: download %d bytes from 0x%x\n", size, src_addr);
        for (int i = 0; i < size; i += 4) {
            printf("mem-read: 0x%x -> 0x%x\n", src_addr + i, *(uint32_t*)((uint8_t*)dest + dest_offset + i));
        }*/

        return 0;
    }

    // Forward a cache flush request to the processor model.
    int flush_caches(size_t dev_maddr, size_t size) {
        std::lock_guard<std::mutex> guard(mutex_);
        vortex_.flush_caches(dev_maddr, size);
        return 0;
    }

    // Reset the processor model; the worker thread keeps stepping it afterwards.
    int start() {
        std::lock_guard<std::mutex> guard(mutex_);
        vortex_.reset();
        return 0;
    }

    // Wait until the processor is no longer busy or `timeout` milliseconds
    // have passed. A negative timeout waits indefinitely and 0 checks once.
    // The state is polled at most once per second. Always returns 0.
    int wait(long long timeout) {
        const long long poll_ms = 1000;
        for (;;) {
            bool is_busy;
            {
                std::lock_guard<std::mutex> guard(mutex_);
                is_busy = vortex_.is_busy();
            }
            if (!is_busy || 0 == timeout)
                break;
            // never sleep longer than the remaining budget
            long long slice = poll_ms;
            if (timeout > 0 && timeout < slice)
                slice = timeout;
            std::this_thread::sleep_for(std::chrono::milliseconds(slice));
            if (timeout > 0)
                timeout -= slice;
        }
        return 0;
    }

private:

    // Worker loop: step the processor until the destructor sets is_done_.
    void thread_proc() {
        std::cout << "Device ready..." << std::endl;

        for (;;) {
            {
                std::lock_guard<std::mutex> guard(mutex_);
                if (is_done_)
                    break;
            }
            // the lock is dropped between the check and the step so that API
            // calls get a chance to acquire it
            {
                std::lock_guard<std::mutex> guard(mutex_);
                vortex_.step();
            }
        }

        std::cout << "Device shutdown..." << std::endl;
    }

    static void __thread_proc__(vx_device* device) {
        device->thread_proc();
    }

    bool is_done_;
    size_t mem_allocation_;
    RAM ram_;
#ifdef USE_MULTICORE
    Vortex_SOC vortex_;
#else
    Vortex vortex_;
#endif
    std::thread thread_;
    std::mutex mutex_;
};
///////////////////////////////////////////////////////////////////////////////
// Create a simulated device and return its handle through hdevice.
// Returns 0 on success, -1 when hdevice is NULL.
extern int vx_dev_open(vx_device_h* hdevice) {
    if (hdevice != NULL) {
        *hdevice = new vx_device();
        return 0;
    }
    return -1;
}
// Destroy the simulated device behind the handle.
// Returns 0 on success, -1 when the handle is null.
extern int vx_dev_close(vx_device_h hdevice) {
    vx_device* const dev = static_cast<vx_device*>(hdevice);
    if (dev == nullptr)
        return -1;

    delete dev;
    return 0;
}
// Validate the arguments and let the device hand out device memory.
// Returns the device's result, or -1 on a NULL handle/pointer or a zero size.
extern int vx_alloc_dev_mem(vx_device_h hdevice, size_t size, size_t* dev_maddr) {
    const bool valid = (NULL != hdevice) && (NULL != dev_maddr) && (0 != size);
    if (!valid)
        return -1;

    return static_cast<vx_device*>(hdevice)->alloc_local_mem(size, dev_maddr);
}
// Validate the arguments and forward the flush request to the device.
// Returns the device's result, or -1 on a NULL handle or a zero size.
extern int vx_flush_caches(vx_device_h hdevice, size_t dev_maddr, size_t size) {
    const bool valid = (NULL != hdevice) && (0 != size);
    if (!valid)
        return -1;

    return static_cast<vx_device*>(hdevice)->flush_caches(dev_maddr, size);
}
// Create a host buffer associated with the device and return its handle.
// Returns 0 on success, -1 on invalid arguments or when the buffer storage
// could not be allocated.
extern int vx_alloc_shared_mem(vx_device_h hdevice, size_t size, vx_buffer_h* hbuffer) {
    if (nullptr == hdevice || 0 == size || NULL == hbuffer)
        return -1;

    vx_buffer* const created = new vx_buffer(size, static_cast<vx_device*>(hdevice));

    // the constructor cannot report failure; a null data pointer signals it
    if (created->data() != nullptr) {
        *hbuffer = created;
        return 0;
    }

    delete created;
    return -1;
}
// Return the host storage of a buffer, or null for a null handle.
extern volatile void* vx_host_ptr(vx_buffer_h hbuffer) {
    if (hbuffer != nullptr) {
        vx_buffer* const buf = static_cast<vx_buffer*>(hbuffer);
        return buf->data();
    }
    return nullptr;
}
// Destroy the buffer behind the handle.
// Returns 0 on success, -1 when the handle is null.
extern int vx_buf_release(vx_buffer_h hbuffer) {
    vx_buffer* const buf = static_cast<vx_buffer*>(hbuffer);
    if (buf == nullptr)
        return -1;

    delete buf;
    return 0;
}
// Upload `size` bytes, starting `src_offset` bytes into the buffer, to device
// address `dev_maddr`. Returns the device's result, or -1 on a null handle, a
// zero size or a range that exceeds the buffer.
extern int vx_copy_to_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, size_t src_offset) {
    if (nullptr == hbuffer || 0 == size)
        return -1;

    vx_buffer* const buf = static_cast<vx_buffer*>(hbuffer);

    // the source range has to lie inside the buffer
    const size_t range_end = size + src_offset;
    if (range_end > buf->size())
        return -1;

    vx_device* const dev = buf->device();
    return dev->upload(buf->data(), dev_maddr, size, src_offset);
}
// Download `size` bytes from device address `dev_maddr` into the buffer,
// starting `dest_offset` bytes into it. Returns the device's result, or -1 on
// a null handle, a zero size or a range that exceeds the buffer.
extern int vx_copy_from_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, size_t dest_offset) {
    if (nullptr == hbuffer || 0 == size)
        return -1;

    vx_buffer* const buf = static_cast<vx_buffer*>(hbuffer);

    // the destination range has to lie inside the buffer
    const size_t range_end = size + dest_offset;
    if (range_end > buf->size())
        return -1;

    vx_device* const dev = buf->device();
    return dev->download(buf->data(), dev_maddr, size, dest_offset);
}
// Start execution on the simulated device.
// Returns the device's result, or -1 when the handle is null.
extern int vx_start(vx_device_h hdevice) {
    vx_device* const dev = static_cast<vx_device*>(hdevice);
    return (dev != nullptr) ? dev->start() : -1;
}
// Wait for the simulated device, passing the timeout on unchanged.
// Returns the device's result, or -1 when the handle is null.
extern int vx_ready_wait(vx_device_h hdevice, long long timeout) {
    vx_device* const dev = static_cast<vx_device*>(hdevice);
    return (dev != nullptr) ? dev->wait(timeout) : -1;
}

37
driver/sw/simx/Makefile Normal file
View File

@@ -0,0 +1,37 @@
# Builds libvortex.so on top of the simX software simulator.

CFLAGS += -std=c++11 -O3 -Wall -Wextra -pedantic -Wfatal-errors
#CFLAGS += -std=c++11 -g -O0 -Wall -Wextra -pedantic -Wfatal-errors

# Simulated machine shape; both values can be overridden on the command line
MAX_WARPS ?= 8
MAX_THREADS ?= 4

CFLAGS += -I../../include -I../../../../simX/include

CFLAGS += -fPIC

CFLAGS += -DUSE_SIMX

CFLAGS += -DMAX_WARPS=$(MAX_WARPS) -DMAX_THREADS=$(MAX_THREADS)

LDFLAGS += -shared -pthread

SRCS = vortex.cpp ../vx_utils.cpp ../../../simX/args.cpp ../../../simX/mem.cpp ../../../simX/core.cpp ../../../simX/instruction.cpp ../../../simX/enc.cpp ../../../simX/util.cpp

RTL_TOP = ../../../simX/cache_simX.v

RTL_INCLUDE = -I../../../old_rtl -I../../../old_rtl/interfaces -I../../../old_rtl/cache -I../../../old_rtl/shared_memory

# Default to half of the available CPUs (at least one) for the Verilated model
THREADS ?= $(shell python3 -c 'import multiprocessing as mp; print(max(1, mp.cpu_count() // 2))')
VL_FLAGS += --threads $(THREADS)

VL_FLAGS += -Wno-UNOPTFLAT -Wno-WIDTH

PROJECT = libvortex.so

all: $(PROJECT)

.PHONY: all clean

# $(MAKE) is used for the recursive step so that command-line flags and
# variables are passed on to the sub-make.
$(PROJECT): $(SRCS)
	verilator --exe --cc $(RTL_TOP) $(RTL_INCLUDE) $(VL_FLAGS) $(SRCS) -CFLAGS '$(CFLAGS)' -LDFLAGS '$(LDFLAGS)' -o ../$(PROJECT)
	$(MAKE) -j -C obj_dir -f Vcache_simX.mk

clean:
	rm -rf $(PROJECT) obj_dir

BIN
driver/sw/simx/libvortex.so Executable file

Binary file not shown.

324
driver/sw/simx/vortex.cpp Normal file
View File

@@ -0,0 +1,324 @@
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <iostream>
#include <thread>
#include <mutex>
#include <chrono>
#include <vortex.h>
#include "core.h"
// page size handed to the simX memory unit
#define PAGE_SIZE 4096
// Evaluates _expr once; unless the result is FPGA_OK, prints the stringified
// expression with the result code and makes the enclosing function return -1.
// NOTE(review): fpga_result / FPGA_OK are OPAE names and no OPAE header is
// included in the visible part of this file; the macro is not invoked in the
// visible code either - it looks like a leftover from the OPAE driver, confirm.
#define CHECK_RES(_expr) \
do { \
fpga_result res = _expr; \
if (res == FPGA_OK) \
break; \
printf("OPAE Error: '%s' returned %d!\n", #_expr, (int)res); \
return -1; \
} while (false)
///////////////////////////////////////////////////////////////////////////////
// Smallest multiple of VX_CACHE_LINESIZE that is not below size.
static size_t align_size(size_t size) {
    const size_t padded = size + VX_CACHE_LINESIZE - 1;
    return VX_CACHE_LINESIZE * (padded / VX_CACHE_LINESIZE);
}
///////////////////////////////////////////////////////////////////////////////
class vx_device;
class vx_buffer {
public:
vx_buffer(size_t size, vx_device* device)
: size_(size)
, device_(device) {
auto aligned_asize = align_size(size);
data_ = malloc(aligned_asize);
}
~vx_buffer() {
if (data_) {
free(data_);
}
}
void* data() const {
return data_;
}
size_t size() const {
return size_;
}
vx_device* device() const {
return device_;
}
private:
size_t size_;
vx_device* device_;
void* data_;
};
///////////////////////////////////////////////////////////////////////////////
// Simulated device for the simX backend.
// Owns the RAM model and a worker thread that runs a fresh simX core each
// time start() is called. is_done_ and is_running_ are only accessed while
// holding mutex_.
class vx_device {
public:
    vx_device()
        : is_done_(false)
        , is_running_(false)
        , mem_allocation_(VX_ALLOC_BASE_ADDR)
        , thread_(__thread_proc__, this)
    {}

    ~vx_device() {
        {
            std::lock_guard<std::mutex> guard(mutex_);
            is_done_ = true;
        }
        if (thread_.joinable())
            thread_.join();
    }

    // Hand out `size` bytes (rounded up to whole cache lines) of device
    // address space. Returns 0 and stores the address in *dev_maddr, or -1
    // when the request does not fit below VX_LOCAL_MEM_SIZE.
    int alloc_local_mem(size_t size, size_t* dev_maddr) {
        size_t asize = align_size(size);
        // written as a subtraction so the comparison itself cannot overflow
        if (asize < size
         || mem_allocation_ > VX_LOCAL_MEM_SIZE
         || asize > VX_LOCAL_MEM_SIZE - mem_allocation_)
            return -1;
        *dev_maddr = mem_allocation_;
        mem_allocation_ += asize;
        return 0;
    }

    // Copy from host memory at src + src_offset into simulated RAM at dest_addr.
    // NOTE(review): the cache-line aligned size is transferred, which can read
    // past the `size` bytes the caller validated when src_offset is non-zero -
    // confirm whether Harp::RAM::write requires aligned sizes.
    int upload(void* src, size_t dest_addr, size_t size, size_t src_offset) {
        size_t asize = align_size(size);
        if (dest_addr + asize > ram_.size())
            return -1;

        /*printf("VXDRV: upload %d bytes to 0x%x\n", size, dest_addr);
        for (int i = 0; i < size; i += 4) {
            printf("mem-write: 0x%x <- 0x%x\n", dest_addr + i, *(uint32_t*)((uint8_t*)src + src_offset + i));
        }*/

        ram_.write(dest_addr, asize, (uint8_t*)src + src_offset);
        return 0;
    }

    // Copy from simulated RAM at src_addr into host memory at dest + dest_offset.
    // NOTE(review): same aligned-size caveat as upload(); dest is written to
    // despite being declared const void*.
    int download(const void* dest, size_t src_addr, size_t size, size_t dest_offset) {
        size_t asize = align_size(size);
        if (src_addr + asize > ram_.size())
            return -1;

        ram_.read(src_addr, asize, (uint8_t*)dest + dest_offset);

        /*printf("VXDRV: download %d bytes from 0x%x\n", size, src_addr);
        for (int i = 0; i < size; i += 4) {
            printf("mem-read: 0x%x -> 0x%x\n", src_addr + i, *(uint32_t*)((uint8_t*)dest + dest_offset + i));
        }*/

        return 0;
    }

    // Ask the worker thread to run the simulator.
    int start() {
        std::lock_guard<std::mutex> guard(mutex_);
        is_running_ = true;
        return 0;
    }

    // Wait until the simulator run has finished or `timeout` milliseconds
    // have passed. A negative timeout waits indefinitely and 0 checks once.
    // The state is polled at most once per second. Always returns 0.
    int wait(long long timeout) {
        const long long poll_ms = 1000;
        for (;;) {
            bool is_running;
            {
                std::lock_guard<std::mutex> guard(mutex_);
                is_running = is_running_;
            }
            if (!is_running || 0 == timeout)
                break;
            // never sleep longer than the remaining budget
            long long slice = poll_ms;
            if (timeout > 0 && timeout < slice)
                slice = timeout;
            std::this_thread::sleep_for(std::chrono::milliseconds(slice));
            if (timeout > 0)
                timeout -= slice;
        }
        return 0;
    }

private:

    // Build a core attached to ram_ and step it until it stops running.
    void run() {
        Harp::ArchDef arch("rv32i", false, MAX_WARPS, MAX_THREADS);
        Harp::WordDecoder dec(arch);
        Harp::MemoryUnit mu(PAGE_SIZE, arch.getWordSize(), true);
        Harp::Core core(arch, dec, mu);
        mu.attach(ram_, 0);

        while (core.running()) {
            core.step();
        }
        core.printStats();
    }

    // Worker loop: run the simulator whenever start() was called, until the
    // destructor sets is_done_.
    void thread_proc() {
        std::cout << "Device ready..." << std::endl;

        for (;;) {
            bool is_done;
            bool is_running;
            {
                std::lock_guard<std::mutex> guard(mutex_);
                is_done = is_done_;
                is_running = is_running_;
            }

            if (is_done)
                break;

            if (is_running) {
                std::cout << "Device running..." << std::endl;

                // run() executes without the lock so that wait() can poll
                this->run();

                {
                    std::lock_guard<std::mutex> guard(mutex_);
                    is_running_ = false;
                }

                std::cout << "Device ready..." << std::endl;
            }
        }

        std::cout << "Device shutdown..." << std::endl;
    }

    static void __thread_proc__(vx_device* device) {
        device->thread_proc();
    }

    bool is_done_;
    bool is_running_;
    size_t mem_allocation_;
    Harp::RAM ram_;
    std::mutex mutex_;
    // Declared last on purpose: members are initialized in declaration order,
    // and the worker starts running as soon as thread_ is constructed, so
    // ram_ and mutex_ must already exist at that point.
    std::thread thread_;
};
///////////////////////////////////////////////////////////////////////////////
// Create a simX device and return its handle through hdevice.
// Returns 0 on success, -1 when hdevice is NULL.
extern int vx_dev_open(vx_device_h* hdevice) {
    if (hdevice != NULL) {
        *hdevice = new vx_device();
        return 0;
    }
    return -1;
}
// Destroy the simX device behind the handle.
// Returns 0 on success, -1 when the handle is null.
extern int vx_dev_close(vx_device_h hdevice) {
    vx_device* const dev = static_cast<vx_device*>(hdevice);
    if (dev == nullptr)
        return -1;

    delete dev;
    return 0;
}
// Validate the arguments and let the device hand out device memory.
// Returns the device's result, or -1 on a NULL handle/pointer or a zero size.
extern int vx_alloc_dev_mem(vx_device_h hdevice, size_t size, size_t* dev_maddr) {
    const bool valid = (NULL != hdevice) && (NULL != dev_maddr) && (0 != size);
    if (!valid)
        return -1;

    return static_cast<vx_device*>(hdevice)->alloc_local_mem(size, dev_maddr);
}
// simX does not need a cache flush, so only the arguments are validated.
// Returns 0 for valid arguments, -1 on a NULL handle or a zero size.
extern int vx_flush_caches(vx_device_h hdevice, size_t /*dev_maddr*/, size_t size) {
    const bool valid = (NULL != hdevice) && (0 != size);
    return valid ? 0 : -1;
}
// Create a host buffer associated with the device and return its handle.
// Returns 0 on success, -1 on invalid arguments or when the buffer storage
// could not be allocated.
extern int vx_alloc_shared_mem(vx_device_h hdevice, size_t size, vx_buffer_h* hbuffer) {
    if (nullptr == hdevice || 0 == size || NULL == hbuffer)
        return -1;

    vx_buffer* const created = new vx_buffer(size, static_cast<vx_device*>(hdevice));

    // the constructor cannot report failure; a null data pointer signals it
    if (created->data() != nullptr) {
        *hbuffer = created;
        return 0;
    }

    delete created;
    return -1;
}
// Return the host storage of a buffer, or null for a null handle.
extern volatile void* vx_host_ptr(vx_buffer_h hbuffer) {
    if (hbuffer != nullptr) {
        vx_buffer* const buf = static_cast<vx_buffer*>(hbuffer);
        return buf->data();
    }
    return nullptr;
}
// Destroy the buffer behind the handle.
// Returns 0 on success, -1 when the handle is null.
extern int vx_buf_release(vx_buffer_h hbuffer) {
    vx_buffer* const buf = static_cast<vx_buffer*>(hbuffer);
    if (buf == nullptr)
        return -1;

    delete buf;
    return 0;
}
// Upload `size` bytes, starting `src_offset` bytes into the buffer, to device
// address `dev_maddr`. Returns the device's result, or -1 on a null handle, a
// zero size or a range that exceeds the buffer.
extern int vx_copy_to_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, size_t src_offset) {
    if (nullptr == hbuffer || 0 == size)
        return -1;

    vx_buffer* const buf = static_cast<vx_buffer*>(hbuffer);

    // the source range has to lie inside the buffer
    const size_t range_end = size + src_offset;
    if (range_end > buf->size())
        return -1;

    vx_device* const dev = buf->device();
    return dev->upload(buf->data(), dev_maddr, size, src_offset);
}
// Download `size` bytes from device address `dev_maddr` into the buffer,
// starting `dest_offset` bytes into it. Returns the device's result, or -1 on
// a null handle, a zero size or a range that exceeds the buffer.
extern int vx_copy_from_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, size_t dest_offset) {
    if (nullptr == hbuffer || 0 == size)
        return -1;

    vx_buffer* const buf = static_cast<vx_buffer*>(hbuffer);

    // the destination range has to lie inside the buffer
    const size_t range_end = size + dest_offset;
    if (range_end > buf->size())
        return -1;

    vx_device* const dev = buf->device();
    return dev->download(buf->data(), dev_maddr, size, dest_offset);
}
// Start execution on the simX device.
// Returns the device's result, or -1 when the handle is null.
extern int vx_start(vx_device_h hdevice) {
    vx_device* const dev = static_cast<vx_device*>(hdevice);
    return (dev != nullptr) ? dev->start() : -1;
}
// Wait for the simX device, passing the timeout on unchanged.
// Returns the device's result, or -1 when the handle is null.
extern int vx_ready_wait(vx_device_h hdevice, long long timeout) {
    vx_device* const dev = static_cast<vx_device*>(hdevice);
    return (dev != nullptr) ? dev->wait(timeout) : -1;
}

91
driver/sw/vx_utils.cpp Normal file
View File

@@ -0,0 +1,91 @@
#include <algorithm>
#include <cstdint>
#include <cstring>
#include <fstream>
#include <iostream>
#include <vector>
#include <vortex.h>
// Upload a kernel image to the device at VX_KERNEL_BASE_ADDR.
// The image is staged through one shared buffer of TRANSFER_SIZE bytes and
// copied chunk by chunk. Returns 0 on success, -1 on a NULL/empty image or a
// failed buffer allocation, and otherwise the error of the failing copy.
int vx_upload_kernel_bytes(vx_device_h device, const void* content, size_t size) {
    if (0 == size || NULL == content)
        return -1;

    static constexpr uint32_t TRANSFER_SIZE = 4096;

    // allocate the staging buffer
    vx_buffer_h buffer;
    if (vx_alloc_shared_mem(device, TRANSFER_SIZE, &buffer) != 0)
        return -1;

    // host view of the staging buffer
    auto buf_ptr = (uint8_t*)vx_host_ptr(buffer);
    int err = 0;

#if defined(USE_SIMX)
    {
        auto words = (uint32_t*)buf_ptr;

        // default startup routine, placed at device address 0
        words[0] = 0xf1401073;
        words[1] = 0xf1401073;
        words[2] = 0x30101073;
        words[3] = 0x800000b7;
        words[4] = 0x000080e7;
        err = vx_copy_to_dev(buffer, 0, 5 * 4, 0);

        if (0 == err) {
            // newlib io simulator trap
            words[0] = 0x00008067;
            err = vx_copy_to_dev(buffer, 0x70000000, 4, 0);
        }

        if (err != 0) {
            vx_buf_release(buffer);
            return err;
        }
    }
#endif

    // upload the content one staging buffer at a time; stop at the first error
    for (size_t offset = 0; offset < size && 0 == err; ) {
        auto chunk_size = std::min<size_t>(TRANSFER_SIZE, size - offset);
        std::memcpy(buf_ptr, (uint8_t*)content + offset, chunk_size);
        err = vx_copy_to_dev(buffer, VX_KERNEL_BASE_ADDR + offset, chunk_size, 0);
        offset += chunk_size;
    }

    vx_buf_release(buffer);
    return err;
}
// Read a kernel image from `filename` and upload it to the device.
//
// Returns -1 when filename is NULL, the file cannot be opened, its size
// cannot be determined, it is empty, or it cannot be read completely;
// otherwise returns the result of vx_upload_kernel_bytes.
int vx_upload_kernel_file(vx_device_h device, const char* filename) {
    if (NULL == filename)
        return -1;

    // kernel images are raw bytes: open in binary mode so that no newline
    // translation can alter the content or its length
    std::ifstream ifs(filename, std::ios::in | std::ios::binary);
    if (!ifs) {
        std::cout << "error: " << filename << " not found" << std::endl;
        return -1;
    }

    // get length of file:
    ifs.seekg(0, ifs.end);
    const std::streamoff length = ifs.tellg();
    ifs.seekg(0, ifs.beg);

    // tellg() reports -1 on failure; an empty file has nothing to upload
    if (!ifs || length <= 0) {
        std::cout << "error: " << filename << " is empty or unreadable" << std::endl;
        return -1;
    }

    // the vector releases the storage on every return path
    std::vector<char> content(static_cast<size_t>(length));

    // read file content
    if (!ifs.read(content.data(), static_cast<std::streamsize>(length))) {
        std::cout << "error: failed to read " << filename << std::endl;
        return -1;
    }

    // upload
    return vx_upload_kernel_bytes(device, content.data(), content.size());
}