fpga_synthesis merge

This commit is contained in:
Blaise Tine
2020-06-23 12:41:26 -07:00
1384 changed files with 5533630 additions and 791310 deletions

23
driver/Makefile Normal file
View File

@@ -0,0 +1,23 @@
# Top-level driver Makefile: every backend lives in its own sub-directory and
# is built by delegating to that directory's Makefile.
all: stub

stub:
	$(MAKE) -C stub

opae:
	$(MAKE) -C opae

rtlsim:
	$(MAKE) -C rtlsim

simx:
	$(MAKE) -C simx

# The build target above descends into 'stub', so clean must visit the same
# directory (the original descended into 'dummy', which nothing else here uses).
clean:
	$(MAKE) clean -C stub
	$(MAKE) clean -C opae
	$(MAKE) clean -C rtlsim
	$(MAKE) clean -C simx

# All targets share their name with a directory, so each one must be phony or
# make considers it up to date as soon as the directory exists.
.PHONY: all stub opae rtlsim simx clean

114
driver/common/vx_utils.cpp Normal file
View File

@@ -0,0 +1,114 @@
#include <iostream>
#include <fstream>
#include <cstring>
#include <vortex.h>
#include <VX_config.h>
// Returns the value of the device capability selected by caps_id (one of the
// VX_CAPS_* ids). An unknown id is reported on stdout and aborts the process.
// NOTE(review): 0xffffffff and 0x80000000 do not fit in a positive 32-bit int,
// so callers that widen the result may observe a negative value - confirm the
// intended return type.
extern int vx_dev_caps(int caps_id) {
    int value = 0;
    switch (caps_id) {
    case VX_CAPS_VERSION:
        value = 0;
        break;
    case VX_CAPS_MAX_CORES:
        value = NUM_CORES;
        break;
    case VX_CAPS_MAX_WARPS:
        value = NUM_WARPS;
        break;
    case VX_CAPS_MAX_THREADS:
        value = NUM_THREADS;
        break;
    case VX_CAPS_CACHE_LINESIZE:
        value = 64;
        break;
    case VX_CAPS_LOCAL_MEM_SIZE:
        value = 0xffffffff;
        break;
    case VX_CAPS_ALLOC_BASE_ADDR:
        value = 0x10000000;
        break;
    case VX_CAPS_KERNEL_BASE_ADDR:
        value = 0x80000000;
        break;
    default:
        // unknown capability: treated as a programming error
        std::cout << "invalid caps id: " << caps_id << std::endl;
        std::abort();
        break;
    }
    return value;
}
// Uploads `size` bytes starting at `content` to the device, beginning at the
// kernel base address reported by vx_dev_caps(VX_CAPS_KERNEL_BASE_ADDR).
// The data is staged through a temporary 64 KB shared buffer and copied in
// chunks. Returns 0 on success and a non-zero value on failure (invalid
// arguments, buffer allocation failure, or a failed device copy).
extern int vx_upload_kernel_bytes(vx_device_h device, const void* content, size_t size) {
    if (NULL == content || 0 == size)
        return -1;

    const uint32_t buffer_transfer_size = 65536;
    const uint32_t kernel_base_addr = vx_dev_caps(VX_CAPS_KERNEL_BASE_ADDR);

    // allocate device buffer
    vx_buffer_h buffer;
    int err = vx_alloc_shared_mem(device, buffer_transfer_size, &buffer);
    if (err != 0)
        return -1;

    // get buffer address; vx_host_ptr() can return null, and the staging
    // writes below would then dereference it
    auto buf_ptr = (uint8_t*)vx_host_ptr(buffer);
    if (NULL == buf_ptr) {
        vx_buf_release(buffer);
        return -1;
    }

#if defined(USE_SIMX)
    // default startup routine
    ((uint32_t*)buf_ptr)[0] = 0xf1401073;
    ((uint32_t*)buf_ptr)[1] = 0xf1401073;
    ((uint32_t*)buf_ptr)[2] = 0x30101073;
    ((uint32_t*)buf_ptr)[3] = 0x800000b7;
    ((uint32_t*)buf_ptr)[4] = 0x000080e7;
    err = vx_copy_to_dev(buffer, 0, 5 * 4, 0);
    if (err != 0) {
        vx_buf_release(buffer);
        return err;
    }

    // newlib io simulator trap
    ((uint32_t*)buf_ptr)[0] = 0x00008067;
    err = vx_copy_to_dev(buffer, 0x70000000, 4, 0);
    if (err != 0) {
        vx_buf_release(buffer);
        return err;
    }
#endif

    //
    // upload content, one staging buffer at a time
    //
    size_t offset = 0;
    while (offset < size) {
        auto chunk_size = std::min<size_t>(buffer_transfer_size, size - offset);
        std::memcpy(buf_ptr, (uint8_t*)content + offset, chunk_size);
        err = vx_copy_to_dev(buffer, kernel_base_addr + offset, chunk_size, 0);
        if (err != 0) {
            vx_buf_release(buffer);
            return err;
        }
        offset += chunk_size;
    }

    vx_buf_release(buffer);
    return 0;
}
// Reads the whole file `filename` into memory and uploads it with
// vx_upload_kernel_bytes(). Returns 0 on success and a non-zero value if the
// file cannot be opened, is empty, cannot be read completely, or the upload
// fails.
extern int vx_upload_kernel_file(vx_device_h device, const char* filename) {
    if (NULL == filename)
        return -1;

    // kernel images are raw bytes: open in binary mode so no platform
    // performs newline translation on them
    std::ifstream ifs(filename, std::ios::in | std::ios::binary);
    if (!ifs) {
        std::cout << "error: " << filename << " not found" << std::endl;
        return -1;
    }

    // determine the file size; tellg() yields -1 on failure and must not
    // reach the allocation below
    ifs.seekg(0, ifs.end);
    const std::streamoff size = ifs.tellg();
    if (size <= 0) {
        std::cout << "error: " << filename << " is empty or unreadable" << std::endl;
        return -1;
    }

    // read file content
    auto content = new char [size];
    ifs.seekg(0, ifs.beg);
    ifs.read(content, size);

    int err;
    if (ifs.gcount() != size) {
        // short read: do not upload a partially filled buffer
        std::cout << "error: failed to read " << filename << std::endl;
        err = -1;
    } else {
        // upload
        err = vx_upload_kernel_bytes(device, content, static_cast<size_t>(size));
    }

    // release buffer
    delete[] content;

    return err;
}

78
driver/include/vortex.h Normal file
View File

@@ -0,0 +1,78 @@
#ifndef __VX_DRIVER_H__
#define __VX_DRIVER_H__
#include <stddef.h>
#ifdef __cplusplus
extern "C" {
#endif
// opaque handles to an opened device and to a host/device shared buffer
typedef void* vx_device_h;
typedef void* vx_buffer_h;
// device caps ids (accepted by vx_dev_caps)
#define VX_CAPS_VERSION 0x0
#define VX_CAPS_MAX_CORES 0x1
#define VX_CAPS_MAX_WARPS 0x2
#define VX_CAPS_MAX_THREADS 0x3
#define VX_CAPS_CACHE_LINESIZE 0x4
#define VX_CAPS_LOCAL_MEM_SIZE 0x5
#define VX_CAPS_ALLOC_BASE_ADDR 0x6
#define VX_CAPS_KERNEL_BASE_ADDR 0x7
// return device configurations
int vx_dev_caps(int caps_id);
// open the device and connect to it
int vx_dev_open(vx_device_h* hdevice);
// Close the device when all the operations are done
int vx_dev_close(vx_device_h hdevice);
// Allocate shared buffer with device
int vx_alloc_shared_mem(vx_device_h hdevice, size_t size, vx_buffer_h* hbuffer);
// Get host pointer address
volatile void* vx_host_ptr(vx_buffer_h hbuffer);
// release buffer
int vx_buf_release(vx_buffer_h hbuffer);
// allocate device memory and return address
int vx_alloc_dev_mem(vx_device_h hdevice, size_t size, size_t* dev_maddr);
// Flush the device caches for a range of device local memory
int vx_flush_caches(vx_device_h hdevice, size_t dev_maddr, size_t size);
// Copy bytes from buffer to device local memory
int vx_copy_to_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, size_t src_offset);
// Copy bytes from device local memory to buffer
int vx_copy_from_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, size_t dst_offset);
// Start device execution
int vx_start(vx_device_h hdevice);
// Wait for device ready with milliseconds timeout
int vx_ready_wait(vx_device_h hdevice, long long timeout);
// set device constant registers
// NOTE: the identifier is spelled "regiters"; callers must use that spelling
int vx_set_regiters(int state, int value);
// get device constant registers
// NOTE: the identifier is spelled "regiters"; callers must use that spelling
int vx_get_regiters(int state, int* value);
////////////////////////////// UTILITY FUNCTIONS //////////////////////////////
// upload kernel bytes to device
int vx_upload_kernel_bytes(vx_device_h device, const void* content, size_t size);
// upload kernel file to device
int vx_upload_kernel_file(vx_device_h device, const char* filename);
#ifdef __cplusplus
}
#endif
#endif // __VX_DRIVER_H__

69
driver/opae/Makefile Normal file
View File

@@ -0,0 +1,69 @@
# Builds the OPAE flavour of libvortex.so twice: once against the FPGA runtime
# (libopae-c) and once against the ASE simulation runtime (libopae-c-ase).
CXXFLAGS += -std=c++11 -O0 -g -Wall -Wextra -pedantic -Wfatal-errors
CXXFLAGS += -I../include -I/tools/opae/1.4.0/include -I../../hw

LDFLAGS += -L/tools/opae/1.4.0/lib

# stack execution protection
LDFLAGS +=-z noexecstack

# data relocation and projection
LDFLAGS +=-z relro -z now

# stack buffer overrun detection
CXXFLAGS +=-fstack-protector

# Position independent code
CXXFLAGS += -fPIC

# Enable scope analyzer
#CXXFLAGS += -DSCOPE

LDFLAGS += -luuid
LDFLAGS += -shared

FPGA_LIBS += -lopae-c
ASE_LIBS += -lopae-c-ase

LIB_DIR=../lib
ASE_DIR = ase

PROJECT = libvortex.so
PROJECT_ASE = $(ASE_DIR)/libvortex.so

AFU_JSON_INFO = vortex_afu.h

SRCS = vortex.cpp scope.cpp ../common/vx_utils.cpp

all: $(PROJECT) $(PROJECT_ASE)

# AFU info from JSON file, including AFU UUID
$(AFU_JSON_INFO): ../../hw/opae/vortex_afu.json
	afu_json_mgr json-info --afu-json=$^ --c-hdr=$@

$(PROJECT): $(SRCS)
	$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) $(FPGA_LIBS) -o $@

# The output directory is an order-only prerequisite: its timestamp changes
# whenever a file is created inside it and must not force a relink.
$(PROJECT_ASE): $(SRCS) | $(ASE_DIR)
	$(CXX) $(CXXFLAGS) -DUSE_ASE $(SRCS) $(LDFLAGS) $(ASE_LIBS) -o $@

vortex.o: vortex.cpp $(AFU_JSON_INFO)
	$(CXX) $(CXXFLAGS) -c vortex.cpp -o $@

$(ASE_DIR):
	mkdir -p $(ASE_DIR)

.depend: $(SRCS) $(AFU_JSON_INFO)
	$(CXX) $(CXXFLAGS) -MM $(SRCS) > .depend;

clean:
	rm -rf $(PROJECT) $(PROJECT_ASE) $(AFU_JSON_INFO) *.o .depend

# 'all' and 'clean' never produce files with those names
.PHONY: all clean

ifneq ($(MAKECMDGOALS),clean)
-include .depend
endif

259
driver/opae/scope.cpp Normal file
View File

@@ -0,0 +1,259 @@
#include <iostream>
#include <fstream>
#include <thread>
#include <chrono>
#include <vector>
#include <assert.h>
#include <VX_config.h>
#include "scope.h"
#include "vortex_afu.h"
// Evaluates an OPAE call exactly once. Any result other than FPGA_OK is
// printed together with the failing expression and its error string, and the
// enclosing function returns -1.
#define CHECK_RES(_expr)                                      \
    do {                                                      \
        fpga_result res = _expr;                              \
        if (res != FPGA_OK) {                                 \
            printf("OPAE Error: '%s' returned %d, %s!\n",     \
                   #_expr, (int)res, fpgaErrStr(res));        \
            return -1;                                        \
        }                                                     \
    } while (false)
// MMIO offsets of the scope command and data CSRs, derived from the AFU_IMAGE_*
// values scaled by 4 (presumably a 32-bit word index turned into a byte
// offset - confirm against the AFU definition).
#define MMIO_CSR_SCOPE_CMD (AFU_IMAGE_MMIO_CSR_SCOPE_CMD * 4)
#define MMIO_CSR_SCOPE_DATA (AFU_IMAGE_MMIO_CSR_SCOPE_DATA * 4)
// One traced signal: its width in bits and the name written to the VCD file.
struct scope_signal_t {
int width;
const char* name;
};
// Compile-time floor(log2(n)); yields 0 for every n <= 1.
// Kept as a single return statement so it remains a valid C++11 constexpr.
constexpr int ilog2(int n) {
    return (n <= 1) ? 0 : (ilog2(n / 2) + 1);
}
// Width in bits of a warp index, derived from NUM_WARPS.
static constexpr int NW_BITS = ilog2(NUM_WARPS);
// Table of the signals captured in one scope frame.
// vx_scope_stop() numbers the VCD variables 1..num_signals in table order,
// decodes each frame starting from the LAST entry, and requires the sum of all
// widths to equal the frame width reported by the device.
static const scope_signal_t scope_signals[] = {
{ 32, "dram_req_addr" },
{ 1, "dram_req_rw" },
{ 16, "dram_req_byteen" },
{ 32, "dram_req_data" },
{ 29, "dram_req_tag" },
{ 32, "dram_rsp_data" },
{ 29, "dram_rsp_tag" },
{ 32, "snp_req_addr" },
{ 1, "snp_req_invalidate" },
{ 16, "snp_req_tag" },
{ 16, "snp_rsp_tag" },
{ NW_BITS, "icache_req_warp_num" },
{ 32, "icache_req_addr" },
{ NW_BITS, "icache_req_tag" },
{ 32, "icache_rsp_data" },
{ NW_BITS, "icache_rsp_tag" },
{ NW_BITS, "dcache_req_warp_num" },
{ 32, "dcache_req_curr_PC" },
{ 32, "dcache_req_addr" },
{ 1, "dcache_req_rw" },
{ 4, "dcache_req_byteen" },
{ 32, "dcache_req_data" },
{ NW_BITS, "dcache_req_tag" },
{ 32, "dcache_rsp_data" },
{ NW_BITS, "dcache_rsp_tag" },
{ NW_BITS, "decode_warp_num" },
{ 32, "decode_curr_PC" },
{ 1, "decode_is_jal" },
{ 5, "decode_rs1" },
{ 5, "decode_rs2" },
{ NW_BITS, "execute_warp_num" },
{ 5, "execute_rd" },
{ 32, "execute_a" },
{ 32, "execute_b" },
{ NW_BITS, "writeback_warp_num" },
{ 2, "writeback_wb" },
{ 5, "writeback_rd" },
{ 32, "writeback_data" },
///////////////////////////////////////////////////////////////////////////
// valid/ready and stall flags follow the data signals above
{ 1, "dram_req_valid" },
{ 1, "dram_req_ready" },
{ 1, "dram_rsp_valid" },
{ 1, "dram_rsp_ready" },
{ 1, "snp_req_valid" },
{ 1, "snp_req_ready" },
{ 1, "snp_rsp_valid" },
{ 1, "snp_rsp_ready" },
{ 1, "icache_req_valid" },
{ 1, "icache_req_ready" },
{ 1, "icache_rsp_valid" },
{ 1, "icache_rsp_ready" },
{ NUM_THREADS, "dcache_req_valid" },
{ 1, "dcache_req_ready" },
{ NUM_THREADS, "dcache_rsp_valid" },
{ 1, "dcache_rsp_ready" },
{ NUM_THREADS, "decode_valid" },
{ NUM_THREADS, "execute_valid" },
{ NUM_THREADS, "writeback_valid" },
{ 1, "schedule_delay" },
{ 1, "memory_delay" },
{ 1, "exec_delay" },
{ 1, "gpr_stage_delay" },
{ 1, "busy" },
};
// Number of entries in scope_signals.
static const int num_signals = sizeof(scope_signals) / sizeof(scope_signal_t);
// Programs the scope start delay. A delay of uint64_t(-1) sends no command.
// Returns 0 on success, -1 when hfpga is null or the MMIO write fails.
int vx_scope_start(fpga_handle hfpga, uint64_t delay) {
    if (nullptr == hfpga)
        return -1;

    // the all-ones value means "do not issue a start command"
    if (uint64_t(-1) == delay)
        return 0;

    // set start delay: command code 4 in the low 3 bits, delay above them
    const uint64_t start_cmd = (delay << 3) | 4;
    CHECK_RES(fpgaWriteMMIO64(hfpga, 0, MMIO_CSR_SCOPE_CMD, start_cmd));
    std::cout << "scope start delay: " << delay << std::endl;
    return 0;
}
// Optionally issues the scope stop command, waits until the device reports
// valid captured data, then reads every frame over MMIO and writes it as a
// VCD trace to "vx_scope.vcd".
// A delay of uint64_t(-1) skips the stop command.
// Returns 0 on success and -1 when hfpga is null or a CHECK_RES-guarded MMIO
// access fails; aborts the process when the device frame width differs from
// the total width of scope_signals.
int vx_scope_stop(fpga_handle hfpga, uint64_t delay) {
if (nullptr == hfpga)
return -1;
if (delay != uint64_t(-1)) {
// stop recording
uint64_t cmd_stop = ((delay << 3) | 5);
CHECK_RES(fpgaWriteMMIO64(hfpga, 0, MMIO_CSR_SCOPE_CMD, cmd_stop));
std::cout << "scope stop delay: " << delay << std::endl;
}
// VCD header: variable 0 is the clock, variables 1..num_signals follow the table order
std::ofstream ofs("vx_scope.vcd");
ofs << "$timescale 1 ns $end" << std::endl;
ofs << "$var reg 1 0 clk $end" << std::endl;
// fwidth accumulates the frame width expected from the signal table
int fwidth = 0;
for (int i = 0; i < num_signals; ++i) {
ofs << "$var reg " << scope_signals[i].width << " " << (i+1) << " " << scope_signals[i].name << " $end" << std::endl;
fwidth += scope_signals[i].width;
}
ofs << "enddefinitions $end" << std::endl;
uint64_t frame_width, max_frames, data_valid;
// wait for recording to terminate
// (command 0 selects the data-valid flag on the data CSR; polled once per second, no timeout)
CHECK_RES(fpgaWriteMMIO64(hfpga, 0, MMIO_CSR_SCOPE_CMD, 0));
do {
CHECK_RES(fpgaReadMMIO64(hfpga, 0, MMIO_CSR_SCOPE_DATA, &data_valid));
if (data_valid)
break;
std::this_thread::sleep_for(std::chrono::seconds(1));
} while (true);
std::cout << "scope trace dump begin..." << std::endl;
// command 2: the next data read returns the frame width
CHECK_RES(fpgaWriteMMIO64(hfpga, 0, MMIO_CSR_SCOPE_CMD, 2));
CHECK_RES(fpgaReadMMIO64(hfpga, 0, MMIO_CSR_SCOPE_DATA, &frame_width));
std::cout << "scope::frame_width=" << std::dec << frame_width << std::endl;
// command 3: the next data read returns the number of captured frames
CHECK_RES(fpgaWriteMMIO64(hfpga, 0, MMIO_CSR_SCOPE_CMD, 3));
CHECK_RES(fpgaReadMMIO64(hfpga, 0, MMIO_CSR_SCOPE_DATA, &max_frames));
std::cout << "scope::max_frames=" << std::dec << max_frames << std::endl;
// command 1: subsequent data reads return the trace stream
CHECK_RES(fpgaWriteMMIO64(hfpga, 0, MMIO_CSR_SCOPE_CMD, 1));
// the table in this file must describe exactly the frame layout of the device
if (fwidth != (int)frame_width) {
std::cerr << "invalid frame_width: expecting " << std::dec << fwidth << "!" << std::endl;
std::abort();
}
// scratch buffer for one signal rendered as a binary string (+1 for the terminator)
std::vector<char> signal_data(frame_width+1);
uint64_t frame_offset = 0;
uint64_t frame_no = 0;
uint64_t timestamp = 0;
int signal_id = 0;
int signal_offset = 0;
// Emits one clock cycle (low then high), reads a delta word from the device
// and emits that many additional clock cycles, then restarts signal decoding
// at the last table entry.
// NOTE: the MMIO read in here is only checked by assert().
auto print_header = [&] () {
ofs << '#' << timestamp++ << std::endl;
ofs << "b0 0" << std::endl;
ofs << '#' << timestamp++ << std::endl;
ofs << "b1 0" << std::endl;
uint64_t delta;
fpga_result res = fpgaReadMMIO64(hfpga, 0, MMIO_CSR_SCOPE_DATA, &delta);
assert(res == FPGA_OK);
while (delta != 0) {
ofs << '#' << timestamp++ << std::endl;
ofs << "b0 0" << std::endl;
ofs << '#' << timestamp++ << std::endl;
ofs << "b1 0" << std::endl;
--delta;
}
signal_id = num_signals;
};
print_header();
// outer loop: one 64-bit data word per iteration, until all frames are consumed
do {
if (frame_no == (max_frames-1)) {
// verify last frame is valid
CHECK_RES(fpgaWriteMMIO64(hfpga, 0, MMIO_CSR_SCOPE_CMD, 0));
CHECK_RES(fpgaReadMMIO64(hfpga, 0, MMIO_CSR_SCOPE_DATA, &data_valid));
assert(data_valid == 1);
CHECK_RES(fpgaWriteMMIO64(hfpga, 0, MMIO_CSR_SCOPE_CMD, 1));
}
uint64_t word;
CHECK_RES(fpgaReadMMIO64(hfpga, 0, MMIO_CSR_SCOPE_DATA, &word));
// inner loop: consume the word bit by bit, starting at bit 0
do {
int signal_width = scope_signals[signal_id-1].width;
int word_offset = frame_offset % 64;
// bits arrive least-significant first; store them so the string reads MSB first
signal_data[signal_width - signal_offset - 1] = ((word >> word_offset) & 0x1) ? '1' : '0';
++signal_offset;
++frame_offset;
if (signal_offset == signal_width) {
signal_data[signal_width] = 0; // string null termination
ofs << 'b' << signal_data.data() << ' ' << signal_id << std::endl;
signal_offset = 0;
--signal_id;
}
if (frame_offset == frame_width) {
// end of frame: a signal must not straddle two frames
assert(0 == signal_offset);
frame_offset = 0;
++frame_no;
if (frame_no != max_frames) {
print_header();
}
}
} while ((frame_offset % 64) != 0);
} while (frame_no != max_frames);
std::cout << "scope trace dump done! - " << (timestamp/2) << " cycles" << std::endl;
// verify data not valid
CHECK_RES(fpgaWriteMMIO64(hfpga, 0, MMIO_CSR_SCOPE_CMD, 0));
CHECK_RES(fpgaReadMMIO64(hfpga, 0, MMIO_CSR_SCOPE_DATA, &data_valid));
assert(data_valid == 0);
return 0;
}

7
driver/opae/scope.h Normal file
View File

@@ -0,0 +1,7 @@
#pragma once
#include <opae/fpga.h>
// Programs the scope start delay on the given FPGA handle.
// The default delay of -1 converts to uint64_t(-1), for which no start command is issued.
int vx_scope_start(fpga_handle hfpga, uint64_t delay = -1);
// Stops the scope (unless delay is the default -1) and dumps the captured trace to "vx_scope.vcd".
int vx_scope_stop(fpga_handle hfpga, uint64_t delay = -1);

408
driver/opae/vortex.cpp Executable file
View File

@@ -0,0 +1,408 @@
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <assert.h>
#include <cmath>
#include <uuid/uuid.h>
#include <opae/fpga.h>
#include <vortex.h>
#include "vortex_afu.h"
#ifdef SCOPE
#include "scope.h"
#endif
// Evaluates an OPAE call exactly once. Any result other than FPGA_OK is
// printed together with the failing expression and its error string, and the
// enclosing function returns -1.
#define CHECK_RES(_expr)                                      \
    do {                                                      \
        fpga_result res = _expr;                              \
        if (res != FPGA_OK) {                                 \
            printf("OPAE Error: '%s' returned %d, %s!\n",     \
                   #_expr, (int)res, fpgaErrStr(res));        \
            return -1;                                        \
        }                                                     \
    } while (false)
///////////////////////////////////////////////////////////////////////////////
// Command codes written to the command CSR; values come from the generated AFU header.
#define CMD_TYPE_READ AFU_IMAGE_CMD_TYPE_READ
#define CMD_TYPE_WRITE AFU_IMAGE_CMD_TYPE_WRITE
#define CMD_TYPE_RUN AFU_IMAGE_CMD_TYPE_RUN
#define CMD_TYPE_CLFLUSH AFU_IMAGE_CMD_TYPE_CLFLUSH
// MMIO offsets of the control/status registers: the AFU_IMAGE_* values scaled by 4
// (presumably a 32-bit word index turned into a byte offset - confirm against the AFU definition).
#define MMIO_CSR_CMD (AFU_IMAGE_MMIO_CSR_CMD * 4)
#define MMIO_CSR_IO_ADDR (AFU_IMAGE_MMIO_CSR_IO_ADDR * 4)
#define MMIO_CSR_MEM_ADDR (AFU_IMAGE_MMIO_CSR_MEM_ADDR * 4)
#define MMIO_CSR_DATA_SIZE (AFU_IMAGE_MMIO_CSR_DATA_SIZE * 4)
#define MMIO_CSR_STATUS (AFU_IMAGE_MMIO_CSR_STATUS * 4)
///////////////////////////////////////////////////////////////////////////////
// Driver-side state of an opened device.
typedef struct vx_device_ {
// OPAE handle of the opened accelerator
fpga_handle fpga;
// next device address handed out by vx_alloc_dev_mem (bump allocation, never released)
size_t mem_allocation;
} vx_device_t;
// Driver-side state of a buffer shared between host and accelerator.
typedef struct vx_buffer_ {
// workspace id returned by fpgaPrepareBuffer; used to release the buffer
uint64_t wsid;
// host address of the buffer
volatile void* host_ptr;
// address of the buffer as returned by fpgaGetIOAddress
uint64_t io_addr;
// device the buffer was allocated on
vx_device_h hdevice;
// buffer size in bytes, rounded up to the cache line size
size_t size;
} vx_buffer_t;
// Rounds `size` up to the next multiple of `alignment`.
// `alignment` must be a power of two (checked by assert).
inline size_t align_size(size_t size, size_t alignment) {
    const size_t mask = alignment - 1;
    assert((alignment & mask) == 0);
    return (size + mask) & ~mask;
}
// Tells whether `addr` is a multiple of `alignment`.
// `alignment` must be a power of two (checked by assert).
inline bool is_aligned(size_t addr, size_t alignment) {
    const size_t mask = alignment - 1;
    assert((alignment & mask) == 0);
    return (addr & mask) == 0;
}
///////////////////////////////////////////////////////////////////////////////
// Opens the first accelerator whose GUID matches AFU_ACCEL_UUID and returns a
// newly allocated device object through *hdevice.
// Returns 0 on success and a non-zero value on failure; on failure *hdevice is
// left untouched and every OPAE resource acquired here has been released.
extern int vx_dev_open(vx_device_h* hdevice) {
    if (nullptr == hdevice)
        return -1;

    // ensure that the block size 64
    assert(64 == vx_dev_caps(VX_CAPS_CACHE_LINESIZE));

    fpga_properties filter = nullptr;
    fpga_guid guid;
    fpga_token accel_token;
    fpga_handle accel_handle;
    uint32_t num_matches = 0;

    // Set up a filter that will search for an accelerator
    if (FPGA_OK != fpgaGetProperties(nullptr, &filter))
        return -1;

    // Add the object type and the desired UUID to the filter
    const bool filter_ok =
           (FPGA_OK == fpgaPropertiesSetObjectType(filter, FPGA_ACCELERATOR))
        && (0 == uuid_parse(AFU_ACCEL_UUID, guid))
        && (FPGA_OK == fpgaPropertiesSetGUID(filter, guid));

    // Do the search across the available FPGA contexts. The token is only
    // valid when the call succeeds AND reports at least one match.
    const bool found = filter_ok
        && (FPGA_OK == fpgaEnumerate(&filter, 1, &accel_token, 1, &num_matches))
        && (num_matches >= 1);

    // Not needed anymore
    fpgaDestroyProperties(&filter);

    if (!found) {
        fprintf(stderr, "Accelerator %s not found!\n", AFU_ACCEL_UUID);
        return -1;
    }

    // Open accelerator
    const fpga_result res = fpgaOpen(accel_token, &accel_handle, 0);

    // Done with token, whether or not the open succeeded
    fpgaDestroyToken(&accel_token);

    if (FPGA_OK != res)
        return -1;

    // allocate device object
    vx_device_t* device = (vx_device_t*)malloc(sizeof(vx_device_t));
    if (nullptr == device) {
        fpgaClose(accel_handle);
        return -1;
    }

    device->fpga = accel_handle;
    device->mem_allocation = vx_dev_caps(VX_CAPS_ALLOC_BASE_ADDR);

#ifdef SCOPE
    {
        int ret = vx_scope_start(device->fpga, 0);
        if (ret != 0) {
            // do not hand a half-initialized device to the caller
            fpgaClose(accel_handle);
            free(device);
            return ret;
        }
    }
#endif

    *hdevice = device;
    return 0;
}
// Closes the accelerator behind hdevice and frees the device object.
// With SCOPE defined, the scope trace is stopped and dumped first.
// Returns -1 for a null handle, 0 otherwise.
extern int vx_dev_close(vx_device_h hdevice) {
    vx_device_t* const dev = (vx_device_t*)hdevice;
    if (nullptr == dev)
        return -1;

#ifdef SCOPE
    vx_scope_stop(dev->fpga, 0);
#endif

    fpgaClose(dev->fpga);
    free(dev);

    return 0;
}
// Reserves `size` bytes (rounded up to the cache line size) of device local
// memory and returns the start address through *dev_maddr. Allocation is a
// simple bump of device->mem_allocation; nothing is ever released.
// Returns 0 on success, -1 on invalid arguments or when the request does not
// fit below the device memory size.
extern int vx_alloc_dev_mem(vx_device_h hdevice, size_t size, size_t* dev_maddr) {
    if (nullptr == hdevice
     || nullptr == dev_maddr
     || 0 == size)
        return -1;

    vx_device_t *device = ((vx_device_t*)hdevice);

    int line_size = vx_dev_caps(VX_CAPS_CACHE_LINESIZE);

    // vx_dev_caps() returns int: go through uint32_t so that 0xffffffff is
    // not sign-extended to SIZE_MAX, which would disable the check below
    size_t dev_mem_size = (uint32_t)vx_dev_caps(VX_CAPS_LOCAL_MEM_SIZE);

    size_t asize = align_size(size, line_size);
    if (asize < size)
        return -1; // rounding up wrapped around

    // overflow-safe form of: mem_allocation + asize > dev_mem_size
    if (device->mem_allocation > dev_mem_size
     || asize > dev_mem_size - device->mem_allocation)
        return -1;

    *dev_maddr = device->mem_allocation;
    device->mem_allocation += asize;

    return 0;
}
// Allocates a host buffer shared with the accelerator, sized to `size` rounded
// up to the cache line size, and returns a buffer object through *hbuffer.
// Returns 0 on success, -1 on invalid arguments or any allocation failure.
extern int vx_alloc_shared_mem(vx_device_h hdevice, size_t size, vx_buffer_h* hbuffer) {
    if (nullptr == hdevice || 0 == size || nullptr == hbuffer)
        return -1;

    vx_device_t* const dev = (vx_device_t*)hdevice;
    const size_t aligned = align_size(size, vx_dev_caps(VX_CAPS_CACHE_LINESIZE));

    void* host = nullptr;
    uint64_t workspace = 0;
    if (FPGA_OK != fpgaPrepareBuffer(dev->fpga, aligned, &host, &workspace, 0))
        return -1;

    // Get the physical address of the buffer in the accelerator
    uint64_t device_side_addr = 0;
    if (FPGA_OK != fpgaGetIOAddress(dev->fpga, workspace, &device_side_addr)) {
        fpgaReleaseBuffer(dev->fpga, workspace);
        return -1;
    }

    // allocate buffer object
    vx_buffer_t* const buf = (vx_buffer_t*)malloc(sizeof(vx_buffer_t));
    if (nullptr == buf) {
        fpgaReleaseBuffer(dev->fpga, workspace);
        return -1;
    }

    buf->wsid     = workspace;
    buf->host_ptr = host;
    buf->io_addr  = device_side_addr;
    buf->hdevice  = hdevice;
    buf->size     = aligned;

    *hbuffer = buf;
    return 0;
}
// Returns the host address of a shared buffer, or nullptr for a null handle.
extern volatile void* vx_host_ptr(vx_buffer_h hbuffer) {
    vx_buffer_t* const buf = (vx_buffer_t*)hbuffer;
    return (nullptr == buf) ? nullptr : buf->host_ptr;
}
// Releases the shared buffer on its owning device and frees the buffer object.
// Returns -1 for a null handle, 0 otherwise.
extern int vx_buf_release(vx_buffer_h hbuffer) {
    vx_buffer_t* const buf = (vx_buffer_t*)hbuffer;
    if (nullptr == buf)
        return -1;

    vx_device_t* const owner = (vx_device_t*)buf->hdevice;
    fpgaReleaseBuffer(owner->fpga, buf->wsid);
    free(buf);

    return 0;
}
// Polls the status CSR until it reads 0 or the timeout (in milliseconds) runs
// out. A negative timeout never expires. The poll interval is 1 s when built
// with USE_ASE and 1 ms otherwise.
// Returns -1 for a null handle or a failed MMIO read, 0 otherwise. A timeout
// is only reported on stdout and still returns 0.
extern int vx_ready_wait(vx_device_h hdevice, long long timeout) {
    if (nullptr == hdevice)
        return -1;

    vx_device_t *device = ((vx_device_t*)hdevice);

    uint64_t data = 0;
    struct timespec sleep_time;

#if defined(USE_ASE)
    sleep_time.tv_sec = 1;
    sleep_time.tv_nsec = 0;
#else
    sleep_time.tv_sec = 0;
    sleep_time.tv_nsec = 1000000;
#endif

    // to milliseconds
    long long sleep_time_ms = (sleep_time.tv_sec * 1000) + (sleep_time.tv_nsec / 1000000);

    for (;;) {
        CHECK_RES(fpgaReadMMIO64(device->fpga, 0, MMIO_CSR_STATUS, &data));
        if (0 == data || 0 == timeout) {
            if (data != 0) {
                fprintf(stdout, "ready-wait timed out: status=%llu\n", (unsigned long long)data);
            }
            break;
        }
        nanosleep(&sleep_time, nullptr);
        if (timeout > 0) {
            // Clamp at zero: a timeout that is not a multiple of the poll
            // interval would otherwise step over 0 and never expire.
            timeout = (timeout > sleep_time_ms) ? (timeout - sleep_time_ms) : 0;
        }
    }

    return 0;
}
// Copies `size` bytes (rounded up to whole cache lines) from the shared buffer
// at `src_offset` to device memory at `dev_maddr`, and waits for completion.
// Returns 0 on success, -1 on invalid arguments, misalignment, a failed bound
// check or a failed MMIO access.
// NOTE(review): the device memory size comes from an int-returning capability
// call - confirm that the resulting bound is the intended one.
extern int vx_copy_to_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, size_t src_offset) {
    if (nullptr == hbuffer || 0 == size)
        return -1;

    vx_buffer_t* const buf = (vx_buffer_t*)hbuffer;
    vx_device_t* const dev = (vx_device_t*)buf->hdevice;

    const int line_size = vx_dev_caps(VX_CAPS_CACHE_LINESIZE);
    const size_t dev_mem_size = vx_dev_caps(VX_CAPS_LOCAL_MEM_SIZE);
    const size_t asize = align_size(size, line_size);
    const uint64_t host_addr = buf->io_addr + src_offset;

    // check alignment of both ends of the transfer
    if (!is_aligned(dev_maddr, line_size) || !is_aligned(host_addr, line_size))
        return -1;

    // bound checking against the buffer and the device memory
    if (src_offset + asize > buf->size || dev_maddr + asize > dev_mem_size)
        return -1;

    // Ensure ready for new command
    if (vx_ready_wait(buf->hdevice, -1) != 0)
        return -1;

    // addresses and sizes are programmed in units of cache lines
    const int ls_shift = (int)std::log2(line_size);

    CHECK_RES(fpgaWriteMMIO64(dev->fpga, 0, MMIO_CSR_IO_ADDR, host_addr >> ls_shift));
    CHECK_RES(fpgaWriteMMIO64(dev->fpga, 0, MMIO_CSR_MEM_ADDR, dev_maddr >> ls_shift));
    CHECK_RES(fpgaWriteMMIO64(dev->fpga, 0, MMIO_CSR_DATA_SIZE, asize >> ls_shift));
    CHECK_RES(fpgaWriteMMIO64(dev->fpga, 0, MMIO_CSR_CMD, CMD_TYPE_WRITE));

    // Wait for the write operation to finish
    return (vx_ready_wait(buf->hdevice, -1) != 0) ? -1 : 0;
}
// Copies `size` bytes (rounded up to whole cache lines) from device memory at
// `dev_maddr` into the shared buffer at `dest_offset`, and waits for completion.
// Returns 0 on success, -1 on invalid arguments, misalignment, a failed bound
// check or a failed MMIO access.
// NOTE(review): the device memory size comes from an int-returning capability
// call - confirm that the resulting bound is the intended one.
extern int vx_copy_from_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, size_t dest_offset) {
    if (nullptr == hbuffer || 0 == size)
        return -1;

    vx_buffer_t* const buf = (vx_buffer_t*)hbuffer;
    vx_device_t* const dev = (vx_device_t*)buf->hdevice;

    const int line_size = vx_dev_caps(VX_CAPS_CACHE_LINESIZE);
    const size_t dev_mem_size = vx_dev_caps(VX_CAPS_LOCAL_MEM_SIZE);
    const size_t asize = align_size(size, line_size);
    const uint64_t host_addr = buf->io_addr + dest_offset;

    // check alignment of both ends of the transfer
    if (!is_aligned(dev_maddr, line_size) || !is_aligned(host_addr, line_size))
        return -1;

    // bound checking against the buffer and the device memory
    if (dest_offset + asize > buf->size || dev_maddr + asize > dev_mem_size)
        return -1;

    // Ensure ready for new command
    if (vx_ready_wait(buf->hdevice, -1) != 0)
        return -1;

    // addresses and sizes are programmed in units of cache lines
    const int ls_shift = (int)std::log2(line_size);

    CHECK_RES(fpgaWriteMMIO64(dev->fpga, 0, MMIO_CSR_IO_ADDR, host_addr >> ls_shift));
    CHECK_RES(fpgaWriteMMIO64(dev->fpga, 0, MMIO_CSR_MEM_ADDR, dev_maddr >> ls_shift));
    CHECK_RES(fpgaWriteMMIO64(dev->fpga, 0, MMIO_CSR_DATA_SIZE, asize >> ls_shift));
    CHECK_RES(fpgaWriteMMIO64(dev->fpga, 0, MMIO_CSR_CMD, CMD_TYPE_READ));

    // Wait for the read operation to finish
    return (vx_ready_wait(buf->hdevice, -1) != 0) ? -1 : 0;
}
// Issues a cache-flush command for `size` bytes (rounded up to whole cache
// lines) of device memory starting at `dev_maddr`, and waits for completion.
// Returns 0 on success, -1 on invalid arguments, misalignment or a failed
// MMIO access.
extern int vx_flush_caches(vx_device_h hdevice, size_t dev_maddr, size_t size) {
    if (nullptr == hdevice || 0 == size)
        return -1;

    vx_device_t* const dev = (vx_device_t*)hdevice;

    const int line_size = vx_dev_caps(VX_CAPS_CACHE_LINESIZE);
    const size_t asize = align_size(size, line_size);

    // check alignment
    if (!is_aligned(dev_maddr, line_size))
        return -1;

    // Ensure ready for new command
    if (vx_ready_wait(hdevice, -1) != 0)
        return -1;

    // address and size are programmed in units of cache lines
    const int ls_shift = (int)std::log2(line_size);

    CHECK_RES(fpgaWriteMMIO64(dev->fpga, 0, MMIO_CSR_MEM_ADDR, dev_maddr >> ls_shift));
    CHECK_RES(fpgaWriteMMIO64(dev->fpga, 0, MMIO_CSR_DATA_SIZE, asize >> ls_shift));
    CHECK_RES(fpgaWriteMMIO64(dev->fpga, 0, MMIO_CSR_CMD, CMD_TYPE_CLFLUSH));

    // Wait for the flush operation to finish
    return (vx_ready_wait(hdevice, -1) != 0) ? -1 : 0;
}
// Waits until the device accepts a new command, then issues the run command.
// Does not wait for the run to complete.
// Returns 0 on success, -1 for a null handle or a failed MMIO access.
extern int vx_start(vx_device_h hdevice) {
    vx_device_t* const dev = (vx_device_t*)hdevice;
    if (nullptr == dev)
        return -1;

    // Ensure ready for new command
    if (vx_ready_wait(hdevice, -1) != 0)
        return -1;

    // start execution
    CHECK_RES(fpgaWriteMMIO64(dev->fpga, 0, MMIO_CSR_CMD, CMD_TYPE_RUN));

    return 0;
}

2
driver/rtlsim/.gitignore vendored Normal file
View File

@@ -0,0 +1,2 @@
obj_dir
*.so

76
driver/rtlsim/Makefile Normal file
View File

@@ -0,0 +1,76 @@
# Builds libvortex.so on top of a Verilator model of the RTL.
#CFLAGS += -std=c++11 -O2 -DNDEBUG -Wall -Wextra -Wfatal-errors
CFLAGS += -std=c++11 -g -O0 -Wall -Wextra -Wfatal-errors

CFLAGS += -I../../include -I../../../hw/simulate -I../../../hw

# control RTL debug print states
DBG_PRINT_FLAGS = -DDBG_PRINT_CORE_ICACHE \
                  -DDBG_PRINT_CORE_DCACHE \
                  -DDBG_PRINT_CACHE_BANK \
                  -DDBG_PRINT_CACHE_SNP \
                  -DDBG_PRINT_CACHE_MSRQ \
                  -DDBG_PRINT_DRAM \
                  -DDBG_PRINT_PIPELINE \
                  -DDBG_PRINT_OPAE

#DBG_PRINT=$(DBG_PRINT_FLAGS)

#CONFIGS += -DNUM_CLUSTERS=2 -DNUM_CORES=4
#CONFIGS += -DNUM_CLUSTERS=1 -DNUM_CORES=4
#CONFIGS += -DNUM_CLUSTERS=1 -DNUM_CORES=2

#DEBUG=1
#AFU=1

CFLAGS += -fPIC

CFLAGS += -DUSE_RTLSIM $(CONFIGS)

LDFLAGS += -shared -pthread
# LDFLAGS += -dynamiclib -pthread

TOP = Vortex

SRCS = vortex.cpp ../common/vx_utils.cpp ../../hw/simulate/simulator.cpp

RTL_INCLUDE = -I../../hw/rtl -I../../hw/rtl/libs -I../../hw/rtl/interfaces -I../../hw/rtl/pipe_regs -I../../hw/rtl/cache

VL_FLAGS += --language 1800-2009 --assert -Wall -Wpedantic $(CONFIGS)
VL_FLAGS += -Wno-DECLFILENAME
VL_FLAGS += --x-initial unique
VL_FLAGS += --x-assign unique

# Enable Verilator multithreaded simulation
#THREADS ?= $(shell python3 -c 'import multiprocessing as mp; print(max(1, mp.cpu_count() // 2))')
#VL_FLAGS += --threads $(THREADS)

# Debugging
ifdef DEBUG
    VL_FLAGS += --trace -DVCD_OUTPUT $(DBG_PRINT)
    CFLAGS += -DVCD_OUTPUT $(DBG_PRINT)
    #VL_FLAGS += -DDBG_CORE_REQ_INFO
    #CFLAGS += -DDBG_CORE_REQ_INFO
else
    CFLAGS += -DNDEBUG
    VL_FLAGS += -DNDEBUG
endif

# AFU
ifdef AFU
    TOP = vortex_afu_sim
    VL_FLAGS += -DNOPAE -DSCOPE
    CFLAGS += -DNOPAE -DSCOPE
    RTL_INCLUDE += -I../../hw/opae -I../../hw/opae/ccip
endif

PROJECT = libvortex.so
# PROJECT = libvortex.dylib

all: $(PROJECT)

# Use $(MAKE) for the recursive build so that command-line flags and variable
# overrides of the parent make are propagated to the generated makefile.
$(PROJECT): $(SRCS)
	verilator --exe --cc $(TOP) $(RTL_INCLUDE) $(VL_FLAGS) $(SRCS) -CFLAGS '$(CFLAGS)' -LDFLAGS '$(LDFLAGS)' -o ../$(PROJECT)
	$(MAKE) -j -C obj_dir -f V$(TOP).mk

clean:
	rm -rf $(PROJECT) obj_dir

# 'all' and 'clean' never produce files with those names
.PHONY: all clean

64
driver/rtlsim/ram.h Normal file
View File

@@ -0,0 +1,64 @@
#pragma once
#include <stdio.h>
#include <stdint.h>
// Sparse model of a 4 GB byte-addressable memory.
// The address space is split into 4096 blocks of 1 MB; a block is allocated
// (zero-filled) the first time any address inside it is touched, including by
// a read.
// NOTE(review): the class owns raw block pointers and defines no copy
// operations, so copying a RAM object would free blocks twice - confirm no
// caller copies it.
class RAM {
private:

    enum : uint32_t {
        BLOCK_SHIFT = 20,                 // log2 of the block size
        BLOCK_SIZE  = 1u << BLOCK_SHIFT,  // bytes per block (1 MB)
        NUM_BLOCKS  = 1u << 12            // blocks covering the 32-bit space
    };

    // mutable: const accessors allocate blocks on demand
    mutable uint8_t *mem_[NUM_BLOCKS];

    // Returns the host address backing `address`, allocating its block if needed.
    uint8_t *get(uint32_t address) const {
        const uint32_t block_addr   = address >> BLOCK_SHIFT;
        const uint32_t block_offset = address & (BLOCK_SIZE - 1);
        if (mem_[block_addr] == nullptr) {
            // value-initialized so never-written memory reads back as zero
            // instead of indeterminate bytes
            mem_[block_addr] = new uint8_t[BLOCK_SIZE]();
        }
        return mem_[block_addr] + block_offset;
    }

public:

    RAM() {
        for (uint32_t i = 0; i < NUM_BLOCKS; i++) {
            mem_[i] = nullptr;
        }
    }

    ~RAM() {
        this->clear();
    }

    // Size of the modelled address space in bytes.
    size_t size() const {
        return (1ull << 32);
    }

    // Releases every allocated block; the content of the memory is lost.
    void clear() {
        for (uint32_t i = 0; i < NUM_BLOCKS; i++) {
            if (mem_[i]) {
                // blocks come from new[], so they must be released with delete[]
                delete[] mem_[i];
                mem_[i] = nullptr;
            }
        }
    }

    // Copies `length` bytes starting at `address` into `data`.
    void read(uint32_t address, uint32_t length, uint8_t *data) const {
        for (unsigned i = 0; i < length; i++) {
            data[i] = *this->get(address + i);
        }
    }

    // Copies `length` bytes from `data` into memory starting at `address`.
    void write(uint32_t address, uint32_t length, const uint8_t *data) {
        for (unsigned i = 0; i < length; i++) {
            *this->get(address + i) = data[i];
        }
    }

    // Byte access; touching an address allocates its block.
    uint8_t& operator[](uint32_t address) {
        return *get(address);
    }

    const uint8_t& operator[](uint32_t address) const {
        return *get(address);
    }
};

View File

@@ -0,0 +1,70 @@
#include "simulator.h"
#include <iostream>
#include <fstream>
#include <iomanip>
// Global simulation time, advanced by one on every Simulator::eval().
uint64_t timestamp = 0;

// Returns the current simulation time as a double.
double sc_time_stamp() {
    return static_cast<double>(timestamp);
}
// Creates the Verilator model and, when VCD_OUTPUT is defined, opens
// "trace.vcd" for waveform dumping.
Simulator::Simulator() {
    // force random values for unitialized signals
    const char* verilator_args[] = {"", "+verilator+rand+reset+2", "+verilator+seed+50"};
    const int num_args = sizeof(verilator_args) / sizeof(verilator_args[0]);
    Verilated::commandArgs(num_args, verilator_args);

    vortex_ = new Vvortex_afu_sim();

#ifdef VCD_OUTPUT
    Verilated::traceEverOn(true);
    trace_ = new VerilatedVcdC;
    vortex_->trace(trace_, 99);
    trace_->open("trace.vcd");
#endif
}
// Closes the waveform dump (when enabled) and destroys the objects allocated
// by the constructor.
Simulator::~Simulator() {
#ifdef VCD_OUTPUT
    trace_->close();
    // the trace object is allocated with new in the constructor; the original
    // destructor closed it but never released it
    delete trace_;
#endif
    delete vortex_;
}
void Simulator::reset() {
#ifndef NDEBUG
std::cout << timestamp << ": [sim] reset()" << std::endl;
#endif
vortex_->reset = 1;
this->step();
vortex_->reset = 0;
dram_rsp_vec_.clear();
}
void Simulator::step() {
vortex_->clk = 0;
this->eval();
vortex_->clk = 1;
this->eval();
avs_driver();
ccip_driver();
}
// Evaluates the model once, dumps the waveform sample when VCD_OUTPUT is
// defined, and advances the global timestamp by one.
void Simulator::eval() {
    vortex_->eval();
#ifdef VCD_OUTPUT
    trace_->dump(timestamp);
#endif
    timestamp += 1;
}
// Placeholder called at the end of every step(); currently does nothing.
void Simulator::avs_driver() {
//--
}
// Placeholder called at the end of every step(); currently does nothing.
void Simulator::ccip_driver() {
//--
}

59
driver/rtlsim/simulator.h Normal file
View File

@@ -0,0 +1,59 @@
#pragma once
#include "Vvortex_afu_sim.h"
#include "Vvortex_afu_sim__Syms.h"
#include "verilated.h"
#ifdef VCD_OUTPUT
#include <verilated_vcd_c.h>
#endif
#include <VX_config.h>
#include "ram.h"
#include <ostream>
#include <vector>
// DRAM model tuning knobs. They are not referenced by the code visible with
// this header; the meanings below are inferred from the names - TODO confirm.
// presumably enables artificial DRAM stalls
#define ENABLE_DRAM_STALLS
// presumably the response latency, in cycles
#define DRAM_LATENCY 100
// presumably the request queue capacity
#define DRAM_RQ_SIZE 16
// presumably the stall period
#define DRAM_STALLS_MODULO 16
// One DRAM request/response tracked by the simulator (see Simulator::dram_rsp_vec_).
typedef struct {
int cycles_left;
uint8_t *data;
unsigned tag;
} dram_req_t;
// Wraps the Verilator model of the AFU (Vvortex_afu_sim) and drives its clock.
class Simulator {
public:
Simulator();
virtual ~Simulator();
// pulse the model's reset input for one clock cycle
void reset();
// advance the model by one clock cycle
void step();
// MMIO accessors; declared here, their definitions are not visible with this header
int mmio_read(uint64_t addr, uint64_t* value);
int mmio_write(uint64_t addr, uint64_t value);
private:
// evaluate the model once and advance the simulation timestamp
void eval();
// bus drivers invoked at the end of every step()
void avs_driver();
void ccip_driver();
// DRAM response queue; cleared on reset()
std::vector<dram_req_t> dram_rsp_vec_;
RAM ram_;
// Verilator model, allocated in the constructor
Vvortex_afu_sim *vortex_;
#ifdef VCD_OUTPUT
// waveform writer, only present when VCD_OUTPUT is defined
VerilatedVcdC *trace_;
#endif
};

281
driver/rtlsim/vortex.cpp Normal file
View File

@@ -0,0 +1,281 @@
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <iostream>
#include <future>
#include <chrono>
#include <vortex.h>
#include <ram.h>
#include <simulator.h>
///////////////////////////////////////////////////////////////////////////////
// Rounds `size` up to the next multiple of the device cache line size.
static size_t align_size(size_t size) {
    const uint32_t line_size = vx_dev_caps(VX_CAPS_CACHE_LINESIZE);
    const size_t num_lines = (size + line_size - 1) / line_size;
    return line_size * num_lines;
}
///////////////////////////////////////////////////////////////////////////////
class vx_device;
class vx_buffer {
public:
vx_buffer(size_t size, vx_device* device)
: size_(size)
, device_(device) {
auto aligned_asize = align_size(size);
data_ = malloc(aligned_asize);
}
~vx_buffer() {
if (data_) {
free(data_);
}
}
void* data() const {
return data_;
}
size_t size() const {
return size_;
}
vx_device* device() const {
return device_;
}
private:
size_t size_;
vx_device* device_;
void* data_;
};
///////////////////////////////////////////////////////////////////////////////
// Simulated device: owns the RAM model and the RTL simulator and runs kernels
// on a background task (future_).
class vx_device {
public:
vx_device() {
mem_allocation_ = vx_dev_caps(VX_CAPS_ALLOC_BASE_ADDR);
simulator_.attach_ram(&ram_);
}
~vx_device() {
// do not destroy the simulator while the background run is still using it
if (future_.valid()) {
future_.wait();
}
}
// Reserves `size` bytes (rounded up to the cache line size) of device memory
// and returns the start address through *dev_maddr. Bump allocation; nothing
// is released. Returns -1 when the request does not fit.
// NOTE(review): dev_mem_size is an int here - confirm the comparison below
// behaves as intended for the 0xffffffff capability value.
int alloc_local_mem(size_t size, size_t* dev_maddr) {
size_t asize = align_size(size);
auto dev_mem_size = vx_dev_caps(VX_CAPS_LOCAL_MEM_SIZE);
if (mem_allocation_ + asize > dev_mem_size)
return -1;
*dev_maddr = mem_allocation_;
mem_allocation_ += asize;
return 0;
}
// Copies `size` bytes, rounded up to whole cache lines, from src + src_offset
// into simulated RAM at dest_addr. Returns -1 when the range exceeds the RAM size.
int upload(void* src, size_t dest_addr, size_t size, size_t src_offset) {
size_t asize = align_size(size);
if (dest_addr + asize > ram_.size())
return -1;
/*printf("VXDRV: upload %d bytes to 0x%x\n", size, dest_addr);
for (int i = 0; i < size; i += 4) {
printf("mem-write: 0x%x <- 0x%x\n", uint32_t(dest_addr + i), *(uint32_t*)((uint8_t*)src + src_offset + i));
}*/
ram_.write(dest_addr, asize, (uint8_t*)src + src_offset);
return 0;
}
// Copies `size` bytes, rounded up to whole cache lines, from simulated RAM at
// src_addr to dest + dest_offset. Returns -1 when the range exceeds the RAM size.
// NOTE: dest is declared const but is written through a cast.
int download(const void* dest, size_t src_addr, size_t size, size_t dest_offset) {
size_t asize = align_size(size);
if (src_addr + asize > ram_.size())
return -1;
ram_.read(src_addr, asize, (uint8_t*)dest + dest_offset);
/*printf("VXDRV: download %d bytes from 0x%x\n", size, src_addr);
for (int i = 0; i < size; i += 4) {
printf("mem-read: 0x%x -> 0x%x\n", uint32_t(src_addr + i), *(uint32_t*)((uint8_t*)dest + dest_offset + i));
}*/
return 0;
}
// Resets the simulator and steps it on a background task until it is no
// longer busy. Returns immediately; use wait() to synchronize.
int start() {
if (future_.valid()) {
future_.wait(); // ensure prior run completed
}
future_ = std::async(std::launch::async, [&]{
simulator_.reset();
while (simulator_.is_busy()) {
simulator_.step();
}
});
return 0;
}
// Waits for the background run in one-second slices.
// `timeout` is in milliseconds and is truncated to whole seconds; a negative
// timeout never reaches zero, so the wait then lasts until the run completes.
// Always returns 0, including when the timeout elapsed.
int wait(long long timeout) {
if (!future_.valid())
return 0;
auto timeout_sec = (timeout < 0) ? timeout : (timeout / 1000);
std::chrono::seconds wait_time(1);
for (;;) {
auto status = future_.wait_for(wait_time); // wait for 1 sec and check status
if (status == std::future_status::ready
|| 0 == timeout_sec--)
break;
}
return 0;
}
// Waits for any background run, then asks the simulator to flush its caches
// for the given range and steps it synchronously until it is idle again.
int flush_caches(size_t dev_maddr, size_t size) {
if (future_.valid()) {
future_.wait(); // ensure prior run completed
}
simulator_.flush_caches(dev_maddr, size);
while (simulator_.is_busy()) {
simulator_.step();
};
return 0;
}
private:
// next device address handed out by alloc_local_mem
size_t mem_allocation_;
RAM ram_;
Simulator simulator_;
// background simulation run started by start()
std::future<void> future_;
};
///////////////////////////////////////////////////////////////////////////////
// Creates a simulated device and returns it through *hdevice.
// Returns -1 when hdevice is null, 0 otherwise.
extern int vx_dev_open(vx_device_h* hdevice) {
    if (hdevice == nullptr)
        return -1;

    vx_device* const device = new vx_device();
    *hdevice = device;

    return 0;
}
// Destroys a simulated device. Returns -1 for a null handle, 0 otherwise.
extern int vx_dev_close(vx_device_h hdevice) {
    if (hdevice == nullptr)
        return -1;

    delete (vx_device*)hdevice;

    return 0;
}
// Validates the arguments and forwards to vx_device::alloc_local_mem.
// Returns -1 on a null handle, null output pointer or zero size.
extern int vx_alloc_dev_mem(vx_device_h hdevice, size_t size, size_t* dev_maddr) {
    const bool valid = (hdevice != nullptr) && (dev_maddr != nullptr) && (size != 0);
    if (!valid)
        return -1;

    vx_device* const device = (vx_device*)hdevice;
    return device->alloc_local_mem(size, dev_maddr);
}
// Validates the arguments and forwards to vx_device::flush_caches.
// Returns -1 on a null handle or zero size.
extern int vx_flush_caches(vx_device_h hdevice, size_t dev_maddr, size_t size) {
    const bool valid = (hdevice != nullptr) && (size != 0);
    if (!valid)
        return -1;

    vx_device* const device = (vx_device*)hdevice;
    return device->flush_caches(dev_maddr, size);
}
// Creates a host staging buffer bound to the device and returns it through
// *hbuffer. Returns -1 on invalid arguments or when the storage could not be
// allocated, 0 otherwise.
extern int vx_alloc_shared_mem(vx_device_h hdevice, size_t size, vx_buffer_h* hbuffer) {
    const bool valid = (hdevice != nullptr) && (size != 0) && (hbuffer != nullptr);
    if (!valid)
        return -1;

    vx_buffer* const created = new vx_buffer(size, (vx_device*)hdevice);
    if (created->data() == nullptr) {
        // storage allocation failed inside the buffer
        delete created;
        return -1;
    }

    *hbuffer = created;
    return 0;
}
// Returns the host address of the buffer's storage, or nullptr for a null handle.
extern volatile void* vx_host_ptr(vx_buffer_h hbuffer) {
    return (hbuffer != nullptr) ? static_cast<vx_buffer*>(hbuffer)->data()
                                : nullptr;
}
// Destroys the buffer behind the handle.
// Returns 0 on success, -1 when the handle is null.
extern int vx_buf_release(vx_buffer_h hbuffer) {
    if (hbuffer == nullptr) {
        return -1;
    }
    delete static_cast<vx_buffer*>(hbuffer);
    return 0;
}
// Copies `size` bytes, starting `src_offset` bytes into the host buffer, to
// device address `dev_maddr`.
// Returns -1 for a null handle, a zero size, or a range that does not fit in
// the buffer; otherwise forwards the result of vx_device::upload().
extern int vx_copy_to_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, size_t src_offset) {
    if (nullptr == hbuffer || 0 == size)
        return -1;
    auto buffer = static_cast<vx_buffer*>(hbuffer);
    const size_t capacity = buffer->size();
    // Written as two comparisons so that a huge size/offset cannot wrap the
    // sum around size_t and slip past the bounds check.
    if (src_offset > capacity || size > capacity - src_offset)
        return -1;
    return buffer->device()->upload(buffer->data(), dev_maddr, size, src_offset);
}
// Copies `size` bytes from device address `dev_maddr` into the host buffer,
// starting `dest_offset` bytes in.
// Returns -1 for a null handle, a zero size, or a range that does not fit in
// the buffer; otherwise forwards the result of vx_device::download().
extern int vx_copy_from_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, size_t dest_offset) {
    if (nullptr == hbuffer || 0 == size)
        return -1;
    auto buffer = static_cast<vx_buffer*>(hbuffer);
    const size_t capacity = buffer->size();
    // Written as two comparisons so that a huge size/offset cannot wrap the
    // sum around size_t and slip past the bounds check.
    if (dest_offset > capacity || size > capacity - dest_offset)
        return -1;
    return buffer->device()->download(buffer->data(), dev_maddr, size, dest_offset);
}
// Starts execution on the device.
// Returns -1 for a null handle; otherwise forwards the result of vx_device::start().
extern int vx_start(vx_device_h hdevice) {
    if (hdevice == nullptr) {
        return -1;
    }
    return static_cast<vx_device*>(hdevice)->start();
}
// Waits for the device to finish, passing `timeout` through unchanged.
// Returns -1 for a null handle; otherwise forwards the result of vx_device::wait().
extern int vx_ready_wait(vx_device_h hdevice, long long timeout) {
    if (hdevice == nullptr) {
        return -1;
    }
    return static_cast<vx_device*>(hdevice)->wait(timeout);
}

2
driver/simx/.gitignore vendored Normal file
View File

@@ -0,0 +1,2 @@
obj_dir
libvortex.so

32
driver/simx/Makefile Normal file
View File

@@ -0,0 +1,32 @@
# Builds libvortex.so for the simX backend: verilator generates the build in
# obj_dir, which is then compiled by a recursive make.

CFLAGS += -std=c++11 -O3 -Wall -Wextra -pedantic -Wfatal-errors
#CFLAGS += -std=c++11 -g -O0 -Wall -Wextra -pedantic -Wfatal-errors

CFLAGS += -I../../include -I../../../simX/include -I../../../hw

CFLAGS += -fPIC

CFLAGS += -DUSE_SIMX

LDFLAGS += -shared -pthread

SRCS = vortex.cpp ../common/vx_utils.cpp ../../simX/args.cpp ../../simX/mem.cpp ../../simX/core.cpp ../../simX/instruction.cpp ../../simX/enc.cpp ../../simX/util.cpp

RTL_TOP = ../../simX/cache_simX.v

RTL_INCLUDE = -I../../hw/old_rtl -I../../hw/old_rtl/interfaces -I../../hw/old_rtl/cache -I../../hw/old_rtl/shared_memory

# Default the verilated model to half of the host's CPUs (at least one).
THREADS ?= $(shell python3 -c 'import multiprocessing as mp; print(max(1, mp.cpu_count() // 2))')
VL_FLAGS += --threads $(THREADS)

VL_FLAGS += -Wno-UNOPTFLAT -Wno-WIDTH

PROJECT = libvortex.so

all: $(PROJECT)

$(PROJECT): $(SRCS)
	verilator --exe --cc $(RTL_TOP) $(RTL_INCLUDE) $(VL_FLAGS) $(SRCS) -CFLAGS '$(CFLAGS)' -LDFLAGS '$(LDFLAGS)' -o ../$(PROJECT)
	# $(MAKE) rather than a literal `make` so the sub-make uses the same make
	# program and inherits the parent's command-line flags and variables.
	$(MAKE) -j -C obj_dir -f Vcache_simX.mk

clean:
	rm -rf $(PROJECT) obj_dir

# These targets do not produce files of the same name.
.PHONY: all clean

318
driver/simx/vortex.cpp Normal file
View File

@@ -0,0 +1,318 @@
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <iostream>
#include <thread>
#include <mutex>
#include <chrono>
#include <vortex.h>
#include <core.h>
#include <VX_config.h>
#define PAGE_SIZE 4096
///////////////////////////////////////////////////////////////////////////////
// Rounds `size` up to the next multiple of the device cache-line size
// reported by vx_dev_caps(VX_CAPS_CACHE_LINESIZE).
static size_t align_size(size_t size) {
    const uint32_t line_bytes = vx_dev_caps(VX_CAPS_CACHE_LINESIZE);
    const size_t line_count = (size + line_bytes - 1) / line_bytes;
    return line_bytes * line_count;
}
///////////////////////////////////////////////////////////////////////////////
class vx_device;
class vx_buffer {
public:
vx_buffer(size_t size, vx_device* device)
: size_(size)
, device_(device) {
auto aligned_asize = align_size(size);
data_ = malloc(aligned_asize);
}
~vx_buffer() {
if (data_) {
free(data_);
}
}
void* data() const {
return data_;
}
size_t size() const {
return size_;
}
vx_device* device() const {
return device_;
}
private:
size_t size_;
vx_device* device_;
void* data_;
};
///////////////////////////////////////////////////////////////////////////////
// simX-backed device: owns the emulated RAM and a worker thread that runs the
// core emulator each time start() is called.
class vx_device {
public:
    // Sets up the allocation cursor and launches the worker thread.
    // thread_ is declared LAST among the data members so that ram_ and mutex_
    // are fully constructed before the worker can touch them (members are
    // initialized in declaration order).
    vx_device()
        : is_done_(false)
        , is_running_(false)
        , mem_allocation_(vx_dev_caps(VX_CAPS_ALLOC_BASE_ADDR))
        , thread_(thread_entry, this) {
    }

    // Requests worker shutdown and waits for it; blocks until any run in
    // progress has finished.
    ~vx_device() {
        {
            std::lock_guard<std::mutex> guard(mutex_);
            is_done_ = true;
        }
        thread_.join();
    }

    // Bump-allocates `size` bytes (rounded up by align_size) of device memory
    // and stores the start address in *dev_maddr.
    // Returns 0 on success, -1 when the request does not fit.
    int alloc_local_mem(size_t size, size_t* dev_maddr) {
        const size_t asize = align_size(size);
        // vx_dev_caps() returns int, so a 0xffffffff capability comes back as
        // -1; reinterpret it as 32-bit unsigned so the limit does not become
        // SIZE_MAX on 64-bit hosts.
        const size_t dev_mem_size = static_cast<uint32_t>(vx_dev_caps(VX_CAPS_LOCAL_MEM_SIZE));
        // Overflow-safe form of: mem_allocation_ + asize > dev_mem_size
        if (asize > dev_mem_size || mem_allocation_ > dev_mem_size - asize)
            return -1;
        *dev_maddr = mem_allocation_;
        mem_allocation_ += asize;
        return 0;
    }

    // Writes align_size(size) bytes from src + src_offset into RAM at dest_addr.
    // Returns -1 when the destination range exceeds the RAM size.
    // NOTE(review): the rounded-up length is read from the source, which can
    // run past the caller's buffer when src_offset != 0 - confirm callers.
    int upload(void* src, size_t dest_addr, size_t size, size_t src_offset) {
        auto asize = align_size(size);
        if (dest_addr + asize > ram_.size())
            return -1;

        /*printf("VXDRV: upload %d bytes to 0x%x\n", size, dest_addr);
        for (int i = 0; i < size; i += 4) {
            printf("mem-write: 0x%x <- 0x%x\n", dest_addr + i, *(uint32_t*)((uint8_t*)src + src_offset + i));
        }*/

        ram_.write(dest_addr, asize, (uint8_t*)src + src_offset);
        return 0;
    }

    // Reads align_size(size) bytes from RAM at src_addr into dest + dest_offset.
    // Returns -1 when the source range exceeds the RAM size.
    int download(const void* dest, size_t src_addr, size_t size, size_t dest_offset) {
        size_t asize = align_size(size);
        if (src_addr + asize > ram_.size())
            return -1;

        ram_.read(src_addr, asize, (uint8_t*)dest + dest_offset);

        /*printf("VXDRV: download %d bytes from 0x%x\n", size, src_addr);
        for (int i = 0; i < size; i += 4) {
            printf("mem-read: 0x%x -> 0x%x\n", src_addr + i, *(uint32_t*)((uint8_t*)dest + dest_offset + i));
        }*/

        return 0;
    }

    // Flags the worker thread to begin a run. Always returns 0.
    int start() {
        std::lock_guard<std::mutex> guard(mutex_);
        is_running_ = true;
        return 0;
    }

    // Polls once per second until the run has finished or the timeout
    // (milliseconds, rounded down to seconds) is used up. A negative timeout
    // gives a counter that never reaches zero in practice.
    // NOTE(review): returns 0 on timeout as well as on completion.
    int wait(long long timeout) {
        auto timeout_sec = (timeout < 0) ? timeout : (timeout / 1000);
        for (;;) {
            bool is_running;
            {
                std::lock_guard<std::mutex> guard(mutex_);
                is_running = is_running_;
            }
            if (!is_running || 0 == timeout_sec--)
                break;
            std::this_thread::sleep_for(std::chrono::seconds(1));
        }
        return 0;
    }

private:
    // Builds a fresh core around ram_ and steps it until it stops running.
    void run() {
        Harp::ArchDef arch("rv32i", NUM_WARPS, NUM_THREADS);
        Harp::WordDecoder dec(arch);
        Harp::MemoryUnit mu(PAGE_SIZE, arch.getWordSize(), true);
        Harp::Core core(arch, dec, mu);
        mu.attach(ram_, 0);

        while (core.running()) {
            core.step();
        }
        core.printStats();
    }

    // Worker loop: waits for start(), performs one run, clears the running
    // flag, and exits once the destructor has set is_done_.
    void thread_proc() {
        std::cout << "Device ready..." << std::endl;

        for (;;) {
            bool is_done;
            bool is_running;
            {
                std::lock_guard<std::mutex> guard(mutex_);
                is_done = is_done_;
                is_running = is_running_;
            }

            if (is_done)
                break;

            if (!is_running) {
                // Idle: back off briefly instead of spinning at full speed.
                std::this_thread::sleep_for(std::chrono::milliseconds(1));
                continue;
            }

            std::cout << "Device running..." << std::endl;
            this->run();
            {
                std::lock_guard<std::mutex> guard(mutex_);
                is_running_ = false;
            }
            std::cout << "Device ready..." << std::endl;
        }

        std::cout << "Device shutdown..." << std::endl;
    }

    // std::thread entry point; forwards to the member worker loop.
    static void thread_entry(vx_device* device) {
        device->thread_proc();
    }

    bool is_done_;          // set by the destructor to stop the worker
    bool is_running_;       // set by start(), cleared by the worker after a run
    size_t mem_allocation_; // next free device address for alloc_local_mem()
    Harp::RAM ram_;
    std::mutex mutex_;      // guards is_done_ and is_running_
    std::thread thread_;    // must stay last: starts running during construction
};
///////////////////////////////////////////////////////////////////////////////
// Creates a new device object and returns its handle through *hdevice.
// Returns 0 on success, -1 when hdevice is null.
extern int vx_dev_open(vx_device_h* hdevice) {
    if (hdevice == nullptr) {
        return -1;
    }
    vx_device* created = new vx_device();
    *hdevice = created;
    return 0;
}
// Destroys the device behind the handle.
// Returns 0 on success, -1 when the handle is null.
extern int vx_dev_close(vx_device_h hdevice) {
    if (hdevice == nullptr) {
        return -1;
    }
    delete static_cast<vx_device*>(hdevice);
    return 0;
}
// Reserves `size` bytes of device memory and stores the device address in
// *dev_maddr. Returns -1 for a null handle, null output pointer or zero size;
// otherwise forwards the result of vx_device::alloc_local_mem().
extern int vx_alloc_dev_mem(vx_device_h hdevice, size_t size, size_t* dev_maddr) {
    const bool args_ok = (hdevice != nullptr) && (dev_maddr != nullptr) && (size != 0);
    if (!args_ok) {
        return -1;
    }
    return static_cast<vx_device*>(hdevice)->alloc_local_mem(size, dev_maddr);
}
// Cache flush entry point for the simX backend.
// Arguments are validated (-1 for a null handle or zero size), but no work is
// performed: this functionality is not needed by simX.
extern int vx_flush_caches(vx_device_h hdevice, size_t /*dev_maddr*/, size_t size) {
    const bool bad_args = (hdevice == nullptr) || (size == 0);
    return bad_args ? -1 : 0;
}
// Allocates a host-side staging buffer of `size` bytes bound to the device and
// returns its handle through *hbuffer.
// Returns 0 on success, -1 on invalid arguments or when the buffer storage
// could not be obtained.
extern int vx_alloc_shared_mem(vx_device_h hdevice, size_t size, vx_buffer_h* hbuffer) {
    if (hdevice == nullptr || size == 0 || hbuffer == nullptr) {
        return -1;
    }
    vx_buffer* created = new vx_buffer(size, static_cast<vx_device*>(hdevice));
    if (created->data() != nullptr) {
        *hbuffer = created;
        return 0;
    }
    // The buffer object exists but has no storage: discard it.
    delete created;
    return -1;
}
// Returns the host address of the buffer's storage, or nullptr for a null handle.
extern volatile void* vx_host_ptr(vx_buffer_h hbuffer) {
    return (hbuffer != nullptr) ? static_cast<vx_buffer*>(hbuffer)->data()
                                : nullptr;
}
// Destroys the buffer behind the handle.
// Returns 0 on success, -1 when the handle is null.
extern int vx_buf_release(vx_buffer_h hbuffer) {
    if (hbuffer == nullptr) {
        return -1;
    }
    delete static_cast<vx_buffer*>(hbuffer);
    return 0;
}
// Copies `size` bytes, starting `src_offset` bytes into the host buffer, to
// device address `dev_maddr`.
// Returns -1 for a null handle, a zero size, or a range that does not fit in
// the buffer; otherwise forwards the result of vx_device::upload().
extern int vx_copy_to_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, size_t src_offset) {
    if (nullptr == hbuffer || 0 == size)
        return -1;
    auto buffer = static_cast<vx_buffer*>(hbuffer);
    const size_t capacity = buffer->size();
    // Written as two comparisons so that a huge size/offset cannot wrap the
    // sum around size_t and slip past the bounds check.
    if (src_offset > capacity || size > capacity - src_offset)
        return -1;
    return buffer->device()->upload(buffer->data(), dev_maddr, size, src_offset);
}
// Copies `size` bytes from device address `dev_maddr` into the host buffer,
// starting `dest_offset` bytes in.
// Returns -1 for a null handle, a zero size, or a range that does not fit in
// the buffer; otherwise forwards the result of vx_device::download().
extern int vx_copy_from_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, size_t dest_offset) {
    if (nullptr == hbuffer || 0 == size)
        return -1;
    auto buffer = static_cast<vx_buffer*>(hbuffer);
    const size_t capacity = buffer->size();
    // Written as two comparisons so that a huge size/offset cannot wrap the
    // sum around size_t and slip past the bounds check.
    if (dest_offset > capacity || size > capacity - dest_offset)
        return -1;
    return buffer->device()->download(buffer->data(), dev_maddr, size, dest_offset);
}
// Starts execution on the device.
// Returns -1 for a null handle; otherwise forwards the result of vx_device::start().
extern int vx_start(vx_device_h hdevice) {
    if (hdevice == nullptr) {
        return -1;
    }
    return static_cast<vx_device*>(hdevice)->start();
}
// Waits for the device to finish, passing `timeout` through unchanged.
// Returns -1 for a null handle; otherwise forwards the result of vx_device::wait().
extern int vx_ready_wait(vx_device_h hdevice, long long timeout) {
    if (hdevice == nullptr) {
        return -1;
    }
    return static_cast<vx_device*>(hdevice)->wait(timeout);
}

20
driver/stub/Makefile Normal file
View File

@@ -0,0 +1,20 @@
# Builds the stub libvortex.so from vortex.cpp and the shared utility source.

CXXFLAGS += -std=c++11 -O3 -Wall -Wextra -pedantic -Wfatal-errors
#CXXFLAGS += -std=c++11 -g -O0 -Wall -Wextra -pedantic -Wfatal-errors

CXXFLAGS += -I../include -I../../runtime

CXXFLAGS += -fPIC

LDFLAGS += -shared -pthread

SRCS = vortex.cpp ../common/vx_utils.cpp

PROJECT = libvortex.so

all: $(PROJECT)

$(PROJECT): $(SRCS)
	$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@

clean:
	rm -rf $(PROJECT) obj_dir

# These targets do not produce files of the same name.
.PHONY: all clean

45
driver/stub/vortex.cpp Normal file
View File

@@ -0,0 +1,45 @@
#include <vortex.h>
extern int vx_dev_open(vx_device_h* /*hdevice*/) {
return -1;
}
// Stub: always fails with -1.
extern int vx_dev_close(vx_device_h hdevice) {
    (void)hdevice;
    return -1;
}
extern int vx_alloc_dev_mem(vx_device_h /*hdevice*/, size_t /*size*/, size_t* /*dev_maddr*/) {
return -1;
}
extern int vx_flush_caches(vx_device_h /*hdevice*/, size_t /*dev_maddr*/, size_t /*size*/) {
return -1;
}
extern int vx_alloc_shared_mem(vx_device_h /*hdevice*/, size_t /*size*/, vx_buffer_h* /*hbuffer*/) {
return -1;
}
// Stub: there is never a host mapping, so this always returns nullptr.
extern volatile void* vx_host_ptr(vx_buffer_h hbuffer) {
    (void)hbuffer;
    return nullptr;
}
// Stub: always fails with -1.
extern int vx_buf_release(vx_buffer_h hbuffer) {
    (void)hbuffer;
    return -1;
}
extern int vx_copy_to_dev(vx_buffer_h /*hbuffer*/, size_t /*dev_maddr*/, size_t /*size*/, size_t /*src_offset*/) {
return -1;
}
extern int vx_copy_from_dev(vx_buffer_h /*hbuffer*/, size_t /*dev_maddr*/, size_t /*size*/, size_t /*dest_offset*/) {
return -1;
}
// Stub: always fails with -1.
extern int vx_start(vx_device_h hdevice) {
    (void)hdevice;
    return -1;
}
// Stub: always fails with -1 without waiting.
extern int vx_ready_wait(vx_device_h hdevice, long long timeout) {
    (void)hdevice;
    (void)timeout;
    return -1;
}

View File

@@ -0,0 +1,69 @@
# Builds the `basic` host test and its device kernel (kernel.elf/.bin/.dump).

RISCV_TOOL_PATH ?= $(wildcard ~/dev/riscv-gnu-toolchain/drops)
# RISCV_TOOL_PATH ?= /opt/riscv-new/drops
VX_RT_PATH ?= $(wildcard ../../../runtime)

# RISC-V cross tools used for the device kernel.
VX_CC = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gcc
VX_CXX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-g++
VX_DMP = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objdump
VX_CPY = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objcopy

# Runtime sources compiled together with the kernel.
VX_NEWLIB = $(VX_RT_PATH)/newlib/newlib.c
VX_STR = $(VX_RT_PATH)/startup/vx_start.S
VX_INT = $(VX_RT_PATH)/intrinsics/vx_intrinsics.S
VX_IO = $(VX_RT_PATH)/io/vx_io.S $(VX_RT_PATH)/io/vx_io.c
VX_API = $(VX_RT_PATH)/vx_api/vx_api.c
VX_FIO = $(VX_RT_PATH)/fileio/fileio.S

VX_CFLAGS = -march=rv32im -mabi=ilp32 -O3 -Wl,-Bstatic,-T,$(VX_RT_PATH)/startup/vx_link.ld -ffreestanding -nostartfiles -Wl,--gc-sections
VX_CFLAGS += -I../../../hw

VX_SRCS = kernel.c

# Host-side test program.
CXXFLAGS += -std=c++11 -O0 -g -Wall -Wextra -pedantic -Wfatal-errors

CXXFLAGS += -I../../include

LDFLAGS +=

PROJECT = basic

SRCS = basic.cpp

all: $(PROJECT) kernel.bin kernel.dump

kernel.dump: kernel.elf
	$(VX_DMP) -D kernel.elf > kernel.dump

kernel.bin: kernel.elf
	$(VX_CPY) -O binary kernel.elf kernel.bin

# The kernel image is built from the kernel sources, so it must depend on
# $(VX_SRCS) (not on the host program's $(SRCS)) to be rebuilt when they change.
kernel.elf: $(VX_SRCS)
	$(VX_CC) $(VX_CFLAGS) $(VX_STR) $(VX_FIO) $(VX_NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_SRCS) -I$(VX_RT_PATH) -o kernel.elf

$(PROJECT): $(SRCS)
	$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -L../../stub -lvortex -o $@

# Each run target selects a driver library through LD_LIBRARY_PATH.
run-fpga: $(PROJECT)
	LD_LIBRARY_PATH=../../opae:$(LD_LIBRARY_PATH) ./$(PROJECT) -n 256

run-ase: $(PROJECT)
	ASE_LOG=0 LD_LIBRARY_PATH=../../opae/ase:$(LD_LIBRARY_PATH) ./$(PROJECT) -n 256

run-rtlsim: $(PROJECT)
	LD_LIBRARY_PATH=../../rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) -n 256

run-simx: $(PROJECT)
	LD_LIBRARY_PATH=../../simx:$(LD_LIBRARY_PATH) ./$(PROJECT) -n 256

.depend: $(SRCS)
	$(CXX) $(CXXFLAGS) -MM $^ > .depend;

clean:
	rm -rf $(PROJECT) *.o .depend

clean-all:
	rm -rf $(PROJECT) *.o *.elf *.bin *.dump .depend

# Skip dependency generation for both cleaning goals, not just `clean`.
ifeq ($(filter clean clean-all,$(MAKECMDGOALS)),)
-include .depend
endif

# These targets do not produce files of the same name.
.PHONY: all run-fpga run-ase run-rtlsim run-simx clean clean-all

241
driver/tests/basic/basic.cpp Executable file
View File

@@ -0,0 +1,241 @@
#include <iostream>
#include <unistd.h>
#include <string.h>
#include <vortex.h>
#include "common.h"
// Evaluates _expr once as an int. A result of 0 means success and execution
// continues; any other value prints the failing expression and its result,
// calls cleanup() and terminates the process with exit(-1).
#define RT_CHECK(_expr) \
do { \
int _ret = _expr; \
if (0 == _ret) \
break; \
printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \
cleanup(); \
exit(-1); \
} while (false)
// Kernel image to upload; overridden by -k.
const char* kernel_file = "kernel.bin";
// Test selector set by -t: 0 = memcopy test, 1 = kernel test, -1 = both.
int test = -1;
// Value of -n; main() replaces 0 with 1.
uint32_t count = 0;
// Handles released by cleanup(); null until opened/allocated in main().
vx_device_h device = nullptr;
vx_buffer_h buffer = nullptr;
// Prints the program banner and the command-line synopsis to stdout.
static void show_usage() {
    std::cout << "Vortex Driver Test." << std::endl
              << "Usage: [-t testno][-k: kernel][-n words][-h: help]" << std::endl;
}
// Parses the command line into the global settings:
//   -n <words>  -> count
//   -t <testno> -> test
//   -k <file>   -> kernel_file
//   -h / -?     -> print usage and exit(0)
// Any other value returned by getopt prints usage and exits with -1.
static void parse_args(int argc, char **argv) {
    for (;;) {
        const int opt = getopt(argc, argv, "n:t:k:h?");
        if (opt == -1) {
            break;
        }
        if (opt == 'n') {
            count = atoi(optarg);
        } else if (opt == 't') {
            test = atoi(optarg);
        } else if (opt == 'k') {
            kernel_file = optarg;
        } else if (opt == 'h' || opt == '?') {
            show_usage();
            exit(0);
        } else {
            show_usage();
            exit(-1);
        }
    }
}
// Releases the global shared buffer and then closes the global device, each
// only if its handle is set.
void cleanup() {
    if (buffer != nullptr)
        vx_buf_release(buffer);

    if (device != nullptr)
        vx_dev_close(device);
}
// Produces a per-index test pattern: `value` shifted left by i bits, with the
// low i bits of `value` kept in the vacated positions.
// The shift amount is reduced modulo 64 and the mask is built in 64-bit
// arithmetic, so the result is well defined for every non-negative i
// (shifting by >= the operand width is undefined behaviour, and the memcopy
// test calls this with i well above 63).
uint64_t shuffle(int i, uint64_t value) {
    const unsigned shift = static_cast<unsigned>(i) & 63u;
    const uint64_t low_mask = (uint64_t(1) << shift) - 1;
    return (value << shift) | (value & low_mask);
}
// Round-trips num_blocks 64-byte blocks through device memory at dev_addr:
// fills the shared buffer with shuffle(i, value) words, uploads it, zeroes the
// buffer, downloads it again and compares every 64-bit word.
// Returns 0 when all words match, 1 otherwise.
int run_memcopy_test(uint32_t dev_addr, uint64_t value, int num_blocks) {
    const int num_words = (64 * num_blocks) / 8;
    int mismatches = 0;

    // update source buffer
    {
        auto words = (uint64_t*)vx_host_ptr(buffer);
        for (int w = 0; w < num_words; ++w)
            words[w] = shuffle(w, value);
    }

    // write buffer to local memory
    std::cout << "write buffer to local memory" << std::endl;
    RT_CHECK(vx_copy_to_dev(buffer, dev_addr, 64 * num_blocks, 0));

    // clear destination buffer
    {
        auto words = (uint64_t*)vx_host_ptr(buffer);
        for (int w = 0; w < num_words; ++w)
            words[w] = 0;
    }

    // read buffer from local memory
    std::cout << "read buffer from local memory" << std::endl;
    RT_CHECK(vx_copy_from_dev(buffer, dev_addr, 64 * num_blocks, 0));

    // verify result
    std::cout << "verify result" << std::endl;
    {
        auto words = (uint64_t*)vx_host_ptr(buffer);
        for (int w = 0; w < num_words; ++w) {
            const auto actual = words[w];
            const auto expected = shuffle(w, value);
            if (actual == expected)
                continue;
            std::cout << "error at 0x" << std::hex << (dev_addr + 8 * w)
                      << ": actual 0x" << actual << ", expected 0x" << expected << std::endl;
            ++mismatches;
        }
    }

    if (mismatches == 0)
        return 0;

    std::cout << "Found " << std::dec << mismatches << " errors!" << std::endl;
    std::cout << "FAILED!" << std::endl;
    return 1;
}
// Runs the copy kernel end to end: uploads 0..num_points-1 to the source
// buffer, fills the destination with 0xffffffff, starts the device, waits for
// completion, flushes the caches, downloads the destination and checks that
// element i equals i.
// Returns 0 when every element matches, 1 otherwise.
int run_kernel_test(const kernel_arg_t& kernel_arg,
                    uint32_t buf_size,
                    uint32_t num_points) {
    int errors = 0;

    // update source buffer
    {
        auto buf_ptr = (int32_t*)vx_host_ptr(buffer);
        for (uint32_t i = 0; i < num_points; ++i) {
            buf_ptr[i] = i;
        }
    }
    std::cout << "upload source buffer" << std::endl;
    RT_CHECK(vx_copy_to_dev(buffer, kernel_arg.src_ptr, buf_size, 0));

    // clear destination buffer
    {
        auto buf_ptr = (int32_t*)vx_host_ptr(buffer);
        for (uint32_t i = 0; i < num_points; ++i) {
            buf_ptr[i] = 0xffffffff;
        }
    }
    std::cout << "clear destination buffer" << std::endl;
    RT_CHECK(vx_copy_to_dev(buffer, kernel_arg.dst_ptr, buf_size, 0));

    // start device
    std::cout << "start device" << std::endl;
    RT_CHECK(vx_start(device));

    // wait for completion
    std::cout << "wait for completion" << std::endl;
    RT_CHECK(vx_ready_wait(device, -1));

    // flush the caches
    std::cout << "flush the caches" << std::endl;
    RT_CHECK(vx_flush_caches(device, kernel_arg.dst_ptr, buf_size));

    // read buffer from local memory
    std::cout << "read buffer from local memory" << std::endl;
    RT_CHECK(vx_copy_from_dev(buffer, kernel_arg.dst_ptr, buf_size, 0));

    // verify result
    std::cout << "verify result" << std::endl;
    for (uint32_t i = 0; i < num_points; ++i) {
        int32_t curr = ((int32_t*)vx_host_ptr(buffer))[i];
        int32_t ref = i;
        if (curr != ref) {
            // Select the numeric base explicitly: the "0x" prefix must not
            // depend on whatever base the caller left on std::cout.
            std::cout << "error at value " << std::dec << i
                      << ": actual 0x" << std::hex << curr
                      << ", expected 0x" << ref << std::dec << std::endl;
            ++errors;
        }
    }

    if (errors != 0) {
        std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
        std::cout << "FAILED!" << std::endl;
        return 1;
    }

    return 0;
}
// Test driver entry point: parses options, opens the device, allocates source
// and destination device buffers plus one shared host buffer, then runs the
// memcopy test and/or the kernel test depending on `test`.
// Any failing step aborts through RT_CHECK (cleanup + exit(-1)).
int main(int argc, char *argv[]) {
size_t value;
kernel_arg_t kernel_arg;
// parse command arguments
parse_args(argc, argv);
// default to one word per core when -n was not given (or was 0)
if (count == 0) {
count = 1;
}
uint32_t max_cores = vx_dev_caps(VX_CAPS_MAX_CORES);
uint32_t num_points = max_cores * count;
// round the byte size up to whole 64-byte blocks
uint32_t num_blocks = (num_points * sizeof(uint32_t) + 63) / 64;
uint32_t buf_size = num_blocks * 64;
std::cout << "number of points: " << num_points << std::endl;
std::cout << "buffer size: " << buf_size << " bytes" << std::endl;
// open device connection
std::cout << "open device connection" << std::endl;
RT_CHECK(vx_dev_open(&device));
// allocate device memory
RT_CHECK(vx_alloc_dev_mem(device, buf_size, &value));
kernel_arg.src_ptr = value;
RT_CHECK(vx_alloc_dev_mem(device, buf_size, &value));
kernel_arg.dst_ptr = value;
kernel_arg.count = count;
// note: std::hex stays in effect on std::cout after these two lines
std::cout << "dev_src=" << std::hex << kernel_arg.src_ptr << std::endl;
std::cout << "dev_dst=" << std::hex << kernel_arg.dst_ptr << std::endl;
// allocate shared memory
std::cout << "allocate shared memory" << std::endl;
// the same buffer also stages the kernel argument, so it must hold at least that
uint32_t alloc_size = std::max<uint32_t>(buf_size, sizeof(kernel_arg_t));
RT_CHECK(vx_alloc_shared_mem(device, alloc_size, &buffer));
// run tests
if (0 == test || -1 == test) {
std::cout << "run memcopy test" << std::endl;
RT_CHECK(run_memcopy_test(kernel_arg.src_ptr, 0x0badf00d00ff00ff, 1));
RT_CHECK(run_memcopy_test(kernel_arg.src_ptr, 0x0badf00d40ff40ff, num_blocks));
}
if (1 == test || -1 == test) {
// upload program
std::cout << "upload program" << std::endl;
RT_CHECK(vx_upload_kernel_file(device, kernel_file));
// upload kernel argument
std::cout << "upload kernel argument" << std::endl;
{
auto buf_ptr = (void*)vx_host_ptr(buffer);
memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(buffer, KERNEL_ARG_DEV_MEM_ADDR, sizeof(kernel_arg_t), 0));
}
std::cout << "run kernel test" << std::endl;
RT_CHECK(run_kernel_test(kernel_arg, buf_size, num_points));
}
// cleanup
std::cout << "cleanup" << std::endl;
cleanup();
std::cout << "Test PASSED" << std::endl;
return 0;
}

View File

@@ -0,0 +1,12 @@
#ifndef _COMMON_H_
#define _COMMON_H_

// Included here so the header is self-contained: the struct below uses
// uint32_t and must not rely on the includer having pulled it in first.
#include <stdint.h>

// Device address at which the host places the kernel argument block and from
// which the kernel reads it.
#define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000

// Argument block shared between the host test and the device kernel.
struct kernel_arg_t {
  uint32_t count;   // number of elements each core copies
  uint32_t src_ptr; // device address of the source buffer
  uint32_t dst_ptr; // device address of the destination buffer
};

#endif

Binary file not shown.

View File

@@ -0,0 +1,17 @@
#include <stdint.h>
#include <VX_config.h>
#include "intrinsics/vx_intrinsics.h"
#include "common.h"
// Device kernel: each core copies `count` consecutive 32-bit words from the
// source buffer to the destination buffer, starting at core_id * count.
void main() {
  struct kernel_arg_t* kargs = (struct kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
  uint32_t words = kargs->count;
  int32_t* from = (int32_t*)kargs->src_ptr;
  int32_t* to = (int32_t*)kargs->dst_ptr;

  // first element handled by this core
  uint32_t base = vx_core_id() * words;

  uint32_t k = 0;
  while (k < words) {
    to[base + k] = from[base + k];
    ++k;
  }
}

View File

@@ -0,0 +1,66 @@
# Builds the `demo` host test and its device kernel (kernel.elf/.bin/.dump).

RISCV_TOOL_PATH ?= $(wildcard ~/dev/riscv-gnu-toolchain/drops)
VX_RT_PATH ?= $(wildcard ../../../runtime)

# RISC-V cross tools used for the device kernel.
VX_CC = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gcc
VX_CXX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-g++
VX_DMP = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objdump
VX_CPY = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objcopy

# Runtime sources compiled together with the kernel; the commented-out ones
# expand to nothing in the kernel.elf recipe.
#VX_NEWLIB = $(VX_RT_PATH)/newlib/newlib.c
VX_STR = $(VX_RT_PATH)/startup/vx_start.S
VX_INT = $(VX_RT_PATH)/intrinsics/vx_intrinsics.S
#VX_IO = $(VX_RT_PATH)/io/vx_io.S $(VX_RT_PATH)/io/vx_io.c
VX_API = $(VX_RT_PATH)/vx_api/vx_api.c
#VX_FIO = $(VX_RT_PATH)/fileio/fileio.S

VX_CFLAGS = -march=rv32im -mabi=ilp32 -O3 -Wl,-Bstatic,-T,$(VX_RT_PATH)/startup/vx_link.ld -ffreestanding -nostartfiles -Wl,--gc-sections
VX_CFLAGS += -I../../../hw

VX_SRCS = kernel.c

# Host-side test program.
CXXFLAGS += -std=c++11 -O0 -g -Wall -Wextra -pedantic -Wfatal-errors

CXXFLAGS += -I../../include

PROJECT = demo

SRCS = demo.cpp

all: $(PROJECT) kernel.bin kernel.dump

kernel.dump: kernel.elf
	$(VX_DMP) -D kernel.elf > kernel.dump

kernel.bin: kernel.elf
	$(VX_CPY) -O binary kernel.elf kernel.bin

# The kernel image is built from the kernel sources, so it must depend on
# $(VX_SRCS) (not on the host program's $(SRCS)) to be rebuilt when they change.
kernel.elf: $(VX_SRCS)
	$(VX_CC) $(VX_CFLAGS) $(VX_STR) $(VX_FIO) $(VX_NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_SRCS) -I$(VX_RT_PATH) -o kernel.elf

$(PROJECT): $(SRCS)
	$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -L../../stub -lvortex -o $@

# Each run target selects a driver library through LD_LIBRARY_PATH.
run-fpga: $(PROJECT)
	LD_LIBRARY_PATH=../../opae:$(LD_LIBRARY_PATH) ./$(PROJECT) -n 16

run-ase: $(PROJECT)
	ASE_LOG=0 LD_LIBRARY_PATH=../../opae/ase:$(LD_LIBRARY_PATH) ./$(PROJECT) -n 16

run-rtlsim: $(PROJECT)
	LD_LIBRARY_PATH=../../rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) -n 16

run-simx: $(PROJECT)
	LD_LIBRARY_PATH=../../simx:$(LD_LIBRARY_PATH) ./$(PROJECT) -n 16

.depend: $(SRCS)
	$(CXX) $(CXXFLAGS) -MM $^ > .depend;

clean:
	rm -rf $(PROJECT) *.o .depend

clean-all:
	rm -rf $(PROJECT) *.o *.elf *.bin *.dump .depend

# Skip dependency generation for both cleaning goals, not just `clean`.
ifeq ($(filter clean clean-all,$(MAKECMDGOALS)),)
-include .depend
endif

# These targets do not produce files of the same name.
.PHONY: all run-fpga run-ase run-rtlsim run-simx clean clean-all

View File

@@ -0,0 +1,13 @@
#ifndef _COMMON_H_
#define _COMMON_H_

// Included here so the header is self-contained: the struct below uses
// uint32_t and must not rely on the includer having pulled it in first.
#include <stdint.h>

// Device address at which the host places the kernel argument block and from
// which the kernel reads it.
#define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000

// Argument block shared between the host test and the device kernel.
struct kernel_arg_t {
  uint32_t count;    // number of elements each thread processes
  uint32_t src0_ptr; // device address of the first source buffer
  uint32_t src1_ptr; // device address of the second source buffer
  uint32_t dst_ptr;  // device address of the destination buffer
};

#endif

201
driver/tests/demo/demo.cpp Normal file
View File

@@ -0,0 +1,201 @@
#include <iostream>
#include <unistd.h>
#include <string.h>
#include <vortex.h>
#include "common.h"
// Evaluates _expr once as an int. A result of 0 means success and execution
// continues; any other value prints the failing expression and its result,
// calls cleanup() and terminates the process with exit(-1).
#define RT_CHECK(_expr) \
do { \
int _ret = _expr; \
if (0 == _ret) \
break; \
printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \
cleanup(); \
exit(-1); \
} while (false)
// Kernel image to upload; overridden by -k.
const char* kernel_file = "kernel.bin";
// Value of -n; main() replaces 0 with 1.
uint32_t count = 0;
// Handles released by cleanup(); null until opened/allocated in main().
vx_device_h device = nullptr;
vx_buffer_h buffer = nullptr;
// Prints the program banner and the command-line synopsis to stdout.
static void show_usage() {
    std::cout << "Vortex Driver Test." << std::endl
              << "Usage: [-k: kernel] [-n words] [-h: help]" << std::endl;
}
// Parses the command line into the global settings:
//   -n <words> -> count
//   -k <file>  -> kernel_file
//   -h / -?    -> print usage and exit(0)
// Any other value returned by getopt prints usage and exits with -1.
static void parse_args(int argc, char **argv) {
    for (;;) {
        const int opt = getopt(argc, argv, "n:k:h?");
        if (opt == -1) {
            break;
        }
        if (opt == 'n') {
            count = atoi(optarg);
        } else if (opt == 'k') {
            kernel_file = optarg;
        } else if (opt == 'h' || opt == '?') {
            show_usage();
            exit(0);
        } else {
            show_usage();
            exit(-1);
        }
    }
}
// Releases the global shared buffer and then closes the global device, each
// only if its handle is set.
void cleanup() {
    if (buffer != nullptr)
        vx_buf_release(buffer);

    if (device != nullptr)
        vx_dev_close(device);
}
// Executes the already-uploaded kernel and validates its output: starts the
// device, waits for completion, flushes the caches for the destination range,
// downloads the destination buffer and checks that element i equals i + i.
// Returns 0 when every element matches, 1 otherwise.
int run_test(const kernel_arg_t& kernel_arg,
             uint32_t buf_size,
             uint32_t num_points) {
    // start device
    std::cout << "start device" << std::endl;
    RT_CHECK(vx_start(device));

    // wait for completion
    std::cout << "wait for completion" << std::endl;
    RT_CHECK(vx_ready_wait(device, -1));

    // flush the destination buffer caches
    std::cout << "flush the destination buffer caches" << std::endl;
    RT_CHECK(vx_flush_caches(device, kernel_arg.dst_ptr, buf_size));

    // download destination buffer
    std::cout << "download destination buffer" << std::endl;
    RT_CHECK(vx_copy_from_dev(buffer, kernel_arg.dst_ptr, buf_size, 0));

    // verify result
    std::cout << "verify result" << std::endl;
    {
        int errors = 0;
        auto buf_ptr = (int32_t*)vx_host_ptr(buffer);
        for (uint32_t i = 0; i < num_points; ++i) {
            int ref = i + i;
            int cur = buf_ptr[i];
            if (cur != ref) {
                // Select the numeric base explicitly: the caller may have left
                // std::cout in hex, and the "0x" prefix must match the digits.
                std::cout << "error at value " << std::dec << i
                          << ": actual 0x" << std::hex << cur
                          << ", expected 0x" << ref << std::dec << std::endl;
                ++errors;
            }
        }
        if (errors != 0) {
            // The count is meant to be read as a decimal number.
            std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
            std::cout << "FAILED!" << std::endl;
            return 1;
        }
    }

    return 0;
}
// Demo entry point: uploads the kernel, allocates two source buffers and one
// destination buffer on the device, fills them through a single shared host
// buffer (src0[i] = i-1, src1[i] = i+1, dst = 0xffffffff), then runs the
// kernel and verifies the result via run_test().
// Any failing step aborts through RT_CHECK (cleanup + exit(-1)).
int main(int argc, char *argv[]) {
size_t value;
kernel_arg_t kernel_arg;
// parse command arguments
parse_args(argc, argv);
// default to one element per thread when -n was not given (or was 0)
if (count == 0) {
count = 1;
}
uint32_t max_cores = vx_dev_caps(VX_CAPS_MAX_CORES);
uint32_t max_warps = vx_dev_caps(VX_CAPS_MAX_WARPS);
uint32_t max_threads = vx_dev_caps(VX_CAPS_MAX_THREADS);
// `count` elements for every hardware thread in the device
uint32_t num_points = count * max_cores * max_warps * max_threads;
uint32_t buf_size = num_points * sizeof(uint32_t);
std::cout << "number of points: " << num_points << std::endl;
std::cout << "buffer size: " << buf_size << " bytes" << std::endl;
// open device connection
std::cout << "open device connection" << std::endl;
RT_CHECK(vx_dev_open(&device));
// upload program
std::cout << "upload program" << std::endl;
RT_CHECK(vx_upload_kernel_file(device, kernel_file));
// allocate device memory
std::cout << "allocate device memory" << std::endl;
RT_CHECK(vx_alloc_dev_mem(device, buf_size, &value));
kernel_arg.src0_ptr = value;
RT_CHECK(vx_alloc_dev_mem(device, buf_size, &value));
kernel_arg.src1_ptr = value;
RT_CHECK(vx_alloc_dev_mem(device, buf_size, &value));
kernel_arg.dst_ptr = value;
kernel_arg.count = count;
// note: std::hex stays in effect on std::cout after these lines
std::cout << "dev_src0=" << std::hex << kernel_arg.src0_ptr << std::endl;
std::cout << "dev_src1=" << std::hex << kernel_arg.src1_ptr << std::endl;
std::cout << "dev_dst=" << std::hex << kernel_arg.dst_ptr << std::endl;
// allocate shared memory
std::cout << "allocate shared memory" << std::endl;
// the same buffer also stages the kernel argument, so it must hold at least that
uint32_t alloc_size = std::max<uint32_t>(buf_size, sizeof(kernel_arg_t));
RT_CHECK(vx_alloc_shared_mem(device, alloc_size, &buffer));
// upload kernel argument
std::cout << "upload kernel argument" << std::endl;
{
auto buf_ptr = (int*)vx_host_ptr(buffer);
memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(buffer, KERNEL_ARG_DEV_MEM_ADDR, sizeof(kernel_arg_t), 0));
}
// upload source buffer0
{
auto buf_ptr = (int32_t*)vx_host_ptr(buffer);
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = i-1;
}
}
std::cout << "upload source buffer0" << std::endl;
RT_CHECK(vx_copy_to_dev(buffer, kernel_arg.src0_ptr, buf_size, 0));
// upload source buffer1
{
auto buf_ptr = (int32_t*)vx_host_ptr(buffer);
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = i+1;
}
}
std::cout << "upload source buffer1" << std::endl;
RT_CHECK(vx_copy_to_dev(buffer, kernel_arg.src1_ptr, buf_size, 0));
// clear destination buffer
{
auto buf_ptr = (int32_t*)vx_host_ptr(buffer);
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = 0xffffffff;
}
}
std::cout << "clear destination buffer" << std::endl;
RT_CHECK(vx_copy_to_dev(buffer, kernel_arg.dst_ptr, buf_size, 0));
// run tests
std::cout << "run tests" << std::endl;
RT_CHECK(run_test(kernel_arg, buf_size, num_points));
// cleanup
std::cout << "cleanup" << std::endl;
cleanup();
std::cout << "PASSED!" << std::endl;
return 0;
}

Binary file not shown.

View File

@@ -0,0 +1,30 @@
#include <stdlib.h>
#include <stdio.h>
#include "intrinsics/vx_intrinsics.h"
#include "vx_api/vx_api.h"
#include "common.h"
// Per-thread work item: adds `count` consecutive elements of the two source
// buffers into the destination buffer, starting at thread_gid * count.
// `arg` points to the kernel_arg_t block prepared by the host.
void kernel_body(void* arg) {
  struct kernel_arg_t* params = (struct kernel_arg_t*)(arg);
  uint32_t per_thread = params->count;
  int32_t* lhs = (int32_t*)params->src0_ptr;
  int32_t* rhs = (int32_t*)params->src1_ptr;
  int32_t* out = (int32_t*)params->dst_ptr;

  // first element handled by this thread
  uint32_t base = vx_thread_gid() * per_thread;

  uint32_t k = 0;
  while (k < per_thread) {
    out[base + k] = lhs[base + k] + rhs[base + k];
    ++k;
  }
}
// Kernel entry point: reads the argument block from its fixed device address
// and passes it to kernel_body via vx_spawn_warps, using the warp and thread
// counts reported by the runtime.
void main() {
  struct kernel_arg_t* kargs = (struct kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;

  int warp_count = vx_num_warps();
  int thread_count = vx_num_threads();

  vx_spawn_warps(warp_count, thread_count, kernel_body, kargs);
}

BIN
driver/tests/demo/kernel.elf Executable file

Binary file not shown.

View File

@@ -0,0 +1,603 @@
//
// Copyright (c) 2017, Intel Corporation
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// Redistributions of source code must retain the above copyright notice, this
// list of conditions and the following disclaimer.
//
// Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// Neither the name of the Intel Corporation nor the names of its contributors
// may be used to endorse or promote products derived from this software
// without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
// Read from the memory locations first and then write to the memory locations
`include "platform_if.vh"
`include "afu_json_info.vh"
module ccip_std_afu
(
// CCI-P Clocks and Resets
input logic pClk, // 400MHz - CCI-P clock domain. Primary interface clock
input logic pClkDiv2, // 200MHz - CCI-P clock domain.
input logic pClkDiv4, // 100MHz - CCI-P clock domain.
input logic uClk_usr, // User clock domain. Refer to clock programming guide ** Currently provides fixed 300MHz clock **
input logic uClk_usrDiv2, // User clock domain. Half the programmed frequency ** Currently provides fixed 150MHz clock **
input logic pck_cp2af_softReset, // CCI-P ACTIVE HIGH Soft Reset
input logic [1:0] pck_cp2af_pwrState, // CCI-P AFU Power State
input logic pck_cp2af_error, // CCI-P Protocol Error Detected
// Interface structures
input t_if_ccip_Rx pck_cp2af_sRx, // CCI-P Rx Port
output t_if_ccip_Tx pck_af2cp_sTx // CCI-P Tx Port
);
//
// Run the entire design at the standard CCI-P frequency (400 MHz).
//
logic clk;
assign clk = pClk;
logic reset;
assign reset = pck_cp2af_softReset;
logic [511:0] wr_data;
logic [511:0] rd_data;
logic get_write_addr;
logic do_update;
logic rd_end_of_list;
logic rd_needed;
logic wr_needed;
logic [15:0] cnt_list_length;
// =========================================================================
//
// Register requests.
//
// =========================================================================
//
// The incoming pck_cp2af_sRx and outgoing pck_af2cp_sTx must both be
// registered. Here we register pck_cp2af_sRx and assign it to sRx.
// We also assign pck_af2cp_sTx to sTx here but don't register it.
// The code below never uses combinational logic to write sTx.
//
t_if_ccip_Rx sRx;
always_ff @(posedge clk)
begin
sRx <= pck_cp2af_sRx;
end
t_if_ccip_Tx sTx;
assign pck_af2cp_sTx = sTx;
// =========================================================================
//
// CSR (MMIO) handling.
//
// =========================================================================
// The AFU ID is a unique ID for a given program. Here we generated
// one with the "uuidgen" program and stored it in the AFU's JSON file.
// ASE and synthesis setup scripts automatically invoke afu_json_mgr
// to extract the UUID into afu_json_info.vh.
// 128-bit AFU identifier taken from the AFU_ACCEL_UUID macro.
logic [127:0] afu_id = `AFU_ACCEL_UUID;

// MMIO strobes, taken from the registered Rx port.
logic is_csr_read;
logic is_csr_write;
assign is_csr_read = sRx.c0.mmioRdValid;
assign is_csr_write = sRx.c0.mmioWrValid;

// MMIO requests reuse the c0 Rx header field; reinterpret it as an MMIO
// request header to reach the address and transaction id.
t_ccip_c0_ReqMmioHdr mmio_req_hdr;
assign mmio_req_hdr = t_ccip_c0_ReqMmioHdr'(sRx.c0.hdr);

//
// MMIO read responder: serves the device feature list.
//
// The response valid bit follows is_csr_read by one clock and is the only
// field cleared by reset. The tid and data registers are refreshed on every
// non-reset clock, whether or not a read is pending. MMIO addresses count
// 32-bit words, so 64-bit registers sit at even addresses.
//
always_ff @(posedge clk)
begin
    if (reset)
        sTx.c2.mmioRdValid <= 1'b0;
    else
    begin
        // Echo the transaction id so the host can match the response.
        sTx.c2.hdr.tid <= mmio_req_hdr.tid;
        sTx.c2.mmioRdValid <= is_csr_read;

        case (mmio_req_hdr.address)
            // AFU DFH: feature type 1 (AFU) in bits [63:60], end-of-list
            // flag in bit 40, every other bit zero.
            0: sTx.c2.data <= {4'h1, 19'h0, 1'b1, 40'h0};
            // AFU_ID_L
            2: sTx.c2.data <= afu_id[63:0];
            // AFU_ID_H
            4: sTx.c2.data <= afu_id[127:64];
            // DFH_RSVD0 / DFH_RSVD1
            6, 8: sTx.c2.data <= t_ccip_mmioData'(0);
            // Anything else reads as zero.
            default: sTx.c2.data <= t_ccip_mmioData'(0);
        endcase
    end
end
//
// CSR write handling. Host software must tell the AFU the memory address
// to which it should be writing. The address is set by writing a CSR.
//
// We use MMIO address 0 to set the memory address. The read and
// write MMIO spaces are logically separate so we are free to use
// whatever we like. This may not be good practice for cleanly
// organizing the MMIO address space, but it is legal.
// Line address of the buffer this AFU writes to, supplied by the host.
t_ccip_clAddr write_mem_addr;

// True for an MMIO write to address 0 while the write address is still
// outstanding (get_write_addr set).
logic is_mem_addr_csr_write;
assign is_mem_addr_csr_write = is_csr_write && get_write_addr &&
                               (t_ccip_mmioAddr'(0) == mmio_req_hdr.address);

// Reset arms get_write_addr. The first qualifying MMIO write stores the
// address and disarms the flag, so later writes to address 0 no longer
// match this decode.
always_ff @(posedge clk)
begin
    if (reset)
        get_write_addr <= 1'b1;
    else if (is_mem_addr_csr_write)
    begin
        get_write_addr <= 1'b0;
        write_mem_addr <= t_ccip_clAddr'(sRx.c0.data);
    end
end
// MMIO address 0 is also used for the read address: once the write address
// has been captured (get_write_addr low), an MMIO write to address 0 is
// decoded as "set read address".
logic is_mem_addr_csr_read;
assign is_mem_addr_csr_read = !get_write_addr && is_csr_write &&
                              (mmio_req_hdr.address == t_ccip_mmioAddr'(0));

// =========================================================================
//
//   Main AFU logic
//
// =========================================================================

//
// Traversal states. The type is declared ahead of the start_read register
// because that register is cleared based on the current state.
//
typedef enum logic [1:0]
{
    STATE_IDLE,
    STATE_READ,
    STATE_UPDATE,
    STATE_WRITE
}
t_state;
t_state state;

// Memory address from which this AFU will read, and the flag requesting a
// traversal that starts there.
logic start_read;
t_ccip_clAddr read_mem_addr;

//
// start_read / read_mem_addr register.
//
// start_read used to be written here AND cleared inside the state machine
// process. A variable assigned in an always_ff may not be written by any
// other process, so the clear now lives here, under the same condition the
// state machine used (STATE_READ with rd_needed set). A CSR write arriving
// in that same cycle takes priority so a new request is not lost.
//
always_ff @(posedge clk)
begin
    if (reset)
    begin
        start_read <= 1'b0;
    end
    else if (is_mem_addr_csr_read)
    begin
        read_mem_addr <= t_ccip_clAddr'(sRx.c0.data);
        start_read <= 1'b1;
    end
    else if ((state == STATE_READ) && rd_needed)
    begin
        start_read <= 1'b0;
    end
end

//
// State machine: IDLE -> READ -> UPDATE -> WRITE -> (READ | IDLE).
//
// This process owns only `state`. rd_end_of_list is deliberately not reset
// here any more: it is assigned on every clock by the next-read-address
// process from cnt_list_length, and a second writer would be a
// multiple-driver error.
//
always_ff @(posedge clk)
begin
    if (reset)
    begin
        state <= STATE_IDLE;
    end
    else
    begin
        case (state)
            STATE_IDLE:
            begin
                // Traversal begins once the read address CSR was written
                if (start_read)
                begin
                    state <= STATE_READ;
                    $display("AFU starting traversal at 0x%x", t_ccip_clAddr'(read_mem_addr));
                end
            end

            STATE_READ:
            begin
                // Leave as soon as a read has been flagged as needed; the
                // same condition clears start_read in its own process.
                if (rd_needed)
                begin
                    state <= STATE_UPDATE;
                    $display("AFU reading data and pointing to next read address...");
                end
            end

            STATE_UPDATE:
            begin
                // Wait for the read data to be marked ready for update
                if (do_update)
                begin
                    state <= STATE_WRITE;
                    $display("AFU performing comutations on the read values...");
                end
            end

            STATE_WRITE:
            begin
                // Finish when the list is exhausted, otherwise loop back
                // to READ once a write has been flagged as needed.
                if (rd_end_of_list)
                begin
                    state <= STATE_IDLE;
                    $display("AFU done...");
                end
                else
                begin
                    if (wr_needed)
                    begin
                        state <= STATE_READ;
                        $display("AFU reading again from read address...");
                    end
                end
            end
        endcase
    end
end
// =========================================================================
//
// Read logic.
//
// =========================================================================
//
// READ REQUEST
//
// Set for one clock after a c1 (write) response is seen on the registered
// Rx port.
logic addr_next_valid;
// Next read address
t_ccip_clAddr addr_next;
// Bookkeeping for the next read. There is no reset branch: all three
// registers are simply re-evaluated on every clock.
always_ff @(posedge clk)
begin
// Next read address is considered valid when a write response came back.
// (An earlier variant keyed this off the c0 almost-full flag instead.)
//addr_next_valid <= sRx.c0TxAlmFull;
addr_next_valid <= sRx.c1.rspValid;
// Placeholder stride: the increment is 0, so addr_next never changes.
// NOTE(review): addr_next is not initialised anywhere in the visible code,
// so it is X in simulation - confirm where it is meant to be loaded.
//addr_next <= addr_next + addr_size;
addr_next <= addr_next + 0;
// End of list is flagged while cnt_list_length equals 'h10, i.e. 16 decimal.
// NOTE(review): the original comment said "read 10 times" - confirm whether
// 'd10 was intended.
rd_end_of_list <= (cnt_list_length == 'h10);
end
//
// Since back pressure may prevent an immediate read request, we must
// record whether a read is needed and hold it until the request can
// be sent to the FIU.
//
// Address used by the read request header.
t_ccip_clAddr rd_addr;

//
// Pending-read flag.
//
// While rd_needed is set it keeps following the c0 almost-full flag: it
// stays set as long as the channel is almost full and drops once it is not.
// While it is clear, a new read is flagged either by start_read or by
// addr_next_valid (unless the end of the list was reached), and rd_addr is
// loaded with the matching address.
//
always_ff @(posedge clk)
begin
    if (reset)
        rd_needed <= 1'b0;
    else if (rd_needed)
        rd_needed <= sRx.c0TxAlmFull;
    else
    begin
        // A start request selects the host-supplied address, otherwise
        // the next-address register is used.
        rd_addr <= (start_read ? read_mem_addr : addr_next);
        rd_needed <= (start_read || (addr_next_valid && ! rd_end_of_list));
    end
end
//
// Emit read requests to the FIU.
//
// Read header defines the request to the FIU
// Header sent with every read request. Built combinationally from rd_addr.
t_cci_c0_ReqMemHdr rd_hdr;
always_comb
begin
    // Start from an all-zero header so fields not listed below are zero
    rd_hdr = t_cci_c0_ReqMemHdr'(0);
    // Address of the first line to fetch
    rd_hdr.address = rd_addr;
    // Four-line request
    rd_hdr.cl_len = eCL_LEN_4;
    // Virtual channel selection is left to the FIU
    rd_hdr.vc_sel = eVC_VA;
    // Read-line request type
    rd_hdr.req_type = eREQ_RDLINE_I;
end
// Send read requests to the FIU
//
// Read request emitter.
//
// A request is presented on c0 in every clock where a read is pending and
// the c0 almost-full flag is clear. The header register is refreshed on
// every non-reset clock. cnt_list_length counts the requests that were
// accepted this way.
//
always_ff @(posedge clk)
begin
    if (reset)
    begin
        cnt_list_length <= 0;
        sTx.c0.valid <= 1'b0;
    end
    else
    begin
        sTx.c0.hdr <= rd_hdr;
        sTx.c0.valid <= (rd_needed && ! sRx.c0TxAlmFull);

        if (rd_needed && ! sRx.c0TxAlmFull)
        begin
            $display("Incrementing read count...");
            cnt_list_length <= cnt_list_length + 1;
        end
    end
end
//
// READ RESPONSE HANDLING
//
//
// Receive data (read responses).
//
// Read data capture and update step. Only do_update is cleared by reset;
// rd_data and wr_data keep whatever they held.
always_ff @(posedge clk)
begin
if (reset)
begin
do_update <= 1'b0;
end
else
begin
// While in STATE_READ the c0 Rx data bus is sampled on every clock and
// do_update is raised.
// NOTE(review): this is not qualified by sRx.c0.rspValid, so rd_data may
// hold whatever was on the bus rather than the requested line - confirm.
if (state == STATE_READ)
begin
rd_data <= sRx.c0.data;
do_update <= 1'b1;
end
// In STATE_UPDATE the value to write back is the captured data plus one,
// and do_update is dropped again.
if (state == STATE_UPDATE)
begin
// Update the read data and put it in the write data to be written
wr_data <= rd_data + 1;
do_update <= 1'b0;
end
end
end
// =========================================================================
//
// Write logic.
//
// =========================================================================
//
// WRITE REQUEST
//
// Set for one clock after a c0 (read) response is seen on the registered
// Rx port. (The earlier comment spoke of a write response, but the code
// samples sRx.c0.rspValid.)
logic wr_addr_next_valid;
// Next write address
t_ccip_clAddr wr_addr_next;
// Bookkeeping for the next write. There is no reset branch: both registers
// are re-evaluated on every clock.
always_ff @(posedge clk)
begin
// Next write address is considered valid when a read response came back.
// (An earlier variant keyed this off the c1 almost-full flag instead.)
//wr_addr_next_valid <= sRx.c1TxAlmFull;
wr_addr_next_valid <= sRx.c0.rspValid;
// Placeholder stride: the increment is 0, so wr_addr_next never changes.
// NOTE(review): wr_addr_next is not initialised anywhere in the visible
// code - confirm where it is meant to be loaded.
//wr_addr_next <= wr_addr_next + addr_size;
wr_addr_next <= wr_addr_next + 0;
end
//
// Since back pressure may prevent an immediate write request, we must
// record whether a write is needed and hold it until the request can
// be sent to the FIU.
//
// Address used by the write request header.
t_ccip_clAddr wr_addr;

//
// Pending-write flag, the mirror image of the pending-read flag.
//
// While wr_needed is set it keeps following the c1 almost-full flag. While
// it is clear, a new write is flagged either by start_write or by
// wr_addr_next_valid, and wr_addr is loaded with the matching address.
// NOTE(review): no assignment to start_write is visible in this part of
// the file - confirm it is driven elsewhere.
//
always_ff @(posedge clk)
begin
    if (reset)
        wr_needed <= 1'b0;
    else if (wr_needed)
        wr_needed <= sRx.c1TxAlmFull;
    else
    begin
        // A start request selects the host-supplied address, otherwise
        // the next-address register is used.
        wr_addr <= (start_write ? write_mem_addr : wr_addr_next);
        wr_needed <= (start_write || wr_addr_next_valid);
    end
end
//
// Emit write requests to the FIU.
//
// Write header defines the request to the FIU
//
// Header sent with every write request. Built combinationally from wr_addr.
//
// Fixes relative to the previous version:
//  - the zero-initialising cast now uses the same type as the declaration
//    (t_ccip_c1_ReqMemHdr) instead of the differently named
//    t_cci_c1_ReqMemHdr;
//  - the request type is a write opcode (eREQ_WRLINE_I); the read opcode
//    eREQ_RDLINE_I belongs to the c0 request enumeration;
//  - the length is one cache line. The emitter sends a single 512-bit data
//    beat per request with sop set, which is a single-line write; declaring
//    a 4-line burst without sending the remaining beats would violate the
//    multi-line write rules of the interface.
//
t_ccip_c1_ReqMemHdr wr_hdr;
always_comb
begin
    // Start from an all-zero header so fields not listed below are zero
    wr_hdr = t_ccip_c1_ReqMemHdr'(0);
    // Write request type
    wr_hdr.req_type = eREQ_WRLINE_I;
    // Target line address
    wr_hdr.address = wr_addr;
    // Let the FIU pick the channel
    wr_hdr.vc_sel = eVC_VA;
    // Write a single cache line
    wr_hdr.cl_len = eCL_LEN_1;
    // Start of packet is true (single line write)
    wr_hdr.sop = 1'b1;
end
// Send write requests to the FIU
//
// Write request emitter.
//
// A request is presented on c1 in every clock where a write is pending and
// the c1 almost-full flag is clear. Header and data registers are refreshed
// on every non-reset clock.
//
// The data register is now written with a non-blocking assignment like the
// rest of sTx.c1. The previous blocking "=" mixed both assignment kinds on
// the same structure inside a clocked process, which can make simulation
// and synthesis disagree and is rejected by some tools.
//
always_ff @(posedge clk)
begin
    if (reset)
    begin
        sTx.c1.valid <= 1'b0;
    end
    else
    begin
        // Generate a write request when needed and the FIU isn't full
        sTx.c1.valid <= (wr_needed && ! sRx.c1TxAlmFull);
        sTx.c1.hdr <= wr_hdr;
        sTx.c1.data <= t_ccip_clData'(wr_data);
    end
end
//
// WRITE RESPONSE HANDLING
//
// Apurve: Check if a signal is to be sent to read to start reading in case
// write response does not work
//
// Send data (write requests).
//
//always_ff @(posedge clk)
//begin
// if (state == STATE_WRITE)
// begin
// rd_data <= sRx.c0.data;
// end
// if (state == STATE_UPDATE)
// begin
// // Update the write data and put it in the write data to be written
// wr_data <= rd_data + 1;
// end
//end
endmodule

View File

@@ -0,0 +1,18 @@
{
"version": 1,
"afu-image": {
"power": 0,
"afu-top-interface":
{
"name": "ccip_std_afu"
},
"accelerator-clusters":
[
{
"name": "cci_hello",
"total-contexts": 1,
"accelerator-type-uuid": "c6aa954a-9b91-4a37-abc1-1d9f0709dcc3"
}
]
}
}

View File

@@ -0,0 +1,653 @@
//
// Copyright (c) 2017, Intel Corporation
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// Redistributions of source code must retain the above copyright notice, this
// list of conditions and the following disclaimer.
//
// Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// Neither the name of the Intel Corporation nor the names of its contributors
// may be used to endorse or promote products derived from this software
// without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
// Read from the memory locations first and then write to the memory locations
`include "platform_if.vh"
`include "afu_json_info.vh"
module ccip_std_afu
(
// CCI-P Clocks and Resets
input logic pClk, // 400MHz - CCI-P clock domain. Primary interface clock
input logic pClkDiv2, // 200MHz - CCI-P clock domain.
input logic pClkDiv4, // 100MHz - CCI-P clock domain.
input logic uClk_usr, // User clock domain. Refer to clock programming guide ** Currently provides fixed 300MHz clock **
input logic uClk_usrDiv2, // User clock domain. Half the programmed frequency ** Currently provides fixed 150MHz clock **
input logic pck_cp2af_softReset, // CCI-P ACTIVE HIGH Soft Reset
input logic [1:0] pck_cp2af_pwrState, // CCI-P AFU Power State
input logic pck_cp2af_error, // CCI-P Protocol Error Detected
// Interface structures
input t_if_ccip_Rx pck_cp2af_sRx, // CCI-P Rx Port
output t_if_ccip_Tx pck_af2cp_sTx // CCI-P Tx Port
);
//
// Run the entire design at the standard CCI-P frequency (400 MHz).
//
logic clk;
assign clk = pClk;
logic reset;
assign reset = pck_cp2af_softReset;
logic [511:0] wr_data;
logic [511:0] rd_data;
logic do_update;
logic start_read;
logic start_write;
logic wr_addr_next_valid;
logic addr_next_valid;
logic rd_end_of_list;
logic rd_needed;
logic wr_needed;
logic read_req;
logic write_req;
logic [15:0] cnt_list_length;
t_ccip_clAddr rd_addr;
t_ccip_clAddr wr_addr;
t_ccip_clAddr addr_next;
t_ccip_clAddr wr_addr_next;
// =========================================================================
//
// Register requests.
//
// =========================================================================
//
// The incoming pck_cp2af_sRx and outgoing pck_af2cp_sTx must both be
// registered. Here we register pck_cp2af_sRx and assign it to sRx.
// We also assign pck_af2cp_sTx to sTx here but don't register it.
// The code below never uses combinational logic to write sTx.
//
// Registered copy of the incoming CCI-P Rx port. The logic below reads
// sRx rather than pck_cp2af_sRx.
t_if_ccip_Rx sRx;
// Tx staging structure. It is wired directly to the output port, so its
// fields are expected to be written from clocked logic only.
t_if_ccip_Tx sTx;
assign pck_af2cp_sTx = sTx;
always_ff @(posedge clk)
    sRx <= pck_cp2af_sRx;
// =========================================================================
//
// CSR (MMIO) handling.
//
// =========================================================================
// The AFU ID is a unique ID for a given program. Here we generated
// one with the "uuidgen" program and stored it in the AFU's JSON file.
// ASE and synthesis setup scripts automatically invoke afu_json_mgr
// to extract the UUID into afu_json_info.vh.
// 128-bit AFU identifier taken from the AFU_ACCEL_UUID macro.
logic [127:0] afu_id = `AFU_ACCEL_UUID;

// MMIO strobes, taken from the registered Rx port.
logic is_csr_read;
logic is_csr_write;
assign is_csr_read = sRx.c0.mmioRdValid;
assign is_csr_write = sRx.c0.mmioWrValid;

// MMIO requests reuse the c0 Rx header field; reinterpret it as an MMIO
// request header to reach the address and transaction id.
t_ccip_c0_ReqMmioHdr mmio_req_hdr;
assign mmio_req_hdr = t_ccip_c0_ReqMmioHdr'(sRx.c0.hdr);

//
// MMIO read responder: serves the device feature list plus one status
// register.
//
// The response valid bit follows is_csr_read by one clock and is the only
// field cleared by reset. The tid and data registers are refreshed on every
// non-reset clock, whether or not a read is pending. MMIO addresses count
// 32-bit words, so 64-bit registers sit at even addresses.
//
always_ff @(posedge clk)
begin
    if (reset)
        sTx.c2.mmioRdValid <= 1'b0;
    else
    begin
        // Echo the transaction id so the host can match the response.
        sTx.c2.hdr.tid <= mmio_req_hdr.tid;
        sTx.c2.mmioRdValid <= is_csr_read;

        case (mmio_req_hdr.address)
            // AFU DFH: feature type 1 (AFU) in bits [63:60], end-of-list
            // flag in bit 40, every other bit zero.
            0: sTx.c2.data <= {4'h1, 19'h0, 1'b1, 40'h0};
            // AFU_ID_L
            2: sTx.c2.data <= afu_id[63:0];
            // AFU_ID_H
            4: sTx.c2.data <= afu_id[127:64];
            // DFH_RSVD0 / DFH_RSVD1
            6, 8: sTx.c2.data <= t_ccip_mmioData'(0);
            // Status: current value of start_read, zero-extended
            10: sTx.c2.data <= t_ccip_mmioData'(start_read);
            // Anything else reads as zero.
            default: sTx.c2.data <= t_ccip_mmioData'(0);
        endcase
    end
end
//
// CSR write handling. Host software must tell the AFU the memory address
// to which it should be writing. The address is set by writing a CSR.
//
// We use MMIO address 0 to set the memory address. The read and
// write MMIO spaces are logically separate so we are free to use
// whatever we like. This may not be good practice for cleanly
// organizing the MMIO address space, but it is legal.
// MMIO write to address 0: host supplies the address to write to.
logic is_mem_addr_csr_write;
assign is_mem_addr_csr_write = is_csr_write &&
                               (mmio_req_hdr.address == t_ccip_mmioAddr'(0));

// MMIO write to address 2: host supplies the address to read from and
// thereby requests a traversal. MMIO addresses count 32-bit words, so
// address 2 is the second 64-bit CSR.
logic is_mem_addr_csr_read;
assign is_mem_addr_csr_read = is_csr_write &&
                              (mmio_req_hdr.address == t_ccip_mmioAddr'(2));

// Memory address to which this AFU will write.
t_ccip_clAddr write_mem_addr;
// Memory address from which this AFU will read.
t_ccip_clAddr read_mem_addr;

// Number of read/update/write rounds performed per traversal.
localparam logic [15:0] LIST_LENGTH = 16'd5;

// =========================================================================
//
//   Main AFU logic
//
// =========================================================================

//
// Traversal states.
//
typedef enum logic [1:0]
{
    STATE_IDLE,
    STATE_READ,
    STATE_UPDATE,
    STATE_WRITE
}
t_state;
t_state state;

//
// Request headers. Both start from an all-zero header and only fill in the
// address (and, for writes, the start-of-packet bit); request type, channel
// selection and length are left at zero.
//
t_ccip_c0_ReqMemHdr rd_hdr;
always_comb
begin
    rd_hdr = t_ccip_c0_ReqMemHdr'(0);
    rd_hdr.address = rd_addr;
end

t_ccip_c1_ReqMemHdr wr_hdr;
always_comb
begin
    wr_hdr = t_ccip_c1_ReqMemHdr'(0);
    wr_hdr.address = wr_addr;
    // Start of packet is true (single line write)
    wr_hdr.sop = 1'b1;
end

//
// Traversal engine.
//
// Previously the CSR capture, the state machine, the read path and the
// write path were separate always_ff processes that wrote each other's
// registers (start_read, start_write, rd_end_of_list, rd_needed/wr_needed
// handshakes, read_req, write_req, addr_next_valid, wr_addr_next_valid).
// A variable assigned in an always_ff may not be written by any other
// process, so everything that shares state now lives in this one process.
//
// Sequence per round:
//   READ   - issue one read of rd_addr when c0 is not almost full, then
//            wait for the read response and capture the line.
//   UPDATE - compute the write-back value (read data + 2).
//   WRITE  - issue one write of wr_addr when c1 is not almost full, then
//            wait for the write response.
// After LIST_LENGTH rounds the engine returns to IDLE.
//
// Flag meanings:
//   start_read / start_write - the host wrote the read / write address CSR;
//       cleared after the first read has been consumed / the first write
//       has completed. start_read is readable at MMIO address 10.
//   rd_needed / wr_needed    - a read / write round is in progress.
//   read_req / write_req     - the request of the current round has been
//       sent; guards against sending it twice.
//   do_update                - read data captured, update pending.
//   addr_next(_valid), wr_addr_next(_valid) - address bookkeeping. The
//       stride is still zero, so every round uses the same addresses.
//
always_ff @(posedge clk)
begin
    if (reset)
    begin
        state <= STATE_IDLE;
        start_read <= 1'b0;
        start_write <= 1'b0;
        rd_needed <= 1'b0;
        wr_needed <= 1'b0;
        read_req <= 1'b0;
        write_req <= 1'b0;
        do_update <= 1'b0;
        rd_end_of_list <= 1'b0;
        addr_next_valid <= 1'b0;
        wr_addr_next_valid <= 1'b0;
        cnt_list_length <= '0;
        sTx.c0.valid <= 1'b0;
        sTx.c1.valid <= 1'b0;
    end
    else
    begin
        // Request strobes are single-cycle pulses unless set below.
        sTx.c0.valid <= 1'b0;
        sTx.c1.valid <= 1'b0;

        case (state)
            STATE_IDLE:
            begin
                // Traversal begins when the read address CSR is written
                if (start_read)
                begin
                    rd_addr <= read_mem_addr;
                    rd_needed <= 1'b1;
                    // Restart the round counter so that a second
                    // traversal also terminates after LIST_LENGTH rounds.
                    cnt_list_length <= '0;
                    rd_end_of_list <= 1'b0;
                    state <= STATE_READ;
                    $display("AFU starting traversal at 0x%x", t_ccip_clAddr'(read_mem_addr));
                end
            end

            STATE_READ:
            begin
                if (rd_needed && !read_req && !sRx.c0TxAlmFull)
                begin
                    // Send the read request for this round
                    sTx.c0.valid <= 1'b1;
                    sTx.c0.hdr <= rd_hdr;
                    read_req <= 1'b1;
                    addr_next_valid <= 1'b0;
                    cnt_list_length <= cnt_list_length + 1;
                    $display("Incrementing read count...%d",cnt_list_length);
                    $display("Read address is 0x%x...",rd_hdr.address);
                end
                else if (read_req && sRx.c0.rspValid)
                begin
                    // Read response: capture the line and move on
                    rd_data <= sRx.c0.data;
                    rd_needed <= 1'b0;
                    do_update <= 1'b1;
                    wr_addr_next_valid <= 1'b1;
                    state <= STATE_UPDATE;
                    // Print the value being captured, not the stale register
                    $display("rd data is %d...",sRx.c0.data);
                    $display("AFU moving to UPDATE...");
                end
            end

            STATE_UPDATE:
            begin
                // Update the read data and put it in the write data to be written
                wr_data <= rd_data + 2;
                do_update <= 1'b0;
                read_req <= 1'b0;
                // First read done; the start request has been consumed
                start_read <= 1'b0;
                // The first write uses the host-supplied address, later
                // rounds reuse the remembered one.
                wr_addr <= (start_write ? write_mem_addr : wr_addr_next);
                wr_needed <= 1'b1;
                state <= STATE_WRITE;
                $display("write data is %d...",rd_data + 2);
                $display("AFU moving to WRITE...");
            end

            STATE_WRITE:
            begin
                if (wr_needed && !write_req && !sRx.c1TxAlmFull)
                begin
                    // Send the write request for this round
                    sTx.c1.valid <= 1'b1;
                    sTx.c1.hdr <= wr_hdr;
                    sTx.c1.data <= t_ccip_clData'(wr_data);
                    write_req <= 1'b1;
                    wr_addr_next_valid <= 1'b0;
                    $display("Write address is 0x%x...", wr_hdr.address);
                end
                else if (write_req && sRx.c1.rspValid)
                begin
                    // Write response: round complete
                    wr_needed <= 1'b0;
                    write_req <= 1'b0;
                    start_write <= 1'b0;
                    // Zero stride: next addresses equal the current ones
                    addr_next <= rd_addr;
                    wr_addr_next <= wr_addr;

                    if (cnt_list_length == LIST_LENGTH)
                    begin
                        rd_end_of_list <= 1'b1;
                        state <= STATE_IDLE;
                        $display("AFU done...");
                    end
                    else
                    begin
                        addr_next_valid <= 1'b1;
                        rd_needed <= 1'b1;
                        state <= STATE_READ;
                        $display("AFU moving to READ from WRITE...");
                    end
                end
            end
        endcase

        // CSR writes are handled last so that a new host request wins over
        // a clear of start_read / start_write issued in the same clock.
        if (is_mem_addr_csr_write)
        begin
            write_mem_addr <= t_ccip_clAddr'(sRx.c0.data);
            start_write <= 1'b1;
        end
        if (is_mem_addr_csr_read)
        begin
            read_mem_addr <= t_ccip_clAddr'(sRx.c0.data);
            start_read <= 1'b1;
        end
    end
end
//
// WRITE RESPONSE HANDLING
//
// Apurve: Check if a signal is to be sent to read to start reading in case
// write response does not work
//
// Send data (write requests).
//
//always_ff @(posedge clk)
//begin
// if (state == STATE_WRITE)
// begin
// rd_data <= sRx.c0.data;
// end
// if (state == STATE_UPDATE)
// begin
// // Update the write data and put it in the write data to be written
// wr_data <= rd_data + 1;
// end
//end
endmodule

View File

@@ -0,0 +1,621 @@
//
// Copyright (c) 2017, Intel Corporation
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// Redistributions of source code must retain the above copyright notice, this
// list of conditions and the following disclaimer.
//
// Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// Neither the name of the Intel Corporation nor the names of its contributors
// may be used to endorse or promote products derived from this software
// without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
// Read from the memory locations first and then write to the memory locations
`include "platform_if.vh"
`include "afu_json_info.vh"
module ccip_std_afu
(
// CCI-P Clocks and Resets
input logic pClk, // 400MHz - CCI-P clock domain. Primary interface clock
input logic pClkDiv2, // 200MHz - CCI-P clock domain.
input logic pClkDiv4, // 100MHz - CCI-P clock domain.
input logic uClk_usr, // User clock domain. Refer to clock programming guide ** Currently provides fixed 300MHz clock **
input logic uClk_usrDiv2, // User clock domain. Half the programmed frequency ** Currently provides fixed 150MHz clock **
input logic pck_cp2af_softReset, // CCI-P ACTIVE HIGH Soft Reset
input logic [1:0] pck_cp2af_pwrState, // CCI-P AFU Power State
input logic pck_cp2af_error, // CCI-P Protocol Error Detected
// Interface structures
input t_if_ccip_Rx pck_cp2af_sRx, // CCI-P Rx Port
output t_if_ccip_Tx pck_af2cp_sTx // CCI-P Tx Port
);
//
// Run the entire design at the standard CCI-P frequency (400 MHz).
//
logic clk;
assign clk = pClk;
logic reset;
assign reset = pck_cp2af_softReset;
logic [511:0] wr_data;
logic [511:0] rd_data;
logic do_update;
logic start_read;
logic start_write;
logic wr_addr_next_valid;
logic addr_next_valid;
logic rd_end_of_list;
logic rd_needed;
logic wr_needed;
logic [15:0] cnt_list_length;
t_ccip_clAddr rd_addr;
t_ccip_clAddr wr_addr;
t_ccip_clAddr addr_next;
t_ccip_clAddr wr_addr_next;
// =========================================================================
//
// Register requests.
//
// =========================================================================
//
// The incoming pck_cp2af_sRx and outgoing pck_af2cp_sTx must both be
// registered. Here we register pck_cp2af_sRx and assign it to sRx.
// We also assign pck_af2cp_sTx to sTx here but don't register it.
// The code below never uses combinational logic to write sTx.
//
t_if_ccip_Rx sRx;
always_ff @(posedge clk)
begin
sRx <= pck_cp2af_sRx;
end
t_if_ccip_Tx sTx;
assign pck_af2cp_sTx = sTx;
// =========================================================================
//
// CSR (MMIO) handling.
//
// =========================================================================
// The AFU ID is a unique ID for a given program. Here we generated
// one with the "uuidgen" program and stored it in the AFU's JSON file.
// ASE and synthesis setup scripts automatically invoke afu_json_mgr
// to extract the UUID into afu_json_info.vh.
logic [127:0] afu_id = `AFU_ACCEL_UUID;
//
// A valid AFU must implement a device feature list, starting at MMIO
// address 0. Every entry in the feature list begins with 5 64-bit
// words: a device feature header, two AFU UUID words and two reserved
// words.
//
// Is a CSR read request active this cycle?
logic is_csr_read;
assign is_csr_read = sRx.c0.mmioRdValid;
// Is a CSR write request active this cycle?
logic is_csr_write;
assign is_csr_write = sRx.c0.mmioWrValid;
// The MMIO request header is overlayed on the normal c0 memory read
// response data structure. Cast the c0Rx header to an MMIO request
// header.
t_ccip_c0_ReqMmioHdr mmio_req_hdr;
assign mmio_req_hdr = t_ccip_c0_ReqMmioHdr'(sRx.c0.hdr);
//
// Implement the device feature list by responding to MMIO reads.
//
always_ff @(posedge clk)
begin
if (reset)
begin
sTx.c2.mmioRdValid <= 1'b0;
end
else
begin
// Always respond with something for every read request
sTx.c2.mmioRdValid <= is_csr_read;
// The unique transaction ID matches responses to requests
sTx.c2.hdr.tid <= mmio_req_hdr.tid;
// Addresses are of 32-bit objects in MMIO space. Addresses
// of 64-bit objects are thus multiples of 2.
case (mmio_req_hdr.address)
0: // AFU DFH (device feature header)
begin
// Here we define a trivial feature list. In this
// example, our AFU is the only entry in this list.
sTx.c2.data <= t_ccip_mmioData'(0);
// Feature type is AFU
sTx.c2.data[63:60] <= 4'h1;
// End of list (last entry in list)
sTx.c2.data[40] <= 1'b1;
end
// AFU_ID_L
2: sTx.c2.data <= afu_id[63:0];
// AFU_ID_H
4: sTx.c2.data <= afu_id[127:64];
// DFH_RSVD0
6: sTx.c2.data <= t_ccip_mmioData'(0);
// DFH_RSVD1
8: sTx.c2.data <= t_ccip_mmioData'(0);
// Updated by apurve to check fpgaReadMMIO
10: sTx.c2.data <= t_ccip_mmioData'(start_read);
default: sTx.c2.data <= t_ccip_mmioData'(0);
endcase
end
end
//
// CSR write handling. Host software must tell the AFU the memory address
// to which it should be writing. The address is set by writing a CSR.
//
// We use MMIO address 0 to set the memory address. The read and
// write MMIO spaces are logically separate so we are free to use
// whatever we like. This may not be good practice for cleanly
// organizing the MMIO address space, but it is legal.
logic is_mem_addr_csr_write;
assign is_mem_addr_csr_write = is_csr_write &&
(mmio_req_hdr.address == t_ccip_mmioAddr'(0));
// Memory address to which this AFU will write.
t_ccip_clAddr write_mem_addr;
always_ff @(posedge clk)
begin
if (reset)
begin
start_write <= 1'b0;
end
else if (is_mem_addr_csr_write)
begin
write_mem_addr <= t_ccip_clAddr'(sRx.c0.data);
start_write <= 1'b1;
//$display("Write mem address is 0x%x", t_ccip_clAddr'(write_mem_addr));
end
end
// We use MMIO address 2 (MMIO addresses count 32-bit words, so this is the
// second 64-bit CSR) to set the memory address for reading data.
logic is_mem_addr_csr_read;
assign is_mem_addr_csr_read = is_csr_write &&
(mmio_req_hdr.address == t_ccip_mmioAddr'(2));
// Memory address from which this AFU will read.
t_ccip_clAddr read_mem_addr;
//logic start_traversal = 'b0;
//t_ccip_clAddr start_traversal_addr;
always_ff @(posedge clk)
begin
if (reset)
begin
start_read <= 1'b0;
end
else if (is_mem_addr_csr_read)
begin
read_mem_addr <= t_ccip_clAddr'(sRx.c0.data);
start_read <= 1'b1;
//$display("Read mem address is 0x%x", t_ccip_clAddr'(read_mem_addr));
end
end
// =========================================================================
//
// Main AFU logic
//
// =========================================================================
//
// States in our simple example.
//
//typedef enum logic [0:0]
// Two-bit encoding for the four states of the AFU state machine.
typedef enum logic [1:0]
{
// Waiting for start_read to be raised.
STATE_IDLE,
// Read request phase; left once rd_needed is low and do_update is set.
STATE_READ,
// Write data is derived from the read data; left once do_update clears.
STATE_UPDATE,
// Write request phase; returns to STATE_IDLE at end of list, otherwise
// to STATE_READ once wr_needed is low.
STATE_WRITE
}
t_state;
// Current state of the AFU state machine.
t_state state;
//
// State machine
//
// Sequencer for the read -> update -> write loop.
//   IDLE   -> READ   when start_read is set
//   READ   -> UPDATE when no read is pending (rd_needed low) and do_update is set
//   UPDATE -> WRITE  when do_update has been cleared; wr_needed is raised
//   WRITE  -> IDLE   when rd_end_of_list is set
//   WRITE  -> READ   otherwise, once wr_needed is low; start_write is cleared
// NOTE(review): rd_end_of_list, wr_needed and start_write are each also
// assigned by another always_ff process in this module. Confirm the tool
// flow accepts multiple writers or consolidate the assignments.
// NOTE(review): the $display calls placed directly inside a state (not
// under a condition) print on every clock spent in that state.
always_ff @(posedge clk)
begin
if (reset)
begin
state <= STATE_IDLE;
rd_end_of_list <= 1'b0;
end
else
begin
case (state)
STATE_IDLE:
begin
// Traversal begins when CSR 1 is written
if (start_read)
begin
state <= STATE_READ;
$display("AFU starting traversal at 0x%x", t_ccip_clAddr'(read_mem_addr));
end
end
STATE_READ:
begin
$display("AFU in READ...");
// Advance only after the request has been handed off (rd_needed low)
// and read data has been captured (do_update set).
if (!rd_needed && do_update)
begin
state <= STATE_UPDATE;
$display("AFU moving to UPDATE...");
end
end
STATE_UPDATE:
begin
// Update the read value to be written back
$display("AFU in UPDATE...");
// do_update is cleared by the read response handler while in this state.
if (!do_update)
begin
state <= STATE_WRITE;
wr_needed <= 1'b1;
$display("AFU moving to WRITE...");
end
end
STATE_WRITE:
begin
// Write the updated value to the address
// Point to new address after that
// if done then point to IDLE; else read new values
$display("AFU in WRITE...");
// The end-of-list check takes priority over waiting for wr_needed.
if (rd_end_of_list)
begin
state <= STATE_IDLE;
$display("AFU done...");
end
else if (!wr_needed)
begin
state <= STATE_READ;
$display("AFU moving to READ from WRITE...");
start_write <= 1'b0;
end
end
endcase
end
end
// =========================================================================
//
// Read logic.
//
// =========================================================================
//
// READ REQUEST
//
// Did a write response just arrive
// Next read address
// Registers the inputs used to decide when and where the next read goes.
// NOTE(review): this process has no reset branch, and rd_end_of_list is
// also assigned in the reset branch of the state machine (a second
// always_ff writer). Confirm which process should own that signal.
always_ff @(posedge clk)
begin
// Next read address is valid when we have got the write response back
addr_next_valid <= sRx.c1.rspValid;
// Apurve: Next address is current address plus address length
//addr_next <= addr_next + addr_size;
// The increment is currently 0, so the next address equals rd_addr.
addr_next <= rd_addr + 0;
// End of list reached if we have read 5 times
rd_end_of_list <= (cnt_list_length == 'h5);
end
//
// Since back pressure may prevent an immediate read request, we must
// record whether a read is needed and hold it until the request can
// be sent to the FIU.
//
// Tracks whether a read request is pending (rd_needed) and selects its
// address (rd_addr). rd_addr has no reset value.
always_ff @(posedge clk)
begin
if (reset)
begin
rd_needed <= 1'b0;
end
else
begin
// If reads are allowed this cycle then we can safely clear
// any previously requested reads. While the read channel is
// almost full the request stays pending.
if (rd_needed)
begin
rd_needed <= sRx.c0TxAlmFull;
end
else
begin
// Need a read under two conditions:
// - Starting a new walk (start_read is set)
// - The previous write was acknowledged (addr_next_valid), the
// end of the list has not been reached and the read channel
// is not almost full.
// NOTE(review): start_read stays set until the read response
// handler clears it in STATE_UPDATE, so this branch re-arms
// rd_needed each time it is cleared until then. Confirm that
// issuing repeated requests for the first address is intended.
rd_needed <= (start_read || (!sRx.c0TxAlmFull && (addr_next_valid && ! rd_end_of_list)));
rd_addr <= (start_read ? read_mem_addr : addr_next);
//$display("rd_addr is 0x%x", t_ccip_clAddr'(rd_addr));
//$display("read mem addr is 0x%x", t_ccip_clAddr'(read_mem_addr));
//$display("start read is %d", start_read);
end
end
end
//
// Emit read requests to the FIU.
//
// Read header defines the request to the FIU
// Combinational read request header: every field is zero except the
// address, which follows rd_addr.
t_ccip_c0_ReqMemHdr rd_hdr;
always_comb
begin
// Start from an all-zero header; the commented-out assignments below
// document fields that are intentionally left at zero.
rd_hdr = t_ccip_c0_ReqMemHdr'(0);
// Read request type (No intention to cache)
//rd_hdr.req_type = 4'h0;
// Virtual address (MPF virtual addressing is enabled)
rd_hdr.address = rd_addr;
// Read over channel VA
//rd_hdr.vc_sel = 2'h0;
// Read one cache line (64 bytes)
//rd_hdr.cl_len = 2'h0;
end
// Send read requests to the FIU
// Drives the c0 (read) request channel and counts issued read requests.
// sTx.c0.valid is only updated while state == STATE_READ; in every other
// state it keeps its previous value.
// cnt_list_length is cleared only by reset.
always_ff @(posedge clk)
begin
if (reset)
begin
sTx.c0.valid <= 1'b0;
cnt_list_length <= 0;
end
else
begin
// Generate a read request when needed and the FIU isn't full
if (state == STATE_READ)
begin
sTx.c0.valid <= (rd_needed && !sRx.c0TxAlmFull);
if (rd_needed && !sRx.c0TxAlmFull)
begin
sTx.c0.hdr <= rd_hdr;
cnt_list_length <= cnt_list_length + 1;
// The non-blocking increment has not taken effect yet, so this
// prints the count before the increment.
$display("Incrementing read count...%d",cnt_list_length);
$display("Read address is 0x%x...",rd_hdr.address);
// Apurve: Add something to stop read once this section has been accessed
end
end
end
end
//
// READ RESPONSE HANDLING
//
//
// Receive data (read responses).
//
// Captures read response data and produces the value to write back.
// rd_data and wr_data have no reset value.
// NOTE(review): start_read is also assigned by the read-address CSR
// process (a second always_ff writer). Confirm the tool flow accepts this.
always_ff @(posedge clk)
begin
if (reset)
begin
do_update <= 1'b0;
end
else
begin
// A read response arrived: keep its data and flag it for update.
if (sRx.c0.rspValid)
begin
rd_data <= sRx.c0.data;
do_update <= 1'b1;
//$display("rd data is %d...",rd_data);
end
// This block comes second, so if a response arrives in the same cycle
// as STATE_UPDATE its do_update <= 1'b0 assignment is the one that
// takes effect.
if (state == STATE_UPDATE)
begin
// Update the read data and put it in the write data to be written
wr_data <= rd_data + 2;
do_update <= 1'b0;
// Prints wr_data as it was before the non-blocking assignment above.
$display("write data is %d...",wr_data);
// First read done. Next reads should be from the updated addresses
start_read <= 1'b0;
end
end
end
// =========================================================================
//
// Write logic.
//
// =========================================================================
//
// WRITE REQUEST
//
// Did a read response just arrive
// Next write address
// This process has no reset branch.
always_ff @(posedge clk)
begin
// Next write address is valid when we have got the read response back
wr_addr_next_valid <= sRx.c0.rspValid;
// Apurve: Next address is current address plus address length
// The increment is currently 0, so the next address equals wr_addr.
wr_addr_next <= wr_addr + 0;
end
//
// Since back pressure may prevent an immediate write request, we must
// record whether a write is needed and hold it until the request can
// be sent to the FIU.
//
// Tracks whether a write request is pending (wr_needed) and selects its
// address (wr_addr). wr_addr has no reset value.
// NOTE(review): wr_needed is also set by the state machine when it leaves
// STATE_UPDATE (a second always_ff writer). Confirm the tool flow accepts
// this or consolidate the assignments.
always_ff @(posedge clk)
begin
if (reset)
begin
wr_needed <= 1'b0;
end
else
begin
// If writes are allowed this cycle then we can safely clear
// any previously requested writes. While the write channel is
// almost full the request stays pending.
if (wr_needed)
begin
wr_needed <= sRx.c1TxAlmFull;
end
else
begin
// Need a write under two conditions:
// - Starting a new walk (start_write is set)
// - A read response just arrived (wr_addr_next_valid) and the
// write channel is not almost full.
wr_needed <= (start_write || (!sRx.c1TxAlmFull && wr_addr_next_valid));
wr_addr <= (start_write ? write_mem_addr : wr_addr_next);
//$display("Write mem address later is 0x%x", t_ccip_clAddr'(write_mem_addr));
end
end
end
//
// Emit write requests to the FIU.
//
// Write header defines the request to the FIU
// Combinational write request header: every field is zero except the
// address, which follows wr_addr, and sop, which is always set.
t_ccip_c1_ReqMemHdr wr_hdr;
always_comb
begin
// Start from an all-zero header; the commented-out assignments below
// are left disabled, so those fields keep the zero value.
wr_hdr = t_ccip_c1_ReqMemHdr'(0);
// Write request type
//wr_hdr.req_type = 4'h0;
// Virtual address (MPF virtual addressing is enabled)
wr_hdr.address = wr_addr;
// Let the FIU pick the channel
//wr_hdr.vc_sel = 2'h2;
// Write 1 cache line (64 bytes)
//wr_hdr.cl_len = 2'h0;
// Start of packet is true (single line write)
wr_hdr.sop = 1'b1;
end
// Send write requests to the FIU.
//
// sTx.c1.valid is recomputed on every clock so that it is high for exactly
// the cycles in which a request is issued. It used to be updated only while
// state == STATE_WRITE, so when the state machine left STATE_WRITE in the
// same cycle a request was issued (the end-of-list path to STATE_IDLE) the
// valid bit stayed high and the same write was presented again every cycle.
always_ff @(posedge clk)
begin
    if (reset)
    begin
        sTx.c1.valid <= 1'b0;
    end
    else
    begin
        // Generate a write request when needed and the FIU isn't full;
        // outside STATE_WRITE the channel is always idle.
        sTx.c1.valid <= ((state == STATE_WRITE) && wr_needed && !sRx.c1TxAlmFull);
        if ((state == STATE_WRITE) && wr_needed && !sRx.c1TxAlmFull)
        begin
            // Header and payload only need to be valid alongside the
            // valid bit, so they are updated only when a request is sent.
            sTx.c1.hdr <= wr_hdr;
            sTx.c1.data <= t_ccip_clData'(wr_data);
        end
    end
end
//
// WRITE RESPONSE HANDLING
//
// Apurve: Check if a signal is to be sent to read to start reading in case
// write response does not work
//
// Send data (write requests).
//
//always_ff @(posedge clk)
//begin
// if (state == STATE_WRITE)
// begin
// rd_data <= sRx.c0.data;
// end
// if (state == STATE_UPDATE)
// begin
// // Update the write data and put it in the write data to be written
// wr_data <= rd_data + 1;
// end
//end
endmodule

View File

@@ -0,0 +1,2 @@
cci_hello.json
cci_hello_afu.sv

View File

@@ -0,0 +1,11 @@
#!/bin/sh
##
## Setup ASE environment using ../rtl/sources.txt.
##
## All command-line arguments are forwarded unchanged to afu_sim_setup.
##

# Absolute path to this script
SCRIPT=$(readlink -f "$0")
SCRIPT_PATH=$(dirname "$SCRIPT")

# "$@" must be quoted: unquoted, arguments containing spaces or glob
# characters would be re-split and expanded before reaching afu_sim_setup.
afu_sim_setup --sources="${SCRIPT_PATH}/../rtl/sources.txt" "$@"

View File

@@ -0,0 +1,41 @@
include ../../common/sw/common_include.mk

# Primary test name
TEST = cci_hello

# Build directory; it is also on the include path because the generated
# afu_json_info.h is written there.
OBJDIR = obj
CFLAGS += -I./$(OBJDIR)
CPPFLAGS += -I./$(OBJDIR)

# Files and folders
SRCS = $(TEST).c
OBJS = $(addprefix $(OBJDIR)/,$(patsubst %.c,%.o,$(SRCS)))

# Targets: both $(TEST) (linked with FPGA_LIBS) and $(TEST)_ase (linked with
# ASE_LIBS) are built by default.
all: $(TEST) $(TEST)_ase

# AFU info from JSON file, including AFU UUID
AFU_JSON_INFO = $(OBJDIR)/afu_json_info.h
$(AFU_JSON_INFO): ../hw/rtl/$(TEST).json | objdir
	afu_json_mgr json-info --afu-json=$^ --c-hdr=$@

# Every object may include the generated header, so build it first.
$(OBJS): $(AFU_JSON_INFO)

$(TEST): $(OBJS)
	$(CC) -o $@ $^ $(LDFLAGS) $(FPGA_LIBS)

$(TEST)_ase: $(OBJS)
	$(CC) -o $@ $^ $(LDFLAGS) $(ASE_LIBS)

$(OBJDIR)/%.o: %.c | objdir
	$(CC) $(CFLAGS) -c $< -o $@

clean:
	rm -rf $(TEST) $(TEST)_ase $(OBJDIR)

# Order-only prerequisite of the rules above: creates the build directory
# without forcing rebuilds when the directory timestamp changes.
objdir:
	@mkdir -p $(OBJDIR)

# objdir is a command, not a file; declaring it phony keeps a stray file
# named "objdir" from suppressing the mkdir.
.PHONY: all clean objdir

View File

@@ -0,0 +1,210 @@
//
// Copyright (c) 2017, Intel Corporation
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// Redistributions of source code must retain the above copyright notice, this
// list of conditions and the following disclaimer.
//
// Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// Neither the name of the Intel Corporation nor the names of its contributors
// may be used to endorse or promote products derived from this software
// without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <assert.h>
#include <uuid/uuid.h>
#include <opae/fpga.h>
// State from the AFU's JSON file, extracted using OPAE's afu_json_mgr script
#include "afu_json_info.h"
#define CACHELINE_BYTES 64
#define CL(x) ((x) * CACHELINE_BYTES)
//
// Search for an accelerator matching the requested UUID and connect to it.
//
// accel_uuid: textual UUID of the AFU to open.
//
// Returns an open accelerator handle, or NULL after printing a diagnostic
// to stderr when the UUID is malformed, enumeration fails, no matching
// accelerator exists, or the accelerator cannot be opened.
//
static fpga_handle connect_to_accel(const char *accel_uuid)
{
    fpga_properties filter = NULL;
    fpga_guid guid;
    fpga_token accel_token;
    // Start at zero so that a failed enumeration can never look like a match.
    uint32_t num_matches = 0;
    fpga_handle accel_handle = NULL;
    fpga_result r;

    // Don't print verbose messages in ASE by default
    //setenv("ASE_LOG", "0", 0);

    // Parse the UUID first; on failure guid would be left uninitialized.
    if (0 != uuid_parse(accel_uuid, guid))
    {
        fprintf(stderr, "Invalid accelerator UUID: %s\n", accel_uuid);
        return NULL;
    }

    // Set up a filter that will search for an accelerator
    r = fpgaGetProperties(NULL, &filter);
    if (FPGA_OK != r)
    {
        fprintf(stderr, "Failed to create the search filter (error %d)\n", (int)r);
        return NULL;
    }

    r = fpgaPropertiesSetObjectType(filter, FPGA_ACCELERATOR);

    // Add the desired UUID to the filter
    if (FPGA_OK == r)
    {
        r = fpgaPropertiesSetGUID(filter, guid);
    }

    // Do the search across the available FPGA contexts
    if (FPGA_OK == r)
    {
        r = fpgaEnumerate(&filter, 1, &accel_token, 1, &num_matches);
    }

    // Not needed anymore
    fpgaDestroyProperties(&filter);

    if (FPGA_OK != r)
    {
        fprintf(stderr, "Failed to enumerate accelerators (error %d)\n", (int)r);
        return NULL;
    }

    if (num_matches < 1)
    {
        fprintf(stderr, "Accelerator %s not found!\n", accel_uuid);
        return NULL;
    }

    // Open accelerator
    r = fpgaOpen(accel_token, &accel_handle, 0);

    // Done with token, whether or not the open succeeded
    fpgaDestroyToken(&accel_token);

    if (FPGA_OK != r)
    {
        fprintf(stderr, "Failed to open accelerator %s (error %d)\n",
                accel_uuid, (int)r);
        return NULL;
    }

    return accel_handle;
}
//
// Allocate a buffer in I/O memory, shared with the FPGA.
//
// accel_handle: open accelerator handle.
// size:         requested buffer size in bytes; must be positive.
// wsid:         receives the workspace id needed to release the buffer.
// io_addr:      receives the address the accelerator uses for the buffer.
//
// Returns the host pointer to the buffer, or NULL on any failure. On
// failure no buffer remains allocated.
//
static volatile void* alloc_buffer(fpga_handle accel_handle,
                                   ssize_t size,
                                   uint64_t *wsid,
                                   uint64_t *io_addr)
{
    fpga_result r;
    volatile void* buf = NULL;

    // A non-positive size would turn into a huge unsigned length below.
    if (size <= 0 || NULL == wsid || NULL == io_addr) return NULL;

    r = fpgaPrepareBuffer(accel_handle, (uint64_t)size, (void**)&buf, wsid, 0);
    if (FPGA_OK != r) return NULL;

    // Get the physical address of the buffer in the accelerator
    r = fpgaGetIOAddress(accel_handle, *wsid, io_addr);
    if (FPGA_OK != r)
    {
        // Without an I/O address the buffer is useless; do not leak it.
        fpgaReleaseBuffer(accel_handle, *wsid);
        return NULL;
    }

    return buf;
}
//
// Test driver: shares two buffers with the AFU, passes their cache-line
// addresses through MMIO CSRs and waits for the AFU to update the first
// byte of the write buffer.
//
// Returns EXIT_SUCCESS once the write buffer holds 10, or EXIT_FAILURE when
// the accelerator cannot be opened, a buffer cannot be allocated, or an
// MMIO access fails.
//
int main(int argc, char *argv[])
{
    fpga_handle accel_handle;
    volatile char *buf = NULL;
    volatile char *buf_r = NULL;
    uint64_t wsid1 = 0;
    uint64_t wsid2 = 0;
    uint64_t buf_pa = 0;
    uint64_t buf_rpa = 0;
    uint64_t ret_buf_rpa = 0;
    fpga_result r;
    int status = EXIT_FAILURE;

    (void)argc;
    (void)argv;

    // Find and connect to the accelerator
    accel_handle = connect_to_accel(AFU_ACCEL_UUID);
    if (NULL == accel_handle)
    {
        // connect_to_accel has already reported the reason
        return EXIT_FAILURE;
    }

    // Allocate a four-page memory buffer for the AFU to write
    buf = (volatile char*)alloc_buffer(accel_handle, 4 * getpagesize(),
                                       &wsid1, &buf_pa);
    if (NULL == buf)
    {
        fprintf(stderr, "Failed to allocate the write buffer\n");
        goto close_accel;
    }

    // Allocate a four-page memory buffer for the AFU to read
    buf_r = (volatile char*)alloc_buffer(accel_handle, 4 * getpagesize(),
                                         &wsid2, &buf_rpa);
    if (NULL == buf_r)
    {
        fprintf(stderr, "Failed to allocate the read buffer\n");
        goto release_write_buffer;
    }

    // Seed the low byte of both buffers with 5. The FPGA reads from buf_r
    // and writes to buf; the wait loops below watch buf[0] for a change.
    buf[0] = 5;
    buf_r[0] = 5;

    // Tell the accelerator the address of the buffer using cache line
    // addresses. The accelerator will respond by writing to the buffer.
    r = fpgaWriteMMIO64(accel_handle, 0, 0, buf_pa / CL(1));
    printf("Write address is %08llx\n", (unsigned long long)buf_pa);
    printf("Write address div 64 is %08llx\n",
           (unsigned long long)(buf_pa / CL(1)));
    if (FPGA_OK != r)
    {
        fprintf(stderr, "MMIO write of the write address failed (error %d)\n", (int)r);
        goto release_buffers;
    }

    // Read back the CSR at byte offset 40 (32-bit MMIO address 10) as a
    // check that fpgaReadMMIO64 works.
    r = fpgaReadMMIO64(accel_handle, 0, 5 * sizeof(uint64_t), &ret_buf_rpa);
    printf("Returned read at 10 is %08llx\n", (unsigned long long)ret_buf_rpa);
    if (FPGA_OK != r)
    {
        fprintf(stderr, "MMIO read failed (error %d)\n", (int)r);
        goto release_buffers;
    }

    // Tell the accelerator the address of the buffer using cache line
    // addresses. The accelerator will read from the buffer.
    // Write the address to 64-bit CSR 1 (byte offset 8).
    r = fpgaWriteMMIO64(accel_handle, 0, sizeof(uint64_t), buf_rpa / CL(1));
    printf("Read address is %08llx\n", (unsigned long long)buf_rpa);
    printf("Read address div64 is %08llx\n",
           (unsigned long long)(buf_rpa / CL(1)));
    if (FPGA_OK != r)
    {
        fprintf(stderr, "MMIO write of the read address failed (error %d)\n", (int)r);
        goto release_buffers;
    }

    // Spin, waiting for the value in memory to change from the seed value.
    // NOTE(review): neither wait loop has a timeout; the program hangs if
    // the AFU never stores the expected values.
    while (5 == buf[0])
    {
        // A well-behaved program would use _mm_pause(), nanosleep() or
        // equivalent to save power here.
    }

    // Print the first value written by the FPGA
    printf("%d\n", buf[0]);

    // Wait until the FPGA has stored the final value.
    while (10 != buf[0])
    {
    }

    status = EXIT_SUCCESS;

    // Done: release resources in reverse order of acquisition
release_buffers:
    fpgaReleaseBuffer(accel_handle, wsid2);
release_write_buffer:
    fpgaReleaseBuffer(accel_handle, wsid1);
close_accel:
    fpgaClose(accel_handle);

    return status;
}

View File

@@ -0,0 +1,13 @@
//
// Generated by afu_json_mgr from ../hw/rtl/cci_hello.json
//
#ifndef __AFU_JSON_INFO__
#define __AFU_JSON_INFO__
#define AFU_ACCEL_NAME "cci_hello"
#define AFU_ACCEL_UUID "C6AA954A-9B91-4A37-ABC1-1D9F0709DCC3"
#define AFU_IMAGE_POWER 0
#define AFU_TOP_IFC "ccip_std_afu"
#endif // __AFU_JSON_INFO__

Binary file not shown.