Vortex 2.0 changes:

+ Microarchitecture optimizations
+ 64-bit support
+ Xilinx FPGA support
+ LLVM-16 support
+ Refactoring and quality control fixes
This commit is contained in:
Blaise Tine
2023-10-19 20:51:22 -07:00
parent d69a64c32c
commit d47cccc157
1300 changed files with 247321 additions and 311189 deletions

1
sim/opaesim/.gitignore vendored Normal file
View File

@@ -0,0 +1 @@
/obj_dir/*

138
sim/opaesim/Makefile Normal file
View File

@@ -0,0 +1,138 @@
XLEN ?= 32
DESTDIR ?= .
RTL_DIR = ../../hw/rtl
DPI_DIR = ../../hw/dpi
AFU_DIR = $(RTL_DIR)/afu/opae
SCRIPT_DIR = ../../hw/scripts
THIRD_PARTY_DIR = ../../third_party
CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors -Wno-array-bounds
CXXFLAGS += -fPIC -Wno-maybe-uninitialized
CXXFLAGS += -I.. -I../../../hw -I../../common -I$(abspath $(DESTDIR))
CXXFLAGS += -I../$(THIRD_PARTY_DIR)/softfloat/source/include
CXXFLAGS += -I../$(THIRD_PARTY_DIR)
CXXFLAGS += -DXLEN_$(XLEN)
LDFLAGS += -shared ../$(THIRD_PARTY_DIR)/softfloat/build/Linux-x86_64-GCC/softfloat.a
LDFLAGS += -L../$(THIRD_PARTY_DIR)/ramulator -lramulator -pthread
# control RTL debug tracing states
DBG_TRACE_FLAGS += -DDBG_TRACE_CORE_PIPELINE
DBG_TRACE_FLAGS += -DDBG_TRACE_CORE_ICACHE
DBG_TRACE_FLAGS += -DDBG_TRACE_CORE_DCACHE
DBG_TRACE_FLAGS += -DDBG_TRACE_CORE_MEM
DBG_TRACE_FLAGS += -DDBG_TRACE_CACHE_BANK
DBG_TRACE_FLAGS += -DDBG_TRACE_CACHE_MSHR
DBG_TRACE_FLAGS += -DDBG_TRACE_CACHE_TAG
DBG_TRACE_FLAGS += -DDBG_TRACE_CACHE_DATA
DBG_TRACE_FLAGS += -DDBG_TRACE_AFU
DBG_TRACE_FLAGS += -DDBG_TRACE_SCOPE
DBG_TRACE_FLAGS += -DDBG_TRACE_TEX
DBG_TRACE_FLAGS += -DDBG_TRACE_RASTER
DBG_TRACE_FLAGS += -DDBG_TRACE_ROP
DBG_TRACE_FLAGS += -DDBG_TRACE_GBAR
# Control logic analyzer monitors
DBG_SCOPE_FLAGS += -DDBG_SCOPE_AFU
DBG_SCOPE_FLAGS += -DDBG_SCOPE_ISSUE
DBG_SCOPE_FLAGS += -DDBG_SCOPE_FETCH
DBG_SCOPE_FLAGS += -DDBG_SCOPE_LSU
DBG_SCOPE_FLAGS += -DDBG_SCOPE_RASTER
DBG_SCOPE_FLAGS += -DDBG_SCOPE_MSCHED
# AFU parameters
CONFIGS += -DPLATFORM_PROVIDES_LOCAL_MEMORY
ifeq (,$(findstring PLATFORM_PARAM_LOCAL_MEMORY_BANKS,$(CONFIGS)))
CONFIGS += -DPLATFORM_PARAM_LOCAL_MEMORY_BANKS=2
endif
ifeq (,$(findstring PLATFORM_PARAM_LOCAL_MEMORY_ADDR_WIDTH,$(CONFIGS)))
CONFIGS += -DPLATFORM_PARAM_LOCAL_MEMORY_ADDR_WIDTH=26
endif
ifeq (,$(findstring PLATFORM_PARAM_LOCAL_MEMORY_DATA_WIDTH,$(CONFIGS)))
CONFIGS += -DPLATFORM_PARAM_LOCAL_MEMORY_DATA_WIDTH=512
endif
ifeq (,$(findstring PLATFORM_PARAM_LOCAL_MEMORY_BURST_CNT_WIDTH,$(CONFIGS)))
CONFIGS += -DPLATFORM_PARAM_LOCAL_MEMORY_BURST_CNT_WIDTH=4
endif
DBG_FLAGS += -DDEBUG_LEVEL=$(DEBUG) -DVCD_OUTPUT $(DBG_TRACE_FLAGS)
SRCS = ../common/util.cpp ../common/mem.cpp ../common/rvfloats.cpp
SRCS += $(DPI_DIR)/util_dpi.cpp $(DPI_DIR)/float_dpi.cpp
SRCS += fpga.cpp opae_sim.cpp
RTL_PKGS = $(AFU_DIR)/local_mem_cfg_pkg.sv $(AFU_DIR)/ccip/ccip_if_pkg.sv
RTL_PKGS += $(RTL_DIR)/VX_gpu_pkg.sv $(RTL_DIR)/fpu/VX_fpu_pkg.sv
FPU_INCLUDE = -I$(RTL_DIR)/fpu
ifneq (,$(findstring FPU_FPNEW,$(CONFIGS)))
RTL_PKGS += $(THIRD_PARTY_DIR)/fpnew/src/fpnew_pkg.sv $(THIRD_PARTY_DIR)/fpnew/src/common_cells/src/cf_math_pkg $(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl/defs_div_sqrt_mvp.sv
FPU_INCLUDE += -I$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include -I$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src -I$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl -I$(THIRD_PARTY_DIR)/fpnew/src
endif
RTL_INCLUDE = -I$(RTL_DIR) -I$(DPI_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/core -I$(RTL_DIR)/mem -I$(RTL_DIR)/cache $(FPU_INCLUDE)
RTL_INCLUDE += -I$(AFU_DIR) -I$(AFU_DIR)/ccip
TOP = vortex_afu_shim
VL_FLAGS += --language 1800-2009 --assert -Wall -Wpedantic
VL_FLAGS += -Wno-DECLFILENAME -Wno-REDEFMACRO
VL_FLAGS += --x-initial unique --x-assign unique
VL_FLAGS += -DSIMULATION
VL_FLAGS += -DXLEN_$(XLEN)
VL_FLAGS += $(CONFIGS)
VL_FLAGS += verilator.vlt
VL_FLAGS += $(RTL_INCLUDE)
VL_FLAGS += $(RTL_PKGS)
VL_FLAGS += $(DBG_SCOPE_FLAGS)
CXXFLAGS += $(CONFIGS)
# Enable Verilator multithreaded simulation
THREADS ?= $(shell python -c 'import multiprocessing as mp; print(mp.cpu_count())')
VL_FLAGS += -j $(THREADS)
#VL_FLAGS += --threads $(THREADS)
# Debugigng
ifdef DEBUG
VL_FLAGS += --trace --trace-structs $(DBG_FLAGS)
CXXFLAGS += -g -O0 $(DBG_FLAGS)
else
VL_FLAGS += -DNDEBUG
CXXFLAGS += -O3 -DNDEBUG
endif
# Enable scope analyzer
ifdef SCOPE
VL_FLAGS += -DSCOPE
CXXFLAGS += -DSCOPE
SCOPE_JSON = $(DESTDIR)/scope.json
endif
# Enable perf counters
ifdef PERF
VL_FLAGS += -DPERF_ENABLE
CXXFLAGS += -DPERF_ENABLE
endif
# use our OPAE shim
VL_FLAGS += -DNOPAE
CXXFLAGS += -DNOPAE
PROJECT = libopae-c-sim.so
all: $(PROJECT)
$(DESTDIR)/vortex.xml:
verilator --xml-only -O0 $(VL_FLAGS) $(TOP) --xml-output $(DESTDIR)/vortex.xml
$(DESTDIR)/scope.json: $(DESTDIR)/vortex.xml
$(SCRIPT_DIR)/scope.py $(DESTDIR)/vortex.xml -o $(DESTDIR)/scope.json
$(DESTDIR)/vortex_afu.h : $(AFU_DIR)/vortex_afu.vh
$(SCRIPT_DIR)/gen_config.py -i $(AFU_DIR)/vortex_afu.vh -o $(DESTDIR)/vortex_afu.h
$(DESTDIR)/$(PROJECT): $(SRCS) $(DESTDIR)/vortex_afu.h $(SCOPE_JSON)
verilator --build --exe -O3 $(VL_FLAGS) --cc $(TOP) --top-module $(TOP) $(SRCS) -CFLAGS '$(CXXFLAGS)' -LDFLAGS '$(LDFLAGS)' -o ../$(DESTDIR)/$(PROJECT)
clean:
rm -rf obj_dir $(DESTDIR)/vortex.xml $(DESTDIR)/scope.json $(DESTDIR)/vortex_afu.h $(DESTDIR)/$(PROJECT)

154
sim/opaesim/fpga.cpp Normal file
View File

@@ -0,0 +1,154 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <stdint.h>
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <cstdlib>
#include <unistd.h>
#include <assert.h>
#include "fpga.h"
#include "opae_sim.h"
#include <VX_config.h>
#include <util.h>
using namespace vortex;
#ifdef __cplusplus
extern "C" {
#endif
extern fpga_result fpgaGetProperties(fpga_token token, fpga_properties *prop) {
__unused (token, prop);
return FPGA_OK;
}
extern fpga_result fpgaPropertiesSetObjectType(fpga_properties prop, fpga_objtype objtype) {
__unused (prop, objtype);
return FPGA_OK;
}
extern fpga_result fpgaPropertiesSetGUID(fpga_properties prop, fpga_guid guid) {
__unused (prop, guid);
return FPGA_OK;
}
extern fpga_result fpgaDestroyProperties(fpga_properties *prop) {
__unused (prop);
return FPGA_OK;
}
extern fpga_result fpgaEnumerate(const fpga_properties *filters, uint32_t num_filters, fpga_token *tokens, uint32_t max_tokens, uint32_t *num_matches) {
__unused (filters, num_filters, num_filters, tokens, max_tokens);
if (num_matches) {
*num_matches = 1;
}
return FPGA_OK;
}
extern fpga_result fpgaDestroyToken(fpga_token *token) {
__unused (token);
return FPGA_OK;
}
extern fpga_result fpgaPropertiesGetLocalMemorySize(const fpga_properties *filters, uint64_t* lms) {
__unused (filters);
if (lms) {
#if (XLEN == 64)
*lms = 0x200000000; // 8 GB
#else
*lms = 0x100000000; // 4 GB
#endif
}
return FPGA_OK;
}
extern fpga_result fpgaOpen(fpga_token token, fpga_handle *handle, int flags) {
__unused (token);
if (NULL == handle || flags != 0)
return FPGA_INVALID_PARAM;
auto sim = new opae_sim();
*handle = reinterpret_cast<fpga_handle>(sim);
return FPGA_OK;
}
extern fpga_result fpgaClose(fpga_handle handle) {
if (NULL == handle)
return FPGA_INVALID_PARAM;
auto sim = reinterpret_cast<opae_sim*>(handle);
delete sim;
return FPGA_OK;
}
extern fpga_result fpgaPrepareBuffer(fpga_handle handle, uint64_t len, void **buf_addr, uint64_t *wsid, int flags) {
if (NULL == handle || len == 0 || buf_addr == NULL || wsid == NULL)
return FPGA_INVALID_PARAM;
auto sim = reinterpret_cast<opae_sim*>(handle);
int ret = sim->prepare_buffer(len, buf_addr, wsid, flags);
if (ret != 0)
return FPGA_NO_MEMORY;
return FPGA_OK;
}
extern fpga_result fpgaReleaseBuffer(fpga_handle handle, uint64_t wsid) {
if (NULL == handle)
return FPGA_INVALID_PARAM;
auto sim = reinterpret_cast<opae_sim*>(handle);
sim->release_buffer(wsid);
return FPGA_OK;
}
extern fpga_result fpgaGetIOAddress(fpga_handle handle, uint64_t wsid, uint64_t *ioaddr) {
if (NULL == handle || ioaddr == NULL)
return FPGA_INVALID_PARAM;
auto sim = reinterpret_cast<opae_sim*>(handle);
sim->get_io_address(wsid, ioaddr);
return FPGA_OK;
}
extern fpga_result fpgaWriteMMIO64(fpga_handle handle, uint32_t mmio_num, uint64_t offset, uint64_t value) {
if (NULL == handle || mmio_num != 0)
return FPGA_INVALID_PARAM;
auto sim = reinterpret_cast<opae_sim*>(handle);
sim->write_mmio64(mmio_num, offset, value);
return FPGA_OK;
}
extern fpga_result fpgaReadMMIO64(fpga_handle handle, uint32_t mmio_num, uint64_t offset, uint64_t *value) {
if (NULL == handle || mmio_num != 0 || value == NULL)
return FPGA_INVALID_PARAM;
auto sim = reinterpret_cast<opae_sim*>(handle);
sim->read_mmio64(mmio_num, offset, value);
return FPGA_OK;
}
extern const char *fpgaErrStr(fpga_result e) {
return "";
}
#ifdef __cplusplus
}
#endif

54
sim/opaesim/fpga.h Normal file
View File

@@ -0,0 +1,54 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef __FPGA_H__
#define __FPGA_H__
#include <stdint.h>
#ifdef __cplusplus
extern "C" {
#endif
typedef enum {
FPGA_OK = 0, /**< Operation completed successfully */
FPGA_INVALID_PARAM, /**< Invalid parameter supplied */
FPGA_BUSY, /**< Resource is busy */
FPGA_EXCEPTION, /**< An exception occurred */
FPGA_NOT_FOUND, /**< A required resource was not found */
FPGA_NO_MEMORY, /**< Not enough memory to complete operation */
FPGA_NOT_SUPPORTED, /**< Requested operation is not supported */
FPGA_NO_DRIVER, /**< Driver is not loaded */
FPGA_NO_DAEMON, /**< FPGA Daemon (fpgad) is not running */
FPGA_NO_ACCESS, /**< Insufficient privileges or permissions */
FPGA_RECONF_ERROR /**< Error while reconfiguring FPGA */
} fpga_result;
typedef enum {
FPGA_DEVICE = 0,
FPGA_ACCELERATOR
} fpga_objtype;
typedef void *fpga_handle;
typedef void *fpga_token;
typedef void *fpga_properties;
typedef uint8_t fpga_guid[16];
#ifdef __cplusplus
}
#endif
#endif // __FPGA_H__

553
sim/opaesim/opae_sim.cpp Normal file
View File

@@ -0,0 +1,553 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "opae_sim.h"
#include <verilated.h>
#include "Vvortex_afu_shim.h"
#include "Vvortex_afu_shim__Syms.h"
#ifdef VCD_OUTPUT
#include <verilated_vcd_c.h>
#endif
#include <iostream>
#include <fstream>
#include <iomanip>
#include <mem.h>
#define RAMULATOR
#include <ramulator/src/Gem5Wrapper.h>
#include <ramulator/src/Request.h>
#include <ramulator/src/Statistics.h>
#include <VX_config.h>
#include <vortex_afu.h>
#include <future>
#include <list>
#include <queue>
#include <unordered_map>
#include <util.h>
#ifndef MEMORY_BANKS
#ifdef PLATFORM_PARAM_LOCAL_MEMORY_BANKS
#define MEMORY_BANKS PLATFORM_PARAM_LOCAL_MEMORY_BANKS
#else
#define MEMORY_BANKS 2
#endif
#endif
#ifndef MEM_CYCLE_RATIO
#define MEM_CYCLE_RATIO -1
#endif
#undef MEM_BLOCK_SIZE
#define MEM_BLOCK_SIZE (PLATFORM_PARAM_LOCAL_MEMORY_DATA_WIDTH / 8)
#define CACHE_BLOCK_SIZE 64
#define CCI_LATENCY 8
#define CCI_RAND_MOD 8
#define CCI_RQ_SIZE 16
#define CCI_WQ_SIZE 16
#ifndef TRACE_START_TIME
#define TRACE_START_TIME 0ull
#endif
#ifndef TRACE_STOP_TIME
#define TRACE_STOP_TIME -1ull
#endif
#ifndef VERILATOR_RESET_VALUE
#define VERILATOR_RESET_VALUE 2
#endif
#define RAM_PAGE_SIZE 4096
#define CPU_GPU_LATENCY 200
using namespace vortex;
static uint64_t timestamp = 0;
double sc_time_stamp() {
return timestamp;
}
static bool trace_enabled = false;
static uint64_t trace_start_time = TRACE_START_TIME;
static uint64_t trace_stop_time = TRACE_STOP_TIME;
bool sim_trace_enabled() {
if (timestamp >= trace_start_time
&& timestamp < trace_stop_time)
return true;
return trace_enabled;
}
void sim_trace_enable(bool enable) {
trace_enabled = enable;
}
///////////////////////////////////////////////////////////////////////////////
class opae_sim::Impl {
public:
Impl()
: stop_(false)
, host_buffer_ids_(0) {
// force random values for unitialized signals
Verilated::randReset(VERILATOR_RESET_VALUE);
Verilated::randSeed(50);
// turn off assertion before reset
Verilated::assertOn(false);
// create RTL module instance
device_ = new Vvortex_afu_shim();
#ifdef VCD_OUTPUT
Verilated::traceEverOn(true);
trace_ = new VerilatedVcdC();
device_->trace(trace_, 99);
trace_->open("trace.vcd");
#endif
ram_ = new RAM(RAM_PAGE_SIZE);
// initialize dram simulator
ramulator::Config ram_config;
ram_config.add("standard", "DDR4");
ram_config.add("channels", std::to_string(MEMORY_BANKS));
ram_config.add("ranks", "1");
ram_config.add("speed", "DDR4_2400R");
ram_config.add("org", "DDR4_4Gb_x8");
ram_config.add("mapping", "defaultmapping");
ram_config.set_core_num(1);
dram_ = new ramulator::Gem5Wrapper(ram_config, MEM_BLOCK_SIZE);
Stats::statlist.output("ramulator.ddr4.log");
// reset the device
this->reset();
// launch execution thread
future_ = std::async(std::launch::async, [&]{
while (!stop_) {
std::lock_guard<std::mutex> guard(mutex_);
this->tick();
}
});
}
~Impl() {
stop_ = true;
if (future_.valid()) {
future_.wait();
}
for (auto& buffer : host_buffers_) {
aligned_free(buffer.second.data);
}
#ifdef VCD_OUTPUT
trace_->close();
delete trace_;
#endif
delete device_;
delete ram_;
if (dram_) {
dram_->finish();
Stats::statlist.printall();
delete dram_;
}
}
int prepare_buffer(uint64_t len, void **buf_addr, uint64_t *wsid, int flags) {
auto alloc = aligned_malloc(len, CACHE_BLOCK_SIZE);
if (alloc == NULL)
return -1;
// set uninitialized data to "baadf00d"
for (uint32_t i = 0; i < len; ++i) {
((uint8_t*)alloc)[i] = (0xbaadf00d >> ((i & 0x3) * 8)) & 0xff;
}
host_buffer_t buffer;
buffer.data = (uint64_t*)alloc;
buffer.size = len;
buffer.ioaddr = uintptr_t(alloc);
auto buffer_id = host_buffer_ids_++;
host_buffers_.emplace(buffer_id, buffer);
*buf_addr = alloc;
*wsid = buffer_id;
return 0;
}
void release_buffer(uint64_t wsid) {
auto it = host_buffers_.find(wsid);
if (it != host_buffers_.end()) {
aligned_free(it->second.data);
host_buffers_.erase(it);
}
}
void get_io_address(uint64_t wsid, uint64_t *ioaddr) {
*ioaddr = host_buffers_[wsid].ioaddr;
}
void read_mmio64(uint32_t mmio_num, uint64_t offset, uint64_t *value) {
std::lock_guard<std::mutex> guard(mutex_);
// simulate CPU-GPU latency
for (uint32_t i = 0; i < CPU_GPU_LATENCY; ++i)
this->tick();
// simulate mmio request
device_->vcp2af_sRxPort_c0_mmioRdValid = 1;
device_->vcp2af_sRxPort_c0_ReqMmioHdr_address = offset / 4;
device_->vcp2af_sRxPort_c0_ReqMmioHdr_length = 1;
device_->vcp2af_sRxPort_c0_ReqMmioHdr_tid = 0;
this->tick();
device_->vcp2af_sRxPort_c0_mmioRdValid = 0;
assert(device_->af2cp_sTxPort_c2_mmioRdValid);
*value = device_->af2cp_sTxPort_c2_data;
}
void write_mmio64(uint32_t mmio_num, uint64_t offset, uint64_t value) {
std::lock_guard<std::mutex> guard(mutex_);
// simulate CPU-GPU latency
for (uint32_t i = 0; i < CPU_GPU_LATENCY; ++i)
this->tick();
// simulate mmio request
device_->vcp2af_sRxPort_c0_mmioWrValid = 1;
device_->vcp2af_sRxPort_c0_ReqMmioHdr_address = offset / 4;
device_->vcp2af_sRxPort_c0_ReqMmioHdr_length = 1;
device_->vcp2af_sRxPort_c0_ReqMmioHdr_tid = 0;
memcpy(device_->vcp2af_sRxPort_c0_data, &value, 8);
this->tick();
device_->vcp2af_sRxPort_c0_mmioWrValid = 0;
}
private:
void reset() {
cci_reads_.clear();
cci_writes_.clear();
device_->vcp2af_sRxPort_c0_mmioRdValid = 0;
device_->vcp2af_sRxPort_c0_mmioWrValid = 0;
device_->vcp2af_sRxPort_c0_rspValid = 0;
device_->vcp2af_sRxPort_c1_rspValid = 0;
device_->vcp2af_sRxPort_c0_TxAlmFull = 0;
device_->vcp2af_sRxPort_c1_TxAlmFull = 0;
for (int b = 0; b < MEMORY_BANKS; ++b) {
pending_mem_reqs_[b].clear();
device_->avs_readdatavalid[b] = 0;
device_->avs_waitrequest[b] = 0;
}
device_->reset = 1;
for (int i = 0; i < RESET_DELAY; ++i) {
device_->clk = 0;
this->eval();
device_->clk = 1;
this->eval();
}
device_->reset = 0;
for (int i = 0; i < RESET_DELAY; ++i) {
device_->clk = 0;
this->eval();
device_->clk = 1;
this->eval();
}
// Turn on assertion after reset
Verilated::assertOn(true);
}
void tick() {
this->sRxPort_bus();
this->sTxPort_bus();
this->avs_bus();
if (!dram_queue_.empty()) {
if (dram_->send(dram_queue_.front()))
dram_queue_.pop();
}
device_->clk = 0;
this->eval();
device_->clk = 1;
this->eval();
if (MEM_CYCLE_RATIO > 0) {
auto cycle = timestamp / 2;
if ((cycle % MEM_CYCLE_RATIO) == 0)
dram_->tick();
} else {
for (int i = MEM_CYCLE_RATIO; i <= 0; ++i)
dram_->tick();
}
#ifndef NDEBUG
fflush(stdout);
#endif
}
void eval() {
device_->eval();
#ifdef VCD_OUTPUT
if (sim_trace_enabled()) {
trace_->dump(timestamp);
}
#endif
++timestamp;
}
void sRxPort_bus() {
// check mmio request
bool mmio_req_enabled = device_->vcp2af_sRxPort_c0_mmioRdValid
|| device_->vcp2af_sRxPort_c0_mmioWrValid;
// schedule CCI read responses
std::list<cci_rd_req_t>::iterator cci_rd_it(cci_reads_.end());
for (auto it = cci_reads_.begin(), ie = cci_reads_.end(); it != ie; ++it) {
if (it->cycles_left > 0)
it->cycles_left -= 1;
if ((cci_rd_it == ie) && (it->cycles_left == 0)) {
cci_rd_it = it;
}
}
// schedule CCI write responses
std::list<cci_wr_req_t>::iterator cci_wr_it(cci_writes_.end());
for (auto it = cci_writes_.begin(), ie = cci_writes_.end(); it != ie; ++it) {
if (it->cycles_left > 0)
it->cycles_left -= 1;
if ((cci_wr_it == ie) && (it->cycles_left == 0)) {
cci_wr_it = it;
}
}
// send CCI write response
device_->vcp2af_sRxPort_c1_rspValid = 0;
if (cci_wr_it != cci_writes_.end()) {
device_->vcp2af_sRxPort_c1_rspValid = 1;
device_->vcp2af_sRxPort_c1_hdr_resp_type = 0;
device_->vcp2af_sRxPort_c1_hdr_mdata = cci_wr_it->mdata;
cci_writes_.erase(cci_wr_it);
}
// send CCI read response (ensure mmio disabled)
device_->vcp2af_sRxPort_c0_rspValid = 0;
if (!mmio_req_enabled
&& (cci_rd_it != cci_reads_.end())) {
device_->vcp2af_sRxPort_c0_rspValid = 1;
device_->vcp2af_sRxPort_c0_hdr_resp_type = 0;
memcpy(device_->vcp2af_sRxPort_c0_data, cci_rd_it->data.data(), CACHE_BLOCK_SIZE);
device_->vcp2af_sRxPort_c0_hdr_mdata = cci_rd_it->mdata;
/*printf("%0ld: [sim] CCI Rd Rsp: addr=%ld, mdata=%d, data=", timestamp, cci_rd_it->addr, cci_rd_it->mdata);
for (int i = 0; i < CACHE_BLOCK_SIZE; ++i)
printf("%02x", cci_rd_it->data[CACHE_BLOCK_SIZE-1-i]);
printf("\n");*/
cci_reads_.erase(cci_rd_it);
}
}
void sTxPort_bus() {
// process read requests
if (device_->af2cp_sTxPort_c0_valid) {
assert(!device_->vcp2af_sRxPort_c0_TxAlmFull);
cci_rd_req_t cci_req;
cci_req.cycles_left = CCI_LATENCY + (timestamp % CCI_RAND_MOD);
cci_req.addr = device_->af2cp_sTxPort_c0_hdr_address;
cci_req.mdata = device_->af2cp_sTxPort_c0_hdr_mdata;
auto host_ptr = (uint64_t*)(device_->af2cp_sTxPort_c0_hdr_address * CACHE_BLOCK_SIZE);
memcpy(cci_req.data.data(), host_ptr, CACHE_BLOCK_SIZE);
//printf("%0ld: [sim] CCI Rd Req: addr=%ld, mdata=%d\n", timestamp, device_->af2cp_sTxPort_c0_hdr_address, cci_req.mdata);
cci_reads_.emplace_back(cci_req);
}
// process write requests
if (device_->af2cp_sTxPort_c1_valid) {
assert(!device_->vcp2af_sRxPort_c1_TxAlmFull);
cci_wr_req_t cci_req;
cci_req.cycles_left = CCI_LATENCY + (timestamp % CCI_RAND_MOD);
cci_req.mdata = device_->af2cp_sTxPort_c1_hdr_mdata;
auto host_ptr = (uint64_t*)(device_->af2cp_sTxPort_c1_hdr_address * CACHE_BLOCK_SIZE);
memcpy(host_ptr, device_->af2cp_sTxPort_c1_data, CACHE_BLOCK_SIZE);
cci_writes_.emplace_back(cci_req);
}
// check queues overflow
device_->vcp2af_sRxPort_c0_TxAlmFull = (cci_reads_.size() >= (CCI_RQ_SIZE-1));
device_->vcp2af_sRxPort_c1_TxAlmFull = (cci_writes_.size() >= (CCI_WQ_SIZE-1));
}
void avs_bus() {
for (int b = 0; b < MEMORY_BANKS; ++b) {
// process memory responses
device_->avs_readdatavalid[b] = 0;
if (!pending_mem_reqs_[b].empty()
&& (*pending_mem_reqs_[b].begin())->ready) {
auto mem_rd_it = pending_mem_reqs_[b].begin();
auto mem_req = *mem_rd_it;
device_->avs_readdatavalid[b] = 1;
memcpy(device_->avs_readdata[b], mem_req->data.data(), MEM_BLOCK_SIZE);
uint32_t addr = mem_req->addr;
pending_mem_reqs_[b].erase(mem_rd_it);
delete mem_req;
}
// process memory requests
assert(!device_->avs_read[b] || !device_->avs_write[b]);
unsigned byte_addr = (device_->avs_address[b] * MEMORY_BANKS + b) * MEM_BLOCK_SIZE;
if (device_->avs_write[b]) {
uint64_t byteen = device_->avs_byteenable[b];
uint8_t* data = (uint8_t*)(device_->avs_writedata[b].data());
for (int i = 0; i < MEM_BLOCK_SIZE; i++) {
if ((byteen >> i) & 0x1) {
(*ram_)[byte_addr + i] = data[i];
}
}
/*printf("%0ld: [sim] MEM Wr Req: bank=%d, addr=%x, data=", timestamp, b, byte_addr);
for (int i = 0; i < MEM_BLOCK_SIZE; i++) {
printf("%02x", data[(MEM_BLOCK_SIZE-1)-i]);
}
printf("\n");*/
// send dram request
ramulator::Request dram_req(
byte_addr,
ramulator::Request::Type::WRITE,
0
);
dram_queue_.push(dram_req);
} else
if (device_->avs_read[b]) {
auto mem_req = new mem_rd_req_t();
mem_req->addr = device_->avs_address[b];
ram_->read(mem_req->data.data(), byte_addr, MEM_BLOCK_SIZE);
mem_req->ready = false;
pending_mem_reqs_[b].emplace_back(mem_req);
/*printf("%0ld: [sim] MEM Rd Req: bank=%d, addr=%x, pending={", timestamp, b, mem_req.addr * MEM_BLOCK_SIZE);
for (auto& req : pending_mem_reqs_[b]) {
if (req.cycles_left != 0)
printf(" !%0x", req.addr * MEM_BLOCK_SIZE);
else
printf(" %0x", req.addr * MEM_BLOCK_SIZE);
}
printf("}\n");*/
// send dram request
ramulator::Request dram_req(
byte_addr,
ramulator::Request::Type::READ,
std::bind([](ramulator::Request& dram_req, mem_rd_req_t* mem_req) {
mem_req->ready = true;
}, placeholders::_1, mem_req),
0
);
dram_queue_.push(dram_req);
}
device_->avs_waitrequest[b] = false;
}
}
typedef struct {
bool ready;
std::array<uint8_t, MEM_BLOCK_SIZE> data;
uint32_t addr;
} mem_rd_req_t;
typedef struct {
int cycles_left;
std::array<uint8_t, CACHE_BLOCK_SIZE> data;
uint64_t addr;
uint32_t mdata;
} cci_rd_req_t;
typedef struct {
int cycles_left;
uint32_t mdata;
} cci_wr_req_t;
typedef struct {
uint64_t* data;
size_t size;
uint64_t ioaddr;
} host_buffer_t;
std::future<void> future_;
bool stop_;
std::unordered_map<int64_t, host_buffer_t> host_buffers_;
int64_t host_buffer_ids_;
std::list<mem_rd_req_t*> pending_mem_reqs_[MEMORY_BANKS];
std::list<cci_rd_req_t> cci_reads_;
std::list<cci_wr_req_t> cci_writes_;
std::mutex mutex_;
RAM* ram_;
ramulator::Gem5Wrapper* dram_;
std::queue<ramulator::Request> dram_queue_;
Vvortex_afu_shim *device_;
#ifdef VCD_OUTPUT
VerilatedVcdC *trace_;
#endif
};
///////////////////////////////////////////////////////////////////////////////
opae_sim::opae_sim()
: impl_(new Impl())
{}
opae_sim::~opae_sim() {
delete impl_;
}
int opae_sim::prepare_buffer(uint64_t len, void **buf_addr, uint64_t *wsid, int flags) {
return impl_->prepare_buffer(len, buf_addr, wsid, flags);
}
void opae_sim::release_buffer(uint64_t wsid) {
impl_->release_buffer(wsid);
}
void opae_sim::get_io_address(uint64_t wsid, uint64_t *ioaddr) {
impl_->get_io_address(wsid, ioaddr);
}
void opae_sim::write_mmio64(uint32_t mmio_num, uint64_t offset, uint64_t value) {
impl_->write_mmio64(mmio_num, offset, value);
}
void opae_sim::read_mmio64(uint32_t mmio_num, uint64_t offset, uint64_t *value) {
impl_->read_mmio64(mmio_num, offset, value);
}

43
sim/opaesim/opae_sim.h Normal file
View File

@@ -0,0 +1,43 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <stdint.h>
namespace vortex {
class RAM;
class opae_sim {
public:
opae_sim();
virtual ~opae_sim();
int prepare_buffer(uint64_t len, void **buf_addr, uint64_t *wsid, int flags);
void release_buffer(uint64_t wsid);
void get_io_address(uint64_t wsid, uint64_t *ioaddr);
void write_mmio64(uint32_t mmio_num, uint64_t offset, uint64_t value);
void read_mmio64(uint32_t mmio_num, uint64_t offset, uint64_t *value);
private:
class Impl;
Impl* impl_;
};
}

View File

@@ -0,0 +1,8 @@
`verilator_config
lint_off -rule BLKANDNBLK -file "*/fpnew/src/*"
lint_off -rule UNOPTFLAT -file "*/fpnew/src/*"
lint_off -file "*/fpnew/src/*"
lint_off -file "*/afu/opae/ccip/ccip_if_pkg.sv"
lint_off -file "*/afu/opae/local_mem_cfg_pkg.sv"

View File

@@ -0,0 +1,178 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_platform.vh"
`IGNORE_WARNINGS_BEGIN
`include "vortex_afu.vh"
`IGNORE_WARNINGS_END
`include "VX_define.vh"
module vortex_afu_shim import local_mem_cfg_pkg::*; import ccip_if_pkg::*; (
// global signals
input clk,
input reset,
// IF signals between CCI and AFU
input logic vcp2af_sRxPort_c0_TxAlmFull,
input logic vcp2af_sRxPort_c1_TxAlmFull,
input t_ccip_vc vcp2af_sRxPort_c0_hdr_vc_used,
input logic vcp2af_sRxPort_c0_hdr_rsvd1,
input logic vcp2af_sRxPort_c0_hdr_hit_miss,
input logic [1:0] vcp2af_sRxPort_c0_hdr_rsvd0,
input t_ccip_clNum vcp2af_sRxPort_c0_hdr_cl_num,
input t_ccip_c0_rsp vcp2af_sRxPort_c0_hdr_resp_type,
input t_ccip_mdata vcp2af_sRxPort_c0_hdr_mdata,
input t_ccip_clData vcp2af_sRxPort_c0_data,
input logic vcp2af_sRxPort_c0_rspValid,
input logic vcp2af_sRxPort_c0_mmioRdValid,
input logic vcp2af_sRxPort_c0_mmioWrValid,
input t_ccip_mmioAddr vcp2af_sRxPort_c0_ReqMmioHdr_address,
input logic [1:0] vcp2af_sRxPort_c0_ReqMmioHdr_length,
input logic vcp2af_sRxPort_c0_ReqMmioHdr_rsvd,
input t_ccip_tid vcp2af_sRxPort_c0_ReqMmioHdr_tid,
input t_ccip_vc vcp2af_sRxPort_c1_hdr_vc_used,
input logic vcp2af_sRxPort_c1_hdr_rsvd1,
input logic vcp2af_sRxPort_c1_hdr_hit_miss,
input logic vcp2af_sRxPort_c1_hdr_format,
input logic vcp2af_sRxPort_c1_hdr_rsvd0,
input t_ccip_clNum vcp2af_sRxPort_c1_hdr_cl_num,
input t_ccip_c1_rsp vcp2af_sRxPort_c1_hdr_resp_type,
input t_ccip_mdata vcp2af_sRxPort_c1_hdr_mdata,
input logic vcp2af_sRxPort_c1_rspValid,
output t_ccip_vc af2cp_sTxPort_c0_hdr_vc_sel,
output logic [1:0] af2cp_sTxPort_c0_hdr_rsvd1,
output t_ccip_clLen af2cp_sTxPort_c0_hdr_cl_len,
output t_ccip_c0_req af2cp_sTxPort_c0_hdr_req_type,
output logic [5:0] af2cp_sTxPort_c0_hdr_rsvd0,
output t_ccip_clAddr af2cp_sTxPort_c0_hdr_address,
output t_ccip_mdata af2cp_sTxPort_c0_hdr_mdata,
output logic af2cp_sTxPort_c0_valid,
output logic [5:0] af2cp_sTxPort_c1_hdr_rsvd2,
output t_ccip_vc af2cp_sTxPort_c1_hdr_vc_sel,
output logic af2cp_sTxPort_c1_hdr_sop,
output logic af2cp_sTxPort_c1_hdr_rsvd1,
output t_ccip_clLen af2cp_sTxPort_c1_hdr_cl_len,
output t_ccip_c1_req af2cp_sTxPort_c1_hdr_req_type,
output logic [5:0] af2cp_sTxPort_c1_hdr_rsvd0,
output t_ccip_clAddr af2cp_sTxPort_c1_hdr_address,
output t_ccip_mdata af2cp_sTxPort_c1_hdr_mdata,
output t_ccip_clData af2cp_sTxPort_c1_data,
output logic af2cp_sTxPort_c1_valid,
output t_ccip_tid af2cp_sTxPort_c2_hdr_tid,
output logic af2cp_sTxPort_c2_mmioRdValid,
output t_ccip_mmioData af2cp_sTxPort_c2_data,
// Avalon signals for local memory access
output t_local_mem_data avs_writedata [`PLATFORM_PARAM_LOCAL_MEMORY_BANKS],
input t_local_mem_data avs_readdata [`PLATFORM_PARAM_LOCAL_MEMORY_BANKS],
output t_local_mem_addr avs_address [`PLATFORM_PARAM_LOCAL_MEMORY_BANKS],
input logic avs_waitrequest [`PLATFORM_PARAM_LOCAL_MEMORY_BANKS],
output logic avs_write [`PLATFORM_PARAM_LOCAL_MEMORY_BANKS],
output logic avs_read [`PLATFORM_PARAM_LOCAL_MEMORY_BANKS],
output t_local_mem_byte_mask avs_byteenable [`PLATFORM_PARAM_LOCAL_MEMORY_BANKS],
output t_local_mem_burst_cnt avs_burstcount [`PLATFORM_PARAM_LOCAL_MEMORY_BANKS],
input avs_readdatavalid [`PLATFORM_PARAM_LOCAL_MEMORY_BANKS]
);
t_if_ccip_Rx cp2af_sRxPort;
t_if_ccip_Tx af2cp_sTxPort;
vortex_afu #(
.NUM_LOCAL_MEM_BANKS(`PLATFORM_PARAM_LOCAL_MEMORY_BANKS)
) afu (
.clk(clk),
.reset(reset),
.cp2af_sRxPort(cp2af_sRxPort),
.af2cp_sTxPort(af2cp_sTxPort),
.avs_writedata(avs_writedata),
.avs_readdata(avs_readdata),
.avs_address(avs_address),
.avs_waitrequest(avs_waitrequest),
.avs_write(avs_write),
.avs_read(avs_read),
.avs_byteenable(avs_byteenable),
.avs_burstcount(avs_burstcount),
.avs_readdatavalid(avs_readdatavalid)
);
t_if_ccip_c0_RxHdr c0_RxHdr;
always @ (*) begin
c0_RxHdr = 'x;
if (vcp2af_sRxPort_c0_mmioWrValid || vcp2af_sRxPort_c0_mmioRdValid) begin
c0_RxHdr.reqMmioHdr.address = vcp2af_sRxPort_c0_ReqMmioHdr_address;
c0_RxHdr.reqMmioHdr.length = vcp2af_sRxPort_c0_ReqMmioHdr_length;
c0_RxHdr.reqMmioHdr.rsvd = vcp2af_sRxPort_c0_ReqMmioHdr_rsvd;
c0_RxHdr.reqMmioHdr.tid = vcp2af_sRxPort_c0_ReqMmioHdr_tid;
end else begin
c0_RxHdr.rspMemHdr.vc_used = vcp2af_sRxPort_c0_hdr_vc_used;
c0_RxHdr.rspMemHdr.rsvd1 = vcp2af_sRxPort_c0_hdr_rsvd1;
c0_RxHdr.rspMemHdr.hit_miss = vcp2af_sRxPort_c0_hdr_hit_miss;
c0_RxHdr.rspMemHdr.rsvd0 = vcp2af_sRxPort_c0_hdr_rsvd0;
c0_RxHdr.rspMemHdr.cl_num = vcp2af_sRxPort_c0_hdr_cl_num;
c0_RxHdr.rspMemHdr.resp_type = vcp2af_sRxPort_c0_hdr_resp_type;
c0_RxHdr.rspMemHdr.mdata = vcp2af_sRxPort_c0_hdr_mdata;
end
end
assign cp2af_sRxPort.c0TxAlmFull = vcp2af_sRxPort_c0_TxAlmFull;
assign cp2af_sRxPort.c1TxAlmFull = vcp2af_sRxPort_c1_TxAlmFull;
assign cp2af_sRxPort.c0.hdr = c0_RxHdr;
assign cp2af_sRxPort.c0.data = vcp2af_sRxPort_c0_data;
assign cp2af_sRxPort.c0.rspValid = vcp2af_sRxPort_c0_rspValid;
assign cp2af_sRxPort.c0.mmioRdValid = vcp2af_sRxPort_c0_mmioRdValid;
assign cp2af_sRxPort.c0.mmioWrValid = vcp2af_sRxPort_c0_mmioWrValid;
assign cp2af_sRxPort.c1.hdr.vc_used = vcp2af_sRxPort_c1_hdr_vc_used;
assign cp2af_sRxPort.c1.hdr.rsvd1 = vcp2af_sRxPort_c1_hdr_rsvd1;
assign cp2af_sRxPort.c1.hdr.hit_miss = vcp2af_sRxPort_c1_hdr_hit_miss;
assign cp2af_sRxPort.c1.hdr.format = vcp2af_sRxPort_c1_hdr_format;
assign cp2af_sRxPort.c1.hdr.rsvd0 = vcp2af_sRxPort_c1_hdr_rsvd0;
assign cp2af_sRxPort.c1.hdr.cl_num = vcp2af_sRxPort_c1_hdr_cl_num;
assign cp2af_sRxPort.c1.hdr.resp_type = vcp2af_sRxPort_c1_hdr_resp_type;
assign cp2af_sRxPort.c1.hdr.mdata = vcp2af_sRxPort_c1_hdr_mdata;
assign cp2af_sRxPort.c1.rspValid = vcp2af_sRxPort_c1_rspValid;
assign af2cp_sTxPort_c0_hdr_vc_sel = af2cp_sTxPort.c0.hdr.vc_sel;
assign af2cp_sTxPort_c0_hdr_rsvd1 = af2cp_sTxPort.c0.hdr.rsvd1;
assign af2cp_sTxPort_c0_hdr_cl_len = af2cp_sTxPort.c0.hdr.cl_len;
assign af2cp_sTxPort_c0_hdr_req_type = af2cp_sTxPort.c0.hdr.req_type;
assign af2cp_sTxPort_c0_hdr_rsvd0 = af2cp_sTxPort.c0.hdr.rsvd0;
assign af2cp_sTxPort_c0_hdr_address = af2cp_sTxPort.c0.hdr.address;
assign af2cp_sTxPort_c0_hdr_mdata = af2cp_sTxPort.c0.hdr.mdata;
assign af2cp_sTxPort_c0_valid = af2cp_sTxPort.c0.valid;
assign af2cp_sTxPort_c1_hdr_rsvd2 = af2cp_sTxPort.c1.hdr.rsvd2;
assign af2cp_sTxPort_c1_hdr_vc_sel = af2cp_sTxPort.c1.hdr.vc_sel;
assign af2cp_sTxPort_c1_hdr_sop = af2cp_sTxPort.c1.hdr.sop;
assign af2cp_sTxPort_c1_hdr_rsvd1 = af2cp_sTxPort.c1.hdr.rsvd1;
assign af2cp_sTxPort_c1_hdr_cl_len = af2cp_sTxPort.c1.hdr.cl_len;
assign af2cp_sTxPort_c1_hdr_req_type = af2cp_sTxPort.c1.hdr.req_type;
assign af2cp_sTxPort_c1_hdr_rsvd0 = af2cp_sTxPort.c1.hdr.rsvd0;
assign af2cp_sTxPort_c1_hdr_address = af2cp_sTxPort.c1.hdr.address;
assign af2cp_sTxPort_c1_hdr_mdata = af2cp_sTxPort.c1.hdr.mdata;
assign af2cp_sTxPort_c1_data = af2cp_sTxPort.c1.data;
assign af2cp_sTxPort_c1_valid = af2cp_sTxPort.c1.valid;
assign af2cp_sTxPort_c2_hdr_tid = af2cp_sTxPort.c2.hdr.tid;
assign af2cp_sTxPort_c2_mmioRdValid = af2cp_sTxPort.c2.mmioRdValid;
assign af2cp_sTxPort_c2_data = af2cp_sTxPort.c2.data;
endmodule