Vortex 2.0 changes:

+ Microarchitecture optimizations
+ 64-bit support
+ Xilinx FPGA support
+ LLVM-16 support
+ Refactoring and quality control fixes
This commit is contained in:
Blaise Tine
2023-10-19 20:51:22 -07:00
parent d69a64c32c
commit d47cccc157
1300 changed files with 247321 additions and 311189 deletions

View File

@@ -1,3 +1,16 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <stdint.h>
@@ -5,31 +18,42 @@
#include <queue>
#include <unordered_map>
#include <util.h>
#include <stringutil.h>
#include <VX_config.h>
#include <simobject.h>
#include "uuid_gen.h"
#include "debug.h"
namespace vortex {
typedef uint8_t Byte;
#if XLEN == 32
#if (XLEN == 32)
typedef uint32_t Word;
typedef int32_t WordI;
typedef uint64_t DWord;
typedef int64_t DWordI;
#elif XLEN == 64
typedef uint32_t WordF;
#elif (XLEN == 64)
typedef uint64_t Word;
typedef int64_t WordI;
typedef __uint128_t DWord;
typedef __int128_t DWordI;
typedef uint64_t WordF;
#else
#error unsupported XLEN
#endif
typedef uint64_t FWord;
#define MAX_NUM_CORES 1024
#define MAX_NUM_THREADS 32
#define MAX_NUM_WARPS 32
#define MAX_NUM_REGS 32
typedef std::bitset<32> RegMask;
typedef std::bitset<32> ThreadMask;
typedef std::bitset<32> WarpMask;
typedef std::bitset<MAX_NUM_CORES> CoreMask;
typedef std::bitset<MAX_NUM_REGS> RegMask;
typedef std::bitset<MAX_NUM_THREADS> ThreadMask;
typedef std::bitset<MAX_NUM_WARPS> WarpMask;
typedef std::unordered_map<uint32_t, uint32_t> CSRs;
///////////////////////////////////////////////////////////////////////////////
@@ -40,8 +64,8 @@ enum class RegType {
Vector
};
inline std::ostream &operator<<(std::ostream &os, const RegType& clss) {
switch (clss) {
inline std::ostream &operator<<(std::ostream &os, const RegType& type) {
switch (type) {
case RegType::None: break;
case RegType::Integer: os << "x"; break;
case RegType::Float: os << "f"; break;
@@ -53,23 +77,19 @@ inline std::ostream &operator<<(std::ostream &os, const RegType& clss) {
///////////////////////////////////////////////////////////////////////////////
enum class ExeType {
NOP,
ALU,
LSU,
CSR,
FPU,
GPU,
SFU,
MAX,
};
inline std::ostream &operator<<(std::ostream &os, const ExeType& type) {
switch (type) {
case ExeType::NOP: os << "NOP"; break;
case ExeType::ALU: os << "ALU"; break;
case ExeType::LSU: os << "LSU"; break;
case ExeType::CSR: os << "CSR"; break;
case ExeType::FPU: os << "FPU"; break;
case ExeType::GPU: os << "GPU"; break;
case ExeType::SFU: os << "SFU"; break;
case ExeType::MAX: break;
}
return os;
@@ -82,8 +102,7 @@ enum class AluType {
BRANCH,
SYSCALL,
IMUL,
IDIV,
CMOV,
IDIV
};
inline std::ostream &operator<<(std::ostream &os, const AluType& type) {
@@ -93,7 +112,6 @@ inline std::ostream &operator<<(std::ostream &os, const AluType& type) {
case AluType::SYSCALL: os << "SYSCALL"; break;
case AluType::IMUL: os << "IMUL"; break;
case AluType::IDIV: os << "IDIV"; break;
case AluType::CMOV: os << "CMOV"; break;
}
return os;
}
@@ -103,16 +121,14 @@ inline std::ostream &operator<<(std::ostream &os, const AluType& type) {
enum class LsuType {
LOAD,
STORE,
FENCE,
PREFETCH,
FENCE
};
inline std::ostream &operator<<(std::ostream &os, const LsuType& type) {
switch (type) {
case LsuType::LOAD: os << "LOAD"; break;
case LsuType::STORE: os << "STORE"; break;
case LsuType::FENCE: os << "FENCE"; break;
case LsuType::PREFETCH: os << "PREFETCH"; break;
case LsuType::LOAD: os << "LOAD"; break;
case LsuType::STORE: os << "STORE"; break;
case LsuType::FENCE: os << "FENCE"; break;
}
return os;
}
@@ -141,21 +157,6 @@ struct mem_addr_size_t {
uint32_t size;
};
inline AddrType get_addr_type(Word addr, uint32_t size) {
__unused (size);
if (SM_ENABLE) {
if (addr >= (SMEM_BASE_ADDR - SMEM_SIZE)
&& addr < SMEM_BASE_ADDR) {
assert((addr + size) <= SMEM_BASE_ADDR);
return AddrType::Shared;
}
}
if (addr >= IO_BASE_ADDR) {
return AddrType::IO;
}
return AddrType::Global;
}
///////////////////////////////////////////////////////////////////////////////
enum class FpuType {
@@ -179,23 +180,37 @@ inline std::ostream &operator<<(std::ostream &os, const FpuType& type) {
///////////////////////////////////////////////////////////////////////////////
enum class GpuType {
enum class SfuType {
TMC,
WSPAWN,
SPLIT,
JOIN,
BAR,
PRED,
CSRRW,
CSRRS,
CSRRC,
TEX,
RASTER,
ROP,
CMOV
};
inline std::ostream &operator<<(std::ostream &os, const GpuType& type) {
inline std::ostream &operator<<(std::ostream &os, const SfuType& type) {
switch (type) {
case GpuType::TMC: os << "TMC"; break;
case GpuType::WSPAWN: os << "WSPAWN"; break;
case GpuType::SPLIT: os << "SPLIT"; break;
case GpuType::JOIN: os << "JOIN"; break;
case GpuType::BAR: os << "BAR"; break;
case GpuType::TEX: os << "TEX"; break;
case SfuType::TMC: os << "TMC"; break;
case SfuType::WSPAWN: os << "WSPAWN"; break;
case SfuType::SPLIT: os << "SPLIT"; break;
case SfuType::JOIN: os << "JOIN"; break;
case SfuType::BAR: os << "BAR"; break;
case SfuType::PRED: os << "PRED"; break;
case SfuType::CSRRW: os << "CSRRW"; break;
case SfuType::CSRRS: os << "CSRRS"; break;
case SfuType::CSRRC: os << "CSRRC"; break;
case SfuType::TEX: os << "TEX"; break;
case SfuType::RASTER: os << "RASTER"; break;
case SfuType::ROP: os << "ROP"; break;
case SfuType::CMOV: os << "CMOV"; break;
}
return os;
}
@@ -218,31 +233,32 @@ inline std::ostream &operator<<(std::ostream &os, const ArbiterType& type) {
///////////////////////////////////////////////////////////////////////////////
struct MemReq {
uint64_t addr;
bool write;
bool non_cacheable;
uint32_t tag;
uint32_t core_id;
uint64_t uuid;
uint64_t addr;
bool write;
AddrType type;
uint32_t tag;
uint32_t cid;
uint64_t uuid;
MemReq(uint64_t _addr = 0,
bool _write = false,
bool _non_cacheable = false,
uint64_t _tag = 0,
uint32_t _core_id = 0,
uint64_t _uuid = 0
) : addr(_addr)
, write(_write)
, non_cacheable(_non_cacheable)
, tag(_tag)
, core_id(_core_id)
, uuid(_uuid)
{}
MemReq(uint64_t _addr = 0,
bool _write = false,
AddrType _type = AddrType::Global,
uint64_t _tag = 0,
uint32_t _cid = 0,
uint64_t _uuid = 0
) : addr(_addr)
, write(_write)
, type(_type)
, tag(_tag)
, cid(_cid)
, uuid(_uuid)
{}
};
inline std::ostream &operator<<(std::ostream &os, const MemReq& req) {
os << "mem-" << (req.write ? "wr" : "rd") << ": ";
os << "addr=" << std::hex << req.addr << std::dec << ", tag=" << req.tag << ", core_id=" << req.core_id;
os << "addr=0x" << std::hex << req.addr << ", type=" << req.type;
os << std::dec << ", tag=" << req.tag << ", cid=" << req.cid;
os << " (#" << std::dec << req.uuid << ")";
return os;
}
@@ -250,18 +266,19 @@ inline std::ostream &operator<<(std::ostream &os, const MemReq& req) {
///////////////////////////////////////////////////////////////////////////////
struct MemRsp {
uint64_t tag;
uint32_t core_id;
uint64_t uuid;
MemRsp(uint64_t _tag = 0, uint32_t _core_id = 0, uint64_t _uuid = 0)
: tag (_tag)
, core_id(_core_id)
, uuid(_uuid)
{}
uint64_t tag;
uint32_t cid;
uint64_t uuid;
MemRsp(uint64_t _tag = 0, uint32_t _cid = 0, uint64_t _uuid = 0)
: tag (_tag)
, cid(_cid)
, uuid(_uuid)
{}
};
inline std::ostream &operator<<(std::ostream &os, const MemRsp& rsp) {
os << "mem-rsp: tag=" << rsp.tag << ", core_id=" << rsp.core_id;
os << "mem-rsp: tag=" << rsp.tag << ", cid=" << rsp.cid;
os << " (#" << std::dec << rsp.uuid << ")";
return os;
}
@@ -270,10 +287,6 @@ inline std::ostream &operator<<(std::ostream &os, const MemRsp& rsp) {
template <typename T>
class HashTable {
private:
std::vector<std::pair<bool, T>> entries_;
uint32_t size_;
public:
HashTable(uint32_t capacity)
: entries_(capacity)
@@ -336,92 +349,180 @@ public:
}
size_ = 0;
}
private:
std::vector<std::pair<bool, T>> entries_;
uint32_t size_;
};
///////////////////////////////////////////////////////////////////////////////
template <typename Req, typename Rsp, uint32_t MaxInputs = 32>
template <typename Req, typename Rsp>
class Switch : public SimObject<Switch<Req, Rsp>> {
private:
ArbiterType type_;
uint32_t delay_;
uint32_t cursor_;
uint32_t tag_shift_;
public:
std::vector<SimPort<Req>> ReqIn;
std::vector<SimPort<Rsp>> RspIn;
std::vector<SimPort<Req>> ReqOut;
std::vector<SimPort<Rsp>> RspOut;
Switch(
const SimContext& ctx,
const char* name,
ArbiterType type,
uint32_t num_inputs,
uint32_t num_inputs = 1,
uint32_t num_outputs = 1,
uint32_t delay = 1
)
: SimObject<Switch<Req, Rsp, MaxInputs>>(ctx, name)
: SimObject<Switch<Req, Rsp>>(ctx, name)
, ReqIn(num_inputs, this)
, RspIn(num_inputs, this)
, ReqOut(num_outputs, this)
, RspOut(num_outputs, this)
, type_(type)
, delay_(delay)
, cursor_(0)
, tag_shift_(log2ceil(num_inputs))
, ReqIn(num_inputs, this)
, ReqOut(this)
, RspIn(this)
, RspOut(num_inputs, this)
, cursors_(num_outputs, 0)
, lg_num_reqs_(log2ceil(num_inputs / num_outputs))
{
assert(delay_ != 0);
assert(num_inputs <= MaxInputs);
if (num_inputs == 1) {
// bypass
ReqIn.at(0).bind(&ReqOut);
RspIn.bind(&RspOut.at(0));
assert(delay != 0);
assert(num_inputs <= 32);
assert(num_outputs <= 32);
assert(num_inputs >= num_outputs);
if (num_inputs == num_outputs) {
// bypass mode
for (uint32_t i = 0; i < num_inputs; ++i) {
ReqIn.at(i).bind(&ReqOut.at(i));
RspOut.at(i).bind(&RspIn.at(i));
}
}
}
void reset() {
cursor_ = 0;
for (auto& cursor : cursors_) {
cursor = 0;
}
}
void tick() {
if (ReqIn.size() == 1)
void tick() {
uint32_t I = ReqIn.size();
uint32_t O = ReqOut.size();
uint32_t R = 1 << lg_num_reqs_;
// skip bypass mode
if (I == O)
return;
// process incomming requests
for (uint32_t i = 0, n = ReqIn.size(); i < n; ++i) {
uint32_t j = (cursor_ + i) % n;
auto& req_in = ReqIn.at(j);
if (!req_in.empty()) {
auto& req = req_in.front();
if (tag_shift_) {
req.tag = (req.tag << tag_shift_) | j;
// process incomming requests
for (uint32_t o = 0; o < O; ++o) {
for (uint32_t r = 0; r < R; ++r) {
uint32_t i = (cursors_.at(o) + r) & (R-1);
uint32_t j = o * R + i;
if (j >= I)
continue;
auto& req_in = ReqIn.at(j);
if (!req_in.empty()) {
auto& req = req_in.front();
if (lg_num_reqs_ != 0) {
req.tag = (req.tag << lg_num_reqs_) | i;
}
DT(4, this->name() << "-" << req);
ReqOut.at(o).send(req, delay_);
req_in.pop();
this->update_cursor(o, i);
break;
}
ReqOut.send(req, delay_);
req_in.pop();
this->update_cursor(j);
break;
}
}
// process incoming reponses
if (!RspIn.empty()) {
auto& rsp = RspIn.front();
uint32_t port_id = 0;
if (tag_shift_) {
port_id = rsp.tag & ((1 << tag_shift_)-1);
rsp.tag >>= tag_shift_;
}
RspOut.at(port_id).send(rsp, 1);
RspIn.pop();
// process incoming reponses
if (!RspOut.at(o).empty()) {
auto& rsp = RspOut.at(o).front();
uint32_t i = 0;
if (lg_num_reqs_ != 0) {
i = rsp.tag & (R-1);
rsp.tag >>= lg_num_reqs_;
}
DT(4, this->name() << "-" << rsp);
uint32_t j = o * R + i;
RspIn.at(j).send(rsp, 1);
RspOut.at(o).pop();
}
}
}
void update_cursor(uint32_t grant) {
void update_cursor(uint32_t index, uint32_t grant) {
if (type_ == ArbiterType::RoundRobin) {
cursor_ = grant + 1;
cursors_.at(index) = grant + 1;
}
}
std::vector<SimPort<Req>> ReqIn;
SimPort<Req> ReqOut;
SimPort<Rsp> RspIn;
std::vector<SimPort<Rsp>> RspOut;
private:
ArbiterType type_;
uint32_t delay_;
std::vector<uint32_t> cursors_;
uint32_t lg_num_reqs_;
};
///////////////////////////////////////////////////////////////////////////////
class SMemDemux : public SimObject<SMemDemux> {
public:
SimPort<MemReq> ReqIn;
SimPort<MemRsp> RspIn;
SimPort<MemReq> ReqSm;
SimPort<MemRsp> RspSm;
SimPort<MemReq> ReqDc;
SimPort<MemRsp> RspDc;
SMemDemux(
const SimContext& ctx,
const char* name,
uint32_t delay = 1
) : SimObject<SMemDemux>(ctx, name)
, ReqIn(this)
, RspIn(this)
, ReqSm(this)
, RspSm(this)
, ReqDc(this)
, RspDc(this)
, delay_(delay)
{}
void reset() {}
void tick() {
// process incomming requests
if (!ReqIn.empty()) {
auto& req = ReqIn.front();
DT(4, this->name() << "-" << req);
if (req.type == AddrType::Shared) {
ReqSm.send(req, delay_);
} else {
ReqDc.send(req, delay_);
}
ReqIn.pop();
}
// process incoming reponses
if (!RspSm.empty()) {
auto& rsp = RspSm.front();
DT(4, this->name() << "-" << rsp);
RspIn.send(rsp, 1);
RspSm.pop();
}
if (!RspDc.empty()) {
auto& rsp = RspDc.front();
DT(4, this->name() << "-" << rsp);
RspIn.send(rsp, 1);
RspDc.pop();
}
}
private:
uint32_t delay_;
};
}