Merge branch 'main' of https://github.com/ucb-bar/radiance into main

This commit is contained in:
Richard Yan
2024-06-09 15:26:07 -07:00
13 changed files with 1791 additions and 40 deletions

View File

@@ -2,6 +2,7 @@
# extra variables/targets ingested by the chipyard make system
##############################################################
VORTEX_SRC_DIR = $(base_dir)/generators/radiance/src/main/resources/vsrc/vortex
RADPIE_SRC_DIR = $(base_dir)/generators/radiance/radpie
RADPIE_BUILD_DIR = $(RADPIE_SRC_DIR)/target/release
@@ -10,13 +11,14 @@ RADPIE_BUILD_DIR = $(RADPIE_SRC_DIR)/target/release
##################################################################
# EXTRA_SIM_REQS += radpie
EXTRA_SIM_LDFLAGS += -L$(RADPIE_BUILD_DIR) -Wl,-rpath,$(RADPIE_BUILD_DIR) -lradpie
# EXTRA_SIM_LDFLAGS += -L$(RADPIE_BUILD_DIR) -Wl,-rpath,$(RADPIE_BUILD_DIR) -lradpie
ifeq ($(shell echo $(CONFIG) | grep -E "SynConfig$$"),$(CONFIG))
EXTRA_SIM_PREPROC_DEFINES += +define+SYNTHESIS +define+NDEBUG +define+DPI_DISABLE
endif
EXTRA_SIM_PREPROC_DEFINES += \
+define+SIMULATION \
+define+GPR_RESET \
+define+GPR_DUPLICATED \
+define+LSU_DUP_DISABLE \
+define+DBG_TRACE_CORE_PIPELINE_VCS \
+define+PERF_ENABLE \
@@ -35,3 +37,30 @@ VCS_NONCC_OPTS += +vcs+initreg+random
# Builds the radpie crate by running `cargo build --release` inside
# $(RADPIE_SRC_DIR). Declared phony, so the recipe runs every time the target
# is requested.
# NOTE(review): the resulting library is presumably what RADPIE_BUILD_DIR
# ($(RADPIE_SRC_DIR)/target/release) points at -- confirm against the link flags.
.PHONY: radpie
radpie:
cd $(RADPIE_SRC_DIR) && cargo build --release
EXTRA_SIM_REQS += vortex_vsrc.$(CONFIG)
# doesn't work if we use $(call lookup_srcs) from common.mk, the variable
# doesn't expand somehow
# Collect every Vortex Verilog source (*.sv, *.vh, *.v) under VORTEX_SRC_DIR,
# following symlinks. `fd` is preferred when installed; otherwise fall back to
# `find` so that the prerequisite list of vortex_vsrc.$(CONFIG) is not silently
# left empty (which would make the gen-collateral sync a no-op).
# NOTE: unlike `fd`, `find` does not skip hidden or VCS-ignored files, so the
# fallback list may be a superset of what `fd` would report.
ifneq ($(shell which fd 2> /dev/null),)
VORTEX_VLOG_SOURCES := $(shell fd -L -t f -e "sv" -e "vh" -e "v" . $(VORTEX_SRC_DIR))
else
VORTEX_VLOG_SOURCES := $(shell find -L $(VORTEX_SRC_DIR) -type f \( -name "*.sv" -o -name "*.vh" -o -name "*.v" \) 2> /dev/null)
endif
# VORTEX_COLLATERAL := $(patsubst $(VORTEX_SRC_DIR)%,$(GEN_COLLATERAL_DIR)%,$(VORTEX_VLOG_SOURCES))
# check if expanded
# $(info VORTEX_VLOG_SOURCES: $(VORTEX_VLOG_SOURCES))
# For every Vortex verilog source file, if there's a matching file in
# gen-collateral/, copy them over. This is a hacky way to ensure the changes
# in the verilog sources are reflected before Verilator/VCS kicks in. This is
# necessary when common.mk does not trigger chipyard jar rebuild upon verilog
# source updates, in which case we need to manually ensure the up-to-date-ness
# of gen-collateral/.
# Stamp target. For each file in VORTEX_VLOG_SOURCES, if a file with the same
# basename already exists in $(GEN_COLLATERAL_DIR) and its contents differ, the
# source is copied over it; sources without a counterpart are left alone.
# The comparison output is discarded with the portable `>/dev/null 2>&1` form
# (`&>` is a bash extension; a POSIX sh would background the command instead),
# and all paths are quoted so that unusual file names do not split.
vortex_vsrc.$(CONFIG): $(VORTEX_VLOG_SOURCES)
	@for file in $(VORTEX_VLOG_SOURCES); do \
		filename=$$(basename "$$file"); \
		if [ -f "$(GEN_COLLATERAL_DIR)/$$filename" ]; then \
			if ! diff "$$file" "$(GEN_COLLATERAL_DIR)/$$filename" >/dev/null 2>&1; then \
				cp -v "$$file" "$(GEN_COLLATERAL_DIR)"; \
			fi; \
		fi; \
	done
	touch $@

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,226 @@
// See LICENSE.SiFive for license details.
// See LICENSE.Berkeley for license details.
package radiance.core
import chisel3._
import chisel3.util._
import freechips.rocketchip.tile
// Four-element dot product (FEDP) unit as found in Volta Tensor Cores.
// Thin wrapper around DotProductPipe: every incoming fLen-bit operand goes
// through recode/unbox before entering the pipe, and the pipe's result goes
// through box/ieee before being driven on io.out.
class TensorDotProductUnit extends Module with tile.HasFPUParameters {
  // Members required by HasFPUParameters; everything is 32 bits wide here.
  val fLen = 32
  val minFLen = 32
  def xLen = 32

  // Number of element pairs multiplied per operation.
  val dotProductDim = 4

  val io = IO(new Bundle {
    val in = Flipped(Valid(new Bundle {
      val a = Vec(dotProductDim, Bits(fLen.W))
      val b = Vec(dotProductDim, Bits(fLen.W))
      val c = Bits(fLen.W)
    }))
    val stall = Input(Bool())
    val out = Valid(new Bundle {
      val data = Bits(fLen.W)
    })
  })

  val t = tile.FType.S

  // Conversion shared by all operands on their way into the pipe.
  private def toPipeFormat(x: UInt) = unbox(recode(x, S), S, Some(t))

  val in1 = io.in.bits.a.map(toPipeFormat(_))
  val in2 = io.in.bits.b.map(toPipeFormat(_))
  val in3 = toPipeFormat(io.in.bits.c)

  val dpu = Module(new DotProductPipe(dotProductDim, t.exp, t.sig))

  // Request side.
  dpu.io.stall := io.stall
  dpu.io.in.valid := io.in.valid
  dpu.io.in.bits.a := in1
  dpu.io.in.bits.b := in2
  dpu.io.in.bits.c := in3

  // Response side.
  io.out.valid := dpu.io.out.valid
  io.out.bits.data := ieee(box(dpu.io.out.bits.data, S))
}
// Copied from chisel3.util.Pipe.
// Module wrapper around the `StallingPipe` factory: `io.enq` is passed through
// the factory together with `io.stall`, and the result drives `io.deq`.
// `gen` is the payload type; `latency` defaults to 1 and is forwarded as-is.
// NOTE(review): the factory's `apply` in this file requires latency == 1, so
// any other value should fail at elaboration -- confirm this is intended.
class StallingPipe[T <: Data](val gen: T, val latency: Int = 1) extends Module {
/** A non-ambiguous name of this `StallingPipe` for use in generated Verilog
* names. Includes the latency cycle count in the name as well as the
* parameterized generator's `typeName`, e.g. presumably `StallingPipe1_UInt4`
* (the leading part comes from `simpleClassName` -- TODO confirm its output).
*/
override def desiredName = s"${simpleClassName(this.getClass)}${latency}_${gen.typeName}"
// Port bundle: a stall input plus Valid-wrapped enqueue/dequeue payloads.
class StallingPipeIO extends Bundle {
val stall = Input(Bool())
val enq = Input(Valid(gen))
val deq = Output(Valid(gen))
}
val io = IO(new StallingPipeIO)
io.deq <> StallingPipe(io.stall, io.enq, latency)
}
// Factory for a one-cycle pipeline register stage with a stall input.
object StallingPipe {
import chisel3.experimental.prefix
// Registers (enqValid, enqBits) and returns them as a Valid.
//   - the valid register resets to false and updates whenever `stall` is low
//   - the data register has no reset value and updates only when `stall` is
//     low and `enqValid` is high
// While `stall` is high both registers keep their contents.
// Only latency == 1 is accepted; any other value fails the `require`.
def apply[T <: Data](stall: Bool, enqValid: Bool, enqBits: T, latency: Int): Valid[T] = {
require(latency == 1, "StallingPipe only supports latency equals one!")
// Everything created below is prefixed with "stalling_pipe".
prefix("stalling_pipe") {
val out = Wire(Valid(chiselTypeOf(enqBits)))
val v = RegEnable(enqValid, false.B, !stall)
val b = RegEnable(enqBits, !stall && enqValid)
out.valid := v
out.bits := b
out
}
}
// Convenience overload: same as above with latency fixed to 1.
def apply[T <: Data](stall: Bool, enqValid: Bool, enqBits: T): Valid[T] = {
apply(stall, enqValid, enqBits, 1)
}
// Convenience overload taking a Valid bundle instead of separate valid/bits.
def apply[T <: Data](stall: Bool, enq: Valid[T], latency: Int = 1): Valid[T] = {
apply(stall, enq.valid, enq.bits, latency)
}
}
// Computes d = a(0)*b(0) + ... + a(3)*b(3) + c.
// Four register stages (mul -> add1 -> add2 -> acc), each built from a
// StallingPipe, giving a fixed latency of 4 un-stalled cycles. The addend c is
// carried alongside the partial sums until the final accumulate stage.
class DotProductPipe(dim: Int, expWidth: Int, sigWidth: Int) extends Module {
  require(dim == 4, "DPU currently only supports dimension 4")
  val recFLen = expWidth + sigWidth + 1

  val io = IO(new Bundle {
    val in = Flipped(Valid(new Bundle {
      val a = Vec(4, Bits(recFLen.W))
      val b = Vec(4, Bits(recFLen.W))
      val c = Bits(recFLen.W)
      // val roundingMode = UInt(3.W)
      // val detectTininess = UInt(1.W)
    }))
    val stall = Input(Bool())
    val out = Valid(new Bundle {
      val data = Bits(recFLen.W)
    })
  })

  // Drives the rounding controls of one hardfloat unit with the constants used
  // throughout this pipe.
  // FIXME: these settings are arbitrary
  private def useFixedRounding(roundingMode: UInt, detectTininess: UInt): Unit = {
    roundingMode := hardfloat.consts.round_near_even
    detectTininess := hardfloat.consts.tininess_afterRounding
  }

  // ---- mul stage: one multiplier per element pair -----------------------------
  val mul = Seq.fill(dim)(Module(new hardfloat.MulRecFN(expWidth, sigWidth)))
  for ((m, i) <- mul.zipWithIndex) {
    useFixedRounding(m.io.roundingMode, m.io.detectTininess)
    m.io.a := io.in.bits.a(i)
    m.io.b := io.in.bits.b(i)
  }
  val mulStageOut = StallingPipe(io.stall, io.in.valid, VecInit(mul.map(_.io.out)))
  val mulStageC = StallingPipe(io.stall, io.in.valid, io.in.bits.c)

  // ---- add1 stage: pairwise sums of neighbouring products ---------------------
  val add1 = Seq.fill(dim / 2)(Module(new hardfloat.AddRecFN(expWidth, sigWidth)))
  for ((a, i) <- add1.zipWithIndex) {
    a.io.subOp := 0.U // FIXME
    a.io.a := mulStageOut.bits(2 * i + 0)
    a.io.b := mulStageOut.bits(2 * i + 1)
    useFixedRounding(a.io.roundingMode, a.io.detectTininess)
  }
  val add1StageOut = StallingPipe(io.stall, mulStageOut.valid, VecInit(add1.map(_.io.out)))
  val add1StageC = StallingPipe(io.stall, mulStageOut.valid, mulStageC.bits)

  // ---- add2 stage: sum of the two partial sums --------------------------------
  val add2 = Module(new hardfloat.AddRecFN(expWidth, sigWidth))
  assert(add1StageOut.bits.length == 2)
  add2.io.subOp := 0.U // FIXME
  add2.io.a := add1StageOut.bits(0)
  add2.io.b := add1StageOut.bits(1)
  useFixedRounding(add2.io.roundingMode, add2.io.detectTininess)
  val add2StageOut = StallingPipe(io.stall, add1StageOut.valid, add2.io.out)
  val add2StageC = StallingPipe(io.stall, add1StageOut.valid, add1StageC.bits)

  // ---- acc stage: add the carried addend c ------------------------------------
  val acc = Module(new hardfloat.AddRecFN(expWidth, sigWidth))
  acc.io.subOp := 0.U // FIXME
  acc.io.a := add2StageOut.bits
  acc.io.b := add2StageC.bits
  useFixedRounding(acc.io.roundingMode, acc.io.detectTininess)
  val accStageOut = StallingPipe(io.stall, add2StageOut.valid, acc.io.out)
  // FIXME: exception output ignored

  io.out.valid := accStageOut.valid
  io.out.bits.data := accStageOut.bits
}
// Fused multiply-add built from hardfloat's preMul / postMul / round blocks,
// with up to two optional register stages selected by `latency` (0, 1 or 2):
//   - latency > 0  places one Pipe stage between the multiply-add and postMul
//   - latency == 2 places a second Pipe stage between postMul and rounding
// `io.validout` is `io.validin` passed through the same stages.
// Operand/result ports are (expWidth + sigWidth + 1) bits wide.
class MulAddRecFNPipe(latency: Int, expWidth: Int, sigWidth: Int) extends Module {
require(latency <= 2)
val io = IO(new Bundle {
val validin = Input(Bool())
val op = Input(Bits(2.W))
val a = Input(Bits((expWidth + sigWidth + 1).W))
val b = Input(Bits((expWidth + sigWidth + 1).W))
val c = Input(Bits((expWidth + sigWidth + 1).W))
val roundingMode = Input(UInt(3.W))
val detectTininess = Input(UInt(1.W))
val out = Output(Bits((expWidth + sigWidth + 1).W))
val exceptionFlags = Output(Bits(5.W))
val validout = Output(Bool())
})
//------------------------------------------------------------------------
// Stage 0: pre-multiply decode and the integer multiply-add itself.
//------------------------------------------------------------------------
val mulAddRecFNToRaw_preMul = Module(new hardfloat.MulAddRecFNToRaw_preMul(expWidth, sigWidth))
val mulAddRecFNToRaw_postMul = Module(new hardfloat.MulAddRecFNToRaw_postMul(expWidth, sigWidth))
mulAddRecFNToRaw_preMul.io.op := io.op
mulAddRecFNToRaw_preMul.io.a := io.a
mulAddRecFNToRaw_preMul.io.b := io.b
mulAddRecFNToRaw_preMul.io.c := io.c
// `+&` is the width-expanding add, so the carry out of the sum is kept.
val mulAddResult =
(mulAddRecFNToRaw_preMul.io.mulAddA *
mulAddRecFNToRaw_preMul.io.mulAddB) +&
mulAddRecFNToRaw_preMul.io.mulAddC
val valid_stage0 = Wire(Bool())
val roundingMode_stage0 = Wire(UInt(3.W))
val detectTininess_stage0 = Wire(UInt(1.W))
// Number of registers between the multiply-add and postMul (0 or 1).
val postmul_regs = if(latency>0) 1 else 0
mulAddRecFNToRaw_postMul.io.fromPreMul := Pipe(io.validin, mulAddRecFNToRaw_preMul.io.toPostMul, postmul_regs).bits
mulAddRecFNToRaw_postMul.io.mulAddResult := Pipe(io.validin, mulAddResult, postmul_regs).bits
mulAddRecFNToRaw_postMul.io.roundingMode := Pipe(io.validin, io.roundingMode, postmul_regs).bits
roundingMode_stage0 := Pipe(io.validin, io.roundingMode, postmul_regs).bits
detectTininess_stage0 := Pipe(io.validin, io.detectTininess, postmul_regs).bits
// Only the valid bit of this Pipe is used; the false.B payload is a placeholder.
valid_stage0 := Pipe(io.validin, false.B, postmul_regs).valid
//------------------------------------------------------------------------
// Stage 1: rounding of the raw postMul result.
//------------------------------------------------------------------------
val roundRawFNToRecFN = Module(new hardfloat.RoundRawFNToRecFN(expWidth, sigWidth, 0))
// Number of registers between postMul and the rounder (1 only when latency == 2).
val round_regs = if(latency==2) 1 else 0
roundRawFNToRecFN.io.invalidExc := Pipe(valid_stage0, mulAddRecFNToRaw_postMul.io.invalidExc, round_regs).bits
roundRawFNToRecFN.io.in := Pipe(valid_stage0, mulAddRecFNToRaw_postMul.io.rawOut, round_regs).bits
roundRawFNToRecFN.io.roundingMode := Pipe(valid_stage0, roundingMode_stage0, round_regs).bits
roundRawFNToRecFN.io.detectTininess := Pipe(valid_stage0, detectTininess_stage0, round_regs).bits
// As above, only the valid bit is taken from this Pipe.
io.validout := Pipe(valid_stage0, false.B, round_regs).valid
roundRawFNToRecFN.io.infiniteExc := false.B
io.out := roundRawFNToRecFN.io.out
io.exceptionFlags := roundRawFNToRecFN.io.exceptionFlags
}

View File

@@ -5,7 +5,8 @@ package radiance.memory
import chisel3._
import chisel3.util._
import org.chipsalliance.cde.config.{Field, Parameters}
import freechips.rocketchip.diplomacy._
import freechips.rocketchip.diplomacy.{IdRange, AddressSet, BufferParams}
import org.chipsalliance.diplomacy.lazymodule.{LazyModule, LazyModuleImp}
import freechips.rocketchip.util.{Code, MultiPortQueue, OnePortLanePositionedQueue}
import freechips.rocketchip.unittest._
import freechips.rocketchip.tilelink._

View File

@@ -4,6 +4,7 @@ import chisel3._
import chisel3.util._
import chisel3.experimental._
import freechips.rocketchip.diplomacy._
import org.chipsalliance.diplomacy.lazymodule.{LazyModule, LazyModuleImp}
import freechips.rocketchip.tilelink._
import org.chipsalliance.cde.config.{Parameters, Field}
@@ -14,15 +15,11 @@ case class VortexL1Config(
numBanks: Int,
inputSize: Int, // This is the read/write granularity of the L1 cache
cacheLineSize: Int,
coreTagWidth: Int,
writeInfoReqQSize: Int,
mshrSize: Int,
memSideSourceIds: Int,
uncachedAddrSets: Seq[AddressSet]
) {
def coreTagPlusSizeWidth: Int = {
log2Ceil(inputSize) + coreTagWidth
}
// NOTE: This assertion depends on the fact that the Vortex cache is
// configured to have 1 bank, and that it uses MSHR id as the tag of
// memory-side requests. Otherwise, it will append bank id to the tag as
@@ -39,7 +36,6 @@ object defaultVortexL1Config
numBanks = 4,
inputSize = 16,
cacheLineSize = 16,
coreTagWidth = 8,
writeInfoReqQSize = 16,
mshrSize = 8,
memSideSourceIds = 8,
@@ -95,13 +91,18 @@ class VortexBankPassThrough(config: VortexL1Config)(implicit p: Parameters)
)
)
// HACK: Set arbitrarily since we cannot query the coresideNode's sourceId
// here. See comment on the require below.
// @perf: This is quite high
val sourceWidth = 9
// Master node to downstream
val clientParam = Seq(
TLMasterPortParameters.v1(
clients = Seq(
TLMasterParameters.v1(
name = "VortexBankPassthrough",
sourceId = IdRange(0, 1 << config.coreTagWidth),
sourceId = IdRange(0, 1 << sourceWidth),
supportsProbe = TransferSizes(1, config.cacheLineSize),
supportsGet = TransferSizes(1, config.cacheLineSize),
supportsPutFull = TransferSizes(1, config.cacheLineSize),
@@ -121,6 +122,16 @@ class VortexBankPassThrough(config: VortexL1Config)(implicit p: Parameters)
val (upstream, _) = coresideNode.in(0)
val (downstream, _) = vxCacheFetchNode.out(0)
// Make sure the outgoing edge of this passthrough has enough sourceIds
// that encompasses the core-side incoming edge's. This is an unfortunate
// hack due to not doing proper param negotiations across disconnected
// Diplomacy graphs.
// println(s"${upstream.params.sourceBits} <= ${downstream.params.sourceBits}")
require(upstream.params.sourceBits <= downstream.params.sourceBits,
"mem-side source of L1 cache truncates core-side source! " +
"Try lowering core/coalescer srcIds, or increasing sourceWidth " +
"for VortexBankPassThrough")
downstream.a <> upstream.a
upstream.d <> downstream.d
}
@@ -197,13 +208,17 @@ class VortexBankImp(
outer: VortexBank,
config: VortexL1Config
) extends LazyModuleImp(outer) {
val (tlInFromCoal, _) = outer.coresideNode.in.head
val coreTagWidth = tlInFromCoal.a.bits.source.getWidth
val coreTagWidthPlusSize = coreTagWidth + log2Ceil(config.inputSize)
val vxCache = Module(
new VX_cache_top(
WORD_SIZE = config.inputSize,
// distribute total size across numBanks
CACHE_SIZE = config.cacheSize / config.numBanks,
CACHE_LINE_SIZE = config.cacheLineSize,
CORE_TAG_WIDTH = config.coreTagPlusSizeWidth,
CORE_TAG_WIDTH = coreTagWidthPlusSize,
MSHR_SIZE = config.mshrSize
)
);
@@ -232,7 +247,7 @@ class VortexBankImp(
class ReadReqInfo(config: VortexL1Config) extends Bundle {
val size = UInt(log2Ceil(4).W + 1)
val id = UInt(config.coreTagWidth.W)
val id = UInt(coreTagWidth.W)
}
val coreWriteReqQueue = Module(
@@ -247,8 +262,6 @@ class VortexBankImp(
// Translate TL request from Coalescer to requests for VX_cache
def TLReq2VXReq = {
val (tlInFromCoal, _) = outer.coresideNode.in.head
// coal -> vxCache
tlInFromCoal.a.ready :=
vxCache.io.core_req_ready && coreWriteReqQueue.io.enq.ready // not optimal
@@ -269,13 +282,9 @@ class VortexBankImp(
readReqInfo.id := tlInFromCoal.a.bits.source
readReqInfo.size := tlInFromCoal.a.bits.size
assert(readReqInfo.id.getWidth == tlInFromCoal.a.bits.source.getWidth,
s"id width mismatch; coalescer ${tlInFromCoal.a.bits.source.getWidth}, cache ${readReqInfo.id.getWidth}")
s"id width mismatch; core-side ${tlInFromCoal.a.bits.source.getWidth}, cache-side ${readReqInfo.id.getWidth}")
assert(readReqInfo.size.getWidth == tlInFromCoal.a.bits.size.getWidth,
s"size width mismatch; coalescer ${tlInFromCoal.a.bits.size.getWidth}, cache ${readReqInfo.size.getWidth}")
assert(readReqInfo.id.getWidth == tlInFromCoal.a.bits.source.getWidth,
s"id width mismatch; coalescer ${tlInFromCoal.a.bits.source.getWidth}, cache ${readReqInfo.id.getWidth}")
assert(readReqInfo.size.getWidth == tlInFromCoal.a.bits.size.getWidth,
s"size width mismatch; coalescer ${tlInFromCoal.a.bits.size.getWidth}, cache ${readReqInfo.size.getWidth}")
s"size width mismatch; core-side ${tlInFromCoal.a.bits.size.getWidth}, cache-side ${readReqInfo.size.getWidth}")
// ignore param, size, corrupt
vxCache.io.core_req_tag := readReqInfo.asTypeOf(vxCache.io.core_req_tag)

View File

@@ -42,7 +42,7 @@ class WithRadianceCores(
useVxCache: Boolean
) extends Config((site, _, up) => {
case TilesLocated(`location`) => {
val prev = up(TilesLocated(`location`), site)
val prev = up(TilesLocated(`location`))
val idOffset = prev.size
val vortex = RadianceTileParams(
core = VortexCoreParams(fpu = None),
@@ -87,7 +87,7 @@ class WithRadianceGemmini(location: HierarchicalLocation,
crossing: RocketCrossingParams,
dim: Int, accSizeInKB: Int, tileSize: Int) extends Config((site, _, up) => {
case TilesLocated(`location`) => {
val prev = up(TilesLocated(`location`), site)
val prev = up(TilesLocated(`location`))
val idOffset = prev.size
if (idOffset == 0) {
println("******WARNING****** gemmini tile id is 0! radiance tiles in the same cluster needs to be before gemmini")
@@ -171,7 +171,7 @@ class WithFuzzerCores(
useVxCache: Boolean
) extends Config((site, _, up) => {
case TilesLocated(InSubsystem) => {
val prev = up(TilesLocated(InSubsystem), site)
val prev = up(TilesLocated(InSubsystem))
val idOffset = prev.size
val fuzzer = FuzzerTileParams(
core = VortexCoreParams(fpu = None),
@@ -202,11 +202,11 @@ class WithRadianceCluster(
case PossibleTileLocations => up(PossibleTileLocations) :+ InCluster(clusterId)
})
// `nSrcIds`: number of source IDs for dmem requests on each SIMT lane
// `nSrcIds`: number of source IDs for each mem lane. This is for all warps
class WithSimtConfig(nWarps: Int = 4, nCoreLanes: Int = 4, nMemLanes: Int = 4, nSrcIds: Int = 8)
extends Config((site, _, up) => {
case SIMTCoreKey => {
Some(up(SIMTCoreKey, site).getOrElse(SIMTCoreParams()).copy(
Some(up(SIMTCoreKey).getOrElse(SIMTCoreParams()).copy(
nWarps = nWarps,
nCoreLanes = nCoreLanes,
nMemLanes = nMemLanes,
@@ -228,22 +228,18 @@ extends Config((site, _, _) => {
class WithPriorityCoalXbar extends Config((site, _, up) => {
case CoalXbarKey => {
Some(up(CoalXbarKey, site).getOrElse(CoalXbarParam))
Some(up(CoalXbarKey).getOrElse(CoalXbarParam))
}
})
class WithVortexL1Banks(nBanks: Int = 4) extends Config ((site, _, up) => {
class WithVortexL1Banks(nBanks: Int = 4) extends Config ((site, here, up) => {
case VortexL1Key => {
Some(defaultVortexL1Config.copy(
numBanks = nBanks,
inputSize = up(SIMTCoreKey).get.nMemLanes * 4,
cacheLineSize = up(SIMTCoreKey).get.nMemLanes * 4,
inputSize = up(SIMTCoreKey).get.nMemLanes * 4/*32b word*/,
cacheLineSize = up(SIMTCoreKey).get.nMemLanes * 4/*32b word*/,
memSideSourceIds = 16,
mshrSize = 16,
coreTagWidth = log2Ceil(up(SIMTCoreKey).get.nSrcIds.max(up(CoalescerKey) match {
case Some(key) => key.numNewSrcIds
case None => 0
})) + log2Ceil(up(SIMTCoreKey).get.nMemLanes) + 1
))
}
})
@@ -254,7 +250,7 @@ class WithVortexL1Banks(nBanks: Int = 4) extends Config ((site, _, up) => {
// to e.g. compare waveforms.
class WithCoalescer(nNewSrcIds: Int = 8, enable : Boolean = true) extends Config((site, _, up) => {
case CoalescerKey => {
val (nLanes, numOldSrcIds) = up(SIMTCoreKey, site) match {
val (nLanes, numOldSrcIds) = up(SIMTCoreKey) match {
case Some(param) => (param.nMemLanes, param.nSrcIds)
case None => (1,1)
}
@@ -266,7 +262,7 @@ class WithCoalescer(nNewSrcIds: Int = 8, enable : Boolean = true) extends Config
// If instantiating L1 cache, the maximum coalescing size should match the
// cache line size
val maxCoalSizeInBytes = up(VortexL1Key, site) match {
val maxCoalSizeInBytes = up(VortexL1Key) match {
case Some(param) => param.inputSize
case None => sbusWidthInBytes
}
@@ -291,7 +287,7 @@ class WithNCustomSmallRocketCores(
crossing: RocketCrossingParams = RocketCrossingParams()
) extends Config((site, here, up) => {
case TilesLocated(InSubsystem) => {
val prev = up(TilesLocated(InSubsystem), site)
val prev = up(TilesLocated(InSubsystem))
val idOffset = overrideIdOffset.getOrElse(prev.size)
val med = RocketTileParams(
core = RocketCoreParams(fpu = None),
@@ -325,7 +321,7 @@ class WithNCustomSmallRocketCores(
class WithExtGPUMem(address: BigInt = BigInt("0x100000000", 16),
size: BigInt = 0x80000000) extends Config((site, here, up) => {
case GPUMemory() => Some(GPUMemParams(address, size))
case ExtMem => up(ExtMem, site).map(x => {
case ExtMem => up(ExtMem).map(x => {
val gap = address - x.master.base - x.master.size
x.copy(master = x.master.copy(size = x.master.size + gap + size))
})

View File

@@ -89,7 +89,7 @@ class RadianceCluster (
val stride_by_word = true
val filter_aligned = true
val disable_monitors = true // otherwise it generate 1k+ different tl monitors
val serialize_unaligned = true
val serialize_unaligned = false
def guard_monitors[T](callback: Parameters => T)(implicit p: Parameters): Unit = {
if (disable_monitors) {

View File

@@ -7,8 +7,8 @@ import chisel3._
import chisel3.experimental.AffectsChiselPrefix
import chisel3.util._
import freechips.rocketchip.devices.tilelink._
import org.chipsalliance.diplomacy._
import freechips.rocketchip.diplomacy._
import org.chipsalliance.diplomacy.lazymodule.LazyModule
import freechips.rocketchip.prci.ClockSinkParameters
import freechips.rocketchip.regmapper.RegField
import freechips.rocketchip.rocket._
@@ -319,8 +319,7 @@ class RadianceTile private (
// )
val icache = LazyModule(new VortexL1Cache(vortexL1Config.copy(
numBanks = 1,
coreTagWidth = imemSourceWidth
numBanks = 1
)))
val dcache = LazyModule(new VortexL1Cache(vortexL1Config))
// imemNodes.foreach { icache.coresideNode := TLWidthWidget(4) := _ }

View File

@@ -172,6 +172,7 @@ class Vortex(tile: RadianceTile)(implicit p: Parameters)
addResource("/vsrc/vortex/hw/rtl/core/VX_decode.sv")
addResource("/vsrc/vortex/hw/rtl/core/VX_dispatch.sv")
addResource("/vsrc/vortex/hw/rtl/core/VX_dispatch_unit.sv")
addResource("/vsrc/vortex/hw/rtl/core/VX_dispatch_unit_sane.sv")
addResource("/vsrc/vortex/hw/rtl/core/VX_execute.sv")
addResource("/vsrc/vortex/hw/rtl/core/VX_fetch.sv")
addResource("/vsrc/vortex/hw/rtl/core/VX_gather_unit.sv")
@@ -329,6 +330,8 @@ class Vortex(tile: RadianceTile)(implicit p: Parameters)
addResource("/vsrc/vortex/hw/rtl/fpu/VX_fpu_fpnew.sv")
addResource("/vsrc/vortex/hw/rtl/core/VX_fpu_unit.sv")
addResource("/vsrc/TensorDotProductUnit.sv")
// fpnew
// compile order matters; package definitions (ex. fpnew_pkg) should be
// compiled before all the other modules that reference them. They are added

View File

@@ -0,0 +1,96 @@
package radiance.core
import chisel3._
import chisel3.util._
import chiseltest._
import chiseltest.simulator.VerilatorFlags
import org.chipsalliance.cde.config.Parameters
import freechips.rocketchip.tile
import org.scalatest.flatspec.AnyFlatSpec
// Smoke test for MulAddRecFNPipe at latency 2: drives one valid operation and
// checks that validout pulses exactly two clock steps later with the expected
// result on io.out.
class MulAddTest extends AnyFlatSpec with ChiselScalatestTester {
behavior of "MulAddRecFNPipe"
val t = tile.FType.S
it should "do basic arithmetic" in {
test(new MulAddRecFNPipe(2, t.exp, t.sig))
// .withAnnotations(Seq(WriteVcdAnnotation))
{ c =>
c.io.validin.poke(true.B)
// 0: MADD
// 1: MSUB
// 2: NMSUB
// 3: NMADD
c.io.op.poke(0.U)
// rounding mode (p.113 of spec)
// 0: round to nearest, ties to even
c.io.roundingMode.poke(0.U)
c.io.detectTininess.poke(hardfloat.consts.tininess_beforeRounding)
// NOTE(review): these operands look like IEEE-754 single-precision bit
// patterns (0x3f800000 = 1.0f), while the DUT ports are
// expWidth + sigWidth + 1 bits wide. The expected output below
// (0x40c00000) is also not 0x3f800000, which a plain 1.0 * 1.0 + 0 reading
// would give -- confirm the intended operand encoding and expected value.
c.io.a.poke(0x3f800000.U)
c.io.b.poke(0x3f800000.U)
c.io.c.poke(0x00000000.U)
c.clock.step()
// Deassert the request; after one step the result must not be valid yet.
c.io.validin.poke(false.B)
c.io.validout.expect(false.B)
c.clock.step()
// Second step: the result is expected to be valid now.
c.io.validout.expect(true.B)
c.io.out.expect(0x40c00000.U)
c.clock.step()
// The valid pulse lasts a single cycle.
c.io.validout.expect(false.B)
}
}
}
// Exercises TensorDotProductUnit with one operation and a three-cycle stall in
// the middle of the pipeline, then checks the single-precision result.
class TensorDotProductUnitTest extends AnyFlatSpec with ChiselScalatestTester {
  behavior of "TensorDotProductUnit"

  implicit val p: Parameters = Parameters.empty

  it should "pass" in {
    // Optional annotations, kept for reference:
    // .withAnnotations(Seq(VerilatorBackendAnnotation))
    // .withAnnotations(Seq(WriteVcdAnnotation))
    test(new TensorDotProductUnit) { c =>
      // (1,3,5,7)*(2,4,6,8) + 9 = 109, written as single-precision bit patterns
      val aBits = Seq(0x3f800000L, 0x40400000L, 0x40a00000L, 0x40e00000L)
      val bBits = Seq(0x40000000L, 0x40800000L, 0x40c00000L, 0x41000000L)
      val cBits = 0x41100000L

      c.io.in.valid.poke(true.B)
      c.io.stall.poke(false.B)
      for ((bits, lane) <- aBits.zipWithIndex) {
        c.io.in.bits.a(lane).poke(bits.U(64.W))
      }
      for ((bits, lane) <- bBits.zipWithIndex) {
        c.io.in.bits.b(lane).poke(bits.U(64.W))
      }
      c.io.in.bits.c.poke(cBits.U(64.W))
      c.io.out.valid.expect(false.B)

      // First un-stalled cycle: the operation enters the pipeline.
      c.clock.step()
      c.io.in.valid.poke(false.B)
      c.io.out.valid.expect(false.B)

      // stall the pipeline for three cycles
      c.io.stall.poke(true.B)
      c.clock.step(3)

      // release the stall and let the remaining three stages advance
      c.io.stall.poke(false.B)
      c.clock.step(3)

      // 4-cycle latency + stalls
      c.io.out.valid.expect(true.B)
      c.io.out.bits.data.expect(0x42da0000L.U)

      // The result is valid for exactly one cycle.
      c.clock.step()
      c.io.out.valid.expect(false.B)
    }
  }
}