Merge branch 'main' of https://github.com/ucb-bar/radiance into main

2024-06-09 15:26:07 -07:00
parent 3badd75473 ca3fd8b515
commit 17756d5f53
13 changed files with 1791 additions and 40 deletions
--- a/radiance.mk
+++ b/radiance.mk
@@ -2,6 +2,7 @@
 # extra variables/targets ingested by the chipyard make system
 ##############################################################

+VORTEX_SRC_DIR = $(base_dir)/generators/radiance/src/main/resources/vsrc/vortex
 RADPIE_SRC_DIR = $(base_dir)/generators/radiance/radpie
 RADPIE_BUILD_DIR = $(RADPIE_SRC_DIR)/target/release

@@ -10,13 +11,14 @@ RADPIE_BUILD_DIR = $(RADPIE_SRC_DIR)/target/release
 ##################################################################

 # EXTRA_SIM_REQS += radpie
-EXTRA_SIM_LDFLAGS += -L$(RADPIE_BUILD_DIR) -Wl,-rpath,$(RADPIE_BUILD_DIR) -lradpie
+# EXTRA_SIM_LDFLAGS += -L$(RADPIE_BUILD_DIR) -Wl,-rpath,$(RADPIE_BUILD_DIR) -lradpie
 ifeq ($(shell echo $(CONFIG) | grep -E "SynConfig$$"),$(CONFIG))
    EXTRA_SIM_PREPROC_DEFINES += +define+SYNTHESIS +define+NDEBUG +define+DPI_DISABLE
 endif
 EXTRA_SIM_PREPROC_DEFINES += \
 	+define+SIMULATION \
 	+define+GPR_RESET \
+	+define+GPR_DUPLICATED \
 	+define+LSU_DUP_DISABLE \
 	+define+DBG_TRACE_CORE_PIPELINE_VCS \
 	+define+PERF_ENABLE \
@@ -35,3 +37,30 @@ VCS_NONCC_OPTS += +vcs+initreg+random
 .PHONY: radpie
 radpie:
 	cd $(RADPIE_SRC_DIR) && cargo build --release
+
+EXTRA_SIM_REQS += vortex_vsrc.$(CONFIG)
+# doesn't work if we use $(call lookup_srcs) from common.mk, the variable
+# doesn't expand somehow
+# Enumerate every Vortex Verilog/SystemVerilog source (*.sv, *.vh, *.v),
+# following symlinks.  `fd` is preferred when it is installed; otherwise fall
+# back to `find` so that the list is never silently left empty -- an empty
+# list would turn the vortex_vsrc.$(CONFIG) sync step into a no-op.
+# Note that `fd` skips hidden and git-ignored files by default while `find`
+# does not, so the fallback may list a few more files.
+# The assignments are indented with spaces rather than a tab so that make can
+# never mistake them for recipe lines.
+ifneq ($(shell which fd 2> /dev/null),)
+    VORTEX_VLOG_SOURCES := $(shell fd -L -t f -e "sv" -e "vh" -e "v" . $(VORTEX_SRC_DIR))
+else
+    VORTEX_VLOG_SOURCES := $(shell find -L $(VORTEX_SRC_DIR) -type f \( -name "*.sv" -o -name "*.vh" -o -name "*.v" \) 2> /dev/null)
+endif
+# VORTEX_COLLATERAL := $(patsubst $(VORTEX_SRC_DIR)%,$(GEN_COLLATERAL_DIR)%,$(VORTEX_VLOG_SOURCES))
+# check if expanded
+# $(info VORTEX_VLOG_SOURCES: $(VORTEX_VLOG_SOURCES))
+
+# For every Vortex verilog source file, if there's a file with the same
+# basename in gen-collateral/, copy the source over it whenever the contents
+# differ.  This is a hacky way to ensure the changes in the verilog sources
+# are reflected before Verilator/VCS kicks in.  This is necessary when
+# common.mk does not trigger chipyard jar rebuild upon verilog source updates,
+# in which case we need to manually ensure the up-to-date-ness of
+# gen-collateral/.
+#
+# `cmp -s` does the comparison instead of `diff ... &>/dev/null`: `&>` is a
+# bash extension that a POSIX /bin/sh parses as "run in the background, then
+# redirect", which would make the check always succeed when SHELL is not bash.
+# Paths used inside the loop are quoted so they survive word splitting.
+# The rule leaves behind a stamp file named after the target.
+vortex_vsrc.$(CONFIG): $(VORTEX_VLOG_SOURCES)
+	@for file in $(VORTEX_VLOG_SOURCES); do \
+		filename=$$(basename "$$file"); \
+		target="$(GEN_COLLATERAL_DIR)/$$filename"; \
+		if [ -f "$$target" ] && ! cmp -s "$$file" "$$target"; then \
+			cp -v "$$file" "$$target"; \
+		fi; \
+	done
+	touch $@
--- a/src/main/resources/vsrc/TensorDotProductUnit.sv
+++ b/src/main/resources/vsrc/TensorDotProductUnit.sv
--- a/src/main/resources/vsrc/vortex
+++ b/src/main/resources/vsrc/vortex
--- a/src/main/scala/radiance/core/TensorDPU.scala
+++ b/src/main/scala/radiance/core/TensorDPU.scala
@@ -0,0 +1,226 @@
+// See LICENSE.SiFive for license details.
+// See LICENSE.Berkeley for license details.
+
+package radiance.core
+
+import chisel3._
+import chisel3.util._
+import freechips.rocketchip.tile
+
+/** Implements the four-element dot product (FEDP) unit in Volta Tensor Cores.
+  *
+  * Thin wrapper around `DotProductPipe`.  Each `fLen`-bit operand goes through
+  * `recode` and `unbox` before it reaches the pipe, and the pipe's result goes
+  * through `box` and `ieee` before it is driven onto `io.out`.  Presumably the
+  * ports carry IEEE-754 single-precision values while the pipe works on
+  * hardfloat's recoded format -- TODO confirm against the helpers inherited
+  * from `tile.HasFPUParameters`.
+  *
+  * Ports:
+  *  - `io.in`: valid-qualified operands; `a` and `b` are vectors of
+  *    `dotProductDim` elements and `c` is a single value.
+  *  - `io.stall`: forwarded unchanged to the pipe's `stall` input.
+  *  - `io.out`: the pipe's valid bit together with the converted result.
+  */
+class TensorDotProductUnit extends Module with tile.HasFPUParameters {
+  val fLen = 32 // fLen/minFLen/xLen: presumably required by tile.HasFPUParameters -- TODO confirm
+  val minFLen = 32
+  def xLen = 32
+  val dotProductDim = 4 // number of (a, b) element pairs per dot product
+
+  val io = IO(new Bundle {
+    val in = Flipped(Valid(new Bundle {
+      val a = Vec(dotProductDim, Bits((fLen).W))
+      val b = Vec(dotProductDim, Bits((fLen).W))
+      val c = Bits((fLen).W)
+    }))
+    val stall = Input(Bool())
+    val out = Valid(new Bundle {
+      val data = Bits((fLen).W)
+    })
+  })
+
+  val t = tile.FType.S // supplies the exp/sig widths handed to DotProductPipe
+  // Convert every operand before it enters the pipe (see the class comment).
+  val in1 = io.in.bits.a.map(x => unbox(recode(x, S), S, Some(tile.FType.S)))
+  val in2 = io.in.bits.b.map(x => unbox(recode(x, S), S, Some(tile.FType.S)))
+  val in3 = unbox(recode(io.in.bits.c, S), S, Some(tile.FType.S))
+
+  val dpu = Module(new DotProductPipe(dotProductDim, t.exp, t.sig))
+  dpu.io.in.valid := io.in.valid
+  dpu.io.in.bits.a := in1
+  dpu.io.in.bits.b := in2
+  dpu.io.in.bits.c := in3
+  dpu.io.stall := io.stall
+
+  io.out.valid := dpu.io.out.valid
+  io.out.bits.data := ieee(box(dpu.io.out.bits.data, S)) // convert the pipe's result back for the port
+}
+
+/** Module form of a valid-qualified pipeline register with a stall input.
+  * Copied from chisel3.util.Pipe.
+  *
+  * The hardware itself is built by the companion object's `apply`, which
+  * currently rejects any `latency` other than 1.
+  *
+  * @param gen     Chisel type of the payload carried by `enq`/`deq`
+  * @param latency number of stages; forwarded to the companion `apply`
+  */
+class StallingPipe[T <: Data](val gen: T, val latency: Int = 1) extends Module {
+  /** A non-ambiguous name of this `StallingPipe` for use in generated Verilog
+   *  names. Includes the latency cycle count in the name as well as the
+   *  parameterized generator's `typeName`, presumably yielding something like
+   *  `StallingPipe1_UInt4` -- TODO confirm what `simpleClassName` returns.
+    */
+  override def desiredName = s"${simpleClassName(this.getClass)}${latency}_${gen.typeName}"
+
+  class StallingPipeIO extends Bundle {
+    val stall = Input(Bool()) // passed to the companion `apply` as its `stall` argument
+    val enq = Input(Valid(gen))
+    val deq = Output(Valid(gen))
+  }
+
+  val io = IO(new StallingPipeIO)
+
+  io.deq <> StallingPipe(io.stall, io.enq, latency)
+}
+/** Factory methods that build a stallable, valid-qualified pipeline register
+  * directly in the caller's module. */
+object StallingPipe {
+  import chisel3.experimental.prefix
+  /** Registers `enqValid`/`enqBits` for one cycle, freezing while `stall` is high.
+    *
+    *  - The valid register resets to false and takes `enqValid` on every cycle
+    *    in which `stall` is low.
+    *  - The bits register has no reset value and takes `enqBits` only when
+    *    `stall` is low and `enqValid` is high.
+    *
+    * While `stall` is high both registers keep their contents, so the returned
+    * `valid` stays asserted for an entry that is being held.
+    *
+    * @param stall    freezes both registers while high
+    * @param enqValid valid bit of the incoming entry
+    * @param enqBits  payload of the incoming entry
+    * @param latency  must be 1; any other value fails the `require`
+    * @return a `Valid` wire driven by the two registers
+    */
+  def apply[T <: Data](stall: Bool, enqValid: Bool, enqBits: T, latency: Int): Valid[T] = {
+    require(latency == 1, "StallingPipe only supports latency equals one!")
+    prefix("stalling_pipe") {
+      val out = Wire(Valid(chiselTypeOf(enqBits)))
+      val v = RegEnable(enqValid, false.B, !stall)
+      val b = RegEnable(enqBits, !stall && enqValid)
+      out.valid := v
+      out.bits := b
+      out
+    }
+  }
+  /** Same as the four-argument `apply` with `latency` fixed to 1. */
+  def apply[T <: Data](stall: Bool, enqValid: Bool, enqBits: T): Valid[T] = {
+    apply(stall, enqValid, enqBits, 1)
+  }
+  /** Convenience overload that takes the incoming entry as a `Valid` bundle. */
+  def apply[T <: Data](stall: Bool, enq: Valid[T], latency: Int = 1): Valid[T] = {
+    apply(stall, enq.valid, enq.bits, latency)
+  }
+}
+
+// Computes d = a(0)*b(0) + ... + a(3)*b(3) + c.
+/* Fully pipelined with a fixed latency of 4 cycles: a StallingPipe register
+ * stage follows each of the four arithmetic steps below (mul, add1, add2,
+ * acc), and every stage is frozen by the same `io.stall`.
+ *
+ * `dim` must be 4 (see the `require`); `expWidth` and `sigWidth` are handed to
+ * every hardfloat unit.  Operands and result are `recFLen` = expWidth +
+ * sigWidth + 1 bits wide -- presumably hardfloat's recoded format, TODO
+ * confirm.  Each hardfloat unit gets its own rounding mode, so intermediate
+ * results are presumably rounded after every step rather than once at the end
+ * -- TODO confirm.
+ */
+class DotProductPipe(dim: Int, expWidth: Int, sigWidth: Int) extends Module {
+  require(dim == 4, "DPU currently only supports dimension 4")
+
+  val recFLen = expWidth + sigWidth + 1
+  val io = IO(new Bundle {
+    val in = Flipped(Valid(new Bundle {
+      val a = Vec(4, Bits((recFLen).W))
+      val b = Vec(4, Bits((recFLen).W))
+      val c = Bits((recFLen).W)
+      // val roundingMode   = UInt(3.W)
+      // val detectTininess = UInt(1.W)
+    }))
+    val stall = Input(Bool())
+    val out = Valid(new Bundle {
+      val data = Bits((recFLen).W)
+    })
+  })
+
+  val mul = Seq.fill(dim)(Module(new hardfloat.MulRecFN(expWidth, sigWidth)))
+  mul.zipWithIndex.foreach { case (m, i) =>
+    // FIXME: these settings are arbitrary (hard-wired here, not taken from io)
+    m.io.roundingMode := hardfloat.consts.round_near_even
+    m.io.detectTininess := hardfloat.consts.tininess_afterRounding
+    m.io.a := io.in.bits.a(i)
+    m.io.b := io.in.bits.b(i)
+  }
+
+  val mulStageOut = StallingPipe(io.stall, io.in.valid, VecInit(mul.map(_.io.out)))
+  val mulStageC   = StallingPipe(io.stall, io.in.valid, io.in.bits.c) // c is only consumed by the acc step, so it is delayed alongside the products
+
+  // mul stage end -------------------------------------------------------------
+
+  val add1 = Seq.fill(dim / 2)(Module(new hardfloat.AddRecFN(expWidth, sigWidth)))
+  add1.zipWithIndex.foreach { case (a, i) =>
+    a.io.subOp := 0.U // FIXME; presumably 0 selects addition -- TODO confirm
+    a.io.a := mulStageOut.bits(2 * i + 0)
+    a.io.b := mulStageOut.bits(2 * i + 1)
+    a.io.roundingMode := hardfloat.consts.round_near_even
+    a.io.detectTininess := hardfloat.consts.tininess_afterRounding
+  }
+
+  val add1StageOut = StallingPipe(io.stall, mulStageOut.valid, VecInit(add1.map(_.io.out)))
+  val add1StageC   = StallingPipe(io.stall, mulStageOut.valid, mulStageC.bits)
+
+  // add1 stage end ------------------------------------------------------------
+
+  val add2 = Module(new hardfloat.AddRecFN(expWidth, sigWidth))
+  add2.io.subOp := 0.U // FIXME
+  assert(add1StageOut.bits.length == 2) // Vec length is a Scala Int, so this is evaluated during elaboration
+  add2.io.a := add1StageOut.bits(0)
+  add2.io.b := add1StageOut.bits(1)
+  add2.io.roundingMode := hardfloat.consts.round_near_even
+  add2.io.detectTininess := hardfloat.consts.tininess_afterRounding
+
+  val add2StageOut = StallingPipe(io.stall, add1StageOut.valid, add2.io.out)
+  val add2StageC   = StallingPipe(io.stall, add1StageOut.valid, add1StageC.bits)
+
+  // add2 stage end ------------------------------------------------------------
+
+  val acc = Module(new hardfloat.AddRecFN(expWidth, sigWidth))
+  acc.io.subOp := 0.U // FIXME
+  acc.io.a := add2StageOut.bits
+  acc.io.b := add2StageC.bits // the delayed copy of c joins the sum here
+  acc.io.roundingMode := hardfloat.consts.round_near_even
+  acc.io.detectTininess := hardfloat.consts.tininess_afterRounding
+
+  val accStageOut = StallingPipe(io.stall, add2StageOut.valid, acc.io.out)
+  // FIXME: exception output ignored (no exceptionFlags of the units above is read)
+
+  // acc stage end -------------------------------------------------------------
+
+  io.out.valid := accStageOut.valid
+  io.out.bits.data := accStageOut.bits
+}
+/** Multiply-add unit assembled from hardfloat's `MulAddRecFNToRaw_preMul`,
+  * `MulAddRecFNToRaw_postMul` and `RoundRawFNToRecFN`, with optional register
+  * stages inserted through `Pipe`.
+  *
+  * Data path: preMul -> integer `mulAddA * mulAddB +& mulAddC` -> [register
+  * stage when latency > 0] -> postMul -> [register stage when latency == 2]
+  * -> rounding.  `io.validout` is `io.validin` delayed by the same stages.
+  * Unlike `DotProductPipe`, there is no stall input.
+  *
+  * @param latency  selects the register stages described above; values above
+  *                 2 are rejected by the `require`
+  * @param expWidth exponent width passed to the hardfloat units
+  * @param sigWidth significand width passed to the hardfloat units
+  */
+class MulAddRecFNPipe(latency: Int, expWidth: Int, sigWidth: Int) extends Module {
+  require(latency <= 2)
+
+  val io = IO(new Bundle {
+    val validin = Input(Bool())
+    val op = Input(Bits(2.W))
+    val a = Input(Bits((expWidth + sigWidth + 1).W))
+    val b = Input(Bits((expWidth + sigWidth + 1).W))
+    val c = Input(Bits((expWidth + sigWidth + 1).W))
+    val roundingMode   = Input(UInt(3.W))
+    val detectTininess = Input(UInt(1.W))
+    val out = Output(Bits((expWidth + sigWidth + 1).W))
+    val exceptionFlags = Output(Bits(5.W))
+    val validout = Output(Bool())
+  })
+
+  //------------------------------------------------------------------------
+  //------------------------------------------------------------------------
+
+  val mulAddRecFNToRaw_preMul = Module(new hardfloat.MulAddRecFNToRaw_preMul(expWidth, sigWidth))
+  val mulAddRecFNToRaw_postMul = Module(new hardfloat.MulAddRecFNToRaw_postMul(expWidth, sigWidth))
+
+  mulAddRecFNToRaw_preMul.io.op := io.op
+  mulAddRecFNToRaw_preMul.io.a  := io.a
+  mulAddRecFNToRaw_preMul.io.b  := io.b
+  mulAddRecFNToRaw_preMul.io.c  := io.c
+
+  val mulAddResult =
+      (mulAddRecFNToRaw_preMul.io.mulAddA *
+           mulAddRecFNToRaw_preMul.io.mulAddB) +&
+          mulAddRecFNToRaw_preMul.io.mulAddC
+
+  val valid_stage0 = Wire(Bool())
+  val roundingMode_stage0 = Wire(UInt(3.W))
+  val detectTininess_stage0 = Wire(UInt(1.W))
+
+  val postmul_regs = if(latency>0) 1 else 0 // first register stage is present for any positive latency
+  mulAddRecFNToRaw_postMul.io.fromPreMul   := Pipe(io.validin, mulAddRecFNToRaw_preMul.io.toPostMul, postmul_regs).bits
+  mulAddRecFNToRaw_postMul.io.mulAddResult := Pipe(io.validin, mulAddResult, postmul_regs).bits
+  mulAddRecFNToRaw_postMul.io.roundingMode := Pipe(io.validin, io.roundingMode, postmul_regs).bits
+  roundingMode_stage0                      := Pipe(io.validin, io.roundingMode, postmul_regs).bits
+  detectTininess_stage0                    := Pipe(io.validin, io.detectTininess, postmul_regs).bits
+  valid_stage0                             := Pipe(io.validin, false.B, postmul_regs).valid // payload is a dummy; only the delayed valid bit is used
+
+  //------------------------------------------------------------------------
+  //------------------------------------------------------------------------
+
+  val roundRawFNToRecFN = Module(new hardfloat.RoundRawFNToRecFN(expWidth, sigWidth, 0))
+
+  val round_regs = if(latency==2) 1 else 0 // second register stage only for latency 2
+  roundRawFNToRecFN.io.invalidExc         := Pipe(valid_stage0, mulAddRecFNToRaw_postMul.io.invalidExc, round_regs).bits
+  roundRawFNToRecFN.io.in                 := Pipe(valid_stage0, mulAddRecFNToRaw_postMul.io.rawOut, round_regs).bits
+  roundRawFNToRecFN.io.roundingMode       := Pipe(valid_stage0, roundingMode_stage0, round_regs).bits
+  roundRawFNToRecFN.io.detectTininess     := Pipe(valid_stage0, detectTininess_stage0, round_regs).bits
+  io.validout                             := Pipe(valid_stage0, false.B, round_regs).valid // dummy payload again; valid bit only
+
+  roundRawFNToRecFN.io.infiniteExc := false.B
+
+  io.out            := roundRawFNToRecFN.io.out
+  io.exceptionFlags := roundRawFNToRecFN.io.exceptionFlags
+}
--- a/src/main/scala/radiance/memory/Coalescing.scala
+++ b/src/main/scala/radiance/memory/Coalescing.scala
@@ -5,7 +5,8 @@ package radiance.memory
 import chisel3._
 import chisel3.util._
 import org.chipsalliance.cde.config.{Field, Parameters}
-import freechips.rocketchip.diplomacy._
+import freechips.rocketchip.diplomacy.{IdRange, AddressSet, BufferParams}
+import org.chipsalliance.diplomacy.lazymodule.{LazyModule, LazyModuleImp}
 import freechips.rocketchip.util.{Code, MultiPortQueue, OnePortLanePositionedQueue}
 import freechips.rocketchip.unittest._
 import freechips.rocketchip.tilelink._
--- a/src/main/scala/radiance/memory/VortexCache.scala
+++ b/src/main/scala/radiance/memory/VortexCache.scala
@@ -4,6 +4,7 @@ import chisel3._
 import chisel3.util._
 import chisel3.experimental._
 import freechips.rocketchip.diplomacy._
+import org.chipsalliance.diplomacy.lazymodule.{LazyModule, LazyModuleImp}
 import freechips.rocketchip.tilelink._
 import org.chipsalliance.cde.config.{Parameters, Field}

@@ -14,15 +15,11 @@ case class VortexL1Config(
                           numBanks: Int,
                           inputSize: Int, // This is the read/write granularity of the L1 cache
                           cacheLineSize: Int,
-                           coreTagWidth: Int,
                           writeInfoReqQSize: Int,
                           mshrSize: Int,
                           memSideSourceIds: Int,
                           uncachedAddrSets: Seq[AddressSet]
 ) {
-  def coreTagPlusSizeWidth: Int = {
-    log2Ceil(inputSize) + coreTagWidth
-  }
  // NOTE: This assertion depends on the fact that the Vortex cache is
  // configured to have 1 bank, and that it uses MSHR id as the tag of
  // memory-side requests.  Otherwise, it will append bank id to the tag as
@@ -39,7 +36,6 @@ object defaultVortexL1Config
      numBanks = 4,
      inputSize = 16,
      cacheLineSize = 16,
-      coreTagWidth = 8,
      writeInfoReqQSize = 16,
      mshrSize = 8,
      memSideSourceIds = 8,
@@ -95,13 +91,18 @@ class VortexBankPassThrough(config: VortexL1Config)(implicit p: Parameters)
    )
  )

+  // HACK: Set arbitrarily since we cannot query the coresideNode's sourceId
+  // here. See comment on the require below.
+  // @perf: This is quite high
+  val sourceWidth = 9
+
  // Master node to downstream
  val clientParam = Seq(
    TLMasterPortParameters.v1(
      clients = Seq(
        TLMasterParameters.v1(
          name = "VortexBankPassthrough",
-          sourceId = IdRange(0, 1 << config.coreTagWidth),
+          sourceId = IdRange(0, 1 << sourceWidth),
          supportsProbe = TransferSizes(1, config.cacheLineSize),
          supportsGet = TransferSizes(1, config.cacheLineSize),
          supportsPutFull = TransferSizes(1, config.cacheLineSize),
@@ -121,6 +122,16 @@ class VortexBankPassThrough(config: VortexL1Config)(implicit p: Parameters)
    val (upstream, _) = coresideNode.in(0)
    val (downstream, _) = vxCacheFetchNode.out(0)

+    // Make sure the outgoing edge of this passthrough has enough sourceIds
+    // to encompass those of the core-side incoming edge.  This is an
+    // unfortunate hack due to not doing proper parameter negotiation across
+    // disconnected Diplomacy graphs.
+    // println(s"${upstream.params.sourceBits} <= ${downstream.params.sourceBits}")
+    require(upstream.params.sourceBits <= downstream.params.sourceBits,
+            "mem-side source of L1 cache truncates core-side source! " +
+            "Try lowering core/coalescer srcIds, or increasing sourceWidth " +
+            "for VortexBankPassThrough")
+
    downstream.a <> upstream.a
    upstream.d <> downstream.d
  }
@@ -197,13 +208,17 @@ class VortexBankImp(
    outer: VortexBank,
    config: VortexL1Config
 ) extends LazyModuleImp(outer) {
+  val (tlInFromCoal, _) = outer.coresideNode.in.head
+  val coreTagWidth = tlInFromCoal.a.bits.source.getWidth
+  val coreTagWidthPlusSize = coreTagWidth + log2Ceil(config.inputSize)
+
  val vxCache = Module(
    new VX_cache_top(
      WORD_SIZE = config.inputSize,
      // distribute total size across numBanks
      CACHE_SIZE = config.cacheSize / config.numBanks,
      CACHE_LINE_SIZE = config.cacheLineSize,
-      CORE_TAG_WIDTH = config.coreTagPlusSizeWidth,
+      CORE_TAG_WIDTH = coreTagWidthPlusSize,
      MSHR_SIZE = config.mshrSize
    )
  );
@@ -232,7 +247,7 @@ class VortexBankImp(

  class ReadReqInfo(config: VortexL1Config) extends Bundle {
    val size = UInt(log2Ceil(4).W + 1)
-    val id = UInt(config.coreTagWidth.W)
+    val id = UInt(coreTagWidth.W)
  }

  val coreWriteReqQueue = Module(
@@ -247,8 +262,6 @@ class VortexBankImp(

  // Translate TL request from Coalescer to requests for VX_cache
  def TLReq2VXReq = {
-    val (tlInFromCoal, _) = outer.coresideNode.in.head
-
    // coal -> vxCache
    tlInFromCoal.a.ready :=
      vxCache.io.core_req_ready && coreWriteReqQueue.io.enq.ready // not optimal
@@ -269,13 +282,9 @@ class VortexBankImp(
    readReqInfo.id := tlInFromCoal.a.bits.source
    readReqInfo.size := tlInFromCoal.a.bits.size
    assert(readReqInfo.id.getWidth == tlInFromCoal.a.bits.source.getWidth,
-      s"id width mismatch; coalescer ${tlInFromCoal.a.bits.source.getWidth}, cache ${readReqInfo.id.getWidth}")
+      s"id width mismatch; core-side ${tlInFromCoal.a.bits.source.getWidth}, cache-side ${readReqInfo.id.getWidth}")
    assert(readReqInfo.size.getWidth == tlInFromCoal.a.bits.size.getWidth,
-      s"size width mismatch; coalescer ${tlInFromCoal.a.bits.size.getWidth}, cache ${readReqInfo.size.getWidth}")
-    assert(readReqInfo.id.getWidth == tlInFromCoal.a.bits.source.getWidth,
-      s"id width mismatch; coalescer ${tlInFromCoal.a.bits.source.getWidth}, cache ${readReqInfo.id.getWidth}")
-    assert(readReqInfo.size.getWidth == tlInFromCoal.a.bits.size.getWidth,
-      s"size width mismatch; coalescer ${tlInFromCoal.a.bits.size.getWidth}, cache ${readReqInfo.size.getWidth}")
+      s"size width mismatch; core-side ${tlInFromCoal.a.bits.size.getWidth}, cache-side ${readReqInfo.size.getWidth}")
    // ignore param, size, corrupt
    vxCache.io.core_req_tag := readReqInfo.asTypeOf(vxCache.io.core_req_tag)

--- a/src/main/scala/radiance/subsystem/Configs.scala
+++ b/src/main/scala/radiance/subsystem/Configs.scala
@@ -42,7 +42,7 @@ class WithRadianceCores(
  useVxCache: Boolean
 ) extends Config((site, _, up) => {
  case TilesLocated(`location`) => {
-    val prev = up(TilesLocated(`location`), site)
+    val prev = up(TilesLocated(`location`))
    val idOffset = prev.size
    val vortex = RadianceTileParams(
      core = VortexCoreParams(fpu = None),
@@ -87,7 +87,7 @@ class WithRadianceGemmini(location: HierarchicalLocation,
                          crossing: RocketCrossingParams,
                          dim: Int, accSizeInKB: Int, tileSize: Int) extends Config((site, _, up) => {
  case TilesLocated(`location`) => {
-    val prev = up(TilesLocated(`location`), site)
+    val prev = up(TilesLocated(`location`))
    val idOffset = prev.size
    if (idOffset == 0) {
      println("******WARNING****** gemmini tile id is 0! radiance tiles in the same cluster needs to be before gemmini")
@@ -171,7 +171,7 @@ class WithFuzzerCores(
  useVxCache: Boolean
 ) extends Config((site, _, up) => {
  case TilesLocated(InSubsystem) => {
-    val prev = up(TilesLocated(InSubsystem), site)
+    val prev = up(TilesLocated(InSubsystem))
    val idOffset = prev.size
    val fuzzer = FuzzerTileParams(
      core = VortexCoreParams(fpu = None),
@@ -202,11 +202,11 @@ class WithRadianceCluster(
  case PossibleTileLocations => up(PossibleTileLocations) :+ InCluster(clusterId)
 })

-// `nSrcIds`: number of source IDs for dmem requests on each SIMT lane
+// `nSrcIds`: number of source IDs for each mem lane.  This is for all warps
 class WithSimtConfig(nWarps: Int = 4, nCoreLanes: Int = 4, nMemLanes: Int = 4, nSrcIds: Int = 8)
 extends Config((site, _, up) => {
  case SIMTCoreKey => {
-    Some(up(SIMTCoreKey, site).getOrElse(SIMTCoreParams()).copy(
+    Some(up(SIMTCoreKey).getOrElse(SIMTCoreParams()).copy(
      nWarps = nWarps,
      nCoreLanes = nCoreLanes,
      nMemLanes = nMemLanes,
@@ -228,22 +228,18 @@ extends Config((site, _, _) => {

 class WithPriorityCoalXbar extends Config((site, _, up) => {
  case CoalXbarKey => {
-    Some(up(CoalXbarKey, site).getOrElse(CoalXbarParam))
+    Some(up(CoalXbarKey).getOrElse(CoalXbarParam))
  }
 })

-class WithVortexL1Banks(nBanks: Int = 4) extends Config ((site, _, up) => {
+class WithVortexL1Banks(nBanks: Int = 4) extends Config ((site, here, up) => {
  case VortexL1Key => {
    Some(defaultVortexL1Config.copy(
      numBanks = nBanks,
-      inputSize = up(SIMTCoreKey).get.nMemLanes * 4,
-      cacheLineSize = up(SIMTCoreKey).get.nMemLanes * 4,
+      inputSize = up(SIMTCoreKey).get.nMemLanes * 4/*32b word*/,
+      cacheLineSize = up(SIMTCoreKey).get.nMemLanes * 4/*32b word*/,
      memSideSourceIds = 16,
      mshrSize = 16,
-      coreTagWidth = log2Ceil(up(SIMTCoreKey).get.nSrcIds.max(up(CoalescerKey) match {
-        case Some(key) => key.numNewSrcIds
-        case None => 0
-      })) + log2Ceil(up(SIMTCoreKey).get.nMemLanes) + 1
    ))
  }
 })
@@ -254,7 +250,7 @@ class WithVortexL1Banks(nBanks: Int = 4) extends Config ((site, _, up) => {
 // to e.g. compare waveforms.
 class WithCoalescer(nNewSrcIds: Int = 8, enable : Boolean = true) extends Config((site, _, up) => {
  case CoalescerKey => {
-    val (nLanes, numOldSrcIds) = up(SIMTCoreKey, site) match {
+    val (nLanes, numOldSrcIds) = up(SIMTCoreKey) match {
      case Some(param) => (param.nMemLanes, param.nSrcIds)
      case None => (1,1)
    }
@@ -266,7 +262,7 @@ class WithCoalescer(nNewSrcIds: Int = 8, enable : Boolean = true) extends Config

    // If instantiating L1 cache, the maximum coalescing size should match the
    // cache line size
-    val maxCoalSizeInBytes = up(VortexL1Key, site) match {
+    val maxCoalSizeInBytes = up(VortexL1Key) match {
      case Some(param) => param.inputSize
      case None => sbusWidthInBytes
    }
@@ -291,7 +287,7 @@ class WithNCustomSmallRocketCores(
                             crossing: RocketCrossingParams = RocketCrossingParams()
                           ) extends Config((site, here, up) => {
  case TilesLocated(InSubsystem) => {
-    val prev = up(TilesLocated(InSubsystem), site)
+    val prev = up(TilesLocated(InSubsystem))
    val idOffset = overrideIdOffset.getOrElse(prev.size)
    val med = RocketTileParams(
      core = RocketCoreParams(fpu = None),
@@ -325,7 +321,7 @@ class WithNCustomSmallRocketCores(
 class WithExtGPUMem(address: BigInt = BigInt("0x100000000", 16),
                    size: BigInt = 0x80000000) extends Config((site, here, up) => {
  case GPUMemory() => Some(GPUMemParams(address, size))
-  case ExtMem => up(ExtMem, site).map(x => {
+  case ExtMem => up(ExtMem).map(x => {
    val gap = address - x.master.base - x.master.size
    x.copy(master = x.master.copy(size = x.master.size + gap + size))
  })
--- a/src/main/scala/radiance/tile/RadianceCluster.scala
+++ b/src/main/scala/radiance/tile/RadianceCluster.scala
@@ -89,7 +89,7 @@ class RadianceCluster (
  val stride_by_word = true
  val filter_aligned = true
  val disable_monitors = true // otherwise it generate 1k+ different tl monitors
-  val serialize_unaligned = true
+  val serialize_unaligned = false

  def guard_monitors[T](callback: Parameters => T)(implicit p: Parameters): Unit = {
    if (disable_monitors) {
--- a/src/main/scala/radiance/tile/RadianceTile.scala
+++ b/src/main/scala/radiance/tile/RadianceTile.scala
@@ -7,8 +7,8 @@ import chisel3._
 import chisel3.experimental.AffectsChiselPrefix
 import chisel3.util._
 import freechips.rocketchip.devices.tilelink._
-import org.chipsalliance.diplomacy._
 import freechips.rocketchip.diplomacy._
+import org.chipsalliance.diplomacy.lazymodule.LazyModule
 import freechips.rocketchip.prci.ClockSinkParameters
 import freechips.rocketchip.regmapper.RegField
 import freechips.rocketchip.rocket._
@@ -319,8 +319,7 @@ class RadianceTile private (
      // )

      val icache = LazyModule(new VortexL1Cache(vortexL1Config.copy(
-        numBanks = 1,
-        coreTagWidth = imemSourceWidth
+        numBanks = 1
      )))
      val dcache = LazyModule(new VortexL1Cache(vortexL1Config))
      // imemNodes.foreach { icache.coresideNode := TLWidthWidget(4) := _ }
--- a/src/main/scala/radiance/tile/VortexCore.scala
+++ b/src/main/scala/radiance/tile/VortexCore.scala
@@ -172,6 +172,7 @@ class Vortex(tile: RadianceTile)(implicit p: Parameters)
  addResource("/vsrc/vortex/hw/rtl/core/VX_decode.sv")
  addResource("/vsrc/vortex/hw/rtl/core/VX_dispatch.sv")
  addResource("/vsrc/vortex/hw/rtl/core/VX_dispatch_unit.sv")
+  addResource("/vsrc/vortex/hw/rtl/core/VX_dispatch_unit_sane.sv")
  addResource("/vsrc/vortex/hw/rtl/core/VX_execute.sv")
  addResource("/vsrc/vortex/hw/rtl/core/VX_fetch.sv")
  addResource("/vsrc/vortex/hw/rtl/core/VX_gather_unit.sv")
@@ -329,6 +330,8 @@ class Vortex(tile: RadianceTile)(implicit p: Parameters)
  addResource("/vsrc/vortex/hw/rtl/fpu/VX_fpu_fpnew.sv")
  addResource("/vsrc/vortex/hw/rtl/core/VX_fpu_unit.sv")

+  addResource("/vsrc/TensorDotProductUnit.sv")
+
  // fpnew
  // compile order matters; package definitions (ex. fpnew_pkg) should be
  // compiled before all the other modules that reference them.  They are added
--- a/src/test/scala/coalescing/CoalescerXbarUnitTest.scala.unused
+++ b/src/test/scala/coalescing/CoalescerXbarUnitTest.scala.unused
--- a/src/test/scala/coalescing/CoalescingUnitTest.scala.unused
+++ b/src/test/scala/coalescing/CoalescingUnitTest.scala.unused
--- a/src/test/scala/radiance/TensorDPUTest.scala
+++ b/src/test/scala/radiance/TensorDPUTest.scala
@@ -0,0 +1,96 @@
+package radiance.core
+
+import chisel3._
+import chisel3.util._
+import chiseltest._
+import chiseltest.simulator.VerilatorFlags
+import org.chipsalliance.cde.config.Parameters
+import freechips.rocketchip.tile
+import org.scalatest.flatspec.AnyFlatSpec
+
+// Smoke test for MulAddRecFNPipe configured with two register stages.
+class MulAddTest extends AnyFlatSpec with ChiselScalatestTester {
+  behavior of "MulAddRecFNPipe"
+
+  val t = tile.FType.S
+  it should "do basic arithmetic" in {
+    // Add `.withAnnotations(Seq(WriteVcdAnnotation))` to the test call below
+    // to dump a waveform.
+    test(new MulAddRecFNPipe(2, t.exp, t.sig)) { dut =>
+      val io = dut.io
+
+      // Operation select, as listed by the original author:
+      //   0: MADD, 1: MSUB, 2: NMSUB, 3: NMADD
+      io.op.poke(0.U)
+      // Rounding mode 0 is round to nearest, ties to even (p.113 of spec).
+      io.roundingMode.poke(0.U)
+      io.detectTininess.poke(hardfloat.consts.tininess_beforeRounding)
+
+      // NOTE(review): 0x3f800000 is 1.0 as an IEEE-754 single, but the a/b/c
+      // ports are exp+sig+1 bits wide.  Confirm which encoding the ports
+      // expect and whether 0x40c00000 is really the right expected output.
+      io.a.poke(0x3f800000.U)
+      io.b.poke(0x3f800000.U)
+      io.c.poke(0x00000000.U)
+      io.validin.poke(true.B)
+
+      // One cycle in: the request is still in flight.
+      dut.clock.step()
+      io.validin.poke(false.B)
+      io.validout.expect(false.B)
+
+      // Two cycles in: the result is presented for exactly one cycle.
+      dut.clock.step()
+      io.validout.expect(true.B)
+      io.out.expect(0x40c00000.U)
+
+      dut.clock.step()
+      io.validout.expect(false.B)
+    }
+  }
+}
+
+// Checks one dot product through TensorDotProductUnit, including a stall in
+// the middle of the pipeline.
+class TensorDotProductUnitTest extends AnyFlatSpec with ChiselScalatestTester {
+  behavior of "TensorDotProductUnit"
+
+  implicit val p: Parameters = Parameters.empty
+
+  it should "pass" in {
+    // Optional annotations for the test call below:
+    //   .withAnnotations(Seq(VerilatorBackendAnnotation))
+    //   .withAnnotations(Seq(WriteVcdAnnotation))
+    test(new TensorDotProductUnit) { dut =>
+      // (1,3,5,7)*(2,4,6,8) + 9 = 109, written as single-precision bit patterns
+      val aBits = Seq(0x3f800000L, 0x40400000L, 0x40a00000L, 0x40e00000L)
+      val bBits = Seq(0x40000000L, 0x40800000L, 0x40c00000L, 0x41000000L)
+      val cBits = 0x41100000L
+
+      dut.io.in.valid.poke(true.B)
+      dut.io.stall.poke(false.B)
+      for ((bits, lane) <- aBits.zipWithIndex) {
+        dut.io.in.bits.a(lane).poke(bits.U(64.W))
+      }
+      for ((bits, lane) <- bBits.zipWithIndex) {
+        dut.io.in.bits.b(lane).poke(bits.U(64.W))
+      }
+      dut.io.in.bits.c.poke(cBits.U(64.W))
+
+      dut.io.out.valid.expect(false.B)
+
+      // First cycle runs unstalled and accepts the operands.
+      dut.clock.step()
+      dut.io.in.valid.poke(false.B)
+      dut.io.out.valid.expect(false.B)
+
+      // Stall the pipeline for three cycles, then release it.
+      dut.io.stall.poke(true.B)
+      dut.clock.step(3)
+      dut.io.stall.poke(false.B)
+
+      // Three more unstalled cycles: 4-cycle latency + stalls.
+      dut.clock.step(3)
+
+      dut.io.out.valid.expect(true.B)
+      dut.io.out.bits.data.expect(0x42da0000L.U)
+
+      // The result must be valid for a single cycle only.
+      dut.clock.step()
+
+      dut.io.out.valid.expect(false.B)
+    }
+  }
+}
+