diff --git a/src/main/resources/csrc/SimMemTrace.cc b/src/main/resources/csrc/SimMemTrace.cc index 08540bc..6c08858 100644 --- a/src/main/resources/csrc/SimMemTrace.cc +++ b/src/main/resources/csrc/SimMemTrace.cc @@ -22,7 +22,9 @@ MemTraceReader::MemTraceReader(const std::string &filename) infile.open(filename); if (infile.fail()) { - fprintf(stderr, "failed to open file %s\n", filename.c_str()); + fprintf(stderr, "MemTraceReader: error: failed to open file %s\n", + filename.c_str()); + exit(EXIT_FAILURE); } } @@ -60,8 +62,6 @@ void MemTraceReader::parse(const bool has_source) { } if (!(infile >> line.cycle >> loadstore >> line.core_id >> line.lane_id)) { - printf("char=[%c]\n", infile.peek()); - // assert(!infile.eof()); error(fileline, "failed parsing cycle..lane_id"); } if (has_source && !(infile >> source)) { @@ -101,8 +101,6 @@ MemTraceLine MemTraceReader::read_trace_at(const long cycle, const int lane_id, MemTraceLine line; line.valid = false; - // printf("tick(): cycle=%ld\n", cycle); - if (finished()) { return line; } @@ -112,7 +110,11 @@ MemTraceLine MemTraceReader::read_trace_at(const long cycle, const int lane_id, // the next line is in the future. if (line.cycle < cycle) { long fileline = read_pos - std::cbegin(trace_buf) + 1; - error(fileline, "some trace lines are left unread in the past"); + error(fileline, "some trace lines are left unread in the past. " + "Tried cycle=" + + std::to_string(cycle) + + ", found line.cycle=" + std::to_string(line.cycle) + + ". Is NUM_LANES set correctly?"); return MemTraceLine{}; } @@ -134,14 +136,17 @@ MemTraceLine MemTraceReader::read_trace_at(const long cycle, const int lane_id, // monotonically increment read_pos. lane_id need not be contiguous, e.g. // 0->1->3 is fine. ++read_pos; - return line; } else { // For debugging purposes, instead of early-returning on // !trace_read_ready, print something to notify we are blocking a valid // trace line. printf("All Lanes Blocked on this cycle! cycle=%ld \n", cycle); - return MemTraceLine{}; } + // We want to return valid line regardless of `trace_read_ready` or not, + // because we want to let the driver know that it missed a valid line at the + // given cycle, so that it holds its cycle counter and safely reads back the + // line in the future. + return line; } assert(!"unreachable"); diff --git a/src/main/resources/vsrc/SimMemTrace.v b/src/main/resources/vsrc/SimMemTrace.v index 4d630fd..74594cb 100644 --- a/src/main/resources/vsrc/SimMemTrace.v +++ b/src/main/resources/vsrc/SimMemTrace.v @@ -39,49 +39,32 @@ module SimMemTrace #(parameter FILENAME = "undefined", NUM_LANES = 4) ( output [`DATA_WIDTH*NUM_LANES-1:0] trace_read_data, output trace_read_finished ); - bit __in_valid [NUM_LANES-1:0]; - longint __in_address [NUM_LANES-1:0]; - - bit __in_is_store [NUM_LANES-1:0]; + bit __in_valid [NUM_LANES-1:0]; + longint __in_address [NUM_LANES-1:0]; + bit __in_is_store [NUM_LANES-1:0]; reg [`LOGSIZE_WIDTH-1:0] __in_size [NUM_LANES-1:0]; - longint __in_data [NUM_LANES-1:0]; - - bit __in_finished; - string __uartlog; - - // Cycle counter that is used to query C parser whether we have a request - // coming in at the current cycle. - - - // registers that stage outputs of the C parser - reg [NUM_LANES-1:0] __in_valid_wire; - reg [`DATA_WIDTH-1:0] __in_address_wire [NUM_LANES-1:0]; - - reg [NUM_LANES-1:0] __in_is_store_wire; - reg [`LOGSIZE_WIDTH-1:0] __in_size_wire [NUM_LANES-1:0]; - reg [`DATA_WIDTH-1:0] __in_data_wire [NUM_LANES-1:0]; - reg __in_finished_wire; + longint __in_data [NUM_LANES-1:0]; + bit __in_finished; genvar g; - generate for (g = 0; g < NUM_LANES; g = g + 1) begin - assign trace_read_valid[g] = __in_valid_wire[g]; - assign trace_read_address[`DATA_WIDTH*(g+1)-1:`DATA_WIDTH*g] = __in_address_wire[g]; + assign trace_read_valid[g] = __in_valid[g]; + assign trace_read_address[`DATA_WIDTH*(g+1)-1:`DATA_WIDTH*g] = __in_address[g]; - assign trace_read_is_store[g] = __in_is_store_wire[g]; - assign trace_read_size[`LOGSIZE_WIDTH*(g+1)-1:`LOGSIZE_WIDTH*g] = __in_size_wire[g]; - assign trace_read_data[`DATA_WIDTH*(g+1)-1:`DATA_WIDTH*g] = __in_data_wire[g]; + assign trace_read_is_store[g] = __in_is_store[g]; + assign trace_read_size[`LOGSIZE_WIDTH*(g+1)-1:`LOGSIZE_WIDTH*g] = __in_size[g]; + assign trace_read_data[`DATA_WIDTH*(g+1)-1:`DATA_WIDTH*g] = __in_data[g]; end endgenerate - assign trace_read_finished = __in_finished_wire; + assign trace_read_finished = __in_finished; initial begin /* $value$plusargs("uartlog=%s", __uartlog); */ memtrace_init(FILENAME); end - always @(*) begin + always @(posedge clock) begin if (reset) begin for (integer tid = 0; tid < NUM_LANES; tid = tid + 1) begin __in_valid[tid] = 1'b0; @@ -91,55 +74,29 @@ module SimMemTrace #(parameter FILENAME = "undefined", NUM_LANES = 4) ( __in_size[tid] = `LOGSIZE_WIDTH'b0; __in_data[tid] = `DATA_WIDTH'b0; end - __in_finished = 1'b0; - - //cycle_counter <= `DATA_WIDTH'b0; - - // setting default value for register to avoid latches - for (integer tid = 0; tid < NUM_LANES; tid = tid + 1) begin - __in_valid_wire[tid] = 1'b0; - __in_address_wire[tid] = `DATA_WIDTH'b0; - - __in_is_store_wire[tid] = 1'b0; - __in_size_wire[tid] = `LOGSIZE_WIDTH'b0; - __in_data_wire[tid] = `DATA_WIDTH'b0; - end - - __in_finished_wire = 1'b0; end else begin + // We have to write to __in_ regs only when trace_read_ready, or + // otherwise we might overwrite lines that were previously valid + // but the downstream missed by being not ready. + if (trace_read_ready) begin + for (integer tid = 0; tid < NUM_LANES; tid = tid + 1) begin + memtrace_query( + trace_read_ready, + trace_read_cycle, + tid, - // Getting values from C function into pseudeo register - for (integer tid = 0; tid < NUM_LANES; tid = tid + 1) begin - memtrace_query( - trace_read_ready, - // Since parsed results are latched to the output on the next - // cycle due to staging registers, we need to pass in the next cycle - // to sync up. - trace_read_cycle, // the left replace next_cycle_counter, - tid, + __in_valid[tid], + __in_address[tid], - __in_valid[tid], - __in_address[tid], - - __in_is_store[tid], - __in_size[tid], - __in_data[tid], + __in_is_store[tid], + __in_size[tid], + __in_data[tid], - __in_finished - ); + __in_finished + ); + end end - - // Connect values from pseudo register into verilog register - for (integer tid = 0; tid < NUM_LANES; tid = tid + 1) begin - __in_valid_wire[tid] = __in_valid[tid]; - __in_address_wire[tid] = __in_address[tid]; - - __in_is_store_wire[tid] = __in_is_store[tid]; - __in_size_wire[tid] = __in_size[tid]; - __in_data_wire[tid] = __in_data[tid]; - end - __in_finished_wire = __in_finished; end end endmodule diff --git a/src/main/scala/tilelink/Coalescing.scala b/src/main/scala/tilelink/Coalescing.scala index b1cd865..faaf569 100644 --- a/src/main/scala/tilelink/Coalescing.scala +++ b/src/main/scala/tilelink/Coalescing.scala @@ -5,12 +5,17 @@ package freechips.rocketchip.tilelink import chisel3._ import chisel3.util._ import chisel3.experimental.ChiselEnum -import org.chipsalliance.cde.config.Parameters +import org.chipsalliance.cde.config.{Parameters, Field} import freechips.rocketchip.diplomacy._ // import freechips.rocketchip.devices.tilelink.TLTestRAM import freechips.rocketchip.util.MultiPortQueue import freechips.rocketchip.unittest._ +// TODO: find better place for these +case class SIMTCoreParams(nLanes: Int = 4, tracefilename: String = "undefined") + +case object SIMTCoreKey extends Field[Option[SIMTCoreParams]](None /*default*/) + trait InFlightTableSizeEnum extends ChiselEnum { val INVALID: Type val FOUR: Type @@ -233,11 +238,14 @@ class CoalShiftQueue[T <: Data](gen: T, entries: Int, config: CoalescerConfig) e })) // shift hint is when the heads have no more coalescable left this or next cycle - val shiftHint = !(io.coalescable zip io.invalidate.bits.map(_(0))).map { case (c, i) => - c && !(io.invalidate.valid && i) + val shiftHint = !(io.coalescable zip io.invalidate.bits.map(_(0))).map { case (c, inv) => + c && !(io.invalidate.valid && inv) }.reduce(_ || _) val syncedEnqValid = io.queue.enq.map(_.valid).reduce(_ || _) - val syncedDeqValid = io.queue.deq.map(x => x.valid && !x.ready).reduce(_ || _) // valid and not fire + // valid && !fire means we enable enqueueing to a full queue, provided the + // arbiter is taking away all remaining valid queue heads in the next cycle so + // that we make space for the entire next warp. + val syncedDeqValidNextCycle = io.queue.deq.map(x => x.valid && !x.ready).reduce(_ || _) for (i <- 0 until config.numLanes) { val enq = io.queue.enq(i) @@ -247,7 +255,7 @@ class CoalShiftQueue[T <: Data](gen: T, entries: Int, config: CoalescerConfig) e ctrl.full := writePtr(i) === entries.U ctrl.empty := writePtr(i) === 0.U // shift when no outstanding dequeue, no more coalescable chunks, and not empty - ctrl.shift := !syncedDeqValid && shiftHint && !ctrl.empty + ctrl.shift := !syncedDeqValidNextCycle && shiftHint && !ctrl.empty // dequeue is valid when: // head entry is valid, has not been processed by downstream, and is not coalescable @@ -293,6 +301,9 @@ class CoalShiftQueue[T <: Data](gen: T, entries: Int, config: CoalescerConfig) e } } + // When doing spatial-only coalescing, queues should never drift from each + // other, i.e. the queue heads should always contain mem requests from the + // same instruction. val queueInSync = controlSignals.map(_ === controlSignals.head).reduce(_ && _) && writePtr.map(_ === writePtr.head).reduce(_ && _) assert(queueInSync, "shift queue lanes are not in sync") @@ -326,23 +337,15 @@ class MonoCoalescer(coalLogSize: Int, windowT: CoalShiftQueue[ReqQueueEntry], val leaders = io.window.elts.map(_.head) val leadersValid = io.window.mask.map(_.asBools.head) - // When doing spatial-only coalescing, queues should never drift from each - // other, i.e. the queue heads should always contain mem requests from the - // same instruction. - // FIXME: This relies on the MemTraceDriver's behavior of generating TL - // requests with full source info even when the corresponding lane is not - // active. - def testNoQueueDrift: Bool = leaders.map(_.source === leaders.head.source).reduce(_ || _) def printQueueHeads = { leaders.zipWithIndex.foreach{ case (head, i) => printf(s"ReqQueueEntry[${i}].head = v:%d, source:%d, addr:%x\n", leadersValid(i), head.source, head.address) } } - when (leadersValid.reduce(_ || _)) { - assert(testNoQueueDrift, "unexpected drift between lane request queues") - // printQueueHeads - } + // when (leadersValid.reduce(_ || _)) { + // printQueueHeads + // } val size = coalLogSize val addrMask = (((1 << config.addressWidth) - 1) - ((1 << size) - 1)).U @@ -360,8 +363,8 @@ class MonoCoalescer(coalLogSize: Int, windowT: CoalShiftQueue[ReqQueueEntry], // compare leader's head against follower's every queue entry (followers zip followerValids.asBools).map { case (follower, followerValid) => canMatch(follower, followerValid, leader, leaderValid) - // disabling halving optimization because it does not give the correct - // per-lane coalescable indication to the shift queue + // FIXME: disabling halving optimization because it does not give the + // correct per-lane coalescable indication to the shift queue // // match leader to only followers at lanes >= leader idx // // this halves the number of comparators // if (followerIndex < leaderIndex) false.B @@ -375,16 +378,23 @@ class MonoCoalescer(coalLogSize: Int, windowT: CoalShiftQueue[ReqQueueEntry], .reduce(_ +& _)) val canCoalesce = matchCounts.map(_ > 1.U) - // Elect the leader out of all potential leaders that have matchCounts > 1. + // Elect the leader that has the most match counts. // TODO: potentially expensive: magnitude comparator - // Maybe choose leftmost leader (priority encoder) instead of argmax - val chosenLeaderIdx = matchCounts.zipWithIndex.map { - case (c, i) => (c, i.U) - }.reduce[(UInt, UInt)] { case ((c0, i), (c1, j)) => - (Mux(c0 >= c1, c0, c1), Mux(c0 >= c1, i, j)) - }._2 + def chooseLeaderArgMax(matchCounts: Seq[UInt]): UInt = { + matchCounts.zipWithIndex.map { + case (c, i) => (c, i.U) + }.reduce[(UInt, UInt)] { case ((c0, i), (c1, j)) => + (Mux(c0 >= c1, c0, c1), Mux(c0 >= c1, i, j)) + }._2 + } + // Elect leader by choosing the smallest-index lane that has a valid + // match, i.e. using priority encoder. + def chooseLeaderPriorityEncoder(matchCounts: Seq[UInt]): UInt = { + PriorityEncoder(matchCounts.map(_ > 1.U)) + } + val chosenLeaderIdx = chooseLeaderPriorityEncoder(matchCounts) - val chosenLeader = VecInit(leaders)(chosenLeaderIdx) + val chosenLeader = VecInit(leaders)(chosenLeaderIdx) // mux // matchTable for the chosen lane, but converted to a Vec[UInt] val chosenMatches = VecInit(matchTablePerLane.map{ table => VecInit(table.map(VecInit(_).asUInt)) @@ -428,6 +438,11 @@ class MonoCoalescer(coalLogSize: Int, windowT: CoalShiftQueue[ReqQueueEntry], io.results.canCoalesce := canCoalesce } +// Combinational logic that generates a coalesced request given a request +// window, and a selection of possible coalesced sizes. May utilize multiple +// MonoCoalescers and apply size-choosing policy to determine the final +// coalesced request out of all possible combinations. +// // Software model: coalescer.py class MultiCoalescer(windowT: CoalShiftQueue[ReqQueueEntry], coalReqT: ReqQueueEntry, config: CoalescerConfig) extends Module { @@ -578,11 +593,14 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends reqQueues.io.coalescable := coalescer.io.coalescable reqQueues.io.invalidate := coalescer.io.invalidate - // Per-lane request and response queues + // =========================================================================== + // Request flow + // =========================================================================== // // Override IdentityNode implementation so that we can instantiate // queues between input and output edges to buffer requests and responses. // See IdentityNode definition in `diplomacy/Nodes.scala`. + // (outer.cpuNode.in zip outer.cpuNode.out).zipWithIndex.foreach { case (((tlIn, _), (tlOut, edgeOut)), lane) => // Request queue @@ -604,7 +622,10 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends val deq = reqQueues.io.queue.deq(lane) enq.valid := tlIn.a.valid enq.bits := req - deq.ready := true.B // TODO: deq.ready should respect downstream arbiter + // TODO: deq.ready should respect downstream arbiter + deq.ready := true.B + // Stall upstream core or memtrace driver when shiftqueue is not ready + tlIn.a.ready := enq.ready tlOut.a.valid := deq.valid tlOut.a.bits := deq.bits.toTLA(edgeOut) @@ -641,11 +662,12 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends tlCoal.e.valid := false.B - // ================================================================== - // ****************************************************************** - // ************************* REORG BOUNDARY ************************* - // ****************************************************************** - // ================================================================== + // =========================================================================== + // Response flow + // =========================================================================== + // + // Connect uncoalescer output and noncoalesced response ports to the response + // queues. // The maximum number of requests from a single lane that can go into a // coalesced request. Upper bound is min(DEPTH, 2**sourceWidth). @@ -810,6 +832,12 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends class CoalescedResponseBundle(config: CoalescerConfig) extends Bundle { val source = UInt(log2Ceil(config.numNewSrcIds).W) val data = UInt((8 * (1 << config.maxCoalLogSize)).W) + + def fromTLD(bundle:TLBundleD): Unit = { + this.source := bundle.source + this.data := bundle.data + } + } class Uncoalescer(config: CoalescerConfig) extends Module { @@ -882,8 +910,8 @@ class Uncoalescer(config: CoalescerConfig) extends Module { // Un-coalesce responses back to individual lanes val found = inflightTable.io.lookup.bits (found.lanes zip io.uncoalResps).foreach { case (perLane, ioPerLane) => - perLane.reqs.zipWithIndex.foreach { case (oldReq, i) => - val ioOldReq = ioPerLane(i) + perLane.reqs.zipWithIndex.foreach { case (oldReq, depth) => + val ioOldReq = ioPerLane(depth) // TODO: spatial-only coalescing: only looking at 0th srcId entry ioOldReq.valid := false.B @@ -1077,24 +1105,18 @@ class TraceLine extends Bundle with HasTraceLine { class MemTraceDriverImp(outer: MemTraceDriver, config: CoalescerConfig, traceFile: String) extends LazyModuleImp(outer) with UnitTestModule { + // Current cycle mark to read from trace + val traceReadCycle = RegInit(1.U(64.W)) - val globalClkCounter = RegInit(1.U(64.W)) - val traceReadCycle = RegInit(1.U(64.W)) - val downstreamSQready = WireInit(true.B) + // If any of the downstream lane is not ready, hold on from advancing + val downstreamReady = outer.laneNodes.map(_.out(0)._1.a.ready).reduce(_ && _) - //make the downstream only ready 1/4 of the time - //This is to test Tracer System's ability to hold on requests - //FIXME - downstreamSQready := (globalClkCounter(1,0) =/= 0.U) - //Connect Signals to Verilog BlackBox val sim = Module(new SimMemTrace(traceFile, config.numLanes)) sim.io.clock := clock sim.io.reset := reset.asBool - sim.io.trace_read.ready := downstreamSQready - //FIXME - 1.U hardcoded, currently there is a delay between chisel and verilog + sim.io.trace_read.ready := downstreamReady sim.io.trace_read.cycle := traceReadCycle - // Read output from Verilog BlackBox // Split output of SimMemTrace, which is flattened across all lanes,back to each lane's. val laneReqs = Wire(Vec(config.numLanes, new TraceLine)) @@ -1103,26 +1125,28 @@ class MemTraceDriverImp(outer: MemTraceDriver, config: CoalescerConfig, traceFil val dataW = laneReqs(0).data.getWidth laneReqs.zipWithIndex.foreach { case (req, i) => req.valid := sim.io.trace_read.valid(i) - // TODO: driver trace doesn't contain source id - req.source := 0.U + req.source := 0.U // driver trace doesn't contain source id req.address := sim.io.trace_read.address(addrW * (i + 1) - 1, addrW * i) req.is_store := sim.io.trace_read.is_store(i) req.size := sim.io.trace_read.size(sizeW * (i + 1) - 1, sizeW * i) req.data := sim.io.trace_read.data(dataW * (i + 1) - 1, dataW * i) } - globalClkCounter := globalClkCounter + 1.U - val existValidReq = WireInit(false.B) - existValidReq := laneReqs.map(_.valid).reduce(_||_) - val validReqBlocked = WireInit(false.B) - validReqBlocked := !downstreamSQready && existValidReq - //Debug - dontTouch(downstreamSQready) - dontTouch(existValidReq) - dontTouch(validReqBlocked) - // Do Not Update TraceReadCycle if downstream is blocking - when(!validReqBlocked){ - traceReadCycle := traceReadCycle + 1.U + // def missedLine = { + // val existsValidLine = WireInit(false.B) + // existsValidLine := laneReqs.map(_.valid).reduce(_||_) + // val missedLine = WireInit(false.B) + // missedLine := !downstreamReady && existsValidLine + + // // Debug + // dontTouch(downstreamReady) + // dontTouch(existsValidLine) + // dontTouch(missedLine) + + // missedLine + // } + when (downstreamReady){ + traceReadCycle := traceReadCycle + 1.U } // To prevent collision of sourceId with a current in-flight message, @@ -1157,19 +1181,6 @@ class MemTraceDriverImp(outer: MemTraceDriver, config: CoalescerConfig, traceFil val wordAlignedAddress = req.address & ~((1 << log2Ceil(config.wordSizeInBytes)) - 1).U(addrW.W) val wordAlignedSize = Mux(subword, 2.U, req.size) - // when(req.valid && subword) { - // printf( - // "address=%x, size=%d, data=%x, addressMask=%x, wordAlignedAddress=%x, mask=%x, wordData=%x\n", - // req.address, - // req.size, - // req.data, - // ~((1 << log2Ceil(config.WORD_SIZE)) - 1).U(addrW.W), - // wordAlignedAddress, - // mask, - // wordData - // ) - // } - val (tlOut, edge) = node.out(0) val (plegal, pbits) = edge.Put( fromSource = sourceIdCounter, @@ -1350,7 +1361,9 @@ class MemTraceLogger( // requests on TL A channel // - req.valid := tlIn.a.valid + // Only log trace when fired, e.g. both upstream and downstream is ready + // and transaction happened. + req.valid := tlIn.a.fire req.size := tlIn.a.bits.size req.is_store := TLUtils.AOpcodeIsStore(tlIn.a.bits.opcode) req.source := tlIn.a.bits.source @@ -1358,27 +1371,6 @@ class MemTraceLogger( // originally requested, so no postprocessing required req.address := tlIn.a.bits.address - // TL data - // - // When tlIn.a.bits.size is smaller than the data bus width, need to - // figure out which byte lanes we actually accessed so that - // we can write that to the memory trace. - // See Section 4.5 Byte Lanes in spec 1.8.1 - - // This assert only holds true for PutFullData and not PutPartialData, - // where HIGH bits in the mask may not be contiguous. - assert( - PopCount(tlIn.a.bits.mask) === (1.U << tlIn.a.bits.size), - "mask HIGH bits do not match the TL size. This should have been handled by the TL generator logic" - ) - val trailingZerosInMask = trailingZeros(tlIn.a.bits.mask) - val dataW = tlIn.params.dataBits - val mask = ~(~(0.U(dataW.W)) << ((1.U << tlIn.a.bits.size) * 8.U)) - req.data := mask & (tlIn.a.bits.data >> (trailingZerosInMask * 8.U)) - // when (req.valid) { - // printf("trailingZerosInMask=%d, mask=%x, data=%x\n", trailingZerosInMask, mask, req.data) - // } - when(req.valid) { TLPrintf( s"MemTraceLogger (${loggerName}:downstream)", @@ -1391,9 +1383,33 @@ class MemTraceLogger( ) } + // TL data + // + // When tlIn.a.bits.size is smaller than the data bus width, need to + // figure out which byte lanes we actually accessed so that + // we can write that to the memory trace. + // See Section 4.5 Byte Lanes in spec 1.8.1 + + // This assert only holds true for PutFullData and not PutPartialData, + // where HIGH bits in the mask may not be contiguous. + assert( + PopCount(tlIn.a.bits.mask) === (1.U << tlIn.a.bits.size), + "mask HIGH popcount do not match the TL size. " + + "Partial masks are not allowed for PutFull" + ) + val trailingZerosInMask = trailingZeros(tlIn.a.bits.mask) + val dataW = tlIn.params.dataBits + val mask = ~(~(0.U(dataW.W)) << ((1.U << tlIn.a.bits.size) * 8.U)) + req.data := mask & (tlIn.a.bits.data >> (trailingZerosInMask * 8.U)) + // when (req.valid) { + // printf("trailingZerosInMask=%d, mask=%x, data=%x\n", trailingZerosInMask, mask, req.data) + // } + // responses on TL D channel // - resp.valid := tlOut.d.valid + // Only log trace when fired, e.g. both upstream and downstream is ready + // and transaction happened. + resp.valid := tlOut.d.fire resp.size := tlOut.d.bits.size resp.is_store := TLUtils.DOpcodeIsStore(tlOut.d.bits.opcode) resp.source := tlOut.d.bits.source @@ -1427,7 +1443,7 @@ class MemTraceLogger( // // This is a clunky workaround of the fact that Chisel doesn't allow partial // assignment to a bitfield range of a wide signal. - def flattenTrace(traceLogIO: Bundle with HasTraceLine, perLane: Vec[TraceLine]) = { + def flattenTrace(simIO: Bundle with HasTraceLine, perLane: Vec[TraceLine]) = { // these will get optimized out val vecValid = Wire(Vec(numLanes, chiselTypeOf(perLane(0).valid))) val vecSource = Wire(Vec(numLanes, chiselTypeOf(perLane(0).source))) @@ -1443,12 +1459,12 @@ class MemTraceLogger( vecSize(i) := l.size vecData(i) := l.data } - traceLogIO.valid := vecValid.asUInt - traceLogIO.source := vecSource.asUInt - traceLogIO.address := vecAddress.asUInt - traceLogIO.is_store := vecIsStore.asUInt - traceLogIO.size := vecSize.asUInt - traceLogIO.data := vecData.asUInt + simIO.valid := vecValid.asUInt + simIO.source := vecSource.asUInt + simIO.address := vecAddress.asUInt + simIO.is_store := vecIsStore.asUInt + simIO.size := vecSize.asUInt + simIO.data := vecData.asUInt } if (simReq.isDefined) { @@ -1538,7 +1554,7 @@ class DummyDriver(config: CoalescerConfig)(implicit p: Parameters) val clientParam = Seq( TLMasterParameters.v1( name = "dummy-core-node-" + i.toString, - sourceId = IdRange(0, defaultConfig.numOldSrcIds) + sourceId = IdRange(0, config.numOldSrcIds) // visibility = Seq(AddressSet(0x0000, 0xffffff)) ) ) @@ -1599,18 +1615,22 @@ class DummyDriverImp(outer: DummyDriver, config: CoalescerConfig) // A dummy harness around the coalescer for use in VLSI flow. // Should not instantiate any memtrace modules. class DummyCoalescer(implicit p: Parameters) extends LazyModule { - val driver = LazyModule(new DummyDriver(defaultConfig)) - val rams = Seq.fill(defaultConfig.numLanes + 1)( // +1 for coalesced edge + val numLanes = p(SIMTCoreKey).get.nLanes + println(s"============ numLanes: ${numLanes}") + val config = defaultConfig.copy(numLanes = numLanes) + + val driver = LazyModule(new DummyDriver(config)) + val rams = Seq.fill(config.numLanes + 1)( // +1 for coalesced edge LazyModule( // NOTE: beatBytes here sets the data bitwidth of the upstream TileLink // edges globally, by way of Diplomacy communicating the TL slave // parameters to the upstream nodes. new TLRAM(address = AddressSet(0x0000, 0xffffff), - beatBytes = (1 << defaultConfig.dataBusWidth)) + beatBytes = (1 << config.dataBusWidth)) ) ) - val coal = LazyModule(new CoalescingUnit(defaultConfig)) + val coal = LazyModule(new CoalescingUnit(config)) coal.cpuNode :=* driver.node rams.foreach(_.node := coal.aggregateNode) @@ -1629,18 +1649,15 @@ class DummyCoalescerTest(timeout: Int = 500000)(implicit p: Parameters) } // tracedriver --> coalescer --> tracelogger --> tlram -class TLRAMCoalescerLogger(implicit p: Parameters) extends LazyModule { - // val filename = "test.trace" - val filename = "vecadd.core1.thread4.trace" - // val filename = "nvbit.vecadd.n100000.filter_sm0.trace" - // TODO: use parameters for numLanes - val numLanes = defaultConfig.numLanes +class TLRAMCoalescerLogger(filename: String)(implicit p: Parameters) extends LazyModule { + val numLanes = p(SIMTCoreKey).get.nLanes + val config = defaultConfig.copy(numLanes = numLanes) - val driver = LazyModule(new MemTraceDriver(defaultConfig, filename)) + val driver = LazyModule(new MemTraceDriver(config, filename)) val coreSideLogger = LazyModule( new MemTraceLogger(numLanes, filename, loggerName = "coreside") ) - val coal = LazyModule(new CoalescingUnit(defaultConfig)) + val coal = LazyModule(new CoalescingUnit(config)) val memSideLogger = LazyModule(new MemTraceLogger(numLanes + 1, filename, loggerName = "memside")) val rams = Seq.fill(numLanes + 1)( // +1 for coalesced edge LazyModule( @@ -1648,7 +1665,7 @@ class TLRAMCoalescerLogger(implicit p: Parameters) extends LazyModule { // edges globally, by way of Diplomacy communicating the TL slave // parameters to the upstream nodes. new TLRAM(address = AddressSet(0x0000, 0xffffff), - beatBytes = (1 << defaultConfig.dataBusWidth)) + beatBytes = (1 << config.dataBusWidth)) ) ) @@ -1674,13 +1691,14 @@ class TLRAMCoalescerLogger(implicit p: Parameters) extends LazyModule { (coreSideLogger.module.io.reqBytes === coreSideLogger.module.io.respBytes), "FAIL: requests and responses traffic to the coalescer do not match" ) + printf("SUCCESS: coalescer response traffic matched requests!\n") } } } -class TLRAMCoalescerLoggerTest(timeout: Int = 500000)(implicit p: Parameters) +class TLRAMCoalescerLoggerTest(filename: String, timeout: Int = 500000)(implicit p: Parameters) extends UnitTest(timeout) { - val dut = Module(LazyModule(new TLRAMCoalescerLogger).module) + val dut = Module(LazyModule(new TLRAMCoalescerLogger(filename)).module) dut.io.start := io.start io.finished := dut.io.finished } @@ -1766,12 +1784,20 @@ class CoalArbiter(config: CoalescerConfig) (implicit p: Parameters) extends Lazy coalNode := coalReqNode ) - + //Assertion Section + def isPowerOfTwo(n: Int): Boolean = { + (n > 0) && ((n & (n - 1)) == 0) + } + assert(isPowerOfTwo(config.numOldSrcIds), "Number of old source id must be power of 2") + assert(isPowerOfTwo(config.numNewSrcIds), "Number of new source id must be power of 2") + //Below is for efficient conversion from Global to Local bits + //Also, we should have more source id for coalesced request for better perf + assert(config.numNewSrcIds >= config.numOldSrcIds, "new source id must be equal or greater than old source id") // 1 Final Output Identity Node val outputNode = TLIdentityNode() - //Explictly define I/O bundule tyoe + val nonCoalEntryT = new ReqQueueEntry( log2Ceil(config.numOldSrcIds), config.wordWidth, @@ -1791,10 +1817,13 @@ class CoalArbiter(config: CoalescerConfig) (implicit p: Parameters) extends Lazy ) val respCoalBundleT = new CoalescedResponseBundle(config) - + + lazy val module = new CoalArbiterImpl( this, config, nonCoalEntryT, coalEntryT, respNonCoalEntryT, respCoalBundleT) + + } class CoalArbiterImpl(outer: CoalArbiter, @@ -1814,5 +1843,311 @@ class CoalArbiterImpl(outer: CoalArbiter, } ) + //Helper Class & Method Section + + //Provide an simple decoupled interface between bundle of 2 different type + class ConverterTunnel[T <: Data, U <: Data]( + genA: T, + genB: U, + conversionFn: T => U + ) extends Module { + val io = IO(new Bundle { + val in = Flipped(Decoupled(genA.cloneType)) + val out = Decoupled(genB.cloneType) + }) + io.in.ready := io.out.ready + io.out.valid := io.in.valid + io.out.bits := conversionFn(io.in.bits) + } + + + def canHitBank(addr: UInt, bankNum: UInt) : Bool = { + val byteOffset = 3 + val bankBase = log2Ceil(config.bankStrideInBytes) + val bankOffset = log2Ceil(config.numArbiterOutputPorts) + (addr(bankBase+bankOffset-byteOffset, bankBase - byteOffset) === bankNum) + } + + //This Operation Could be Expensive + def toGlobalSourceId(isCoalReq : Bool, laneIdx : UInt, sourceID : UInt) : UInt = { + val gid = Mux(isCoalReq, + config.numNewSrcIds.U * laneIdx + sourceID, + config.numOldSrcIds.U * laneIdx + sourceID + config.numNewSrcIds.U * config.numCoalReqs.U + ) + gid + } + //All the ids are power of 2, so we can just look at bottom bits + def toLocalSourceId(isCoalReq : Bool, sourceID : UInt) : UInt = { + val sid = Mux(isCoalReq, + sourceID(log2Ceil(config.numNewSrcIds)-1, 0), + sourceID(log2Ceil(config.numOldSrcIds)-1, 0) + ) + sid + } + def belongsToLane(laneIdx: UInt, gid: UInt) : Bool = { + val base = config.numNewSrcIds.U * config.numCoalReqs.U + ((gid >= base + config.numOldSrcIds.U * laneIdx) && + (gid < base + config.numOldSrcIds.U * (laneIdx+1.U))) + } + + def isCoalReq(gid : UInt) : Bool = { + gid <= config.numNewSrcIds.U * config.numCoalReqs.U + } + + // + val fullSourceIdRange = config.numOldSrcIds * config.numLanes + config.numNewSrcIds * config.numCoalReqs + + + val nonCoalGiDEntryT = new ReqQueueEntry( + log2Ceil(fullSourceIdRange), + config.wordWidth, + config.addressWidth, + log2Ceil(config.wordSizeInBytes) + ) + val coalGiDEntryT = new ReqQueueEntry( + log2Ceil(fullSourceIdRange), + log2Ceil(config.maxCoalLogSize), + config.addressWidth, + config.maxCoalLogSize //already log 2 + ) + + // Before either a coalesced or non coalesced request enter RR arbiter + // It needs to turn its source into global source id + // Unfortunately this involves extending the width of sourceid field, and a new bundle must be created + // This is a higher order function + def reqEntry2GidReqFn(laneIndex : UInt, reqEntryT : ReqQueueEntry, isCoalReq : Bool) : ReqQueueEntry => ReqQueueEntry = { + def func(lid_req : ReqQueueEntry) : ReqQueueEntry = { + val gid_req = reqEntryT.cloneType + gid_req <> lid_req + gid_req.source := toGlobalSourceId(isCoalReq, laneIndex, lid_req.source) + gid_req + } + func + } + + + def reqEntry2TLAFn(edgeOut: TLEdgeOut) : ReqQueueEntry => TLBundleA = { + def func(gid_req : ReqQueueEntry) : TLBundleA = { + gid_req.toTLA(edgeOut) + } + func + } + + def tlD2respEntryFn() : TLBundleD => RespQueueEntry = { + def func(bundle: TLBundleD) : RespQueueEntry = { + val resp = Wire(respNonCoalEntryT) + resp.fromTLD(bundle) + resp.source := toLocalSourceId(false.B, bundle.source) + resp + } + func + } + def tlD2CoalBundleFn() : TLBundleD => CoalescedResponseBundle = { + def func(bundle: TLBundleD) : CoalescedResponseBundle = { + val coalbundle = Wire(respCoalBundleT) + coalbundle.fromTLD(bundle) + coalbundle.source := toLocalSourceId(true.B, bundle.source) + coalbundle + } + func + } + + ///////////////////////////////////////////////////// + //HDL Implementation Section + ///////////////////////////////////////////////////// + + //Stage 1: Create Queue for nonCoalReqs and CoalReqs + val nonCoalReqsQueues = Seq.tabulate(config.numLanes){_=> + Module(new Queue(nonCoalEntryT.cloneType, 1, true, false)) + } + val coalReqsQueues = Seq.tabulate(config.numCoalReqs){_=> + Module(new Queue(coalEntryT.cloneType, 1, true, false)) + } + //Stage 1a: connect two Queue groups to the input + (io.nonCoalReqs zip nonCoalReqsQueues).foreach{ + case (req, q) => q.io.enq <> req + } + (io.coalReqs zip coalReqsQueues).foreach{ + case (req, q) => q.io.enq <> req + } + //Stage 1b: connect output of Queues to the RR arbiters (each arbiter is for a unique bank) + // the two loops below could be merged into one loop, but separated for readability + val nonCoalRRArbiters = Seq.tabulate(config.numArbiterOutputPorts){_=> + Module(new RRArbiter(nonCoalGiDEntryT.cloneType, config.numLanes)) + } + nonCoalReqsQueues.zipWithIndex.foreach{ case(q, q_idx) => + nonCoalRRArbiters.zipWithIndex.foreach{ case(arb, arb_idx) => + val nonCoal2gidFunc = reqEntry2GidReqFn(q_idx.U, nonCoalGiDEntryT, false.B) + val nonCoalRRArbTunnel = Module(new ConverterTunnel( + nonCoalEntryT.cloneType, + nonCoalGiDEntryT.cloneType, + nonCoal2gidFunc) + ) + nonCoalRRArbTunnel.io.in <> q.io.deq + arb.io.in(q_idx) <> nonCoalRRArbTunnel.io.out + //OverWrite Valid base on if we can actually hit this bank + arb.io.in(q_idx).valid := canHitBank(nonCoalRRArbTunnel.io.out.bits.address, arb_idx.U) && + nonCoalRRArbTunnel.io.out.valid + } + } + val coalRRArbiters = Seq.tabulate(config.numArbiterOutputPorts){_=> + Module(new RRArbiter(coalGiDEntryT.cloneType, config.numCoalReqs)) + } + coalReqsQueues.zipWithIndex.foreach{ case(q, q_idx) => + coalRRArbiters.zipWithIndex.foreach{ case(arb, arb_idx) => + val coal2gidFunc = reqEntry2GidReqFn(q_idx.U, coalGiDEntryT, true.B) + val coalRRArbTunnel = Module(new ConverterTunnel( + coalEntryT.cloneType, + coalGiDEntryT.cloneType, + coal2gidFunc) + ) + coalRRArbTunnel.io.in <> q.io.deq + arb.io.in(q_idx) <> coalRRArbTunnel.io.out + //OverWrite Valid + arb.io.in(q_idx).valid := canHitBank(coalRRArbTunnel.io.out.bits.address, arb_idx.U) && + coalRRArbTunnel.io.out.valid + } + } + + + //Stage 2, Connect the output of Arbiters to respective nonCoal node + + // Concatenate the nodes , concatenates the arbiters, and zip them together, then loop + // the reqEntry2TLA will generate different TLA bundle depending on if the Req is coal or non coal + ((outer.nonCoalNarrowNodes++outer.coalReqNodes) zip + (nonCoalRRArbiters++coalRRArbiters)).foreach{ + case (node, arb) => + val (tlOut, edgeOut) = node.out(0) + val coal2TLAFunc = reqEntry2TLAFn(edgeOut) + val nonCoalTLATunnel = Module(new ConverterTunnel( + arb.io.out.bits.cloneType, + tlOut.a.bits.cloneType, + coal2TLAFunc + ) + ) + nonCoalTLATunnel.io.in <> arb.io.out + tlOut.a <> nonCoalTLATunnel.io.out + } + + + //Stage 3, Make the Idenity node pass through channel A + // Connect the K edges Identity Node to PO arbiter + // noncoalesced to port 1, coalesced to port 0 + + val priorityArbs = Seq.tabulate(config.numArbiterOutputPorts){_=> + Module(new Arbiter(outer.outputNode.out(0)._1.a.bits.cloneType, 2)) + } + + //Make both Idenity node Pass Through Channel A, for both Coal and NonCoal + ((outer.nonCoalNode.out ++ outer.coalNode.out) zip + (outer.nonCoalNode.in ++ outer.coalNode.in)).foreach{ + case ((tlOut,_),(tlIn,_)) => + tlOut.a <> tlIn.a + } + //Connection to PO Arbiters + ((outer.nonCoalNode.out zip outer.coalNode.out) zip priorityArbs).foreach{ + case (((nonCoalOut, _),(coalOut, _)), arb) => + arb.io.in(1) <> nonCoalOut.a + arb.io.in(0) <> coalOut.a + } + + + //Stage 4, Connect PO arbiter to each edge of output Node + //And make idenitity node passs through the inputs + ((outer.outputNode.in zip outer.outputNode.out) zip priorityArbs).foreach{ + case (((tlIn, _), (tlOut, _)), arb) => + tlOut.a <> tlIn.a + tlIn.a <> arb.io.out + } + + + + //////////////// + // Incoming Data Handling + + //Stage 1, Forward data from output node to the Idenity node of Coal and NonCoal + // while setting the correct valid signal to base on if the request is Coalesced or not + + ((outer.outputNode.in zip outer.outputNode.out) zip + (outer.nonCoalNode.out zip outer.coalNode.out)).foreach{ + case( ((tlIn, _),(tlOut, _)), ((nonCoalOut, _),(coalOut, _)) ) => + tlIn.d <> tlOut.d + nonCoalOut.d <> tlIn.d + coalOut.d <> tlIn.d + //rewrite valid signal + nonCoalOut.d.valid := !isCoalReq(tlIn.d.bits.source) && tlIn.d.valid + coalOut.d.valid := isCoalReq(tlIn.d.bits.source) && tlIn.d.valid + } + + //Stage 2, Make both Idenity node Pass Through Channel D, for both Coal and NonCoal + // + ((outer.nonCoalNode.out ++ outer.coalNode.out) zip + (outer.nonCoalNode.in ++ outer.coalNode.in)).foreach{ + case ((tlOut,_),(tlIn,_)) => + tlIn.d <> tlOut.d + } + + //Stage 3, Connect the channel D of nonCoalNodes to the perLane arbiters + + //Stage 3a, connect the noncoalesced edge to every single perlane arbiter + val perLaneRespRRArbs = Seq.tabulate(config.numLanes){_=> + Module(new RRArbiter(respNonCoalEntryT.cloneType, config.numArbiterOutputPorts)) + } + outer.nonCoalNarrowNodes.zipWithIndex.foreach{ + case (node, node_idx) => + val (tlOut, edgeOut) = node.out(0) + perLaneRespRRArbs.zipWithIndex.foreach{ + case(arb, arb_idx) => + val tlD2RespEntryFunc = tlD2respEntryFn() + val perLaneArbTunnel = Module(new ConverterTunnel( + tlOut.d.bits.cloneType, + arb.io.in(0).bits.cloneType, + tlD2RespEntryFunc + ) + ) + perLaneArbTunnel.io.in <> tlOut.d + arb.io.in(node_idx) <> perLaneArbTunnel.io.out + //rewrite valid base on if source id actually belongs to this lane + arb.io.in(node_idx).valid := belongsToLane(arb_idx.U, perLaneArbTunnel.io.out.bits.source) && + perLaneArbTunnel.io.out.valid + } + } + //Stage 3b, connect coalesced request to + val coalBundleRRArbiter = Module(new RRArbiter(respCoalBundleT.cloneType, config.numArbiterOutputPorts)) + outer.coalReqNodes.zipWithIndex.foreach{ + case(node, node_idx) => + val (tlOut, edgeOut) = node.out(0) + val tlD2CoalBundleFunc = tlD2CoalBundleFn() + val coalBundleArbTunnel = Module(new ConverterTunnel( + tlOut.d.bits.cloneType, + coalBundleRRArbiter.io.in(0).bits.cloneType, + tlD2CoalBundleFunc + ) + ) + coalBundleArbTunnel.io.in <> tlOut.d + coalBundleRRArbiter.io.in(node_idx) <> coalBundleArbTunnel.io.out + } + + + //Connect 4, Connect the arbiters to output + // connect the noncoalesced vector + (perLaneRespRRArbs zip io.nonCoalResps).foreach{ + case (arb, resp) => + resp <> arb.io.out + } + // connect the coalesced bundle + io.coalResp <> coalBundleRRArbiter.io.out + + + + + + } + + + + + + + -} diff --git a/src/main/scala/tilelink/TracerSystemMem.scala b/src/main/scala/tilelink/TracerSystemMem.scala index e0d495b..ca31864 100644 --- a/src/main/scala/tilelink/TracerSystemMem.scala +++ b/src/main/scala/tilelink/TracerSystemMem.scala @@ -1,44 +1,25 @@ - package freechips.rocketchip.tilelink -import chisel3._ -import chisel3.util._ import freechips.rocketchip.diplomacy._ -import freechips.rocketchip.subsystem.{BaseSubsystem, CacheBlockBytes} -import org.chipsalliance.cde.config.{Parameters, Field, Config} +import freechips.rocketchip.subsystem.{BaseSubsystem} +import org.chipsalliance.cde.config.{Parameters, Config} -// class class, consumed by WithGPUTacer config and GPUTracerKey - -case class GPUTracerConfig(numLanes: Int, traceFile : String) // FIXME, add lane number and file name - -case object GPUTracerKey extends Field[Option[GPUTracerConfig]](None) - - - -// Both LazyModule of Tracer and Impl are both in Coalescing.scala - - -//The trait is attached to DigitalTop of Chipyard system, informing it indeed has the ability -//to attach GPU tracer node onto the system bus +// The trait is attached to DigitalTop of Chipyard system, informing it indeed +// has the ability to attach GPU tracer node onto the system bus trait CanHaveGPUTracer { this: BaseSubsystem => implicit val p: Parameters - //p(GPUTracerKey) is the mechnimism to pass Config's parameter down to lazymodule - p(GPUTracerKey) .map { k => - val config = p(GPUTracerKey).get - val tracer = LazyModule(new MemTraceDriver(defaultConfig, config.traceFile)(p)) - // Must use :=* to ensure the N edges from Tracer doesn't get merged into 1 when connecting to SBus + p(SIMTCoreKey).map { _ => + val config = p(SIMTCoreKey).get + val tracer = LazyModule(new MemTraceDriver(defaultConfig, config.tracefilename)(p)) + // Must use :=* to ensure the N edges from Tracer doesn't get merged into 1 + // when connecting to SBus sbus.fromPort(Some("gpu-tracer"))() :=* tracer.node } } - //This is used by Chip Level Config, the config which creates the SoC -class WithGPUTracer(numLanes: Int, traceFile : String) extends Config((site, here, up) => { - case GPUTracerKey => Some( GPUTracerConfig(numLanes, traceFile) ) -} -) - - - - +class WithGPUTracer(numLanes: Int, tracefilename: String) + extends Config((_, _, _) => { case SIMTCoreKey => + Some(SIMTCoreParams(numLanes, tracefilename)) + }) diff --git a/src/test/scala/coalescing/CoalescingUnitTest.scala b/src/test/scala/coalescing/CoalescingUnitTest.scala index 1833746..94c8a67 100644 --- a/src/test/scala/coalescing/CoalescingUnitTest.scala +++ b/src/test/scala/coalescing/CoalescingUnitTest.scala @@ -2,10 +2,12 @@ package freechips.rocketchip.tilelink.coalescing import chisel3._ import chiseltest._ +import chiseltest.simulator.VerilatorFlags import org.scalatest.flatspec.AnyFlatSpec import freechips.rocketchip.tilelink._ import freechips.rocketchip.util.MultiPortQueue import freechips.rocketchip.diplomacy._ +import freechips.rocketchip.subsystem.WithoutTLMonitors import org.chipsalliance.cde.config.Parameters import chisel3.util.{DecoupledIO, Valid} import chisel3.util.experimental.BoringUtils @@ -190,8 +192,8 @@ object testConfig extends CoalescerConfig( respQueueDepth = 4, coalLogSizes = Seq(4, 5), sizeEnum = DefaultInFlightTableSizeEnum, - numArbiterOutputPorts = 4, numCoalReqs = 1, + numArbiterOutputPorts = 4, bankStrideInBytes = 64 ) @@ -229,8 +231,8 @@ class CoalescerUnitTest extends AnyFlatSpec with ChiselScalatestTester { } it should "coalesce fully consecutive accesses at size 4, only once" in { - test(LazyModule(new DummyCoalescingUnitTB()).module) - .withAnnotations(Seq(VerilatorBackendAnnotation, WriteFstAnnotation)) + test(LazyModule(new DummyCoalescingUnitTB()(new WithoutTLMonitors())).module) + .withAnnotations(Seq(VerilatorBackendAnnotation, VerilatorFlags(Seq("--coverage-line")), WriteFstAnnotation)) // .withAnnotations(Seq(VcsBackendAnnotation, WriteFsdbAnnotation)) { c => val nodes = c.coalIOs.map(_.head) @@ -291,8 +293,7 @@ class CoalescerUnitTest extends AnyFlatSpec with ChiselScalatestTester { } it should "coalesce identical addresses (stride of 0)" in { - test(LazyModule(new DummyCoalescingUnitTB()).module) -// .withAnnotations(Seq(VcsBackendAnnotation)) + test(LazyModule(new DummyCoalescingUnitTB()(new WithoutTLMonitors())).module) .withAnnotations(Seq(VerilatorBackendAnnotation)) { c => println(s"coalIO length = ${c.coalIOs(0).length}")