From 5e073f2dec3421b45b2fe38baab94f198bdfaca5 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Sun, 7 May 2023 12:37:33 -0700 Subject: [PATCH 01/19] Doc update --- src/main/scala/tilelink/Coalescing.scala | 78 ++++++++++++------------ 1 file changed, 38 insertions(+), 40 deletions(-) diff --git a/src/main/scala/tilelink/Coalescing.scala b/src/main/scala/tilelink/Coalescing.scala index b1cd865..05e6ab3 100644 --- a/src/main/scala/tilelink/Coalescing.scala +++ b/src/main/scala/tilelink/Coalescing.scala @@ -293,6 +293,9 @@ class CoalShiftQueue[T <: Data](gen: T, entries: Int, config: CoalescerConfig) e } } + // When doing spatial-only coalescing, queues should never drift from each + // other, i.e. the queue heads should always contain mem requests from the + // same instruction. val queueInSync = controlSignals.map(_ === controlSignals.head).reduce(_ && _) && writePtr.map(_ === writePtr.head).reduce(_ && _) assert(queueInSync, "shift queue lanes are not in sync") @@ -326,23 +329,15 @@ class MonoCoalescer(coalLogSize: Int, windowT: CoalShiftQueue[ReqQueueEntry], val leaders = io.window.elts.map(_.head) val leadersValid = io.window.mask.map(_.asBools.head) - // When doing spatial-only coalescing, queues should never drift from each - // other, i.e. the queue heads should always contain mem requests from the - // same instruction. - // FIXME: This relies on the MemTraceDriver's behavior of generating TL - // requests with full source info even when the corresponding lane is not - // active. - def testNoQueueDrift: Bool = leaders.map(_.source === leaders.head.source).reduce(_ || _) def printQueueHeads = { leaders.zipWithIndex.foreach{ case (head, i) => printf(s"ReqQueueEntry[${i}].head = v:%d, source:%d, addr:%x\n", leadersValid(i), head.source, head.address) } } - when (leadersValid.reduce(_ || _)) { - assert(testNoQueueDrift, "unexpected drift between lane request queues") - // printQueueHeads - } + // when (leadersValid.reduce(_ || _)) { + // printQueueHeads + // } val size = coalLogSize val addrMask = (((1 << config.addressWidth) - 1) - ((1 << size) - 1)).U @@ -578,11 +573,14 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends reqQueues.io.coalescable := coalescer.io.coalescable reqQueues.io.invalidate := coalescer.io.invalidate - // Per-lane request and response queues + // =========================================================================== + // Request flow + // =========================================================================== // // Override IdentityNode implementation so that we can instantiate // queues between input and output edges to buffer requests and responses. // See IdentityNode definition in `diplomacy/Nodes.scala`. + // (outer.cpuNode.in zip outer.cpuNode.out).zipWithIndex.foreach { case (((tlIn, _), (tlOut, edgeOut)), lane) => // Request queue @@ -641,11 +639,12 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends tlCoal.e.valid := false.B - // ================================================================== - // ****************************************************************** - // ************************* REORG BOUNDARY ************************* - // ****************************************************************** - // ================================================================== + // =========================================================================== + // Response flow + // =========================================================================== + // + // Connect uncoalescer output and noncoalesced response ports to the response + // queues. // The maximum number of requests from a single lane that can go into a // coalesced request. Upper bound is min(DEPTH, 2**sourceWidth). @@ -1358,27 +1357,6 @@ class MemTraceLogger( // originally requested, so no postprocessing required req.address := tlIn.a.bits.address - // TL data - // - // When tlIn.a.bits.size is smaller than the data bus width, need to - // figure out which byte lanes we actually accessed so that - // we can write that to the memory trace. - // See Section 4.5 Byte Lanes in spec 1.8.1 - - // This assert only holds true for PutFullData and not PutPartialData, - // where HIGH bits in the mask may not be contiguous. - assert( - PopCount(tlIn.a.bits.mask) === (1.U << tlIn.a.bits.size), - "mask HIGH bits do not match the TL size. This should have been handled by the TL generator logic" - ) - val trailingZerosInMask = trailingZeros(tlIn.a.bits.mask) - val dataW = tlIn.params.dataBits - val mask = ~(~(0.U(dataW.W)) << ((1.U << tlIn.a.bits.size) * 8.U)) - req.data := mask & (tlIn.a.bits.data >> (trailingZerosInMask * 8.U)) - // when (req.valid) { - // printf("trailingZerosInMask=%d, mask=%x, data=%x\n", trailingZerosInMask, mask, req.data) - // } - when(req.valid) { TLPrintf( s"MemTraceLogger (${loggerName}:downstream)", @@ -1391,6 +1369,28 @@ class MemTraceLogger( ) } + // TL data + // + // When tlIn.a.bits.size is smaller than the data bus width, need to + // figure out which byte lanes we actually accessed so that + // we can write that to the memory trace. + // See Section 4.5 Byte Lanes in spec 1.8.1 + + // This assert only holds true for PutFullData and not PutPartialData, + // where HIGH bits in the mask may not be contiguous. + assert( + PopCount(tlIn.a.bits.mask) === (1.U << tlIn.a.bits.size), + "mask HIGH popcount do not match the TL size. " + + "Partial masks are not allowed for PutFull" + ) + val trailingZerosInMask = trailingZeros(tlIn.a.bits.mask) + val dataW = tlIn.params.dataBits + val mask = ~(~(0.U(dataW.W)) << ((1.U << tlIn.a.bits.size) * 8.U)) + req.data := mask & (tlIn.a.bits.data >> (trailingZerosInMask * 8.U)) + // when (req.valid) { + // printf("trailingZerosInMask=%d, mask=%x, data=%x\n", trailingZerosInMask, mask, req.data) + // } + // responses on TL D channel // resp.valid := tlOut.d.valid @@ -1813,6 +1813,4 @@ class CoalArbiterImpl(outer: CoalArbiter, val coalResp = Decoupled(respCoalBundleT) } ) - - } From a6dbfc39010cd5449467ff9f901aebf668bad72d Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Sun, 7 May 2023 16:12:59 -0700 Subject: [PATCH 02/19] Fix config for unittest --- src/test/scala/coalescing/CoalescingUnitTest.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/test/scala/coalescing/CoalescingUnitTest.scala b/src/test/scala/coalescing/CoalescingUnitTest.scala index 2d31711..1f8c849 100644 --- a/src/test/scala/coalescing/CoalescingUnitTest.scala +++ b/src/test/scala/coalescing/CoalescingUnitTest.scala @@ -190,8 +190,8 @@ object testConfig extends CoalescerConfig( respQueueDepth = 4, coalLogSizes = Seq(4, 5), sizeEnum = DefaultInFlightTableSizeEnum, - numArbiterOutputPorts = 4, numCoalReqs = 1, + numArbiterOutputPorts = 4, bankStrideInBytes = 64 ) @@ -292,7 +292,7 @@ class CoalescerUnitTest extends AnyFlatSpec with ChiselScalatestTester { it should "coalesce identical addresses (stride of 0)" in { test(LazyModule(new DummyCoalescingUnitTB()).module) - .withAnnotations(Seq(VcsBackendAnnotation)) + .withAnnotations(Seq(VerilatorBackendAnnotation)) { c => println(s"coalIO length = ${c.coalIOs(0).length}") val nodes = c.coalIOs.map(_.head) From c75eaaf7272ac6ef0f0f821b3e1f6d356f6e2490 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Sun, 7 May 2023 16:13:28 -0700 Subject: [PATCH 03/19] Backport SimMemTrace --- src/main/resources/csrc/SimMemTrace.cc | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/main/resources/csrc/SimMemTrace.cc b/src/main/resources/csrc/SimMemTrace.cc index 08540bc..aed11a1 100644 --- a/src/main/resources/csrc/SimMemTrace.cc +++ b/src/main/resources/csrc/SimMemTrace.cc @@ -23,6 +23,7 @@ MemTraceReader::MemTraceReader(const std::string &filename) infile.open(filename); if (infile.fail()) { fprintf(stderr, "failed to open file %s\n", filename.c_str()); + exit(EXIT_FAILURE); } } @@ -60,8 +61,6 @@ void MemTraceReader::parse(const bool has_source) { } if (!(infile >> line.cycle >> loadstore >> line.core_id >> line.lane_id)) { - printf("char=[%c]\n", infile.peek()); - // assert(!infile.eof()); error(fileline, "failed parsing cycle..lane_id"); } if (has_source && !(infile >> source)) { @@ -101,8 +100,6 @@ MemTraceLine MemTraceReader::read_trace_at(const long cycle, const int lane_id, MemTraceLine line; line.valid = false; - // printf("tick(): cycle=%ld\n", cycle); - if (finished()) { return line; } From 15889d766770f36b12dfb5b725634ba6c3f3d1bc Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Sun, 7 May 2023 19:09:25 -0700 Subject: [PATCH 04/19] Take filename from Configs for easier trace testing --- src/main/scala/tilelink/Coalescing.scala | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/main/scala/tilelink/Coalescing.scala b/src/main/scala/tilelink/Coalescing.scala index 05e6ab3..3d4639d 100644 --- a/src/main/scala/tilelink/Coalescing.scala +++ b/src/main/scala/tilelink/Coalescing.scala @@ -233,11 +233,13 @@ class CoalShiftQueue[T <: Data](gen: T, entries: Int, config: CoalescerConfig) e })) // shift hint is when the heads have no more coalescable left this or next cycle - val shiftHint = !(io.coalescable zip io.invalidate.bits.map(_(0))).map { case (c, i) => - c && !(io.invalidate.valid && i) + val shiftHint = !(io.coalescable zip io.invalidate.bits.map(_(0))).map { case (c, inv) => + c && !(io.invalidate.valid && inv) }.reduce(_ || _) val syncedEnqValid = io.queue.enq.map(_.valid).reduce(_ || _) - val syncedDeqValid = io.queue.deq.map(x => x.valid && !x.ready).reduce(_ || _) // valid and not fire + // syncedDeqValidNextCycle being true means the arbiter has completed + // processing all of the ready-to-go requests. + val syncedDeqValidNextCycle = io.queue.deq.map(x => x.valid && !x.ready).reduce(_ || _) // valid and not fire for (i <- 0 until config.numLanes) { val enq = io.queue.enq(i) @@ -247,7 +249,7 @@ class CoalShiftQueue[T <: Data](gen: T, entries: Int, config: CoalescerConfig) e ctrl.full := writePtr(i) === entries.U ctrl.empty := writePtr(i) === 0.U // shift when no outstanding dequeue, no more coalescable chunks, and not empty - ctrl.shift := !syncedDeqValid && shiftHint && !ctrl.empty + ctrl.shift := !syncedDeqValidNextCycle && shiftHint && !ctrl.empty // dequeue is valid when: // head entry is valid, has not been processed by downstream, and is not coalescable @@ -1629,10 +1631,7 @@ class DummyCoalescerTest(timeout: Int = 500000)(implicit p: Parameters) } // tracedriver --> coalescer --> tracelogger --> tlram -class TLRAMCoalescerLogger(implicit p: Parameters) extends LazyModule { - // val filename = "test.trace" - val filename = "vecadd.core1.thread4.trace" - // val filename = "nvbit.vecadd.n100000.filter_sm0.trace" +class TLRAMCoalescerLogger(filename: String)(implicit p: Parameters) extends LazyModule { // TODO: use parameters for numLanes val numLanes = defaultConfig.numLanes @@ -1674,13 +1673,14 @@ class TLRAMCoalescerLogger(implicit p: Parameters) extends LazyModule { (coreSideLogger.module.io.reqBytes === coreSideLogger.module.io.respBytes), "FAIL: requests and responses traffic to the coalescer do not match" ) + printf("SUCCESS: coalescer response traffic matched requests!\n") } } } -class TLRAMCoalescerLoggerTest(timeout: Int = 500000)(implicit p: Parameters) +class TLRAMCoalescerLoggerTest(filename: String, timeout: Int = 500000)(implicit p: Parameters) extends UnitTest(timeout) { - val dut = Module(LazyModule(new TLRAMCoalescerLogger).module) + val dut = Module(LazyModule(new TLRAMCoalescerLogger(filename)).module) dut.io.start := io.start io.finished := dut.io.finished } From 737a760fcd46a4e7a25ea3b7f25682ebff8a9a5e Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Sun, 7 May 2023 22:58:01 -0700 Subject: [PATCH 05/19] Enable coverage tests for chiseltest --- src/test/scala/coalescing/CoalescingUnitTest.scala | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/test/scala/coalescing/CoalescingUnitTest.scala b/src/test/scala/coalescing/CoalescingUnitTest.scala index 1f8c849..08db521 100644 --- a/src/test/scala/coalescing/CoalescingUnitTest.scala +++ b/src/test/scala/coalescing/CoalescingUnitTest.scala @@ -2,6 +2,7 @@ package freechips.rocketchip.tilelink.coalescing import chisel3._ import chiseltest._ +import chiseltest.simulator.VerilatorFlags import org.scalatest.flatspec.AnyFlatSpec import freechips.rocketchip.tilelink._ import freechips.rocketchip.util.MultiPortQueue @@ -230,7 +231,7 @@ class CoalescerUnitTest extends AnyFlatSpec with ChiselScalatestTester { it should "coalesce fully consecutive accesses at size 4, only once" in { test(LazyModule(new DummyCoalescingUnitTB()).module) - .withAnnotations(Seq(VerilatorBackendAnnotation, WriteFstAnnotation)) + .withAnnotations(Seq(VerilatorBackendAnnotation, VerilatorFlags(Seq("--coverage-line")), WriteFstAnnotation)) // .withAnnotations(Seq(VcsBackendAnnotation, WriteFsdbAnnotation)) { c => val nodes = c.coalIOs.map(_.head) From ba600db7e49bed2f987418c3cf570ed5c2f9b325 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Sun, 7 May 2023 23:54:49 -0700 Subject: [PATCH 06/19] Backport SimMemTrace fix --- src/main/resources/csrc/SimMemTrace.cc | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/src/main/resources/csrc/SimMemTrace.cc b/src/main/resources/csrc/SimMemTrace.cc index aed11a1..6c08858 100644 --- a/src/main/resources/csrc/SimMemTrace.cc +++ b/src/main/resources/csrc/SimMemTrace.cc @@ -22,7 +22,8 @@ MemTraceReader::MemTraceReader(const std::string &filename) infile.open(filename); if (infile.fail()) { - fprintf(stderr, "failed to open file %s\n", filename.c_str()); + fprintf(stderr, "MemTraceReader: error: failed to open file %s\n", + filename.c_str()); exit(EXIT_FAILURE); } } @@ -109,7 +110,11 @@ MemTraceLine MemTraceReader::read_trace_at(const long cycle, const int lane_id, // the next line is in the future. if (line.cycle < cycle) { long fileline = read_pos - std::cbegin(trace_buf) + 1; - error(fileline, "some trace lines are left unread in the past"); + error(fileline, "some trace lines are left unread in the past. " + "Tried cycle=" + + std::to_string(cycle) + + ", found line.cycle=" + std::to_string(line.cycle) + + ". Is NUM_LANES set correctly?"); return MemTraceLine{}; } @@ -131,14 +136,17 @@ MemTraceLine MemTraceReader::read_trace_at(const long cycle, const int lane_id, // monotonically increment read_pos. lane_id need not be contiguous, e.g. // 0->1->3 is fine. ++read_pos; - return line; } else { // For debugging purposes, instead of early-returning on // !trace_read_ready, print something to notify we are blocking a valid // trace line. printf("All Lanes Blocked on this cycle! cycle=%ld \n", cycle); - return MemTraceLine{}; } + // We want to return valid line regardless of `trace_read_ready` or not, + // because we want to let the driver know that it missed a valid line at the + // given cycle, so that it holds its cycle counter and safely reads back the + // line in the future. + return line; } assert(!"unreachable"); From f7df5045d4e31b5b7f3e6ea4bce4d8441bcc5c4a Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Sun, 7 May 2023 23:55:54 -0700 Subject: [PATCH 07/19] Respect downstream TL A ready in MemTraceDriver --- src/main/scala/tilelink/Coalescing.scala | 61 +++++++++--------------- 1 file changed, 23 insertions(+), 38 deletions(-) diff --git a/src/main/scala/tilelink/Coalescing.scala b/src/main/scala/tilelink/Coalescing.scala index 3d4639d..4895cc0 100644 --- a/src/main/scala/tilelink/Coalescing.scala +++ b/src/main/scala/tilelink/Coalescing.scala @@ -1078,24 +1078,18 @@ class TraceLine extends Bundle with HasTraceLine { class MemTraceDriverImp(outer: MemTraceDriver, config: CoalescerConfig, traceFile: String) extends LazyModuleImp(outer) with UnitTestModule { + // Current cycle mark to read from trace + val traceReadCycle = RegInit(1.U(64.W)) - val globalClkCounter = RegInit(1.U(64.W)) - val traceReadCycle = RegInit(1.U(64.W)) - val downstreamSQready = WireInit(true.B) + // If any of the downstream lane is not ready, hold on from advancing + val downstreamReady = outer.laneNodes.map(_.out(0)._1.a.ready).reduce(_ && _) - //make the downstream only ready 1/4 of the time - //This is to test Tracer System's ability to hold on requests - //FIXME - downstreamSQready := (globalClkCounter(1,0) =/= 0.U) - //Connect Signals to Verilog BlackBox val sim = Module(new SimMemTrace(traceFile, config.numLanes)) sim.io.clock := clock sim.io.reset := reset.asBool - sim.io.trace_read.ready := downstreamSQready - //FIXME - 1.U hardcoded, currently there is a delay between chisel and verilog + sim.io.trace_read.ready := downstreamReady sim.io.trace_read.cycle := traceReadCycle - // Read output from Verilog BlackBox // Split output of SimMemTrace, which is flattened across all lanes,back to each lane's. val laneReqs = Wire(Vec(config.numLanes, new TraceLine)) @@ -1104,26 +1098,30 @@ class MemTraceDriverImp(outer: MemTraceDriver, config: CoalescerConfig, traceFil val dataW = laneReqs(0).data.getWidth laneReqs.zipWithIndex.foreach { case (req, i) => req.valid := sim.io.trace_read.valid(i) - // TODO: driver trace doesn't contain source id - req.source := 0.U + req.source := 0.U // driver trace doesn't contain source id req.address := sim.io.trace_read.address(addrW * (i + 1) - 1, addrW * i) req.is_store := sim.io.trace_read.is_store(i) req.size := sim.io.trace_read.size(sizeW * (i + 1) - 1, sizeW * i) req.data := sim.io.trace_read.data(dataW * (i + 1) - 1, dataW * i) } - globalClkCounter := globalClkCounter + 1.U - val existValidReq = WireInit(false.B) - existValidReq := laneReqs.map(_.valid).reduce(_||_) - val validReqBlocked = WireInit(false.B) - validReqBlocked := !downstreamSQready && existValidReq - //Debug - dontTouch(downstreamSQready) - dontTouch(existValidReq) - dontTouch(validReqBlocked) - // Do Not Update TraceReadCycle if downstream is blocking - when(!validReqBlocked){ - traceReadCycle := traceReadCycle + 1.U + def missedLine = { + val existsValidLine = WireInit(false.B) + existsValidLine := laneReqs.map(_.valid).reduce(_||_) + val missedLine = WireInit(false.B) + missedLine := !downstreamReady && existsValidLine + + // Debug + dontTouch(downstreamReady) + dontTouch(existsValidLine) + dontTouch(missedLine) + + missedLine + } + // Do not increment trace read cycle if we didn't fire a valid line because + // downstream was blocking. This prevents missing any line in the trace. + when (!missedLine){ + traceReadCycle := traceReadCycle + 1.U } // To prevent collision of sourceId with a current in-flight message, @@ -1158,19 +1156,6 @@ class MemTraceDriverImp(outer: MemTraceDriver, config: CoalescerConfig, traceFil val wordAlignedAddress = req.address & ~((1 << log2Ceil(config.wordSizeInBytes)) - 1).U(addrW.W) val wordAlignedSize = Mux(subword, 2.U, req.size) - // when(req.valid && subword) { - // printf( - // "address=%x, size=%d, data=%x, addressMask=%x, wordAlignedAddress=%x, mask=%x, wordData=%x\n", - // req.address, - // req.size, - // req.data, - // ~((1 << log2Ceil(config.WORD_SIZE)) - 1).U(addrW.W), - // wordAlignedAddress, - // mask, - // wordData - // ) - // } - val (tlOut, edge) = node.out(0) val (plegal, pbits) = edge.Put( fromSource = sourceIdCounter, From 99d1e45a49baea628580c8349b04a0bdd0c74d68 Mon Sep 17 00:00:00 2001 From: Vamber Yang Date: Mon, 8 May 2023 00:05:16 -0700 Subject: [PATCH 08/19] CoalArbiter RTL outgoing side implementation --- src/main/scala/tilelink/Coalescing.scala | 148 ++++++++++++++++++++++- 1 file changed, 147 insertions(+), 1 deletion(-) diff --git a/src/main/scala/tilelink/Coalescing.scala b/src/main/scala/tilelink/Coalescing.scala index b1cd865..84e1641 100644 --- a/src/main/scala/tilelink/Coalescing.scala +++ b/src/main/scala/tilelink/Coalescing.scala @@ -1814,5 +1814,151 @@ class CoalArbiterImpl(outer: CoalArbiter, } ) + //Helper Class & Method Section + + //Provide an decoupled interface between bundle of 2 different type + class ConverterTunnel[T <: Data, U <: Data]( + genA: T, + genB: U, + conversionFn: T => U + ) extends Module { + val io = IO(new Bundle { + val in = Flipped(Decoupled(genA.cloneType)) + val out = Decoupled(genB.cloneType) + }) + io.in.ready := io.out.ready + io.out.valid := io.in.valid + io.out.bits := conversionFn(io.in.bits) + } + + + def canHitBank(addr: UInt, bankNum: UInt) : Bool = { + val byteOffset = 3 + val bankBase = log2Ceil(config.bankStrideInBytes) + val bankOffset = log2Ceil(config.numArbiterOutputPorts) + (addr(bankBase+bankOffset-byteOffset, bankBase - byteOffset) === bankNum) + } + + //This Operation Could be Expensive + def toGlobalSourceId(isCoalReq : Bool, laneIndex : UInt, sourceID : UInt) : UInt = { + val res = Mux(isCoalReq, + config.numNewSrcIds.U * laneIndex + sourceID, + config.numOldSrcIds.U * laneIndex + sourceID + config.numNewSrcIds.U * config.numCoalReqs.U + ) + res + } + + // + val fullSourceIdRange = config.numOldSrcIds * config.numLanes + config.numNewSrcIds * config.numCoalReqs + + + val nonCoalGiDEntryT = new ReqQueueEntry( + log2Ceil(fullSourceIdRange), + config.wordWidth, + config.addressWidth, + log2Ceil(config.wordSizeInBytes) + ) + // Before a nonCoalesced request enter RR arbiter + // It needs to turn its source into global source id + // Unfortunately this involves extending the width of sourceid field, and a new bundle must be created + // This is a higher order function + def nonCoal2gidReqFn(laneIndex : UInt) + : ReqQueueEntry => ReqQueueEntry = { + def func(lid_req : ReqQueueEntry) : ReqQueueEntry = { + val gid_req = nonCoalGiDEntryT.cloneType + gid_req <> lid_req + gid_req.source := toGlobalSourceId(false.B, laneIndex, lid_req.source) + gid_req + } + func + } + + def nonCoal2TLAFn(edgeOut: TLEdgeOut) : ReqQueueEntry => TLBundleA = { + def func(gid_req : ReqQueueEntry) : TLBundleA = { + gid_req.toTLA(edgeOut) + } + func + } + + ///////////////////////////////////////////////////// + //HDL Implementation Section + ///////////////////////////////////////////////////// + + //Stage 1: Create Queue for nonCoalReqs and CoalReqs + val nonCoalReqsQueues = Seq.tabulate(config.numLanes){_=> + Module(new Queue(nonCoalEntryT.cloneType, 1, true, false)) + } + val coalReqsQueues = Seq.tabulate(config.numCoalReqs){_=> + Module(new Queue(coalEntryT.cloneType, 1, true, false)) + } + //Stage 1a: connect two Queue groups to the input + (io.nonCoalReqs zip nonCoalReqsQueues).foreach{ + case (req, q) => q.io.enq <> req + } + (io.coalReqs zip coalReqsQueues).foreach{ + case (req, q) => q.io.enq <> req + } + //Stage 1b: connect output of Queues to the RR arbiters (each arbiter is for a unique bank) + val nonCoalRRArbiters = Seq.tabulate(config.numArbiterOutputPorts){_=> + Module(new RRArbiter(nonCoalGiDEntryT.cloneType, config.numLanes)) + } + nonCoalReqsQueues.zipWithIndex.foreach{ case(q, q_idx) => + nonCoalRRArbiters.zipWithIndex.foreach{ case(arb, arb_idx) => + val nonCoal2gidFunc = nonCoal2gidReqFn(q_idx.U) + val nonCoalRRArbTunnel = Module(new ConverterTunnel( + coalEntryT.cloneType, + nonCoalGiDEntryT.cloneType, + nonCoal2gidFunc) + ) + nonCoalRRArbTunnel.io.in <> q.io.deq + arb.io.in(q_idx) <> nonCoalRRArbTunnel.io.out + //OverWrite Valid base on if we can actually hit this bank + arb.io.in(q_idx).valid := canHitBank(nonCoalRRArbTunnel.io.out.bits.address, arb_idx.U) + } + } + + //Stage 2, Connect the output of Arbiters to respective nonCoal node + + //Stage 2a, the K Arbiters for Coalesced + (outer.nonCoalNarrowNodes zip nonCoalRRArbiters).foreach{ + case (node, arb) => + val (tlOut, edgeOut) = node.out(0) + val coal2TLAFunc = nonCoal2TLAFn(edgeOut) + val nonCoalTLATunnel = Module(new ConverterTunnel( + arb.io.out.bits.cloneType, + tlOut.a.bits.cloneType, + coal2TLAFunc + ) + ) + nonCoalTLATunnel.io.in <> arb.io.out + tlOut.a <> nonCoalTLATunnel.io.out + } + + //Stage 3, Connect the K edges Identity Node to PO arbiter + // noncoalesced to port 1, coalesced to por1 + + val priorityArbs = Seq.tabulate(config.numArbiterOutputPorts){_=> + Module(new Arbiter(outer.outputNode.out(0)._1.a.bits.cloneType, 2)) + } + ((outer.nonCoalNode.out zip outer.coalNode.out) zip priorityArbs).foreach{ + case (((nonCoalOut, _),(coalOut, _)), arb) => + arb.io.in(1) <> nonCoalOut.a + arb.io.in(0) <> coalOut.a + } + + + //Stage 4, Connect PO arbiter to each edge of output Node + //And make idenitity node passs through the inputs + ((outer.outputNode.in zip outer.outputNode.out) zip priorityArbs).foreach{ + case (((tlIn, _), (tlOut, _)), arb) => + tlOut <> tlIn + tlIn.a <> arb.io.out + } + + } + + + + + -} From 6b97b77572918259b7f8eea311e8998a0e13270d Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Mon, 8 May 2023 00:14:48 -0700 Subject: [PATCH 09/19] Revert SimMemTrace.v to use posedge clock Doing function calls inside @(*) causes lint errors. Instead, remove staging registers to eliminate 1 cycle latency between DPI call and when output is visible to Chisel. --- src/main/resources/vsrc/SimMemTrace.v | 74 +++++---------------------- 1 file changed, 13 insertions(+), 61 deletions(-) diff --git a/src/main/resources/vsrc/SimMemTrace.v b/src/main/resources/vsrc/SimMemTrace.v index 4d630fd..fea2bed 100644 --- a/src/main/resources/vsrc/SimMemTrace.v +++ b/src/main/resources/vsrc/SimMemTrace.v @@ -39,49 +39,32 @@ module SimMemTrace #(parameter FILENAME = "undefined", NUM_LANES = 4) ( output [`DATA_WIDTH*NUM_LANES-1:0] trace_read_data, output trace_read_finished ); - bit __in_valid [NUM_LANES-1:0]; - longint __in_address [NUM_LANES-1:0]; - - bit __in_is_store [NUM_LANES-1:0]; + bit __in_valid [NUM_LANES-1:0]; + longint __in_address [NUM_LANES-1:0]; + bit __in_is_store [NUM_LANES-1:0]; reg [`LOGSIZE_WIDTH-1:0] __in_size [NUM_LANES-1:0]; - longint __in_data [NUM_LANES-1:0]; - - bit __in_finished; - string __uartlog; - - // Cycle counter that is used to query C parser whether we have a request - // coming in at the current cycle. - - - // registers that stage outputs of the C parser - reg [NUM_LANES-1:0] __in_valid_wire; - reg [`DATA_WIDTH-1:0] __in_address_wire [NUM_LANES-1:0]; - - reg [NUM_LANES-1:0] __in_is_store_wire; - reg [`LOGSIZE_WIDTH-1:0] __in_size_wire [NUM_LANES-1:0]; - reg [`DATA_WIDTH-1:0] __in_data_wire [NUM_LANES-1:0]; - reg __in_finished_wire; + longint __in_data [NUM_LANES-1:0]; + bit __in_finished; genvar g; - generate for (g = 0; g < NUM_LANES; g = g + 1) begin - assign trace_read_valid[g] = __in_valid_wire[g]; - assign trace_read_address[`DATA_WIDTH*(g+1)-1:`DATA_WIDTH*g] = __in_address_wire[g]; + assign trace_read_valid[g] = __in_valid[g]; + assign trace_read_address[`DATA_WIDTH*(g+1)-1:`DATA_WIDTH*g] = __in_address[g]; - assign trace_read_is_store[g] = __in_is_store_wire[g]; - assign trace_read_size[`LOGSIZE_WIDTH*(g+1)-1:`LOGSIZE_WIDTH*g] = __in_size_wire[g]; - assign trace_read_data[`DATA_WIDTH*(g+1)-1:`DATA_WIDTH*g] = __in_data_wire[g]; + assign trace_read_is_store[g] = __in_is_store[g]; + assign trace_read_size[`LOGSIZE_WIDTH*(g+1)-1:`LOGSIZE_WIDTH*g] = __in_size[g]; + assign trace_read_data[`DATA_WIDTH*(g+1)-1:`DATA_WIDTH*g] = __in_data[g]; end endgenerate - assign trace_read_finished = __in_finished_wire; + assign trace_read_finished = __in_finished; initial begin /* $value$plusargs("uartlog=%s", __uartlog); */ memtrace_init(FILENAME); end - always @(*) begin + always @(posedge clock) begin if (reset) begin for (integer tid = 0; tid < NUM_LANES; tid = tid + 1) begin __in_valid[tid] = 1'b0; @@ -91,32 +74,12 @@ module SimMemTrace #(parameter FILENAME = "undefined", NUM_LANES = 4) ( __in_size[tid] = `LOGSIZE_WIDTH'b0; __in_data[tid] = `DATA_WIDTH'b0; end - __in_finished = 1'b0; - - //cycle_counter <= `DATA_WIDTH'b0; - - // setting default value for register to avoid latches - for (integer tid = 0; tid < NUM_LANES; tid = tid + 1) begin - __in_valid_wire[tid] = 1'b0; - __in_address_wire[tid] = `DATA_WIDTH'b0; - - __in_is_store_wire[tid] = 1'b0; - __in_size_wire[tid] = `LOGSIZE_WIDTH'b0; - __in_data_wire[tid] = `DATA_WIDTH'b0; - end - - __in_finished_wire = 1'b0; end else begin - - // Getting values from C function into pseudeo register for (integer tid = 0; tid < NUM_LANES; tid = tid + 1) begin memtrace_query( trace_read_ready, - // Since parsed results are latched to the output on the next - // cycle due to staging registers, we need to pass in the next cycle - // to sync up. - trace_read_cycle, // the left replace next_cycle_counter, + trace_read_cycle, tid, __in_valid[tid], @@ -129,17 +92,6 @@ module SimMemTrace #(parameter FILENAME = "undefined", NUM_LANES = 4) ( __in_finished ); end - - // Connect values from pseudo register into verilog register - for (integer tid = 0; tid < NUM_LANES; tid = tid + 1) begin - __in_valid_wire[tid] = __in_valid[tid]; - __in_address_wire[tid] = __in_address[tid]; - - __in_is_store_wire[tid] = __in_is_store[tid]; - __in_size_wire[tid] = __in_size[tid]; - __in_data_wire[tid] = __in_data[tid]; - end - __in_finished_wire = __in_finished; end end endmodule From 6755cb3eeca31197f0b3ceaea6754d6393038388 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Mon, 8 May 2023 00:49:30 -0700 Subject: [PATCH 10/19] Lax traceReadCycle advancing logic Trying to advance trace cycle while downstream is blocking is tricky because DPI call is synchronous, and that gives timing difference between the line we have fired to downstream and the current cycle counter we maintain. Just stall the counter whenever downstream is not ready for now. --- src/main/scala/tilelink/Coalescing.scala | 26 +++++++++++------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/src/main/scala/tilelink/Coalescing.scala b/src/main/scala/tilelink/Coalescing.scala index 4895cc0..62c0473 100644 --- a/src/main/scala/tilelink/Coalescing.scala +++ b/src/main/scala/tilelink/Coalescing.scala @@ -1105,22 +1105,20 @@ class MemTraceDriverImp(outer: MemTraceDriver, config: CoalescerConfig, traceFil req.data := sim.io.trace_read.data(dataW * (i + 1) - 1, dataW * i) } - def missedLine = { - val existsValidLine = WireInit(false.B) - existsValidLine := laneReqs.map(_.valid).reduce(_||_) - val missedLine = WireInit(false.B) - missedLine := !downstreamReady && existsValidLine + // def missedLine = { + // val existsValidLine = WireInit(false.B) + // existsValidLine := laneReqs.map(_.valid).reduce(_||_) + // val missedLine = WireInit(false.B) + // missedLine := !downstreamReady && existsValidLine - // Debug - dontTouch(downstreamReady) - dontTouch(existsValidLine) - dontTouch(missedLine) + // // Debug + // dontTouch(downstreamReady) + // dontTouch(existsValidLine) + // dontTouch(missedLine) - missedLine - } - // Do not increment trace read cycle if we didn't fire a valid line because - // downstream was blocking. This prevents missing any line in the trace. - when (!missedLine){ + // missedLine + // } + when (downstreamReady){ traceReadCycle := traceReadCycle + 1.U } From 8cdbc81bddcd4871ee61ca754072e65143f2d2bb Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Mon, 8 May 2023 14:26:05 -0700 Subject: [PATCH 11/19] Only write in MemTraceLogger when TL fire Without this we log extraneous lines that were valid but not transacted with the downstream as it was not ready, which affects validity of memtrace testing. --- src/main/scala/tilelink/Coalescing.scala | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/src/main/scala/tilelink/Coalescing.scala b/src/main/scala/tilelink/Coalescing.scala index 62c0473..0addef1 100644 --- a/src/main/scala/tilelink/Coalescing.scala +++ b/src/main/scala/tilelink/Coalescing.scala @@ -1334,7 +1334,9 @@ class MemTraceLogger( // requests on TL A channel // - req.valid := tlIn.a.valid + // Only log trace when fired, e.g. both upstream and downstream is ready + // and transaction happened. + req.valid := tlIn.a.fire req.size := tlIn.a.bits.size req.is_store := TLUtils.AOpcodeIsStore(tlIn.a.bits.opcode) req.source := tlIn.a.bits.source @@ -1378,7 +1380,9 @@ class MemTraceLogger( // responses on TL D channel // - resp.valid := tlOut.d.valid + // Only log trace when fired, e.g. both upstream and downstream is ready + // and transaction happened. + resp.valid := tlOut.d.fire resp.size := tlOut.d.bits.size resp.is_store := TLUtils.DOpcodeIsStore(tlOut.d.bits.opcode) resp.source := tlOut.d.bits.source @@ -1412,7 +1416,7 @@ class MemTraceLogger( // // This is a clunky workaround of the fact that Chisel doesn't allow partial // assignment to a bitfield range of a wide signal. - def flattenTrace(traceLogIO: Bundle with HasTraceLine, perLane: Vec[TraceLine]) = { + def flattenTrace(simIO: Bundle with HasTraceLine, perLane: Vec[TraceLine]) = { // these will get optimized out val vecValid = Wire(Vec(numLanes, chiselTypeOf(perLane(0).valid))) val vecSource = Wire(Vec(numLanes, chiselTypeOf(perLane(0).source))) @@ -1428,12 +1432,12 @@ class MemTraceLogger( vecSize(i) := l.size vecData(i) := l.data } - traceLogIO.valid := vecValid.asUInt - traceLogIO.source := vecSource.asUInt - traceLogIO.address := vecAddress.asUInt - traceLogIO.is_store := vecIsStore.asUInt - traceLogIO.size := vecSize.asUInt - traceLogIO.data := vecData.asUInt + simIO.valid := vecValid.asUInt + simIO.source := vecSource.asUInt + simIO.address := vecAddress.asUInt + simIO.is_store := vecIsStore.asUInt + simIO.size := vecSize.asUInt + simIO.data := vecData.asUInt } if (simReq.isDefined) { From 2e219ea15a6627a8532ce1daa84a715e28776d63 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Mon, 8 May 2023 14:27:54 -0700 Subject: [PATCH 12/19] Connect CoalShiftQueue enq.ready to upstream TL.ready Now CoalShiftQueue can properly stall memtrace driver. --- src/main/scala/tilelink/Coalescing.scala | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/main/scala/tilelink/Coalescing.scala b/src/main/scala/tilelink/Coalescing.scala index 0addef1..4b4a0f6 100644 --- a/src/main/scala/tilelink/Coalescing.scala +++ b/src/main/scala/tilelink/Coalescing.scala @@ -604,7 +604,10 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends val deq = reqQueues.io.queue.deq(lane) enq.valid := tlIn.a.valid enq.bits := req - deq.ready := true.B // TODO: deq.ready should respect downstream arbiter + // TODO: deq.ready should respect downstream arbiter + deq.ready := true.B + // Stall upstream core or memtrace driver when shiftqueue is not ready + tlIn.a.ready := enq.ready tlOut.a.valid := deq.valid tlOut.a.bits := deq.bits.toTLA(edgeOut) From 54a3e3cf72de3f20bb7b24aa307af8a04b7aa9d9 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Mon, 8 May 2023 14:34:52 -0700 Subject: [PATCH 13/19] Initiate memtrace DPI only when trace_read_ready This is required because otherwise we might overwrite into the Verilog registers that contain a valid trace line that was missed by downstream when it was not ready. Basically whenever trace_read_cycle stalls, we also want to stall __in_* registers. --- src/main/resources/vsrc/SimMemTrace.v | 31 ++++++++++++++++----------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/src/main/resources/vsrc/SimMemTrace.v b/src/main/resources/vsrc/SimMemTrace.v index fea2bed..74594cb 100644 --- a/src/main/resources/vsrc/SimMemTrace.v +++ b/src/main/resources/vsrc/SimMemTrace.v @@ -76,21 +76,26 @@ module SimMemTrace #(parameter FILENAME = "undefined", NUM_LANES = 4) ( end __in_finished = 1'b0; end else begin - for (integer tid = 0; tid < NUM_LANES; tid = tid + 1) begin - memtrace_query( - trace_read_ready, - trace_read_cycle, - tid, + // We have to write to __in_ regs only when trace_read_ready, or + // otherwise we might overwrite lines that were previously valid + // but the downstream missed by being not ready. + if (trace_read_ready) begin + for (integer tid = 0; tid < NUM_LANES; tid = tid + 1) begin + memtrace_query( + trace_read_ready, + trace_read_cycle, + tid, - __in_valid[tid], - __in_address[tid], - - __in_is_store[tid], - __in_size[tid], - __in_data[tid], + __in_valid[tid], + __in_address[tid], - __in_finished - ); + __in_is_store[tid], + __in_size[tid], + __in_data[tid], + + __in_finished + ); + end end end end From 2d4e28e862ca270b3bcc4f37d8a8788cbc20fe31 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Mon, 8 May 2023 14:38:15 -0700 Subject: [PATCH 14/19] Use WithoutTLMonitors to slightly speed up chiseltests --- src/test/scala/coalescing/CoalescingUnitTest.scala | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/test/scala/coalescing/CoalescingUnitTest.scala b/src/test/scala/coalescing/CoalescingUnitTest.scala index 08db521..5a48733 100644 --- a/src/test/scala/coalescing/CoalescingUnitTest.scala +++ b/src/test/scala/coalescing/CoalescingUnitTest.scala @@ -7,6 +7,7 @@ import org.scalatest.flatspec.AnyFlatSpec import freechips.rocketchip.tilelink._ import freechips.rocketchip.util.MultiPortQueue import freechips.rocketchip.diplomacy._ +import freechips.rocketchip.subsystem.WithoutTLMonitors import org.chipsalliance.cde.config.Parameters import chisel3.util.{DecoupledIO, Valid} import chisel3.util.experimental.BoringUtils @@ -230,7 +231,7 @@ class CoalescerUnitTest extends AnyFlatSpec with ChiselScalatestTester { } it should "coalesce fully consecutive accesses at size 4, only once" in { - test(LazyModule(new DummyCoalescingUnitTB()).module) + test(LazyModule(new DummyCoalescingUnitTB()(new WithoutTLMonitors())).module) .withAnnotations(Seq(VerilatorBackendAnnotation, VerilatorFlags(Seq("--coverage-line")), WriteFstAnnotation)) // .withAnnotations(Seq(VcsBackendAnnotation, WriteFsdbAnnotation)) { c => @@ -292,7 +293,7 @@ class CoalescerUnitTest extends AnyFlatSpec with ChiselScalatestTester { } it should "coalesce identical addresses (stride of 0)" in { - test(LazyModule(new DummyCoalescingUnitTB()).module) + test(LazyModule(new DummyCoalescingUnitTB()(new WithoutTLMonitors())).module) .withAnnotations(Seq(VerilatorBackendAnnotation)) { c => println(s"coalIO length = ${c.coalIOs(0).length}") From 3fae0b2c7a4df7c8dde97878a2ad0b2adc58d525 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Mon, 8 May 2023 15:18:45 -0700 Subject: [PATCH 15/19] Use priority encoder for chooseLeaderIdx --- src/main/scala/tilelink/Coalescing.scala | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/src/main/scala/tilelink/Coalescing.scala b/src/main/scala/tilelink/Coalescing.scala index 4b4a0f6..ae4da82 100644 --- a/src/main/scala/tilelink/Coalescing.scala +++ b/src/main/scala/tilelink/Coalescing.scala @@ -372,14 +372,21 @@ class MonoCoalescer(coalLogSize: Int, windowT: CoalShiftQueue[ReqQueueEntry], .reduce(_ +& _)) val canCoalesce = matchCounts.map(_ > 1.U) - // Elect the leader out of all potential leaders that have matchCounts > 1. + // Elect the leader that has the most match counts. // TODO: potentially expensive: magnitude comparator - // Maybe choose leftmost leader (priority encoder) instead of argmax - val chosenLeaderIdx = matchCounts.zipWithIndex.map { - case (c, i) => (c, i.U) - }.reduce[(UInt, UInt)] { case ((c0, i), (c1, j)) => - (Mux(c0 >= c1, c0, c1), Mux(c0 >= c1, i, j)) - }._2 + def chooseLeaderArgMax(matchCounts: Seq[UInt]): UInt = { + matchCounts.zipWithIndex.map { + case (c, i) => (c, i.U) + }.reduce[(UInt, UInt)] { case ((c0, i), (c1, j)) => + (Mux(c0 >= c1, c0, c1), Mux(c0 >= c1, i, j)) + }._2 + } + // Elect leader by choosing the smallest-index lane that has a valid + // match, i.e. using priority encoder. + def chooseLeaderPriorityEncoder(matchCounts: Seq[UInt]): UInt = { + PriorityEncoder(matchCounts.map(_ > 1.U)) + } + val chosenLeaderIdx = chooseLeaderPriorityEncoder(matchCounts) val chosenLeader = VecInit(leaders)(chosenLeaderIdx) // matchTable for the chosen lane, but converted to a Vec[UInt] @@ -1530,7 +1537,7 @@ class DummyDriver(config: CoalescerConfig)(implicit p: Parameters) val clientParam = Seq( TLMasterParameters.v1( name = "dummy-core-node-" + i.toString, - sourceId = IdRange(0, defaultConfig.numOldSrcIds) + sourceId = IdRange(0, config.numOldSrcIds) // visibility = Seq(AddressSet(0x0000, 0xffffff)) ) ) From 9c2a55ae79e625d3d4e821ad2c5741f52c5eaa9c Mon Sep 17 00:00:00 2001 From: Vamber Yang Date: Mon, 8 May 2023 16:52:45 -0700 Subject: [PATCH 16/19] CoalArbiter RTL implementation first draft, verification WIP --- src/main/scala/tilelink/Coalescing.scala | 271 +++++++++++++++++++---- 1 file changed, 223 insertions(+), 48 deletions(-) diff --git a/src/main/scala/tilelink/Coalescing.scala b/src/main/scala/tilelink/Coalescing.scala index 84e1641..90237b7 100644 --- a/src/main/scala/tilelink/Coalescing.scala +++ b/src/main/scala/tilelink/Coalescing.scala @@ -810,6 +810,12 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends class CoalescedResponseBundle(config: CoalescerConfig) extends Bundle { val source = UInt(log2Ceil(config.numNewSrcIds).W) val data = UInt((8 * (1 << config.maxCoalLogSize)).W) + + def fromTLD(bundle:TLBundleD): Unit = { + this.source := bundle.source + this.data := bundle.data + } + } class Uncoalescer(config: CoalescerConfig) extends Module { @@ -1766,12 +1772,20 @@ class CoalArbiter(config: CoalescerConfig) (implicit p: Parameters) extends Lazy coalNode := coalReqNode ) - + //Assertion Section + def isPowerOfTwo(n: Int): Boolean = { + (n > 0) && ((n & (n - 1)) == 0) + } + assert(isPowerOfTwo(config.numOldSrcIds), "Number of old source id must be power of 2") + assert(isPowerOfTwo(config.numNewSrcIds), "Number of new source id must be power of 2") + //Below is for efficient conversion from Global to Local bits + //Also, we should have more source id for coalesced request for better perf + assert(config.numNewSrcIds >= config.numOldSrcIds, "new source id must be equal or greater than old source id") // 1 Final Output Identity Node val outputNode = TLIdentityNode() - //Explictly define I/O bundule tyoe + val nonCoalEntryT = new ReqQueueEntry( log2Ceil(config.numOldSrcIds), config.wordWidth, @@ -1791,10 +1805,13 @@ class CoalArbiter(config: CoalescerConfig) (implicit p: Parameters) extends Lazy ) val respCoalBundleT = new CoalescedResponseBundle(config) - + + lazy val module = new CoalArbiterImpl( this, config, nonCoalEntryT, coalEntryT, respNonCoalEntryT, respCoalBundleT) + + } class CoalArbiterImpl(outer: CoalArbiter, @@ -1816,7 +1833,7 @@ class CoalArbiterImpl(outer: CoalArbiter, //Helper Class & Method Section - //Provide an decoupled interface between bundle of 2 different type + //Provide an simple decoupled interface between bundle of 2 different type class ConverterTunnel[T <: Data, U <: Data]( genA: T, genB: U, @@ -1833,19 +1850,36 @@ class CoalArbiterImpl(outer: CoalArbiter, def canHitBank(addr: UInt, bankNum: UInt) : Bool = { - val byteOffset = 3 - val bankBase = log2Ceil(config.bankStrideInBytes) - val bankOffset = log2Ceil(config.numArbiterOutputPorts) - (addr(bankBase+bankOffset-byteOffset, bankBase - byteOffset) === bankNum) + val byteOffset = 3 + val bankBase = log2Ceil(config.bankStrideInBytes) + val bankOffset = log2Ceil(config.numArbiterOutputPorts) + (addr(bankBase+bankOffset-byteOffset, bankBase - byteOffset) === bankNum) } //This Operation Could be Expensive - def toGlobalSourceId(isCoalReq : Bool, laneIndex : UInt, sourceID : UInt) : UInt = { - val res = Mux(isCoalReq, - config.numNewSrcIds.U * laneIndex + sourceID, - config.numOldSrcIds.U * laneIndex + sourceID + config.numNewSrcIds.U * config.numCoalReqs.U - ) - res + def toGlobalSourceId(isCoalReq : Bool, laneIdx : UInt, sourceID : UInt) : UInt = { + val gid = Mux(isCoalReq, + config.numNewSrcIds.U * laneIdx + sourceID, + config.numOldSrcIds.U * laneIdx + sourceID + config.numNewSrcIds.U * config.numCoalReqs.U + ) + gid + } + //All the ids are power of 2, so we can just look at bottom bits + def toLocalSourceId(isCoalReq : Bool, sourceID : UInt) : UInt = { + val sid = Mux(isCoalReq, + sourceID(log2Ceil(config.numNewSrcIds)-1, 0), + sourceID(log2Ceil(config.numOldSrcIds)-1, 0) + ) + sid + } + def belongsToLane(laneIdx: UInt, gid: UInt) : Bool = { + val base = config.numNewSrcIds.U * config.numCoalReqs.U + ((gid >= base + config.numOldSrcIds.U * laneIdx) && + (gid < base + config.numOldSrcIds.U * (laneIdx+1.U))) + } + + def isCoalReq(gid : UInt) : Bool = { + gid <= config.numNewSrcIds.U * config.numCoalReqs.U } // @@ -1858,26 +1892,52 @@ class CoalArbiterImpl(outer: CoalArbiter, config.addressWidth, log2Ceil(config.wordSizeInBytes) ) - // Before a nonCoalesced request enter RR arbiter + val coalGiDEntryT = new ReqQueueEntry( + log2Ceil(fullSourceIdRange), + log2Ceil(config.maxCoalLogSize), + config.addressWidth, + config.maxCoalLogSize //already log 2 + ) + + // Before either a coalesced or non coalesced request enter RR arbiter // It needs to turn its source into global source id // Unfortunately this involves extending the width of sourceid field, and a new bundle must be created // This is a higher order function - def nonCoal2gidReqFn(laneIndex : UInt) - : ReqQueueEntry => ReqQueueEntry = { - def func(lid_req : ReqQueueEntry) : ReqQueueEntry = { - val gid_req = nonCoalGiDEntryT.cloneType - gid_req <> lid_req - gid_req.source := toGlobalSourceId(false.B, laneIndex, lid_req.source) - gid_req - } - func + def reqEntry2GidReqFn(laneIndex : UInt, reqEntryT : ReqQueueEntry, isCoalReq : Bool) : ReqQueueEntry => ReqQueueEntry = { + def func(lid_req : ReqQueueEntry) : ReqQueueEntry = { + val gid_req = reqEntryT.cloneType + gid_req <> lid_req + gid_req.source := toGlobalSourceId(isCoalReq, laneIndex, lid_req.source) + gid_req + } + func } - def nonCoal2TLAFn(edgeOut: TLEdgeOut) : ReqQueueEntry => TLBundleA = { - def func(gid_req : ReqQueueEntry) : TLBundleA = { - gid_req.toTLA(edgeOut) - } - func + + def reqEntry2TLAFn(edgeOut: TLEdgeOut) : ReqQueueEntry => TLBundleA = { + def func(gid_req : ReqQueueEntry) : TLBundleA = { + gid_req.toTLA(edgeOut) + } + func + } + + def tlD2respEntryFn() : TLBundleD => RespQueueEntry = { + def func(bundle: TLBundleD) : RespQueueEntry = { + val resp = Wire(respNonCoalEntryT) + resp.fromTLD(bundle) + resp.source := toLocalSourceId(false.B, bundle.source) + resp + } + func + } + def tlD2CoalBundleFn() : TLBundleD => CoalescedResponseBundle = { + def func(bundle: TLBundleD) : CoalescedResponseBundle = { + val coalbundle = Wire(respCoalBundleT) + coalbundle.fromTLD(bundle) + coalbundle.source := toLocalSourceId(true.B, bundle.source) + coalbundle + } + func } ///////////////////////////////////////////////////// @@ -1899,47 +1959,80 @@ class CoalArbiterImpl(outer: CoalArbiter, case (req, q) => q.io.enq <> req } //Stage 1b: connect output of Queues to the RR arbiters (each arbiter is for a unique bank) + // the two loops below could be merged into one loop, but separated for readability val nonCoalRRArbiters = Seq.tabulate(config.numArbiterOutputPorts){_=> Module(new RRArbiter(nonCoalGiDEntryT.cloneType, config.numLanes)) } nonCoalReqsQueues.zipWithIndex.foreach{ case(q, q_idx) => nonCoalRRArbiters.zipWithIndex.foreach{ case(arb, arb_idx) => - val nonCoal2gidFunc = nonCoal2gidReqFn(q_idx.U) + val nonCoal2gidFunc = reqEntry2GidReqFn(q_idx.U, nonCoalGiDEntryT, false.B) val nonCoalRRArbTunnel = Module(new ConverterTunnel( - coalEntryT.cloneType, + nonCoalEntryT.cloneType, nonCoalGiDEntryT.cloneType, nonCoal2gidFunc) ) nonCoalRRArbTunnel.io.in <> q.io.deq arb.io.in(q_idx) <> nonCoalRRArbTunnel.io.out //OverWrite Valid base on if we can actually hit this bank - arb.io.in(q_idx).valid := canHitBank(nonCoalRRArbTunnel.io.out.bits.address, arb_idx.U) + arb.io.in(q_idx).valid := canHitBank(nonCoalRRArbTunnel.io.out.bits.address, arb_idx.U) && + nonCoalRRArbTunnel.io.out.valid } } + val coalRRArbiters = Seq.tabulate(config.numArbiterOutputPorts){_=> + Module(new RRArbiter(coalGiDEntryT.cloneType, config.numCoalReqs)) + } + coalReqsQueues.zipWithIndex.foreach{ case(q, q_idx) => + coalRRArbiters.zipWithIndex.foreach{ case(arb, arb_idx) => + val coal2gidFunc = reqEntry2GidReqFn(q_idx.U, coalGiDEntryT, true.B) + val coalRRArbTunnel = Module(new ConverterTunnel( + coalEntryT.cloneType, + coalGiDEntryT.cloneType, + coal2gidFunc) + ) + coalRRArbTunnel.io.in <> q.io.deq + arb.io.in(q_idx) <> coalRRArbTunnel.io.out + //OverWrite Valid + arb.io.in(q_idx).valid := canHitBank(coalRRArbTunnel.io.out.bits.address, arb_idx.U) && + coalRRArbTunnel.io.out.valid + } + } + //Stage 2, Connect the output of Arbiters to respective nonCoal node - - //Stage 2a, the K Arbiters for Coalesced - (outer.nonCoalNarrowNodes zip nonCoalRRArbiters).foreach{ - case (node, arb) => - val (tlOut, edgeOut) = node.out(0) - val coal2TLAFunc = nonCoal2TLAFn(edgeOut) - val nonCoalTLATunnel = Module(new ConverterTunnel( - arb.io.out.bits.cloneType, - tlOut.a.bits.cloneType, - coal2TLAFunc + + // Concatenate the nodes , concatenates the arbiters, and zip them together, then loop + // the reqEntry2TLA will generate different TLA bundle depending on if the Req is coal or non coal + ((outer.nonCoalNarrowNodes++outer.coalReqNodes) zip + (nonCoalRRArbiters++coalRRArbiters)).foreach{ + case (node, arb) => + val (tlOut, edgeOut) = node.out(0) + val coal2TLAFunc = reqEntry2TLAFn(edgeOut) + val nonCoalTLATunnel = Module(new ConverterTunnel( + arb.io.out.bits.cloneType, + tlOut.a.bits.cloneType, + coal2TLAFunc + ) ) - ) - nonCoalTLATunnel.io.in <> arb.io.out - tlOut.a <> nonCoalTLATunnel.io.out + nonCoalTLATunnel.io.in <> arb.io.out + tlOut.a <> nonCoalTLATunnel.io.out } + - //Stage 3, Connect the K edges Identity Node to PO arbiter - // noncoalesced to port 1, coalesced to por1 + //Stage 3, Make the Idenity node pass through channel A + // Connect the K edges Identity Node to PO arbiter + // noncoalesced to port 1, coalesced to port 0 val priorityArbs = Seq.tabulate(config.numArbiterOutputPorts){_=> Module(new Arbiter(outer.outputNode.out(0)._1.a.bits.cloneType, 2)) } + + //Make both Idenity node Pass Through Channel A, for both Coal and NonCoal + ((outer.nonCoalNode.out ++ outer.coalNode.out) zip + (outer.nonCoalNode.in ++ outer.coalNode.in)).foreach{ + case ((tlOut,_),(tlIn,_)) => + tlOut.a <> tlIn.a + } + //Connection to PO Arbiters ((outer.nonCoalNode.out zip outer.coalNode.out) zip priorityArbs).foreach{ case (((nonCoalOut, _),(coalOut, _)), arb) => arb.io.in(1) <> nonCoalOut.a @@ -1951,10 +2044,92 @@ class CoalArbiterImpl(outer: CoalArbiter, //And make idenitity node passs through the inputs ((outer.outputNode.in zip outer.outputNode.out) zip priorityArbs).foreach{ case (((tlIn, _), (tlOut, _)), arb) => - tlOut <> tlIn + tlOut.a <> tlIn.a tlIn.a <> arb.io.out } + + + //////////////// + // Incoming Data Handling + + //Stage 1, Forward data from output node to the Idenity node of Coal and NonCoal + // while setting the correct valid signal to base on if the request is Coalesced or not + + ((outer.outputNode.in zip outer.outputNode.out) zip + (outer.nonCoalNode.out zip outer.coalNode.out)).foreach{ + case( ((tlIn, _),(tlOut, _)), ((nonCoalOut, _),(coalOut, _)) ) => + tlIn.d <> tlOut.d + nonCoalOut.d <> tlIn.d + coalOut.d <> tlIn.d + //rewrite valid signal + nonCoalOut.d.valid := !isCoalReq(tlIn.d.bits.source) && tlIn.d.valid + coalOut.d.valid := isCoalReq(tlIn.d.bits.source) && tlIn.d.valid + } + + //Stage 2, Make both Idenity node Pass Through Channel D, for both Coal and NonCoal + // + ((outer.nonCoalNode.out ++ outer.coalNode.out) zip + (outer.nonCoalNode.in ++ outer.coalNode.in)).foreach{ + case ((tlOut,_),(tlIn,_)) => + tlIn.d <> tlOut.d + } + + //Stage 3, Connect the channel D of nonCoalNodes to the perLane arbiters + + //Stage 3a, connect the noncoalesced edge to every single perlane arbiter + val perLaneRespRRArbs = Seq.tabulate(config.numLanes){_=> + Module(new RRArbiter(respNonCoalEntryT.cloneType, config.numArbiterOutputPorts)) + } + outer.nonCoalNarrowNodes.zipWithIndex.foreach{ + case (node, node_idx) => + val (tlOut, edgeOut) = node.out(0) + perLaneRespRRArbs.zipWithIndex.foreach{ + case(arb, arb_idx) => + val tlD2RespEntryFunc = tlD2respEntryFn() + val perLaneArbTunnel = Module(new ConverterTunnel( + tlOut.d.bits.cloneType, + arb.io.in(0).bits.cloneType, + tlD2RespEntryFunc + ) + ) + perLaneArbTunnel.io.in <> tlOut.d + arb.io.in(node_idx) <> perLaneArbTunnel.io.out + //rewrite valid base on if source id actually belongs to this lane + arb.io.in(node_idx).valid := belongsToLane(arb_idx.U, perLaneArbTunnel.io.out.bits.source) && + perLaneArbTunnel.io.out.valid + } + } + //Stage 3b, connect coalesced request to + val coalBundleRRArbiter = Module(new RRArbiter(respCoalBundleT.cloneType, config.numArbiterOutputPorts)) + outer.coalReqNodes.zipWithIndex.foreach{ + case(node, node_idx) => + val (tlOut, edgeOut) = node.out(0) + val tlD2CoalBundleFunc = tlD2CoalBundleFn() + val coalBundleArbTunnel = Module(new ConverterTunnel( + tlOut.d.bits.cloneType, + coalBundleRRArbiter.io.in(0).bits.cloneType, + tlD2CoalBundleFunc + ) + ) + coalBundleArbTunnel.io.in <> tlOut.d + coalBundleRRArbiter.io.in(node_idx) <> coalBundleArbTunnel.io.out + } + + + //Connect 4, Connect the arbiters to output + // connect the noncoalesced vector + (perLaneRespRRArbs zip io.nonCoalResps).foreach{ + case (arb, resp) => + resp <> arb.io.out + } + // connect the coalesced bundle + io.coalResp <> coalBundleRRArbiter.io.out + + + + + } From f6be54a12265ae893314410f6bd2691aba31a30e Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Mon, 8 May 2023 17:32:31 -0700 Subject: [PATCH 17/19] Set up proper Config system for numLanes TODO: tracefilename should not really be inside SIMTCoreParam. --- src/main/scala/tilelink/Coalescing.scala | 18 +++++--- src/main/scala/tilelink/TracerSystemMem.scala | 45 ++++++------------- 2 files changed, 25 insertions(+), 38 deletions(-) diff --git a/src/main/scala/tilelink/Coalescing.scala b/src/main/scala/tilelink/Coalescing.scala index 8ded755..2ce2e6d 100644 --- a/src/main/scala/tilelink/Coalescing.scala +++ b/src/main/scala/tilelink/Coalescing.scala @@ -5,12 +5,17 @@ package freechips.rocketchip.tilelink import chisel3._ import chisel3.util._ import chisel3.experimental.ChiselEnum -import org.chipsalliance.cde.config.Parameters +import org.chipsalliance.cde.config.{Parameters, Field} import freechips.rocketchip.diplomacy._ // import freechips.rocketchip.devices.tilelink.TLTestRAM import freechips.rocketchip.util.MultiPortQueue import freechips.rocketchip.unittest._ +// TODO: find better place for these +case class SIMTCoreParams(nLanes: Int = 4, tracefilename: String = "undefined") + +case object SIMTCoreKey extends Field[Option[SIMTCoreParams]](None) + trait InFlightTableSizeEnum extends ChiselEnum { val INVALID: Type val FOUR: Type @@ -1635,14 +1640,15 @@ class DummyCoalescerTest(timeout: Int = 500000)(implicit p: Parameters) // tracedriver --> coalescer --> tracelogger --> tlram class TLRAMCoalescerLogger(filename: String)(implicit p: Parameters) extends LazyModule { - // TODO: use parameters for numLanes - val numLanes = defaultConfig.numLanes + val numLanes = p(SIMTCoreKey).get.nLanes + println(s"============ numLanes: ${numLanes}") + val config = defaultConfig.copy(numLanes = numLanes) - val driver = LazyModule(new MemTraceDriver(defaultConfig, filename)) + val driver = LazyModule(new MemTraceDriver(config, filename)) val coreSideLogger = LazyModule( new MemTraceLogger(numLanes, filename, loggerName = "coreside") ) - val coal = LazyModule(new CoalescingUnit(defaultConfig)) + val coal = LazyModule(new CoalescingUnit(config)) val memSideLogger = LazyModule(new MemTraceLogger(numLanes + 1, filename, loggerName = "memside")) val rams = Seq.fill(numLanes + 1)( // +1 for coalesced edge LazyModule( @@ -1650,7 +1656,7 @@ class TLRAMCoalescerLogger(filename: String)(implicit p: Parameters) extends Laz // edges globally, by way of Diplomacy communicating the TL slave // parameters to the upstream nodes. new TLRAM(address = AddressSet(0x0000, 0xffffff), - beatBytes = (1 << defaultConfig.dataBusWidth)) + beatBytes = (1 << config.dataBusWidth)) ) ) diff --git a/src/main/scala/tilelink/TracerSystemMem.scala b/src/main/scala/tilelink/TracerSystemMem.scala index e0d495b..ca31864 100644 --- a/src/main/scala/tilelink/TracerSystemMem.scala +++ b/src/main/scala/tilelink/TracerSystemMem.scala @@ -1,44 +1,25 @@ - package freechips.rocketchip.tilelink -import chisel3._ -import chisel3.util._ import freechips.rocketchip.diplomacy._ -import freechips.rocketchip.subsystem.{BaseSubsystem, CacheBlockBytes} -import org.chipsalliance.cde.config.{Parameters, Field, Config} +import freechips.rocketchip.subsystem.{BaseSubsystem} +import org.chipsalliance.cde.config.{Parameters, Config} -// class class, consumed by WithGPUTacer config and GPUTracerKey - -case class GPUTracerConfig(numLanes: Int, traceFile : String) // FIXME, add lane number and file name - -case object GPUTracerKey extends Field[Option[GPUTracerConfig]](None) - - - -// Both LazyModule of Tracer and Impl are both in Coalescing.scala - - -//The trait is attached to DigitalTop of Chipyard system, informing it indeed has the ability -//to attach GPU tracer node onto the system bus +// The trait is attached to DigitalTop of Chipyard system, informing it indeed +// has the ability to attach GPU tracer node onto the system bus trait CanHaveGPUTracer { this: BaseSubsystem => implicit val p: Parameters - //p(GPUTracerKey) is the mechnimism to pass Config's parameter down to lazymodule - p(GPUTracerKey) .map { k => - val config = p(GPUTracerKey).get - val tracer = LazyModule(new MemTraceDriver(defaultConfig, config.traceFile)(p)) - // Must use :=* to ensure the N edges from Tracer doesn't get merged into 1 when connecting to SBus + p(SIMTCoreKey).map { _ => + val config = p(SIMTCoreKey).get + val tracer = LazyModule(new MemTraceDriver(defaultConfig, config.tracefilename)(p)) + // Must use :=* to ensure the N edges from Tracer doesn't get merged into 1 + // when connecting to SBus sbus.fromPort(Some("gpu-tracer"))() :=* tracer.node } } - //This is used by Chip Level Config, the config which creates the SoC -class WithGPUTracer(numLanes: Int, traceFile : String) extends Config((site, here, up) => { - case GPUTracerKey => Some( GPUTracerConfig(numLanes, traceFile) ) -} -) - - - - +class WithGPUTracer(numLanes: Int, tracefilename: String) + extends Config((_, _, _) => { case SIMTCoreKey => + Some(SIMTCoreParams(numLanes, tracefilename)) + }) From fd0d09691b1fd0865d646ac72dce33eb186bb3f8 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Mon, 8 May 2023 17:51:24 -0700 Subject: [PATCH 18/19] Create separate Configs for synthesizable dummy testbenches --- src/main/scala/tilelink/Coalescing.scala | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/src/main/scala/tilelink/Coalescing.scala b/src/main/scala/tilelink/Coalescing.scala index 2ce2e6d..5bef07d 100644 --- a/src/main/scala/tilelink/Coalescing.scala +++ b/src/main/scala/tilelink/Coalescing.scala @@ -14,7 +14,7 @@ import freechips.rocketchip.unittest._ // TODO: find better place for these case class SIMTCoreParams(nLanes: Int = 4, tracefilename: String = "undefined") -case object SIMTCoreKey extends Field[Option[SIMTCoreParams]](None) +case object SIMTCoreKey extends Field[Option[SIMTCoreParams]](None /*default*/) trait InFlightTableSizeEnum extends ChiselEnum { val INVALID: Type @@ -1609,18 +1609,22 @@ class DummyDriverImp(outer: DummyDriver, config: CoalescerConfig) // A dummy harness around the coalescer for use in VLSI flow. // Should not instantiate any memtrace modules. class DummyCoalescer(implicit p: Parameters) extends LazyModule { - val driver = LazyModule(new DummyDriver(defaultConfig)) - val rams = Seq.fill(defaultConfig.numLanes + 1)( // +1 for coalesced edge + val numLanes = p(SIMTCoreKey).get.nLanes + println(s"============ numLanes: ${numLanes}") + val config = defaultConfig.copy(numLanes = numLanes) + + val driver = LazyModule(new DummyDriver(config)) + val rams = Seq.fill(config.numLanes + 1)( // +1 for coalesced edge LazyModule( // NOTE: beatBytes here sets the data bitwidth of the upstream TileLink // edges globally, by way of Diplomacy communicating the TL slave // parameters to the upstream nodes. new TLRAM(address = AddressSet(0x0000, 0xffffff), - beatBytes = (1 << defaultConfig.dataBusWidth)) + beatBytes = (1 << config.dataBusWidth)) ) ) - val coal = LazyModule(new CoalescingUnit(defaultConfig)) + val coal = LazyModule(new CoalescingUnit(config)) coal.cpuNode :=* driver.node rams.foreach(_.node := coal.aggregateNode) @@ -1641,7 +1645,6 @@ class DummyCoalescerTest(timeout: Int = 500000)(implicit p: Parameters) // tracedriver --> coalescer --> tracelogger --> tlram class TLRAMCoalescerLogger(filename: String)(implicit p: Parameters) extends LazyModule { val numLanes = p(SIMTCoreKey).get.nLanes - println(s"============ numLanes: ${numLanes}") val config = defaultConfig.copy(numLanes = numLanes) val driver = LazyModule(new MemTraceDriver(config, filename)) From 9059d1e436ef8dc39936697fe2d175cf8045a2ae Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Mon, 8 May 2023 20:35:40 -0700 Subject: [PATCH 19/19] Misc doc --- src/main/scala/tilelink/Coalescing.scala | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/src/main/scala/tilelink/Coalescing.scala b/src/main/scala/tilelink/Coalescing.scala index 5bef07d..faaf569 100644 --- a/src/main/scala/tilelink/Coalescing.scala +++ b/src/main/scala/tilelink/Coalescing.scala @@ -242,9 +242,10 @@ class CoalShiftQueue[T <: Data](gen: T, entries: Int, config: CoalescerConfig) e c && !(io.invalidate.valid && inv) }.reduce(_ || _) val syncedEnqValid = io.queue.enq.map(_.valid).reduce(_ || _) - // syncedDeqValidNextCycle being true means the arbiter has completed - // processing all of the ready-to-go requests. - val syncedDeqValidNextCycle = io.queue.deq.map(x => x.valid && !x.ready).reduce(_ || _) // valid and not fire + // valid && !fire means we enable enqueueing to a full queue, provided the + // arbiter is taking away all remaining valid queue heads in the next cycle so + // that we make space for the entire next warp. + val syncedDeqValidNextCycle = io.queue.deq.map(x => x.valid && !x.ready).reduce(_ || _) for (i <- 0 until config.numLanes) { val enq = io.queue.enq(i) @@ -362,8 +363,8 @@ class MonoCoalescer(coalLogSize: Int, windowT: CoalShiftQueue[ReqQueueEntry], // compare leader's head against follower's every queue entry (followers zip followerValids.asBools).map { case (follower, followerValid) => canMatch(follower, followerValid, leader, leaderValid) - // disabling halving optimization because it does not give the correct - // per-lane coalescable indication to the shift queue + // FIXME: disabling halving optimization because it does not give the + // correct per-lane coalescable indication to the shift queue // // match leader to only followers at lanes >= leader idx // // this halves the number of comparators // if (followerIndex < leaderIndex) false.B @@ -393,7 +394,7 @@ class MonoCoalescer(coalLogSize: Int, windowT: CoalShiftQueue[ReqQueueEntry], } val chosenLeaderIdx = chooseLeaderPriorityEncoder(matchCounts) - val chosenLeader = VecInit(leaders)(chosenLeaderIdx) + val chosenLeader = VecInit(leaders)(chosenLeaderIdx) // mux // matchTable for the chosen lane, but converted to a Vec[UInt] val chosenMatches = VecInit(matchTablePerLane.map{ table => VecInit(table.map(VecInit(_).asUInt)) @@ -437,6 +438,11 @@ class MonoCoalescer(coalLogSize: Int, windowT: CoalShiftQueue[ReqQueueEntry], io.results.canCoalesce := canCoalesce } +// Combinational logic that generates a coalesced request given a request +// window, and a selection of possible coalesced sizes. May utilize multiple +// MonoCoalescers and apply size-choosing policy to determine the final +// coalesced request out of all possible combinations. +// // Software model: coalescer.py class MultiCoalescer(windowT: CoalShiftQueue[ReqQueueEntry], coalReqT: ReqQueueEntry, config: CoalescerConfig) extends Module { @@ -904,8 +910,8 @@ class Uncoalescer(config: CoalescerConfig) extends Module { // Un-coalesce responses back to individual lanes val found = inflightTable.io.lookup.bits (found.lanes zip io.uncoalResps).foreach { case (perLane, ioPerLane) => - perLane.reqs.zipWithIndex.foreach { case (oldReq, i) => - val ioOldReq = ioPerLane(i) + perLane.reqs.zipWithIndex.foreach { case (oldReq, depth) => + val ioOldReq = ioPerLane(depth) // TODO: spatial-only coalescing: only looking at 0th srcId entry ioOldReq.valid := false.B