From 5e073f2dec3421b45b2fe38baab94f198bdfaca5 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Sun, 7 May 2023 12:37:33 -0700 Subject: [PATCH 01/14] Doc update --- src/main/scala/tilelink/Coalescing.scala | 78 ++++++++++++------------ 1 file changed, 38 insertions(+), 40 deletions(-) diff --git a/src/main/scala/tilelink/Coalescing.scala b/src/main/scala/tilelink/Coalescing.scala index b1cd865..05e6ab3 100644 --- a/src/main/scala/tilelink/Coalescing.scala +++ b/src/main/scala/tilelink/Coalescing.scala @@ -293,6 +293,9 @@ class CoalShiftQueue[T <: Data](gen: T, entries: Int, config: CoalescerConfig) e } } + // When doing spatial-only coalescing, queues should never drift from each + // other, i.e. the queue heads should always contain mem requests from the + // same instruction. val queueInSync = controlSignals.map(_ === controlSignals.head).reduce(_ && _) && writePtr.map(_ === writePtr.head).reduce(_ && _) assert(queueInSync, "shift queue lanes are not in sync") @@ -326,23 +329,15 @@ class MonoCoalescer(coalLogSize: Int, windowT: CoalShiftQueue[ReqQueueEntry], val leaders = io.window.elts.map(_.head) val leadersValid = io.window.mask.map(_.asBools.head) - // When doing spatial-only coalescing, queues should never drift from each - // other, i.e. the queue heads should always contain mem requests from the - // same instruction. - // FIXME: This relies on the MemTraceDriver's behavior of generating TL - // requests with full source info even when the corresponding lane is not - // active. - def testNoQueueDrift: Bool = leaders.map(_.source === leaders.head.source).reduce(_ || _) def printQueueHeads = { leaders.zipWithIndex.foreach{ case (head, i) => printf(s"ReqQueueEntry[${i}].head = v:%d, source:%d, addr:%x\n", leadersValid(i), head.source, head.address) } } - when (leadersValid.reduce(_ || _)) { - assert(testNoQueueDrift, "unexpected drift between lane request queues") - // printQueueHeads - } + // when (leadersValid.reduce(_ || _)) { + // printQueueHeads + // } val size = coalLogSize val addrMask = (((1 << config.addressWidth) - 1) - ((1 << size) - 1)).U @@ -578,11 +573,14 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends reqQueues.io.coalescable := coalescer.io.coalescable reqQueues.io.invalidate := coalescer.io.invalidate - // Per-lane request and response queues + // =========================================================================== + // Request flow + // =========================================================================== // // Override IdentityNode implementation so that we can instantiate // queues between input and output edges to buffer requests and responses. // See IdentityNode definition in `diplomacy/Nodes.scala`. + // (outer.cpuNode.in zip outer.cpuNode.out).zipWithIndex.foreach { case (((tlIn, _), (tlOut, edgeOut)), lane) => // Request queue @@ -641,11 +639,12 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends tlCoal.e.valid := false.B - // ================================================================== - // ****************************************************************** - // ************************* REORG BOUNDARY ************************* - // ****************************************************************** - // ================================================================== + // =========================================================================== + // Response flow + // =========================================================================== + // + // Connect uncoalescer output and noncoalesced response ports to the response + // queues. // The maximum number of requests from a single lane that can go into a // coalesced request. Upper bound is min(DEPTH, 2**sourceWidth). @@ -1358,27 +1357,6 @@ class MemTraceLogger( // originally requested, so no postprocessing required req.address := tlIn.a.bits.address - // TL data - // - // When tlIn.a.bits.size is smaller than the data bus width, need to - // figure out which byte lanes we actually accessed so that - // we can write that to the memory trace. - // See Section 4.5 Byte Lanes in spec 1.8.1 - - // This assert only holds true for PutFullData and not PutPartialData, - // where HIGH bits in the mask may not be contiguous. - assert( - PopCount(tlIn.a.bits.mask) === (1.U << tlIn.a.bits.size), - "mask HIGH bits do not match the TL size. This should have been handled by the TL generator logic" - ) - val trailingZerosInMask = trailingZeros(tlIn.a.bits.mask) - val dataW = tlIn.params.dataBits - val mask = ~(~(0.U(dataW.W)) << ((1.U << tlIn.a.bits.size) * 8.U)) - req.data := mask & (tlIn.a.bits.data >> (trailingZerosInMask * 8.U)) - // when (req.valid) { - // printf("trailingZerosInMask=%d, mask=%x, data=%x\n", trailingZerosInMask, mask, req.data) - // } - when(req.valid) { TLPrintf( s"MemTraceLogger (${loggerName}:downstream)", @@ -1391,6 +1369,28 @@ class MemTraceLogger( ) } + // TL data + // + // When tlIn.a.bits.size is smaller than the data bus width, need to + // figure out which byte lanes we actually accessed so that + // we can write that to the memory trace. + // See Section 4.5 Byte Lanes in spec 1.8.1 + + // This assert only holds true for PutFullData and not PutPartialData, + // where HIGH bits in the mask may not be contiguous. + assert( + PopCount(tlIn.a.bits.mask) === (1.U << tlIn.a.bits.size), + "mask HIGH popcount do not match the TL size. " + + "Partial masks are not allowed for PutFull" + ) + val trailingZerosInMask = trailingZeros(tlIn.a.bits.mask) + val dataW = tlIn.params.dataBits + val mask = ~(~(0.U(dataW.W)) << ((1.U << tlIn.a.bits.size) * 8.U)) + req.data := mask & (tlIn.a.bits.data >> (trailingZerosInMask * 8.U)) + // when (req.valid) { + // printf("trailingZerosInMask=%d, mask=%x, data=%x\n", trailingZerosInMask, mask, req.data) + // } + // responses on TL D channel // resp.valid := tlOut.d.valid @@ -1813,6 +1813,4 @@ class CoalArbiterImpl(outer: CoalArbiter, val coalResp = Decoupled(respCoalBundleT) } ) - - } From a6dbfc39010cd5449467ff9f901aebf668bad72d Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Sun, 7 May 2023 16:12:59 -0700 Subject: [PATCH 02/14] Fix config for unittest --- src/test/scala/coalescing/CoalescingUnitTest.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/test/scala/coalescing/CoalescingUnitTest.scala b/src/test/scala/coalescing/CoalescingUnitTest.scala index 2d31711..1f8c849 100644 --- a/src/test/scala/coalescing/CoalescingUnitTest.scala +++ b/src/test/scala/coalescing/CoalescingUnitTest.scala @@ -190,8 +190,8 @@ object testConfig extends CoalescerConfig( respQueueDepth = 4, coalLogSizes = Seq(4, 5), sizeEnum = DefaultInFlightTableSizeEnum, - numArbiterOutputPorts = 4, numCoalReqs = 1, + numArbiterOutputPorts = 4, bankStrideInBytes = 64 ) @@ -292,7 +292,7 @@ class CoalescerUnitTest extends AnyFlatSpec with ChiselScalatestTester { it should "coalesce identical addresses (stride of 0)" in { test(LazyModule(new DummyCoalescingUnitTB()).module) - .withAnnotations(Seq(VcsBackendAnnotation)) + .withAnnotations(Seq(VerilatorBackendAnnotation)) { c => println(s"coalIO length = ${c.coalIOs(0).length}") val nodes = c.coalIOs.map(_.head) From c75eaaf7272ac6ef0f0f821b3e1f6d356f6e2490 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Sun, 7 May 2023 16:13:28 -0700 Subject: [PATCH 03/14] Backport SimMemTrace --- src/main/resources/csrc/SimMemTrace.cc | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/main/resources/csrc/SimMemTrace.cc b/src/main/resources/csrc/SimMemTrace.cc index 08540bc..aed11a1 100644 --- a/src/main/resources/csrc/SimMemTrace.cc +++ b/src/main/resources/csrc/SimMemTrace.cc @@ -23,6 +23,7 @@ MemTraceReader::MemTraceReader(const std::string &filename) infile.open(filename); if (infile.fail()) { fprintf(stderr, "failed to open file %s\n", filename.c_str()); + exit(EXIT_FAILURE); } } @@ -60,8 +61,6 @@ void MemTraceReader::parse(const bool has_source) { } if (!(infile >> line.cycle >> loadstore >> line.core_id >> line.lane_id)) { - printf("char=[%c]\n", infile.peek()); - // assert(!infile.eof()); error(fileline, "failed parsing cycle..lane_id"); } if (has_source && !(infile >> source)) { @@ -101,8 +100,6 @@ MemTraceLine MemTraceReader::read_trace_at(const long cycle, const int lane_id, MemTraceLine line; line.valid = false; - // printf("tick(): cycle=%ld\n", cycle); - if (finished()) { return line; } From 15889d766770f36b12dfb5b725634ba6c3f3d1bc Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Sun, 7 May 2023 19:09:25 -0700 Subject: [PATCH 04/14] Take filename from Configs for easier trace testing --- src/main/scala/tilelink/Coalescing.scala | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/main/scala/tilelink/Coalescing.scala b/src/main/scala/tilelink/Coalescing.scala index 05e6ab3..3d4639d 100644 --- a/src/main/scala/tilelink/Coalescing.scala +++ b/src/main/scala/tilelink/Coalescing.scala @@ -233,11 +233,13 @@ class CoalShiftQueue[T <: Data](gen: T, entries: Int, config: CoalescerConfig) e })) // shift hint is when the heads have no more coalescable left this or next cycle - val shiftHint = !(io.coalescable zip io.invalidate.bits.map(_(0))).map { case (c, i) => - c && !(io.invalidate.valid && i) + val shiftHint = !(io.coalescable zip io.invalidate.bits.map(_(0))).map { case (c, inv) => + c && !(io.invalidate.valid && inv) }.reduce(_ || _) val syncedEnqValid = io.queue.enq.map(_.valid).reduce(_ || _) - val syncedDeqValid = io.queue.deq.map(x => x.valid && !x.ready).reduce(_ || _) // valid and not fire + // syncedDeqValidNextCycle being true means the arbiter has completed + // processing all of the ready-to-go requests. + val syncedDeqValidNextCycle = io.queue.deq.map(x => x.valid && !x.ready).reduce(_ || _) // valid and not fire for (i <- 0 until config.numLanes) { val enq = io.queue.enq(i) @@ -247,7 +249,7 @@ class CoalShiftQueue[T <: Data](gen: T, entries: Int, config: CoalescerConfig) e ctrl.full := writePtr(i) === entries.U ctrl.empty := writePtr(i) === 0.U // shift when no outstanding dequeue, no more coalescable chunks, and not empty - ctrl.shift := !syncedDeqValid && shiftHint && !ctrl.empty + ctrl.shift := !syncedDeqValidNextCycle && shiftHint && !ctrl.empty // dequeue is valid when: // head entry is valid, has not been processed by downstream, and is not coalescable @@ -1629,10 +1631,7 @@ class DummyCoalescerTest(timeout: Int = 500000)(implicit p: Parameters) } // tracedriver --> coalescer --> tracelogger --> tlram -class TLRAMCoalescerLogger(implicit p: Parameters) extends LazyModule { - // val filename = "test.trace" - val filename = "vecadd.core1.thread4.trace" - // val filename = "nvbit.vecadd.n100000.filter_sm0.trace" +class TLRAMCoalescerLogger(filename: String)(implicit p: Parameters) extends LazyModule { // TODO: use parameters for numLanes val numLanes = defaultConfig.numLanes @@ -1674,13 +1673,14 @@ class TLRAMCoalescerLogger(implicit p: Parameters) extends LazyModule { (coreSideLogger.module.io.reqBytes === coreSideLogger.module.io.respBytes), "FAIL: requests and responses traffic to the coalescer do not match" ) + printf("SUCCESS: coalescer response traffic matched requests!\n") } } } -class TLRAMCoalescerLoggerTest(timeout: Int = 500000)(implicit p: Parameters) +class TLRAMCoalescerLoggerTest(filename: String, timeout: Int = 500000)(implicit p: Parameters) extends UnitTest(timeout) { - val dut = Module(LazyModule(new TLRAMCoalescerLogger).module) + val dut = Module(LazyModule(new TLRAMCoalescerLogger(filename)).module) dut.io.start := io.start io.finished := dut.io.finished } From 737a760fcd46a4e7a25ea3b7f25682ebff8a9a5e Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Sun, 7 May 2023 22:58:01 -0700 Subject: [PATCH 05/14] Enable coverage tests for chiseltest --- src/test/scala/coalescing/CoalescingUnitTest.scala | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/test/scala/coalescing/CoalescingUnitTest.scala b/src/test/scala/coalescing/CoalescingUnitTest.scala index 1f8c849..08db521 100644 --- a/src/test/scala/coalescing/CoalescingUnitTest.scala +++ b/src/test/scala/coalescing/CoalescingUnitTest.scala @@ -2,6 +2,7 @@ package freechips.rocketchip.tilelink.coalescing import chisel3._ import chiseltest._ +import chiseltest.simulator.VerilatorFlags import org.scalatest.flatspec.AnyFlatSpec import freechips.rocketchip.tilelink._ import freechips.rocketchip.util.MultiPortQueue @@ -230,7 +231,7 @@ class CoalescerUnitTest extends AnyFlatSpec with ChiselScalatestTester { it should "coalesce fully consecutive accesses at size 4, only once" in { test(LazyModule(new DummyCoalescingUnitTB()).module) - .withAnnotations(Seq(VerilatorBackendAnnotation, WriteFstAnnotation)) + .withAnnotations(Seq(VerilatorBackendAnnotation, VerilatorFlags(Seq("--coverage-line")), WriteFstAnnotation)) // .withAnnotations(Seq(VcsBackendAnnotation, WriteFsdbAnnotation)) { c => val nodes = c.coalIOs.map(_.head) From ba600db7e49bed2f987418c3cf570ed5c2f9b325 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Sun, 7 May 2023 23:54:49 -0700 Subject: [PATCH 06/14] Backport SimMemTrace fix --- src/main/resources/csrc/SimMemTrace.cc | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/src/main/resources/csrc/SimMemTrace.cc b/src/main/resources/csrc/SimMemTrace.cc index aed11a1..6c08858 100644 --- a/src/main/resources/csrc/SimMemTrace.cc +++ b/src/main/resources/csrc/SimMemTrace.cc @@ -22,7 +22,8 @@ MemTraceReader::MemTraceReader(const std::string &filename) infile.open(filename); if (infile.fail()) { - fprintf(stderr, "failed to open file %s\n", filename.c_str()); + fprintf(stderr, "MemTraceReader: error: failed to open file %s\n", + filename.c_str()); exit(EXIT_FAILURE); } } @@ -109,7 +110,11 @@ MemTraceLine MemTraceReader::read_trace_at(const long cycle, const int lane_id, // the next line is in the future. if (line.cycle < cycle) { long fileline = read_pos - std::cbegin(trace_buf) + 1; - error(fileline, "some trace lines are left unread in the past"); + error(fileline, "some trace lines are left unread in the past. " + "Tried cycle=" + + std::to_string(cycle) + + ", found line.cycle=" + std::to_string(line.cycle) + + ". Is NUM_LANES set correctly?"); return MemTraceLine{}; } @@ -131,14 +136,17 @@ MemTraceLine MemTraceReader::read_trace_at(const long cycle, const int lane_id, // monotonically increment read_pos. lane_id need not be contiguous, e.g. // 0->1->3 is fine. ++read_pos; - return line; } else { // For debugging purposes, instead of early-returning on // !trace_read_ready, print something to notify we are blocking a valid // trace line. printf("All Lanes Blocked on this cycle! cycle=%ld \n", cycle); - return MemTraceLine{}; } + // We want to return valid line regardless of `trace_read_ready` or not, + // because we want to let the driver know that it missed a valid line at the + // given cycle, so that it holds its cycle counter and safely reads back the + // line in the future. + return line; } assert(!"unreachable"); From f7df5045d4e31b5b7f3e6ea4bce4d8441bcc5c4a Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Sun, 7 May 2023 23:55:54 -0700 Subject: [PATCH 07/14] Respect downstream TL A ready in MemTraceDriver --- src/main/scala/tilelink/Coalescing.scala | 61 +++++++++--------------- 1 file changed, 23 insertions(+), 38 deletions(-) diff --git a/src/main/scala/tilelink/Coalescing.scala b/src/main/scala/tilelink/Coalescing.scala index 3d4639d..4895cc0 100644 --- a/src/main/scala/tilelink/Coalescing.scala +++ b/src/main/scala/tilelink/Coalescing.scala @@ -1078,24 +1078,18 @@ class TraceLine extends Bundle with HasTraceLine { class MemTraceDriverImp(outer: MemTraceDriver, config: CoalescerConfig, traceFile: String) extends LazyModuleImp(outer) with UnitTestModule { + // Current cycle mark to read from trace + val traceReadCycle = RegInit(1.U(64.W)) - val globalClkCounter = RegInit(1.U(64.W)) - val traceReadCycle = RegInit(1.U(64.W)) - val downstreamSQready = WireInit(true.B) + // If any of the downstream lane is not ready, hold on from advancing + val downstreamReady = outer.laneNodes.map(_.out(0)._1.a.ready).reduce(_ && _) - //make the downstream only ready 1/4 of the time - //This is to test Tracer System's ability to hold on requests - //FIXME - downstreamSQready := (globalClkCounter(1,0) =/= 0.U) - //Connect Signals to Verilog BlackBox val sim = Module(new SimMemTrace(traceFile, config.numLanes)) sim.io.clock := clock sim.io.reset := reset.asBool - sim.io.trace_read.ready := downstreamSQready - //FIXME - 1.U hardcoded, currently there is a delay between chisel and verilog + sim.io.trace_read.ready := downstreamReady sim.io.trace_read.cycle := traceReadCycle - // Read output from Verilog BlackBox // Split output of SimMemTrace, which is flattened across all lanes,back to each lane's. val laneReqs = Wire(Vec(config.numLanes, new TraceLine)) @@ -1104,26 +1098,30 @@ class MemTraceDriverImp(outer: MemTraceDriver, config: CoalescerConfig, traceFil val dataW = laneReqs(0).data.getWidth laneReqs.zipWithIndex.foreach { case (req, i) => req.valid := sim.io.trace_read.valid(i) - // TODO: driver trace doesn't contain source id - req.source := 0.U + req.source := 0.U // driver trace doesn't contain source id req.address := sim.io.trace_read.address(addrW * (i + 1) - 1, addrW * i) req.is_store := sim.io.trace_read.is_store(i) req.size := sim.io.trace_read.size(sizeW * (i + 1) - 1, sizeW * i) req.data := sim.io.trace_read.data(dataW * (i + 1) - 1, dataW * i) } - globalClkCounter := globalClkCounter + 1.U - val existValidReq = WireInit(false.B) - existValidReq := laneReqs.map(_.valid).reduce(_||_) - val validReqBlocked = WireInit(false.B) - validReqBlocked := !downstreamSQready && existValidReq - //Debug - dontTouch(downstreamSQready) - dontTouch(existValidReq) - dontTouch(validReqBlocked) - // Do Not Update TraceReadCycle if downstream is blocking - when(!validReqBlocked){ - traceReadCycle := traceReadCycle + 1.U + def missedLine = { + val existsValidLine = WireInit(false.B) + existsValidLine := laneReqs.map(_.valid).reduce(_||_) + val missedLine = WireInit(false.B) + missedLine := !downstreamReady && existsValidLine + + // Debug + dontTouch(downstreamReady) + dontTouch(existsValidLine) + dontTouch(missedLine) + + missedLine + } + // Do not increment trace read cycle if we didn't fire a valid line because + // downstream was blocking. This prevents missing any line in the trace. + when (!missedLine){ + traceReadCycle := traceReadCycle + 1.U } // To prevent collision of sourceId with a current in-flight message, @@ -1158,19 +1156,6 @@ class MemTraceDriverImp(outer: MemTraceDriver, config: CoalescerConfig, traceFil val wordAlignedAddress = req.address & ~((1 << log2Ceil(config.wordSizeInBytes)) - 1).U(addrW.W) val wordAlignedSize = Mux(subword, 2.U, req.size) - // when(req.valid && subword) { - // printf( - // "address=%x, size=%d, data=%x, addressMask=%x, wordAlignedAddress=%x, mask=%x, wordData=%x\n", - // req.address, - // req.size, - // req.data, - // ~((1 << log2Ceil(config.WORD_SIZE)) - 1).U(addrW.W), - // wordAlignedAddress, - // mask, - // wordData - // ) - // } - val (tlOut, edge) = node.out(0) val (plegal, pbits) = edge.Put( fromSource = sourceIdCounter, From 6b97b77572918259b7f8eea311e8998a0e13270d Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Mon, 8 May 2023 00:14:48 -0700 Subject: [PATCH 08/14] Revert SimMemTrace.v to use posedge clock Doing function calls inside @(*) causes lint errors. Instead, remove staging registers to eliminate 1 cycle latency between DPI call and when output is visible to Chisel. --- src/main/resources/vsrc/SimMemTrace.v | 74 +++++---------------------- 1 file changed, 13 insertions(+), 61 deletions(-) diff --git a/src/main/resources/vsrc/SimMemTrace.v b/src/main/resources/vsrc/SimMemTrace.v index 4d630fd..fea2bed 100644 --- a/src/main/resources/vsrc/SimMemTrace.v +++ b/src/main/resources/vsrc/SimMemTrace.v @@ -39,49 +39,32 @@ module SimMemTrace #(parameter FILENAME = "undefined", NUM_LANES = 4) ( output [`DATA_WIDTH*NUM_LANES-1:0] trace_read_data, output trace_read_finished ); - bit __in_valid [NUM_LANES-1:0]; - longint __in_address [NUM_LANES-1:0]; - - bit __in_is_store [NUM_LANES-1:0]; + bit __in_valid [NUM_LANES-1:0]; + longint __in_address [NUM_LANES-1:0]; + bit __in_is_store [NUM_LANES-1:0]; reg [`LOGSIZE_WIDTH-1:0] __in_size [NUM_LANES-1:0]; - longint __in_data [NUM_LANES-1:0]; - - bit __in_finished; - string __uartlog; - - // Cycle counter that is used to query C parser whether we have a request - // coming in at the current cycle. - - - // registers that stage outputs of the C parser - reg [NUM_LANES-1:0] __in_valid_wire; - reg [`DATA_WIDTH-1:0] __in_address_wire [NUM_LANES-1:0]; - - reg [NUM_LANES-1:0] __in_is_store_wire; - reg [`LOGSIZE_WIDTH-1:0] __in_size_wire [NUM_LANES-1:0]; - reg [`DATA_WIDTH-1:0] __in_data_wire [NUM_LANES-1:0]; - reg __in_finished_wire; + longint __in_data [NUM_LANES-1:0]; + bit __in_finished; genvar g; - generate for (g = 0; g < NUM_LANES; g = g + 1) begin - assign trace_read_valid[g] = __in_valid_wire[g]; - assign trace_read_address[`DATA_WIDTH*(g+1)-1:`DATA_WIDTH*g] = __in_address_wire[g]; + assign trace_read_valid[g] = __in_valid[g]; + assign trace_read_address[`DATA_WIDTH*(g+1)-1:`DATA_WIDTH*g] = __in_address[g]; - assign trace_read_is_store[g] = __in_is_store_wire[g]; - assign trace_read_size[`LOGSIZE_WIDTH*(g+1)-1:`LOGSIZE_WIDTH*g] = __in_size_wire[g]; - assign trace_read_data[`DATA_WIDTH*(g+1)-1:`DATA_WIDTH*g] = __in_data_wire[g]; + assign trace_read_is_store[g] = __in_is_store[g]; + assign trace_read_size[`LOGSIZE_WIDTH*(g+1)-1:`LOGSIZE_WIDTH*g] = __in_size[g]; + assign trace_read_data[`DATA_WIDTH*(g+1)-1:`DATA_WIDTH*g] = __in_data[g]; end endgenerate - assign trace_read_finished = __in_finished_wire; + assign trace_read_finished = __in_finished; initial begin /* $value$plusargs("uartlog=%s", __uartlog); */ memtrace_init(FILENAME); end - always @(*) begin + always @(posedge clock) begin if (reset) begin for (integer tid = 0; tid < NUM_LANES; tid = tid + 1) begin __in_valid[tid] = 1'b0; @@ -91,32 +74,12 @@ module SimMemTrace #(parameter FILENAME = "undefined", NUM_LANES = 4) ( __in_size[tid] = `LOGSIZE_WIDTH'b0; __in_data[tid] = `DATA_WIDTH'b0; end - __in_finished = 1'b0; - - //cycle_counter <= `DATA_WIDTH'b0; - - // setting default value for register to avoid latches - for (integer tid = 0; tid < NUM_LANES; tid = tid + 1) begin - __in_valid_wire[tid] = 1'b0; - __in_address_wire[tid] = `DATA_WIDTH'b0; - - __in_is_store_wire[tid] = 1'b0; - __in_size_wire[tid] = `LOGSIZE_WIDTH'b0; - __in_data_wire[tid] = `DATA_WIDTH'b0; - end - - __in_finished_wire = 1'b0; end else begin - - // Getting values from C function into pseudeo register for (integer tid = 0; tid < NUM_LANES; tid = tid + 1) begin memtrace_query( trace_read_ready, - // Since parsed results are latched to the output on the next - // cycle due to staging registers, we need to pass in the next cycle - // to sync up. - trace_read_cycle, // the left replace next_cycle_counter, + trace_read_cycle, tid, __in_valid[tid], @@ -129,17 +92,6 @@ module SimMemTrace #(parameter FILENAME = "undefined", NUM_LANES = 4) ( __in_finished ); end - - // Connect values from pseudo register into verilog register - for (integer tid = 0; tid < NUM_LANES; tid = tid + 1) begin - __in_valid_wire[tid] = __in_valid[tid]; - __in_address_wire[tid] = __in_address[tid]; - - __in_is_store_wire[tid] = __in_is_store[tid]; - __in_size_wire[tid] = __in_size[tid]; - __in_data_wire[tid] = __in_data[tid]; - end - __in_finished_wire = __in_finished; end end endmodule From 6755cb3eeca31197f0b3ceaea6754d6393038388 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Mon, 8 May 2023 00:49:30 -0700 Subject: [PATCH 09/14] Lax traceReadCycle advancing logic Trying to advance trace cycle while downstream is blocking is tricky because DPI call is synchronous, and that gives timing difference between the line we have fired to downstream and the current cycle counter we maintain. Just stall the counter whenever downstream is not ready for now. --- src/main/scala/tilelink/Coalescing.scala | 26 +++++++++++------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/src/main/scala/tilelink/Coalescing.scala b/src/main/scala/tilelink/Coalescing.scala index 4895cc0..62c0473 100644 --- a/src/main/scala/tilelink/Coalescing.scala +++ b/src/main/scala/tilelink/Coalescing.scala @@ -1105,22 +1105,20 @@ class MemTraceDriverImp(outer: MemTraceDriver, config: CoalescerConfig, traceFil req.data := sim.io.trace_read.data(dataW * (i + 1) - 1, dataW * i) } - def missedLine = { - val existsValidLine = WireInit(false.B) - existsValidLine := laneReqs.map(_.valid).reduce(_||_) - val missedLine = WireInit(false.B) - missedLine := !downstreamReady && existsValidLine + // def missedLine = { + // val existsValidLine = WireInit(false.B) + // existsValidLine := laneReqs.map(_.valid).reduce(_||_) + // val missedLine = WireInit(false.B) + // missedLine := !downstreamReady && existsValidLine - // Debug - dontTouch(downstreamReady) - dontTouch(existsValidLine) - dontTouch(missedLine) + // // Debug + // dontTouch(downstreamReady) + // dontTouch(existsValidLine) + // dontTouch(missedLine) - missedLine - } - // Do not increment trace read cycle if we didn't fire a valid line because - // downstream was blocking. This prevents missing any line in the trace. - when (!missedLine){ + // missedLine + // } + when (downstreamReady){ traceReadCycle := traceReadCycle + 1.U } From 8cdbc81bddcd4871ee61ca754072e65143f2d2bb Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Mon, 8 May 2023 14:26:05 -0700 Subject: [PATCH 10/14] Only write in MemTraceLogger when TL fire Without this we log extraneous lines that were valid but not transacted with the downstream as it was not ready, which affects validity of memtrace testing. --- src/main/scala/tilelink/Coalescing.scala | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/src/main/scala/tilelink/Coalescing.scala b/src/main/scala/tilelink/Coalescing.scala index 62c0473..0addef1 100644 --- a/src/main/scala/tilelink/Coalescing.scala +++ b/src/main/scala/tilelink/Coalescing.scala @@ -1334,7 +1334,9 @@ class MemTraceLogger( // requests on TL A channel // - req.valid := tlIn.a.valid + // Only log trace when fired, e.g. both upstream and downstream is ready + // and transaction happened. + req.valid := tlIn.a.fire req.size := tlIn.a.bits.size req.is_store := TLUtils.AOpcodeIsStore(tlIn.a.bits.opcode) req.source := tlIn.a.bits.source @@ -1378,7 +1380,9 @@ class MemTraceLogger( // responses on TL D channel // - resp.valid := tlOut.d.valid + // Only log trace when fired, e.g. both upstream and downstream is ready + // and transaction happened. + resp.valid := tlOut.d.fire resp.size := tlOut.d.bits.size resp.is_store := TLUtils.DOpcodeIsStore(tlOut.d.bits.opcode) resp.source := tlOut.d.bits.source @@ -1412,7 +1416,7 @@ class MemTraceLogger( // // This is a clunky workaround of the fact that Chisel doesn't allow partial // assignment to a bitfield range of a wide signal. - def flattenTrace(traceLogIO: Bundle with HasTraceLine, perLane: Vec[TraceLine]) = { + def flattenTrace(simIO: Bundle with HasTraceLine, perLane: Vec[TraceLine]) = { // these will get optimized out val vecValid = Wire(Vec(numLanes, chiselTypeOf(perLane(0).valid))) val vecSource = Wire(Vec(numLanes, chiselTypeOf(perLane(0).source))) @@ -1428,12 +1432,12 @@ class MemTraceLogger( vecSize(i) := l.size vecData(i) := l.data } - traceLogIO.valid := vecValid.asUInt - traceLogIO.source := vecSource.asUInt - traceLogIO.address := vecAddress.asUInt - traceLogIO.is_store := vecIsStore.asUInt - traceLogIO.size := vecSize.asUInt - traceLogIO.data := vecData.asUInt + simIO.valid := vecValid.asUInt + simIO.source := vecSource.asUInt + simIO.address := vecAddress.asUInt + simIO.is_store := vecIsStore.asUInt + simIO.size := vecSize.asUInt + simIO.data := vecData.asUInt } if (simReq.isDefined) { From 2e219ea15a6627a8532ce1daa84a715e28776d63 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Mon, 8 May 2023 14:27:54 -0700 Subject: [PATCH 11/14] Connect CoalShiftQueue enq.ready to upstream TL.ready Now CoalShiftQueue can properly stall memtrace driver. --- src/main/scala/tilelink/Coalescing.scala | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/main/scala/tilelink/Coalescing.scala b/src/main/scala/tilelink/Coalescing.scala index 0addef1..4b4a0f6 100644 --- a/src/main/scala/tilelink/Coalescing.scala +++ b/src/main/scala/tilelink/Coalescing.scala @@ -604,7 +604,10 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends val deq = reqQueues.io.queue.deq(lane) enq.valid := tlIn.a.valid enq.bits := req - deq.ready := true.B // TODO: deq.ready should respect downstream arbiter + // TODO: deq.ready should respect downstream arbiter + deq.ready := true.B + // Stall upstream core or memtrace driver when shiftqueue is not ready + tlIn.a.ready := enq.ready tlOut.a.valid := deq.valid tlOut.a.bits := deq.bits.toTLA(edgeOut) From 54a3e3cf72de3f20bb7b24aa307af8a04b7aa9d9 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Mon, 8 May 2023 14:34:52 -0700 Subject: [PATCH 12/14] Initiate memtrace DPI only when trace_read_ready This is required because otherwise we might overwrite into the Verilog registers that contain a valid trace line that was missed by downstream when it was not ready. Basically whenever trace_read_cycle stalls, we also want to stall __in_* registers. --- src/main/resources/vsrc/SimMemTrace.v | 31 ++++++++++++++++----------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/src/main/resources/vsrc/SimMemTrace.v b/src/main/resources/vsrc/SimMemTrace.v index fea2bed..74594cb 100644 --- a/src/main/resources/vsrc/SimMemTrace.v +++ b/src/main/resources/vsrc/SimMemTrace.v @@ -76,21 +76,26 @@ module SimMemTrace #(parameter FILENAME = "undefined", NUM_LANES = 4) ( end __in_finished = 1'b0; end else begin - for (integer tid = 0; tid < NUM_LANES; tid = tid + 1) begin - memtrace_query( - trace_read_ready, - trace_read_cycle, - tid, + // We have to write to __in_ regs only when trace_read_ready, or + // otherwise we might overwrite lines that were previously valid + // but the downstream missed by being not ready. + if (trace_read_ready) begin + for (integer tid = 0; tid < NUM_LANES; tid = tid + 1) begin + memtrace_query( + trace_read_ready, + trace_read_cycle, + tid, - __in_valid[tid], - __in_address[tid], - - __in_is_store[tid], - __in_size[tid], - __in_data[tid], + __in_valid[tid], + __in_address[tid], - __in_finished - ); + __in_is_store[tid], + __in_size[tid], + __in_data[tid], + + __in_finished + ); + end end end end From 2d4e28e862ca270b3bcc4f37d8a8788cbc20fe31 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Mon, 8 May 2023 14:38:15 -0700 Subject: [PATCH 13/14] Use WithoutTLMonitors to slightly speed up chiseltests --- src/test/scala/coalescing/CoalescingUnitTest.scala | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/test/scala/coalescing/CoalescingUnitTest.scala b/src/test/scala/coalescing/CoalescingUnitTest.scala index 08db521..5a48733 100644 --- a/src/test/scala/coalescing/CoalescingUnitTest.scala +++ b/src/test/scala/coalescing/CoalescingUnitTest.scala @@ -7,6 +7,7 @@ import org.scalatest.flatspec.AnyFlatSpec import freechips.rocketchip.tilelink._ import freechips.rocketchip.util.MultiPortQueue import freechips.rocketchip.diplomacy._ +import freechips.rocketchip.subsystem.WithoutTLMonitors import org.chipsalliance.cde.config.Parameters import chisel3.util.{DecoupledIO, Valid} import chisel3.util.experimental.BoringUtils @@ -230,7 +231,7 @@ class CoalescerUnitTest extends AnyFlatSpec with ChiselScalatestTester { } it should "coalesce fully consecutive accesses at size 4, only once" in { - test(LazyModule(new DummyCoalescingUnitTB()).module) + test(LazyModule(new DummyCoalescingUnitTB()(new WithoutTLMonitors())).module) .withAnnotations(Seq(VerilatorBackendAnnotation, VerilatorFlags(Seq("--coverage-line")), WriteFstAnnotation)) // .withAnnotations(Seq(VcsBackendAnnotation, WriteFsdbAnnotation)) { c => @@ -292,7 +293,7 @@ class CoalescerUnitTest extends AnyFlatSpec with ChiselScalatestTester { } it should "coalesce identical addresses (stride of 0)" in { - test(LazyModule(new DummyCoalescingUnitTB()).module) + test(LazyModule(new DummyCoalescingUnitTB()(new WithoutTLMonitors())).module) .withAnnotations(Seq(VerilatorBackendAnnotation)) { c => println(s"coalIO length = ${c.coalIOs(0).length}") From 3fae0b2c7a4df7c8dde97878a2ad0b2adc58d525 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Mon, 8 May 2023 15:18:45 -0700 Subject: [PATCH 14/14] Use priority encoder for chooseLeaderIdx --- src/main/scala/tilelink/Coalescing.scala | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/src/main/scala/tilelink/Coalescing.scala b/src/main/scala/tilelink/Coalescing.scala index 4b4a0f6..ae4da82 100644 --- a/src/main/scala/tilelink/Coalescing.scala +++ b/src/main/scala/tilelink/Coalescing.scala @@ -372,14 +372,21 @@ class MonoCoalescer(coalLogSize: Int, windowT: CoalShiftQueue[ReqQueueEntry], .reduce(_ +& _)) val canCoalesce = matchCounts.map(_ > 1.U) - // Elect the leader out of all potential leaders that have matchCounts > 1. + // Elect the leader that has the most match counts. // TODO: potentially expensive: magnitude comparator - // Maybe choose leftmost leader (priority encoder) instead of argmax - val chosenLeaderIdx = matchCounts.zipWithIndex.map { - case (c, i) => (c, i.U) - }.reduce[(UInt, UInt)] { case ((c0, i), (c1, j)) => - (Mux(c0 >= c1, c0, c1), Mux(c0 >= c1, i, j)) - }._2 + def chooseLeaderArgMax(matchCounts: Seq[UInt]): UInt = { + matchCounts.zipWithIndex.map { + case (c, i) => (c, i.U) + }.reduce[(UInt, UInt)] { case ((c0, i), (c1, j)) => + (Mux(c0 >= c1, c0, c1), Mux(c0 >= c1, i, j)) + }._2 + } + // Elect leader by choosing the smallest-index lane that has a valid + // match, i.e. using priority encoder. + def chooseLeaderPriorityEncoder(matchCounts: Seq[UInt]): UInt = { + PriorityEncoder(matchCounts.map(_ > 1.U)) + } + val chosenLeaderIdx = chooseLeaderPriorityEncoder(matchCounts) val chosenLeader = VecInit(leaders)(chosenLeaderIdx) // matchTable for the chosen lane, but converted to a Vec[UInt] @@ -1530,7 +1537,7 @@ class DummyDriver(config: CoalescerConfig)(implicit p: Parameters) val clientParam = Seq( TLMasterParameters.v1( name = "dummy-core-node-" + i.toString, - sourceId = IdRange(0, defaultConfig.numOldSrcIds) + sourceId = IdRange(0, config.numOldSrcIds) // visibility = Seq(AddressSet(0x0000, 0xffffff)) ) )