From abecd30b2b988af70e677b9f6925e09f79bb2086 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Fri, 7 Apr 2023 14:50:40 -0700 Subject: [PATCH 01/37] Store sourceId for every old req entry in table --- src/main/scala/tilelink/Coalescing.scala | 60 ++++++++++++------- .../scala/coalescing/CoalescingUnitTest.scala | 12 ++-- 2 files changed, 45 insertions(+), 27 deletions(-) diff --git a/src/main/scala/tilelink/Coalescing.scala b/src/main/scala/tilelink/Coalescing.scala index d199b1a..5c7c8e1 100644 --- a/src/main/scala/tilelink/Coalescing.scala +++ b/src/main/scala/tilelink/Coalescing.scala @@ -56,7 +56,7 @@ class CoalescingUnitImp(outer: CoalescingUnit, numLanes: Int) extends LazyModule val wordSize = 4 val reqQueueDepth = 4 // FIXME test - val respQueueDepth = 2 // FIXME test + val respQueueDepth = 4 // FIXME test val sourceWidth = outer.node.in(1)._1.params.sourceBits val addressWidth = outer.node.in(1)._1.params.addressBits @@ -240,11 +240,13 @@ class CoalescingUnitImp(outer: CoalescingUnit, numLanes: Int) extends LazyModule val newEntry = Wire( new InflightCoalReqTableEntry(numLanes, numPerLaneReqs, sourceWidth, offsetBits, sizeBits) ) + println(s"=========== table sourceWidth: ${sourceWidth}") newEntry.source := coalSourceId newEntry.lanes.foreach { l => - l.reqs.foreach { r => + l.reqs.zipWithIndex.foreach { case (r, i) => // TODO: this part needs the actual coalescing logic to work r.valid := false.B + r.source := i.U //FIXME bogus r.offset := 1.U r.size := 2.U } @@ -273,6 +275,10 @@ class CoalescingUnitImp(outer: CoalescingUnit, numLanes: Int) extends LazyModule uncoalescer.io.coalRespSrcId := tlCoal.d.bits.source uncoalescer.io.coalRespData := tlCoal.d.bits.data + // TODO: multibeat TL requests. 
Currently tlCoal.d.bits.data is fixed to 64b + // width + println(s"=========== coalRespData width: ${tlCoal.d.bits.data.widthOption.get}") + // Queue up synthesized uncoalesced responses into each lane's response queue (respQueues zip uncoalescer.io.uncoalResps).foreach { case (q, lanes) => lanes.zipWithIndex.foreach { case (resp, i) => @@ -354,23 +360,22 @@ class UncoalescingUnit( // Un-coalesce responses back to individual lanes val found = inflightTable.io.lookup.bits - (found.lanes zip io.uncoalResps).foreach { case (lane, ioLane) => - lane.reqs.zipWithIndex.foreach { case (req, i) => - val ioReq = ioLane(i) + (found.lanes zip io.uncoalResps).foreach { case (perLane, ioPerLane) => + perLane.reqs.zipWithIndex.foreach { case (oldReq, i) => + val ioOldReq = ioPerLane(i) // FIXME: only looking at 0th srcId entry - ioReq.valid := false.B - ioReq.bits := DontCare + ioOldReq.valid := false.B + ioOldReq.bits := DontCare when(inflightTable.io.lookup.valid) { - ioReq.valid := req.valid - ioReq.bits.source := 0.U - + ioOldReq.valid := oldReq.valid + ioOldReq.bits.source := oldReq.source // FIXME: disregard size enum for now val byteSize = 4 - ioReq.bits.data := - getCoalescedDataChunk(io.coalRespData, coalDataWidth, req.offset, byteSize) + ioOldReq.bits.data := + getCoalescedDataChunk(io.coalRespData, coalDataWidth, oldReq.offset, byteSize) } } } @@ -414,6 +419,8 @@ class InflightCoalReqTable( table(i).valid := false.B table(i).bits.lanes.foreach { l => l.reqs.foreach { r => + r.valid := false.B + r.source := 0.U r.offset := 0.U r.size := 0.U } @@ -467,14 +474,16 @@ class InflightCoalReqTableEntry( val offsetBits: Int, val sizeBits: Int ) extends Bundle { - class CoreReq extends Bundle { + class PerCoreReq extends Bundle { val valid = Bool() + // FIXME: oldId and newId shares the same width + val source = UInt(sourceWidth.W) val offset = UInt(offsetBits.W) val size = UInt(sizeBits.W) } class PerLane extends Bundle { // FIXME: if numPerLaneReqs != 2 ** sourceWidth, we 
need to store srcId as well - val reqs = Vec(numPerLaneReqs, new CoreReq) + val reqs = Vec(numPerLaneReqs, new PerCoreReq) } // sourceId of the coalesced response that just came back. This will be the // key that queries the table. @@ -570,7 +579,9 @@ class CoalShiftQueue[T <: Data]( io.count := PopCount(io.mask) } -class MemTraceDriver(numLanes: Int = 4, traceFile : String = "vecadd.core1.thread4.trace")(implicit p: Parameters) extends LazyModule { +class MemTraceDriver(numLanes: Int = 4, traceFile: String = "vecadd.core1.thread4.trace")(implicit + p: Parameters +) extends LazyModule { // Create N client nodes together val laneNodes = Seq.tabulate(numLanes) { i => @@ -600,7 +611,7 @@ class TraceReq extends Bundle { val data = UInt(64.W) } -class MemTraceDriverImp(outer: MemTraceDriver, numLanes: Int, traceFile : String) +class MemTraceDriverImp(outer: MemTraceDriver, numLanes: Int, traceFile: String) extends LazyModuleImp(outer) with UnitTestModule { val sim = Module( @@ -630,11 +641,11 @@ class MemTraceDriverImp(outer: MemTraceDriver, numLanes: Int, traceFile : String val sourceIdCounter = RegInit(0.U(64.W)) sourceIdCounter := sourceIdCounter + 1.U - //Issue here is that Vortex mem range is not within Chipyard Mem range - //In default setting, all mem-req for program data must be within 0X80000000 -> 0X90000000 - // - def hashToValidPhyAddr(addr : UInt) : UInt = { - Cat(8.U(4.W), addr(27, 3), 0.U(3.W) ) + // Issue here is that Vortex mem range is not within Chipyard Mem range + // In default setting, all mem-req for program data must be within + // 0X80000000 -> 0X90000000 + def hashToValidPhyAddr(addr: UInt): UInt = { + Cat(8.U(4.W), addr(27, 3), 0.U(3.W)) } // Connect each lane to its respective TL node. 
@@ -668,8 +679,11 @@ class MemTraceDriverImp(outer: MemTraceDriver, numLanes: Int, traceFile : String } io.finished := sim.io.trace_read.finished - when(io.finished){ - assert(false.B, "\n\n\nsimulation Successfully finished\n\n\n (this assertion intentional fail upon MemTracer termination)") + when(io.finished) { + assert( + false.B, + "\n\n\nsimulation Successfully finished\n\n\n (this assertion intentional fail upon MemTracer termination)" + ) } // Clock Counter, for debugging purpose diff --git a/src/test/scala/coalescing/CoalescingUnitTest.scala b/src/test/scala/coalescing/CoalescingUnitTest.scala index 7d7099b..47db1b6 100644 --- a/src/test/scala/coalescing/CoalescingUnitTest.scala +++ b/src/test/scala/coalescing/CoalescingUnitTest.scala @@ -238,15 +238,19 @@ class UncoalescingUnitTest extends AnyFlatSpec with ChiselScalatestTester { c.io.coalReqValid.poke(true.B) c.io.newEntry.source.poke(sourceId) c.io.newEntry.lanes(0).reqs(0).valid.poke(true.B) + c.io.newEntry.lanes(0).reqs(0).source.poke(1.U) c.io.newEntry.lanes(0).reqs(0).offset.poke(1.U) c.io.newEntry.lanes(0).reqs(0).size.poke(2.U) c.io.newEntry.lanes(0).reqs(1).valid.poke(true.B) + c.io.newEntry.lanes(0).reqs(1).source.poke(2.U) c.io.newEntry.lanes(0).reqs(1).offset.poke(1.U) c.io.newEntry.lanes(0).reqs(1).size.poke(2.U) c.io.newEntry.lanes(2).reqs(0).valid.poke(true.B) + c.io.newEntry.lanes(2).reqs(0).source.poke(1.U) c.io.newEntry.lanes(2).reqs(0).offset.poke(2.U) c.io.newEntry.lanes(2).reqs(0).size.poke(1.U) c.io.newEntry.lanes(2).reqs(1).valid.poke(true.B) + c.io.newEntry.lanes(2).reqs(1).source.poke(2.U) c.io.newEntry.lanes(2).reqs(1).offset.poke(0.U) c.io.newEntry.lanes(2).reqs(1).size.poke(2.U) @@ -268,13 +272,13 @@ class UncoalescingUnitTest extends AnyFlatSpec with ChiselScalatestTester { c.io.uncoalResps(3)(0).valid.expect(false.B) c.io.uncoalResps(0)(0).bits.data.expect(0x89abcdefL.U) - c.io.uncoalResps(0)(0).bits.source.expect(0.U) + c.io.uncoalResps(0)(0).bits.source.expect(1.U) 
c.io.uncoalResps(0)(1).bits.data.expect(0x89abcdefL.U) - c.io.uncoalResps(0)(1).bits.source.expect(0.U) + c.io.uncoalResps(0)(1).bits.source.expect(2.U) c.io.uncoalResps(2)(0).bits.data.expect(0x5ca1ab1eL.U) - c.io.uncoalResps(2)(0).bits.source.expect(0.U) + c.io.uncoalResps(2)(0).bits.source.expect(1.U) c.io.uncoalResps(2)(1).bits.data.expect(0x01234567L.U) - c.io.uncoalResps(2)(1).bits.source.expect(0.U) + c.io.uncoalResps(2)(1).bits.source.expect(2.U) } } } From 109ad2cac036bd5bc53dcd39a2e9d9899ee0bd58 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Fri, 7 Apr 2023 16:23:14 -0700 Subject: [PATCH 02/37] Write MemTraceLogger and new synthesizable unittest --- src/main/scala/tilelink/Coalescing.scala | 57 +++++++++++++++++++++--- 1 file changed, 50 insertions(+), 7 deletions(-) diff --git a/src/main/scala/tilelink/Coalescing.scala b/src/main/scala/tilelink/Coalescing.scala index 5c7c8e1..33523b8 100644 --- a/src/main/scala/tilelink/Coalescing.scala +++ b/src/main/scala/tilelink/Coalescing.scala @@ -211,6 +211,7 @@ class CoalescingUnitImp(outer: CoalescingUnit, numLanes: Int) extends LazyModule reqQueues(2).io.deq.valid && reqQueues(3).io.deq.valid when(coalReqValid) { // invalidate original requests due to coalescing + // FIXME: bogus reqQueues(0).io.invalidate := 0x1.U reqQueues(1).io.invalidate := 0x1.U reqQueues(2).io.invalidate := 0x1.U @@ -246,7 +247,7 @@ class CoalescingUnitImp(outer: CoalescingUnit, numLanes: Int) extends LazyModule l.reqs.zipWithIndex.foreach { case (r, i) => // TODO: this part needs the actual coalescing logic to work r.valid := false.B - r.source := i.U //FIXME bogus + r.source := i.U // FIXME bogus r.offset := 1.U r.size := 2.U } @@ -722,7 +723,51 @@ class SimMemTrace(filename: String, numLanes: Int) addResource("/csrc/SimMemTrace.h") } -class CoalConnectTrace(implicit p: Parameters) extends LazyModule { +class MemTraceLogger(numLanes: Int = 5)(implicit p: Parameters) extends LazyModule { + val beatBytes = 4 // FIXME: hardcoded + val 
node = TLManagerNode(Seq.tabulate(numLanes) { _ => + TLSlavePortParameters.v1( + Seq( + TLSlaveParameters.v1( + address = List(AddressSet(0x0000, 0xffffff)), // FIXME: hardcoded + supportsGet = TransferSizes(1, beatBytes), + supportsPutPartial = TransferSizes(1, beatBytes), + supportsPutFull = TransferSizes(1, beatBytes) + ) + ), + beatBytes = beatBytes + ) + }) + + lazy val module = new Impl + class Impl extends LazyModuleImp(this) {} +} + +// synthesizable unit tests + +class CoalescerLogger(implicit p: Parameters) extends LazyModule { + // TODO: use parameters for numLanes + val numLanes = 4 + val coal = LazyModule(new CoalescingUnit(numLanes)) + val driver = LazyModule(new MemTraceDriver(numLanes)) + val logger = LazyModule(new MemTraceLogger(numLanes + 1)) // +1 for coalesced edge + + logger.node :=* coal.node :=* driver.node + + lazy val module = new Impl + class Impl extends LazyModuleImp(this) with UnitTestModule { + driver.module.io.start := io.start + io.finished := driver.module.io.finished + } +} + +class CoalescerLoggerTest(timeout: Int = 500000)(implicit p: Parameters) extends UnitTest(timeout) { + val dut = Module(LazyModule(new CoalescerLogger).module) + dut.io.start := io.start + io.finished := dut.io.finished +} + +class TLRAMCoalescer(implicit p: Parameters) extends LazyModule { // TODO: use parameters for numLanes val numLanes = 4 val coal = LazyModule(new CoalescingUnit(numLanes)) @@ -730,11 +775,9 @@ class CoalConnectTrace(implicit p: Parameters) extends LazyModule { coal.node :=* driver.node - // Use TLTestRAM as bogus downstream TL manager nodes - // TODO: swap this out with a memtrace logger val rams = Seq.tabulate(numLanes + 1) { _ => LazyModule( - // TODO: properly propagate beatBytes? + // FIXME: properly propagate beatBytes? 
new TLRAM(address = AddressSet(0x0000, 0xffffff), beatBytes = 8) ) } @@ -748,8 +791,8 @@ class CoalConnectTrace(implicit p: Parameters) extends LazyModule { } } -class CoalescingUnitTest(timeout: Int = 500000)(implicit p: Parameters) extends UnitTest(timeout) { - val dut = Module(LazyModule(new CoalConnectTrace).module) +class TLRAMCoalescerTest(timeout: Int = 500000)(implicit p: Parameters) extends UnitTest(timeout) { + val dut = Module(LazyModule(new TLRAMCoalescer).module) dut.io.start := io.start io.finished := dut.io.finished } From 452cc40eb7d1808fc3931f3ac5905db95ab2ac9d Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Fri, 7 Apr 2023 17:22:52 -0700 Subject: [PATCH 03/37] Make MemTraceLogger pass-through node Instead of making MemTraceLogger a TL slave, make it an IdentityNode that simply snoops on the TL edges and generates logs. We can attach a TLRAM at the downstream to actually get response back, rather than MemTraceLogger simply absorbing all requests. --- src/main/scala/tilelink/Coalescing.scala | 85 +++++++++++++++--------- 1 file changed, 52 insertions(+), 33 deletions(-) diff --git a/src/main/scala/tilelink/Coalescing.scala b/src/main/scala/tilelink/Coalescing.scala index 33523b8..80bf9bc 100644 --- a/src/main/scala/tilelink/Coalescing.scala +++ b/src/main/scala/tilelink/Coalescing.scala @@ -216,13 +216,16 @@ class CoalescingUnitImp(outer: CoalescingUnit, numLanes: Int) extends LazyModule reqQueues(1).io.invalidate := 0x1.U reqQueues(2).io.invalidate := 0x1.U reqQueues(3).io.invalidate := 0x1.U + printf("coalescing succeeded!\n") } + // TODO: write request val (legal, bits) = edgeCoal.Get( fromSource = coalSourceId, // `toAddress` should be aligned to 2**lgSize toAddress = coalReqAddress, // 64 bits = 8 bytes = 2**(3) bytes + // TODO: parameterize to eg. 
cache line size lgSize = 3.U ) assert(legal, "unhandled illegal TL req gen") @@ -235,7 +238,7 @@ class CoalescingUnitImp(outer: CoalescingUnit, numLanes: Int) extends LazyModule // Construct new entry for the inflight table // FIXME: don't instantiate inflight table entry type here. It leaks the table's impl - // detail outside to the coalescer + // detail to the coalescer val offsetBits = 4 // FIXME hardcoded val sizeBits = 2 // FIXME hardcoded val newEntry = Wire( @@ -348,7 +351,10 @@ class UncoalescingUnit( def getCoalescedDataChunk(data: UInt, dataWidth: Int, offset: UInt, byteSize: Int): UInt = { val bitSize = byteSize * 8 val sizeMask = (1.U << bitSize) - 1.U - assert(dataWidth % bitSize == 0, "coalesced data width not evenly divisible by size") + assert( + dataWidth > 0 && dataWidth % bitSize == 0, + s"coalesced data width ($dataWidth) not evenly divisible by core req size ($bitSize)" + ) val numChunks = dataWidth / bitSize val chunks = Wire(Vec(numChunks, UInt(bitSize.W))) val offsets = (0 until numChunks) @@ -434,13 +440,12 @@ class InflightCoalReqTable( .map { i => table(i).valid } .reduce { (v0, v1) => v0 && v1 } // Inflight table should never be full. It should have enough number of - // entries to keep track of all outstanding core-side requests; otherwise, - // it will stall the core issuing logic. - assert(!full, "table is blocking coalescer") + // entries to keep track of all outstanding core-side requests, i.e. + // (2 ** oldSrcIdBits) entries. 
+ assert(!full, "inflight table is full and blocking coalescer") dontTouch(full) // Enqueue logic - // io.enq.ready := !full val enqFire = io.enq.ready && io.enq.valid when(enqFire) { @@ -455,7 +460,6 @@ class InflightCoalReqTable( } // Lookup logic - // io.lookup.valid := table(io.lookupSourceId).valid io.lookup.bits := table(io.lookupSourceId).bits val lookupFire = io.lookup.ready && io.lookup.valid @@ -723,36 +727,52 @@ class SimMemTrace(filename: String, numLanes: Int) addResource("/csrc/SimMemTrace.h") } -class MemTraceLogger(numLanes: Int = 5)(implicit p: Parameters) extends LazyModule { - val beatBytes = 4 // FIXME: hardcoded - val node = TLManagerNode(Seq.tabulate(numLanes) { _ => - TLSlavePortParameters.v1( - Seq( - TLSlaveParameters.v1( - address = List(AddressSet(0x0000, 0xffffff)), // FIXME: hardcoded - supportsGet = TransferSizes(1, beatBytes), - supportsPutPartial = TransferSizes(1, beatBytes), - supportsPutFull = TransferSizes(1, beatBytes) - ) - ), - beatBytes = beatBytes - ) - }) +class MemTraceLogger(numLanes: Int = 4)(implicit p: Parameters) extends LazyModule { + val node = TLIdentityNode() + + // val beatBytes = 8 // FIXME: hardcoded + // val node = TLManagerNode(Seq.tabulate(numLanes) { _ => + // TLSlavePortParameters.v1( + // Seq( + // TLSlaveParameters.v1( + // address = List(AddressSet(0x0000, 0xffffff)), // FIXME: hardcoded + // supportsGet = TransferSizes(1, beatBytes), + // supportsPutPartial = TransferSizes(1, beatBytes), + // supportsPutFull = TransferSizes(1, beatBytes) + // ) + // ), + // beatBytes = beatBytes + // ) + // }) lazy val module = new Impl - class Impl extends LazyModuleImp(this) {} + class Impl extends LazyModuleImp(this) { + (node.in zip node.out).foreach { + case ((tlIn, _), (tlOut, _)) => + tlOut.a <> tlIn.a + tlIn.d <> tlOut.d + } + } } // synthesizable unit tests -class CoalescerLogger(implicit p: Parameters) extends LazyModule { +// tracedriver --> coalescer --> tracelogger --> tlram +class 
TLRAMCoalescerLogger(implicit p: Parameters) extends LazyModule { // TODO: use parameters for numLanes val numLanes = 4 val coal = LazyModule(new CoalescingUnit(numLanes)) val driver = LazyModule(new MemTraceDriver(numLanes)) - val logger = LazyModule(new MemTraceLogger(numLanes + 1)) // +1 for coalesced edge + val logger = LazyModule(new MemTraceLogger(numLanes + 1)) + val rams = Seq.fill(numLanes + 1)( // +1 for coalesced edge + LazyModule( + // FIXME: properly propagate beatBytes? + new TLRAM(address = AddressSet(0x0000, 0xffffff), beatBytes = 8) + ) + ) logger.node :=* coal.node :=* driver.node + rams.foreach { r => r.node := logger.node } lazy val module = new Impl class Impl extends LazyModuleImp(this) with UnitTestModule { @@ -761,27 +781,26 @@ class CoalescerLogger(implicit p: Parameters) extends LazyModule { } } -class CoalescerLoggerTest(timeout: Int = 500000)(implicit p: Parameters) extends UnitTest(timeout) { - val dut = Module(LazyModule(new CoalescerLogger).module) +class TLRAMCoalescerLoggerTest(timeout: Int = 500000)(implicit p: Parameters) extends UnitTest(timeout) { + val dut = Module(LazyModule(new TLRAMCoalescerLogger).module) dut.io.start := io.start io.finished := dut.io.finished } +// tracedriver --> coalescer --> tlram class TLRAMCoalescer(implicit p: Parameters) extends LazyModule { // TODO: use parameters for numLanes val numLanes = 4 val coal = LazyModule(new CoalescingUnit(numLanes)) val driver = LazyModule(new MemTraceDriver(numLanes)) - - coal.node :=* driver.node - - val rams = Seq.tabulate(numLanes + 1) { _ => + val rams = Seq.fill(numLanes + 1) ( // +1 for coalesced edge LazyModule( // FIXME: properly propagate beatBytes? 
new TLRAM(address = AddressSet(0x0000, 0xffffff), beatBytes = 8) ) - } - // Connect all (N+1) outputs of coal to separate TestRAM modules + ) + + coal.node :=* driver.node rams.foreach { r => r.node := coal.node } lazy val module = new Impl From af29acdcda0675e542737e4148516bc57ec52d18 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Sun, 9 Apr 2023 14:53:02 -0700 Subject: [PATCH 04/37] Placeholder for MemTraceLogger C++ code --- src/main/resources/csrc/SimMemTraceLogger.cc | 127 +++++++++++++++++++ src/main/scala/tilelink/Coalescing.scala | 52 ++++++-- 2 files changed, 170 insertions(+), 9 deletions(-) create mode 100644 src/main/resources/csrc/SimMemTraceLogger.cc diff --git a/src/main/resources/csrc/SimMemTraceLogger.cc b/src/main/resources/csrc/SimMemTraceLogger.cc new file mode 100644 index 0000000..736d0ba --- /dev/null +++ b/src/main/resources/csrc/SimMemTraceLogger.cc @@ -0,0 +1,127 @@ +#ifndef NO_VPI +#include +#include +#endif +#include +#include +#include +#include +#include +#include "SimMemTraceLogger.h" + +// Global singleton instance +static std::unique_ptr logger; + +MemTraceLogger::MemTraceLogger(const std::string &filename) { + char cwd[4096]; + if (getcwd(cwd, sizeof(cwd))) { + printf("MemTraceLogger: current working dir: %s\n", cwd); + } + + infile.open(filename); + if (infile.fail()) { + fprintf(stderr, "failed to open file %s\n", filename.c_str()); + } +} + +MemTraceLogger::~MemTraceLogger() { + infile.close(); + printf("MemTraceLogger destroyed\n"); +} + +#if 0 +// Parse trace file in its entirety and store it into internal structure. 
+// TODO: might block for a long time when the trace gets big, check if need to +// be broken down +void MemTraceReader::parse() { + MemTraceLine line; + + printf("MemTraceReader: started parsing\n"); + + while (infile >> line.cycle >> line.loadstore >> line.core_id >> + line.lane_id >> std::hex >> line.address >> line.data >> std::dec >> + line.data_size) { + line.valid = true; + trace.push_back(line); + } + read_pos = trace.cbegin(); + + printf("MemTraceReader: finished parsing\n"); +} + +// Try to read a memory request that might have happened at a given cycle, on a +// given SIMD lane (= "thread"). In case no request happened at that point, +// return an empty line with .valid = false. +MemTraceLine MemTraceReader::read_trace_at(const long cycle, + const int lane_id) { + MemTraceLine line; + line.valid = false; + + // printf("tick(): cycle=%ld\n", cycle); + + if (finished()) { + return line; + } + + line = *read_pos; + // It should always be guaranteed that we consumed all of the past lines, and + // the next line is in the future. + if (line.cycle < cycle) { + // fprintf(stderr, "line.cycle=%ld, cycle=%ld\n", line.cycle, cycle); + assert(false && "some trace lines are left unread in the past"); + } + + if (line.lane_id != lane_id) { + line.valid = false; + } + if (line.cycle > cycle) { + // We haven't reached the cycle mark specified in this line yet, so we don't + // read it right now. + return MemTraceLine{}; + } else if (line.cycle == cycle && line.lane_id == lane_id) { + printf("fire! cycle=%ld, valid=%d, %s addr=%x \n", cycle, line.valid, + line.loadstore, line.address); + + // FIXME! Currently lane_id is assumed to be in round-robin order, e.g. + // 0->1->2->3->0->..., both in the trace file and the order the caller calls + // this function. If this is not true, we cannot simply monotonically + // increment read_pos. 
+ + // Only advance pointer when cycle and threa_id both match + // now increaseing sequence is fine (0, 1, 3), but unordered is not fine (0, 3, 1) + ++read_pos; + } + + return line; +} +#endif + +extern "C" void memtracelogger_init(const char *filename) { +#ifndef NO_VPI + s_vpi_vlog_info info; + if (!vpi_get_vlog_info(&info)) { + fprintf(stderr, "fatal: failed to get plusargs from VCS\n"); + exit(1); + } + const char* TRACEFILENAME_PLUSARG = "+memtracefile="; + for (int i = 0; i < info.argc; i++) { + char* input_arg = info.argv[i]; + if (strncmp(input_arg, TRACEFILENAME_PLUSARG, + strlen(TRACEFILENAME_PLUSARG)) == 0) { + filename = input_arg + strlen(TRACEFILENAME_PLUSARG); + break; + } + } +#endif + + printf("memtrace_init: filename=[%s]\n", filename); + + logger = std::make_unique(filename); +} + +// TODO: accept core_id as well +extern "C" void memtracelogger_log(unsigned char *trace_log_ready) { + // printf("memtrace_query(cycle=%ld, tid=%d)\n", trace_read_cycle, + // trace_read_lane_id); + *trace_log_ready = 1; +} diff --git a/src/main/scala/tilelink/Coalescing.scala b/src/main/scala/tilelink/Coalescing.scala index 80bf9bc..8d318a0 100644 --- a/src/main/scala/tilelink/Coalescing.scala +++ b/src/main/scala/tilelink/Coalescing.scala @@ -584,7 +584,7 @@ class CoalShiftQueue[T <: Data]( io.count := PopCount(io.mask) } -class MemTraceDriver(numLanes: Int = 4, traceFile: String = "vecadd.core1.thread4.trace")(implicit +class MemTraceDriver(numLanes: Int = 4, filename: String = "vecadd.core1.thread4.trace")(implicit p: Parameters ) extends LazyModule { @@ -605,7 +605,7 @@ class MemTraceDriver(numLanes: Int = 4, traceFile: String = "vecadd.core1.thread val node = TLIdentityNode() laneNodes.foreach { l => node := l } - lazy val module = new MemTraceDriverImp(this, numLanes, traceFile) + lazy val module = new MemTraceDriverImp(this, numLanes, filename) } class TraceReq extends Bundle { @@ -727,7 +727,7 @@ class SimMemTrace(filename: String, numLanes: Int) 
addResource("/csrc/SimMemTrace.h") } -class MemTraceLogger(numLanes: Int = 4)(implicit p: Parameters) extends LazyModule { +class MemTraceLogger(numLanes: Int = 4, filename: String = "vecadd.core1.thread4.trace")(implicit p: Parameters) extends LazyModule { val node = TLIdentityNode() // val beatBytes = 8 // FIXME: hardcoded @@ -747,14 +747,47 @@ class MemTraceLogger(numLanes: Int = 4)(implicit p: Parameters) extends LazyModu lazy val module = new Impl class Impl extends LazyModuleImp(this) { - (node.in zip node.out).foreach { - case ((tlIn, _), (tlOut, _)) => - tlOut.a <> tlIn.a - tlIn.d <> tlOut.d + val sim = Module(new SimMemTraceLogger(filename, numLanes)) + sim.io.clock := clock + sim.io.reset := reset.asBool + + (node.in zip node.out).foreach { case ((tlIn, _), (tlOut, _)) => + tlOut.a <> tlIn.a + tlIn.d <> tlOut.d } + + // io.finished := sim.io.trace_read.finished } } +class SimMemTraceLogger(filename: String, numLanes: Int) + extends BlackBox( + Map("FILENAME" -> filename, "NUM_LANES" -> numLanes) + ) + with HasBlackBoxResource { + val io = IO(new Bundle { + val clock = Input(Clock()) + val reset = Input(Bool()) + + // val trace_read = new Bundle { + // val ready = Input(Bool()) + // val valid = Output(UInt(numLanes.W)) + // // Chisel can't interface with Verilog 2D port, so flatten all lanes into + // // single wide 1D array. + // // TODO: assumes 64-bit address. 
+ // val address = Output(UInt((64 * numLanes).W)) + // val is_store = Output(UInt(numLanes.W)) + // val store_mask = Output(UInt((8 * numLanes).W)) + // val data = Output(UInt((64 * numLanes).W)) + // val finished = Output(Bool()) + // } + }) + + addResource("/vsrc/SimMemTraceLogger.v") + addResource("/csrc/SimMemTraceLogger.cc") + addResource("/csrc/SimMemTraceLogger.h") +} + // synthesizable unit tests // tracedriver --> coalescer --> tracelogger --> tlram @@ -781,7 +814,8 @@ class TLRAMCoalescerLogger(implicit p: Parameters) extends LazyModule { } } -class TLRAMCoalescerLoggerTest(timeout: Int = 500000)(implicit p: Parameters) extends UnitTest(timeout) { +class TLRAMCoalescerLoggerTest(timeout: Int = 500000)(implicit p: Parameters) + extends UnitTest(timeout) { val dut = Module(LazyModule(new TLRAMCoalescerLogger).module) dut.io.start := io.start io.finished := dut.io.finished @@ -793,7 +827,7 @@ class TLRAMCoalescer(implicit p: Parameters) extends LazyModule { val numLanes = 4 val coal = LazyModule(new CoalescingUnit(numLanes)) val driver = LazyModule(new MemTraceDriver(numLanes)) - val rams = Seq.fill(numLanes + 1) ( // +1 for coalesced edge + val rams = Seq.fill(numLanes + 1)( // +1 for coalesced edge LazyModule( // FIXME: properly propagate beatBytes? new TLRAM(address = AddressSet(0x0000, 0xffffff), beatBytes = 8) From b53711965ed642d74c1b9b5e3416a7df1b346b3e Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Mon, 10 Apr 2023 20:24:27 -0700 Subject: [PATCH 05/37] Connect TL edge data to SimMemTraceLogger TODO: since TileLink rounds all address down to a multiple of its beat size (8 in the current code), we can't directly compare the memory trace input to its output. Need to take masks into account. 
--- src/main/resources/csrc/SimMemTraceLogger.cc | 21 +++++-- src/main/scala/tilelink/Coalescing.scala | 58 ++++++++++++++------ 2 files changed, 55 insertions(+), 24 deletions(-) diff --git a/src/main/resources/csrc/SimMemTraceLogger.cc b/src/main/resources/csrc/SimMemTraceLogger.cc index 736d0ba..1fce4bd 100644 --- a/src/main/resources/csrc/SimMemTraceLogger.cc +++ b/src/main/resources/csrc/SimMemTraceLogger.cc @@ -3,7 +3,7 @@ #include #endif #include -#include +#include #include #include #include @@ -33,10 +33,10 @@ MemTraceLogger::~MemTraceLogger() { // Parse trace file in its entirety and store it into internal structure. // TODO: might block for a long time when the trace gets big, check if need to // be broken down -void MemTraceReader::parse() { +void MemTraceLogger::parse() { MemTraceLine line; - printf("MemTraceReader: started parsing\n"); + printf("MemTraceLogger: started parsing\n"); while (infile >> line.cycle >> line.loadstore >> line.core_id >> line.lane_id >> std::hex >> line.address >> line.data >> std::dec >> @@ -46,13 +46,13 @@ void MemTraceReader::parse() { } read_pos = trace.cbegin(); - printf("MemTraceReader: finished parsing\n"); + printf("MemTraceLogger: finished parsing\n"); } // Try to read a memory request that might have happened at a given cycle, on a // given SIMD lane (= "thread"). In case no request happened at that point, // return an empty line with .valid = false. 
-MemTraceLine MemTraceReader::read_trace_at(const long cycle, +MemTraceLine MemTraceLogger::read_trace_at(const long cycle, const int lane_id) { MemTraceLine line; line.valid = false; @@ -120,8 +120,17 @@ extern "C" void memtracelogger_init(const char *filename) { } // TODO: accept core_id as well -extern "C" void memtracelogger_log(unsigned char *trace_log_ready) { +extern "C" void memtracelogger_log(unsigned char trace_log_valid, + unsigned long trace_log_cycle, + unsigned long trace_log_address, + unsigned int trace_log_lane_id, + unsigned char *trace_log_ready) { // printf("memtrace_query(cycle=%ld, tid=%d)\n", trace_read_cycle, // trace_read_lane_id); *trace_log_ready = 1; + + if (trace_log_valid) { + printf("%s: [%lu] valid: address=0x%lx, tid=%u\n", __func__, + trace_log_cycle, trace_log_address, trace_log_lane_id); + } } diff --git a/src/main/scala/tilelink/Coalescing.scala b/src/main/scala/tilelink/Coalescing.scala index 8d318a0..e9a2aed 100644 --- a/src/main/scala/tilelink/Coalescing.scala +++ b/src/main/scala/tilelink/Coalescing.scala @@ -727,7 +727,9 @@ class SimMemTrace(filename: String, numLanes: Int) addResource("/csrc/SimMemTrace.h") } -class MemTraceLogger(numLanes: Int = 4, filename: String = "vecadd.core1.thread4.trace")(implicit p: Parameters) extends LazyModule { +class MemTraceLogger(numLanes: Int = 4, filename: String = "vecadd.core1.thread4.trace")(implicit + p: Parameters +) extends LazyModule { val node = TLIdentityNode() // val beatBytes = 8 // FIXME: hardcoded @@ -751,11 +753,30 @@ class MemTraceLogger(numLanes: Int = 4, filename: String = "vecadd.core1.thread4 sim.io.clock := clock sim.io.reset := reset.asBool - (node.in zip node.out).foreach { case ((tlIn, _), (tlOut, _)) => + val laneReqs = Wire(Vec(numLanes, new TraceReq)) + val laneValid = Wire(Vec(numLanes, Bool())) + val laneAddress = Wire(Vec(numLanes, UInt(64.W))) // FIXME: hardcoded + + // snoop on the TileLink edges to log traffic + ((node.in zip node.out) zip 
laneReqs).foreach { case (((tlIn, _), (tlOut, _)), req) => tlOut.a <> tlIn.a tlIn.d <> tlOut.d + + req.valid := tlIn.a.valid + req.address := tlIn.a.bits.address + req.data := tlIn.a.bits.data + req.is_store := false.B // FIXME: take is_store from TL + req.mask := tlIn.a.bits.mask } + laneReqs.zipWithIndex.foreach { case (req, i) => + laneValid(i) := req.valid.asUInt + laneAddress(i) := req.address + } + // flatten per-lane signals to the Verilog blackbox input + sim.io.trace_log.valid := laneValid.asUInt + sim.io.trace_log.address := laneAddress.asUInt + // io.finished := sim.io.trace_read.finished } } @@ -769,18 +790,18 @@ class SimMemTraceLogger(filename: String, numLanes: Int) val clock = Input(Clock()) val reset = Input(Bool()) - // val trace_read = new Bundle { - // val ready = Input(Bool()) - // val valid = Output(UInt(numLanes.W)) - // // Chisel can't interface with Verilog 2D port, so flatten all lanes into - // // single wide 1D array. - // // TODO: assumes 64-bit address. - // val address = Output(UInt((64 * numLanes).W)) - // val is_store = Output(UInt(numLanes.W)) - // val store_mask = Output(UInt((8 * numLanes).W)) - // val data = Output(UInt((64 * numLanes).W)) - // val finished = Output(Bool()) - // } + // Chisel can't interface with Verilog 2D port, so flatten all lanes into + // single wide 1D array. + val trace_log = new Bundle { + val valid = Input(UInt(numLanes.W)) + val address = Input(UInt((64 * numLanes).W)) + // val ready = Output(Bool()) + // TODO: assumes 64-bit address. 
+ // val is_store = Output(UInt(numLanes.W)) + // val store_mask = Output(UInt((8 * numLanes).W)) + // val data = Output(UInt((64 * numLanes).W)) + // val finished = Output(Bool()) + } }) addResource("/vsrc/SimMemTraceLogger.v") @@ -794,17 +815,18 @@ class SimMemTraceLogger(filename: String, numLanes: Int) class TLRAMCoalescerLogger(implicit p: Parameters) extends LazyModule { // TODO: use parameters for numLanes val numLanes = 4 - val coal = LazyModule(new CoalescingUnit(numLanes)) + // val coal = LazyModule(new CoalescingUnit(numLanes)) val driver = LazyModule(new MemTraceDriver(numLanes)) - val logger = LazyModule(new MemTraceLogger(numLanes + 1)) - val rams = Seq.fill(numLanes + 1)( // +1 for coalesced edge + val logger = LazyModule(new MemTraceLogger(numLanes)) + val rams = Seq.fill(numLanes)( // +1 for coalesced edge LazyModule( // FIXME: properly propagate beatBytes? new TLRAM(address = AddressSet(0x0000, 0xffffff), beatBytes = 8) ) ) - logger.node :=* coal.node :=* driver.node + // logger.node :=* coal.node :=* driver.node + logger.node :=* driver.node rams.foreach { r => r.node := logger.node } lazy val module = new Impl From dca52ace0be76e04d3ee95f1a852d1ca36fa3bd7 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Mon, 10 Apr 2023 20:37:26 -0700 Subject: [PATCH 06/37] Fix verilog lint error --- src/main/resources/vsrc/SimMemTrace.v | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/resources/vsrc/SimMemTrace.v b/src/main/resources/vsrc/SimMemTrace.v index 9a91848..d5c5584 100644 --- a/src/main/resources/vsrc/SimMemTrace.v +++ b/src/main/resources/vsrc/SimMemTrace.v @@ -41,7 +41,7 @@ module SimMemTrace #(parameter FILENAME = "undefined", NUM_LANES = 4) ( longint __in_address[NUM_LANES-1:0]; bit __in_is_store[NUM_LANES-1:0]; - int __in_store_mask [NUM_LANES-1:0]; + logic [`MASK_WIDTH-1:0] __in_store_mask [NUM_LANES-1:0]; longint __in_data[NUM_LANES-1:0]; bit __in_finished; From 62f940618e9856258173d79f50bc70d98cec1417 Mon Sep 17 
00:00:00 2001 From: Hansung Kim Date: Mon, 10 Apr 2023 21:03:20 -0700 Subject: [PATCH 07/37] Convert manual bitshift to Chisel bitfield access --- src/main/scala/tilelink/Coalescing.scala | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/src/main/scala/tilelink/Coalescing.scala b/src/main/scala/tilelink/Coalescing.scala index e9a2aed..37f76b1 100644 --- a/src/main/scala/tilelink/Coalescing.scala +++ b/src/main/scala/tilelink/Coalescing.scala @@ -629,15 +629,13 @@ class MemTraceDriverImp(outer: MemTraceDriver, numLanes: Int, traceFile: String) // Split output of SimMemTrace, which is flattened across all lanes, // back to each lane's. - // Maybe this part can be improved, since now we are still mannually shifting everything val laneReqs = Wire(Vec(numLanes, new TraceReq)) laneReqs.zipWithIndex.foreach { case (req, i) => - req.valid := (sim.io.trace_read.valid >> i) - req.address := (sim.io.trace_read.address >> (64 * i)) - req.is_store := (sim.io.trace_read.is_store >> i) - req.mask := (sim.io.trace_read.store_mask >> (8 * i)) - req.data := (sim.io.trace_read.data >> (64 * i)) - + req.valid := sim.io.trace_read.valid(i) + req.address := sim.io.trace_read.address(64 * i + 63, 64 * i) + req.is_store := sim.io.trace_read.is_store(i) + req.mask := sim.io.trace_read.store_mask(8 * i + 7, 8 * i) + req.data := sim.io.trace_read.data(64 * i + 63, 64 * i) } // To prevent collision of sourceId with a current in-flight message, @@ -653,7 +651,7 @@ class MemTraceDriverImp(outer: MemTraceDriver, numLanes: Int, traceFile: String) Cat(8.U(4.W), addr(27, 3), 0.U(3.W)) } - // Connect each lane to its respective TL node. + // Generate TL requests according to the trace line. 
(outer.laneNodes zip laneReqs).foreach { case (node, req) => val (tlOut, edge) = node.out(0) @@ -754,8 +752,6 @@ class MemTraceLogger(numLanes: Int = 4, filename: String = "vecadd.core1.thread4 sim.io.reset := reset.asBool val laneReqs = Wire(Vec(numLanes, new TraceReq)) - val laneValid = Wire(Vec(numLanes, Bool())) - val laneAddress = Wire(Vec(numLanes, UInt(64.W))) // FIXME: hardcoded // snoop on the TileLink edges to log traffic ((node.in zip node.out) zip laneReqs).foreach { case (((tlIn, _), (tlOut, _)), req) => @@ -769,15 +765,15 @@ class MemTraceLogger(numLanes: Int = 4, filename: String = "vecadd.core1.thread4 req.mask := tlIn.a.bits.mask } + val laneValid = Wire(Vec(numLanes, Bool())) + val laneAddress = Wire(Vec(numLanes, UInt(64.W))) // FIXME: hardcoded laneReqs.zipWithIndex.foreach { case (req, i) => - laneValid(i) := req.valid.asUInt + laneValid(i) := req.valid laneAddress(i) := req.address } // flatten per-lane signals to the Verilog blackbox input sim.io.trace_log.valid := laneValid.asUInt sim.io.trace_log.address := laneAddress.asUInt - - // io.finished := sim.io.trace_read.finished } } From 71f334bb229c96d35545d047d6a16430ef4624e8 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Tue, 11 Apr 2023 17:36:45 -0700 Subject: [PATCH 08/37] Fix size parsing from memtrace --- src/main/resources/csrc/SimMemTrace.cc | 12 +++--- src/main/resources/csrc/SimMemTrace.h | 7 ++-- src/main/resources/vsrc/SimMemTrace.v | 48 ++++++++++++------------ src/main/scala/tilelink/Coalescing.scala | 30 +++++++++------ 4 files changed, 51 insertions(+), 46 deletions(-) diff --git a/src/main/resources/csrc/SimMemTrace.cc b/src/main/resources/csrc/SimMemTrace.cc index 5e29d57..0e3274b 100644 --- a/src/main/resources/csrc/SimMemTrace.cc +++ b/src/main/resources/csrc/SimMemTrace.cc @@ -75,8 +75,8 @@ MemTraceLine MemTraceReader::read_trace_at(const long cycle, // read it right now. return MemTraceLine{}; } else if (line.cycle == cycle && line.lane_id == lane_id) { - printf("fire! 
cycle=%ld, valid=%d, %s addr=%x \n", cycle, line.valid, - line.loadstore, line.address); + printf("fire! cycle=%ld, valid=%d, %s addr=%lx, size=%d \n", cycle, line.valid, + line.loadstore, line.address, line.data_size); // FIXME! Currently lane_id is assumed to be in round-robin order, e.g. // 0->1->2->3->0->..., both in the trace file and the order the caller calls @@ -119,11 +119,11 @@ extern "C" void memtrace_init(const char *filename) { // TODO: accept core_id as well extern "C" void memtrace_query(unsigned char trace_read_ready, unsigned long trace_read_cycle, - int trace_read_lane_id, + int trace_read_lane_id, unsigned char *trace_read_valid, unsigned long *trace_read_address, unsigned char *trace_read_is_store, - int *trace_read_store_mask, + int *trace_read_size, unsigned long *trace_read_data, unsigned char *trace_read_finished) { // printf("memtrace_query(cycle=%ld, tid=%d)\n", trace_read_cycle, @@ -136,8 +136,8 @@ extern "C" void memtrace_query(unsigned char trace_read_ready, auto line = reader->read_trace_at(trace_read_cycle, trace_read_lane_id); *trace_read_valid = line.valid; *trace_read_address = line.address; - *trace_read_is_store = strcmp(line.loadstore, "STORE") == 0 ; - *trace_read_store_mask = line.data_size; + *trace_read_is_store = (strcmp(line.loadstore, "STORE") == 0); + *trace_read_size = line.data_size; *trace_read_data = line.data; // This means finished and valid will go up at the same cycle. Need to // handle this without skipping the last line. 
diff --git a/src/main/resources/csrc/SimMemTrace.h b/src/main/resources/csrc/SimMemTrace.h index b046fcc..033fc7e 100644 --- a/src/main/resources/csrc/SimMemTrace.h +++ b/src/main/resources/csrc/SimMemTrace.h @@ -34,11 +34,10 @@ public: extern "C" void memtrace_init(const char *filename); extern "C" void memtrace_query(unsigned char trace_read_ready, unsigned long trace_read_cycle, - int trace_read_lane_id, + int trace_read_lane_id, unsigned char *trace_read_valid, unsigned long *trace_read_address, unsigned char *trace_read_is_store, - int *trace_read_store_mask, + int *trace_read_size, unsigned long *trace_read_data, - unsigned char *trace_read_finished - ); + unsigned char *trace_read_finished); diff --git a/src/main/resources/vsrc/SimMemTrace.v b/src/main/resources/vsrc/SimMemTrace.v index d5c5584..b18fcab 100644 --- a/src/main/resources/vsrc/SimMemTrace.v +++ b/src/main/resources/vsrc/SimMemTrace.v @@ -1,6 +1,6 @@ `define DATA_WIDTH 64 `define MAX_NUM_LANES 32 -`define MASK_WIDTH 8 +`define SIZE_WIDTH 32 import "DPI-C" function void memtrace_init( input string filename @@ -18,31 +18,31 @@ import "DPI-C" function void memtrace_query output bit trace_read_valid, output longint trace_read_address, output bit trace_read_is_store, - output int trace_read_store_mask, + output int trace_read_size, output longint trace_read_data, output bit trace_read_finished ); module SimMemTrace #(parameter FILENAME = "undefined", NUM_LANES = 4) ( - input clock, - input reset, + input clock, + input reset, // These have to match the IO port of the Chisel wrapper module. 
- input trace_read_ready, - output [NUM_LANES-1:0] trace_read_valid, + input trace_read_ready, + output [NUM_LANES-1:0] trace_read_valid, output [`DATA_WIDTH*NUM_LANES-1:0] trace_read_address, - output [NUM_LANES-1:0] trace_read_is_store, - output [NUM_LANES*`MASK_WIDTH-1:0] trace_read_store_mask, + output [NUM_LANES-1:0] trace_read_is_store, + output [`SIZE_WIDTH*NUM_LANES-1:0] trace_read_size, output [`DATA_WIDTH*NUM_LANES-1:0] trace_read_data, - output trace_read_finished + output trace_read_finished ); - bit __in_valid[NUM_LANES-1:0]; - longint __in_address[NUM_LANES-1:0]; + bit __in_valid [NUM_LANES-1:0]; + longint __in_address [NUM_LANES-1:0]; - bit __in_is_store[NUM_LANES-1:0]; - logic [`MASK_WIDTH-1:0] __in_store_mask [NUM_LANES-1:0]; - longint __in_data[NUM_LANES-1:0]; + bit __in_is_store [NUM_LANES-1:0]; + int __in_size [NUM_LANES-1:0]; + longint __in_data [NUM_LANES-1:0]; bit __in_finished; string __uartlog; @@ -54,13 +54,13 @@ module SimMemTrace #(parameter FILENAME = "undefined", NUM_LANES = 4) ( assign next_cycle_counter = cycle_counter + 1'b1; // registers that stage outputs of the C parser - reg [NUM_LANES-1:0] __in_valid_reg; + reg [NUM_LANES-1:0] __in_valid_reg; reg [`DATA_WIDTH-1:0] __in_address_reg [NUM_LANES-1:0]; - reg [NUM_LANES-1:0] __in_is_store_reg; - reg [`MASK_WIDTH-1:0] __in_store_mask_reg [NUM_LANES-1:0]; + reg [NUM_LANES-1:0] __in_is_store_reg; + int __in_size_reg [NUM_LANES-1:0]; reg [`DATA_WIDTH-1:0] __in_data_reg [NUM_LANES-1:0]; - reg __in_finished_reg; + reg __in_finished_reg; genvar g; @@ -70,7 +70,7 @@ module SimMemTrace #(parameter FILENAME = "undefined", NUM_LANES = 4) ( assign trace_read_address[`DATA_WIDTH*(g+1)-1:`DATA_WIDTH*g] = __in_address_reg[g]; assign trace_read_is_store[g] = __in_is_store_reg[g]; - assign trace_read_store_mask[`MASK_WIDTH*(g+1)-1:`MASK_WIDTH*g] = __in_store_mask_reg[g]; + assign trace_read_size[`SIZE_WIDTH*(g+1)-1:`SIZE_WIDTH*g] = __in_size_reg[g]; assign 
trace_read_data[`DATA_WIDTH*(g+1)-1:`DATA_WIDTH*g] = __in_data_reg[g]; end endgenerate @@ -83,15 +83,13 @@ module SimMemTrace #(parameter FILENAME = "undefined", NUM_LANES = 4) ( // Evaluate the signals on the positive edge always @(posedge clock) begin - - // Setting reset value if (reset) begin for (integer tid = 0; tid < NUM_LANES; tid = tid + 1) begin __in_valid[tid] = 1'b0; __in_address[tid] = `DATA_WIDTH'b0; __in_is_store[tid] = 1'b0; - __in_store_mask[tid] = `MASK_WIDTH'b0; + __in_size[tid] = `SIZE_WIDTH'b0; __in_data[tid] = `DATA_WIDTH'b0; end @@ -105,7 +103,7 @@ module SimMemTrace #(parameter FILENAME = "undefined", NUM_LANES = 4) ( __in_address_reg[tid] <= `DATA_WIDTH'b0; __in_is_store_reg[tid] = 1'b0; - __in_store_mask_reg[tid] = `MASK_WIDTH'b0; + __in_size_reg[tid] = `SIZE_WIDTH'b0; __in_data_reg[tid] = `DATA_WIDTH'b0; end @@ -127,7 +125,7 @@ module SimMemTrace #(parameter FILENAME = "undefined", NUM_LANES = 4) ( __in_address[tid], __in_is_store[tid], - __in_store_mask[tid], + __in_size[tid], __in_data[tid], __in_finished @@ -140,7 +138,7 @@ module SimMemTrace #(parameter FILENAME = "undefined", NUM_LANES = 4) ( __in_address_reg[tid] <= __in_address[tid]; __in_is_store_reg[tid] <= __in_is_store[tid]; - __in_store_mask_reg[tid] <= __in_store_mask[tid]; + __in_size_reg[tid] <= __in_size[tid]; __in_data_reg[tid] <= __in_data[tid]; end __in_finished_reg <= __in_finished; diff --git a/src/main/scala/tilelink/Coalescing.scala b/src/main/scala/tilelink/Coalescing.scala index 37f76b1..bea19ee 100644 --- a/src/main/scala/tilelink/Coalescing.scala +++ b/src/main/scala/tilelink/Coalescing.scala @@ -587,7 +587,6 @@ class CoalShiftQueue[T <: Data]( class MemTraceDriver(numLanes: Int = 4, filename: String = "vecadd.core1.thread4.trace")(implicit p: Parameters ) extends LazyModule { - // Create N client nodes together val laneNodes = Seq.tabulate(numLanes) { i => val clientParam = Seq( @@ -612,7 +611,7 @@ class TraceReq extends Bundle { val valid = Bool() val address 
= UInt(64.W) val is_store = Bool() - val mask = UInt(8.W) + val size = UInt(32.W) val data = UInt(64.W) } @@ -634,7 +633,8 @@ class MemTraceDriverImp(outer: MemTraceDriver, numLanes: Int, traceFile: String) req.valid := sim.io.trace_read.valid(i) req.address := sim.io.trace_read.address(64 * i + 63, 64 * i) req.is_store := sim.io.trace_read.is_store(i) - req.mask := sim.io.trace_read.store_mask(8 * i + 7, 8 * i) + req.size := sim.io.trace_read.size(32 * i + 31, 32 * i) + printf("========= req.size=%d\n", req.size) req.data := sim.io.trace_read.data(64 * i + 63, 64 * i) } @@ -655,16 +655,17 @@ class MemTraceDriverImp(outer: MemTraceDriver, numLanes: Int, traceFile: String) (outer.laneNodes zip laneReqs).foreach { case (node, req) => val (tlOut, edge) = node.out(0) + val size = 4.U // TODO: get proper size from the trace val (plegal, pbits) = edge.Put( fromSource = sourceIdCounter, toAddress = hashToValidPhyAddr(req.address), - lgSize = 3.U, + lgSize = Log2(size), data = req.data ) val (glegal, gbits) = edge.Get( fromSource = sourceIdCounter, toAddress = hashToValidPhyAddr(req.address), - lgSize = 3.U + lgSize = Log2(size), ) val legal = Mux(req.is_store, plegal, glegal) val bits = Mux(req.is_store, pbits, gbits) @@ -677,6 +678,8 @@ class MemTraceDriverImp(outer: MemTraceDriver, numLanes: Int, traceFile: String) tlOut.d.ready := true.B tlOut.e.valid := false.B + println(s"======= MemTraceDriver: TL data width: ${tlOut.params.dataBits}") + dontTouch(tlOut.a) dontTouch(tlOut.d) } @@ -714,7 +717,7 @@ class SimMemTrace(filename: String, numLanes: Int) // TODO: assumes 64-bit address. 
val address = Output(UInt((64 * numLanes).W)) val is_store = Output(UInt(numLanes.W)) - val store_mask = Output(UInt((8 * numLanes).W)) + val size = Output(UInt((32 * numLanes).W)) val data = Output(UInt((64 * numLanes).W)) val finished = Output(Bool()) } @@ -762,7 +765,8 @@ class MemTraceLogger(numLanes: Int = 4, filename: String = "vecadd.core1.thread4 req.address := tlIn.a.bits.address req.data := tlIn.a.bits.data req.is_store := false.B // FIXME: take is_store from TL - req.mask := tlIn.a.bits.mask + req.size := tlIn.a.bits.size + printf("========= logger: req.size=%d\n", tlIn.a.bits.size) } val laneValid = Wire(Vec(numLanes, Bool())) @@ -794,7 +798,7 @@ class SimMemTraceLogger(filename: String, numLanes: Int) // val ready = Output(Bool()) // TODO: assumes 64-bit address. // val is_store = Output(UInt(numLanes.W)) - // val store_mask = Output(UInt((8 * numLanes).W)) + // val size = Output(UInt((8 * numLanes).W)) // val data = Output(UInt((64 * numLanes).W)) // val finished = Output(Bool()) } @@ -805,7 +809,7 @@ class SimMemTraceLogger(filename: String, numLanes: Int) addResource("/csrc/SimMemTraceLogger.h") } -// synthesizable unit tests +// Synthesizable unit tests // tracedriver --> coalescer --> tracelogger --> tlram class TLRAMCoalescerLogger(implicit p: Parameters) extends LazyModule { @@ -816,7 +820,9 @@ class TLRAMCoalescerLogger(implicit p: Parameters) extends LazyModule { val logger = LazyModule(new MemTraceLogger(numLanes)) val rams = Seq.fill(numLanes)( // +1 for coalesced edge LazyModule( - // FIXME: properly propagate beatBytes? + // NOTE: beatBytes here sets the data bitwidth of the upstream TileLink + // edges globally, by way of Diplomacy communicating the TL slave + // parameters to the upstream nodes. 
new TLRAM(address = AddressSet(0x0000, 0xffffff), beatBytes = 8) ) ) @@ -847,7 +853,9 @@ class TLRAMCoalescer(implicit p: Parameters) extends LazyModule { val driver = LazyModule(new MemTraceDriver(numLanes)) val rams = Seq.fill(numLanes + 1)( // +1 for coalesced edge LazyModule( - // FIXME: properly propagate beatBytes? + // NOTE: beatBytes here sets the data bitwidth of the upstream TileLink + // edges globally, by way of Diplomacy communicating the TL slave + // parameters to the upstream nodes. new TLRAM(address = AddressSet(0x0000, 0xffffff), beatBytes = 8) ) ) From 1057ed59d35637f096b6e3411d485567954546e6 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Tue, 11 Apr 2023 18:23:50 -0700 Subject: [PATCH 09/37] Parse log2(size) from trace; set is_store from TL opcode --- src/main/resources/csrc/SimMemTrace.cc | 15 +++++++++--- src/main/resources/csrc/SimMemTrace.h | 2 +- src/main/resources/csrc/SimMemTraceLogger.cc | 1 + src/main/resources/vsrc/SimMemTrace.v | 24 ++++++++++---------- src/main/scala/tilelink/Coalescing.scala | 20 ++++++++++++---- 5 files changed, 42 insertions(+), 20 deletions(-) diff --git a/src/main/resources/csrc/SimMemTrace.cc b/src/main/resources/csrc/SimMemTrace.cc index 0e3274b..8a636a7 100644 --- a/src/main/resources/csrc/SimMemTrace.cc +++ b/src/main/resources/csrc/SimMemTrace.cc @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include "SimMemTrace.h" @@ -34,10 +35,18 @@ void MemTraceReader::parse() { printf("MemTraceReader: started parsing\n"); + long size = 0; while (infile >> line.cycle >> line.loadstore >> line.core_id >> line.lane_id >> std::hex >> line.address >> line.data >> std::dec >> - line.data_size) { + size) { line.valid = true; + + assert(size > 0 && "invalid size in trace"); + int lgsize = static_cast(log2(size)); + assert((size & ~(~0lu << lgsize)) == 0 && + "non-power-of-2 size detected in trace"); + line.log_data_size = lgsize; + trace.push_back(line); } read_pos = trace.cbegin(); @@ -76,7 +85,7 @@ 
MemTraceLine MemTraceReader::read_trace_at(const long cycle, return MemTraceLine{}; } else if (line.cycle == cycle && line.lane_id == lane_id) { printf("fire! cycle=%ld, valid=%d, %s addr=%lx, size=%d \n", cycle, line.valid, - line.loadstore, line.address, line.data_size); + line.loadstore, line.address, line.log_data_size); // FIXME! Currently lane_id is assumed to be in round-robin order, e.g. // 0->1->2->3->0->..., both in the trace file and the order the caller calls @@ -137,7 +146,7 @@ extern "C" void memtrace_query(unsigned char trace_read_ready, *trace_read_valid = line.valid; *trace_read_address = line.address; *trace_read_is_store = (strcmp(line.loadstore, "STORE") == 0); - *trace_read_size = line.data_size; + *trace_read_size = line.log_data_size; *trace_read_data = line.data; // This means finished and valid will go up at the same cycle. Need to // handle this without skipping the last line. diff --git a/src/main/resources/csrc/SimMemTrace.h b/src/main/resources/csrc/SimMemTrace.h index 033fc7e..14045b5 100644 --- a/src/main/resources/csrc/SimMemTrace.h +++ b/src/main/resources/csrc/SimMemTrace.h @@ -15,7 +15,7 @@ struct MemTraceLine { int lane_id = 0; unsigned long address = 0; unsigned long data = 0; - int data_size = 0; + int log_data_size = 0; }; class MemTraceReader { diff --git a/src/main/resources/csrc/SimMemTraceLogger.cc b/src/main/resources/csrc/SimMemTraceLogger.cc index 1fce4bd..db7f920 100644 --- a/src/main/resources/csrc/SimMemTraceLogger.cc +++ b/src/main/resources/csrc/SimMemTraceLogger.cc @@ -6,6 +6,7 @@ #include #include #include +#include #include #include "SimMemTraceLogger.h" diff --git a/src/main/resources/vsrc/SimMemTrace.v b/src/main/resources/vsrc/SimMemTrace.v index b18fcab..ab189e8 100644 --- a/src/main/resources/vsrc/SimMemTrace.v +++ b/src/main/resources/vsrc/SimMemTrace.v @@ -1,6 +1,6 @@ `define DATA_WIDTH 64 `define MAX_NUM_LANES 32 -`define SIZE_WIDTH 32 +`define LOGSIZE_WIDTH 32 import "DPI-C" function void memtrace_init( 
input string filename @@ -27,15 +27,15 @@ module SimMemTrace #(parameter FILENAME = "undefined", NUM_LANES = 4) ( input clock, input reset, - // These have to match the IO port of the Chisel wrapper module. - input trace_read_ready, - output [NUM_LANES-1:0] trace_read_valid, - output [`DATA_WIDTH*NUM_LANES-1:0] trace_read_address, + // These have to match the IO port name of the Chisel wrapper module. + input trace_read_ready, + output [NUM_LANES-1:0] trace_read_valid, + output [`DATA_WIDTH*NUM_LANES-1:0] trace_read_address, - output [NUM_LANES-1:0] trace_read_is_store, - output [`SIZE_WIDTH*NUM_LANES-1:0] trace_read_size, - output [`DATA_WIDTH*NUM_LANES-1:0] trace_read_data, - output trace_read_finished + output [NUM_LANES-1:0] trace_read_is_store, + output [`LOGSIZE_WIDTH*NUM_LANES-1:0] trace_read_size, + output [`DATA_WIDTH*NUM_LANES-1:0] trace_read_data, + output trace_read_finished ); bit __in_valid [NUM_LANES-1:0]; longint __in_address [NUM_LANES-1:0]; @@ -70,7 +70,7 @@ module SimMemTrace #(parameter FILENAME = "undefined", NUM_LANES = 4) ( assign trace_read_address[`DATA_WIDTH*(g+1)-1:`DATA_WIDTH*g] = __in_address_reg[g]; assign trace_read_is_store[g] = __in_is_store_reg[g]; - assign trace_read_size[`SIZE_WIDTH*(g+1)-1:`SIZE_WIDTH*g] = __in_size_reg[g]; + assign trace_read_size[`LOGSIZE_WIDTH*(g+1)-1:`LOGSIZE_WIDTH*g] = __in_size_reg[g]; assign trace_read_data[`DATA_WIDTH*(g+1)-1:`DATA_WIDTH*g] = __in_data_reg[g]; end endgenerate @@ -89,7 +89,7 @@ module SimMemTrace #(parameter FILENAME = "undefined", NUM_LANES = 4) ( __in_address[tid] = `DATA_WIDTH'b0; __in_is_store[tid] = 1'b0; - __in_size[tid] = `SIZE_WIDTH'b0; + __in_size[tid] = `LOGSIZE_WIDTH'b0; __in_data[tid] = `DATA_WIDTH'b0; end @@ -103,7 +103,7 @@ module SimMemTrace #(parameter FILENAME = "undefined", NUM_LANES = 4) ( __in_address_reg[tid] <= `DATA_WIDTH'b0; __in_is_store_reg[tid] = 1'b0; - __in_size_reg[tid] = `SIZE_WIDTH'b0; + __in_size_reg[tid] = `LOGSIZE_WIDTH'b0; __in_data_reg[tid] = 
`DATA_WIDTH'b0; end diff --git a/src/main/scala/tilelink/Coalescing.scala b/src/main/scala/tilelink/Coalescing.scala index bea19ee..98be3da 100644 --- a/src/main/scala/tilelink/Coalescing.scala +++ b/src/main/scala/tilelink/Coalescing.scala @@ -611,7 +611,7 @@ class TraceReq extends Bundle { val valid = Bool() val address = UInt(64.W) val is_store = Bool() - val size = UInt(32.W) + val size = UInt(32.W) // this is log2(bytesize) as in TL A bundle val data = UInt(64.W) } @@ -634,7 +634,6 @@ class MemTraceDriverImp(outer: MemTraceDriver, numLanes: Int, traceFile: String) req.address := sim.io.trace_read.address(64 * i + 63, 64 * i) req.is_store := sim.io.trace_read.is_store(i) req.size := sim.io.trace_read.size(32 * i + 31, 32 * i) - printf("========= req.size=%d\n", req.size) req.data := sim.io.trace_read.data(64 * i + 63, 64 * i) } @@ -761,12 +760,25 @@ class MemTraceLogger(numLanes: Int = 4, filename: String = "vecadd.core1.thread4 tlOut.a <> tlIn.a tlIn.d <> tlOut.d + // requests on TL A channel req.valid := tlIn.a.valid req.address := tlIn.a.bits.address req.data := tlIn.a.bits.data - req.is_store := false.B // FIXME: take is_store from TL + req.is_store := false.B + when (tlIn.a.bits.opcode === 0.U || tlIn.a.bits.opcode === 1.U) { + // 0: PutFullData, 1: PutPartialData + req.is_store := true.B + }.elsewhen(tlIn.a.bits.opcode === 4.U) { + // 4: Get + req.is_store := false.B + }.elsewhen(true.B) { + // that's all I know + assert(false.B, "unhandled TL opcode found in MemTraceLogger") + } req.size := tlIn.a.bits.size - printf("========= logger: req.size=%d\n", tlIn.a.bits.size) + + // responses on TL D channel + // TODO } val laneValid = Wire(Vec(numLanes, Bool())) From 8e763b512a7dad4b264deaa231b467dd892a227b Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Wed, 12 Apr 2023 13:54:59 -0700 Subject: [PATCH 10/37] Relay full trace line info to DPI --- src/main/resources/csrc/SimMemTraceLogger.cc | 10 ++-- src/main/resources/vsrc/SimMemTrace.v | 2 +- 
src/main/scala/tilelink/Coalescing.scala | 49 ++++++++++++++------ 3 files changed, 44 insertions(+), 17 deletions(-) diff --git a/src/main/resources/csrc/SimMemTraceLogger.cc b/src/main/resources/csrc/SimMemTraceLogger.cc index db7f920..b685e83 100644 --- a/src/main/resources/csrc/SimMemTraceLogger.cc +++ b/src/main/resources/csrc/SimMemTraceLogger.cc @@ -124,14 +124,18 @@ extern "C" void memtracelogger_init(const char *filename) { extern "C" void memtracelogger_log(unsigned char trace_log_valid, unsigned long trace_log_cycle, unsigned long trace_log_address, - unsigned int trace_log_lane_id, + int trace_log_lane_id, + unsigned char trace_log_is_store, + int trace_log_size, + unsigned long trace_log_data, unsigned char *trace_log_ready) { // printf("memtrace_query(cycle=%ld, tid=%d)\n", trace_read_cycle, // trace_read_lane_id); *trace_log_ready = 1; if (trace_log_valid) { - printf("%s: [%lu] valid: address=0x%lx, tid=%u\n", __func__, - trace_log_cycle, trace_log_address, trace_log_lane_id); + printf("%s: [%lu] valid: address=%lx, tid=%u, size=%d\n", __func__, + trace_log_cycle, trace_log_address, trace_log_lane_id, + trace_log_size); } } diff --git a/src/main/resources/vsrc/SimMemTrace.v b/src/main/resources/vsrc/SimMemTrace.v index ab189e8..a805130 100644 --- a/src/main/resources/vsrc/SimMemTrace.v +++ b/src/main/resources/vsrc/SimMemTrace.v @@ -1,3 +1,4 @@ +// FIXME hardcoded `define DATA_WIDTH 64 `define MAX_NUM_LANES 32 `define LOGSIZE_WIDTH 32 @@ -81,7 +82,6 @@ module SimMemTrace #(parameter FILENAME = "undefined", NUM_LANES = 4) ( memtrace_init(FILENAME); end - // Evaluate the signals on the positive edge always @(posedge clock) begin if (reset) begin for (integer tid = 0; tid < NUM_LANES; tid = tid + 1) begin diff --git a/src/main/scala/tilelink/Coalescing.scala b/src/main/scala/tilelink/Coalescing.scala index 98be3da..d6fb6ff 100644 --- a/src/main/scala/tilelink/Coalescing.scala +++ b/src/main/scala/tilelink/Coalescing.scala @@ -607,6 +607,8 @@ class 
MemTraceDriver(numLanes: Int = 4, filename: String = "vecadd.core1.thread4 lazy val module = new MemTraceDriverImp(this, numLanes, filename) } +// TODO: this is replicated in sim.io.trace_read and sim.io.trace_log; make it +// into a trait class TraceReq extends Bundle { val valid = Bool() val address = UInt(64.W) @@ -664,7 +666,7 @@ class MemTraceDriverImp(outer: MemTraceDriver, numLanes: Int, traceFile: String) val (glegal, gbits) = edge.Get( fromSource = sourceIdCounter, toAddress = hashToValidPhyAddr(req.address), - lgSize = Log2(size), + lgSize = Log2(size) ) val legal = Mux(req.is_store, plegal, glegal) val bits = Mux(req.is_store, pbits, gbits) @@ -755,6 +757,11 @@ class MemTraceLogger(numLanes: Int = 4, filename: String = "vecadd.core1.thread4 val laneReqs = Wire(Vec(numLanes, new TraceReq)) + assert( + numLanes == node.in.length, + "`numLanes` does not match the number of TL edges connected to the MemTraceLogger" + ) + // snoop on the TileLink edges to log traffic ((node.in zip node.out) zip laneReqs).foreach { case (((tlIn, _), (tlOut, _)), req) => tlOut.a <> tlIn.a @@ -765,7 +772,7 @@ class MemTraceLogger(numLanes: Int = 4, filename: String = "vecadd.core1.thread4 req.address := tlIn.a.bits.address req.data := tlIn.a.bits.data req.is_store := false.B - when (tlIn.a.bits.opcode === 0.U || tlIn.a.bits.opcode === 1.U) { + when(tlIn.a.bits.opcode === 0.U || tlIn.a.bits.opcode === 1.U) { // 0: PutFullData, 1: PutPartialData req.is_store := true.B }.elsewhen(tlIn.a.bits.opcode === 4.U) { @@ -777,19 +784,36 @@ class MemTraceLogger(numLanes: Int = 4, filename: String = "vecadd.core1.thread4 } req.size := tlIn.a.bits.size - // responses on TL D channel - // TODO + when(req.valid) { + printf("======== MemTraceLogger: req.size=%d\n", req.size) + } + + // responses on TL D channel + // TODO } + // clunky workaround of the fact that Chisel doesn't allow partial + // assignment to a bitfield range of a wide signal. 
val laneValid = Wire(Vec(numLanes, Bool())) - val laneAddress = Wire(Vec(numLanes, UInt(64.W))) // FIXME: hardcoded + val laneAddress = Wire(Vec(numLanes, chiselTypeOf(laneReqs(0).address))) + val laneIsStore = Wire(Vec(numLanes, chiselTypeOf(laneReqs(0).is_store))) + val laneSize = Wire(Vec(numLanes, chiselTypeOf(laneReqs(0).size))) + val laneData = Wire(Vec(numLanes, chiselTypeOf(laneReqs(0).data))) laneReqs.zipWithIndex.foreach { case (req, i) => laneValid(i) := req.valid laneAddress(i) := req.address + laneIsStore(i) := req.is_store + laneSize(i) := req.size + laneData(i) := req.data } // flatten per-lane signals to the Verilog blackbox input sim.io.trace_log.valid := laneValid.asUInt sim.io.trace_log.address := laneAddress.asUInt + sim.io.trace_log.is_store := laneIsStore.asUInt + sim.io.trace_log.size := laneSize.asUInt + sim.io.trace_log.data := laneData.asUInt + + assert(sim.io.trace_log.ready === true.B, "MemTraceLogger is expected to be always ready") } } @@ -802,17 +826,16 @@ class SimMemTraceLogger(filename: String, numLanes: Int) val clock = Input(Clock()) val reset = Input(Bool()) - // Chisel can't interface with Verilog 2D port, so flatten all lanes into - // single wide 1D array. val trace_log = new Bundle { val valid = Input(UInt(numLanes.W)) - val address = Input(UInt((64 * numLanes).W)) - // val ready = Output(Bool()) + // Chisel can't interface with Verilog 2D port, so flatten all lanes into + // single wide 1D array. // TODO: assumes 64-bit address. 
- // val is_store = Output(UInt(numLanes.W)) - // val size = Output(UInt((8 * numLanes).W)) - // val data = Output(UInt((64 * numLanes).W)) - // val finished = Output(Bool()) + val address = Input(UInt((64 * numLanes).W)) + val is_store = Input(UInt(numLanes.W)) + val size = Input(UInt((32 * numLanes).W)) + val data = Input(UInt((64 * numLanes).W)) + val ready = Output(Bool()) } }) From 57874a564e0e06d3273551f25e0438bfec897566 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Wed, 12 Apr 2023 14:23:17 -0700 Subject: [PATCH 11/37] Set TL size according to trace from driver Mask will be set accordingly by the TL generator methods (Get/Put). --- src/main/scala/tilelink/Coalescing.scala | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/src/main/scala/tilelink/Coalescing.scala b/src/main/scala/tilelink/Coalescing.scala index d6fb6ff..32a7e0c 100644 --- a/src/main/scala/tilelink/Coalescing.scala +++ b/src/main/scala/tilelink/Coalescing.scala @@ -656,21 +656,29 @@ class MemTraceDriverImp(outer: MemTraceDriver, numLanes: Int, traceFile: String) (outer.laneNodes zip laneReqs).foreach { case (node, req) => val (tlOut, edge) = node.out(0) - val size = 4.U // TODO: get proper size from the trace val (plegal, pbits) = edge.Put( fromSource = sourceIdCounter, toAddress = hashToValidPhyAddr(req.address), - lgSize = Log2(size), + lgSize = req.size, // trace line already holds log2(size) data = req.data ) val (glegal, gbits) = edge.Get( fromSource = sourceIdCounter, toAddress = hashToValidPhyAddr(req.address), - lgSize = Log2(size) + lgSize = req.size ) val legal = Mux(req.is_store, plegal, glegal) val bits = Mux(req.is_store, pbits, gbits) + when(tlOut.a.valid) { + printf( + "Get(): addr=%x, size=%x, mask=%x\n", + tlOut.a.bits.address, + tlOut.a.bits.size, + tlOut.a.bits.mask + ); + } + assert(legal, "illegal TL req gen") tlOut.a.valid := req.valid tlOut.a.bits := bits @@ -772,8 +780,8 @@ class MemTraceLogger(numLanes: Int = 4, filename: 
String = "vecadd.core1.thread4 req.address := tlIn.a.bits.address req.data := tlIn.a.bits.data req.is_store := false.B - when(tlIn.a.bits.opcode === 0.U || tlIn.a.bits.opcode === 1.U) { - // 0: PutFullData, 1: PutPartialData + when(tlIn.a.bits.opcode === 0.U) { + // 0: PutFullData, 1: PutPartialData but we don't support it req.is_store := true.B }.elsewhen(tlIn.a.bits.opcode === 4.U) { // 4: Get @@ -784,10 +792,6 @@ class MemTraceLogger(numLanes: Int = 4, filename: String = "vecadd.core1.thread4 } req.size := tlIn.a.bits.size - when(req.valid) { - printf("======== MemTraceLogger: req.size=%d\n", req.size) - } - // responses on TL D channel // TODO } From 8e6a5f4bce3bdd32ea412dc4ca21c417879e7cfe Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Mon, 17 Apr 2023 15:30:30 -0700 Subject: [PATCH 12/37] Fix address & data handling in memtrace driver and logger * TileLink doesn't alter the `address` field from what we originally used in the Get/Put call. * Same goes for the `data` field. * The only thing TL generates by itself is `mask`. This means we have to align data to the beatBytes boundary ourselves when Putting, and also taking the right sublanes using the mask when Getting. --- src/main/scala/tilelink/Coalescing.scala | 71 ++++++++++++++++++------ 1 file changed, 54 insertions(+), 17 deletions(-) diff --git a/src/main/scala/tilelink/Coalescing.scala b/src/main/scala/tilelink/Coalescing.scala index 32a7e0c..15271ea 100644 --- a/src/main/scala/tilelink/Coalescing.scala +++ b/src/main/scala/tilelink/Coalescing.scala @@ -649,7 +649,7 @@ class MemTraceDriverImp(outer: MemTraceDriver, numLanes: Int, traceFile: String) // In default setting, all mem-req for program data must be within // 0X80000000 -> 0X90000000 def hashToValidPhyAddr(addr: UInt): UInt = { - Cat(8.U(4.W), addr(27, 3), 0.U(3.W)) + Cat(8.U(4.W), addr(27, 0)) } // Generate TL requests according to the trace line. 
@@ -660,7 +660,8 @@ class MemTraceDriverImp(outer: MemTraceDriver, numLanes: Int, traceFile: String) fromSource = sourceIdCounter, toAddress = hashToValidPhyAddr(req.address), lgSize = req.size, // trace line already holds log2(size) - data = req.data + // Need to construct data that is correctly aligned to beatBytes + data = (req.data << (8.U * (req.address % edge.manager.beatBytes.U))) ) val (glegal, gbits) = edge.Get( fromSource = sourceIdCounter, @@ -672,10 +673,13 @@ class MemTraceDriverImp(outer: MemTraceDriver, numLanes: Int, traceFile: String) when(tlOut.a.valid) { printf( - "Get(): addr=%x, size=%x, mask=%x\n", + "MemTraceDriver: TL addr=%x, size=%d, mask=%x, store=%d, tlData=%x, reqData=%x\n", tlOut.a.bits.address, tlOut.a.bits.size, - tlOut.a.bits.mask + tlOut.a.bits.mask, + req.is_store, + tlOut.a.bits.data, + req.data ); } @@ -757,6 +761,12 @@ class MemTraceLogger(numLanes: Int = 4, filename: String = "vecadd.core1.thread4 // ) // }) + // Copied from freechips.rocketchip.trailingZeros which only supports Scala + // integers + def trailingZeros(x: UInt): UInt = { + Mux(x === 0.U, x.widthOption.get.U, Log2(x & -x)) + } + lazy val module = new Impl class Impl extends LazyModuleImp(this) { val sim = Module(new SimMemTraceLogger(filename, numLanes)) @@ -776,21 +786,48 @@ class MemTraceLogger(numLanes: Int = 4, filename: String = "vecadd.core1.thread4 tlIn.d <> tlOut.d // requests on TL A channel + // req.valid := tlIn.a.valid - req.address := tlIn.a.bits.address - req.data := tlIn.a.bits.data - req.is_store := false.B - when(tlIn.a.bits.opcode === 0.U) { - // 0: PutFullData, 1: PutPartialData but we don't support it - req.is_store := true.B - }.elsewhen(tlIn.a.bits.opcode === 4.U) { - // 4: Get - req.is_store := false.B - }.elsewhen(true.B) { - // that's all I know - assert(false.B, "unhandled TL opcode found in MemTraceLogger") - } req.size := tlIn.a.bits.size + def tlOpcodeIsStore(opcode: UInt): Bool = { + // 0: PutFullData, 1: PutPartialData but we don't 
support it + // 4: Get + assert(opcode === 0.U || opcode === 4.U, "unhandled TL opcode found in MemTraceLogger") + tlIn.a.bits.opcode === 0.U + } + req.is_store := tlOpcodeIsStore(tlIn.a.bits.opcode) + // TL always carries the exact unaligned address that the client + // originally requested, so no postprocessing required + req.address := tlIn.a.bits.address + + // TL data + // + // When tlIn.a.bits.size is smaller than the data bus width, need to + // figure out which byte lanes we actually accessed so that + // we can write that to the memory trace. + // See Section 4.5 Byte Lanes in spec 1.8.1 + + // This assert only holds true for PutFullData and not PutPartialData, + // where HIGH bits in the mask may not be contiguous. + assert( + PopCount(tlIn.a.bits.mask) === (1.U << tlIn.a.bits.size), + "mask HIGH bits do not match the TL size. This should have been handled by the TL generator logic" + ) + val trailingZerosInMask = trailingZeros(tlIn.a.bits.mask) + val mask = ~((~0.U) << (trailingZerosInMask * 8.U)) + req.data := mask & (tlIn.a.bits.data >> (trailingZerosInMask * 8.U)) + + when(req.valid) { + printf( + "MemTraceLogger: TL addr=%x, size=%d, mask=%x, store=%d, tlData=%x, reqData=%x\n", + tlIn.a.bits.address, + tlIn.a.bits.size, + tlIn.a.bits.mask, + req.is_store, + tlIn.a.bits.data, + req.data + ) + } // responses on TL D channel // TODO From 1a90ad52ac213bb218a0e52c7937474839ee2c3b Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Mon, 17 Apr 2023 15:50:17 -0700 Subject: [PATCH 13/37] Cleanup tracer debug prints --- src/main/scala/tilelink/Coalescing.scala | 30 ++++++++++++++++++++---- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/src/main/scala/tilelink/Coalescing.scala b/src/main/scala/tilelink/Coalescing.scala index 15271ea..9d0adc1 100644 --- a/src/main/scala/tilelink/Coalescing.scala +++ b/src/main/scala/tilelink/Coalescing.scala @@ -672,15 +672,15 @@ class MemTraceDriverImp(outer: MemTraceDriver, numLanes: Int, traceFile: String) val 
bits = Mux(req.is_store, pbits, gbits) when(tlOut.a.valid) { - printf( - "MemTraceDriver: TL addr=%x, size=%d, mask=%x, store=%d, tlData=%x, reqData=%x\n", + TracePrintf( + "MemTraceDriver", tlOut.a.bits.address, tlOut.a.bits.size, tlOut.a.bits.mask, req.is_store, tlOut.a.bits.data, req.data - ); + ) } assert(legal, "illegal TL req gen") @@ -818,8 +818,8 @@ class MemTraceLogger(numLanes: Int = 4, filename: String = "vecadd.core1.thread4 req.data := mask & (tlIn.a.bits.data >> (trailingZerosInMask * 8.U)) when(req.valid) { - printf( - "MemTraceLogger: TL addr=%x, size=%d, mask=%x, store=%d, tlData=%x, reqData=%x\n", + TracePrintf( + "MemTraceLogger", tlIn.a.bits.address, tlIn.a.bits.size, tlIn.a.bits.mask, @@ -885,6 +885,26 @@ class SimMemTraceLogger(filename: String, numLanes: Int) addResource("/csrc/SimMemTraceLogger.h") } +class TracePrintf {} + +object TracePrintf { + def apply( + printer: String, + address: UInt, + size: UInt, + mask: UInt, + is_store: Bool, + tlData: UInt, + reqData: UInt + ) = { + printf(s"${printer}: TL addr=%x, size=%d, mask=%x, store=%d", address, size, mask, is_store) + when(is_store) { + printf(", tlData=%x, reqData=%x", tlData, reqData) + } + printf("\n") + } +} + // Synthesizable unit tests // tracedriver --> coalescer --> tracelogger --> tlram From 282434eb7d73fdd95ccf0dedbb5b3c23a0928eb3 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Mon, 17 Apr 2023 16:20:27 -0700 Subject: [PATCH 14/37] Basic C++ file IO for trace logger --- src/main/resources/csrc/SimMemTrace.cc | 17 ++- src/main/resources/csrc/SimMemTrace.h | 29 +++-- src/main/resources/csrc/SimMemTraceLogger.cc | 106 +++++-------------- src/main/scala/tilelink/Coalescing.scala | 4 +- 4 files changed, 66 insertions(+), 90 deletions(-) diff --git a/src/main/resources/csrc/SimMemTrace.cc b/src/main/resources/csrc/SimMemTrace.cc index 8a636a7..99bf113 100644 --- a/src/main/resources/csrc/SimMemTrace.cc +++ b/src/main/resources/csrc/SimMemTrace.cc @@ -3,13 +3,16 @@ #include #endif 
#include -#include +#include #include #include #include #include #include "SimMemTrace.h" +// Global singleton instance +static std::unique_ptr reader; + MemTraceReader::MemTraceReader(const std::string &filename) { char cwd[4096]; if (getcwd(cwd, sizeof(cwd))) { @@ -36,11 +39,14 @@ void MemTraceReader::parse() { printf("MemTraceReader: started parsing\n"); long size = 0; - while (infile >> line.cycle >> line.loadstore >> line.core_id >> + std::string loadstore; // FIXME: likely slow + while (infile >> line.cycle >> loadstore >> line.core_id >> line.lane_id >> std::hex >> line.address >> line.data >> std::dec >> size) { line.valid = true; + line.is_store = (loadstore == "STORE"); + assert(size > 0 && "invalid size in trace"); int lgsize = static_cast(log2(size)); assert((size & ~(~0lu << lgsize)) == 0 && @@ -84,8 +90,9 @@ MemTraceLine MemTraceReader::read_trace_at(const long cycle, // read it right now. return MemTraceLine{}; } else if (line.cycle == cycle && line.lane_id == lane_id) { - printf("fire! cycle=%ld, valid=%d, %s addr=%lx, size=%d \n", cycle, line.valid, - line.loadstore, line.address, line.log_data_size); + printf("fire! cycle=%ld, valid=%d, %s addr=%lx, size=%d \n", cycle, + line.valid, (line.is_store ? "STORE" : "LOAD"), line.address, + line.log_data_size); // FIXME! Currently lane_id is assumed to be in round-robin order, e.g. // 0->1->2->3->0->..., both in the trace file and the order the caller calls @@ -145,7 +152,7 @@ extern "C" void memtrace_query(unsigned char trace_read_ready, auto line = reader->read_trace_at(trace_read_cycle, trace_read_lane_id); *trace_read_valid = line.valid; *trace_read_address = line.address; - *trace_read_is_store = (strcmp(line.loadstore, "STORE") == 0); + *trace_read_is_store = line.is_store; *trace_read_size = line.log_data_size; *trace_read_data = line.data; // This means finished and valid will go up at the same cycle. 
Need to diff --git a/src/main/resources/csrc/SimMemTrace.h b/src/main/resources/csrc/SimMemTrace.h index 14045b5..b45fea4 100644 --- a/src/main/resources/csrc/SimMemTrace.h +++ b/src/main/resources/csrc/SimMemTrace.h @@ -2,15 +2,10 @@ #include #include -class MemTraceReader; - -// Global singleton instance of MemTraceReader -static std::unique_ptr reader; - struct MemTraceLine { bool valid = false; long cycle = 0; - char loadstore[10]; + bool is_store = 0; int core_id = 0; int lane_id = 0; unsigned long address = 0; @@ -31,6 +26,19 @@ public: std::vector::const_iterator read_pos; }; +class MemTraceWriter { +public: + MemTraceWriter(const std::string &filename); + ~MemTraceWriter(); + // void parse(); + void write_trace_at(const MemTraceLine line); + // bool finished() const { return read_pos == trace.cend(); } + + FILE *outfile; + // std::vector trace; + // std::vector::const_iterator read_pos; +}; + extern "C" void memtrace_init(const char *filename); extern "C" void memtrace_query(unsigned char trace_read_ready, unsigned long trace_read_cycle, @@ -41,3 +49,12 @@ extern "C" void memtrace_query(unsigned char trace_read_ready, int *trace_read_size, unsigned long *trace_read_data, unsigned char *trace_read_finished); +extern "C" void memtracelogger_init(const char *filename); +extern "C" void memtracelogger_log(unsigned char trace_log_valid, + unsigned long trace_log_cycle, + unsigned long trace_log_address, + int trace_log_lane_id, + unsigned char trace_log_is_store, + int trace_log_size, + unsigned long trace_log_data, + unsigned char *trace_log_ready); diff --git a/src/main/resources/csrc/SimMemTraceLogger.cc b/src/main/resources/csrc/SimMemTraceLogger.cc index b685e83..52de1f7 100644 --- a/src/main/resources/csrc/SimMemTraceLogger.cc +++ b/src/main/resources/csrc/SimMemTraceLogger.cc @@ -8,95 +8,34 @@ #include #include #include -#include "SimMemTraceLogger.h" +#include "SimMemTrace.h" // Global singleton instance -static std::unique_ptr logger; +static 
std::unique_ptr logger; -MemTraceLogger::MemTraceLogger(const std::string &filename) { +MemTraceWriter::MemTraceWriter(const std::string &filename) { char cwd[4096]; if (getcwd(cwd, sizeof(cwd))) { printf("MemTraceLogger: current working dir: %s\n", cwd); } - infile.open(filename); - if (infile.fail()) { + outfile = fopen(filename.c_str(), "w"); + if (!outfile) { fprintf(stderr, "failed to open file %s\n", filename.c_str()); } } -MemTraceLogger::~MemTraceLogger() { - infile.close(); - printf("MemTraceLogger destroyed\n"); +MemTraceWriter::~MemTraceWriter() { + fclose(outfile); + printf("MemTraceWriter destroyed\n"); } -#if 0 -// Parse trace file in its entirety and store it into internal structure. -// TODO: might block for a long time when the trace gets big, check if need to -// be broken down -void MemTraceLogger::parse() { - MemTraceLine line; +void MemTraceWriter::write_trace_at(const MemTraceLine line) { + printf("tick(): cycle=%ld\n", line.cycle); - printf("MemTraceLogger: started parsing\n"); - - while (infile >> line.cycle >> line.loadstore >> line.core_id >> - line.lane_id >> std::hex >> line.address >> line.data >> std::dec >> - line.data_size) { - line.valid = true; - trace.push_back(line); - } - read_pos = trace.cbegin(); - - printf("MemTraceLogger: finished parsing\n"); + fprintf(outfile, "cycle=%ld\n", line.cycle); } -// Try to read a memory request that might have happened at a given cycle, on a -// given SIMD lane (= "thread"). In case no request happened at that point, -// return an empty line with .valid = false. -MemTraceLine MemTraceLogger::read_trace_at(const long cycle, - const int lane_id) { - MemTraceLine line; - line.valid = false; - - // printf("tick(): cycle=%ld\n", cycle); - - if (finished()) { - return line; - } - - line = *read_pos; - // It should always be guaranteed that we consumed all of the past lines, and - // the next line is in the future. 
- if (line.cycle < cycle) { - // fprintf(stderr, "line.cycle=%ld, cycle=%ld\n", line.cycle, cycle); - assert(false && "some trace lines are left unread in the past"); - } - - if (line.lane_id != lane_id) { - line.valid = false; - } - if (line.cycle > cycle) { - // We haven't reached the cycle mark specified in this line yet, so we don't - // read it right now. - return MemTraceLine{}; - } else if (line.cycle == cycle && line.lane_id == lane_id) { - printf("fire! cycle=%ld, valid=%d, %s addr=%x \n", cycle, line.valid, - line.loadstore, line.address); - - // FIXME! Currently lane_id is assumed to be in round-robin order, e.g. - // 0->1->2->3->0->..., both in the trace file and the order the caller calls - // this function. If this is not true, we cannot simply monotonically - // increment read_pos. - - // Only advance pointer when cycle and threa_id both match - // now increaseing sequence is fine (0, 1, 3), but unordered is not fine (0, 3, 1) - ++read_pos; - } - - return line; -} -#endif - extern "C" void memtracelogger_init(const char *filename) { #ifndef NO_VPI s_vpi_vlog_info info; @@ -117,7 +56,7 @@ extern "C" void memtracelogger_init(const char *filename) { printf("memtrace_init: filename=[%s]\n", filename); - logger = std::make_unique(filename); + logger = std::make_unique(filename); } // TODO: accept core_id as well @@ -133,9 +72,22 @@ extern "C" void memtracelogger_log(unsigned char trace_log_valid, // trace_read_lane_id); *trace_log_ready = 1; - if (trace_log_valid) { - printf("%s: [%lu] valid: address=%lx, tid=%u, size=%d\n", __func__, - trace_log_cycle, trace_log_address, trace_log_lane_id, - trace_log_size); + if (!trace_log_valid) { + return; } + + printf("%s: [%lu] valid: address=%lx, tid=%u, size=%d\n", __func__, + trace_log_cycle, trace_log_address, trace_log_lane_id, + trace_log_size); + + MemTraceLine line{.valid = (trace_log_valid == 1), + .cycle = static_cast(trace_log_cycle), + .is_store = (trace_log_is_store == 1), + .core_id = 0, // TODO 
support multicores + .lane_id = trace_log_lane_id, + .address = trace_log_address, + .data = trace_log_data, + .log_data_size = trace_log_size}; + + logger->write_trace_at(line); } diff --git a/src/main/scala/tilelink/Coalescing.scala b/src/main/scala/tilelink/Coalescing.scala index 9d0adc1..2cfc950 100644 --- a/src/main/scala/tilelink/Coalescing.scala +++ b/src/main/scala/tilelink/Coalescing.scala @@ -741,7 +741,7 @@ class SimMemTrace(filename: String, numLanes: Int) addResource("/csrc/SimMemTrace.h") } -class MemTraceLogger(numLanes: Int = 4, filename: String = "vecadd.core1.thread4.trace")(implicit +class MemTraceLogger(numLanes: Int = 4, filename: String = "vecadd.core1.thread4.out.trace")(implicit p: Parameters ) extends LazyModule { val node = TLIdentityNode() @@ -882,7 +882,7 @@ class SimMemTraceLogger(filename: String, numLanes: Int) addResource("/vsrc/SimMemTraceLogger.v") addResource("/csrc/SimMemTraceLogger.cc") - addResource("/csrc/SimMemTraceLogger.h") + addResource("/csrc/SimMemTrace.h") } class TracePrintf {} From f60602fc34b1c9a82740c14b876ad3d39bfa4c88 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Mon, 17 Apr 2023 16:26:25 -0700 Subject: [PATCH 15/37] Write trace from logger in the same format as driver --- src/main/resources/csrc/SimMemTraceLogger.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/main/resources/csrc/SimMemTraceLogger.cc b/src/main/resources/csrc/SimMemTraceLogger.cc index 52de1f7..d7f2a8d 100644 --- a/src/main/resources/csrc/SimMemTraceLogger.cc +++ b/src/main/resources/csrc/SimMemTraceLogger.cc @@ -33,7 +33,9 @@ MemTraceWriter::~MemTraceWriter() { void MemTraceWriter::write_trace_at(const MemTraceLine line) { printf("tick(): cycle=%ld\n", line.cycle); - fprintf(outfile, "cycle=%ld\n", line.cycle); + fprintf(outfile, "%ld %s %d %d 0x%lx 0x%lx %u\n", line.cycle, + (line.is_store ? 
"STORE" : "LOAD"), line.core_id, line.lane_id, + line.address, line.data, (1u << line.log_data_size)); } extern "C" void memtracelogger_init(const char *filename) { From 8978c2a812cebe4a3b69cf1ec170ca294d57e470 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Mon, 17 Apr 2023 16:51:37 -0700 Subject: [PATCH 16/37] trait HasTraceReq --- src/main/resources/csrc/SimMemTrace.h | 6 +----- src/main/resources/csrc/SimMemTraceLogger.cc | 8 +++----- src/main/scala/tilelink/Coalescing.scala | 16 +++++++++++----- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/src/main/resources/csrc/SimMemTrace.h b/src/main/resources/csrc/SimMemTrace.h index b45fea4..3bdd2d5 100644 --- a/src/main/resources/csrc/SimMemTrace.h +++ b/src/main/resources/csrc/SimMemTrace.h @@ -30,13 +30,9 @@ class MemTraceWriter { public: MemTraceWriter(const std::string &filename); ~MemTraceWriter(); - // void parse(); - void write_trace_at(const MemTraceLine line); - // bool finished() const { return read_pos == trace.cend(); } + void write_line_to_trace(const MemTraceLine line); FILE *outfile; - // std::vector trace; - // std::vector::const_iterator read_pos; }; extern "C" void memtrace_init(const char *filename); diff --git a/src/main/resources/csrc/SimMemTraceLogger.cc b/src/main/resources/csrc/SimMemTraceLogger.cc index d7f2a8d..64e4ce5 100644 --- a/src/main/resources/csrc/SimMemTraceLogger.cc +++ b/src/main/resources/csrc/SimMemTraceLogger.cc @@ -16,7 +16,7 @@ static std::unique_ptr logger; MemTraceWriter::MemTraceWriter(const std::string &filename) { char cwd[4096]; if (getcwd(cwd, sizeof(cwd))) { - printf("MemTraceLogger: current working dir: %s\n", cwd); + printf("MemTraceWriter: current working dir: %s\n", cwd); } outfile = fopen(filename.c_str(), "w"); @@ -30,9 +30,7 @@ MemTraceWriter::~MemTraceWriter() { printf("MemTraceWriter destroyed\n"); } -void MemTraceWriter::write_trace_at(const MemTraceLine line) { - printf("tick(): cycle=%ld\n", line.cycle); - +void 
MemTraceWriter::write_line_to_trace(const MemTraceLine line) { fprintf(outfile, "%ld %s %d %d 0x%lx 0x%lx %u\n", line.cycle, (line.is_store ? "STORE" : "LOAD"), line.core_id, line.lane_id, line.address, line.data, (1u << line.log_data_size)); @@ -91,5 +89,5 @@ extern "C" void memtracelogger_log(unsigned char trace_log_valid, .data = trace_log_data, .log_data_size = trace_log_size}; - logger->write_trace_at(line); + logger->write_line_to_trace(line); } diff --git a/src/main/scala/tilelink/Coalescing.scala b/src/main/scala/tilelink/Coalescing.scala index 2cfc950..161035e 100644 --- a/src/main/scala/tilelink/Coalescing.scala +++ b/src/main/scala/tilelink/Coalescing.scala @@ -607,9 +607,15 @@ class MemTraceDriver(numLanes: Int = 4, filename: String = "vecadd.core1.thread4 lazy val module = new MemTraceDriverImp(this, numLanes, filename) } -// TODO: this is replicated in sim.io.trace_read and sim.io.trace_log; make it -// into a trait -class TraceReq extends Bundle { +trait HasTraceReq { + val valid: UInt + val address: UInt + val is_store: UInt + val size: UInt + val data: UInt +} + +class TraceReq extends Bundle with HasTraceReq { val valid = Bool() val address = UInt(64.W) val is_store = Bool() @@ -722,7 +728,7 @@ class SimMemTrace(filename: String, numLanes: Int) // These names have to match declarations in the Verilog code, eg. // trace_read_address. - val trace_read = new Bundle { + val trace_read = new Bundle with HasTraceReq { val ready = Input(Bool()) val valid = Output(UInt(numLanes.W)) // Chisel can't interface with Verilog 2D port, so flatten all lanes into @@ -867,7 +873,7 @@ class SimMemTraceLogger(filename: String, numLanes: Int) val clock = Input(Clock()) val reset = Input(Bool()) - val trace_log = new Bundle { + val trace_log = new Bundle with HasTraceReq { val valid = Input(UInt(numLanes.W)) // Chisel can't interface with Verilog 2D port, so flatten all lanes into // single wide 1D array. 
From 41d520a9912221cb182a3b22b5a29236b646aa7e Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Mon, 17 Apr 2023 17:59:30 -0700 Subject: [PATCH 17/37] Log both request and response in trace logger Inside DPI code, have a vector of unique_ptrs that act as handles to multiple different trace logger instances. Each logger instance is instantiated in a single instance of the Verilog module, and multiple of these Verilog modules may be instantiated in the Chisel module (see simReq and simResp in MemTraceLogger). --- src/main/resources/csrc/SimMemTrace.h | 8 +- src/main/resources/csrc/SimMemTraceLogger.cc | 60 +++--- src/main/resources/vsrc/SimMemTraceLogger.v | 100 ++++++++++ src/main/scala/tilelink/Coalescing.scala | 196 ++++++++++++------- 4 files changed, 267 insertions(+), 97 deletions(-) create mode 100644 src/main/resources/vsrc/SimMemTraceLogger.v diff --git a/src/main/resources/csrc/SimMemTrace.h b/src/main/resources/csrc/SimMemTrace.h index 3bdd2d5..fef1ad8 100644 --- a/src/main/resources/csrc/SimMemTrace.h +++ b/src/main/resources/csrc/SimMemTrace.h @@ -28,10 +28,11 @@ public: class MemTraceWriter { public: - MemTraceWriter(const std::string &filename); + MemTraceWriter(const bool is_response, const std::string &filename); ~MemTraceWriter(); void write_line_to_trace(const MemTraceLine line); + bool is_response; FILE *outfile; }; @@ -45,8 +46,9 @@ extern "C" void memtrace_query(unsigned char trace_read_ready, int *trace_read_size, unsigned long *trace_read_data, unsigned char *trace_read_finished); -extern "C" void memtracelogger_init(const char *filename); -extern "C" void memtracelogger_log(unsigned char trace_log_valid, +extern "C" int memtracelogger_init(int is_response, const char *filename); +extern "C" void memtracelogger_log(int handle, + unsigned char trace_log_valid, unsigned long trace_log_cycle, unsigned long trace_log_address, int trace_log_lane_id, diff --git a/src/main/resources/csrc/SimMemTraceLogger.cc 
b/src/main/resources/csrc/SimMemTraceLogger.cc index 64e4ce5..c4cfde9 100644 --- a/src/main/resources/csrc/SimMemTraceLogger.cc +++ b/src/main/resources/csrc/SimMemTraceLogger.cc @@ -1,19 +1,23 @@ #ifndef NO_VPI -#include #include +#include #endif -#include -#include -#include -#include -#include -#include #include "SimMemTrace.h" +#include +#include +#include +#include +#include +#include -// Global singleton instance -static std::unique_ptr logger; +// Contains handle for every logger that is instantiated per Verilog module +// instance +static std::vector> loggers; + +MemTraceWriter::MemTraceWriter(const bool is_response, + const std::string &filename) { + this->is_response = is_response; -MemTraceWriter::MemTraceWriter(const std::string &filename) { char cwd[4096]; if (getcwd(cwd, sizeof(cwd))) { printf("MemTraceWriter: current working dir: %s\n", cwd); @@ -36,16 +40,17 @@ void MemTraceWriter::write_line_to_trace(const MemTraceLine line) { line.address, line.data, (1u << line.log_data_size)); } -extern "C" void memtracelogger_init(const char *filename) { +// Returns the "handle" ID for this particular logger instance. 
+extern "C" int memtracelogger_init(int is_response, const char *filename) { #ifndef NO_VPI s_vpi_vlog_info info; if (!vpi_get_vlog_info(&info)) { fprintf(stderr, "fatal: failed to get plusargs from VCS\n"); exit(1); } - const char* TRACEFILENAME_PLUSARG = "+memtracefile="; + const char *TRACEFILENAME_PLUSARG = "+memtracefile="; for (int i = 0; i < info.argc; i++) { - char* input_arg = info.argv[i]; + char *input_arg = info.argv[i]; if (strncmp(input_arg, TRACEFILENAME_PLUSARG, strlen(TRACEFILENAME_PLUSARG)) == 0) { filename = input_arg + strlen(TRACEFILENAME_PLUSARG); @@ -54,20 +59,24 @@ extern "C" void memtracelogger_init(const char *filename) { } #endif - printf("memtrace_init: filename=[%s]\n", filename); + int handle = loggers.size(); + loggers.emplace_back(std::make_unique(is_response, filename)); - logger = std::make_unique(filename); + printf("memtracelogger_init: handle=%d, is_response=%d, filename=[%s]\n", + handle, is_response, filename); + + return handle; } +// This is used to log both TileLink A and D channels. 
// TODO: accept core_id as well -extern "C" void memtracelogger_log(unsigned char trace_log_valid, - unsigned long trace_log_cycle, - unsigned long trace_log_address, - int trace_log_lane_id, - unsigned char trace_log_is_store, - int trace_log_size, - unsigned long trace_log_data, - unsigned char *trace_log_ready) { +extern "C" void +memtracelogger_log(int handle, + unsigned char trace_log_valid, unsigned long trace_log_cycle, + unsigned long trace_log_address, int trace_log_lane_id, + unsigned char trace_log_is_store, int trace_log_size, + unsigned long trace_log_data, + unsigned char *trace_log_ready) { // printf("memtrace_query(cycle=%ld, tid=%d)\n", trace_read_cycle, // trace_read_lane_id); *trace_log_ready = 1; @@ -77,8 +86,7 @@ extern "C" void memtracelogger_log(unsigned char trace_log_valid, } printf("%s: [%lu] valid: address=%lx, tid=%u, size=%d\n", __func__, - trace_log_cycle, trace_log_address, trace_log_lane_id, - trace_log_size); + trace_log_cycle, trace_log_address, trace_log_lane_id, trace_log_size); MemTraceLine line{.valid = (trace_log_valid == 1), .cycle = static_cast(trace_log_cycle), @@ -89,5 +97,7 @@ extern "C" void memtracelogger_log(unsigned char trace_log_valid, .data = trace_log_data, .log_data_size = trace_log_size}; + assert(0 <= handle && handle < loggers.size() && "wrong trace logger handle"); + auto logger = loggers[handle].get(); logger->write_line_to_trace(line); } diff --git a/src/main/resources/vsrc/SimMemTraceLogger.v b/src/main/resources/vsrc/SimMemTraceLogger.v new file mode 100644 index 0000000..8c358d2 --- /dev/null +++ b/src/main/resources/vsrc/SimMemTraceLogger.v @@ -0,0 +1,100 @@ +// FIXME hardcoded +`define DATA_WIDTH 64 +`define MAX_NUM_LANES 32 +`define LOGSIZE_WIDTH 32 + +import "DPI-C" function int memtracelogger_init( + input bit is_response, + input string filename +); + +// Make sure to sync the parameters for: +// (1) import "DPI-C" declaration +// (2) C function declaration +// (3) DPI function calls inside 
initial/always blocks +import "DPI-C" function void memtracelogger_log +( + input int handle, + input bit trace_log_valid, + input longint trace_log_cycle, + input longint trace_log_address, + input int trace_log_tid, + input bit trace_log_is_store, + input int trace_log_size, + input longint trace_log_data, + output bit trace_log_ready +); + +module SimMemTraceLogger #(parameter + IS_RESPONSE = 0, + FILENAME = "undefined", + NUM_LANES = 4) ( + input clock, + input reset, + + // NOTE: LSB is lane 0 + input [NUM_LANES-1:0] trace_log_valid, + input [`DATA_WIDTH*NUM_LANES-1:0] trace_log_address, + input [NUM_LANES-1:0] trace_log_is_store, + input [`LOGSIZE_WIDTH*NUM_LANES-1:0] trace_log_size, + input [`DATA_WIDTH*NUM_LANES-1:0] trace_log_data, + output trace_log_ready +); + int logger_handle; + bit __in_ready; + + // cycle_counter will start off right after reset is deasserted which should + // synchronize itself with SimMemTrace.cycle_counter + reg [`DATA_WIDTH-1:0] cycle_counter; + wire [`DATA_WIDTH-1:0] next_cycle_counter; + assign next_cycle_counter = cycle_counter + 1'b1; + + // wires going into the DPC + wire __valid [NUM_LANES-1:0]; + wire [`DATA_WIDTH-1:0] __address [NUM_LANES-1:0]; + wire __is_store [NUM_LANES-1:0]; + wire [`LOGSIZE_WIDTH-1:0] __size [NUM_LANES-1:0]; + wire [`DATA_WIDTH-1:0] __data [NUM_LANES-1:0]; + + assign trace_log_ready = __in_ready; + + genvar g; + generate + for (g = 0; g < NUM_LANES; g = g + 1) begin + // LSB is lane 0 + assign __valid[g] = trace_log_valid[g]; + assign __address[g] = trace_log_address[`DATA_WIDTH*(g+1)-1:`DATA_WIDTH*g]; + assign __is_store[g] = trace_log_is_store[g]; + assign __size[g] = trace_log_size[`LOGSIZE_WIDTH*(g+1)-1:`LOGSIZE_WIDTH*g]; + assign __data[g] = trace_log_data[`DATA_WIDTH*(g+1)-1:`DATA_WIDTH*g]; + end + endgenerate + + initial begin + /* $value$plusargs("uartlog=%s", __uartlog); */ + logger_handle = memtracelogger_init(IS_RESPONSE, FILENAME); + end + + always @(posedge clock) begin + if (reset) 
begin + __in_ready = 1'b1; + cycle_counter <= `DATA_WIDTH'b0; + end else begin + cycle_counter <= next_cycle_counter; + + for (integer tid = 0; tid < NUM_LANES; tid = tid + 1) begin + memtracelogger_log( + logger_handle, + __valid[tid], + cycle_counter, + __address[tid], + tid, + __is_store[tid], + __size[tid], + __data[tid], + __in_ready + ); + end + end + end +endmodule diff --git a/src/main/scala/tilelink/Coalescing.scala b/src/main/scala/tilelink/Coalescing.scala index 161035e..775cce8 100644 --- a/src/main/scala/tilelink/Coalescing.scala +++ b/src/main/scala/tilelink/Coalescing.scala @@ -747,7 +747,11 @@ class SimMemTrace(filename: String, numLanes: Int) addResource("/csrc/SimMemTrace.h") } -class MemTraceLogger(numLanes: Int = 4, filename: String = "vecadd.core1.thread4.out.trace")(implicit +class MemTraceLogger( + numLanes: Int = 4, + reqFilename: String = "vecadd.core1.thread4.logger.req.trace", + respFilename: String = "vecadd.core1.thread4.logger.resp.trace" +)(implicit p: Parameters ) extends LazyModule { val node = TLIdentityNode() @@ -775,98 +779,152 @@ class MemTraceLogger(numLanes: Int = 4, filename: String = "vecadd.core1.thread4 lazy val module = new Impl class Impl extends LazyModuleImp(this) { - val sim = Module(new SimMemTraceLogger(filename, numLanes)) - sim.io.clock := clock - sim.io.reset := reset.asBool + val simReq = Module(new SimMemTraceLogger(false, reqFilename, numLanes)) + val simResp = Module(new SimMemTraceLogger(true, respFilename, numLanes)) + simReq.io.clock := clock + simReq.io.reset := reset.asBool + simResp.io.clock := clock + simResp.io.reset := reset.asBool val laneReqs = Wire(Vec(numLanes, new TraceReq)) + val laneResps = Wire(Vec(numLanes, new TraceReq)) assert( numLanes == node.in.length, "`numLanes` does not match the number of TL edges connected to the MemTraceLogger" ) + def tlAOpcodeIsStore(opcode: UInt): Bool = { + // 0: PutFullData, 1: PutPartialData but we don't support it + // 4: Get + assert(opcode === 0.U || 
opcode === 4.U, "unhandled TL A opcode found") + opcode === 0.U + } + def tlDOpcodeIsStore(opcode: UInt): Bool = { + // 0: AccessAck (Put), 1: AccessAckData (Get or Atomic) + // See Table 13 of spec 1.8.1 + assert(opcode === 0.U || opcode === 1.U, "unhandled TL D opcode found") + opcode === 0.U + } + // snoop on the TileLink edges to log traffic - ((node.in zip node.out) zip laneReqs).foreach { case (((tlIn, _), (tlOut, _)), req) => - tlOut.a <> tlIn.a - tlIn.d <> tlOut.d + ((node.in zip node.out) zip (laneReqs zip laneResps)).foreach { + case (((tlIn, _), (tlOut, _)), (req, resp)) => + tlOut.a <> tlIn.a + tlIn.d <> tlOut.d - // requests on TL A channel - // - req.valid := tlIn.a.valid - req.size := tlIn.a.bits.size - def tlOpcodeIsStore(opcode: UInt): Bool = { - // 0: PutFullData, 1: PutPartialData but we don't support it - // 4: Get - assert(opcode === 0.U || opcode === 4.U, "unhandled TL opcode found in MemTraceLogger") - tlIn.a.bits.opcode === 0.U - } - req.is_store := tlOpcodeIsStore(tlIn.a.bits.opcode) - // TL always carries the exact unaligned address that the client - // originally requested, so no postprocessing required - req.address := tlIn.a.bits.address + // requests on TL A channel + // + req.valid := tlIn.a.valid + req.size := tlIn.a.bits.size + req.is_store := tlAOpcodeIsStore(tlIn.a.bits.opcode) + // TL always carries the exact unaligned address that the client + // originally requested, so no postprocessing required + req.address := tlIn.a.bits.address - // TL data - // - // When tlIn.a.bits.size is smaller than the data bus width, need to - // figure out which byte lanes we actually accessed so that - // we can write that to the memory trace. - // See Section 4.5 Byte Lanes in spec 1.8.1 + // TL data + // + // When tlIn.a.bits.size is smaller than the data bus width, need to + // figure out which byte lanes we actually accessed so that + // we can write that to the memory trace. 
+ // See Section 4.5 Byte Lanes in spec 1.8.1 - // This assert only holds true for PutFullData and not PutPartialData, - // where HIGH bits in the mask may not be contiguous. - assert( - PopCount(tlIn.a.bits.mask) === (1.U << tlIn.a.bits.size), - "mask HIGH bits do not match the TL size. This should have been handled by the TL generator logic" - ) - val trailingZerosInMask = trailingZeros(tlIn.a.bits.mask) - val mask = ~((~0.U) << (trailingZerosInMask * 8.U)) - req.data := mask & (tlIn.a.bits.data >> (trailingZerosInMask * 8.U)) - - when(req.valid) { - TracePrintf( - "MemTraceLogger", - tlIn.a.bits.address, - tlIn.a.bits.size, - tlIn.a.bits.mask, - req.is_store, - tlIn.a.bits.data, - req.data + // This assert only holds true for PutFullData and not PutPartialData, + // where HIGH bits in the mask may not be contiguous. + assert( + PopCount(tlIn.a.bits.mask) === (1.U << tlIn.a.bits.size), + "mask HIGH bits do not match the TL size. This should have been handled by the TL generator logic" ) - } + val trailingZerosInMask = trailingZeros(tlIn.a.bits.mask) + val mask = ~((~0.U) << (trailingZerosInMask * 8.U)) + req.data := mask & (tlIn.a.bits.data >> (trailingZerosInMask * 8.U)) - // responses on TL D channel - // TODO + when(req.valid) { + TracePrintf( + "MemTraceLogger", + tlIn.a.bits.address, + tlIn.a.bits.size, + tlIn.a.bits.mask, + req.is_store, + tlIn.a.bits.data, + req.data + ) + } + + // responses on TL D channel + // + resp.valid := tlOut.d.valid + resp.size := tlOut.d.bits.size + resp.is_store := tlDOpcodeIsStore(tlOut.d.bits.opcode) + // NOTE: TL D channel doesn't carry address nor mask, so there's no easy + // way to figure out which bytes the master actually use. Since we + // don't care too much about addresses in the trace anyway, just store + // the entire bits. + resp.address := 0.U + resp.data := tlOut.d.bits.data } // clunky workaround of the fact that Chisel doesn't allow partial // assignment to a bitfield range of a wide signal. 
- val laneValid = Wire(Vec(numLanes, Bool())) - val laneAddress = Wire(Vec(numLanes, chiselTypeOf(laneReqs(0).address))) - val laneIsStore = Wire(Vec(numLanes, chiselTypeOf(laneReqs(0).is_store))) - val laneSize = Wire(Vec(numLanes, chiselTypeOf(laneReqs(0).size))) - val laneData = Wire(Vec(numLanes, chiselTypeOf(laneReqs(0).data))) - laneReqs.zipWithIndex.foreach { case (req, i) => - laneValid(i) := req.valid - laneAddress(i) := req.address - laneIsStore(i) := req.is_store - laneSize(i) := req.size - laneData(i) := req.data + def flattenTrace(traceLogIO: Bundle with HasTraceReq, perLane: Vec[TraceReq]) = { + val laneValid = Wire(Vec(numLanes, Bool())) + val laneAddress = Wire(Vec(numLanes, chiselTypeOf(perLane(0).address))) + val laneIsStore = Wire(Vec(numLanes, chiselTypeOf(perLane(0).is_store))) + val laneSize = Wire(Vec(numLanes, chiselTypeOf(perLane(0).size))) + val laneData = Wire(Vec(numLanes, chiselTypeOf(perLane(0).data))) + perLane.zipWithIndex.foreach { case (req, i) => + laneValid(i) := req.valid + laneAddress(i) := req.address + laneIsStore(i) := req.is_store + laneSize(i) := req.size + laneData(i) := req.data + } + // flatten per-lane signals to the Verilog blackbox input + traceLogIO.valid := laneValid.asUInt + traceLogIO.address := laneAddress.asUInt + traceLogIO.is_store := laneIsStore.asUInt + traceLogIO.size := laneSize.asUInt + traceLogIO.data := laneData.asUInt } - // flatten per-lane signals to the Verilog blackbox input - sim.io.trace_log.valid := laneValid.asUInt - sim.io.trace_log.address := laneAddress.asUInt - sim.io.trace_log.is_store := laneIsStore.asUInt - sim.io.trace_log.size := laneSize.asUInt - sim.io.trace_log.data := laneData.asUInt - assert(sim.io.trace_log.ready === true.B, "MemTraceLogger is expected to be always ready") + flattenTrace(simReq.io.trace_log, laneReqs) + flattenTrace(simResp.io.trace_log, laneResps) + + assert(simReq.io.trace_log.ready === true.B, "MemTraceLogger is expected to be always ready") + 
assert(simResp.io.trace_log.ready === true.B, "MemTraceLogger is expected to be always ready") + + // val laneValid = Wire(Vec(numLanes, Bool())) + // val laneAddress = Wire(Vec(numLanes, chiselTypeOf(laneReqs(0).address))) + // val laneIsStore = Wire(Vec(numLanes, chiselTypeOf(laneReqs(0).is_store))) + // val laneSize = Wire(Vec(numLanes, chiselTypeOf(laneReqs(0).size))) + // val laneData = Wire(Vec(numLanes, chiselTypeOf(laneReqs(0).data))) + // laneReqs.zipWithIndex.foreach { case (req, i) => + // laneValid(i) := req.valid + // laneAddress(i) := req.address + // laneIsStore(i) := req.is_store + // laneSize(i) := req.size + // laneData(i) := req.data + // } + // // flatten per-lane signals to the Verilog blackbox input + // simReq.io.trace_log.valid := laneValid.asUInt + // simReq.io.trace_log.address := laneAddress.asUInt + // simReq.io.trace_log.is_store := laneIsStore.asUInt + // simReq.io.trace_log.size := laneSize.asUInt + // simReq.io.trace_log.data := laneData.asUInt } } -class SimMemTraceLogger(filename: String, numLanes: Int) +// MemTraceLogger is bidirectional. The DPI module tells itself if it's logging +// the request stream or the response stream by `isResponse`. This distinction +// is needed because the response trace file will not contain certain columns +// such as address. 
+class SimMemTraceLogger(isResponse: Boolean, filename: String, numLanes: Int) extends BlackBox( - Map("FILENAME" -> filename, "NUM_LANES" -> numLanes) + Map( + "IS_RESPONSE" -> (if (isResponse) 1 else 0), + "FILENAME" -> filename, + "NUM_LANES" -> numLanes + ) ) with HasBlackBoxResource { val io = IO(new Bundle { From d4a51cfee5b6484ab6aee6671de83a3b581b2b5e Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Mon, 17 Apr 2023 18:43:17 -0700 Subject: [PATCH 18/37] Log source ID in the trace --- src/main/resources/csrc/SimMemTrace.h | 6 +- src/main/resources/csrc/SimMemTraceLogger.cc | 24 +++--- src/main/resources/vsrc/SimMemTrace.v | 3 +- src/main/resources/vsrc/SimMemTraceLogger.v | 26 ++++--- src/main/scala/tilelink/Coalescing.scala | 79 +++++++++----------- 5 files changed, 71 insertions(+), 67 deletions(-) diff --git a/src/main/resources/csrc/SimMemTrace.h b/src/main/resources/csrc/SimMemTrace.h index fef1ad8..adc5ef0 100644 --- a/src/main/resources/csrc/SimMemTrace.h +++ b/src/main/resources/csrc/SimMemTrace.h @@ -5,10 +5,11 @@ struct MemTraceLine { bool valid = false; long cycle = 0; - bool is_store = 0; int core_id = 0; int lane_id = 0; + int source = 0; unsigned long address = 0; + bool is_store = 0; unsigned long data = 0; int log_data_size = 0; }; @@ -50,8 +51,9 @@ extern "C" int memtracelogger_init(int is_response, const char *filename); extern "C" void memtracelogger_log(int handle, unsigned char trace_log_valid, unsigned long trace_log_cycle, - unsigned long trace_log_address, int trace_log_lane_id, + int trace_log_source, + unsigned long trace_log_address, unsigned char trace_log_is_store, int trace_log_size, unsigned long trace_log_data, diff --git a/src/main/resources/csrc/SimMemTraceLogger.cc b/src/main/resources/csrc/SimMemTraceLogger.cc index c4cfde9..75a1822 100644 --- a/src/main/resources/csrc/SimMemTraceLogger.cc +++ b/src/main/resources/csrc/SimMemTraceLogger.cc @@ -35,9 +35,9 @@ MemTraceWriter::~MemTraceWriter() { } void 
MemTraceWriter::write_line_to_trace(const MemTraceLine line) { - fprintf(outfile, "%ld %s %d %d 0x%lx 0x%lx %u\n", line.cycle, + fprintf(outfile, "%ld %s %d %d %d, 0x%lx 0x%lx %u\n", line.cycle, (line.is_store ? "STORE" : "LOAD"), line.core_id, line.lane_id, - line.address, line.data, (1u << line.log_data_size)); + line.source, line.address, line.data, (1u << line.log_data_size)); } // Returns the "handle" ID for this particular logger instance. @@ -70,13 +70,16 @@ extern "C" int memtracelogger_init(int is_response, const char *filename) { // This is used to log both TileLink A and D channels. // TODO: accept core_id as well -extern "C" void -memtracelogger_log(int handle, - unsigned char trace_log_valid, unsigned long trace_log_cycle, - unsigned long trace_log_address, int trace_log_lane_id, - unsigned char trace_log_is_store, int trace_log_size, - unsigned long trace_log_data, - unsigned char *trace_log_ready) { +extern "C" void memtracelogger_log(int handle, + unsigned char trace_log_valid, + unsigned long trace_log_cycle, + int trace_log_lane_id, + int trace_log_source, + unsigned long trace_log_address, + unsigned char trace_log_is_store, + int trace_log_size, + unsigned long trace_log_data, + unsigned char *trace_log_ready) { // printf("memtrace_query(cycle=%ld, tid=%d)\n", trace_read_cycle, // trace_read_lane_id); *trace_log_ready = 1; @@ -90,10 +93,11 @@ memtracelogger_log(int handle, MemTraceLine line{.valid = (trace_log_valid == 1), .cycle = static_cast(trace_log_cycle), - .is_store = (trace_log_is_store == 1), .core_id = 0, // TODO support multicores .lane_id = trace_log_lane_id, + .source = trace_log_source, .address = trace_log_address, + .is_store = (trace_log_is_store == 1), .data = trace_log_data, .log_data_size = trace_log_size}; diff --git a/src/main/resources/vsrc/SimMemTrace.v b/src/main/resources/vsrc/SimMemTrace.v index a805130..f21aeec 100644 --- a/src/main/resources/vsrc/SimMemTrace.v +++ b/src/main/resources/vsrc/SimMemTrace.v @@ -15,7 
+15,7 @@ import "DPI-C" function void memtrace_query ( input bit trace_read_ready, input longint trace_read_cycle, - input int trace_read_tid, + input int trace_read_lane_id, output bit trace_read_valid, output longint trace_read_address, output bit trace_read_is_store, @@ -32,7 +32,6 @@ module SimMemTrace #(parameter FILENAME = "undefined", NUM_LANES = 4) ( input trace_read_ready, output [NUM_LANES-1:0] trace_read_valid, output [`DATA_WIDTH*NUM_LANES-1:0] trace_read_address, - output [NUM_LANES-1:0] trace_read_is_store, output [`LOGSIZE_WIDTH*NUM_LANES-1:0] trace_read_size, output [`DATA_WIDTH*NUM_LANES-1:0] trace_read_data, diff --git a/src/main/resources/vsrc/SimMemTraceLogger.v b/src/main/resources/vsrc/SimMemTraceLogger.v index 8c358d2..2f6ddae 100644 --- a/src/main/resources/vsrc/SimMemTraceLogger.v +++ b/src/main/resources/vsrc/SimMemTraceLogger.v @@ -1,6 +1,7 @@ // FIXME hardcoded `define DATA_WIDTH 64 `define MAX_NUM_LANES 32 +`define SOURCEID_WIDTH 32 `define LOGSIZE_WIDTH 32 import "DPI-C" function int memtracelogger_init( @@ -17,8 +18,9 @@ import "DPI-C" function void memtracelogger_log input int handle, input bit trace_log_valid, input longint trace_log_cycle, + input int trace_log_lane_id, + input int trace_log_source, input longint trace_log_address, - input int trace_log_tid, input bit trace_log_is_store, input int trace_log_size, input longint trace_log_data, @@ -29,16 +31,17 @@ module SimMemTraceLogger #(parameter IS_RESPONSE = 0, FILENAME = "undefined", NUM_LANES = 4) ( - input clock, - input reset, + input clock, + input reset, // NOTE: LSB is lane 0 - input [NUM_LANES-1:0] trace_log_valid, - input [`DATA_WIDTH*NUM_LANES-1:0] trace_log_address, - input [NUM_LANES-1:0] trace_log_is_store, - input [`LOGSIZE_WIDTH*NUM_LANES-1:0] trace_log_size, - input [`DATA_WIDTH*NUM_LANES-1:0] trace_log_data, - output trace_log_ready + input [NUM_LANES-1:0] trace_log_valid, + input [`SOURCEID_WIDTH*NUM_LANES-1:0] trace_log_source, + input 
[`DATA_WIDTH*NUM_LANES-1:0] trace_log_address, + input [NUM_LANES-1:0] trace_log_is_store, + input [`LOGSIZE_WIDTH*NUM_LANES-1:0] trace_log_size, + input [`DATA_WIDTH*NUM_LANES-1:0] trace_log_data, + output trace_log_ready ); int logger_handle; bit __in_ready; @@ -51,6 +54,7 @@ module SimMemTraceLogger #(parameter // wires going into the DPC wire __valid [NUM_LANES-1:0]; + wire [`SOURCEID_WIDTH-1:0] __source [NUM_LANES-1:0]; wire [`DATA_WIDTH-1:0] __address [NUM_LANES-1:0]; wire __is_store [NUM_LANES-1:0]; wire [`LOGSIZE_WIDTH-1:0] __size [NUM_LANES-1:0]; @@ -63,6 +67,7 @@ module SimMemTraceLogger #(parameter for (g = 0; g < NUM_LANES; g = g + 1) begin // LSB is lane 0 assign __valid[g] = trace_log_valid[g]; + assign __source[g] = trace_log_source[`SOURCEID_WIDTH*(g+1)-1:`SOURCEID_WIDTH*g]; assign __address[g] = trace_log_address[`DATA_WIDTH*(g+1)-1:`DATA_WIDTH*g]; assign __is_store[g] = trace_log_is_store[g]; assign __size[g] = trace_log_size[`LOGSIZE_WIDTH*(g+1)-1:`LOGSIZE_WIDTH*g]; @@ -87,8 +92,9 @@ module SimMemTraceLogger #(parameter logger_handle, __valid[tid], cycle_counter, - __address[tid], tid, + __source[tid], + __address[tid], __is_store[tid], __size[tid], __data[tid], diff --git a/src/main/scala/tilelink/Coalescing.scala b/src/main/scala/tilelink/Coalescing.scala index 775cce8..4ff21dd 100644 --- a/src/main/scala/tilelink/Coalescing.scala +++ b/src/main/scala/tilelink/Coalescing.scala @@ -607,16 +607,19 @@ class MemTraceDriver(numLanes: Int = 4, filename: String = "vecadd.core1.thread4 lazy val module = new MemTraceDriverImp(this, numLanes, filename) } -trait HasTraceReq { +trait HasTraceLine { val valid: UInt + val source: UInt val address: UInt val is_store: UInt val size: UInt val data: UInt } -class TraceReq extends Bundle with HasTraceReq { +// used for both request and response. 
response had address set to 0 +class TraceLine extends Bundle with HasTraceLine { val valid = Bool() + val source = UInt(32.W) val address = UInt(64.W) val is_store = Bool() val size = UInt(32.W) // this is log2(bytesize) as in TL A bundle @@ -636,9 +639,11 @@ class MemTraceDriverImp(outer: MemTraceDriver, numLanes: Int, traceFile: String) // Split output of SimMemTrace, which is flattened across all lanes, // back to each lane's. - val laneReqs = Wire(Vec(numLanes, new TraceReq)) + val laneReqs = Wire(Vec(numLanes, new TraceLine)) laneReqs.zipWithIndex.foreach { case (req, i) => req.valid := sim.io.trace_read.valid(i) + // TODO: don't take source id from the original trace for now + req.source := 0.U req.address := sim.io.trace_read.address(64 * i + 63, 64 * i) req.is_store := sim.io.trace_read.is_store(i) req.size := sim.io.trace_read.size(32 * i + 31, 32 * i) @@ -728,7 +733,7 @@ class SimMemTrace(filename: String, numLanes: Int) // These names have to match declarations in the Verilog code, eg. // trace_read_address. 
- val trace_read = new Bundle with HasTraceReq { + val trace_read = new Bundle { // FIXME: can't use HasTraceLine because this doesn't have source val ready = Input(Bool()) val valid = Output(UInt(numLanes.W)) // Chisel can't interface with Verilog 2D port, so flatten all lanes into @@ -786,8 +791,8 @@ class MemTraceLogger( simResp.io.clock := clock simResp.io.reset := reset.asBool - val laneReqs = Wire(Vec(numLanes, new TraceReq)) - val laneResps = Wire(Vec(numLanes, new TraceReq)) + val laneReqs = Wire(Vec(numLanes, new TraceLine)) + val laneResps = Wire(Vec(numLanes, new TraceLine)) assert( numLanes == node.in.length, @@ -818,6 +823,7 @@ class MemTraceLogger( req.valid := tlIn.a.valid req.size := tlIn.a.bits.size req.is_store := tlAOpcodeIsStore(tlIn.a.bits.opcode) + req.source := tlIn.a.bits.source // TL always carries the exact unaligned address that the client // originally requested, so no postprocessing required req.address := tlIn.a.bits.address @@ -856,6 +862,7 @@ class MemTraceLogger( resp.valid := tlOut.d.valid resp.size := tlOut.d.bits.size resp.is_store := tlDOpcodeIsStore(tlOut.d.bits.opcode) + resp.source := tlOut.d.bits.source // NOTE: TL D channel doesn't carry address nor mask, so there's no easy // way to figure out which bytes the master actually use. Since we // don't care too much about addresses in the trace anyway, just store @@ -864,27 +871,31 @@ class MemTraceLogger( resp.data := tlOut.d.bits.data } + // Flatten per-lane signals to the Verilog blackbox input. // clunky workaround of the fact that Chisel doesn't allow partial // assignment to a bitfield range of a wide signal. 
- def flattenTrace(traceLogIO: Bundle with HasTraceReq, perLane: Vec[TraceReq]) = { - val laneValid = Wire(Vec(numLanes, Bool())) - val laneAddress = Wire(Vec(numLanes, chiselTypeOf(perLane(0).address))) - val laneIsStore = Wire(Vec(numLanes, chiselTypeOf(perLane(0).is_store))) - val laneSize = Wire(Vec(numLanes, chiselTypeOf(perLane(0).size))) - val laneData = Wire(Vec(numLanes, chiselTypeOf(perLane(0).data))) + def flattenTrace(traceLogIO: Bundle with HasTraceLine, perLane: Vec[TraceLine]) = { + // these will get optimized out + val vecValid = Wire(Vec(numLanes, Bool())) + val vecSource = Wire(Vec(numLanes, Bool())) + val vecAddress = Wire(Vec(numLanes, chiselTypeOf(perLane(0).address))) + val vecIsStore = Wire(Vec(numLanes, chiselTypeOf(perLane(0).is_store))) + val vecSize = Wire(Vec(numLanes, chiselTypeOf(perLane(0).size))) + val vecData = Wire(Vec(numLanes, chiselTypeOf(perLane(0).data))) perLane.zipWithIndex.foreach { case (req, i) => - laneValid(i) := req.valid - laneAddress(i) := req.address - laneIsStore(i) := req.is_store - laneSize(i) := req.size - laneData(i) := req.data + vecValid(i) := req.valid + vecSource(i) := req.source + vecAddress(i) := req.address + vecIsStore(i) := req.is_store + vecSize(i) := req.size + vecData(i) := req.data } - // flatten per-lane signals to the Verilog blackbox input - traceLogIO.valid := laneValid.asUInt - traceLogIO.address := laneAddress.asUInt - traceLogIO.is_store := laneIsStore.asUInt - traceLogIO.size := laneSize.asUInt - traceLogIO.data := laneData.asUInt + traceLogIO.valid := vecValid.asUInt + traceLogIO.source := vecSource.asUInt + traceLogIO.address := vecAddress.asUInt + traceLogIO.is_store := vecIsStore.asUInt + traceLogIO.size := vecSize.asUInt + traceLogIO.data := vecData.asUInt } flattenTrace(simReq.io.trace_log, laneReqs) @@ -892,25 +903,6 @@ class MemTraceLogger( assert(simReq.io.trace_log.ready === true.B, "MemTraceLogger is expected to be always ready") assert(simResp.io.trace_log.ready === true.B, 
"MemTraceLogger is expected to be always ready") - - // val laneValid = Wire(Vec(numLanes, Bool())) - // val laneAddress = Wire(Vec(numLanes, chiselTypeOf(laneReqs(0).address))) - // val laneIsStore = Wire(Vec(numLanes, chiselTypeOf(laneReqs(0).is_store))) - // val laneSize = Wire(Vec(numLanes, chiselTypeOf(laneReqs(0).size))) - // val laneData = Wire(Vec(numLanes, chiselTypeOf(laneReqs(0).data))) - // laneReqs.zipWithIndex.foreach { case (req, i) => - // laneValid(i) := req.valid - // laneAddress(i) := req.address - // laneIsStore(i) := req.is_store - // laneSize(i) := req.size - // laneData(i) := req.data - // } - // // flatten per-lane signals to the Verilog blackbox input - // simReq.io.trace_log.valid := laneValid.asUInt - // simReq.io.trace_log.address := laneAddress.asUInt - // simReq.io.trace_log.is_store := laneIsStore.asUInt - // simReq.io.trace_log.size := laneSize.asUInt - // simReq.io.trace_log.data := laneData.asUInt } } @@ -931,8 +923,9 @@ class SimMemTraceLogger(isResponse: Boolean, filename: String, numLanes: Int) val clock = Input(Clock()) val reset = Input(Bool()) - val trace_log = new Bundle with HasTraceReq { + val trace_log = new Bundle with HasTraceLine { val valid = Input(UInt(numLanes.W)) + val source = Input(UInt((32 * numLanes).W)) // Chisel can't interface with Verilog 2D port, so flatten all lanes into // single wide 1D array. // TODO: assumes 64-bit address. 
From 02ce969c6712ebc804f754d320072c9ada65f711 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Mon, 17 Apr 2023 18:52:38 -0700 Subject: [PATCH 19/37] Fix width mismatch for source logger --- src/main/resources/csrc/SimMemTraceLogger.cc | 2 +- src/main/scala/tilelink/Coalescing.scala | 20 +++++++++++--------- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/src/main/resources/csrc/SimMemTraceLogger.cc b/src/main/resources/csrc/SimMemTraceLogger.cc index 75a1822..6df995c 100644 --- a/src/main/resources/csrc/SimMemTraceLogger.cc +++ b/src/main/resources/csrc/SimMemTraceLogger.cc @@ -35,7 +35,7 @@ MemTraceWriter::~MemTraceWriter() { } void MemTraceWriter::write_line_to_trace(const MemTraceLine line) { - fprintf(outfile, "%ld %s %d %d %d, 0x%lx 0x%lx %u\n", line.cycle, + fprintf(outfile, "%ld %s %d %d %d 0x%lx 0x%lx %u\n", line.cycle, (line.is_store ? "STORE" : "LOAD"), line.core_id, line.lane_id, line.source, line.address, line.data, (1u << line.log_data_size)); } diff --git a/src/main/scala/tilelink/Coalescing.scala b/src/main/scala/tilelink/Coalescing.scala index 4ff21dd..b892c1e 100644 --- a/src/main/scala/tilelink/Coalescing.scala +++ b/src/main/scala/tilelink/Coalescing.scala @@ -824,6 +824,7 @@ class MemTraceLogger( req.size := tlIn.a.bits.size req.is_store := tlAOpcodeIsStore(tlIn.a.bits.opcode) req.source := tlIn.a.bits.source + printf("======== req.source=%d\n", req.source) // TL always carries the exact unaligned address that the client // originally requested, so no postprocessing required req.address := tlIn.a.bits.address @@ -863,6 +864,7 @@ class MemTraceLogger( resp.size := tlOut.d.bits.size resp.is_store := tlDOpcodeIsStore(tlOut.d.bits.opcode) resp.source := tlOut.d.bits.source + printf("======== resp.source=%d\n", resp.source) // NOTE: TL D channel doesn't carry address nor mask, so there's no easy // way to figure out which bytes the master actually use. 
Since we // don't care too much about addresses in the trace anyway, just store @@ -876,19 +878,19 @@ class MemTraceLogger( // assignment to a bitfield range of a wide signal. def flattenTrace(traceLogIO: Bundle with HasTraceLine, perLane: Vec[TraceLine]) = { // these will get optimized out - val vecValid = Wire(Vec(numLanes, Bool())) - val vecSource = Wire(Vec(numLanes, Bool())) + val vecValid = Wire(Vec(numLanes, chiselTypeOf(perLane(0).valid))) + val vecSource = Wire(Vec(numLanes, chiselTypeOf(perLane(0).source))) val vecAddress = Wire(Vec(numLanes, chiselTypeOf(perLane(0).address))) val vecIsStore = Wire(Vec(numLanes, chiselTypeOf(perLane(0).is_store))) val vecSize = Wire(Vec(numLanes, chiselTypeOf(perLane(0).size))) val vecData = Wire(Vec(numLanes, chiselTypeOf(perLane(0).data))) - perLane.zipWithIndex.foreach { case (req, i) => - vecValid(i) := req.valid - vecSource(i) := req.source - vecAddress(i) := req.address - vecIsStore(i) := req.is_store - vecSize(i) := req.size - vecData(i) := req.data + perLane.zipWithIndex.foreach { case (l, i) => + vecValid(i) := l.valid + vecSource(i) := l.source + vecAddress(i) := l.address + vecIsStore(i) := l.is_store + vecSize(i) := l.size + vecData(i) := l.data } traceLogIO.valid := vecValid.asUInt traceLogIO.source := vecSource.asUInt From 334d05b22246a5635bc9862b8bad8adcdb663f93 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Mon, 17 Apr 2023 21:17:23 -0700 Subject: [PATCH 20/37] Remove leftover printf --- src/main/scala/tilelink/Coalescing.scala | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/src/main/scala/tilelink/Coalescing.scala b/src/main/scala/tilelink/Coalescing.scala index b892c1e..2afcae8 100644 --- a/src/main/scala/tilelink/Coalescing.scala +++ b/src/main/scala/tilelink/Coalescing.scala @@ -824,7 +824,6 @@ class MemTraceLogger( req.size := tlIn.a.bits.size req.is_store := tlAOpcodeIsStore(tlIn.a.bits.opcode) req.source := tlIn.a.bits.source - printf("======== req.source=%d\n", 
req.source) // TL always carries the exact unaligned address that the client // originally requested, so no postprocessing required req.address := tlIn.a.bits.address @@ -864,7 +863,6 @@ class MemTraceLogger( resp.size := tlOut.d.bits.size resp.is_store := tlDOpcodeIsStore(tlOut.d.bits.opcode) resp.source := tlOut.d.bits.source - printf("======== resp.source=%d\n", resp.source) // NOTE: TL D channel doesn't carry address nor mask, so there's no easy // way to figure out which bytes the master actually use. Since we // don't care too much about addresses in the trace anyway, just store @@ -874,7 +872,8 @@ class MemTraceLogger( } // Flatten per-lane signals to the Verilog blackbox input. - // clunky workaround of the fact that Chisel doesn't allow partial + // + // This is a clunky workaround of the fact that Chisel doesn't allow partial // assignment to a bitfield range of a wide signal. def flattenTrace(traceLogIO: Bundle with HasTraceLine, perLane: Vec[TraceLine]) = { // these will get optimized out @@ -908,10 +907,10 @@ class MemTraceLogger( } } -// MemTraceLogger is bidirectional. The DPI module tells itself if it's logging -// the request stream or the response stream by `isResponse`. This distinction -// is needed because the response trace file will not contain certain columns -// such as address. +// MemTraceLogger is bidirectional, and `isResponse` is how the DPI module tells +// itself whether it's logging the request stream or the response stream. This +// is necessary because we have to generate slightly different trace format +// depending on this, e.g. response trace will not contain an address column. 
class SimMemTraceLogger(isResponse: Boolean, filename: String, numLanes: Int) extends BlackBox( Map( From 2ac5ee398a5e7d3fe172ec88929f3f7a77b645a2 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Mon, 17 Apr 2023 21:31:13 -0700 Subject: [PATCH 21/37] Doc --- src/main/scala/tilelink/Coalescing.scala | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/main/scala/tilelink/Coalescing.scala b/src/main/scala/tilelink/Coalescing.scala index 2afcae8..525cf32 100644 --- a/src/main/scala/tilelink/Coalescing.scala +++ b/src/main/scala/tilelink/Coalescing.scala @@ -204,9 +204,10 @@ class CoalescingUnitImp(outer: CoalescingUnit, numLanes: Int) extends LazyModule val (tlCoal, edgeCoal) = outer.coalescerNode.out(0) val coalReqAddress = Wire(UInt(tlCoal.params.addressBits.W)) - // TODO: bogus address + // FIXME: bogus address coalReqAddress := (0xabcd.U + coalSourceId) << 4 - // FIXME: coalesce lane 0 and lane 2's queue head whenever they're valid + // FIXME: bogus coalescing logic: coalesce whenever all 4 lanes have valid + // queue head coalReqValid := reqQueues(0).io.deq.valid && reqQueues(1).io.deq.valid && reqQueues(2).io.deq.valid && reqQueues(3).io.deq.valid when(coalReqValid) { From 65a22b7fcb87128376c207e90e77141726fe2de1 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Tue, 18 Apr 2023 00:09:48 -0700 Subject: [PATCH 22/37] Use TLMessages constants instead of numbers --- src/main/scala/tilelink/Coalescing.scala | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/src/main/scala/tilelink/Coalescing.scala b/src/main/scala/tilelink/Coalescing.scala index 525cf32..76bb32f 100644 --- a/src/main/scala/tilelink/Coalescing.scala +++ b/src/main/scala/tilelink/Coalescing.scala @@ -801,16 +801,18 @@ class MemTraceLogger( ) def tlAOpcodeIsStore(opcode: UInt): Bool = { - // 0: PutFullData, 1: PutPartialData but we don't support it - // 4: Get - assert(opcode === 0.U || opcode === 4.U, "unhandled TL A opcode found") - opcode === 
0.U + assert( + opcode === TLMessages.PutFullData || opcode === TLMessages.PutPartialData, + "unhandled TL A opcode found" + ) + opcode === TLMessages.PutFullData } def tlDOpcodeIsStore(opcode: UInt): Bool = { - // 0: AccessAck (Put), 1: AccessAckData (Get or Atomic) - // See Table 13 of spec 1.8.1 - assert(opcode === 0.U || opcode === 1.U, "unhandled TL D opcode found") - opcode === 0.U + assert( + opcode === TLMessages.AccessAck || opcode === TLMessages.AccessAckData, + "unhandled TL D opcode found" + ) + opcode === TLMessages.AccessAck } // snoop on the TileLink edges to log traffic From 322f3406570a7542ea45fdd61ebff46d145586d2 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Tue, 18 Apr 2023 00:14:33 -0700 Subject: [PATCH 23/37] Manage to make new bugs while doing that --- src/main/scala/tilelink/Coalescing.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/scala/tilelink/Coalescing.scala b/src/main/scala/tilelink/Coalescing.scala index 76bb32f..7a81533 100644 --- a/src/main/scala/tilelink/Coalescing.scala +++ b/src/main/scala/tilelink/Coalescing.scala @@ -802,7 +802,7 @@ class MemTraceLogger( def tlAOpcodeIsStore(opcode: UInt): Bool = { assert( - opcode === TLMessages.PutFullData || opcode === TLMessages.PutPartialData, + opcode === TLMessages.PutFullData || opcode === TLMessages.Get, "unhandled TL A opcode found" ) opcode === TLMessages.PutFullData From b2ab45f5e247e028ae7d0cf41a88a640bd926d8f Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Thu, 20 Apr 2023 21:09:19 -0700 Subject: [PATCH 24/37] Change parameters to spatial-only coalescing --- src/main/scala/tilelink/Coalescing.scala | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/main/scala/tilelink/Coalescing.scala b/src/main/scala/tilelink/Coalescing.scala index 7a81533..4c81b50 100644 --- a/src/main/scala/tilelink/Coalescing.scala +++ b/src/main/scala/tilelink/Coalescing.scala @@ -55,7 +55,7 @@ class CoalescingUnitImp(outer: CoalescingUnit, 
numLanes: Int) extends LazyModule val wordSize = 4 - val reqQueueDepth = 4 // FIXME test + val reqQueueDepth = 1 val respQueueDepth = 4 // FIXME test val sourceWidth = outer.node.in(1)._1.params.sourceBits @@ -66,8 +66,8 @@ class CoalescingUnitImp(outer: CoalescingUnit, numLanes: Int) extends LazyModule } // The maximum number of requests from a single lane that can go into a - // coalesced request. Upper bound is 2**sourceWidth. - val numPerLaneReqs = 2 + // coalesced request. Upper bound is request queue depth. + val numPerLaneReqs = 1 val respQueueEntryT = new RespQueueEntry(sourceWidth, wordSize * 8) val respQueues = Seq.tabulate(numLanes) { _ => @@ -280,8 +280,6 @@ class CoalescingUnitImp(outer: CoalescingUnit, numLanes: Int) extends LazyModule uncoalescer.io.coalRespSrcId := tlCoal.d.bits.source uncoalescer.io.coalRespData := tlCoal.d.bits.data - // TODO: multibeat TL requests. Currently tlCoal.d.bits.data is fixed to 64b - // width println(s"=========== coalRespData width: ${tlCoal.d.bits.data.widthOption.get}") // Queue up synthesized uncoalesced responses into each lane's response queue @@ -511,6 +509,8 @@ class CoalShiftQueue[T <: Data]( val invalidate = Input(UInt(entries.W)) val mask = Output(UInt(entries.W)) val elts = Output(Vec(entries, gen)) + // 'QueueIO' provides io.count, but we might not want to use it in the + // coalescer because it has potentially expensive PopCount }) private val valid = RegInit(VecInit(Seq.fill(entries) { false.B })) From 7e405b53559bf88c38f6c12782f4086c93a7176f Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Thu, 20 Apr 2023 21:11:12 -0700 Subject: [PATCH 25/37] Re-enable coalescer in TLRAMCoalescerLoggerTest Now that the driver and logger are working (kinda). 
--- src/main/scala/tilelink/Coalescing.scala | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/main/scala/tilelink/Coalescing.scala b/src/main/scala/tilelink/Coalescing.scala index 4c81b50..ecb8668 100644 --- a/src/main/scala/tilelink/Coalescing.scala +++ b/src/main/scala/tilelink/Coalescing.scala @@ -593,7 +593,7 @@ class MemTraceDriver(numLanes: Int = 4, filename: String = "vecadd.core1.thread4 val clientParam = Seq( TLMasterParameters.v1( name = "MemTraceDriver" + i.toString, - sourceId = IdRange(0, 0x1000) + sourceId = IdRange(0, 0x10) // visibility = Seq(AddressSet(0x0000, 0xffffff)) ) ) @@ -972,10 +972,10 @@ object TracePrintf { class TLRAMCoalescerLogger(implicit p: Parameters) extends LazyModule { // TODO: use parameters for numLanes val numLanes = 4 - // val coal = LazyModule(new CoalescingUnit(numLanes)) + val coal = LazyModule(new CoalescingUnit(numLanes)) val driver = LazyModule(new MemTraceDriver(numLanes)) - val logger = LazyModule(new MemTraceLogger(numLanes)) - val rams = Seq.fill(numLanes)( // +1 for coalesced edge + val logger = LazyModule(new MemTraceLogger(numLanes + 1)) + val rams = Seq.fill(numLanes + 1)( // +1 for coalesced edge LazyModule( // NOTE: beatBytes here sets the data bitwidth of the upstream TileLink // edges globally, by way of Diplomacy communicating the TL slave @@ -984,8 +984,7 @@ class TLRAMCoalescerLogger(implicit p: Parameters) extends LazyModule { ) ) - // logger.node :=* coal.node :=* driver.node - logger.node :=* driver.node + logger.node :=* coal.node :=* driver.node rams.foreach { r => r.node := logger.node } lazy val module = new Impl From de6d6eee1a923b5e9eb1d02d6c16616c0c17d74d Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Thu, 20 Apr 2023 21:12:19 -0700 Subject: [PATCH 26/37] Fix request shift queue not enqueuing when empty The queue was enabling shifting of the registers whenever deq.ready was 1, even when the queue was empty. 
This caused `wen` to disable writing enq.bits to any of the entries in the queue. Fixed by setting `shift` to 0 when queue is empty. --- src/main/scala/tilelink/Coalescing.scala | 13 +++--- .../scala/coalescing/CoalescingUnitTest.scala | 41 ++++++++++++++++++- 2 files changed, 48 insertions(+), 6 deletions(-) diff --git a/src/main/scala/tilelink/Coalescing.scala b/src/main/scala/tilelink/Coalescing.scala index ecb8668..7f01413 100644 --- a/src/main/scala/tilelink/Coalescing.scala +++ b/src/main/scala/tilelink/Coalescing.scala @@ -125,14 +125,13 @@ class CoalescingUnitImp(outer: CoalescingUnit, numLanes: Int) extends LazyModule req.address := tlIn.a.bits.address req.data := tlIn.a.bits.data + assert(reqQueue.io.enq.ready, "reqQueue is supposed to be always ready") reqQueue.io.enq.valid := tlIn.a.valid reqQueue.io.enq.bits := req // TODO: deq.ready should respect downstream ready reqQueue.io.deq.ready := true.B reqQueue.io.invalidate := 0.U - printf(s"reqQueue(${lane}).count=%d\n", reqQueue.io.count) - // Invalidate coalesced requests // FIXME: hardcoded lanes // val invalidate = coalReqValid && (lane == 0 || lane == 2).B @@ -208,8 +207,9 @@ class CoalescingUnitImp(outer: CoalescingUnit, numLanes: Int) extends LazyModule coalReqAddress := (0xabcd.U + coalSourceId) << 4 // FIXME: bogus coalescing logic: coalesce whenever all 4 lanes have valid // queue head - coalReqValid := reqQueues(0).io.deq.valid && reqQueues(1).io.deq.valid && - reqQueues(2).io.deq.valid && reqQueues(3).io.deq.valid + // coalReqValid := reqQueues(0).io.deq.valid && reqQueues(1).io.deq.valid && + // reqQueues(2).io.deq.valid && reqQueues(3).io.deq.valid + coalReqValid := false.B when(coalReqValid) { // invalidate original requests due to coalescing // FIXME: bogus @@ -245,7 +245,9 @@ class CoalescingUnitImp(outer: CoalescingUnit, numLanes: Int) extends LazyModule val newEntry = Wire( new InflightCoalReqTableEntry(numLanes, numPerLaneReqs, sourceWidth, offsetBits, sizeBits) ) + 
println(s"=========== table sourceWidth: ${sourceWidth}") + newEntry.source := coalSourceId newEntry.lanes.foreach { l => l.reqs.zipWithIndex.foreach { case (r, i) => @@ -535,11 +537,12 @@ class CoalShiftQueue[T <: Data]( def paddedUsed = pad({ i: Int => used(i) }) def validAfterInv(i: Int) = valid(i) && !io.invalidate(i) - val shift = io.deq.ready || (used =/= 0.U) && !validAfterInv(0) + val shift = (used =/= 0.U) && (io.deq.ready || !validAfterInv(0)) for (i <- 0 until entries) { val wdata = if (i == entries - 1) io.enq.bits else Mux(!used(i + 1), io.enq.bits, elts(i + 1)) val wen = Mux( shift, + // enqueue to the top entry which will be shifted down and make space (io.enq.fire && !paddedUsed(i + 1) && used(i)) || pad(validAfterInv)(i + 1), // enqueue to the first empty slot above the top (io.enq.fire && paddedUsed(i - 1) && !used(i)) || !validAfterInv(i) diff --git a/src/test/scala/coalescing/CoalescingUnitTest.scala b/src/test/scala/coalescing/CoalescingUnitTest.scala index 47db1b6..2556c35 100644 --- a/src/test/scala/coalescing/CoalescingUnitTest.scala +++ b/src/test/scala/coalescing/CoalescingUnitTest.scala @@ -87,7 +87,7 @@ class CoalShiftQueueTest extends AnyFlatSpec with ChiselScalatestTester { c.io.invalidate.poke(0.U) // prepare - c.io.deq.ready.poke(false.B) + c.io.deq.ready.poke(true.B) c.io.enq.ready.expect(true.B) c.io.enq.valid.poke(true.B) c.io.enq.bits.poke(0x12.U) @@ -113,6 +113,45 @@ class CoalShiftQueueTest extends AnyFlatSpec with ChiselScalatestTester { } } + it should "work when enqueing and dequeueing simultaneously to a full queue" in { + test(new CoalShiftQueue(UInt(8.W), 1)) { c => + c.io.invalidate.poke(0.U) + + // prepare + c.io.deq.ready.poke(true.B) + c.io.enq.ready.expect(true.B) + c.io.enq.valid.poke(true.B) + c.io.enq.bits.poke(0x12.U) + c.clock.step() + // enqueue and dequeue simultaneously + c.io.deq.ready.poke(true.B) + c.io.enq.ready.expect(true.B) + c.io.enq.valid.poke(true.B) + c.io.enq.bits.poke(0x34.U) + 
c.io.deq.valid.expect(true.B) + c.io.deq.bits.expect(0x12.U) + c.clock.step() + // enqueue and dequeue simultaneously once more + c.io.deq.ready.poke(true.B) + c.io.enq.ready.expect(true.B) + c.io.enq.valid.poke(true.B) + c.io.enq.bits.poke(0x56.U) + c.io.deq.valid.expect(true.B) + c.io.deq.bits.expect(0x34.U) + c.clock.step() + // dequeueing back-to-back should work without any holes in the middle + c.io.deq.ready.poke(true.B) + c.io.enq.valid.poke(false.B) + c.io.deq.valid.expect(true.B) + c.io.deq.bits.expect(0x56.U) + c.clock.step() + // make sure is empty + c.io.deq.ready.poke(true.B) + c.io.enq.valid.poke(false.B) + c.io.deq.valid.expect(false.B) + } + } + it should "invalidate head being dequeued" in { test(new CoalShiftQueue(UInt(8.W), 4)) { c => c.io.invalidate.poke(0.U) From a9719d5e36ee5029c9094dd9947ef91ec432f686 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Thu, 20 Apr 2023 21:42:56 -0700 Subject: [PATCH 27/37] Split tl*OpcodeIsStore out to global scope --- src/main/scala/tilelink/Coalescing.scala | 42 +++++++++++++----------- 1 file changed, 22 insertions(+), 20 deletions(-) diff --git a/src/main/scala/tilelink/Coalescing.scala b/src/main/scala/tilelink/Coalescing.scala index 7f01413..69689d9 100644 --- a/src/main/scala/tilelink/Coalescing.scala +++ b/src/main/scala/tilelink/Coalescing.scala @@ -207,9 +207,9 @@ class CoalescingUnitImp(outer: CoalescingUnit, numLanes: Int) extends LazyModule coalReqAddress := (0xabcd.U + coalSourceId) << 4 // FIXME: bogus coalescing logic: coalesce whenever all 4 lanes have valid // queue head - // coalReqValid := reqQueues(0).io.deq.valid && reqQueues(1).io.deq.valid && - // reqQueues(2).io.deq.valid && reqQueues(3).io.deq.valid - coalReqValid := false.B + coalReqValid := reqQueues(0).io.deq.valid && reqQueues(1).io.deq.valid && + reqQueues(2).io.deq.valid && reqQueues(3).io.deq.valid + // coalReqValid := false.B when(coalReqValid) { // invalidate original requests due to coalescing // FIXME: bogus @@ -588,6 
+588,23 @@ class CoalShiftQueue[T <: Data]( io.count := PopCount(io.mask) } +object TLUtils { + def AOpcodeIsStore(opcode: UInt): Bool = { + assert( + opcode === TLMessages.PutFullData || opcode === TLMessages.Get, + "unhandled TL A opcode found" + ) + opcode === TLMessages.PutFullData + } + def DOpcodeIsStore(opcode: UInt): Bool = { + assert( + opcode === TLMessages.AccessAck || opcode === TLMessages.AccessAckData, + "unhandled TL D opcode found" + ) + opcode === TLMessages.AccessAck + } +} + class MemTraceDriver(numLanes: Int = 4, filename: String = "vecadd.core1.thread4.trace")(implicit p: Parameters ) extends LazyModule { @@ -803,21 +820,6 @@ class MemTraceLogger( "`numLanes` does not match the number of TL edges connected to the MemTraceLogger" ) - def tlAOpcodeIsStore(opcode: UInt): Bool = { - assert( - opcode === TLMessages.PutFullData || opcode === TLMessages.Get, - "unhandled TL A opcode found" - ) - opcode === TLMessages.PutFullData - } - def tlDOpcodeIsStore(opcode: UInt): Bool = { - assert( - opcode === TLMessages.AccessAck || opcode === TLMessages.AccessAckData, - "unhandled TL D opcode found" - ) - opcode === TLMessages.AccessAck - } - // snoop on the TileLink edges to log traffic ((node.in zip node.out) zip (laneReqs zip laneResps)).foreach { case (((tlIn, _), (tlOut, _)), (req, resp)) => @@ -828,7 +830,7 @@ class MemTraceLogger( // req.valid := tlIn.a.valid req.size := tlIn.a.bits.size - req.is_store := tlAOpcodeIsStore(tlIn.a.bits.opcode) + req.is_store := TLUtils.AOpcodeIsStore(tlIn.a.bits.opcode) req.source := tlIn.a.bits.source // TL always carries the exact unaligned address that the client // originally requested, so no postprocessing required @@ -867,7 +869,7 @@ class MemTraceLogger( // resp.valid := tlOut.d.valid resp.size := tlOut.d.bits.size - resp.is_store := tlDOpcodeIsStore(tlOut.d.bits.opcode) + resp.is_store := TLUtils.DOpcodeIsStore(tlOut.d.bits.opcode) resp.source := tlOut.d.bits.source // NOTE: TL D channel doesn't carry address 
nor mask, so there's no easy // way to figure out which bytes the master actually use. Since we From de478dcca9f6c638694dedc192c4b3a4f6a4435f Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Thu, 20 Apr 2023 21:57:40 -0700 Subject: [PATCH 28/37] Generate both Put/Get for non-coalesced requests --- src/main/scala/tilelink/Coalescing.scala | 51 ++++++++++++++++++------ 1 file changed, 38 insertions(+), 13 deletions(-) diff --git a/src/main/scala/tilelink/Coalescing.scala b/src/main/scala/tilelink/Coalescing.scala index 69689d9..8a1d91d 100644 --- a/src/main/scala/tilelink/Coalescing.scala +++ b/src/main/scala/tilelink/Coalescing.scala @@ -37,14 +37,21 @@ class CoalescingUnit(numLanes: Int = 1)(implicit p: Parameters) extends LazyModu lazy val module = new CoalescingUnitImp(this, numLanes) } -class ReqQueueEntry(val sourceWidth: Int, val addressWidth: Int) extends Bundle { +// FIXME: this overlaps a lot with HasTraceLine +class ReqQueueEntry(val sourceWidth: Int, val addressWidth: Int, val sizeWidth: Int) + extends Bundle { val source = UInt(sourceWidth.W) + val isStore = Bool() val address = UInt(addressWidth.W) + val size = UInt(sizeWidth.W) // log(sizeInBytes) val data = UInt(64.W /* FIXME hardcoded */ ) // write data } -class RespQueueEntry(val sourceWidth: Int, val dataWidthInBits: Int) extends Bundle { +class RespQueueEntry(val sourceWidth: Int, val dataWidthInBits: Int, val sizeWidth: Int) + extends Bundle { val source = UInt(sourceWidth.W) + val isStore = Bool() + val size = UInt(sizeWidth.W) // log(sizeInBytes) val data = UInt(dataWidthInBits.W) // read data } @@ -53,6 +60,7 @@ class CoalescingUnitImp(outer: CoalescingUnit, numLanes: Int) extends LazyModule // coalescer TL master node assert(outer.node.in.length >= 2) + // 32-bit system. 
FIXME hardcoded val wordSize = 4 val reqQueueDepth = 1 @@ -60,7 +68,8 @@ class CoalescingUnitImp(outer: CoalescingUnit, numLanes: Int) extends LazyModule val sourceWidth = outer.node.in(1)._1.params.sourceBits val addressWidth = outer.node.in(1)._1.params.addressBits - val reqQueueEntryT = new ReqQueueEntry(sourceWidth, addressWidth) + val sizeWidth = outer.node.in(1)._1.params.sizeBits + val reqQueueEntryT = new ReqQueueEntry(sourceWidth, addressWidth, sizeWidth) val reqQueues = Seq.tabulate(numLanes) { _ => Module(new CoalShiftQueue(reqQueueEntryT, reqQueueDepth)) } @@ -69,7 +78,7 @@ class CoalescingUnitImp(outer: CoalescingUnit, numLanes: Int) extends LazyModule // coalesced request. Upper bound is request queue depth. val numPerLaneReqs = 1 - val respQueueEntryT = new RespQueueEntry(sourceWidth, wordSize * 8) + val respQueueEntryT = new RespQueueEntry(sourceWidth, wordSize * 8, sizeWidth) val respQueues = Seq.tabulate(numLanes) { _ => Module( new MultiPortQueue( @@ -122,7 +131,9 @@ class CoalescingUnitImp(outer: CoalescingUnit, numLanes: Int) extends LazyModule val reqQueue = reqQueues(lane) val req = Wire(reqQueueEntryT) req.source := tlIn.a.bits.source + req.isStore := TLUtils.AOpcodeIsStore(tlIn.a.bits.opcode) req.address := tlIn.a.bits.address + req.size := tlIn.a.bits.size req.data := tlIn.a.bits.data assert(reqQueue.io.enq.ready, "reqQueue is supposed to be always ready") @@ -139,15 +150,22 @@ class CoalescingUnitImp(outer: CoalescingUnit, numLanes: Int) extends LazyModule tlOut.a.valid := reqQueue.io.deq.valid && !invalidate val reqHead = reqQueue.io.deq.bits - // FIXME: generate Get or Put according to read/write - val (reqLegal, reqBits) = edgeOut.Get( + val (plegal, pbits) = edgeOut.Put( fromSource = reqHead.source, - // `toAddress` should be aligned to 2**lgSize toAddress = reqHead.address, - lgSize = 0.U + lgSize = reqHead.size, + // data should be aligned to beatBytes + data = reqHead.data ) - assert(reqLegal, "unhandled illegal TL req gen") - 
tlOut.a.bits := reqBits + val (glegal, gbits) = edgeOut.Get( + fromSource = reqHead.source, + toAddress = reqHead.address, + lgSize = reqHead.size + ) + val legal = Mux(reqHead.isStore, plegal, glegal) + val bits = Mux(reqHead.isStore, pbits, gbits) + assert(legal, "unhandled illegal TL req gen") + tlOut.a.bits := bits // Response queue // @@ -156,8 +174,9 @@ class CoalescingUnitImp(outer: CoalescingUnit, numLanes: Int) extends LazyModule val respQueue = respQueues(lane) val resp = Wire(respQueueEntryT) resp.source := tlOut.d.bits.source + resp.isStore := TLUtils.DOpcodeIsStore(tlOut.d.bits.opcode) + resp.size := tlOut.d.bits.size resp.data := tlOut.d.bits.data - // TODO: read/write bit? // Queue up responses that didn't get coalesced originally ("noncoalesced" responses). // Coalesced (but uncoalesced back) responses will also be enqueued into the same queue. @@ -172,6 +191,7 @@ class CoalescingUnitImp(outer: CoalescingUnit, numLanes: Int) extends LazyModule tlIn.d.valid := respQueue.io.deq(respQueueNoncoalPort).valid val respHead = respQueue.io.deq(respQueueNoncoalPort).bits + // TODO: AccessAckData for Get val respBits = edgeIn.AccessAck( toSource = respHead.source, lgSize = 0.U, @@ -271,6 +291,7 @@ class CoalescingUnitImp(outer: CoalescingUnit, numLanes: Int) extends LazyModule numLanes, numPerLaneReqs, sourceWidth, + sizeWidth, coalDataWidth, outer.numInflightCoalRequests ) @@ -313,6 +334,7 @@ class UncoalescingUnit( val numLanes: Int, val numPerLaneReqs: Int, val sourceWidth: Int, + val sizeWidth: Int, val coalDataWidth: Int, val numInflightCoalRequests: Int ) extends Module { @@ -328,7 +350,10 @@ class UncoalescingUnit( val coalRespSrcId = Input(UInt(sourceWidth.W)) val coalRespData = Input(UInt(coalDataWidth.W)) val uncoalResps = Output( - Vec(numLanes, Vec(numPerLaneReqs, ValidIO(new RespQueueEntry(sourceWidth, wordSize * 8)))) + Vec( + numLanes, + Vec(numPerLaneReqs, ValidIO(new RespQueueEntry(sourceWidth, wordSize * 8, sizeWidth))) + ) ) }) @@ -692,7 
+717,7 @@ class MemTraceDriverImp(outer: MemTraceDriver, numLanes: Int, traceFile: String) fromSource = sourceIdCounter, toAddress = hashToValidPhyAddr(req.address), lgSize = req.size, // trace line already holds log2(size) - // Need to construct data that is correctly aligned to beatBytes + // data should be aligned to beatBytes data = (req.data << (8.U * (req.address % edge.manager.beatBytes.U))) ) val (glegal, gbits) = edge.Get( From 6ae08b654158cb3da483169533d24a0e517742b9 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Fri, 21 Apr 2023 11:24:57 -0700 Subject: [PATCH 29/37] Add missing sizeWidth to uncoalescer test --- src/test/scala/coalescing/CoalescingUnitTest.scala | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/test/scala/coalescing/CoalescingUnitTest.scala b/src/test/scala/coalescing/CoalescingUnitTest.scala index 2556c35..5f722c5 100644 --- a/src/test/scala/coalescing/CoalescingUnitTest.scala +++ b/src/test/scala/coalescing/CoalescingUnitTest.scala @@ -255,6 +255,7 @@ class UncoalescingUnitTest extends AnyFlatSpec with ChiselScalatestTester { val numLanes = 4 val numPerLaneReqs = 2 val sourceWidth = 2 + val sizeWidth = 2 // 16B coalescing size val coalDataWidth = 128 val numInflightCoalRequests = 4 @@ -265,8 +266,9 @@ class UncoalescingUnitTest extends AnyFlatSpec with ChiselScalatestTester { numLanes, numPerLaneReqs, sourceWidth, + sizeWidth, coalDataWidth, - numInflightCoalRequests + numInflightCoalRequests, ) ) // vcs helps with simulation time, but sometimes errors with From e04ffe213016ced42ff6f5e8fbddfe8a251a9494 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Fri, 21 Apr 2023 15:53:15 -0700 Subject: [PATCH 30/37] Generate separate traces per logger/req/resp --- src/main/scala/tilelink/Coalescing.scala | 67 +++++++++++++++++------- 1 file changed, 48 insertions(+), 19 deletions(-) diff --git a/src/main/scala/tilelink/Coalescing.scala b/src/main/scala/tilelink/Coalescing.scala index 8a1d91d..2abb17f 100644 --- 
a/src/main/scala/tilelink/Coalescing.scala +++ b/src/main/scala/tilelink/Coalescing.scala @@ -154,7 +154,9 @@ class CoalescingUnitImp(outer: CoalescingUnit, numLanes: Int) extends LazyModule fromSource = reqHead.source, toAddress = reqHead.address, lgSize = reqHead.size, - // data should be aligned to beatBytes + // data is already aligned by MemTraceDriver + // NOTE: if tlIn has different parameters, this will no longer be the + // case data = reqHead.data ) val (glegal, gbits) = edgeOut.Get( @@ -799,9 +801,14 @@ class SimMemTrace(filename: String, numLanes: Int) } class MemTraceLogger( - numLanes: Int = 4, - reqFilename: String = "vecadd.core1.thread4.logger.req.trace", - respFilename: String = "vecadd.core1.thread4.logger.resp.trace" + numLanes: Int, + // base filename for the generated trace files. full filename will be + // suffixed depending on `reqEnable`/`respEnable`/`loggerName`. + filename: String = "vecadd.core1.thread4.trace", + reqEnable: Boolean = true, + respEnable: Boolean = true, + // filename suffix that is unique to this logger module. 
+ loggerName: String = ".logger" )(implicit p: Parameters ) extends LazyModule { @@ -830,12 +837,22 @@ class MemTraceLogger( lazy val module = new Impl class Impl extends LazyModuleImp(this) { - val simReq = Module(new SimMemTraceLogger(false, reqFilename, numLanes)) - val simResp = Module(new SimMemTraceLogger(true, respFilename, numLanes)) - simReq.io.clock := clock - simReq.io.reset := reset.asBool - simResp.io.clock := clock - simResp.io.reset := reset.asBool + val simReq = + if (reqEnable) + Some(Module(new SimMemTraceLogger(false, s"${filename}.${loggerName}.req", numLanes))) + else None + val simResp = + if (respEnable) + Some(Module(new SimMemTraceLogger(true, s"${filename}.${loggerName}.resp", numLanes))) + else None + if (simReq.isDefined) { + simReq.get.io.clock := clock + simReq.get.io.reset := reset.asBool + } + if (simResp.isDefined) { + simResp.get.io.clock := clock + simResp.get.io.reset := reset.asBool + } val laneReqs = Wire(Vec(numLanes, new TraceLine)) val laneResps = Wire(Vec(numLanes, new TraceLine)) @@ -932,11 +949,20 @@ class MemTraceLogger( traceLogIO.data := vecData.asUInt } - flattenTrace(simReq.io.trace_log, laneReqs) - flattenTrace(simResp.io.trace_log, laneResps) - - assert(simReq.io.trace_log.ready === true.B, "MemTraceLogger is expected to be always ready") - assert(simResp.io.trace_log.ready === true.B, "MemTraceLogger is expected to be always ready") + if (simReq.isDefined) { + flattenTrace(simReq.get.io.trace_log, laneReqs) + assert( + simReq.get.io.trace_log.ready === true.B, + "MemTraceLogger is expected to be always ready" + ) + } + if (simResp.isDefined) { + flattenTrace(simResp.get.io.trace_log, laneResps) + assert( + simResp.get.io.trace_log.ready === true.B, + "MemTraceLogger is expected to be always ready" + ) + } } } @@ -1002,9 +1028,12 @@ object TracePrintf { class TLRAMCoalescerLogger(implicit p: Parameters) extends LazyModule { // TODO: use parameters for numLanes val numLanes = 4 - val coal = LazyModule(new 
CoalescingUnit(numLanes)) val driver = LazyModule(new MemTraceDriver(numLanes)) - val logger = LazyModule(new MemTraceLogger(numLanes + 1)) + val coreSideLogger = LazyModule( + new MemTraceLogger(numLanes, loggerName = "coreside") + ) + val coal = LazyModule(new CoalescingUnit(numLanes)) + val memSideLogger = LazyModule(new MemTraceLogger(numLanes + 1, loggerName = "memside")) val rams = Seq.fill(numLanes + 1)( // +1 for coalesced edge LazyModule( // NOTE: beatBytes here sets the data bitwidth of the upstream TileLink @@ -1014,8 +1043,8 @@ class TLRAMCoalescerLogger(implicit p: Parameters) extends LazyModule { ) ) - logger.node :=* coal.node :=* driver.node - rams.foreach { r => r.node := logger.node } + memSideLogger.node :=* coal.node :=* coreSideLogger.node :=* driver.node + rams.foreach { r => r.node := memSideLogger.node } lazy val module = new Impl class Impl extends LazyModuleImp(this) with UnitTestModule { From 3f9f7a1d67f9d96abdb400578d01445bdfd44722 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Fri, 21 Apr 2023 16:14:18 -0700 Subject: [PATCH 31/37] Generate proper AccessAck/AccessAckData from response queue --- src/main/resources/csrc/SimMemTraceLogger.cc | 4 ++-- src/main/scala/tilelink/Coalescing.scala | 10 +++++++--- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/src/main/resources/csrc/SimMemTraceLogger.cc b/src/main/resources/csrc/SimMemTraceLogger.cc index 6df995c..1226143 100644 --- a/src/main/resources/csrc/SimMemTraceLogger.cc +++ b/src/main/resources/csrc/SimMemTraceLogger.cc @@ -88,8 +88,8 @@ extern "C" void memtracelogger_log(int handle, return; } - printf("%s: [%lu] valid: address=%lx, tid=%u, size=%d\n", __func__, - trace_log_cycle, trace_log_address, trace_log_lane_id, trace_log_size); + // printf("%s: [%lu] valid: address=%lx, tid=%u, size=%d\n", __func__, + // trace_log_cycle, trace_log_address, trace_log_lane_id, trace_log_size); MemTraceLine line{.valid = (trace_log_valid == 1), .cycle = static_cast(trace_log_cycle), 
diff --git a/src/main/scala/tilelink/Coalescing.scala b/src/main/scala/tilelink/Coalescing.scala index 2abb17f..27fbc6e 100644 --- a/src/main/scala/tilelink/Coalescing.scala +++ b/src/main/scala/tilelink/Coalescing.scala @@ -193,12 +193,16 @@ class CoalescingUnitImp(outer: CoalescingUnit, numLanes: Int) extends LazyModule tlIn.d.valid := respQueue.io.deq(respQueueNoncoalPort).valid val respHead = respQueue.io.deq(respQueueNoncoalPort).bits - // TODO: AccessAckData for Get - val respBits = edgeIn.AccessAck( + val apBits = edgeIn.AccessAck( toSource = respHead.source, - lgSize = 0.U, + lgSize = respHead.size + ) + val agBits = edgeIn.AccessAck( + toSource = respHead.source, + lgSize = respHead.size, data = respHead.data ) + val respBits = Mux(respHead.isStore, apBits, agBits) tlIn.d.bits := respBits // Debug only From 3ba566b9f7961ce154fc32fa7fcb1b67adc2bfde Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Fri, 21 Apr 2023 16:26:09 -0700 Subject: [PATCH 32/37] Give slack time after trace EOF before sim termination ... so that we make sure to receive all outstanding responses. This fixes the response traces being truncated too early. --- src/main/scala/tilelink/Coalescing.scala | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/src/main/scala/tilelink/Coalescing.scala b/src/main/scala/tilelink/Coalescing.scala index 27fbc6e..5259e36 100644 --- a/src/main/scala/tilelink/Coalescing.scala +++ b/src/main/scala/tilelink/Coalescing.scala @@ -760,18 +760,19 @@ class MemTraceDriverImp(outer: MemTraceDriver, numLanes: Int, traceFile: String) dontTouch(tlOut.d) } - io.finished := sim.io.trace_read.finished - when(io.finished) { - assert( - false.B, - "\n\n\nsimulation Successfully finished\n\n\n (this assertion intentional fail upon MemTracer termination)" - ) + // Give some slack time after trace EOF to the downstream system so that we + // make sure to receive all outstanding responses. 
+ val finishCounter = RegInit(200.U(64.W)) + when (sim.io.trace_read.finished) { + finishCounter := finishCounter - 1.U } - - // Clock Counter, for debugging purpose - val clkcount = RegInit(0.U(64.W)) - clkcount := clkcount + 1.U - dontTouch(clkcount) + io.finished := (finishCounter === 0.U) + // when(io.finished) { + // assert( + // false.B, + // "\n\n\nsimulation Successfully finished\n\n\n (this assertion intentional fail upon MemTracer termination)" + // ) + // } } class SimMemTrace(filename: String, numLanes: Int) From db06bda67484df69ac477a0efb4d71c0e7a77f69 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Fri, 21 Apr 2023 16:49:19 -0700 Subject: [PATCH 33/37] Count req/resp lines and bytes to coalescer and test match Note total bytes in requests and responses (i.e. traffic) don't need to match because redundant requests to the same address may get coalesced. --- src/main/scala/tilelink/Coalescing.scala | 42 +++++++++++++++++++++++- 1 file changed, 41 insertions(+), 1 deletion(-) diff --git a/src/main/scala/tilelink/Coalescing.scala b/src/main/scala/tilelink/Coalescing.scala index 5259e36..6119bd2 100644 --- a/src/main/scala/tilelink/Coalescing.scala +++ b/src/main/scala/tilelink/Coalescing.scala @@ -763,7 +763,7 @@ class MemTraceDriverImp(outer: MemTraceDriver, numLanes: Int, traceFile: String) // Give some slack time after trace EOF to the downstream system so that we // make sure to receive all outstanding responses.
val finishCounter = RegInit(200.U(64.W)) - when (sim.io.trace_read.finished) { + when(sim.io.trace_read.finished) { finishCounter := finishCounter - 1.U } io.finished := (finishCounter === 0.U) @@ -842,6 +842,22 @@ class MemTraceLogger( lazy val module = new Impl class Impl extends LazyModuleImp(this) { + val io = IO(new Bundle { + val numReqs = Output(UInt(64.W)) + val numResps = Output(UInt(64.W)) + val reqBytes = Output(UInt(64.W)) + val respBytes = Output(UInt(64.W)) + }) + + val numReqs = RegInit(0.U(64.W)) + val numResps = RegInit(0.U(64.W)) + val reqBytes = RegInit(0.U(64.W)) + val respBytes = RegInit(0.U(64.W)) + io.numReqs := numReqs + io.numResps := numResps + io.reqBytes := reqBytes + io.respBytes := respBytes + val simReq = if (reqEnable) Some(Module(new SimMemTraceLogger(false, s"${filename}.${loggerName}.req", numLanes))) @@ -924,6 +940,16 @@ class MemTraceLogger( // the entire bits. resp.address := 0.U resp.data := tlOut.d.bits.data + + // stats + when(req.valid) { + numReqs := numReqs + 1.U + reqBytes := reqBytes + (1.U << tlIn.a.bits.size) + } + when(resp.valid) { + numResps := numResps + 1.U + respBytes := respBytes + (1.U << tlOut.d.bits.size) + } } // Flatten per-lane signals to the Verilog blackbox input. 
@@ -1055,6 +1081,20 @@ class TLRAMCoalescerLogger(implicit p: Parameters) extends LazyModule { class Impl extends LazyModuleImp(this) with UnitTestModule { driver.module.io.start := io.start io.finished := driver.module.io.finished + + when(io.finished) { + printf( + "numReqs=%d, numResps=%d, reqBytes=%d, respBytes=%d\n", + coreSideLogger.module.io.numReqs, + coreSideLogger.module.io.numResps, + coreSideLogger.module.io.reqBytes, + coreSideLogger.module.io.respBytes + ) + assert( + coreSideLogger.module.io.numReqs === coreSideLogger.module.io.numResps, + "FAIL: number of requests and responses to the coalescer do not match" + ) + } } } From bbe62a8583b822d892f639deae123606c4d37a80 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Fri, 21 Apr 2023 17:23:51 -0700 Subject: [PATCH 34/37] Properly count per-lane req/resps --- src/main/scala/tilelink/Coalescing.scala | 31 +++++++++++++++--------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/src/main/scala/tilelink/Coalescing.scala b/src/main/scala/tilelink/Coalescing.scala index 6119bd2..bd0c1e1 100644 --- a/src/main/scala/tilelink/Coalescing.scala +++ b/src/main/scala/tilelink/Coalescing.scala @@ -273,6 +273,7 @@ class CoalescingUnitImp(outer: CoalescingUnit, numLanes: Int) extends LazyModule ) println(s"=========== table sourceWidth: ${sourceWidth}") + println(s"=========== table sizeBits: ${sizeBits}") newEntry.source := coalSourceId newEntry.lanes.foreach { l => @@ -674,7 +675,7 @@ class TraceLine extends Bundle with HasTraceLine { val source = UInt(32.W) val address = UInt(64.W) val is_store = Bool() - val size = UInt(32.W) // this is log2(bytesize) as in TL A bundle + val size = UInt(8.W) // this is log2(bytesize) as in TL A bundle val data = UInt(64.W) } @@ -940,18 +941,26 @@ class MemTraceLogger( // the entire bits. 
resp.address := 0.U resp.data := tlOut.d.bits.data - - // stats - when(req.valid) { - numReqs := numReqs + 1.U - reqBytes := reqBytes + (1.U << tlIn.a.bits.size) - } - when(resp.valid) { - numResps := numResps + 1.U - respBytes := respBytes + (1.U << tlOut.d.bits.size) - } } + // stats + val numReqsThisCycle = + laneReqs.map { l => Mux(l.valid, 1.U(64.W), 0.U(64.W)) }.reduce { (v0, v1) => v0 + v1 } + val numRespsThisCycle = + laneResps.map { l => Mux(l.valid, 1.U(64.W), 0.U(64.W)) }.reduce { (v0, v1) => v0 + v1 } + val reqBytesThisCycle = + laneReqs.map { l => Mux(l.valid, 1.U(64.W) << l.size, 0.U(64.W)) }.reduce { (b0, b1) => + b0 + b1 + } + val respBytesThisCycle = + laneResps.map { l => Mux(l.valid, 1.U(64.W) << l.size, 0.U(64.W)) }.reduce { (b0, b1) => + b0 + b1 + } + numReqs := numReqs + numReqsThisCycle + numResps := numResps + numRespsThisCycle + reqBytes := reqBytes + reqBytesThisCycle + respBytes := respBytes + respBytesThisCycle + // Flatten per-lane signals to the Verilog blackbox input. 
// // This is a clunky workaround of the fact that Chisel doesn't allow partial From 8a7e6f13918719cd1d1c8863adcabc004bee3c15 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Fri, 21 Apr 2023 18:20:16 -0700 Subject: [PATCH 35/37] Replace hardcoded trace widths with proper params --- src/main/resources/vsrc/SimMemTrace.v | 12 +++--- src/main/resources/vsrc/SimMemTraceLogger.v | 2 +- src/main/scala/tilelink/Coalescing.scala | 41 ++++++++++++++------- 3 files changed, 35 insertions(+), 20 deletions(-) diff --git a/src/main/resources/vsrc/SimMemTrace.v b/src/main/resources/vsrc/SimMemTrace.v index f21aeec..1fc858a 100644 --- a/src/main/resources/vsrc/SimMemTrace.v +++ b/src/main/resources/vsrc/SimMemTrace.v @@ -1,7 +1,7 @@ // FIXME hardcoded `define DATA_WIDTH 64 `define MAX_NUM_LANES 32 -`define LOGSIZE_WIDTH 32 +`define LOGSIZE_WIDTH 8 import "DPI-C" function void memtrace_init( input string filename @@ -41,7 +41,7 @@ module SimMemTrace #(parameter FILENAME = "undefined", NUM_LANES = 4) ( longint __in_address [NUM_LANES-1:0]; bit __in_is_store [NUM_LANES-1:0]; - int __in_size [NUM_LANES-1:0]; + reg [`LOGSIZE_WIDTH-1:0] __in_size [NUM_LANES-1:0]; longint __in_data [NUM_LANES-1:0]; bit __in_finished; @@ -57,10 +57,10 @@ module SimMemTrace #(parameter FILENAME = "undefined", NUM_LANES = 4) ( reg [NUM_LANES-1:0] __in_valid_reg; reg [`DATA_WIDTH-1:0] __in_address_reg [NUM_LANES-1:0]; - reg [NUM_LANES-1:0] __in_is_store_reg; - int __in_size_reg [NUM_LANES-1:0]; - reg [`DATA_WIDTH-1:0] __in_data_reg [NUM_LANES-1:0]; - reg __in_finished_reg; + reg [NUM_LANES-1:0] __in_is_store_reg; + reg [`LOGSIZE_WIDTH-1:0] __in_size_reg [NUM_LANES-1:0]; + reg [`DATA_WIDTH-1:0] __in_data_reg [NUM_LANES-1:0]; + reg __in_finished_reg; genvar g; diff --git a/src/main/resources/vsrc/SimMemTraceLogger.v b/src/main/resources/vsrc/SimMemTraceLogger.v index 2f6ddae..3ffa794 100644 --- a/src/main/resources/vsrc/SimMemTraceLogger.v +++ b/src/main/resources/vsrc/SimMemTraceLogger.v @@ -2,7 +2,7 @@ 
`define DATA_WIDTH 64 `define MAX_NUM_LANES 32 `define SOURCEID_WIDTH 32 -`define LOGSIZE_WIDTH 32 +`define LOGSIZE_WIDTH 8 import "DPI-C" function int memtracelogger_init( input bit is_response, diff --git a/src/main/scala/tilelink/Coalescing.scala b/src/main/scala/tilelink/Coalescing.scala index bd0c1e1..9ad1c5b 100644 --- a/src/main/scala/tilelink/Coalescing.scala +++ b/src/main/scala/tilelink/Coalescing.scala @@ -282,7 +282,7 @@ class CoalescingUnitImp(outer: CoalescingUnit, numLanes: Int) extends LazyModule r.valid := false.B r.source := i.U // FIXME bogus r.offset := 1.U - r.size := 2.U + r.size := 2.U // FIXME hardcoded } } newEntry.lanes(0).reqs(0).valid := true.B @@ -669,7 +669,8 @@ trait HasTraceLine { val data: UInt } -// used for both request and response. response had address set to 0 +// Used for both request and response. Response had address set to 0 +// NOTE: these widths have to agree with what's hardcoded in Verilog. class TraceLine extends Bundle with HasTraceLine { val valid = Bool() val source = UInt(32.W) @@ -693,14 +694,17 @@ class MemTraceDriverImp(outer: MemTraceDriver, numLanes: Int, traceFile: String) // back to each lane's. 
val laneReqs = Wire(Vec(numLanes, new TraceLine)) + val addrW = laneReqs(0).address.getWidth + val sizeW = laneReqs(0).size.getWidth + val dataW = laneReqs(0).data.getWidth laneReqs.zipWithIndex.foreach { case (req, i) => req.valid := sim.io.trace_read.valid(i) - // TODO: don't take source id from the original trace for now + // TODO: driver trace doesn't contain source id req.source := 0.U - req.address := sim.io.trace_read.address(64 * i + 63, 64 * i) + req.address := sim.io.trace_read.address(addrW * (i + 1) - 1, addrW * i) req.is_store := sim.io.trace_read.is_store(i) - req.size := sim.io.trace_read.size(32 * i + 31, 32 * i) - req.data := sim.io.trace_read.data(64 * i + 63, 64 * i) + req.size := sim.io.trace_read.size(sizeW * (i + 1) - 1, sizeW * i) + req.data := sim.io.trace_read.data(dataW * (i + 1) - 1, dataW * i) } // To prevent collision of sourceId with a current in-flight message, @@ -781,6 +785,11 @@ class SimMemTrace(filename: String, numLanes: Int) Map("FILENAME" -> filename, "NUM_LANES" -> numLanes) ) with HasBlackBoxResource { + val traceLineT = new TraceLine + val addrW = traceLineT.address.getWidth + val sizeW = traceLineT.size.getWidth + val dataW = traceLineT.data.getWidth + val io = IO(new Bundle { val clock = Input(Clock()) val reset = Input(Bool()) @@ -793,10 +802,10 @@ class SimMemTrace(filename: String, numLanes: Int) // Chisel can't interface with Verilog 2D port, so flatten all lanes into // single wide 1D array. // TODO: assumes 64-bit address. 
- val address = Output(UInt((64 * numLanes).W)) + val address = Output(UInt((addrW * numLanes).W)) val is_store = Output(UInt(numLanes.W)) - val size = Output(UInt((32 * numLanes).W)) - val data = Output(UInt((64 * numLanes).W)) + val size = Output(UInt((sizeW * numLanes).W)) + val data = Output(UInt((dataW * numLanes).W)) val finished = Output(Bool()) } }) @@ -1019,20 +1028,26 @@ class SimMemTraceLogger(isResponse: Boolean, filename: String, numLanes: Int) ) ) with HasBlackBoxResource { + val traceLineT = new TraceLine + val sourceW = traceLineT.source.getWidth + val addrW = traceLineT.address.getWidth + val sizeW = traceLineT.size.getWidth + val dataW = traceLineT.data.getWidth + val io = IO(new Bundle { val clock = Input(Clock()) val reset = Input(Bool()) val trace_log = new Bundle with HasTraceLine { val valid = Input(UInt(numLanes.W)) - val source = Input(UInt((32 * numLanes).W)) + val source = Input(UInt((sourceW * numLanes).W)) // Chisel can't interface with Verilog 2D port, so flatten all lanes into // single wide 1D array. // TODO: assumes 64-bit address. 
- val address = Input(UInt((64 * numLanes).W)) + val address = Input(UInt((addrW * numLanes).W)) val is_store = Input(UInt(numLanes.W)) - val size = Input(UInt((32 * numLanes).W)) - val data = Input(UInt((64 * numLanes).W)) + val size = Input(UInt((sizeW * numLanes).W)) + val data = Input(UInt((dataW * numLanes).W)) val ready = Output(Bool()) } }) From c5d722112b2fdfde5faaf5e47fd7d43c9adf478b Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Sat, 22 Apr 2023 14:58:02 -0700 Subject: [PATCH 36/37] Make word size global object --- src/main/scala/tilelink/Coalescing.scala | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/main/scala/tilelink/Coalescing.scala b/src/main/scala/tilelink/Coalescing.scala index 9ad1c5b..c159abb 100644 --- a/src/main/scala/tilelink/Coalescing.scala +++ b/src/main/scala/tilelink/Coalescing.scala @@ -55,16 +55,18 @@ class RespQueueEntry(val sourceWidth: Int, val dataWidthInBits: Int, val sizeWid val data = UInt(dataWidthInBits.W) // read data } +// 32-bit system +object WordSizeInBytes { + def apply(): Int = 4 +} + class CoalescingUnitImp(outer: CoalescingUnit, numLanes: Int) extends LazyModuleImp(outer) { // Make sure IdentityNode is connected to an upstream node, not just the // coalescer TL master node assert(outer.node.in.length >= 2) - // 32-bit system. FIXME hardcoded - val wordSize = 4 - val reqQueueDepth = 1 - val respQueueDepth = 4 // FIXME test + val respQueueDepth = 4 // TODO: hardcoded val sourceWidth = outer.node.in(1)._1.params.sourceBits val addressWidth = outer.node.in(1)._1.params.addressBits @@ -78,7 +80,7 @@ class CoalescingUnitImp(outer: CoalescingUnit, numLanes: Int) extends LazyModule // coalesced request. Upper bound is request queue depth. 
val numPerLaneReqs = 1 - val respQueueEntryT = new RespQueueEntry(sourceWidth, wordSize * 8, sizeWidth) + val respQueueEntryT = new RespQueueEntry(sourceWidth, WordSizeInBytes() * 8, sizeWidth) val respQueues = Seq.tabulate(numLanes) { _ => Module( new MultiPortQueue( @@ -348,8 +350,6 @@ class UncoalescingUnit( val inflightTable = Module( new InflightCoalReqTable(numLanes, numPerLaneReqs, sourceWidth, numInflightCoalRequests) ) - val wordSize = 4 // FIXME duplicate - val io = IO(new Bundle { val coalReqValid = Input(Bool()) val newEntry = Input(inflightTable.entryT) @@ -359,7 +359,7 @@ class UncoalescingUnit( val uncoalResps = Output( Vec( numLanes, - Vec(numPerLaneReqs, ValidIO(new RespQueueEntry(sourceWidth, wordSize * 8, sizeWidth))) + Vec(numPerLaneReqs, ValidIO(new RespQueueEntry(sourceWidth, WordSizeInBytes() * 8, sizeWidth))) ) ) }) From 90a797e71a72b34a772af2c4c53fbf60e4a019b7 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Sat, 22 Apr 2023 21:04:45 -0700 Subject: [PATCH 37/37] Make MemTraceDriver generate word-sized requests Try to model the core's behavior which accesses cache in word granularity. This also simplifies the coalescer design as coalescer no longer needs to uncoalesce response data chunk into single bytes (and therefore fewer muxes). --- src/main/scala/tilelink/Coalescing.scala | 56 ++++++++++++++++++------ 1 file changed, 43 insertions(+), 13 deletions(-) diff --git a/src/main/scala/tilelink/Coalescing.scala b/src/main/scala/tilelink/Coalescing.scala index c159abb..1a06db0 100644 --- a/src/main/scala/tilelink/Coalescing.scala +++ b/src/main/scala/tilelink/Coalescing.scala @@ -520,7 +520,6 @@ class InflightCoalReqTableEntry( val size = UInt(sizeBits.W) } class PerLane extends Bundle { - // FIXME: if numPerLaneReqs != 2 ** sourceWidth, we need to store srcId as well val reqs = Vec(numPerLaneReqs, new PerCoreReq) } // sourceId of the coalesced response that just came back. 
This will be the @@ -674,7 +673,7 @@ trait HasTraceLine { class TraceLine extends Bundle with HasTraceLine { val valid = Bool() val source = UInt(32.W) - val address = UInt(64.W) + val address = UInt(64.W) // FIXME: in Verilog this is the same as data width val is_store = Bool() val size = UInt(8.W) // this is log2(bytesize) as in TL A bundle val data = UInt(64.W) @@ -683,9 +682,7 @@ class TraceLine extends Bundle with HasTraceLine { class MemTraceDriverImp(outer: MemTraceDriver, numLanes: Int, traceFile: String) extends LazyModuleImp(outer) with UnitTestModule { - val sim = Module( - new SimMemTrace(traceFile, numLanes) - ) + val sim = Module(new SimMemTrace(traceFile, numLanes)) sim.io.clock := clock sim.io.reset := reset.asBool sim.io.trace_read.ready := true.B @@ -720,21 +717,54 @@ class MemTraceDriverImp(outer: MemTraceDriver, numLanes: Int, traceFile: String) Cat(8.U(4.W), addr(27, 0)) } - // Generate TL requests according to the trace line. + // Generate TL requests corresponding to the trace lines (outer.laneNodes zip laneReqs).foreach { case (node, req) => - val (tlOut, edge) = node.out(0) + // Core only makes accesses of granularity larger than a word, so we want + // the trace driver to act so as well. + // That means if req.size is smaller than word size, we need to pad data + // with zeros to generate a word-size request, and set mask accordingly. 
+ val offsetInWord = req.address % WordSizeInBytes().U + val subword = req.size < log2Ceil(WordSizeInBytes()).U + val mask = Wire(UInt(WordSizeInBytes().W)) + val wordData = Wire(UInt((WordSizeInBytes() * 8).W)) + val sizeInBytes = Wire(UInt((sizeW + 1).W)) + sizeInBytes := (1.U) << req.size + mask := Mux(subword, (~((~0.U(64.W)) << sizeInBytes)) << offsetInWord, ~0.U) + wordData := Mux(subword, req.data << (offsetInWord * 8.U), req.data) + val wordAlignedAddress = req.address & ~((1 << log2Ceil(WordSizeInBytes())) - 1).U(addrW.W) + + assert( + req.size <= log2Ceil(WordSizeInBytes()).U, + s"trace driver currently does not support access sizes larger than word size (${WordSizeInBytes()})" + ) + val wordAlignedSize = 2.U // FIXME: hardcoded + + // when(req.valid && subword) { + // printf( + // "address=%x, size=%d, data=%x, addressMask=%x, wordAlignedAddress=%x, mask=%x, wordData=%x\n", + // req.address, + // req.size, + // req.data, + // ~((1 << log2Ceil(WordSizeInBytes())) - 1).U(addrW.W), + // wordAlignedAddress, + // mask, + // wordData + // ) + // } + + val (tlOut, edge) = node.out(0) val (plegal, pbits) = edge.Put( fromSource = sourceIdCounter, - toAddress = hashToValidPhyAddr(req.address), - lgSize = req.size, // trace line already holds log2(size) + toAddress = hashToValidPhyAddr(wordAlignedAddress), + lgSize = wordAlignedSize, // trace line already holds log2(size) // data should be aligned to beatBytes - data = (req.data << (8.U * (req.address % edge.manager.beatBytes.U))) + data = (wordData << (8.U * (wordAlignedAddress % edge.manager.beatBytes.U))) ) val (glegal, gbits) = edge.Get( fromSource = sourceIdCounter, - toAddress = hashToValidPhyAddr(req.address), - lgSize = req.size + toAddress = hashToValidPhyAddr(wordAlignedAddress), + lgSize = wordAlignedSize ) val legal = Mux(req.is_store, plegal, glegal) val bits = Mux(req.is_store, pbits, gbits) @@ -796,7 +826,7 @@ class SimMemTrace(filename: String, numLanes: Int) // These names have to match 
declarations in the Verilog code, eg. // trace_read_address. - val trace_read = new Bundle { // FIXME: can't use HasTraceLine because this doesn't have source + val trace_read = new Bundle { // can't use HasTraceLine because this doesn't have source val ready = Input(Bool()) val valid = Output(UInt(numLanes.W)) // Chisel can't interface with Verilog 2D port, so flatten all lanes into