diff --git a/src/main/scala/radiance/memory/Coalescing.scala b/src/main/scala/radiance/memory/Coalescing.scala index 0b592b1..876bf6c 100644 --- a/src/main/scala/radiance/memory/Coalescing.scala +++ b/src/main/scala/radiance/memory/Coalescing.scala @@ -954,7 +954,7 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) // This is the final coalesced request. val coalReq = inflightTable.io.outCoalReq // downstream backpressure on the coalesced edge - // TODO: custom <>? + // @cleanup: custom <>? inflightTable.io.outCoalReq.ready := tlCoal.a.ready tlCoal.a.valid := coalReq.valid val (legal, tlBits) = coalReq.bits.toTLA(edgeCoal) @@ -1078,9 +1078,10 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) // Connect lookup result from InflightTable uncoalescer.io.inflightLookup <> inflightTable.io.lookup - // InflightTable IO: Look up the table with incoming coalesced responses - // @cleanup: this should be done inside uncoalescer - inflightTable.io.lookupSourceId := tlCoal.d.bits.source + // Look up the inflight table with incoming coalesced responses + // @cleanup: would be cleaner if inflightTable lookup is contained inside + // uncoalescer + inflightTable.io.lookupSourceId := coalSourceGen.io.inResp.bits.source // Connect uncoalescer results back into response queue (respQueues zip uncoalescer.io.respQueueIO).zipWithIndex.foreach @@ -1132,17 +1133,6 @@ class Uncoalescer( ) }) - // Lookup table as soon as a coalesced response fires - // @perf: might result in long ready chain? - io.inflightLookup.ready := io.coalResp.fire - - // Only accept coalesced response when all enq ports of the response queue - // are ready. This is necessary because uncoalescing logic is a - // combinational logic that produces all the split responses at the same - // cycle, so it needs to be guaranteed that all of them has somewhere to go. - val allRespQueueEnqReady = io.respQueueIO.map(_.map(_.ready).reduce(_ && _)).reduce(_ && _) - io.coalResp.ready := allRespQueueEnqReady - // Un-coalescing logic // def getCoalescedDataChunk(data: UInt, dataWidth: Int, offset: UInt, logSize: UInt): UInt = { @@ -1165,32 +1155,59 @@ class Uncoalescer( chunks(offset) // MUX } - // Un-coalesce responses back to individual lanes - // Connect uncoalesced results back into each lane's response queue - val foundRow = io.inflightLookup.bits - (foundRow.lanes zip io.respQueueIO).zipWithIndex.foreach { case ((foundLane, enqIOs), lane) => - foundLane.reqs.zipWithIndex.foreach { case (foundReq, depth) => + // Pipeline registers for the inflight table lookup result, and the coalesced + // response itself. We cut timing here expecting that the table lookup + // will take up a long path. + val coalRespPipeReg = Module(new Queue(chiselTypeOf(io.coalResp.bits), 1, pipe = true)) + coalRespPipeReg.io.enq <> io.coalResp + val tablePipeReg = Module(new Queue(chiselTypeOf(io.inflightLookup.bits), 1, pipe = true)) + tablePipeReg.io.enq <> io.inflightLookup + tablePipeReg.io.enq.valid := io.inflightLookup.fire + + // inflightTable looks up as soon as ready signal goes up, assuming + // io.lookupSourceId is valid, so need to be careful when we assert ready by + // checking both if we have space in the pipeline register, and if there is a + // valid response on the channel + io.inflightLookup.ready := tablePipeReg.io.enq.ready && io.coalResp.fire + + // Only proceed uncoalescing when all enq ports of the response queue are + // ready. This is necessary because uncoalescing logic is a combinational + // logic that produces all the split responses at the same cycle, so it needs + // to be guaranteed that all of them has somewhere to go. + val allRespQueueEnqReady = io.respQueueIO.map(_.map(_.ready).reduce(_ && _)).reduce(_ && _) + tablePipeReg.io.deq.ready := allRespQueueEnqReady + coalRespPipeReg.io.deq.ready := allRespQueueEnqReady + + assert(tablePipeReg.io.enq.fire === coalRespPipeReg.io.enq.fire, + "enqueue timing for uncoalescer pipeline registers out-of-sync!") + assert(tablePipeReg.io.deq.fire === coalRespPipeReg.io.deq.fire, + "dequeue timing for uncoalescer pipeline registers out-of-sync!") + + // Un-coalesce responses back to individual lanes. Connect uncoalesced + // results back into each lane's response queue. + val tableRow = tablePipeReg.io.deq + (io.respQueueIO zip tableRow.bits.lanes).zipWithIndex.foreach { case ((enqIOs, lane), laneNum) => + lane.reqs.zipWithIndex.foreach { case (req, depth) => val enqIO = enqIOs(depth) - // spatial-only coalescing: only looking at 0th srcId entry enqIO.valid := false.B enqIO.bits := DontCare // debug // when (resp.valid) { - // printf(s"${i}-th uncoalesced response came back from lane ${lane}\n") + // printf(s"${i}-th uncoalesced response came back from lane ${laneNum}\n") // } // dontTouch(q.io.enq(respQueueCoalPortOffset)) - when(io.inflightLookup.valid && foundReq.valid) { - enqIO.valid := io.coalResp.fire && foundReq.valid - enqIO.bits.op := foundReq.op - enqIO.bits.source := foundReq.source - val logSize = foundRow.sizeEnumT.enumToLogSize(foundReq.sizeEnum) + when(tableRow.valid && req.valid) { + enqIO.valid := tableRow.fire && req.valid + enqIO.bits.op := req.op + enqIO.bits.source := req.source + val logSize = tableRow.bits.sizeEnumT.enumToLogSize(req.sizeEnum) enqIO.bits.size := logSize enqIO.bits.data := getCoalescedDataChunk( - io.coalResp.bits.data, - io.coalResp.bits.data.getWidth, - foundReq.offset, + coalRespPipeReg.io.deq.bits.data, + coalRespPipeReg.io.deq.bits.data.getWidth, + req.offset, logSize ) // is this necessary? @@ -1219,16 +1236,9 @@ class InFlightTable( config.maxCoalLogSize, // FIXME: offsetBits? config.sizeEnum ) - val entries = config.numNewSrcIds val sourceWidth = log2Ceil(config.numOldSrcIds) - println(s"CoalescingUnit InFlightTable config: {") - println(s" sourceWidth: ${sourceWidth}") - println(s" offsetBits: ${offsetBits}") - println(s" sizeEnumBits: ${entryT.sizeEnumT.getWidth}") - println(s"}") - val io = IO(new Bundle { // Enqueue IO // @@ -1258,6 +1268,12 @@ class InFlightTable( val lookupSourceId = Input(UInt(sourceWidth.W)) }) + println(s"CoalescingUnit InFlightTable config: {") + println(s" sourceWidth: ${sourceWidth}") + println(s" offsetBits: ${offsetBits}") + println(s" sizeEnumBits: ${entryT.sizeEnumT.getWidth}") + println(s"}") + val table = Mem( entries, new Bundle { @@ -1339,12 +1355,14 @@ class InFlightTable( // Lookup logic io.lookup.valid := table(io.lookupSourceId).valid io.lookup.bits := table(io.lookupSourceId).bits - // Dequeue as soon as lookup succeeds - when(io.lookup.fire) { - // every lookup to the table should succeed as the request should have - // gotten recorded earlier than the response + // every lookup to the table should succeed as the request should have + // gotten recorded earlier than the response + when(io.lookup.ready) { assert(table(io.lookupSourceId).valid === true.B, "table lookup with a valid sourceId failed") + } + // Dequeue as soon as lookup succeeds + when(io.lookup.fire) { table(io.lookupSourceId).valid := false.B } assert(