diff --git a/src/main/scala/tilelink/Coalescing.scala b/src/main/scala/tilelink/Coalescing.scala
index 3ca3d4a..2d3b118 100644
--- a/src/main/scala/tilelink/Coalescing.scala
+++ b/src/main/scala/tilelink/Coalescing.scala
@@ -130,6 +130,20 @@ class RespQueueEntry(sourceWidth: Int, sizeWidth: Int, maxSize: Int) extends Bun
   val error = Bool()
 }
 
+// Generates request source IDs from a counter that advances on every cycle `gen` is high.
+class ReqSourceGen(sourceWidth: Int) extends Module {
+  val io = IO(new Bundle {
+    val gen = Input(Bool()) // when high, the counter moves on to the next ID
+    val id = Output(Valid(UInt(sourceWidth.W))) // current ID; valid is tied high for now
+  })
+  // Current ID; holds its value whenever `gen` is low.
+  val head = RegInit(UInt(sourceWidth.W), 0.U)
+  when (io.gen) { head := head + 1.U }
+
+  // FIXME: keep track of ones in use & set invalid when out
+  io.id.bits := head
+  io.id.valid := true.B
+}
 
 // A shift-register queue implementation that supports invalidating entries
 // and exposing queue contents as output IO. (TODO: support deadline)
@@ -151,7 +165,7 @@ class CoalShiftQueue[T <: Data](
     // coalescer because it has potentially expensive PopCount
   })
 
-  private val valid = RegInit(VecInit(Seq.fill(entries) { false.B }))
+  val valid = RegInit(VecInit(Seq.fill(entries) { false.B }))
   // "Used" flag is 1 for every entry between the current queue head and tail,
   // even if that entry has been invalidated:
   //
@@ -224,44 +238,166 @@ class CoalShiftQueue[T <: Data](
 }
 
 // Software model: coalescer.py
-class MonoCoalescer[T <: Data](coalSize: Int, coalWindow: Seq[CoalShiftQueue[T]],
-                               config: CoalescerConfig) extends Module {
+// For one coalescing size: picks the lane whose head request ("leader") matches the most window entries.
+class MonoCoalescer(coalSize: Int, windowT: CoalShiftQueue[ReqQueueEntry],
+                    config: CoalescerConfig) extends Module {
   val io = IO(new Bundle {
-    val leader_idx = Output(UInt(log2Ceil(config.NUM_LANES).W))
-    val base_addr = Output(UInt(config.ADDR_WIDTH.W))
-    val match_oh = Output(Vec(config.NUM_LANES, UInt(config.DEPTH.W)))
-    val coverage_hits = Output(UInt((1 << config.MAX_SIZE).W))
+    val window = Input(Vec(config.NUM_LANES, windowT.io.cloneType))
+    val results = Output(new Bundle {
+      val leaderIdx = Output(UInt(log2Ceil(config.NUM_LANES).W))
+      val baseAddr = Output(UInt(config.ADDR_WIDTH.W))
+      val matchOH = Output(Vec(config.NUM_LANES, UInt(config.DEPTH.W)))
+      // +1 so that the all-entries-match count NUM_LANES * DEPTH is representable
+      val matchCount = Output(UInt(log2Ceil(config.NUM_LANES * config.DEPTH + 1).W))
+      val coverageHits = Output(UInt((1 << config.MAX_SIZE).W))
+    })
   })
 
   io := DontCare
 
   val size = coalSize
-  val mask = ((1 << config.ADDR_WIDTH - 1) - (1 << size - 1)).U
-  val window = coalWindow
+  // Keeps address bits [ADDR_WIDTH-1:size]. Fully parenthesized because `-` binds tighter
+  // than `<<` in Scala; BigInt avoids Int overflow for wide addresses.
+  val mask = (((BigInt(1) << config.ADDR_WIDTH) - 1) - ((BigInt(1) << size) - 1)).U
 
-  def can_match(req0: Valid[ReqQueueEntry], req1: Valid[ReqQueueEntry]): Bool = {
-    (req0.bits.op === req1.bits.op) &&
-    (req0.valid && req1.valid) &&
-    ((req0.bits.address & this.mask) === (req1.bits.address & this.mask))
+  // True when both requests are valid, share the same op, and agree on all masked address bits.
+  def canMatch(req0: ReqQueueEntry, req0v: Bool, req1: ReqQueueEntry, req1v: Bool): Bool = {
+    (req0.op === req1.op) &&
+    (req0v && req1v) &&
+    ((req0.address & this.mask) === (req1.address & this.mask))
   }
 
   // combinational logic to drive output from window contents
+  val leaders = io.window.map(_.elts.head)
+  val leadersValid = io.window.map(_.mask.asBools.head)
 
-  val leaders = coalWindow.map(_.io.elts.head)
+  // TODO: match leader to only lanes >= leader idx
+  val matches = leaders.zip(leadersValid).map { case (leader, leaderValid) =>
+    io.window.map { lane =>
+      lane.elts.zip(lane.mask.asBools).map { case (req, v) => canMatch(req, v, leader, leaderValid) }
+    }
+  }
+
+  // One PopCount over all lanes: summing per-lane PopCounts with `+` truncates to the per-lane width.
+  val matchCounts = matches.map(leader => PopCount(leader.flatten))
+  val canCoalesce = matchCounts.map(_ > 1.U)
+
+  // TODO: maybe use round robin arbiter instead of argmax to pick leader
+  val chosenLeaderIdx = matchCounts.zipWithIndex.map { case (a, b) => (a, b.U) }
+    .reduce[(UInt, UInt)] { case ((a, i), (b, j)) => (Mux(a > b, a, b), Mux(a > b, i, j)) }._2
+
+  val chosenLeader = VecInit(leaders)(chosenLeaderIdx)
+  val chosenMatches = VecInit(matches.map(leader => VecInit(leader.map(VecInit(_).asUInt))))(chosenLeaderIdx)
+  val chosenMatchCount = VecInit(matchCounts)(chosenLeaderIdx)
+
+  // coverage calculation
+  // NOTE(review): considers every window entry, whether valid or matched or not -- confirm this is intended
+  def getOffsetSlice(addr: UInt) = addr(size - 1, config.WORD_WIDTH)
+  val offsets = io.window.map(_.elts).flatMap(_.map(req => getOffsetSlice(req.address)))
+  val hits = Seq.tabulate(1 << (size - config.WORD_WIDTH)) { t => offsets.map(_ === t.U).reduce(_ || _) }
+
+  io.results.leaderIdx := chosenLeaderIdx
+  io.results.baseAddr := chosenLeader.address & mask
+  io.results.matchOH := chosenMatches
+  io.results.matchCount := chosenMatchCount
+  io.results.coverageHits := PopCount(hits)
 }
 
 // Software model: coalescer.py
-class MultiCoalescer[T <: Data]
-    (sizes: Seq[Int], window: Seq[CoalShiftQueue[T]], coalReqT: ReqQueueEntry,
-     config: CoalescerConfig) extends Module {
+// Runs one MonoCoalescer per entry of config.COAL_SIZES on the same window, selects one of
+// their results, and emits the coalesced request together with the entries to invalidate.
+class MultiCoalescer(windowT: CoalShiftQueue[ReqQueueEntry], coalReqT: ReqQueueEntry,
+                     config: CoalescerConfig) extends Module {
 
-  val coalescers = sizes.map(size => Module(new MonoCoalescer(size, window, config)))
   val io = IO(new Bundle {
-    val out_req = Output(Valid(coalReqT.cloneType))
+    val window = Input(Vec(config.NUM_LANES, windowT.io.cloneType))
+    val out_req = DecoupledIO(coalReqT.cloneType)
     val invalidate = Output(Valid(Vec(config.NUM_LANES, UInt(config.DEPTH.W))))
   })
 
-  io := DontCare
+  val coalescers = config.COAL_SIZES.map(size => Module(new MonoCoalescer(size, windowT, config)))
+  coalescers.foreach(_.io.window := io.window)
+
+  // Left-shifts each per-size value by (MAX_SIZE - size) before values of different sizes are compared.
+  def normalize(x: Seq[UInt]): Seq[UInt] = x.zip(config.COAL_SIZES).map { case (hits, size) =>
+    (hits << (config.MAX_SIZE - size).U).asUInt
+  }
+
+  // Index of the largest element of x; on a tie the later index wins.
+  def argMax(x: Seq[UInt]): UInt = {
+    x.zipWithIndex.map { case (a, b) => (a, b.U) }.reduce[(UInt, UInt)] { case ((a, i), (b, j)) =>
+      (Mux(a > b, a, b), Mux(a > b, i, j)) // TODO: tie-breaker
+    }._2
+  }
+
+  val normalizedMatches = normalize(coalescers.map(_.io.results.matchCount))
+  val normalizedHits = normalize(coalescers.map(_.io.results.coverageHits))
+
+  val chosenIdx = Wire(UInt(log2Ceil(config.COAL_SIZES.size).W))
+  val chosenValid = Wire(Bool())
+  // minimum 25% coverage
+  val minCoverage = 1.max(1 << (config.MAX_SIZE - 4))
+  when (normalizedHits.map(_ > minCoverage.U).reduce(_ || _)) {
+    chosenIdx := argMax(normalizedHits)
+    chosenValid := true.B
+  }.elsewhen(normalizedMatches.map(_ > 1.U).reduce(_ || _)) {
+    chosenIdx := argMax(normalizedMatches)
+    chosenValid := true.B
+  }.otherwise {
+    chosenIdx := DontCare
+    chosenValid := false.B
+  }
+
+  // create coalesced request
+  val chosenBundle = VecInit(coalescers.map(_.io.results))(chosenIdx)
+  val chosenSize = VecInit(coalescers.map(_.size.U))(chosenIdx)
+
+  // flatten requests and matches
+  val flatReqs = io.window.flatMap(_.elts)
+  val flatMatches = chosenBundle.matchOH.flatMap(_.asBools)
+
+  // check for word alignment in addresses
+  // every valid entry must be aligned (hence `&&`); entries whose mask bit is clear are ignored
+  assert(io.window.flatMap(lane => lane.elts.zip(lane.mask.asBools).map { case (req, v) =>
+      !v || (req.address(config.WORD_WIDTH - 1, 0) === 0.U)
+    }).reduce(_ && _),
+    "one or more addresses used for coalescing is not word-aligned")
+
+  // note: this is word-level coalescing. if finer granularity is needed, need to modify code
+  val numWords = (1.U << (chosenSize - config.WORD_WIDTH.U)).asUInt
+  val maxWords = 1 << (config.MAX_SIZE - config.WORD_WIDTH)
+  val addrMask = Wire(UInt(config.MAX_SIZE.W))
+  addrMask := (1.U << chosenSize).asUInt - 1.U
+
+  val data = Wire(Vec(maxWords, UInt((config.WORD_SIZE * 8).W)))
+  val mask = Wire(Vec(maxWords, UInt(config.WORD_SIZE.W)))
+
+  for (i <- 0 until maxWords) {
+    // word index = byte offset within the chunk, shifted right by WORD_WIDTH
+    val sel = flatReqs.zip(flatMatches).map { case (req, m) =>
+      m && (((req.address(config.MAX_SIZE - 1, 0) & addrMask) >> config.WORD_WIDTH) === i.U)
+    }
+    // TODO: SW uses priority encoder, not sure about behavior of MuxCase
+    data(i) := MuxCase(DontCare, flatReqs.zip(sel).map { case (req, s) => s -> req.data })
+    // TODO: Do we care about masks at positions > size specified? if not, use DontCare
+    mask(i) := Mux(i.U < numWords,
+      MuxCase(0.U, flatReqs.zip(sel).map { case (req, s) => s -> req.mask }),
+      0.U)
+  }
+
+  val sourceGen = Module(new ReqSourceGen(log2Ceil(config.NUM_NEW_IDS)))
+  sourceGen.io.gen := io.out_req.fire // use up a source ID only when request is created
+
+  io.out_req.bits.source := sourceGen.io.id.bits
+  io.out_req.bits.mask := mask.asUInt
+  io.out_req.bits.data := data.asUInt
+  io.out_req.bits.size := chosenSize
+  io.out_req.bits.address := chosenBundle.baseAddr
+  io.out_req.bits.op := VecInit(io.window.map(_.elts.head))(chosenBundle.leaderIdx).op
+  io.out_req.valid := chosenValid && sourceGen.io.id.valid
+
+  io.invalidate.bits := chosenBundle.matchOH
+  io.invalidate.valid := io.out_req.fire // invalidate only when fire
 }
 
 class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends LazyModuleImp(outer) {
@@ -281,7 +417,8 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends
   }
 
   val coalReqT = new ReqQueueEntry(sourceWidth, log2Ceil(config.MAX_SIZE), config.ADDR_WIDTH, config.MAX_SIZE)
-  val coalescer = Module(new MultiCoalescer(config.COAL_SIZES, reqQueues, coalReqT, config))
+  val coalescer = Module(new MultiCoalescer(reqQueues.head, coalReqT, config))
+  coalescer.io.window := reqQueues.map(_.io)
 
   // Per-lane request and response queues
   //
@@ -332,6 +469,7 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends
 
   tlCoal.a.valid := coalescer.io.out_req.valid
   tlCoal.a.bits := coalescer.io.out_req.bits.toTLA(edgeCoal)
+  coalescer.io.out_req.ready := tlCoal.a.ready
   tlCoal.b.ready := true.B
   tlCoal.c.valid := false.B
   // tlCoal.d.ready := true.B // this should be connected to uncoalescer's ready, done below.