package freechips.rocketchip.tilelink

import chisel3._
import chisel3.util._
import chisel3.experimental._
import freechips.rocketchip.diplomacy._
import freechips.rocketchip.tilelink._
import org.chipsalliance.cde.config.{Parameters, Field}

case object VortexL1Key extends Field[Option[VortexL1Config]](None /*default*/ )

case class VortexL1Config(
    numBanks: Int,
    wordSize: Int, // This is the read/write granularity of the L1 cache
    cacheLineSize: Int,
    coreTagWidth: Int,
    writeInfoReqQSize: Int,
    mshrSize: Int,
    memSideSourceIds: Int,
    uncachedAddrSets: Seq[AddressSet]
) {
  def coreTagPlusSizeWidth: Int = {
    log2Ceil(wordSize) + coreTagWidth
  }
  // NOTE: This assertion depends on the fact that the Vortex cache is
  // configured to have 1 bank, and that it uses MSHR id as the tag of
  // memory-side requests.  Otherwise, it will append bank id to the tag as
  // well and break this requirement.
  require(
    mshrSize == memSideSourceIds,
    "MSHR size must match the number of sourceIds to downstream."
  )
}

object defaultVortexL1Config
    extends VortexL1Config(
      numBanks = 4,
      wordSize = 16,
      cacheLineSize = 16,
      coreTagWidth = 8,
      writeInfoReqQSize = 16,
      mshrSize = 8,
      memSideSourceIds = 8,
      // Don't cache CLINT region to ensure coherent access
      uncachedAddrSets = Seq(AddressSet(0x2000000L, 0xffffL))
    )

class VortexL1Cache(config: VortexL1Config)(implicit p: Parameters)
    extends LazyModule {
  val banks = Seq.tabulate(config.numBanks) { bankId =>
    // helps with name mangling in Verilog
    val bank = LazyModule(new VortexBank(config, bankId))
    bank
  }
  // passthrough
  val passThrough = LazyModule(new VortexBankPassThrough(config))

  // visibility node that exposes to upstream
  val coresideNode = TLIdentityNode()

  // core-side crossbar that arbitrates core requests to banks
  protected val bankXbar = LazyModule(new TLXbar)
  bankXbar.node :=* coresideNode
  banks.foreach { _.coresideNode :=* bankXbar.node }
  passThrough.coresideNode :=* bankXbar.node

  // master node that exposes to and drives the downstream
  val masterNode = TLIdentityNode()
  banks.foreach { masterNode := _.vxCacheToL2Node }
  masterNode := passThrough.vxCacheToL2Node

  lazy val module = new LazyModuleImp(this)
}

// TODO: Make this a Blocking Module
class VortexBankPassThrough(config: VortexL1Config)(implicit p: Parameters)
    extends LazyModule {
  // Slave node to upstream
  val managerParam = Seq(
    TLSlavePortParameters.v1(
      beatBytes = config.wordSize,
      managers = Seq(
        TLSlaveParameters.v1(
          address = config.uncachedAddrSets,
          regionType = RegionType.IDEMPOTENT,
          executable = false,
          supportsGet = TransferSizes(1, config.wordSize),
          supportsPutPartial = TransferSizes(1, config.wordSize),
          supportsPutFull = TransferSizes(1, config.wordSize),
          fifoId = Some(0)
        )
      )
    )
  )

  // Master node to downstream
  val clientParam = Seq(
    TLMasterPortParameters.v1(
      clients = Seq(
        TLMasterParameters.v1(
          name = "VortexBank",
          sourceId = IdRange(
            0,
            1 << (log2Ceil(
              config.memSideSourceIds
            ) + 5 /*FIXME: give more sourceId so that passthrough doesn't block; hacky*/ )
          ),
          supportsProbe = TransferSizes(1, config.wordSize),
          supportsGet = TransferSizes(1, config.wordSize),
          supportsPutFull = TransferSizes(1, config.wordSize),
          supportsPutPartial = TransferSizes(1, config.wordSize)
        )
      )
    )
  )

  val coresideNode = TLManagerNode(managerParam)
  val vxCacheFetchNode = TLClientNode(clientParam)
  val vxCacheToL2Node = TLIdentityNode()
  vxCacheToL2Node := TLWidthWidget(config.cacheLineSize) := vxCacheFetchNode

  // passthrough logic
  lazy val module = new LazyModuleImp(this) {
    val (upstream, _) = coresideNode.in(0)
    val (downstream, _) = vxCacheFetchNode.out(0)

    downstream.a <> upstream.a
    upstream.d <> downstream.d
  }
}

class VortexBank(
    config: VortexL1Config,
    bankId: Int,
)(implicit p: Parameters)
    extends LazyModule {
  // Generate AddressSet by excluding Addr we don't want
  def generateAddressSets(): Seq[AddressSet] = {
    // suppose have 4 bank
    // base for bank 1: ...000000|01|0000
    // mask for bank 1;    111111|00|1111
    val mask = 0xffffffffL ^ ((config.numBanks - 1) * config.wordSize)
    val base = 0x00000000L | (bankId * config.wordSize)

    val excludeSets = config.uncachedAddrSets
    var remainingSets: Seq[AddressSet] = Seq(AddressSet(base, mask))
    for (excludeSet <- excludeSets) {
      remainingSets = remainingSets.flatMap(_.subtract(excludeSet))
    }
    remainingSets
  }

  // Slave node to upstream
  val managerParam = Seq(
    TLSlavePortParameters.v1(
      beatBytes = config.wordSize,
      managers = Seq(
        TLSlaveParameters.v1(
          address = generateAddressSets(),
          regionType = RegionType.IDEMPOTENT, // idk what this does
          executable = false,
          supportsGet = TransferSizes(1, config.wordSize),
          supportsPutPartial = TransferSizes(1, config.wordSize),
          supportsPutFull = TransferSizes(1, config.wordSize),
          fifoId = Some(0)
        )
      )
    )
  )

  // Master node to downstream
  val clientParam = Seq(
    TLMasterPortParameters.v1(
      clients = Seq(
        TLMasterParameters.v1(
          name = "VortexBank",
          sourceId = IdRange(0, config.memSideSourceIds),
          supportsProbe = TransferSizes(1, config.wordSize),
          supportsGet = TransferSizes(1, config.wordSize),
          supportsPutFull = TransferSizes(1, config.wordSize),
          supportsPutPartial = TransferSizes(1, config.wordSize)
        )
      )
    )
  )

  // Core -> VxCache
  val coresideNode = TLManagerNode(managerParam)
  val vxCacheToL2Node = TLIdentityNode()
  val vxCacheFetchNode = TLClientNode(clientParam)

  // We need this widthWidget here, because whenever the bank is performing
  // read and write to Mem, it must have the illusion that dataWidth is as big
  // as as its cacheline size
  vxCacheToL2Node := TLWidthWidget(config.cacheLineSize) := vxCacheFetchNode
  lazy val module = new VortexBankImp(this, config);
}

class VortexBankImp(
    outer: VortexBank,
    config: VortexL1Config
) extends LazyModuleImp(outer) {
  val vxCache = Module(
    new VX_cache_top(
      WORD_SIZE = config.wordSize,
      CACHE_LINE_SIZE = config.cacheLineSize,
      CORE_TAG_WIDTH = config.coreTagPlusSizeWidth,
      MSHR_SIZE = config.mshrSize
    )
  );

  vxCache.io.clk := clock
  vxCache.io.reset := reset

  val writeReqCount = RegInit(UInt(32.W), 0.U)
  val writeInputFire = Wire(Bool())
  val writeOutputFire = Wire(Bool())

  when(writeInputFire && ~writeOutputFire) {
    writeReqCount := writeReqCount + 1.U
  }.elsewhen(~writeInputFire && writeOutputFire) {
    writeReqCount := writeReqCount - 1.U
  }

  dontTouch(writeInputFire)
  dontTouch(writeOutputFire)
  dontTouch(writeReqCount)

  class WriteReqInfo extends Bundle {
    val id = UInt(32.W)
    val size = UInt(32.W)
  }

  class ReadReqInfo(config: VortexL1Config) extends Bundle {
    val size = UInt(log2Ceil(config.wordSize).W)
    val id = UInt(config.coreTagWidth.W)
  }

  val coreWriteReqQueue = Module(
    new Queue(
      (new WriteReqInfo).cloneType,
      config.writeInfoReqQSize,
      true,
      false
    )
  )
  val readReqInfo = Wire(new ReadReqInfo(config))

  // Translate TL request from Coalescer to requests for VX_cache
  def TLReq2VXReq = {
    val (tlInFromCoal, _) = outer.coresideNode.in.head

    // coal -> vxCache
    tlInFromCoal.a.ready :=
      vxCache.io.core_req_ready && coreWriteReqQueue.io.enq.ready // not optimal
    vxCache.io.core_req_valid := tlInFromCoal.a.valid

    // read = 0, write = 1
    vxCache.io.core_req_rw := !(tlInFromCoal.a.bits.opcode === TLMessages.Get)
    // 4 is also hardcoded, it should be log2WordSize
    vxCache.io.core_req_addr := tlInFromCoal.a.bits.address(
      31,
      log2Ceil(config.wordSize)
    )
    vxCache.io.core_req_byteen := tlInFromCoal.a.bits.mask
    vxCache.io.core_req_data := tlInFromCoal.a.bits.data

    // combine size and tag field into one big wire, to put into
    // vxCache.io.core_req_tag
    readReqInfo.id := tlInFromCoal.a.bits.source
    readReqInfo.size := tlInFromCoal.a.bits.size
    // ignore param, size, corrupt
    vxCache.io.core_req_tag := readReqInfo.asTypeOf(vxCache.io.core_req_tag)

    writeInputFire := vxCache.io.core_req_rw && tlInFromCoal.a.fire

    // vxCache -> coal response on channel D
    // ok ... this part is a little tricky, the downstream coalescer requires
    // the L1 cache to send ack and dataAck, this is how coalescer knows when
    // an inflight ID has retired if we don't send ack, the coalescer will run
    // out of IDs, and can't generate new request

    // Optimization: for write requests from upstream (i.e. coalescer), we send
    // back ack as soon as we can without waiting for the actual ack from
    // downstream (i.e. L2).
    //
    // We still need to store these pending core write requests somewhere,
    // because we can't always ack them in the next cycle, ex. when there's a
    // competing read response.

    // FIXME: currently assuming below buffer is never full
    coreWriteReqQueue.io.enq.valid :=
      tlInFromCoal.a.fire && !(tlInFromCoal.a.bits.opcode === TLMessages.Get)
    coreWriteReqQueue.io.enq.bits.id := tlInFromCoal.a.bits.source
    coreWriteReqQueue.io.enq.bits.size := tlInFromCoal.a.bits.size

    // Prioritize ack for any pending reads over write acks in the queue. Don't
    // ack write if vxCache has a current valid response for reads (vxCache
    // response is always for reads.)
    coreWriteReqQueue.io.deq.ready := tlInFromCoal.d.ready && ~vxCache.io.core_rsp_valid

    // handle competition between a pending read ack response and write ack
    // response
    vxCache.io.core_rsp_ready := tlInFromCoal.d.ready
    tlInFromCoal.d.valid := vxCache.io.core_rsp_valid || coreWriteReqQueue.io.deq.valid
    tlInFromCoal.d.bits.source := Mux(
      vxCache.io.core_rsp_valid,
      vxCache.io.core_rsp_tag.asTypeOf(readReqInfo).id,
      coreWriteReqQueue.io.deq.bits.id
    )
    tlInFromCoal.d.bits.opcode := Mux(
      vxCache.io.core_rsp_valid, // always for reads
      TLMessages.AccessAckData,
      TLMessages.AccessAck
    )
    tlInFromCoal.d.bits.size := Mux(
      vxCache.io.core_rsp_valid,
      vxCache.io.core_rsp_tag.asTypeOf(readReqInfo).size,
      coreWriteReqQueue.io.deq.bits.size
    )
    tlInFromCoal.d.bits.param := 0.U
    tlInFromCoal.d.bits.sink := 0.U
    tlInFromCoal.d.bits.denied := false.B
    tlInFromCoal.d.bits.corrupt := false.B
    tlInFromCoal.d.bits.data := vxCache.io.core_rsp_data
  }

  // Since Vortex L1 is a write-through cache, it doesn't bookkeep writes in
  // its MSHR and therefore doesn't allocate a new tag id for write requests.
  // We use a separate source ID allocator to solve this.
  val sourceGen = Module(
    new NewSourceGenerator(
      log2Ceil(config.memSideSourceIds),
      metadata = Some(UInt(32.W)),
      ignoreInUse = false
    )
  )

  // Translate VX_cache mem request to a TL request to be sent to L2
  def VXReq2TLReq = {
    val (tlOutToL2, _) = outer.vxCacheFetchNode.out.head

    // vxCache -> downstream L2 request
    vxCache.io.mem_req_ready := tlOutToL2.a.ready && sourceGen.io.id.valid
    tlOutToL2.a.valid := vxCache.io.mem_req_valid && sourceGen.io.id.valid

    sourceGen.io.gen := tlOutToL2.a.fire
    sourceGen.io.meta := vxCache.io.mem_req_tag // save the old read id

    writeOutputFire := tlOutToL2.a.fire && vxCache.io.mem_req_rw

    tlOutToL2.a.bits.opcode := Mux(
      vxCache.io.mem_req_rw,
      Mux(
        vxCache.io.mem_req_byteen.andR,
        TLMessages.PutFullData,
        TLMessages.PutPartialData
      ),
      TLMessages.Get
    )

    tlOutToL2.a.bits.address := Cat(vxCache.io.mem_req_addr, 0.U(4.W))
    tlOutToL2.a.bits.mask := Mux(
      vxCache.io.mem_req_rw,
      vxCache.io.mem_req_byteen,
      0xffff.U
    )
    tlOutToL2.a.bits.data := vxCache.io.mem_req_data
    tlOutToL2.a.bits.source := sourceGen.io.id.bits
    // ignore param, size, corrupt fields
    tlOutToL2.a.bits.param := 0.U
    tlOutToL2.a.bits.size := 4.U // FIXME: hardcoded
    tlOutToL2.a.bits.corrupt := false.B
    // downstream L2 -> vxCache response
    tlOutToL2.d.ready := vxCache.io.mem_rsp_ready

    vxCache.io.mem_rsp_valid :=
      tlOutToL2.d.valid && (tlOutToL2.d.bits.opcode === TLMessages.AccessAckData)
    vxCache.io.mem_rsp_tag := sourceGen.io.peek
    vxCache.io.mem_rsp_data := tlOutToL2.d.bits.data

    sourceGen.io.reclaim.valid := tlOutToL2.d.fire
    sourceGen.io.reclaim.bits := tlOutToL2.d.bits.source
  }

  TLReq2VXReq
  VXReq2TLReq
}

class VX_cache_top(
    // these values should match the default settings in Verilog
    // TODO: INSTANCE_ID
    CACHE_SIZE: Int = 16384 / 4, // <FIXME, divided by 4 for faster simulation
    CACHE_LINE_SIZE: Int = 16,
    NUM_WAYS: Int = 4,
    // for single-bank configuration, set NUM_REQS = 1 and instead set
    // WORD_SIZE to something wider than 4
    WORD_SIZE: Int = 16,
    CRSQ_SIZE: Int = 2,
    MSHR_SIZE: Int = 16,
    MRSQ_SIZE: Int = 0,
    MREQ_SIZE: Int = 4,
    WRITE_ENABLE: Int = 1,
    UUID_WIDTH: Int = 0, // FIXME: should be different for debug
    CORE_TAG_WIDTH: Int =
      16, // source ID ranges from 0 to 1 << 10, we need to allocate upper bits to save size
    CORE_OUT_REG : Int = 0,
    MEM_OUT_REG : Int = 0,
) extends BlackBox(
      Map(
        // NOTE: NUM_REQS is analogous to SIMD width, whereas NUM_BANKS is the
        // actual number of banks.  VX_cache.sv instantiates VX_stream_xbar
        // that arbitrates the higher NUM_REQS into NUM_BANKS.  Since we do
        // that logic ourselves using TL units, fix those params to 1 for the
        // Verilog side.
        "NUM_REQS" -> 1,
        "CACHE_SIZE" -> CACHE_SIZE,
        "LINE_SIZE" -> CACHE_LINE_SIZE,
        // NUM_BANKS is set to 1 to treat a whole VX_cache_top instance as a
        // single bank
        "NUM_BANKS" -> 1,
        "NUM_WAYS" -> NUM_WAYS,
        "WORD_SIZE" -> WORD_SIZE,
        "CRSQ_SIZE" -> CRSQ_SIZE,
        "MSHR_SIZE" -> MSHR_SIZE,
        "MRSQ_SIZE" -> MRSQ_SIZE,
        "MREQ_SIZE" -> MREQ_SIZE,
        "WRITE_ENABLE" -> WRITE_ENABLE,
        "UUID_WIDTH" -> UUID_WIDTH,
        "TAG_WIDTH" -> CORE_TAG_WIDTH,
        "CORE_OUT_REG" -> CORE_OUT_REG,
        "MEM_OUT_REG" -> MEM_OUT_REG,
        // Although VX_cache_top exposes it as a parameter, MEM_TAG_WIDTH is
        // not really configurable -- it is set to be a concatenation of the
        // MSHR id and cache bank id.  Instead of trying to configure it from
        // Chisel side, we try to figure out its value that's elaborated in the
        // Verilog side and configure the Chisel io width correspondingly.
        // "MEM_TAG_WIDTH" -> MEM_TAG_WIDTH
      )
    )
    with HasBlackBoxResource {

  def memTagWidth(mshrSize: Int, numBanks: Int): Int =
    log2Ceil(mshrSize) + log2Ceil(numBanks)
  val MEM_TAG_WIDTH = memTagWidth(MSHR_SIZE, 1/* NUM_BANKS */)

  val io = IO(new Bundle {
    val clk = Input(Clock())
    val reset = Input(Reset())

    // CACHE <> CORE
    val core_req_valid = Input(Bool())
    val core_req_rw = Input(Bool())
    val core_req_byteen = Input(UInt(WORD_SIZE.W))
    val core_req_addr = Input(UInt(WORD_ADDR_WIDTH.W))
    val core_req_data = Input(UInt((WORD_SIZE * 8).W))
    val core_req_tag = Input(UInt(CORE_TAG_WIDTH.W))
    val core_req_ready = Output(Bool())

    val core_rsp_valid = Output(Bool()) // 1 bit wide
    val core_rsp_data = Output(UInt((WORD_SIZE * 8).W))
    val core_rsp_tag = Output(UInt(CORE_TAG_WIDTH.W))
    val core_rsp_ready = Input(Bool())

    // CACHE <> L2
    val mem_req_valid = Output(Bool())
    val mem_req_rw = Output(Bool())
    val mem_req_byteen = Output(UInt(CACHE_LINE_SIZE.W))
    val mem_req_addr = Output(UInt(MEM_ADDR_WIDTH.W))
    val mem_req_data = Output(UInt((CACHE_LINE_SIZE * 8).W))
    val mem_req_tag = Output(UInt(MEM_TAG_WIDTH.W))
    val mem_req_ready = Input(Bool())

    val mem_rsp_valid = Input(Bool())
    val mem_rsp_data = Input(UInt((CACHE_LINE_SIZE * 8).W))
    val mem_rsp_tag = Input(UInt(MEM_TAG_WIDTH.W))
    val mem_rsp_ready = Output(Bool())
  })

  addResource("/vsrc/vortex/hw/rtl/cache/VX_cache_bank.sv")
  // addResource("/vsrc/vortex/hw/rtl/cache/VX_cache_bypass.sv")
  addResource("/vsrc/vortex/hw/rtl/cache/VX_cache_data.sv")
  addResource("/vsrc/vortex/hw/rtl/cache/VX_cache_define.vh")
  addResource("/vsrc/vortex/hw/rtl/cache/VX_cache_init.sv")
  addResource("/vsrc/vortex/hw/rtl/cache/VX_cache_mshr.sv")
  addResource("/vsrc/vortex/hw/rtl/cache/VX_cache.sv")
  addResource("/vsrc/vortex/hw/rtl/cache/VX_cache_tags.sv")
  addResource("/vsrc/vortex/hw/rtl/cache/VX_cache_top.sv")
}

// <FIXME> Delete the following NewSourceGenerator when merging with origin/graphics
// we should just use the one in coalescing.scala written by hansung

class NewSourceGenerator[T <: Data](
    sourceWidth: Int,
    metadata: Option[T] = None,
    ignoreInUse: Boolean = false
) extends Module {
  def getMetadataType = metadata match {
    case Some(gen) => gen.cloneType
    case None      => UInt(0.W)
  }
  val io = IO(new Bundle {
    val gen = Input(Bool())
    val reclaim = Input(Valid(UInt(sourceWidth.W)))
    val id = Output(Valid(UInt(sourceWidth.W)))
    // below are used only when metadata is not None
    // `meta` is used as input when a request succeeds id generation to store
    // its value to the table.
    // `peek` is the retrieved metadata saved for the request when corresponding
    // request has come back, setting `reclaim`.
    // Although these do not use ValidIO, it is safe because any in-flight
    // response coming back should have allocated a valid entry in the table
    // when it went out.
    val meta = Input(getMetadataType)
    val peek = Output(getMetadataType)
    // for debugging; indicates whether there is at least one inflight request
    // that hasn't been reclaimed yet
    val inflight = Output(Bool())
  })
  val head = RegInit(UInt(sourceWidth.W), 0.U)
  head := Mux(io.gen, head + 1.U, head)

  val outstanding = RegInit(UInt((sourceWidth + 1).W), 0.U)
  io.inflight := (outstanding > 0.U) || io.gen

  val numSourceId = 1 << sourceWidth
  val row = new Bundle {
    val meta = getMetadataType
    val id = Valid(UInt(sourceWidth.W))
    val age = UInt(32.W) // New age field for debugging
  }
  // valid: in use, invalid: available
  // val occupancyTable = Mem(numSourceId, Valid(UInt(sourceWidth.W)))
  val occupancyTable = Mem(numSourceId, row)
  when(reset.asBool) {
    (0 until numSourceId).foreach { i =>
      occupancyTable(i).id.valid := false.B
      occupancyTable(i).age := 0.U // Reset age during reset
    }
  }
  val frees = (0 until numSourceId).map(!occupancyTable(_).id.valid)
  val lowestFree = PriorityEncoder(frees)
  val lowestFreeRow = occupancyTable(lowestFree)

  io.id.valid := (if (ignoreInUse) true.B else !lowestFreeRow.id.valid)
  io.id.bits := lowestFree
  when(io.gen && io.id.valid /* fire */ ) {
    occupancyTable(io.id.bits).id.valid := true.B // mark in use
    occupancyTable(
      io.id.bits
    ).age := 0.U // reset age upon issuing, double safety
    if (metadata.isDefined) {
      occupancyTable(io.id.bits).meta := io.meta
    }
  }

  // Increase age of all inflight IDs by 1, except for the one being reclaimed
  for (i <- 0 until numSourceId) {
    when(
      occupancyTable(
        i
      ).id.valid && (i.U =/= io.reclaim.bits || !io.reclaim.valid)
    ) {
      occupancyTable(i).age := occupancyTable(i).age + 1.U
    }
  }

  when(io.reclaim.valid) {
    assert(
      occupancyTable(io.reclaim.bits).id.valid === true.B,
      "tried to reclaim a non-used id"
    )
    occupancyTable(io.reclaim.bits).id.valid := false.B // mark freed
    occupancyTable(io.reclaim.bits).age := 0.U
  }

  io.peek := {
    if (metadata.isDefined) occupancyTable(io.reclaim.bits).meta else 0.U
  }

  when(io.gen && io.id.valid) {
    when(!io.reclaim.valid) {
      assert(outstanding < (1 << sourceWidth).U)
      outstanding := outstanding + 1.U
    }
  }.elsewhen(io.reclaim.valid) {
    assert(outstanding > 0.U)
    outstanding := outstanding - 1.U
  }

  // Debugging wires
  val ages = VecInit((0 until numSourceId).map(i => occupancyTable(i).age))
  val oldestIndex = PriorityEncoder(
    ages.map(a => a === ages.reduce((x, y) => Mux(x > y, x, y)))
  )
  val oldestIdInflight = Wire(UInt(sourceWidth.W))
  val oldestMetadata = Wire(getMetadataType)
  val oldestAge = Wire(UInt(32.W))

  oldestIdInflight := oldestIndex
  oldestMetadata := occupancyTable(oldestIndex).meta
  oldestAge := occupancyTable(oldestIndex).age
  assert(
    oldestAge <= 2000.U,
    "One id in the SourceGen is not released for long time, potential bug !"
  )

  dontTouch(oldestIdInflight)
  dontTouch(oldestMetadata)
  dontTouch(oldestAge)
  dontTouch(outstanding)

}