CoalArbiter RTL written, verification WIP, merged changes from graphics
This commit is contained in:
@@ -22,7 +22,9 @@ MemTraceReader::MemTraceReader(const std::string &filename)
|
|||||||
|
|
||||||
infile.open(filename);
|
infile.open(filename);
|
||||||
if (infile.fail()) {
|
if (infile.fail()) {
|
||||||
fprintf(stderr, "failed to open file %s\n", filename.c_str());
|
fprintf(stderr, "MemTraceReader: error: failed to open file %s\n",
|
||||||
|
filename.c_str());
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -60,8 +62,6 @@ void MemTraceReader::parse(const bool has_source) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (!(infile >> line.cycle >> loadstore >> line.core_id >> line.lane_id)) {
|
if (!(infile >> line.cycle >> loadstore >> line.core_id >> line.lane_id)) {
|
||||||
printf("char=[%c]\n", infile.peek());
|
|
||||||
// assert(!infile.eof());
|
|
||||||
error(fileline, "failed parsing cycle..lane_id");
|
error(fileline, "failed parsing cycle..lane_id");
|
||||||
}
|
}
|
||||||
if (has_source && !(infile >> source)) {
|
if (has_source && !(infile >> source)) {
|
||||||
@@ -101,8 +101,6 @@ MemTraceLine MemTraceReader::read_trace_at(const long cycle, const int lane_id,
|
|||||||
MemTraceLine line;
|
MemTraceLine line;
|
||||||
line.valid = false;
|
line.valid = false;
|
||||||
|
|
||||||
// printf("tick(): cycle=%ld\n", cycle);
|
|
||||||
|
|
||||||
if (finished()) {
|
if (finished()) {
|
||||||
return line;
|
return line;
|
||||||
}
|
}
|
||||||
@@ -112,7 +110,11 @@ MemTraceLine MemTraceReader::read_trace_at(const long cycle, const int lane_id,
|
|||||||
// the next line is in the future.
|
// the next line is in the future.
|
||||||
if (line.cycle < cycle) {
|
if (line.cycle < cycle) {
|
||||||
long fileline = read_pos - std::cbegin(trace_buf) + 1;
|
long fileline = read_pos - std::cbegin(trace_buf) + 1;
|
||||||
error(fileline, "some trace lines are left unread in the past");
|
error(fileline, "some trace lines are left unread in the past. "
|
||||||
|
"Tried cycle=" +
|
||||||
|
std::to_string(cycle) +
|
||||||
|
", found line.cycle=" + std::to_string(line.cycle) +
|
||||||
|
". Is NUM_LANES set correctly?");
|
||||||
return MemTraceLine{};
|
return MemTraceLine{};
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -134,14 +136,17 @@ MemTraceLine MemTraceReader::read_trace_at(const long cycle, const int lane_id,
|
|||||||
// monotonically increment read_pos. lane_id need not be contiguous, e.g.
|
// monotonically increment read_pos. lane_id need not be contiguous, e.g.
|
||||||
// 0->1->3 is fine.
|
// 0->1->3 is fine.
|
||||||
++read_pos;
|
++read_pos;
|
||||||
return line;
|
|
||||||
} else {
|
} else {
|
||||||
// For debugging purposes, instead of early-returning on
|
// For debugging purposes, instead of early-returning on
|
||||||
// !trace_read_ready, print something to notify we are blocking a valid
|
// !trace_read_ready, print something to notify we are blocking a valid
|
||||||
// trace line.
|
// trace line.
|
||||||
printf("All Lanes Blocked on this cycle! cycle=%ld \n", cycle);
|
printf("All Lanes Blocked on this cycle! cycle=%ld \n", cycle);
|
||||||
return MemTraceLine{};
|
|
||||||
}
|
}
|
||||||
|
// We want to return valid line regardless of `trace_read_ready` or not,
|
||||||
|
// because we want to let the driver know that it missed a valid line at the
|
||||||
|
// given cycle, so that it holds its cycle counter and safely reads back the
|
||||||
|
// line in the future.
|
||||||
|
return line;
|
||||||
}
|
}
|
||||||
|
|
||||||
assert(!"unreachable");
|
assert(!"unreachable");
|
||||||
|
|||||||
@@ -39,49 +39,32 @@ module SimMemTrace #(parameter FILENAME = "undefined", NUM_LANES = 4) (
|
|||||||
output [`DATA_WIDTH*NUM_LANES-1:0] trace_read_data,
|
output [`DATA_WIDTH*NUM_LANES-1:0] trace_read_data,
|
||||||
output trace_read_finished
|
output trace_read_finished
|
||||||
);
|
);
|
||||||
bit __in_valid [NUM_LANES-1:0];
|
bit __in_valid [NUM_LANES-1:0];
|
||||||
longint __in_address [NUM_LANES-1:0];
|
longint __in_address [NUM_LANES-1:0];
|
||||||
|
bit __in_is_store [NUM_LANES-1:0];
|
||||||
bit __in_is_store [NUM_LANES-1:0];
|
|
||||||
reg [`LOGSIZE_WIDTH-1:0] __in_size [NUM_LANES-1:0];
|
reg [`LOGSIZE_WIDTH-1:0] __in_size [NUM_LANES-1:0];
|
||||||
longint __in_data [NUM_LANES-1:0];
|
longint __in_data [NUM_LANES-1:0];
|
||||||
|
bit __in_finished;
|
||||||
bit __in_finished;
|
|
||||||
string __uartlog;
|
|
||||||
|
|
||||||
// Cycle counter that is used to query C parser whether we have a request
|
|
||||||
// coming in at the current cycle.
|
|
||||||
|
|
||||||
|
|
||||||
// registers that stage outputs of the C parser
|
|
||||||
reg [NUM_LANES-1:0] __in_valid_wire;
|
|
||||||
reg [`DATA_WIDTH-1:0] __in_address_wire [NUM_LANES-1:0];
|
|
||||||
|
|
||||||
reg [NUM_LANES-1:0] __in_is_store_wire;
|
|
||||||
reg [`LOGSIZE_WIDTH-1:0] __in_size_wire [NUM_LANES-1:0];
|
|
||||||
reg [`DATA_WIDTH-1:0] __in_data_wire [NUM_LANES-1:0];
|
|
||||||
reg __in_finished_wire;
|
|
||||||
|
|
||||||
genvar g;
|
genvar g;
|
||||||
|
|
||||||
generate
|
generate
|
||||||
for (g = 0; g < NUM_LANES; g = g + 1) begin
|
for (g = 0; g < NUM_LANES; g = g + 1) begin
|
||||||
assign trace_read_valid[g] = __in_valid_wire[g];
|
assign trace_read_valid[g] = __in_valid[g];
|
||||||
assign trace_read_address[`DATA_WIDTH*(g+1)-1:`DATA_WIDTH*g] = __in_address_wire[g];
|
assign trace_read_address[`DATA_WIDTH*(g+1)-1:`DATA_WIDTH*g] = __in_address[g];
|
||||||
|
|
||||||
assign trace_read_is_store[g] = __in_is_store_wire[g];
|
assign trace_read_is_store[g] = __in_is_store[g];
|
||||||
assign trace_read_size[`LOGSIZE_WIDTH*(g+1)-1:`LOGSIZE_WIDTH*g] = __in_size_wire[g];
|
assign trace_read_size[`LOGSIZE_WIDTH*(g+1)-1:`LOGSIZE_WIDTH*g] = __in_size[g];
|
||||||
assign trace_read_data[`DATA_WIDTH*(g+1)-1:`DATA_WIDTH*g] = __in_data_wire[g];
|
assign trace_read_data[`DATA_WIDTH*(g+1)-1:`DATA_WIDTH*g] = __in_data[g];
|
||||||
end
|
end
|
||||||
endgenerate
|
endgenerate
|
||||||
assign trace_read_finished = __in_finished_wire;
|
assign trace_read_finished = __in_finished;
|
||||||
|
|
||||||
initial begin
|
initial begin
|
||||||
/* $value$plusargs("uartlog=%s", __uartlog); */
|
/* $value$plusargs("uartlog=%s", __uartlog); */
|
||||||
memtrace_init(FILENAME);
|
memtrace_init(FILENAME);
|
||||||
end
|
end
|
||||||
|
|
||||||
always @(*) begin
|
always @(posedge clock) begin
|
||||||
if (reset) begin
|
if (reset) begin
|
||||||
for (integer tid = 0; tid < NUM_LANES; tid = tid + 1) begin
|
for (integer tid = 0; tid < NUM_LANES; tid = tid + 1) begin
|
||||||
__in_valid[tid] = 1'b0;
|
__in_valid[tid] = 1'b0;
|
||||||
@@ -91,55 +74,29 @@ module SimMemTrace #(parameter FILENAME = "undefined", NUM_LANES = 4) (
|
|||||||
__in_size[tid] = `LOGSIZE_WIDTH'b0;
|
__in_size[tid] = `LOGSIZE_WIDTH'b0;
|
||||||
__in_data[tid] = `DATA_WIDTH'b0;
|
__in_data[tid] = `DATA_WIDTH'b0;
|
||||||
end
|
end
|
||||||
|
|
||||||
__in_finished = 1'b0;
|
__in_finished = 1'b0;
|
||||||
|
|
||||||
//cycle_counter <= `DATA_WIDTH'b0;
|
|
||||||
|
|
||||||
// setting default value for register to avoid latches
|
|
||||||
for (integer tid = 0; tid < NUM_LANES; tid = tid + 1) begin
|
|
||||||
__in_valid_wire[tid] = 1'b0;
|
|
||||||
__in_address_wire[tid] = `DATA_WIDTH'b0;
|
|
||||||
|
|
||||||
__in_is_store_wire[tid] = 1'b0;
|
|
||||||
__in_size_wire[tid] = `LOGSIZE_WIDTH'b0;
|
|
||||||
__in_data_wire[tid] = `DATA_WIDTH'b0;
|
|
||||||
end
|
|
||||||
|
|
||||||
__in_finished_wire = 1'b0;
|
|
||||||
end else begin
|
end else begin
|
||||||
|
// We have to write to __in_ regs only when trace_read_ready, or
|
||||||
|
// otherwise we might overwrite lines that were previously valid
|
||||||
|
// but the downstream missed by being not ready.
|
||||||
|
if (trace_read_ready) begin
|
||||||
|
for (integer tid = 0; tid < NUM_LANES; tid = tid + 1) begin
|
||||||
|
memtrace_query(
|
||||||
|
trace_read_ready,
|
||||||
|
trace_read_cycle,
|
||||||
|
tid,
|
||||||
|
|
||||||
// Getting values from C function into pseudeo register
|
__in_valid[tid],
|
||||||
for (integer tid = 0; tid < NUM_LANES; tid = tid + 1) begin
|
__in_address[tid],
|
||||||
memtrace_query(
|
|
||||||
trace_read_ready,
|
|
||||||
// Since parsed results are latched to the output on the next
|
|
||||||
// cycle due to staging registers, we need to pass in the next cycle
|
|
||||||
// to sync up.
|
|
||||||
trace_read_cycle, // the left replace next_cycle_counter,
|
|
||||||
tid,
|
|
||||||
|
|
||||||
__in_valid[tid],
|
__in_is_store[tid],
|
||||||
__in_address[tid],
|
__in_size[tid],
|
||||||
|
__in_data[tid],
|
||||||
|
|
||||||
__in_is_store[tid],
|
__in_finished
|
||||||
__in_size[tid],
|
);
|
||||||
__in_data[tid],
|
end
|
||||||
|
|
||||||
__in_finished
|
|
||||||
);
|
|
||||||
end
|
end
|
||||||
|
|
||||||
// Connect values from pseudo register into verilog register
|
|
||||||
for (integer tid = 0; tid < NUM_LANES; tid = tid + 1) begin
|
|
||||||
__in_valid_wire[tid] = __in_valid[tid];
|
|
||||||
__in_address_wire[tid] = __in_address[tid];
|
|
||||||
|
|
||||||
__in_is_store_wire[tid] = __in_is_store[tid];
|
|
||||||
__in_size_wire[tid] = __in_size[tid];
|
|
||||||
__in_data_wire[tid] = __in_data[tid];
|
|
||||||
end
|
|
||||||
__in_finished_wire = __in_finished;
|
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
endmodule
|
endmodule
|
||||||
|
|||||||
@@ -233,11 +233,13 @@ class CoalShiftQueue[T <: Data](gen: T, entries: Int, config: CoalescerConfig) e
|
|||||||
}))
|
}))
|
||||||
|
|
||||||
// shift hint is when the heads have no more coalescable left this or next cycle
|
// shift hint is when the heads have no more coalescable left this or next cycle
|
||||||
val shiftHint = !(io.coalescable zip io.invalidate.bits.map(_(0))).map { case (c, i) =>
|
val shiftHint = !(io.coalescable zip io.invalidate.bits.map(_(0))).map { case (c, inv) =>
|
||||||
c && !(io.invalidate.valid && i)
|
c && !(io.invalidate.valid && inv)
|
||||||
}.reduce(_ || _)
|
}.reduce(_ || _)
|
||||||
val syncedEnqValid = io.queue.enq.map(_.valid).reduce(_ || _)
|
val syncedEnqValid = io.queue.enq.map(_.valid).reduce(_ || _)
|
||||||
val syncedDeqValid = io.queue.deq.map(x => x.valid && !x.ready).reduce(_ || _) // valid and not fire
|
// syncedDeqValidNextCycle being true means the arbiter has completed
|
||||||
|
// processing all of the ready-to-go requests.
|
||||||
|
val syncedDeqValidNextCycle = io.queue.deq.map(x => x.valid && !x.ready).reduce(_ || _) // valid and not fire
|
||||||
|
|
||||||
for (i <- 0 until config.numLanes) {
|
for (i <- 0 until config.numLanes) {
|
||||||
val enq = io.queue.enq(i)
|
val enq = io.queue.enq(i)
|
||||||
@@ -247,7 +249,7 @@ class CoalShiftQueue[T <: Data](gen: T, entries: Int, config: CoalescerConfig) e
|
|||||||
ctrl.full := writePtr(i) === entries.U
|
ctrl.full := writePtr(i) === entries.U
|
||||||
ctrl.empty := writePtr(i) === 0.U
|
ctrl.empty := writePtr(i) === 0.U
|
||||||
// shift when no outstanding dequeue, no more coalescable chunks, and not empty
|
// shift when no outstanding dequeue, no more coalescable chunks, and not empty
|
||||||
ctrl.shift := !syncedDeqValid && shiftHint && !ctrl.empty
|
ctrl.shift := !syncedDeqValidNextCycle && shiftHint && !ctrl.empty
|
||||||
|
|
||||||
// dequeue is valid when:
|
// dequeue is valid when:
|
||||||
// head entry is valid, has not been processed by downstream, and is not coalescable
|
// head entry is valid, has not been processed by downstream, and is not coalescable
|
||||||
@@ -293,6 +295,9 @@ class CoalShiftQueue[T <: Data](gen: T, entries: Int, config: CoalescerConfig) e
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// When doing spatial-only coalescing, queues should never drift from each
|
||||||
|
// other, i.e. the queue heads should always contain mem requests from the
|
||||||
|
// same instruction.
|
||||||
val queueInSync = controlSignals.map(_ === controlSignals.head).reduce(_ && _) &&
|
val queueInSync = controlSignals.map(_ === controlSignals.head).reduce(_ && _) &&
|
||||||
writePtr.map(_ === writePtr.head).reduce(_ && _)
|
writePtr.map(_ === writePtr.head).reduce(_ && _)
|
||||||
assert(queueInSync, "shift queue lanes are not in sync")
|
assert(queueInSync, "shift queue lanes are not in sync")
|
||||||
@@ -326,23 +331,15 @@ class MonoCoalescer(coalLogSize: Int, windowT: CoalShiftQueue[ReqQueueEntry],
|
|||||||
val leaders = io.window.elts.map(_.head)
|
val leaders = io.window.elts.map(_.head)
|
||||||
val leadersValid = io.window.mask.map(_.asBools.head)
|
val leadersValid = io.window.mask.map(_.asBools.head)
|
||||||
|
|
||||||
// When doing spatial-only coalescing, queues should never drift from each
|
|
||||||
// other, i.e. the queue heads should always contain mem requests from the
|
|
||||||
// same instruction.
|
|
||||||
// FIXME: This relies on the MemTraceDriver's behavior of generating TL
|
|
||||||
// requests with full source info even when the corresponding lane is not
|
|
||||||
// active.
|
|
||||||
def testNoQueueDrift: Bool = leaders.map(_.source === leaders.head.source).reduce(_ || _)
|
|
||||||
def printQueueHeads = {
|
def printQueueHeads = {
|
||||||
leaders.zipWithIndex.foreach{ case (head, i) =>
|
leaders.zipWithIndex.foreach{ case (head, i) =>
|
||||||
printf(s"ReqQueueEntry[${i}].head = v:%d, source:%d, addr:%x\n",
|
printf(s"ReqQueueEntry[${i}].head = v:%d, source:%d, addr:%x\n",
|
||||||
leadersValid(i), head.source, head.address)
|
leadersValid(i), head.source, head.address)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
when (leadersValid.reduce(_ || _)) {
|
// when (leadersValid.reduce(_ || _)) {
|
||||||
assert(testNoQueueDrift, "unexpected drift between lane request queues")
|
// printQueueHeads
|
||||||
// printQueueHeads
|
// }
|
||||||
}
|
|
||||||
|
|
||||||
val size = coalLogSize
|
val size = coalLogSize
|
||||||
val addrMask = (((1 << config.addressWidth) - 1) - ((1 << size) - 1)).U
|
val addrMask = (((1 << config.addressWidth) - 1) - ((1 << size) - 1)).U
|
||||||
@@ -375,14 +372,21 @@ class MonoCoalescer(coalLogSize: Int, windowT: CoalShiftQueue[ReqQueueEntry],
|
|||||||
.reduce(_ +& _))
|
.reduce(_ +& _))
|
||||||
val canCoalesce = matchCounts.map(_ > 1.U)
|
val canCoalesce = matchCounts.map(_ > 1.U)
|
||||||
|
|
||||||
// Elect the leader out of all potential leaders that have matchCounts > 1.
|
// Elect the leader that has the most match counts.
|
||||||
// TODO: potentially expensive: magnitude comparator
|
// TODO: potentially expensive: magnitude comparator
|
||||||
// Maybe choose leftmost leader (priority encoder) instead of argmax
|
def chooseLeaderArgMax(matchCounts: Seq[UInt]): UInt = {
|
||||||
val chosenLeaderIdx = matchCounts.zipWithIndex.map {
|
matchCounts.zipWithIndex.map {
|
||||||
case (c, i) => (c, i.U)
|
case (c, i) => (c, i.U)
|
||||||
}.reduce[(UInt, UInt)] { case ((c0, i), (c1, j)) =>
|
}.reduce[(UInt, UInt)] { case ((c0, i), (c1, j)) =>
|
||||||
(Mux(c0 >= c1, c0, c1), Mux(c0 >= c1, i, j))
|
(Mux(c0 >= c1, c0, c1), Mux(c0 >= c1, i, j))
|
||||||
}._2
|
}._2
|
||||||
|
}
|
||||||
|
// Elect leader by choosing the smallest-index lane that has a valid
|
||||||
|
// match, i.e. using priority encoder.
|
||||||
|
def chooseLeaderPriorityEncoder(matchCounts: Seq[UInt]): UInt = {
|
||||||
|
PriorityEncoder(matchCounts.map(_ > 1.U))
|
||||||
|
}
|
||||||
|
val chosenLeaderIdx = chooseLeaderPriorityEncoder(matchCounts)
|
||||||
|
|
||||||
val chosenLeader = VecInit(leaders)(chosenLeaderIdx)
|
val chosenLeader = VecInit(leaders)(chosenLeaderIdx)
|
||||||
// matchTable for the chosen lane, but converted to a Vec[UInt]
|
// matchTable for the chosen lane, but converted to a Vec[UInt]
|
||||||
@@ -578,11 +582,14 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends
|
|||||||
reqQueues.io.coalescable := coalescer.io.coalescable
|
reqQueues.io.coalescable := coalescer.io.coalescable
|
||||||
reqQueues.io.invalidate := coalescer.io.invalidate
|
reqQueues.io.invalidate := coalescer.io.invalidate
|
||||||
|
|
||||||
// Per-lane request and response queues
|
// ===========================================================================
|
||||||
|
// Request flow
|
||||||
|
// ===========================================================================
|
||||||
//
|
//
|
||||||
// Override IdentityNode implementation so that we can instantiate
|
// Override IdentityNode implementation so that we can instantiate
|
||||||
// queues between input and output edges to buffer requests and responses.
|
// queues between input and output edges to buffer requests and responses.
|
||||||
// See IdentityNode definition in `diplomacy/Nodes.scala`.
|
// See IdentityNode definition in `diplomacy/Nodes.scala`.
|
||||||
|
//
|
||||||
(outer.cpuNode.in zip outer.cpuNode.out).zipWithIndex.foreach {
|
(outer.cpuNode.in zip outer.cpuNode.out).zipWithIndex.foreach {
|
||||||
case (((tlIn, _), (tlOut, edgeOut)), lane) =>
|
case (((tlIn, _), (tlOut, edgeOut)), lane) =>
|
||||||
// Request queue
|
// Request queue
|
||||||
@@ -604,7 +611,10 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends
|
|||||||
val deq = reqQueues.io.queue.deq(lane)
|
val deq = reqQueues.io.queue.deq(lane)
|
||||||
enq.valid := tlIn.a.valid
|
enq.valid := tlIn.a.valid
|
||||||
enq.bits := req
|
enq.bits := req
|
||||||
deq.ready := true.B // TODO: deq.ready should respect downstream arbiter
|
// TODO: deq.ready should respect downstream arbiter
|
||||||
|
deq.ready := true.B
|
||||||
|
// Stall upstream core or memtrace driver when shiftqueue is not ready
|
||||||
|
tlIn.a.ready := enq.ready
|
||||||
tlOut.a.valid := deq.valid
|
tlOut.a.valid := deq.valid
|
||||||
tlOut.a.bits := deq.bits.toTLA(edgeOut)
|
tlOut.a.bits := deq.bits.toTLA(edgeOut)
|
||||||
|
|
||||||
@@ -641,11 +651,12 @@ class CoalescingUnitImp(outer: CoalescingUnit, config: CoalescerConfig) extends
|
|||||||
tlCoal.e.valid := false.B
|
tlCoal.e.valid := false.B
|
||||||
|
|
||||||
|
|
||||||
// ==================================================================
|
// ===========================================================================
|
||||||
// ******************************************************************
|
// Response flow
|
||||||
// ************************* REORG BOUNDARY *************************
|
// ===========================================================================
|
||||||
// ******************************************************************
|
//
|
||||||
// ==================================================================
|
// Connect uncoalescer output and noncoalesced response ports to the response
|
||||||
|
// queues.
|
||||||
|
|
||||||
// The maximum number of requests from a single lane that can go into a
|
// The maximum number of requests from a single lane that can go into a
|
||||||
// coalesced request. Upper bound is min(DEPTH, 2**sourceWidth).
|
// coalesced request. Upper bound is min(DEPTH, 2**sourceWidth).
|
||||||
@@ -1083,24 +1094,18 @@ class TraceLine extends Bundle with HasTraceLine {
|
|||||||
class MemTraceDriverImp(outer: MemTraceDriver, config: CoalescerConfig, traceFile: String)
|
class MemTraceDriverImp(outer: MemTraceDriver, config: CoalescerConfig, traceFile: String)
|
||||||
extends LazyModuleImp(outer)
|
extends LazyModuleImp(outer)
|
||||||
with UnitTestModule {
|
with UnitTestModule {
|
||||||
|
// Current cycle mark to read from trace
|
||||||
|
val traceReadCycle = RegInit(1.U(64.W))
|
||||||
|
|
||||||
val globalClkCounter = RegInit(1.U(64.W))
|
// If any of the downstream lane is not ready, hold on from advancing
|
||||||
val traceReadCycle = RegInit(1.U(64.W))
|
val downstreamReady = outer.laneNodes.map(_.out(0)._1.a.ready).reduce(_ && _)
|
||||||
val downstreamSQready = WireInit(true.B)
|
|
||||||
|
|
||||||
//make the downstream only ready 1/4 of the time
|
|
||||||
//This is to test Tracer System's ability to hold on requests
|
|
||||||
//FIXME
|
|
||||||
downstreamSQready := (globalClkCounter(1,0) =/= 0.U)
|
|
||||||
//Connect Signals to Verilog BlackBox
|
|
||||||
val sim = Module(new SimMemTrace(traceFile, config.numLanes))
|
val sim = Module(new SimMemTrace(traceFile, config.numLanes))
|
||||||
sim.io.clock := clock
|
sim.io.clock := clock
|
||||||
sim.io.reset := reset.asBool
|
sim.io.reset := reset.asBool
|
||||||
sim.io.trace_read.ready := downstreamSQready
|
sim.io.trace_read.ready := downstreamReady
|
||||||
//FIXME - 1.U hardcoded, currently there is a delay between chisel and verilog
|
|
||||||
sim.io.trace_read.cycle := traceReadCycle
|
sim.io.trace_read.cycle := traceReadCycle
|
||||||
|
|
||||||
|
|
||||||
// Read output from Verilog BlackBox
|
// Read output from Verilog BlackBox
|
||||||
// Split output of SimMemTrace, which is flattened across all lanes,back to each lane's.
|
// Split output of SimMemTrace, which is flattened across all lanes,back to each lane's.
|
||||||
val laneReqs = Wire(Vec(config.numLanes, new TraceLine))
|
val laneReqs = Wire(Vec(config.numLanes, new TraceLine))
|
||||||
@@ -1109,26 +1114,28 @@ class MemTraceDriverImp(outer: MemTraceDriver, config: CoalescerConfig, traceFil
|
|||||||
val dataW = laneReqs(0).data.getWidth
|
val dataW = laneReqs(0).data.getWidth
|
||||||
laneReqs.zipWithIndex.foreach { case (req, i) =>
|
laneReqs.zipWithIndex.foreach { case (req, i) =>
|
||||||
req.valid := sim.io.trace_read.valid(i)
|
req.valid := sim.io.trace_read.valid(i)
|
||||||
// TODO: driver trace doesn't contain source id
|
req.source := 0.U // driver trace doesn't contain source id
|
||||||
req.source := 0.U
|
|
||||||
req.address := sim.io.trace_read.address(addrW * (i + 1) - 1, addrW * i)
|
req.address := sim.io.trace_read.address(addrW * (i + 1) - 1, addrW * i)
|
||||||
req.is_store := sim.io.trace_read.is_store(i)
|
req.is_store := sim.io.trace_read.is_store(i)
|
||||||
req.size := sim.io.trace_read.size(sizeW * (i + 1) - 1, sizeW * i)
|
req.size := sim.io.trace_read.size(sizeW * (i + 1) - 1, sizeW * i)
|
||||||
req.data := sim.io.trace_read.data(dataW * (i + 1) - 1, dataW * i)
|
req.data := sim.io.trace_read.data(dataW * (i + 1) - 1, dataW * i)
|
||||||
}
|
}
|
||||||
|
|
||||||
globalClkCounter := globalClkCounter + 1.U
|
// def missedLine = {
|
||||||
val existValidReq = WireInit(false.B)
|
// val existsValidLine = WireInit(false.B)
|
||||||
existValidReq := laneReqs.map(_.valid).reduce(_||_)
|
// existsValidLine := laneReqs.map(_.valid).reduce(_||_)
|
||||||
val validReqBlocked = WireInit(false.B)
|
// val missedLine = WireInit(false.B)
|
||||||
validReqBlocked := !downstreamSQready && existValidReq
|
// missedLine := !downstreamReady && existsValidLine
|
||||||
//Debug
|
|
||||||
dontTouch(downstreamSQready)
|
// // Debug
|
||||||
dontTouch(existValidReq)
|
// dontTouch(downstreamReady)
|
||||||
dontTouch(validReqBlocked)
|
// dontTouch(existsValidLine)
|
||||||
// Do Not Update TraceReadCycle if downstream is blocking
|
// dontTouch(missedLine)
|
||||||
when(!validReqBlocked){
|
|
||||||
traceReadCycle := traceReadCycle + 1.U
|
// missedLine
|
||||||
|
// }
|
||||||
|
when (downstreamReady){
|
||||||
|
traceReadCycle := traceReadCycle + 1.U
|
||||||
}
|
}
|
||||||
|
|
||||||
// To prevent collision of sourceId with a current in-flight message,
|
// To prevent collision of sourceId with a current in-flight message,
|
||||||
@@ -1163,19 +1170,6 @@ class MemTraceDriverImp(outer: MemTraceDriver, config: CoalescerConfig, traceFil
|
|||||||
val wordAlignedAddress = req.address & ~((1 << log2Ceil(config.wordSizeInBytes)) - 1).U(addrW.W)
|
val wordAlignedAddress = req.address & ~((1 << log2Ceil(config.wordSizeInBytes)) - 1).U(addrW.W)
|
||||||
val wordAlignedSize = Mux(subword, 2.U, req.size)
|
val wordAlignedSize = Mux(subword, 2.U, req.size)
|
||||||
|
|
||||||
// when(req.valid && subword) {
|
|
||||||
// printf(
|
|
||||||
// "address=%x, size=%d, data=%x, addressMask=%x, wordAlignedAddress=%x, mask=%x, wordData=%x\n",
|
|
||||||
// req.address,
|
|
||||||
// req.size,
|
|
||||||
// req.data,
|
|
||||||
// ~((1 << log2Ceil(config.WORD_SIZE)) - 1).U(addrW.W),
|
|
||||||
// wordAlignedAddress,
|
|
||||||
// mask,
|
|
||||||
// wordData
|
|
||||||
// )
|
|
||||||
// }
|
|
||||||
|
|
||||||
val (tlOut, edge) = node.out(0)
|
val (tlOut, edge) = node.out(0)
|
||||||
val (plegal, pbits) = edge.Put(
|
val (plegal, pbits) = edge.Put(
|
||||||
fromSource = sourceIdCounter,
|
fromSource = sourceIdCounter,
|
||||||
@@ -1356,7 +1350,9 @@ class MemTraceLogger(
|
|||||||
|
|
||||||
// requests on TL A channel
|
// requests on TL A channel
|
||||||
//
|
//
|
||||||
req.valid := tlIn.a.valid
|
// Only log trace when fired, e.g. both upstream and downstream is ready
|
||||||
|
// and transaction happened.
|
||||||
|
req.valid := tlIn.a.fire
|
||||||
req.size := tlIn.a.bits.size
|
req.size := tlIn.a.bits.size
|
||||||
req.is_store := TLUtils.AOpcodeIsStore(tlIn.a.bits.opcode)
|
req.is_store := TLUtils.AOpcodeIsStore(tlIn.a.bits.opcode)
|
||||||
req.source := tlIn.a.bits.source
|
req.source := tlIn.a.bits.source
|
||||||
@@ -1364,27 +1360,6 @@ class MemTraceLogger(
|
|||||||
// originally requested, so no postprocessing required
|
// originally requested, so no postprocessing required
|
||||||
req.address := tlIn.a.bits.address
|
req.address := tlIn.a.bits.address
|
||||||
|
|
||||||
// TL data
|
|
||||||
//
|
|
||||||
// When tlIn.a.bits.size is smaller than the data bus width, need to
|
|
||||||
// figure out which byte lanes we actually accessed so that
|
|
||||||
// we can write that to the memory trace.
|
|
||||||
// See Section 4.5 Byte Lanes in spec 1.8.1
|
|
||||||
|
|
||||||
// This assert only holds true for PutFullData and not PutPartialData,
|
|
||||||
// where HIGH bits in the mask may not be contiguous.
|
|
||||||
assert(
|
|
||||||
PopCount(tlIn.a.bits.mask) === (1.U << tlIn.a.bits.size),
|
|
||||||
"mask HIGH bits do not match the TL size. This should have been handled by the TL generator logic"
|
|
||||||
)
|
|
||||||
val trailingZerosInMask = trailingZeros(tlIn.a.bits.mask)
|
|
||||||
val dataW = tlIn.params.dataBits
|
|
||||||
val mask = ~(~(0.U(dataW.W)) << ((1.U << tlIn.a.bits.size) * 8.U))
|
|
||||||
req.data := mask & (tlIn.a.bits.data >> (trailingZerosInMask * 8.U))
|
|
||||||
// when (req.valid) {
|
|
||||||
// printf("trailingZerosInMask=%d, mask=%x, data=%x\n", trailingZerosInMask, mask, req.data)
|
|
||||||
// }
|
|
||||||
|
|
||||||
when(req.valid) {
|
when(req.valid) {
|
||||||
TLPrintf(
|
TLPrintf(
|
||||||
s"MemTraceLogger (${loggerName}:downstream)",
|
s"MemTraceLogger (${loggerName}:downstream)",
|
||||||
@@ -1397,9 +1372,33 @@ class MemTraceLogger(
|
|||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TL data
|
||||||
|
//
|
||||||
|
// When tlIn.a.bits.size is smaller than the data bus width, need to
|
||||||
|
// figure out which byte lanes we actually accessed so that
|
||||||
|
// we can write that to the memory trace.
|
||||||
|
// See Section 4.5 Byte Lanes in spec 1.8.1
|
||||||
|
|
||||||
|
// This assert only holds true for PutFullData and not PutPartialData,
|
||||||
|
// where HIGH bits in the mask may not be contiguous.
|
||||||
|
assert(
|
||||||
|
PopCount(tlIn.a.bits.mask) === (1.U << tlIn.a.bits.size),
|
||||||
|
"mask HIGH popcount do not match the TL size. " +
|
||||||
|
"Partial masks are not allowed for PutFull"
|
||||||
|
)
|
||||||
|
val trailingZerosInMask = trailingZeros(tlIn.a.bits.mask)
|
||||||
|
val dataW = tlIn.params.dataBits
|
||||||
|
val mask = ~(~(0.U(dataW.W)) << ((1.U << tlIn.a.bits.size) * 8.U))
|
||||||
|
req.data := mask & (tlIn.a.bits.data >> (trailingZerosInMask * 8.U))
|
||||||
|
// when (req.valid) {
|
||||||
|
// printf("trailingZerosInMask=%d, mask=%x, data=%x\n", trailingZerosInMask, mask, req.data)
|
||||||
|
// }
|
||||||
|
|
||||||
// responses on TL D channel
|
// responses on TL D channel
|
||||||
//
|
//
|
||||||
resp.valid := tlOut.d.valid
|
// Only log trace when fired, e.g. both upstream and downstream is ready
|
||||||
|
// and transaction happened.
|
||||||
|
resp.valid := tlOut.d.fire
|
||||||
resp.size := tlOut.d.bits.size
|
resp.size := tlOut.d.bits.size
|
||||||
resp.is_store := TLUtils.DOpcodeIsStore(tlOut.d.bits.opcode)
|
resp.is_store := TLUtils.DOpcodeIsStore(tlOut.d.bits.opcode)
|
||||||
resp.source := tlOut.d.bits.source
|
resp.source := tlOut.d.bits.source
|
||||||
@@ -1433,7 +1432,7 @@ class MemTraceLogger(
|
|||||||
//
|
//
|
||||||
// This is a clunky workaround of the fact that Chisel doesn't allow partial
|
// This is a clunky workaround of the fact that Chisel doesn't allow partial
|
||||||
// assignment to a bitfield range of a wide signal.
|
// assignment to a bitfield range of a wide signal.
|
||||||
def flattenTrace(traceLogIO: Bundle with HasTraceLine, perLane: Vec[TraceLine]) = {
|
def flattenTrace(simIO: Bundle with HasTraceLine, perLane: Vec[TraceLine]) = {
|
||||||
// these will get optimized out
|
// these will get optimized out
|
||||||
val vecValid = Wire(Vec(numLanes, chiselTypeOf(perLane(0).valid)))
|
val vecValid = Wire(Vec(numLanes, chiselTypeOf(perLane(0).valid)))
|
||||||
val vecSource = Wire(Vec(numLanes, chiselTypeOf(perLane(0).source)))
|
val vecSource = Wire(Vec(numLanes, chiselTypeOf(perLane(0).source)))
|
||||||
@@ -1449,12 +1448,12 @@ class MemTraceLogger(
|
|||||||
vecSize(i) := l.size
|
vecSize(i) := l.size
|
||||||
vecData(i) := l.data
|
vecData(i) := l.data
|
||||||
}
|
}
|
||||||
traceLogIO.valid := vecValid.asUInt
|
simIO.valid := vecValid.asUInt
|
||||||
traceLogIO.source := vecSource.asUInt
|
simIO.source := vecSource.asUInt
|
||||||
traceLogIO.address := vecAddress.asUInt
|
simIO.address := vecAddress.asUInt
|
||||||
traceLogIO.is_store := vecIsStore.asUInt
|
simIO.is_store := vecIsStore.asUInt
|
||||||
traceLogIO.size := vecSize.asUInt
|
simIO.size := vecSize.asUInt
|
||||||
traceLogIO.data := vecData.asUInt
|
simIO.data := vecData.asUInt
|
||||||
}
|
}
|
||||||
|
|
||||||
if (simReq.isDefined) {
|
if (simReq.isDefined) {
|
||||||
@@ -1544,7 +1543,7 @@ class DummyDriver(config: CoalescerConfig)(implicit p: Parameters)
|
|||||||
val clientParam = Seq(
|
val clientParam = Seq(
|
||||||
TLMasterParameters.v1(
|
TLMasterParameters.v1(
|
||||||
name = "dummy-core-node-" + i.toString,
|
name = "dummy-core-node-" + i.toString,
|
||||||
sourceId = IdRange(0, defaultConfig.numOldSrcIds)
|
sourceId = IdRange(0, config.numOldSrcIds)
|
||||||
// visibility = Seq(AddressSet(0x0000, 0xffffff))
|
// visibility = Seq(AddressSet(0x0000, 0xffffff))
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
@@ -1635,10 +1634,7 @@ class DummyCoalescerTest(timeout: Int = 500000)(implicit p: Parameters)
|
|||||||
}
|
}
|
||||||
|
|
||||||
// tracedriver --> coalescer --> tracelogger --> tlram
|
// tracedriver --> coalescer --> tracelogger --> tlram
|
||||||
class TLRAMCoalescerLogger(implicit p: Parameters) extends LazyModule {
|
class TLRAMCoalescerLogger(filename: String)(implicit p: Parameters) extends LazyModule {
|
||||||
// val filename = "test.trace"
|
|
||||||
val filename = "vecadd.core1.thread4.trace"
|
|
||||||
// val filename = "nvbit.vecadd.n100000.filter_sm0.trace"
|
|
||||||
// TODO: use parameters for numLanes
|
// TODO: use parameters for numLanes
|
||||||
val numLanes = defaultConfig.numLanes
|
val numLanes = defaultConfig.numLanes
|
||||||
|
|
||||||
@@ -1680,13 +1676,14 @@ class TLRAMCoalescerLogger(implicit p: Parameters) extends LazyModule {
|
|||||||
(coreSideLogger.module.io.reqBytes === coreSideLogger.module.io.respBytes),
|
(coreSideLogger.module.io.reqBytes === coreSideLogger.module.io.respBytes),
|
||||||
"FAIL: requests and responses traffic to the coalescer do not match"
|
"FAIL: requests and responses traffic to the coalescer do not match"
|
||||||
)
|
)
|
||||||
|
printf("SUCCESS: coalescer response traffic matched requests!\n")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
class TLRAMCoalescerLoggerTest(timeout: Int = 500000)(implicit p: Parameters)
|
class TLRAMCoalescerLoggerTest(filename: String, timeout: Int = 500000)(implicit p: Parameters)
|
||||||
extends UnitTest(timeout) {
|
extends UnitTest(timeout) {
|
||||||
val dut = Module(LazyModule(new TLRAMCoalescerLogger).module)
|
val dut = Module(LazyModule(new TLRAMCoalescerLogger(filename)).module)
|
||||||
dut.io.start := io.start
|
dut.io.start := io.start
|
||||||
io.finished := dut.io.finished
|
io.finished := dut.io.finished
|
||||||
}
|
}
|
||||||
@@ -2137,3 +2134,5 @@ class CoalArbiterImpl(outer: CoalArbiter,
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -2,10 +2,12 @@ package freechips.rocketchip.tilelink.coalescing
|
|||||||
|
|
||||||
import chisel3._
|
import chisel3._
|
||||||
import chiseltest._
|
import chiseltest._
|
||||||
|
import chiseltest.simulator.VerilatorFlags
|
||||||
import org.scalatest.flatspec.AnyFlatSpec
|
import org.scalatest.flatspec.AnyFlatSpec
|
||||||
import freechips.rocketchip.tilelink._
|
import freechips.rocketchip.tilelink._
|
||||||
import freechips.rocketchip.util.MultiPortQueue
|
import freechips.rocketchip.util.MultiPortQueue
|
||||||
import freechips.rocketchip.diplomacy._
|
import freechips.rocketchip.diplomacy._
|
||||||
|
import freechips.rocketchip.subsystem.WithoutTLMonitors
|
||||||
import org.chipsalliance.cde.config.Parameters
|
import org.chipsalliance.cde.config.Parameters
|
||||||
import chisel3.util.{DecoupledIO, Valid}
|
import chisel3.util.{DecoupledIO, Valid}
|
||||||
import chisel3.util.experimental.BoringUtils
|
import chisel3.util.experimental.BoringUtils
|
||||||
@@ -190,8 +192,8 @@ object testConfig extends CoalescerConfig(
|
|||||||
respQueueDepth = 4,
|
respQueueDepth = 4,
|
||||||
coalLogSizes = Seq(4, 5),
|
coalLogSizes = Seq(4, 5),
|
||||||
sizeEnum = DefaultInFlightTableSizeEnum,
|
sizeEnum = DefaultInFlightTableSizeEnum,
|
||||||
numArbiterOutputPorts = 4,
|
|
||||||
numCoalReqs = 1,
|
numCoalReqs = 1,
|
||||||
|
numArbiterOutputPorts = 4,
|
||||||
bankStrideInBytes = 64
|
bankStrideInBytes = 64
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -229,8 +231,8 @@ class CoalescerUnitTest extends AnyFlatSpec with ChiselScalatestTester {
|
|||||||
}
|
}
|
||||||
|
|
||||||
it should "coalesce fully consecutive accesses at size 4, only once" in {
|
it should "coalesce fully consecutive accesses at size 4, only once" in {
|
||||||
test(LazyModule(new DummyCoalescingUnitTB()).module)
|
test(LazyModule(new DummyCoalescingUnitTB()(new WithoutTLMonitors())).module)
|
||||||
.withAnnotations(Seq(VerilatorBackendAnnotation, WriteFstAnnotation))
|
.withAnnotations(Seq(VerilatorBackendAnnotation, VerilatorFlags(Seq("--coverage-line")), WriteFstAnnotation))
|
||||||
// .withAnnotations(Seq(VcsBackendAnnotation, WriteFsdbAnnotation))
|
// .withAnnotations(Seq(VcsBackendAnnotation, WriteFsdbAnnotation))
|
||||||
{ c =>
|
{ c =>
|
||||||
val nodes = c.coalIOs.map(_.head)
|
val nodes = c.coalIOs.map(_.head)
|
||||||
@@ -291,8 +293,8 @@ class CoalescerUnitTest extends AnyFlatSpec with ChiselScalatestTester {
|
|||||||
}
|
}
|
||||||
|
|
||||||
it should "coalesce identical addresses (stride of 0)" in {
|
it should "coalesce identical addresses (stride of 0)" in {
|
||||||
test(LazyModule(new DummyCoalescingUnitTB()).module)
|
test(LazyModule(new DummyCoalescingUnitTB()(new WithoutTLMonitors())).module)
|
||||||
.withAnnotations(Seq(VcsBackendAnnotation))
|
.withAnnotations(Seq(VerilatorBackendAnnotation))
|
||||||
{ c =>
|
{ c =>
|
||||||
println(s"coalIO length = ${c.coalIOs(0).length}")
|
println(s"coalIO length = ${c.coalIOs(0).length}")
|
||||||
val nodes = c.coalIOs.map(_.head)
|
val nodes = c.coalIOs.map(_.head)
|
||||||
|
|||||||
Reference in New Issue
Block a user