diff --git a/src/main/resources/csrc/SimMemTrace.cc b/src/main/resources/csrc/SimMemTrace.cc index 5e29d57..99bf113 100644 --- a/src/main/resources/csrc/SimMemTrace.cc +++ b/src/main/resources/csrc/SimMemTrace.cc @@ -3,12 +3,16 @@ #include #endif #include -#include +#include #include +#include #include #include #include "SimMemTrace.h" +// Global singleton instance +static std::unique_ptr reader; + MemTraceReader::MemTraceReader(const std::string &filename) { char cwd[4096]; if (getcwd(cwd, sizeof(cwd))) { @@ -34,10 +38,21 @@ void MemTraceReader::parse() { printf("MemTraceReader: started parsing\n"); - while (infile >> line.cycle >> line.loadstore >> line.core_id >> + long size = 0; + std::string loadstore; // FIXME: likely slow + while (infile >> line.cycle >> loadstore >> line.core_id >> line.lane_id >> std::hex >> line.address >> line.data >> std::dec >> - line.data_size) { + size) { line.valid = true; + + line.is_store = (loadstore == "STORE"); + + assert(size > 0 && "invalid size in trace"); + int lgsize = static_cast(log2(size)); + assert((size & ~(~0lu << lgsize)) == 0 && + "non-power-of-2 size detected in trace"); + line.log_data_size = lgsize; + trace.push_back(line); } read_pos = trace.cbegin(); @@ -75,8 +90,9 @@ MemTraceLine MemTraceReader::read_trace_at(const long cycle, // read it right now. return MemTraceLine{}; } else if (line.cycle == cycle && line.lane_id == lane_id) { - printf("fire! cycle=%ld, valid=%d, %s addr=%x \n", cycle, line.valid, - line.loadstore, line.address); + printf("fire! cycle=%ld, valid=%d, %s addr=%lx, size=%d \n", cycle, + line.valid, (line.is_store ? "STORE" : "LOAD"), line.address, + line.log_data_size); // FIXME! Currently lane_id is assumed to be in round-robin order, e.g. // 0->1->2->3->0->..., both in the trace file and the order the caller calls @@ -119,11 +135,11 @@ extern "C" void memtrace_init(const char *filename) { // TODO: accept core_id as well extern "C" void memtrace_query(unsigned char trace_read_ready, unsigned long trace_read_cycle, - int trace_read_lane_id, + int trace_read_lane_id, unsigned char *trace_read_valid, unsigned long *trace_read_address, unsigned char *trace_read_is_store, - int *trace_read_store_mask, + int *trace_read_size, unsigned long *trace_read_data, unsigned char *trace_read_finished) { // printf("memtrace_query(cycle=%ld, tid=%d)\n", trace_read_cycle, @@ -136,8 +152,8 @@ extern "C" void memtrace_query(unsigned char trace_read_ready, auto line = reader->read_trace_at(trace_read_cycle, trace_read_lane_id); *trace_read_valid = line.valid; *trace_read_address = line.address; - *trace_read_is_store = strcmp(line.loadstore, "STORE") == 0 ; - *trace_read_store_mask = line.data_size; + *trace_read_is_store = line.is_store; + *trace_read_size = line.log_data_size; *trace_read_data = line.data; // This means finished and valid will go up at the same cycle. Need to // handle this without skipping the last line. diff --git a/src/main/resources/csrc/SimMemTrace.h b/src/main/resources/csrc/SimMemTrace.h index b046fcc..adc5ef0 100644 --- a/src/main/resources/csrc/SimMemTrace.h +++ b/src/main/resources/csrc/SimMemTrace.h @@ -2,20 +2,16 @@ #include #include -class MemTraceReader; - -// Global singleton instance of MemTraceReader -static std::unique_ptr reader; - struct MemTraceLine { bool valid = false; long cycle = 0; - char loadstore[10]; int core_id = 0; int lane_id = 0; + int source = 0; unsigned long address = 0; + bool is_store = 0; unsigned long data = 0; - int data_size = 0; + int log_data_size = 0; }; class MemTraceReader { @@ -31,14 +27,34 @@ public: std::vector::const_iterator read_pos; }; +class MemTraceWriter { +public: + MemTraceWriter(const bool is_response, const std::string &filename); + ~MemTraceWriter(); + void write_line_to_trace(const MemTraceLine line); + + bool is_response; + FILE *outfile; +}; + extern "C" void memtrace_init(const char *filename); extern "C" void memtrace_query(unsigned char trace_read_ready, unsigned long trace_read_cycle, - int trace_read_lane_id, + int trace_read_lane_id, unsigned char *trace_read_valid, unsigned long *trace_read_address, unsigned char *trace_read_is_store, - int *trace_read_store_mask, + int *trace_read_size, unsigned long *trace_read_data, - unsigned char *trace_read_finished - ); + unsigned char *trace_read_finished); +extern "C" int memtracelogger_init(int is_response, const char *filename); +extern "C" void memtracelogger_log(int handle, + unsigned char trace_log_valid, + unsigned long trace_log_cycle, + int trace_log_lane_id, + int trace_log_source, + unsigned long trace_log_address, + unsigned char trace_log_is_store, + int trace_log_size, + unsigned long trace_log_data, + unsigned char *trace_log_ready); diff --git a/src/main/resources/csrc/SimMemTraceLogger.cc b/src/main/resources/csrc/SimMemTraceLogger.cc new file mode 100644 index 0000000..1226143 --- /dev/null +++ b/src/main/resources/csrc/SimMemTraceLogger.cc @@ -0,0 +1,107 @@ +#ifndef NO_VPI +#include +#include +#endif +#include "SimMemTrace.h" +#include +#include +#include +#include +#include +#include + +// Contains handle for every logger that is instantiated per Verilog module +// instance +static std::vector> loggers; + +MemTraceWriter::MemTraceWriter(const bool is_response, + const std::string &filename) { + this->is_response = is_response; + + char cwd[4096]; + if (getcwd(cwd, sizeof(cwd))) { + printf("MemTraceWriter: current working dir: %s\n", cwd); + } + + outfile = fopen(filename.c_str(), "w"); + if (!outfile) { + fprintf(stderr, "failed to open file %s\n", filename.c_str()); + } +} + +MemTraceWriter::~MemTraceWriter() { + fclose(outfile); + printf("MemTraceWriter destroyed\n"); +} + +void MemTraceWriter::write_line_to_trace(const MemTraceLine line) { + fprintf(outfile, "%ld %s %d %d %d 0x%lx 0x%lx %u\n", line.cycle, + (line.is_store ? "STORE" : "LOAD"), line.core_id, line.lane_id, + line.source, line.address, line.data, (1u << line.log_data_size)); +} + +// Returns the "handle" ID for this particular logger instance. +extern "C" int memtracelogger_init(int is_response, const char *filename) { +#ifndef NO_VPI + s_vpi_vlog_info info; + if (!vpi_get_vlog_info(&info)) { + fprintf(stderr, "fatal: failed to get plusargs from VCS\n"); + exit(1); + } + const char *TRACEFILENAME_PLUSARG = "+memtracefile="; + for (int i = 0; i < info.argc; i++) { + char *input_arg = info.argv[i]; + if (strncmp(input_arg, TRACEFILENAME_PLUSARG, + strlen(TRACEFILENAME_PLUSARG)) == 0) { + filename = input_arg + strlen(TRACEFILENAME_PLUSARG); + break; + } + } +#endif + + int handle = loggers.size(); + loggers.emplace_back(std::make_unique(is_response, filename)); + + printf("memtracelogger_init: handle=%d, is_response=%d, filename=[%s]\n", + handle, is_response, filename); + + return handle; +} + +// This is used to log both TileLink A and D channels. +// TODO: accept core_id as well +extern "C" void memtracelogger_log(int handle, + unsigned char trace_log_valid, + unsigned long trace_log_cycle, + int trace_log_lane_id, + int trace_log_source, + unsigned long trace_log_address, + unsigned char trace_log_is_store, + int trace_log_size, + unsigned long trace_log_data, + unsigned char *trace_log_ready) { + // printf("memtrace_query(cycle=%ld, tid=%d)\n", trace_read_cycle, + // trace_read_lane_id); + *trace_log_ready = 1; + + if (!trace_log_valid) { + return; + } + + // printf("%s: [%lu] valid: address=%lx, tid=%u, size=%d\n", __func__, + // trace_log_cycle, trace_log_address, trace_log_lane_id, trace_log_size); + + MemTraceLine line{.valid = (trace_log_valid == 1), + .cycle = static_cast(trace_log_cycle), + .core_id = 0, // TODO support multicores + .lane_id = trace_log_lane_id, + .source = trace_log_source, + .address = trace_log_address, + .is_store = (trace_log_is_store == 1), + .data = trace_log_data, + .log_data_size = trace_log_size}; + + assert(0 <= handle && handle < loggers.size() && "wrong trace logger handle"); + auto logger = loggers[handle].get(); + logger->write_line_to_trace(line); +} diff --git a/src/main/resources/vsrc/SimMemTrace.v b/src/main/resources/vsrc/SimMemTrace.v index 9a91848..1fc858a 100644 --- a/src/main/resources/vsrc/SimMemTrace.v +++ b/src/main/resources/vsrc/SimMemTrace.v @@ -1,6 +1,7 @@ +// FIXME hardcoded `define DATA_WIDTH 64 `define MAX_NUM_LANES 32 -`define MASK_WIDTH 8 +`define LOGSIZE_WIDTH 8 import "DPI-C" function void memtrace_init( input string filename @@ -14,35 +15,34 @@ import "DPI-C" function void memtrace_query ( input bit trace_read_ready, input longint trace_read_cycle, - input int trace_read_tid, + input int trace_read_lane_id, output bit trace_read_valid, output longint trace_read_address, output bit trace_read_is_store, - output int trace_read_store_mask, + output int trace_read_size, output longint trace_read_data, output bit trace_read_finished ); module SimMemTrace #(parameter FILENAME = "undefined", NUM_LANES = 4) ( - input clock, - input reset, + input clock, + input reset, - // These have to match the IO port of the Chisel wrapper module. - input trace_read_ready, - output [NUM_LANES-1:0] trace_read_valid, - output [`DATA_WIDTH*NUM_LANES-1:0] trace_read_address, - - output [NUM_LANES-1:0] trace_read_is_store, - output [NUM_LANES*`MASK_WIDTH-1:0] trace_read_store_mask, - output [`DATA_WIDTH*NUM_LANES-1:0] trace_read_data, - output trace_read_finished + // These have to match the IO port name of the Chisel wrapper module. + input trace_read_ready, + output [NUM_LANES-1:0] trace_read_valid, + output [`DATA_WIDTH*NUM_LANES-1:0] trace_read_address, + output [NUM_LANES-1:0] trace_read_is_store, + output [`LOGSIZE_WIDTH*NUM_LANES-1:0] trace_read_size, + output [`DATA_WIDTH*NUM_LANES-1:0] trace_read_data, + output trace_read_finished ); - bit __in_valid[NUM_LANES-1:0]; - longint __in_address[NUM_LANES-1:0]; + bit __in_valid [NUM_LANES-1:0]; + longint __in_address [NUM_LANES-1:0]; - bit __in_is_store[NUM_LANES-1:0]; - int __in_store_mask [NUM_LANES-1:0]; - longint __in_data[NUM_LANES-1:0]; + bit __in_is_store [NUM_LANES-1:0]; + reg [`LOGSIZE_WIDTH-1:0] __in_size [NUM_LANES-1:0]; + longint __in_data [NUM_LANES-1:0]; bit __in_finished; string __uartlog; @@ -54,13 +54,13 @@ module SimMemTrace #(parameter FILENAME = "undefined", NUM_LANES = 4) ( assign next_cycle_counter = cycle_counter + 1'b1; // registers that stage outputs of the C parser - reg [NUM_LANES-1:0] __in_valid_reg; + reg [NUM_LANES-1:0] __in_valid_reg; reg [`DATA_WIDTH-1:0] __in_address_reg [NUM_LANES-1:0]; - reg [NUM_LANES-1:0] __in_is_store_reg; - reg [`MASK_WIDTH-1:0] __in_store_mask_reg [NUM_LANES-1:0]; - reg [`DATA_WIDTH-1:0] __in_data_reg [NUM_LANES-1:0]; - reg __in_finished_reg; + reg [NUM_LANES-1:0] __in_is_store_reg; + reg [`LOGSIZE_WIDTH-1:0] __in_size_reg [NUM_LANES-1:0]; + reg [`DATA_WIDTH-1:0] __in_data_reg [NUM_LANES-1:0]; + reg __in_finished_reg; genvar g; @@ -70,7 +70,7 @@ module SimMemTrace #(parameter FILENAME = "undefined", NUM_LANES = 4) ( assign trace_read_address[`DATA_WIDTH*(g+1)-1:`DATA_WIDTH*g] = __in_address_reg[g]; assign trace_read_is_store[g] = __in_is_store_reg[g]; - assign trace_read_store_mask[`MASK_WIDTH*(g+1)-1:`MASK_WIDTH*g] = __in_store_mask_reg[g]; + assign trace_read_size[`LOGSIZE_WIDTH*(g+1)-1:`LOGSIZE_WIDTH*g] = __in_size_reg[g]; assign trace_read_data[`DATA_WIDTH*(g+1)-1:`DATA_WIDTH*g] = __in_data_reg[g]; end endgenerate @@ -81,17 +81,14 @@ module SimMemTrace #(parameter FILENAME = "undefined", NUM_LANES = 4) ( memtrace_init(FILENAME); end - // Evaluate the signals on the positive edge always @(posedge clock) begin - - // Setting reset value if (reset) begin for (integer tid = 0; tid < NUM_LANES; tid = tid + 1) begin __in_valid[tid] = 1'b0; __in_address[tid] = `DATA_WIDTH'b0; __in_is_store[tid] = 1'b0; - __in_store_mask[tid] = `MASK_WIDTH'b0; + __in_size[tid] = `LOGSIZE_WIDTH'b0; __in_data[tid] = `DATA_WIDTH'b0; end @@ -105,7 +102,7 @@ module SimMemTrace #(parameter FILENAME = "undefined", NUM_LANES = 4) ( __in_address_reg[tid] <= `DATA_WIDTH'b0; __in_is_store_reg[tid] = 1'b0; - __in_store_mask_reg[tid] = `MASK_WIDTH'b0; + __in_size_reg[tid] = `LOGSIZE_WIDTH'b0; __in_data_reg[tid] = `DATA_WIDTH'b0; end @@ -127,7 +124,7 @@ module SimMemTrace #(parameter FILENAME = "undefined", NUM_LANES = 4) ( __in_address[tid], __in_is_store[tid], - __in_store_mask[tid], + __in_size[tid], __in_data[tid], __in_finished @@ -140,7 +137,7 @@ module SimMemTrace #(parameter FILENAME = "undefined", NUM_LANES = 4) ( __in_address_reg[tid] <= __in_address[tid]; __in_is_store_reg[tid] <= __in_is_store[tid]; - __in_store_mask_reg[tid] <= __in_store_mask[tid]; + __in_size_reg[tid] <= __in_size[tid]; __in_data_reg[tid] <= __in_data[tid]; end __in_finished_reg <= __in_finished; diff --git a/src/main/resources/vsrc/SimMemTraceLogger.v b/src/main/resources/vsrc/SimMemTraceLogger.v new file mode 100644 index 0000000..3ffa794 --- /dev/null +++ b/src/main/resources/vsrc/SimMemTraceLogger.v @@ -0,0 +1,106 @@ +// FIXME hardcoded +`define DATA_WIDTH 64 +`define MAX_NUM_LANES 32 +`define SOURCEID_WIDTH 32 +`define LOGSIZE_WIDTH 8 + +import "DPI-C" function int memtracelogger_init( + input bit is_response, + input string filename +); + +// Make sure to sync the parameters for: +// (1) import "DPI-C" declaration +// (2) C function declaration +// (3) DPI function calls inside initial/always blocks +import "DPI-C" function void memtracelogger_log +( + input int handle, + input bit trace_log_valid, + input longint trace_log_cycle, + input int trace_log_lane_id, + input int trace_log_source, + input longint trace_log_address, + input bit trace_log_is_store, + input int trace_log_size, + input longint trace_log_data, + output bit trace_log_ready +); + +module SimMemTraceLogger #(parameter + IS_RESPONSE = 0, + FILENAME = "undefined", + NUM_LANES = 4) ( + input clock, + input reset, + + // NOTE: LSB is lane 0 + input [NUM_LANES-1:0] trace_log_valid, + input [`SOURCEID_WIDTH*NUM_LANES-1:0] trace_log_source, + input [`DATA_WIDTH*NUM_LANES-1:0] trace_log_address, + input [NUM_LANES-1:0] trace_log_is_store, + input [`LOGSIZE_WIDTH*NUM_LANES-1:0] trace_log_size, + input [`DATA_WIDTH*NUM_LANES-1:0] trace_log_data, + output trace_log_ready +); + int logger_handle; + bit __in_ready; + + // cycle_counter will start off right after reset is deasserted which should + // synchronize itself with SimMemTrace.cycle_counter + reg [`DATA_WIDTH-1:0] cycle_counter; + wire [`DATA_WIDTH-1:0] next_cycle_counter; + assign next_cycle_counter = cycle_counter + 1'b1; + + // wires going into the DPC + wire __valid [NUM_LANES-1:0]; + wire [`SOURCEID_WIDTH-1:0] __source [NUM_LANES-1:0]; + wire [`DATA_WIDTH-1:0] __address [NUM_LANES-1:0]; + wire __is_store [NUM_LANES-1:0]; + wire [`LOGSIZE_WIDTH-1:0] __size [NUM_LANES-1:0]; + wire [`DATA_WIDTH-1:0] __data [NUM_LANES-1:0]; + + assign trace_log_ready = __in_ready; + + genvar g; + generate + for (g = 0; g < NUM_LANES; g = g + 1) begin + // LSB is lane 0 + assign __valid[g] = trace_log_valid[g]; + assign __source[g] = trace_log_source[`SOURCEID_WIDTH*(g+1)-1:`SOURCEID_WIDTH*g]; + assign __address[g] = trace_log_address[`DATA_WIDTH*(g+1)-1:`DATA_WIDTH*g]; + assign __is_store[g] = trace_log_is_store[g]; + assign __size[g] = trace_log_size[`LOGSIZE_WIDTH*(g+1)-1:`LOGSIZE_WIDTH*g]; + assign __data[g] = trace_log_data[`DATA_WIDTH*(g+1)-1:`DATA_WIDTH*g]; + end + endgenerate + + initial begin + /* $value$plusargs("uartlog=%s", __uartlog); */ + logger_handle = memtracelogger_init(IS_RESPONSE, FILENAME); + end + + always @(posedge clock) begin + if (reset) begin + __in_ready = 1'b1; + cycle_counter <= `DATA_WIDTH'b0; + end else begin + cycle_counter <= next_cycle_counter; + + for (integer tid = 0; tid < NUM_LANES; tid = tid + 1) begin + memtracelogger_log( + logger_handle, + __valid[tid], + cycle_counter, + tid, + __source[tid], + __address[tid], + __is_store[tid], + __size[tid], + __data[tid], + __in_ready + ); + end + end + end +endmodule diff --git a/src/main/scala/tilelink/Coalescing.scala b/src/main/scala/tilelink/Coalescing.scala index 4c23e25..f9a672b 100644 --- a/src/main/scala/tilelink/Coalescing.scala +++ b/src/main/scala/tilelink/Coalescing.scala @@ -106,7 +106,6 @@ class MultiCoalescer[QueueT: CoalShiftQueue] }) } - class CoalescingUnitImp(outer: CoalescingUnit, numLanes: Int) extends LazyModuleImp(outer) { // Make sure IdentityNode is connected to an upstream node, not just the // coalescer TL master node @@ -126,11 +125,10 @@ class CoalescingUnitImp(outer: CoalescingUnit, numLanes: Int) extends LazyModule // The maximum number of requests from a single lane that can go into a // coalesced request. Upper bound is min(DEPTH, 2**sourceWidth). - val numPerLaneReqs = DEPTH - + val numPerLaneReqs = CoalescerConsts.DEPTH val coalescer = Module(new MultiCoalescer(Seq(4, 5, 6), reqQueues, reqQueueEntryT)) - val respQueueEntryT = new RespQueueEntry(sourceWidth, wordSize * 8) + val respQueueEntryT = new RespQueueEntry(sourceWidth, CoalescerConsts.WORD_SIZE * 8, sizeWidth) val respQueues = Seq.tabulate(numLanes) { _ => Module( new MultiPortQueue( @@ -181,85 +179,95 @@ class CoalescingUnitImp(outer: CoalescingUnit, numLanes: Int) extends LazyModule val reqQueue = reqQueues(lane) val req = Wire(reqQueueEntryT) - // ********** - // CONNECTING IO - // ********** + req.op := TLUtils.AOpcodeIsStore(tlIn.a.bits.opcode) + req.source := tlIn.a.bits.source + req.address := tlIn.a.bits.address + req.data := tlIn.a.bits.data + req.size := tlIn.a.bits.size - assert(~tlIn.a.valid || (tlIn.a.bits.opcode === OpCode.Get || tlIn.a.bits.opcode === OpCode.PutFullData || - tlIn.a.bits.opcode === OpCode.PutPartialData), "Coalescer input has unsupported TL opcode"); - req.op := tlIn.a.bits.opcode === OpCode.Get ? 0.U : 1.U - req.source := tlIn.a.bits.source - req.address := tlIn.a.bits.address - req.data := tlIn.a.bits.data - req.size := tlIn.a.bits.size + assert(reqQueue.io.queue.enq.ready, "reqQueue is supposed to be always ready") + reqQueue.io.queue.enq.valid := tlIn.a.valid + reqQueue.io.queue.enq.bits := req + // TODO: deq.ready should respect downstream ready + reqQueue.io.queue.deq.ready := true.B + reqQueue.io.invalidate.bits := coalescer.io.invalidate.bits(lane) + reqQueue.io.invalidate.valid := coalescer.io.invalidate.valid - reqQueue.io.enq.valid := tlIn.a.valid - reqQueue.io.enq.bits := req - // TODO: deq.ready should respect downstream ready - reqQueue.io.deq.ready := true.B - reqQueue.io.invalidate.bits := 0.U // TODO - reqQueue.io.invalidate.valid := false.B // TODO - printf(s"reqQueue(${lane}).count=%d\n", reqQueue.io.count) + tlOut.a.valid := reqQueue.io.queue.deq.valid - val reqHead = reqQueue.io.deq.bits - // FIXME: generate Get or Put according to read/write - val (reqLegal, reqBits) = edgeOut.Get( - fromSource = reqHead.source, - // `toAddress` should be aligned to 2**lgSize - toAddress = reqHead.address, - lgSize = 0.U - ) - assert(reqLegal, "unhandled illegal TL req gen") + val reqHead = reqQueue.io.queue.deq.bits + val (plegal, pbits) = edgeOut.Put( + fromSource = reqHead.source, + toAddress = reqHead.address, + lgSize = reqHead.size, + // data is already aligned by MemTraceDriver + // NOTE: if tlIn has different parameters, this will no longer be the + // case + data = reqHead.data, + mask = reqHead.mask + ) + val (glegal, gbits) = edgeOut.Get( + fromSource = reqHead.source, + toAddress = reqHead.address, + lgSize = reqHead.size + ) + val legal = Mux(reqHead.op.asBool, plegal, glegal) + val bits = Mux(reqHead.op.asBool, pbits, gbits) + assert(legal, "unhandled illegal TL req gen") + tlOut.a.bits := bits - tlOut.a.bits := reqBits // TODO: this is incorrect, this does not take iinto account of queue - tlOut.a.valid := reqQueue.io.deq.valid + // Response queue + // + // This queue will serialize non-coalesced responses along with + // coalesced responses and serve them back to the core side. + val respQueue = respQueues(lane) + val resp = Wire(respQueueEntryT) + resp.source := tlOut.d.bits.source + resp.op := TLUtils.DOpcodeIsStore(tlOut.d.bits.opcode) + resp.size := tlOut.d.bits.size + resp.data := tlOut.d.bits.data - // Response queue - // - // This queue will serialize non-coalesced responses along with - // coalesced responses and serve them back to the core side. - val respQueue = respQueues(lane) - val resp = Wire(respQueueEntryT) - resp.source := tlOut.d.bits.source - resp.data := tlOut.d.bits.data - // TODO: read/write bit? + // Queue up responses that didn't get coalesced originally ("noncoalesced" responses). + // Coalesced (but uncoalesced back) responses will also be enqueued into the same queue. + assert( + respQueue.io.enq(respQueueNoncoalPort).ready, + "respQueue: enq port for noncoalesced response is blocked" + ) + respQueue.io.enq(respQueueNoncoalPort).valid := tlOut.d.valid + respQueue.io.enq(respQueueNoncoalPort).bits := resp + // TODO: deq.ready should respect upstream ready + respQueue.io.deq(respQueueNoncoalPort).ready := true.B - // Queue up responses that didn't get coalesced originally ("noncoalesced" responses). - // Coalesced (but uncoalesced back) responses will also be enqueued into the same queue. - assert( - respQueue.io.enq(respQueueNoncoalPort).ready, - "respQueue: enq port for noncoalesced response is blocked" - ) - respQueue.io.enq(respQueueNoncoalPort).valid := tlOut.d.valid - respQueue.io.enq(respQueueNoncoalPort).bits := resp - // TODO: deq.ready should respect upstream ready - respQueue.io.deq(respQueueNoncoalPort).ready := true.B + tlIn.d.valid := respQueue.io.deq(respQueueNoncoalPort).valid + val respHead = respQueue.io.deq(respQueueNoncoalPort).bits + val apBits = edgeIn.AccessAck( + toSource = respHead.source, + lgSize = respHead.size + ) + val agBits = edgeIn.AccessAck( + toSource = respHead.source, + lgSize = respHead.size, + data = respHead.data + ) + val respBits = Mux(respHead.isStore, apBits, agBits) + tlIn.d.bits := respBits - tlIn.d.valid := respQueue.io.deq(respQueueNoncoalPort).valid - val respHead = respQueue.io.deq(respQueueNoncoalPort).bits - val respBits = edgeIn.AccessAck( - toSource = respHead.source, - lgSize = 0.U, - data = respHead.data - ) - tlIn.d.bits := respBits - - // Debug only - val inflightCounter = RegInit(UInt(32.W), 0.U) - when(tlOut.a.valid) { - // don't inc/dec on simultaneous req/resp - when(!tlOut.d.valid) { - inflightCounter := inflightCounter + 1.U + // Debug only + val inflightCounter = RegInit(UInt(32.W), 0.U) + when(tlOut.a.valid) { + // don't inc/dec on simultaneous req/resp + when(!tlOut.d.valid) { + inflightCounter := inflightCounter + 1.U + } + }.elsewhen(tlOut.d.valid) { + inflightCounter := inflightCounter - 1.U } - }.elsewhen(tlOut.d.valid) { - inflightCounter := inflightCounter - 1.U - } - dontTouch(inflightCounter) - dontTouch(tlIn.a) - dontTouch(tlIn.d) - dontTouch(tlOut.a) - dontTouch(tlOut.d) + dontTouch(inflightCounter) + dontTouch(tlIn.a) + dontTouch(tlIn.d) + dontTouch(tlOut.a) + dontTouch(tlOut.d) } // Generate coalesced requests @@ -268,24 +276,30 @@ class CoalescingUnitImp(outer: CoalescingUnit, numLanes: Int) extends LazyModule val (tlCoal, edgeCoal) = outer.coalescerNode.out(0) val coalReqAddress = Wire(UInt(tlCoal.params.addressBits.W)) - // TODO: bogus address + // FIXME: bogus address coalReqAddress := (0xabcd.U + coalSourceId) << 4 - // FIXME: coalesce lane 0 and lane 2's queue head whenever they're valid + // FIXME: bogus coalescing logic: coalesce whenever all 4 lanes have valid + // queue head coalReqValid := reqQueues(0).io.deq.valid && reqQueues(1).io.deq.valid && reqQueues(2).io.deq.valid && reqQueues(3).io.deq.valid + // coalReqValid := false.B when(coalReqValid) { // invalidate original requests due to coalescing + // FIXME: bogus reqQueues(0).io.invalidate := 0x1.U reqQueues(1).io.invalidate := 0x1.U reqQueues(2).io.invalidate := 0x1.U reqQueues(3).io.invalidate := 0x1.U + printf("coalescing succeeded!\n") } + // TODO: write request val (legal, bits) = edgeCoal.Get( fromSource = coalSourceId, // `toAddress` should be aligned to 2**lgSize toAddress = coalReqAddress, // 64 bits = 8 bytes = 2**(3) bytes + // TODO: parameterize to eg. cache line size lgSize = 3.U ) assert(legal, "unhandled illegal TL req gen") @@ -298,19 +312,24 @@ class CoalescingUnitImp(outer: CoalescingUnit, numLanes: Int) extends LazyModule // Construct new entry for the inflight table // FIXME: don't instantiate inflight table entry type here. It leaks the table's impl - // detail outside to the coalescer + // detail to the coalescer val offsetBits = 4 // FIXME hardcoded val sizeBits = 2 // FIXME hardcoded val newEntry = Wire( new InflightCoalReqTableEntry(numLanes, numPerLaneReqs, sourceWidth, offsetBits, sizeBits) ) + + println(s"=========== table sourceWidth: ${sourceWidth}") + println(s"=========== table sizeBits: ${sizeBits}") + newEntry.source := coalSourceId newEntry.lanes.foreach { l => - l.reqs.foreach { r => + l.reqs.zipWithIndex.foreach { case (r, i) => // TODO: this part needs the actual coalescing logic to work r.valid := false.B + r.source := i.U // FIXME bogus r.offset := 1.U - r.size := 2.U + r.size := 2.U // FIXME hardcoded } } newEntry.lanes(0).reqs(0).valid := true.B @@ -326,6 +345,7 @@ class CoalescingUnitImp(outer: CoalescingUnit, numLanes: Int) extends LazyModule numLanes, numPerLaneReqs, sourceWidth, + sizeWidth, coalDataWidth, outer.numInflightCoalRequests ) @@ -337,6 +357,8 @@ class CoalescingUnitImp(outer: CoalescingUnit, numLanes: Int) extends LazyModule uncoalescer.io.coalRespSrcId := tlCoal.d.bits.source uncoalescer.io.coalRespData := tlCoal.d.bits.data + println(s"=========== coalRespData width: ${tlCoal.d.bits.data.widthOption.get}") + // Queue up synthesized uncoalesced responses into each lane's response queue (respQueues zip uncoalescer.io.uncoalResps).foreach { case (q, lanes) => lanes.zipWithIndex.foreach { case (resp, i) => @@ -353,7 +375,6 @@ class CoalescingUnitImp(outer: CoalescingUnit, numLanes: Int) extends LazyModule } // Debug - dontTouch(coalReqValid) dontTouch(coalReqAddress) val coalRespData = tlCoal.d.bits.data dontTouch(coalRespData) @@ -363,17 +384,16 @@ class CoalescingUnitImp(outer: CoalescingUnit, numLanes: Int) extends LazyModule } class UncoalescingUnit( - val numLanes: Int, - val numPerLaneReqs: Int, - val sourceWidth: Int, - val coalDataWidth: Int, - val numInflightCoalRequests: Int - ) extends Module { + val numLanes: Int, + val numPerLaneReqs: Int, + val sourceWidth: Int, + val sizeWidth: Int, + val coalDataWidth: Int, + val numInflightCoalRequests: Int +) extends Module { val inflightTable = Module( new InflightCoalReqTable(numLanes, numPerLaneReqs, sourceWidth, numInflightCoalRequests) ) - val wordSize = 4 // FIXME duplicate - val io = IO(new Bundle { val coalReqValid = Input(Bool()) val newEntry = Input(inflightTable.entryT) @@ -381,7 +401,10 @@ class UncoalescingUnit( val coalRespSrcId = Input(UInt(sourceWidth.W)) val coalRespData = Input(UInt(coalDataWidth.W)) val uncoalResps = Output( - Vec(numLanes, Vec(numPerLaneReqs, ValidIO(new RespQueueEntry(sourceWidth, wordSize * 8)))) + Vec( + numLanes, + Vec(numPerLaneReqs, ValidIO(new RespQueueEntry(sourceWidth, WordSizeInBytes() * 8, sizeWidth))) + ) ) }) @@ -405,7 +428,10 @@ class UncoalescingUnit( def getCoalescedDataChunk(data: UInt, dataWidth: Int, offset: UInt, byteSize: Int): UInt = { val bitSize = byteSize * 8 val sizeMask = (1.U << bitSize) - 1.U - assert(dataWidth % bitSize == 0, "coalesced data width not evenly divisible by size") + assert( + dataWidth > 0 && dataWidth % bitSize == 0, + s"coalesced data width ($dataWidth) not evenly divisible by core req size ($bitSize)" + ) val numChunks = dataWidth / bitSize val chunks = Wire(Vec(numChunks, UInt(bitSize.W))) val offsets = (0 until numChunks) @@ -418,23 +444,22 @@ class UncoalescingUnit( // Un-coalesce responses back to individual lanes val found = inflightTable.io.lookup.bits - (found.lanes zip io.uncoalResps).foreach { case (lane, ioLane) => - lane.reqs.zipWithIndex.foreach { case (req, i) => - val ioReq = ioLane(i) + (found.lanes zip io.uncoalResps).foreach { case (perLane, ioPerLane) => + perLane.reqs.zipWithIndex.foreach { case (oldReq, i) => + val ioOldReq = ioPerLane(i) // FIXME: only looking at 0th srcId entry - ioReq.valid := false.B - ioReq.bits := DontCare + ioOldReq.valid := false.B + ioOldReq.bits := DontCare when(inflightTable.io.lookup.valid) { - ioReq.valid := req.valid - ioReq.bits.source := 0.U - + ioOldReq.valid := oldReq.valid + ioOldReq.bits.source := oldReq.source // FIXME: disregard size enum for now val byteSize = 4 - ioReq.bits.data := - getCoalescedDataChunk(io.coalRespData, coalDataWidth, req.offset, byteSize) + ioOldReq.bits.data := + getCoalescedDataChunk(io.coalRespData, coalDataWidth, oldReq.offset, byteSize) } } } @@ -478,6 +503,8 @@ class InflightCoalReqTable( table(i).valid := false.B table(i).bits.lanes.foreach { l => l.reqs.foreach { r => + r.valid := false.B + r.source := 0.U r.offset := 0.U r.size := 0.U } @@ -490,13 +517,12 @@ class InflightCoalReqTable( .map { i => table(i).valid } .reduce { (v0, v1) => v0 && v1 } // Inflight table should never be full. It should have enough number of - // entries to keep track of all outstanding core-side requests; otherwise, - // it will stall the core issuing logic. - assert(!full, "table is blocking coalescer") + // entries to keep track of all outstanding core-side requests, i.e. + // (2 ** oldSrcIdBits) entries. + assert(!full, "inflight table is full and blocking coalescer") dontTouch(full) // Enqueue logic - // io.enq.ready := !full val enqFire = io.enq.ready && io.enq.valid when(enqFire) { @@ -511,7 +537,6 @@ class InflightCoalReqTable( } // Lookup logic - // io.lookup.valid := table(io.lookupSourceId).valid io.lookup.bits := table(io.lookupSourceId).bits val lookupFire = io.lookup.ready && io.lookup.valid @@ -524,21 +549,23 @@ class InflightCoalReqTable( } class InflightCoalReqTableEntry( - val numLanes: Int, - // Maximum number of requests from a single lane that can get coalesced into a single request - val numPerLaneReqs: Int, - val sourceWidth: Int, - val offsetBits: Int, - val sizeBits: Int - ) extends Bundle { - class CoreReq extends Bundle { + + val numLanes: Int, + // Maximum number of requests from a single lane that can get coalesced into a single request + val numPerLaneReqs: Int, + val sourceWidth: Int, + val offsetBits: Int, + val sizeBits: Int +) extends Bundle { + class PerCoreReq extends Bundle { val valid = Bool() + // FIXME: oldId and newId shares the same width + val source = UInt(sourceWidth.W) val offset = UInt(offsetBits.W) val size = UInt(sizeBits.W) } class PerLane extends Bundle { - // FIXME: if numPerLaneReqs != 2 ** sourceWidth, we need to store srcId as well - val reqs = Vec(numPerLaneReqs, new CoreReq) + val reqs = Vec(numPerLaneReqs, new PerCoreReq) } // sourceId of the coalesced response that just came back. This will be the // key that queries the table. @@ -561,6 +588,8 @@ class CoalShiftQueue[T <: Data]( val invalidate = Input(Valid(UInt(entries.W))) val mask = Output(UInt(entries.W)) val elts = Output(Vec(entries, gen)) + // 'QueueIO' provides io.count, but we might not want to use it in the + // coalescer because it has potentially expensive PopCount }) private val valid = RegInit(VecInit(Seq.fill(entries) { false.B })) @@ -585,7 +614,7 @@ class CoalShiftQueue[T <: Data]( def paddedUsed = pad({ i: Int => used(i) }) def validAfterInv(i: Int) = valid(i) && !io.invalidate.bits(i) - val shift = io.queue.deq.ready || (used =/= 0.U) && !validAfterInv(0) + val shift = (used =/= 0.U) && (io.queue.deq.ready || !validAfterInv(0)) for (i <- 0 until entries) { val wdata = if (i == entries - 1) io.queue.enq.bits else Mux(!used(i + 1), io.queue.enq.bits, elts(i + 1)) val wen = Mux( @@ -635,14 +664,32 @@ class CoalShiftQueue[T <: Data]( io.queue.count := PopCount(io.mask) } -class MemTraceDriver(numLanes: Int = 4, traceFile : String = "vecadd.core1.thread4.trace")(implicit p: Parameters) extends LazyModule { +object TLUtils { + def AOpcodeIsStore(opcode: UInt): Bool = { + assert( + opcode === TLMessages.PutFullData || opcode === TLMessages.Get, + "unhandled TL A opcode found" + ) + Mux(opcode === TLMessages.PutFullData, true.B, false.B) + } + def DOpcodeIsStore(opcode: UInt): Bool = { + assert( + opcode === TLMessages.AccessAck || opcode === TLMessages.AccessAckData, + "unhandled TL D opcode found" + ) + Mux(opcode === TLMessages.AccessAck, true.B, false.B) + } +} +class MemTraceDriver(numLanes: Int = 4, filename: String = "vecadd.core1.thread4.trace")(implicit + p: Parameters +) extends LazyModule { // Create N client nodes together val laneNodes = Seq.tabulate(numLanes) { i => val clientParam = Seq( TLMasterParameters.v1( name = "MemTraceDriver" + i.toString, - sourceId = IdRange(0, 0x1000) + sourceId = IdRange(0, 0x10) // visibility = Seq(AddressSet(0x0000, 0xffffff)) ) ) @@ -654,23 +701,33 @@ class MemTraceDriver(numLanes: Int = 4, traceFile : String = "vecadd.core1.threa val node = TLIdentityNode() laneNodes.foreach { l => node := l } - lazy val module = new MemTraceDriverImp(this, numLanes, traceFile) + lazy val module = new MemTraceDriverImp(this, numLanes, filename) } -class TraceReq extends Bundle { +trait HasTraceLine { + val valid: UInt + val source: UInt + val address: UInt + val is_store: UInt + val size: UInt + val data: UInt +} + +// Used for both request and response. Response had address set to 0 +// NOTE: these widths have to agree with what's hardcoded in Verilog. +class TraceLine extends Bundle with HasTraceLine { val valid = Bool() - val address = UInt(64.W) + val source = UInt(32.W) + val address = UInt(64.W) // FIXME: in Verilog this is the same as data width val is_store = Bool() - val mask = UInt(8.W) + val size = UInt(8.W) // this is log2(bytesize) as in TL A bundle val data = UInt(64.W) } -class MemTraceDriverImp(outer: MemTraceDriver, numLanes: Int, traceFile : String) - extends LazyModuleImp(outer) +class MemTraceDriverImp(outer: MemTraceDriver, numLanes: Int, traceFile: String) + extends LazyModuleImp(outer) with UnitTestModule { - val sim = Module( - new SimMemTrace(traceFile, numLanes) - ) + val sim = Module(new SimMemTrace(traceFile, numLanes)) sim.io.clock := clock sim.io.reset := reset.asBool sim.io.trace_read.ready := true.B @@ -678,15 +735,18 @@ class MemTraceDriverImp(outer: MemTraceDriver, numLanes: Int, traceFile : String // Split output of SimMemTrace, which is flattened across all lanes, // back to each lane's. - // Maybe this part can be improved, since now we are still mannually shifting everything - val laneReqs = Wire(Vec(numLanes, new TraceReq)) + val laneReqs = Wire(Vec(numLanes, new TraceLine)) + val addrW = laneReqs(0).address.getWidth + val sizeW = laneReqs(0).size.getWidth + val dataW = laneReqs(0).data.getWidth laneReqs.zipWithIndex.foreach { case (req, i) => - req.valid := (sim.io.trace_read.valid >> i) - req.address := (sim.io.trace_read.address >> (64 * i)) - req.is_store := (sim.io.trace_read.is_store >> i) - req.mask := (sim.io.trace_read.store_mask >> (8 * i)) - req.data := (sim.io.trace_read.data >> (64 * i)) - + req.valid := sim.io.trace_read.valid(i) + // TODO: driver trace doesn't contain source id + req.source := 0.U + req.address := sim.io.trace_read.address(addrW * (i + 1) - 1, addrW * i) + req.is_store := sim.io.trace_read.is_store(i) + req.size := sim.io.trace_read.size(sizeW * (i + 1) - 1, sizeW * i) + req.data := sim.io.trace_read.data(dataW * (i + 1) - 1, dataW * i) } // To prevent collision of sourceId with a current in-flight message, @@ -695,31 +755,77 @@ class MemTraceDriverImp(outer: MemTraceDriver, numLanes: Int, traceFile : String val sourceIdCounter = RegInit(0.U(64.W)) sourceIdCounter := sourceIdCounter + 1.U - //Issue here is that Vortex mem range is not within Chipyard Mem range - //In default setting, all mem-req for program data must be within 0X80000000 -> 0X90000000 - // - def hashToValidPhyAddr(addr : UInt) : UInt = { - Cat(8.U(4.W), addr(27, 3), 0.U(3.W) ) + // Issue here is that Vortex mem range is not within Chipyard Mem range + // In default setting, all mem-req for program data must be within + // 0X80000000 -> 0X90000000 + def hashToValidPhyAddr(addr: UInt): UInt = { + Cat(8.U(4.W), addr(27, 0)) } - // Connect each lane to its respective TL node. + // Generate TL requests corresponding to the trace lines (outer.laneNodes zip laneReqs).foreach { case (node, req) => - val (tlOut, edge) = node.out(0) + // Core only makes accesses of granularity larger than a word, so we want + // the trace driver to act so as well. + // That means if req.size is smaller than word size, we need to pad data + // with zeros to generate a word-size request, and set mask accordingly. + val offsetInWord = req.address % WordSizeInBytes().U + val subword = req.size < log2Ceil(WordSizeInBytes()).U + val mask = Wire(UInt(WordSizeInBytes().W)) + val wordData = Wire(UInt((WordSizeInBytes() * 8).W)) + val sizeInBytes = Wire(UInt((sizeW + 1).W)) + sizeInBytes := (1.U) << req.size + mask := Mux(subword, (~((~0.U(64.W)) << sizeInBytes)) << offsetInWord, ~0.U) + wordData := Mux(subword, req.data << (offsetInWord * 8.U), req.data) + val wordAlignedAddress = req.address & ~((1 << log2Ceil(WordSizeInBytes())) - 1).U(addrW.W) + + assert( + req.size <= log2Ceil(WordSizeInBytes()).U, + s"trace driver currently does not support access sizes larger than word size (${WordSizeInBytes()})" + ) + val wordAlignedSize = 2.U // FIXME: hardcoded + + // when(req.valid && subword) { + // printf( + // "address=%x, size=%d, data=%x, addressMask=%x, wordAlignedAddress=%x, mask=%x, wordData=%x\n", + // req.address, + // req.size, + // req.data, + // ~((1 << log2Ceil(WordSizeInBytes())) - 1).U(addrW.W), + // wordAlignedAddress, + // mask, + // wordData + // ) + // } + + val (tlOut, edge) = node.out(0) val (plegal, pbits) = edge.Put( fromSource = sourceIdCounter, - toAddress = hashToValidPhyAddr(req.address), - lgSize = 3.U, - data = req.data + toAddress = hashToValidPhyAddr(wordAlignedAddress), + lgSize = wordAlignedSize, // trace line already holds log2(size) + // data should be aligned to beatBytes + data = (wordData << (8.U * (wordAlignedAddress % edge.manager.beatBytes.U))) ) val (glegal, gbits) = edge.Get( fromSource = sourceIdCounter, - toAddress = hashToValidPhyAddr(req.address), - lgSize = 3.U + toAddress = hashToValidPhyAddr(wordAlignedAddress), + lgSize = wordAlignedSize ) val legal = Mux(req.is_store, plegal, glegal) val bits = Mux(req.is_store, pbits, gbits) + when(tlOut.a.valid) { + TracePrintf( + "MemTraceDriver", + tlOut.a.bits.address, + tlOut.a.bits.size, + tlOut.a.bits.mask, + req.is_store, + tlOut.a.bits.data, + req.data + ) + } + assert(legal, "illegal TL req gen") tlOut.a.valid := req.valid tlOut.a.bits := bits @@ -728,19 +834,25 @@ class MemTraceDriverImp(outer: MemTraceDriver, numLanes: Int, traceFile : String tlOut.d.ready := true.B tlOut.e.valid := false.B + println(s"======= MemTraceDriver: TL data width: ${tlOut.params.dataBits}") + dontTouch(tlOut.a) dontTouch(tlOut.d) } - io.finished := sim.io.trace_read.finished - when(io.finished){ - assert(false.B, "\n\n\nsimulation Successfully finished\n\n\n (this assertion intentional fail upon MemTracer termination)") + // Give some slack time after trace EOF to the downstream system so that we + // make sure to receive all outstanding responses. + val finishCounter = RegInit(200.U(64.W)) + when(sim.io.trace_read.finished) { + finishCounter := finishCounter - 1.U } - - // Clock Counter, for debugging purpose - val clkcount = RegInit(0.U(64.W)) - clkcount := clkcount + 1.U - dontTouch(clkcount) + io.finished := (finishCounter === 0.U) + // when(io.finished) { + // assert( + // false.B, + // "\n\n\nsimulation Successfully finished\n\n\n (this assertion intentional fail upon MemTracer termination)" + // ) + // } } class SimMemTrace(filename: String, numLanes: Int) @@ -748,22 +860,27 @@ class SimMemTrace(filename: String, numLanes: Int) Map("FILENAME" -> filename, "NUM_LANES" -> numLanes) ) with HasBlackBoxResource { + val traceLineT = new TraceLine + val addrW = traceLineT.address.getWidth + val sizeW = traceLineT.size.getWidth + val dataW = traceLineT.data.getWidth + val io = IO(new Bundle { val clock = Input(Clock()) val reset = Input(Bool()) // These names have to match declarations in the Verilog code, eg. // trace_read_address. - val trace_read = new Bundle { + val trace_read = new Bundle { // can't use HasTraceLine because this doesn't have source val ready = Input(Bool()) val valid = Output(UInt(numLanes.W)) // Chisel can't interface with Verilog 2D port, so flatten all lanes into // single wide 1D array. // TODO: assumes 64-bit address. - val address = Output(UInt((64 * numLanes).W)) + val address = Output(UInt((addrW * numLanes).W)) val is_store = Output(UInt(numLanes.W)) - val store_mask = Output(UInt((8 * numLanes).W)) - val data = Output(UInt((64 * numLanes).W)) + val size = Output(UInt((sizeW * numLanes).W)) + val data = Output(UInt((dataW * numLanes).W)) val finished = Output(Bool()) } }) @@ -773,23 +890,336 @@ class SimMemTrace(filename: String, numLanes: Int) addResource("/csrc/SimMemTrace.h") } -class CoalConnectTrace(implicit p: Parameters) extends LazyModule { +class MemTraceLogger( + numLanes: Int, + // base filename for the generated trace files. full filename will be + // suffixed depending on `reqEnable`/`respEnable`/`loggerName`. + filename: String = "vecadd.core1.thread4.trace", + reqEnable: Boolean = true, + respEnable: Boolean = true, + // filename suffix that is unique to this logger module. + loggerName: String = ".logger" +)(implicit + p: Parameters +) extends LazyModule { + val node = TLIdentityNode() + + // val beatBytes = 8 // FIXME: hardcoded + // val node = TLManagerNode(Seq.tabulate(numLanes) { _ => + // TLSlavePortParameters.v1( + // Seq( + // TLSlaveParameters.v1( + // address = List(AddressSet(0x0000, 0xffffff)), // FIXME: hardcoded + // supportsGet = TransferSizes(1, beatBytes), + // supportsPutPartial = TransferSizes(1, beatBytes), + // supportsPutFull = TransferSizes(1, beatBytes) + // ) + // ), + // beatBytes = beatBytes + // ) + // }) + + // Copied from freechips.rocketchip.trailingZeros which only supports Scala + // integers + def trailingZeros(x: UInt): UInt = { + Mux(x === 0.U, x.widthOption.get.U, Log2(x & -x)) + } + + lazy val module = new Impl + class Impl extends LazyModuleImp(this) { + val io = IO(new Bundle { + val numReqs = Output(UInt(64.W)) + val numResps = Output(UInt(64.W)) + val reqBytes = Output(UInt(64.W)) + val respBytes = Output(UInt(64.W)) + }) + + val numReqs = RegInit(0.U(64.W)) + val numResps = RegInit(0.U(64.W)) + val reqBytes = RegInit(0.U(64.W)) + val respBytes = RegInit(0.U(64.W)) + io.numReqs := numReqs + io.numResps := numResps + io.reqBytes := reqBytes + io.respBytes := respBytes + + val simReq = + if (reqEnable) + Some(Module(new SimMemTraceLogger(false, s"${filename}.${loggerName}.req", numLanes))) + else None + val simResp = + if (respEnable) + Some(Module(new SimMemTraceLogger(true, s"${filename}.${loggerName}.resp", numLanes))) + else None + if (simReq.isDefined) { + simReq.get.io.clock := clock + simReq.get.io.reset := reset.asBool + } + if (simResp.isDefined) { + simResp.get.io.clock := clock + simResp.get.io.reset := reset.asBool + } + + val laneReqs = Wire(Vec(numLanes, new TraceLine)) + val laneResps = Wire(Vec(numLanes, new TraceLine)) + + assert( + numLanes == node.in.length, + "`numLanes` does not match the number of TL edges connected to the MemTraceLogger" + ) + + // snoop on the TileLink edges to log traffic + ((node.in zip node.out) zip (laneReqs zip laneResps)).foreach { + case (((tlIn, _), (tlOut, _)), (req, resp)) => + tlOut.a <> tlIn.a + tlIn.d <> tlOut.d + + // requests on TL A channel + // + req.valid := tlIn.a.valid + req.size := tlIn.a.bits.size + req.is_store := TLUtils.AOpcodeIsStore(tlIn.a.bits.opcode) + req.source := tlIn.a.bits.source + // TL always carries the exact unaligned address that the client + // originally requested, so no postprocessing required + req.address := tlIn.a.bits.address + + // TL data + // + // When tlIn.a.bits.size is smaller than the data bus width, need to + // figure out which byte lanes we actually accessed so that + // we can write that to the memory trace. + // See Section 4.5 Byte Lanes in spec 1.8.1 + + // This assert only holds true for PutFullData and not PutPartialData, + // where HIGH bits in the mask may not be contiguous. + assert( + PopCount(tlIn.a.bits.mask) === (1.U << tlIn.a.bits.size), + "mask HIGH bits do not match the TL size. This should have been handled by the TL generator logic" + ) + val trailingZerosInMask = trailingZeros(tlIn.a.bits.mask) + val mask = ~((~0.U) << (trailingZerosInMask * 8.U)) + req.data := mask & (tlIn.a.bits.data >> (trailingZerosInMask * 8.U)) + + when(req.valid) { + TracePrintf( + "MemTraceLogger", + tlIn.a.bits.address, + tlIn.a.bits.size, + tlIn.a.bits.mask, + req.is_store, + tlIn.a.bits.data, + req.data + ) + } + + // responses on TL D channel + // + resp.valid := tlOut.d.valid + resp.size := tlOut.d.bits.size + resp.is_store := TLUtils.DOpcodeIsStore(tlOut.d.bits.opcode) + resp.source := tlOut.d.bits.source + // NOTE: TL D channel doesn't carry address nor mask, so there's no easy + // way to figure out which bytes the master actually use. Since we + // don't care too much about addresses in the trace anyway, just store + // the entire bits. + resp.address := 0.U + resp.data := tlOut.d.bits.data + } + + // stats + val numReqsThisCycle = + laneReqs.map { l => Mux(l.valid, 1.U(64.W), 0.U(64.W)) }.reduce { (v0, v1) => v0 + v1 } + val numRespsThisCycle = + laneResps.map { l => Mux(l.valid, 1.U(64.W), 0.U(64.W)) }.reduce { (v0, v1) => v0 + v1 } + val reqBytesThisCycle = + laneReqs.map { l => Mux(l.valid, 1.U(64.W) << l.size, 0.U(64.W)) }.reduce { (b0, b1) => + b0 + b1 + } + val respBytesThisCycle = + laneResps.map { l => Mux(l.valid, 1.U(64.W) << l.size, 0.U(64.W)) }.reduce { (b0, b1) => + b0 + b1 + } + numReqs := numReqs + numReqsThisCycle + numResps := numResps + numRespsThisCycle + reqBytes := reqBytes + reqBytesThisCycle + respBytes := respBytes + respBytesThisCycle + + // Flatten per-lane signals to the Verilog blackbox input. + // + // This is a clunky workaround of the fact that Chisel doesn't allow partial + // assignment to a bitfield range of a wide signal. + def flattenTrace(traceLogIO: Bundle with HasTraceLine, perLane: Vec[TraceLine]) = { + // these will get optimized out + val vecValid = Wire(Vec(numLanes, chiselTypeOf(perLane(0).valid))) + val vecSource = Wire(Vec(numLanes, chiselTypeOf(perLane(0).source))) + val vecAddress = Wire(Vec(numLanes, chiselTypeOf(perLane(0).address))) + val vecIsStore = Wire(Vec(numLanes, chiselTypeOf(perLane(0).is_store))) + val vecSize = Wire(Vec(numLanes, chiselTypeOf(perLane(0).size))) + val vecData = Wire(Vec(numLanes, chiselTypeOf(perLane(0).data))) + perLane.zipWithIndex.foreach { case (l, i) => + vecValid(i) := l.valid + vecSource(i) := l.source + vecAddress(i) := l.address + vecIsStore(i) := l.is_store + vecSize(i) := l.size + vecData(i) := l.data + } + traceLogIO.valid := vecValid.asUInt + traceLogIO.source := vecSource.asUInt + traceLogIO.address := vecAddress.asUInt + traceLogIO.is_store := vecIsStore.asUInt + traceLogIO.size := vecSize.asUInt + traceLogIO.data := vecData.asUInt + } + + if (simReq.isDefined) { + flattenTrace(simReq.get.io.trace_log, laneReqs) + assert( + simReq.get.io.trace_log.ready === true.B, + "MemTraceLogger is expected to be always ready" + ) + } + if (simResp.isDefined) { + flattenTrace(simResp.get.io.trace_log, laneResps) + assert( + simResp.get.io.trace_log.ready === true.B, + "MemTraceLogger is expected to be always ready" + ) + } + } +} + +// MemTraceLogger is bidirectional, and `isResponse` is how the DPI module tells +// itself whether it's logging the request stream or the response stream. This +// is necessary because we have to generate slightly different trace format +// depending on this, e.g. response trace will not contain an address column. +class SimMemTraceLogger(isResponse: Boolean, filename: String, numLanes: Int) + extends BlackBox( + Map( + "IS_RESPONSE" -> (if (isResponse) 1 else 0), + "FILENAME" -> filename, + "NUM_LANES" -> numLanes + ) + ) + with HasBlackBoxResource { + val traceLineT = new TraceLine + val sourceW = traceLineT.source.getWidth + val addrW = traceLineT.address.getWidth + val sizeW = traceLineT.size.getWidth + val dataW = traceLineT.data.getWidth + + val io = IO(new Bundle { + val clock = Input(Clock()) + val reset = Input(Bool()) + + val trace_log = new Bundle with HasTraceLine { + val valid = Input(UInt(numLanes.W)) + val source = Input(UInt((sourceW * numLanes).W)) + // Chisel can't interface with Verilog 2D port, so flatten all lanes into + // single wide 1D array. + // TODO: assumes 64-bit address. + val address = Input(UInt((addrW * numLanes).W)) + val is_store = Input(UInt(numLanes.W)) + val size = Input(UInt((sizeW * numLanes).W)) + val data = Input(UInt((dataW * numLanes).W)) + val ready = Output(Bool()) + } + }) + + addResource("/vsrc/SimMemTraceLogger.v") + addResource("/csrc/SimMemTraceLogger.cc") + addResource("/csrc/SimMemTrace.h") +} + +class TracePrintf {} + +object TracePrintf { + def apply( + printer: String, + address: UInt, + size: UInt, + mask: UInt, + is_store: Bool, + tlData: UInt, + reqData: UInt + ) = { + printf(s"${printer}: TL addr=%x, size=%d, mask=%x, store=%d", address, size, mask, is_store) + when(is_store) { + printf(", tlData=%x, reqData=%x", tlData, reqData) + } + printf("\n") + } +} + +// Synthesizable unit tests + +// tracedriver --> coalescer --> tracelogger --> tlram +class TLRAMCoalescerLogger(implicit p: Parameters) extends LazyModule { + // TODO: use parameters for numLanes + val numLanes = 4 + val driver = LazyModule(new MemTraceDriver(numLanes)) + val coreSideLogger = LazyModule( + new MemTraceLogger(numLanes, loggerName = "coreside") + ) + val coal = LazyModule(new CoalescingUnit(numLanes)) + val memSideLogger = LazyModule(new MemTraceLogger(numLanes + 1, loggerName = "memside")) + val rams = Seq.fill(numLanes + 1)( // +1 for coalesced edge + LazyModule( + // NOTE: beatBytes here sets the data bitwidth of the upstream TileLink + // edges globally, by way of Diplomacy communicating the TL slave + // parameters to the upstream nodes. + new TLRAM(address = AddressSet(0x0000, 0xffffff), beatBytes = 8) + ) + ) + + memSideLogger.node :=* coal.node :=* coreSideLogger.node :=* driver.node + rams.foreach { r => r.node := memSideLogger.node } + + lazy val module = new Impl + class Impl extends LazyModuleImp(this) with UnitTestModule { + driver.module.io.start := io.start + io.finished := driver.module.io.finished + + when(io.finished) { + printf( + "numReqs=%d, numResps=%d, reqBytes=%d, respBytes=%d\n", + coreSideLogger.module.io.numReqs, + coreSideLogger.module.io.numResps, + coreSideLogger.module.io.reqBytes, + coreSideLogger.module.io.respBytes + ) + assert( + coreSideLogger.module.io.numReqs === coreSideLogger.module.io.numResps, + "FAIL: number of requests and responses to the coalescer do not match" + ) + } + } +} + +class TLRAMCoalescerLoggerTest(timeout: Int = 500000)(implicit p: Parameters) + extends UnitTest(timeout) { + val dut = Module(LazyModule(new TLRAMCoalescerLogger).module) + dut.io.start := io.start + io.finished := dut.io.finished +} + +// tracedriver --> coalescer --> tlram +class TLRAMCoalescer(implicit p: Parameters) extends LazyModule { // TODO: use parameters for numLanes val numLanes = 4 val coal = LazyModule(new CoalescingUnit(numLanes)) val driver = LazyModule(new MemTraceDriver(numLanes)) - - coal.node :=* driver.node - - // Use TLTestRAM as bogus downstream TL manager nodes - // TODO: swap this out with a memtrace logger - val rams = Seq.tabulate(numLanes + 1) { _ => + val rams = Seq.fill(numLanes + 1)( // +1 for coalesced edge LazyModule( - // TODO: properly propagate beatBytes? + // NOTE: beatBytes here sets the data bitwidth of the upstream TileLink + // edges globally, by way of Diplomacy communicating the TL slave + // parameters to the upstream nodes. new TLRAM(address = AddressSet(0x0000, 0xffffff), beatBytes = 8) ) - } - // Connect all (N+1) outputs of coal to separate TestRAM modules + ) + + coal.node :=* driver.node rams.foreach { r => r.node := coal.node } lazy val module = new Impl @@ -799,8 +1229,8 @@ class CoalConnectTrace(implicit p: Parameters) extends LazyModule { } } -class CoalescingUnitTest(timeout: Int = 500000)(implicit p: Parameters) extends UnitTest(timeout) { - val dut = Module(LazyModule(new CoalConnectTrace).module) +class TLRAMCoalescerTest(timeout: Int = 500000)(implicit p: Parameters) extends UnitTest(timeout) { + val dut = Module(LazyModule(new TLRAMCoalescer).module) dut.io.start := io.start io.finished := dut.io.finished } diff --git a/src/test/scala/coalescing/CoalescingUnitTest.scala b/src/test/scala/coalescing/CoalescingUnitTest.scala index 7d7099b..5f722c5 100644 --- a/src/test/scala/coalescing/CoalescingUnitTest.scala +++ b/src/test/scala/coalescing/CoalescingUnitTest.scala @@ -87,7 +87,7 @@ class CoalShiftQueueTest extends AnyFlatSpec with ChiselScalatestTester { c.io.invalidate.poke(0.U) // prepare - c.io.deq.ready.poke(false.B) + c.io.deq.ready.poke(true.B) c.io.enq.ready.expect(true.B) c.io.enq.valid.poke(true.B) c.io.enq.bits.poke(0x12.U) @@ -113,6 +113,45 @@ class CoalShiftQueueTest extends AnyFlatSpec with ChiselScalatestTester { } } + it should "work when enqueing and dequeueing simultaneously to a full queue" in { + test(new CoalShiftQueue(UInt(8.W), 1)) { c => + c.io.invalidate.poke(0.U) + + // prepare + c.io.deq.ready.poke(true.B) + c.io.enq.ready.expect(true.B) + c.io.enq.valid.poke(true.B) + c.io.enq.bits.poke(0x12.U) + c.clock.step() + // enqueue and dequeue simultaneously + c.io.deq.ready.poke(true.B) + c.io.enq.ready.expect(true.B) + c.io.enq.valid.poke(true.B) + c.io.enq.bits.poke(0x34.U) + c.io.deq.valid.expect(true.B) + c.io.deq.bits.expect(0x12.U) + c.clock.step() + // enqueue and dequeue simultaneously once more + c.io.deq.ready.poke(true.B) + c.io.enq.ready.expect(true.B) + c.io.enq.valid.poke(true.B) + c.io.enq.bits.poke(0x56.U) + c.io.deq.valid.expect(true.B) + c.io.deq.bits.expect(0x34.U) + c.clock.step() + // dequeueing back-to-back should work without any holes in the middle + c.io.deq.ready.poke(true.B) + c.io.enq.valid.poke(false.B) + c.io.deq.valid.expect(true.B) + c.io.deq.bits.expect(0x56.U) + c.clock.step() + // make sure is empty + c.io.deq.ready.poke(true.B) + c.io.enq.valid.poke(false.B) + c.io.deq.valid.expect(false.B) + } + } + it should "invalidate head being dequeued" in { test(new CoalShiftQueue(UInt(8.W), 4)) { c => c.io.invalidate.poke(0.U) @@ -216,6 +255,7 @@ class UncoalescingUnitTest extends AnyFlatSpec with ChiselScalatestTester { val numLanes = 4 val numPerLaneReqs = 2 val sourceWidth = 2 + val sizeWidth = 2 // 16B coalescing size val coalDataWidth = 128 val numInflightCoalRequests = 4 @@ -226,8 +266,9 @@ class UncoalescingUnitTest extends AnyFlatSpec with ChiselScalatestTester { numLanes, numPerLaneReqs, sourceWidth, + sizeWidth, coalDataWidth, - numInflightCoalRequests + numInflightCoalRequests, ) ) // vcs helps with simulation time, but sometimes errors with @@ -238,15 +279,19 @@ class UncoalescingUnitTest extends AnyFlatSpec with ChiselScalatestTester { c.io.coalReqValid.poke(true.B) c.io.newEntry.source.poke(sourceId) c.io.newEntry.lanes(0).reqs(0).valid.poke(true.B) + c.io.newEntry.lanes(0).reqs(0).source.poke(1.U) c.io.newEntry.lanes(0).reqs(0).offset.poke(1.U) c.io.newEntry.lanes(0).reqs(0).size.poke(2.U) c.io.newEntry.lanes(0).reqs(1).valid.poke(true.B) + c.io.newEntry.lanes(0).reqs(1).source.poke(2.U) c.io.newEntry.lanes(0).reqs(1).offset.poke(1.U) c.io.newEntry.lanes(0).reqs(1).size.poke(2.U) c.io.newEntry.lanes(2).reqs(0).valid.poke(true.B) + c.io.newEntry.lanes(2).reqs(0).source.poke(1.U) c.io.newEntry.lanes(2).reqs(0).offset.poke(2.U) c.io.newEntry.lanes(2).reqs(0).size.poke(1.U) c.io.newEntry.lanes(2).reqs(1).valid.poke(true.B) + c.io.newEntry.lanes(2).reqs(1).source.poke(2.U) c.io.newEntry.lanes(2).reqs(1).offset.poke(0.U) c.io.newEntry.lanes(2).reqs(1).size.poke(2.U) @@ -268,13 +313,13 @@ class UncoalescingUnitTest extends AnyFlatSpec with ChiselScalatestTester { c.io.uncoalResps(3)(0).valid.expect(false.B) c.io.uncoalResps(0)(0).bits.data.expect(0x89abcdefL.U) - c.io.uncoalResps(0)(0).bits.source.expect(0.U) + c.io.uncoalResps(0)(0).bits.source.expect(1.U) c.io.uncoalResps(0)(1).bits.data.expect(0x89abcdefL.U) - c.io.uncoalResps(0)(1).bits.source.expect(0.U) + c.io.uncoalResps(0)(1).bits.source.expect(2.U) c.io.uncoalResps(2)(0).bits.data.expect(0x5ca1ab1eL.U) - c.io.uncoalResps(2)(0).bits.source.expect(0.U) + c.io.uncoalResps(2)(0).bits.source.expect(1.U) c.io.uncoalResps(2)(1).bits.data.expect(0x01234567L.U) - c.io.uncoalResps(2)(1).bits.source.expect(0.U) + c.io.uncoalResps(2)(1).bits.source.expect(2.U) } } }