From 0de09daa053c033be60dab080b78664f8167b706 Mon Sep 17 00:00:00 2001 From: Vamber Yang Date: Wed, 8 Mar 2023 17:34:10 -0800 Subject: [PATCH] MemTracer able to read and write according to trace file, also support thread_id skipping in trace file --- src/main/resources/csrc/SimMemTrace.cc | 18 +++++++- src/main/resources/csrc/SimMemTrace.h | 6 ++- src/main/resources/vsrc/SimMemTrace.v | 46 ++++++++++++++++++++ src/main/scala/tilelink/Coalescing.scala | 54 +++++++++++++++++++----- 4 files changed, 110 insertions(+), 14 deletions(-) diff --git a/src/main/resources/csrc/SimMemTrace.cc b/src/main/resources/csrc/SimMemTrace.cc index dc790f9..32b9012 100644 --- a/src/main/resources/csrc/SimMemTrace.cc +++ b/src/main/resources/csrc/SimMemTrace.cc @@ -3,6 +3,7 @@ #include #endif #include +#include #include #include #include @@ -66,16 +67,23 @@ MemTraceLine MemTraceReader::read_trace_at(const long cycle, assert(false && "some trace lines are left unread in the past"); } + if (line.thread_id != thread_id) { + line.valid = false; + } if (line.cycle > cycle) { // We haven't reached the cycle mark specified in this line yet, so we don't // read it right now. return MemTraceLine{}; - } else if (line.cycle == cycle) { - printf("fire! cycle=%ld, valid=%d\n", cycle, line.valid); + } else if (line.cycle == cycle && line.thread_id == thread_id) { + printf("fire! cycle=%ld, valid=%d, %s \n", cycle, line.valid, line.loadstore); + // FIXME! Currently thread_id is assumed to be in round-robin order, e.g. // 0->1->2->3->0->..., both in the trace file and the order the caller calls // this function. If this is not true, we cannot simply monotonically // increment read_pos. 
+ + // Only advance the pointer when cycle and thread_id both match + // now an increasing sequence is fine (0, 1, 3), but an unordered one is not (0, 3, 1) ++read_pos; } @@ -96,6 +104,9 @@ extern "C" void memtrace_query(unsigned char trace_read_ready, int trace_read_thread_id, unsigned char *trace_read_valid, unsigned long *trace_read_address, + unsigned char *trace_read_is_store, + int *trace_read_store_mask, + unsigned long *trace_read_data, unsigned char *trace_read_finished) { // printf("memtrace_query(cycle=%ld, tid=%d)\n", trace_read_cycle, // trace_read_thread_id); @@ -107,6 +118,9 @@ extern "C" void memtrace_query(unsigned char trace_read_ready, auto line = reader->read_trace_at(trace_read_cycle, trace_read_thread_id); *trace_read_valid = line.valid; *trace_read_address = line.address; + *trace_read_is_store = strcmp(line.loadstore, "STORE") == 0 ; + *trace_read_store_mask = line.data_size; + *trace_read_data = line.data; // This means finished and valid will go up at the same cycle. Need to // handle this without skipping the last line. 
*trace_read_finished = reader->finished(); diff --git a/src/main/resources/csrc/SimMemTrace.h b/src/main/resources/csrc/SimMemTrace.h index 2f45b8d..94ffef8 100644 --- a/src/main/resources/csrc/SimMemTrace.h +++ b/src/main/resources/csrc/SimMemTrace.h @@ -37,4 +37,8 @@ extern "C" void memtrace_query(unsigned char trace_read_ready, int trace_read_thread_id, unsigned char *trace_read_valid, unsigned long *trace_read_address, - unsigned char *trace_read_finished); + unsigned char *trace_read_is_store, + int *trace_read_store_mask, + unsigned long *trace_read_data, + unsigned char *trace_read_finished + ); diff --git a/src/main/resources/vsrc/SimMemTrace.v b/src/main/resources/vsrc/SimMemTrace.v index 7e4a1e9..cdf2d8b 100644 --- a/src/main/resources/vsrc/SimMemTrace.v +++ b/src/main/resources/vsrc/SimMemTrace.v @@ -1,5 +1,6 @@ `define DATA_WIDTH 64 `define MAX_NUM_THREADS 32 +`define MASK_WIDTH 8 import "DPI-C" function void memtrace_init( input string filename @@ -16,6 +17,9 @@ import "DPI-C" function void memtrace_query input int trace_read_tid, output bit trace_read_valid, output longint trace_read_address, + output bit trace_read_is_store, + output int trace_read_store_mask, + output longint trace_read_data, output bit trace_read_finished ); @@ -27,10 +31,19 @@ module SimMemTrace #(parameter FILENAME = "undefined", NUM_THREADS = 4) ( input trace_read_ready, output [NUM_THREADS-1:0] trace_read_valid, output [`DATA_WIDTH*NUM_THREADS-1:0] trace_read_address, + + output [NUM_THREADS-1:0] trace_read_is_store, + output [NUM_THREADS*`MASK_WIDTH-1:0] trace_read_store_mask, + output [`DATA_WIDTH*NUM_THREADS-1:0] trace_read_data, output trace_read_finished ); bit __in_valid[NUM_THREADS-1:0]; longint __in_address[NUM_THREADS-1:0]; + + bit __in_is_store[NUM_THREADS-1:0]; + int __in_store_mask [NUM_THREADS-1:0]; + longint __in_data[NUM_THREADS-1:0]; + bit __in_finished; string __uartlog; @@ -43,6 +56,10 @@ module SimMemTrace #(parameter FILENAME = "undefined", NUM_THREADS = 4) 
( // registers that stage outputs of the C parser reg [NUM_THREADS-1:0] __in_valid_reg; reg [`DATA_WIDTH-1:0] __in_address_reg [NUM_THREADS-1:0]; + + reg [NUM_THREADS-1:0] __in_is_store_reg; + reg [`MASK_WIDTH-1:0] __in_store_mask_reg [NUM_THREADS-1:0]; + reg [`DATA_WIDTH-1:0] __in_data_reg [NUM_THREADS-1:0]; reg __in_finished_reg; genvar g; @@ -51,6 +68,10 @@ module SimMemTrace #(parameter FILENAME = "undefined", NUM_THREADS = 4) ( for (g = 0; g < NUM_THREADS; g = g + 1) begin assign trace_read_valid[g] = __in_valid_reg[g]; assign trace_read_address[`DATA_WIDTH*(g+1)-1:`DATA_WIDTH*g] = __in_address_reg[g]; + + assign trace_read_is_store[g] = __in_is_store_reg[g]; + assign trace_read_store_mask[`MASK_WIDTH*(g+1)-1:`MASK_WIDTH*g] = __in_store_mask_reg[g]; + assign trace_read_data[`DATA_WIDTH*(g+1)-1:`DATA_WIDTH*g] = __in_data_reg[g]; end endgenerate assign trace_read_finished = __in_finished_reg; @@ -62,23 +83,37 @@ module SimMemTrace #(parameter FILENAME = "undefined", NUM_THREADS = 4) ( // Evaluate the signals on the positive edge always @(posedge clock) begin + + // Setting reset value if (reset) begin for (integer tid = 0; tid < NUM_THREADS; tid = tid + 1) begin __in_valid[tid] = 1'b0; __in_address[tid] = `DATA_WIDTH'b0; + + __in_is_store[tid] = 1'b0; + __in_store_mask[tid] = `MASK_WIDTH'b0; + __in_data[tid] = `DATA_WIDTH'b0; end + __in_finished = 1'b0; cycle_counter <= `DATA_WIDTH'b0; + // Reset the staged output registers; non-blocking, like every other *_reg update in this block for (integer tid = 0; tid < NUM_THREADS; tid = tid + 1) begin __in_valid_reg[tid] <= 1'b0; __in_address_reg[tid] <= `DATA_WIDTH'b0; + + __in_is_store_reg[tid] <= 1'b0; + __in_store_mask_reg[tid] <= `MASK_WIDTH'b0; + __in_data_reg[tid] <= `DATA_WIDTH'b0; end + __in_finished_reg <= 1'b0; end else begin cycle_counter <= next_cycle_counter; + // Getting values from C function into pseudo register for (integer tid = 0; tid < NUM_THREADS; tid = tid + 1) begin memtrace_query( trace_read_ready, @@ -87,15 +122,26 @@ module SimMemTrace 
#(parameter FILENAME = "undefined", NUM_THREADS = 4) ( // to sync up. next_cycle_counter, tid, + __in_valid[tid], __in_address[tid], + + __in_is_store[tid], + __in_store_mask[tid], + __in_data[tid], + __in_finished ); end + // Connect values from pseudo register into verilog register for (integer tid = 0; tid < NUM_THREADS; tid = tid + 1) begin __in_valid_reg[tid] <= __in_valid[tid]; __in_address_reg[tid] <= __in_address[tid]; + + __in_is_store_reg[tid] <= __in_is_store[tid]; + __in_store_mask_reg[tid] <= __in_store_mask[tid]; + __in_data_reg[tid] <= __in_data[tid]; end __in_finished_reg <= __in_finished; end diff --git a/src/main/scala/tilelink/Coalescing.scala b/src/main/scala/tilelink/Coalescing.scala index 8803a3f..fb1b8d1 100644 --- a/src/main/scala/tilelink/Coalescing.scala +++ b/src/main/scala/tilelink/Coalescing.scala @@ -100,7 +100,8 @@ class MemTraceDriver(numThreads: Int = 1)(implicit p: Parameters) val clientParam = Seq( TLMasterParameters.v1( name = "MemTraceDriver" + i.toString, - sourceId = IdRange(0, numThreads) + // The ID range is independent of numThreads; IdRange determines the number of in-flight requests + sourceId = IdRange(0, 4) // visibility = Seq(AddressSet(0x0000, 0xffffff)) ) ) @@ -117,6 +118,15 @@ class MemTraceDriver(numThreads: Int = 1)(implicit p: Parameters) lazy val module = new MemTraceDriverImp(this, numThreads) } + +class TraceReq extends Bundle { + val valid = Bool() + val address = UInt(64.W) + val is_store = Bool() + val mask = UInt(8.W) + val data = UInt(64.W) +} + class MemTraceDriverImp(outer: MemTraceDriver, numThreads: Int) extends LazyModuleImp(outer) with UnitTestModule { @@ -129,39 +139,58 @@ class MemTraceDriverImp(outer: MemTraceDriver, numThreads: Int) // Split output of SimMemTrace, which is flattened across all threads, // back to each thread's. 
+ + // Maybe this part can be improved, since now we are still manually shifting everything val threadReqs = Wire(Vec(numThreads, new TraceReq)) threadReqs.zipWithIndex.foreach { case (req, i) => req.valid := (sim.io.trace_read.valid >> i) req.address := (sim.io.trace_read.address >> (64 * i)) + req.is_store := (sim.io.trace_read.is_store >> i) + req.mask := (sim.io.trace_read.store_mask >> (8 * i)) + req.data := (sim.io.trace_read.data >> (64 * i)) + } + // Connect each thread to its respective TL node. (outer.threadNodes zip threadReqs).zipWithIndex.foreach { case ((node, req), i) => val (tlOut, edge) = node.out(0) tlOut.a.valid := req.valid - // TODO: placeholders, use actual value from trace - tlOut.a.bits := edge - .Put( + + tlOut.a.bits := DontCare + tlOut.a.bits.data := 0.U + when (req.is_store) { + tlOut.a.bits := edge.Put( fromSource = 0.U, toAddress = req.address, // 64 bits = 8 bytes = 2**(3) bytes lgSize = 3.U, - data = (i + 100).U - ) - ._2 + data = req.data + )._2 + }.otherwise { + tlOut.a.bits := edge.Get( + fromSource = 0.U, + toAddress = req.address, + // 64 bits = 8 bytes = 2**(3) bytes + lgSize = 3.U, + )._2 + } + // tl_out.a.bits.mask := 0xf.U dontTouch(tlOut.a) tlOut.d.ready := true.B } io.finished := sim.io.trace_read.finished + + // Clock counter, for debugging purposes + val clkcount = RegInit(0.U(64.W)) + clkcount := clkcount + 1.U + dontTouch(clkcount) } -class TraceReq extends Bundle { - val valid = Bool() - val address = UInt(64.W) -} + class SimMemTrace(val filename: String, numThreads: Int) extends BlackBox( @@ -179,6 +208,9 @@ class SimMemTrace(val filename: String, numThreads: Int) // single wide 1D array. // TODO: assumes 64-bit address. val address = Output(UInt((64 * numThreads).W)) + val is_store = Output(UInt(numThreads.W)) + val store_mask = Output(UInt((8 * numThreads).W)) + val data = Output(UInt((64 * numThreads).W)) val finished = Output(Bool()) } })