diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala index 040fa4a..ff7f94c 100644 --- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala +++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala @@ -200,22 +200,30 @@ class TensorCoreDecoupled( // note that both A and B are K-major to facilitate bank conflict-free SMEM // accesses, so that below code applies to both. // - // (row,col) coordinate of the compute tile - val tileRow = index - val tileCol = set - // (row,col) coordinate of the starting element of the compute tile - val elemRow = index << 1 - val elemCol = tileCol << log2Ceil(tilingParams.kc) - val rowStride = tilingParams.k * wordSize - val rowStrideBits = log2Ceil(rowStride) - val wordStrideBits = log2Ceil(wordSize) - val tileOffset = (elemRow << rowStrideBits) + (elemCol << wordStrideBits) + // a "block" is the 4*8 byte-sized contiguous memory that can be read in + // one SMEM request. The A and B matrices are assumed to be stored in + // block-wise "index"-major order (M-major for A, N-major for B) + val blockRow = set + val blockCol = index + val blockIndex = (blockRow << indexBits) + blockCol + val blockSize = numLanes * wordSize + val blockSizeBits = log2Ceil(blockSize) + val byteOffset = blockIndex << blockSizeBits + base + byteOffset - base + tileOffset + // address generation for byte-wise K-major A and B layout + // val elemRow = blockRow << 1 + // val elemCol = blockCol << log2Ceil(tilingParams.kc) + // val rowStride = tilingParams.k * wordSize + // val rowStrideBits = log2Ceil(rowStride) + // val wordStrideBits = log2Ceil(wordSize) + // val tileOffset = (elemRow << rowStrideBits) + (elemCol << wordStrideBits) + // base + tileOffset } // FIXME: bogus base address val addressA = addressGen(0.U, tagA.set, tagA.index) + // SMEM 256KB, 8 banks: 0x8000 B (32KB) per bank val addressB = addressGen(0x400.U, tagB.set, tagB.index) val lastReqA = (tagA.set === lastSet.U) &&
(tagA.index === lastIndex.U)