tensor: Fix wrong addressGen that used bits not bytes
This commit is contained in:
@@ -202,8 +202,9 @@ class TensorCoreDecoupled(
|
|||||||
// Address generation
|
// Address generation
|
||||||
//
|
//
|
||||||
def addressGen(base: UInt, set: UInt, index: UInt): UInt = {
|
def addressGen(base: UInt, set: UInt, index: UInt): UInt = {
|
||||||
// note that both A and B are K-major to facilitate bank conflict-free SMEM
|
// A is assumed to be block-wise M-major, and B block-wise N-major, to
|
||||||
// accesses, so that below code applies to both.
|
// facilitate bank conflict-free SMEM accesses. With these layouts, the
|
||||||
|
// same code below works for both A and B.
|
||||||
//
|
//
|
||||||
// a "block" is the 4*8 byte-sized contiguous memory that can be read in
|
// a "block" is the 4*8 byte-sized contiguous memory that can be read in
|
||||||
// one SMEM request. The A and B matrices are assumed to be stored in
|
// one SMEM request. The A and B matrices are assumed to be stored in
|
||||||
@@ -211,8 +212,7 @@ class TensorCoreDecoupled(
|
|||||||
val blockRow = set
|
val blockRow = set
|
||||||
val blockCol = index
|
val blockCol = index
|
||||||
val blockIndex = (blockRow << indexBits) + blockCol
|
val blockIndex = (blockRow << indexBits) + blockCol
|
||||||
val blockSize = numLanes * laneWidth
|
val blockSize = numLanes * (laneWidth / 8/*bits*/)
|
||||||
require(blockSize == memWidth)
|
|
||||||
val blockSizeBits = log2Ceil(blockSize)
|
val blockSizeBits = log2Ceil(blockSize)
|
||||||
val byteOffset = blockIndex << blockSizeBits
|
val byteOffset = blockIndex << blockSizeBits
|
||||||
base + byteOffset
|
base + byteOffset
|
||||||
|
|||||||
Reference in New Issue
Block a user