tensor: Change B in-memory layout to column-major
This commit is contained in:
@@ -572,7 +572,7 @@ void kernel_body(int task_id, kernel_arg_t *__UNIFORM__ arg) {
|
||||
const uint32_t problem_size = (dim_m * dim_n) / (ELEM_PER_THREAD);
|
||||
const uint32_t num_threadblocks = problem_size / threads_per_threadblock;
|
||||
|
||||
-using float_type = float;
|
||||
+using float_type = float16_t;
|
||||
|
||||
// "static" shared memory allocation. This would determine threadblock
|
||||
// occupancy of a single cluster
|
||||
|
||||
@@ -173,7 +173,8 @@ int main(int argc, char *argv[]) {
|
||||
uint32_t dim_n = 64;
|
||||
uint32_t dim_k = 64;
|
||||
|
||||
-using float_type = float;
|
||||
+using float_type = half;
|
||||
|
||||
generate_source_matrix<float_type>(dim_m, dim_n, dim_k);
|
||||
generate_reference_matmul<float_type>(dim_m, dim_n, dim_k);
|
||||
|
||||
|
||||
@@ -20,7 +20,7 @@
|
||||
// BM <= BK*TM*TN
|
||||
#define BM 64
|
||||
#define BN 64
|
||||
-#define BK 64
|
||||
+#define BK 128
|
||||
#define WM 16
|
||||
#define WN 8
|
||||
#define TCM 8
|
||||
|
||||
Reference in New Issue
Block a user