fix: match blackwell fp8 fragment width

This commit is contained in:
Zhongdi LUO
2026-07-03 08:40:25 +00:00
parent 3f7ce1f1c9
commit 8a15e5251e
2 changed files with 10 additions and 8 deletions

View File

@@ -11,6 +11,8 @@ The validation runs one tensor warp on a 16x16x32 tile:
- B is FP8 E4M3 2.0 (`0x40`) - B is FP8 E4M3 2.0 (`0x40`)
- C is FP32 1.0 (`0x3f800000`) - C is FP32 1.0 (`0x3f800000`)
- Expected output is FP32 65.0 (`0x42820000`) - Expected output is FP32 65.0 (`0x42820000`)
- `VirgoBlackwellConfig` currently uses 4 core/memory lanes, so one
`tcgen05_cp/cb` fragment is 16 bytes.
Build: Build:

View File

@@ -8,22 +8,22 @@
#define FP8_N 16u #define FP8_N 16u
#define FP8_K 32u #define FP8_K 32u
#define FP8_TILE_BYTES 1024u #define FP8_TILE_BYTES 1024u
#define FP8_FRAGMENT_BYTES 32u #define FP8_FRAGMENT_BYTES 16u
#define FP8_FRAGMENT_WORDS (FP8_FRAGMENT_BYTES / sizeof(uint32_t)) #define FP8_FRAGMENT_WORDS (FP8_FRAGMENT_BYTES / sizeof(uint32_t))
#define FP8_FRAGMENTS (FP8_TILE_BYTES / FP8_FRAGMENT_BYTES) #define FP8_FRAGMENTS (FP8_TILE_BYTES / FP8_FRAGMENT_BYTES)
#define FP8_OUT_WORDS (FP8_M * FP8_N) #define FP8_OUT_WORDS (FP8_M * FP8_N)
#define FP8_EXPECTED 0x42820000u #define FP8_EXPECTED 0x42820000u
extern "C" { extern "C" {
volatile uint32_t g_fp8_a_frag[FP8_FRAGMENT_WORDS] __attribute__((aligned(32))) = { volatile uint32_t g_fp8_a_frag[FP8_FRAGMENT_WORDS] __attribute__((aligned(16))) = {
WU_FP8_REP8(WU_FP8_PACK4(WU_FP8_E4M3_ONE, WU_FP8_E4M3_ONE, WU_FP8_REP4(WU_FP8_PACK4(WU_FP8_E4M3_ONE, WU_FP8_E4M3_ONE,
WU_FP8_E4M3_ONE, WU_FP8_E4M3_ONE))}; WU_FP8_E4M3_ONE, WU_FP8_E4M3_ONE))};
volatile uint32_t g_fp8_b_frag[FP8_FRAGMENT_WORDS] __attribute__((aligned(32))) = { volatile uint32_t g_fp8_b_frag[FP8_FRAGMENT_WORDS] __attribute__((aligned(16))) = {
WU_FP8_REP8(WU_FP8_PACK4(WU_FP8_E4M3_TWO, WU_FP8_E4M3_TWO, WU_FP8_REP4(WU_FP8_PACK4(WU_FP8_E4M3_TWO, WU_FP8_E4M3_TWO,
WU_FP8_E4M3_TWO, WU_FP8_E4M3_TWO))}; WU_FP8_E4M3_TWO, WU_FP8_E4M3_TWO))};
volatile uint32_t g_fp8_c_frag[FP8_FRAGMENT_WORDS] __attribute__((aligned(32))) = { volatile uint32_t g_fp8_c_frag[FP8_FRAGMENT_WORDS] __attribute__((aligned(16))) = {
WU_FP8_REP8(0x3f800000u)}; WU_FP8_REP4(0x3f800000u)};
volatile uint32_t g_fp8_out[FP8_OUT_WORDS] __attribute__((aligned(32))); volatile uint32_t g_fp8_out[FP8_OUT_WORDS] __attribute__((aligned(16)));
} }
#undef WU_FP8_REP2 #undef WU_FP8_REP2