Optimize BSSN EScalar GPU path baseline

This commit is contained in:
2026-05-02 18:19:15 +08:00
parent 52beb4d153
commit 59a216ad93
13 changed files with 1366 additions and 177 deletions

View File

@@ -341,7 +341,7 @@ bool cuda_state_count_direct_supported(int state_count)
#if USE_CUDA_Z4C && (ABEtype == 2)
return state_count == Z4C_CUDA_STATE_COUNT;
#elif USE_CUDA_BSSN
-return state_count > 0 && state_count <= BSSN_CUDA_STATE_COUNT;
+return state_count == BSSN_CUDA_STATE_COUNT;
#else
(void)state_count;
return false;
@@ -393,6 +393,14 @@ bool cuda_can_direct_pack(const Parallel::gridseg *src, const Parallel::gridseg
#elif USE_CUDA_BSSN
if (bssn_cuda_has_resident_state(src->Bg) == 0)
return false;
+if (VarLists)
+{
+double *view_ptrs[BSSN_CUDA_STATE_COUNT];
+if (!cuda_build_bssn_host_views(src->Bg, VarLists, BSSN_CUDA_STATE_COUNT, view_ptrs))
+return false;
+if (bssn_cuda_resident_state_matches(src->Bg, view_ptrs) == 0)
+return false;
+}
if (type == 1)
return true;
int a[3], b[3];
@@ -427,7 +435,17 @@ bool cuda_can_direct_unpack(const Parallel::gridseg *dst, int type, MyList<var>
(void)VarListd;
return true;
#elif USE_CUDA_BSSN
-return bssn_cuda_has_resident_state(dst->Bg) != 0;
+if (bssn_cuda_has_resident_state(dst->Bg) == 0)
+return false;
+if (VarListd)
+{
+double *view_ptrs[BSSN_CUDA_STATE_COUNT];
+if (!cuda_build_bssn_host_views(dst->Bg, VarListd, BSSN_CUDA_STATE_COUNT, view_ptrs))
+return false;
+if (bssn_cuda_resident_state_matches(dst->Bg, view_ptrs) == 0)
+return false;
+}
+return true;
#else
return false;
#endif
@@ -443,7 +461,7 @@ bool cuda_direct_pack_segment(double *buffer,
if (state_count != Z4C_CUDA_STATE_COUNT)
return false;
#elif USE_CUDA_BSSN
-if (state_count <= 0 || state_count > BSSN_CUDA_STATE_COUNT)
+if (state_count != BSSN_CUDA_STATE_COUNT)
return false;
#else
return false;
@@ -490,7 +508,7 @@ bool cuda_direct_unpack_segment(double *buffer,
if (state_count != Z4C_CUDA_STATE_COUNT)
return false;
#elif USE_CUDA_BSSN
-if (state_count <= 0 || state_count > BSSN_CUDA_STATE_COUNT)
+if (state_count != BSSN_CUDA_STATE_COUNT)
return false;
#else
return false;
@@ -771,7 +789,7 @@ bool cuda_direct_pack_segment_to_device(double *buffer,
}
#endif
#if USE_CUDA_BSSN
-if (state_count <= 0 || state_count > BSSN_CUDA_STATE_COUNT)
+if (state_count != BSSN_CUDA_STATE_COUNT)
return false;
const double t0 = sync_profile_enabled() ? MPI_Wtime() : 0.0;
bool ok = false;
@@ -963,7 +981,7 @@ bool cuda_direct_unpack_segment_from_device(double *buffer,
}
#endif
#if USE_CUDA_BSSN
-if (state_count <= 0 || state_count > BSSN_CUDA_STATE_COUNT)
+if (state_count != BSSN_CUDA_STATE_COUNT)
return false;
const double t0 = sync_profile_enabled() ? MPI_Wtime() : 0.0;
const int i0 = cuda_seg_begin(dst, dst->Bg, 0);
@@ -1017,7 +1035,7 @@ bool cuda_download_resident_subset_to_host(Block *block,
}
#endif
#if USE_CUDA_BSSN
-if (!block || state_count <= 0 || state_count > BSSN_CUDA_STATE_COUNT)
+if (!block || state_count != BSSN_CUDA_STATE_COUNT)
return false;
if (bssn_cuda_has_resident_state(block) == 0)
return true;
@@ -1032,6 +1050,8 @@ bool cuda_download_resident_subset_to_host(Block *block,
views[i] = block->fgfs[v->data->sgfn];
v = v->next;
}
+if (bssn_cuda_resident_state_matches(block, views) == 0)
+return false;
return bssn_cuda_download_state_subset(block, block->shape, state_count, indices, views) == 0;
#else
(void)block; (void)vars; (void)state_count;
@@ -1085,7 +1105,7 @@ bool cuda_device_state_count_supported(int state_count)
return true;
#endif
#if USE_CUDA_BSSN
-return state_count > 0 && state_count <= BSSN_CUDA_STATE_COUNT;
+return state_count == BSSN_CUDA_STATE_COUNT;
#else
(void)state_count;
return false;
@@ -7259,6 +7279,8 @@ void Parallel::prepare_inter_time_level(Patch *Pat,
cuda_build_bssn_host_views(cg, VarList2, state_count, src2_views) &&
cuda_build_bssn_host_views(cg, VarList3, state_count, dst_views) &&
bssn_cuda_has_resident_state(cg) &&
+bssn_cuda_resident_state_matches(cg, src1_views) &&
+bssn_cuda_resident_state_matches(cg, src2_views) &&
bssn_cuda_prepare_inter_time_level(cg, cg->shape,
src1_views, src2_views, 0, dst_views,
2, tindex) == 0)
@@ -7336,6 +7358,9 @@ void Parallel::prepare_inter_time_level(Patch *Pat,
cuda_build_bssn_host_views(cg, VarList3, state_count, src3_views) &&
cuda_build_bssn_host_views(cg, VarList4, state_count, dst_views) &&
bssn_cuda_has_resident_state(cg) &&
+bssn_cuda_resident_state_matches(cg, src1_views) &&
+bssn_cuda_resident_state_matches(cg, src2_views) &&
+bssn_cuda_resident_state_matches(cg, src3_views) &&
bssn_cuda_prepare_inter_time_level(cg, cg->shape,
src1_views, src2_views, src3_views, dst_views,
3, tindex) == 0)