Optimize BSSN CUDA state transfers

This commit is contained in:
2026-04-29 18:34:31 +08:00
parent 22c1e7168b
commit 090d8657ae
3 changed files with 332 additions and 9 deletions

View File

@@ -6,6 +6,7 @@
#include "parameters.h"
#include <cstdlib>
#include <cstdio>
#include <vector>
#ifndef USE_CUDA_Z4C
#define USE_CUDA_Z4C 0
@@ -391,6 +392,113 @@ bool cuda_device_state_count_supported(int state_count)
#endif
}
#if USE_CUDA_BSSN
// Hands one accumulated batch of segment descriptors, all belonging to the
// same block, to the BSSN CUDA pack or unpack routine.
//
// block       - block the descriptors refer to; a null block is a no-op.
// data        - transfer buffer, forwarded unchanged to the CUDA routine.
// state_count - number of state variables, forwarded unchanged.
// meta        - flat descriptor array; the segment count is meta.size() / 8.
// dir         - PACK selects the pack routine, anything else the unpack one.
//
// Returns true when there is nothing to do or when the selected CUDA routine
// returns 0; false otherwise.
bool cuda_flush_device_segment_batch(Block *block,
                                     double *data,
                                     int state_count,
                                     const std::vector<int> &meta,
                                     int dir)
{
    // Nothing was batched (or there is no block to apply it to): succeed.
    if (block == 0 || meta.size() == 0)
        return true;

    const int n_segments = (int)(meta.size() / 8);
    const int *descriptors = meta.data();

    bool ok;
    if (dir == PACK)
    {
        ok = bssn_cuda_pack_state_segments_to_device_buffer(
                 block, state_count, data, block->shape, n_segments, descriptors) == 0;
    }
    else
    {
        ok = bssn_cuda_unpack_state_segments_from_device_buffer(
                 block, state_count, data, block->shape, n_segments, descriptors) == 0;
    }
    return ok;
}
// Batched pack/unpack of same-level grid segments through the BSSN CUDA
// segment routines.
//
// Walks the src/dst segment lists in lockstep. Every pair that is "active"
// for this call (see below) contributes one 8-int descriptor; consecutive
// descriptors that target the same Block are submitted together through
// cuda_flush_device_segment_batch().
//
// data      - transfer buffer; must be non-null.
// src, dst  - parallel lists of source / destination segments; both must be
//             non-null on entry. Iteration stops when either list ends.
// rank_in   - the remote rank this transfer is exchanged with.
// dir       - PACK or UNPACK; any other value is rejected.
// VarLists, VarListd - variable lists used only to obtain the state count.
// Symmetry  - unused by this path.
//
// Returns the number of doubles covered by the active segments
// (state_count * sx * sy * sz summed over them), or -1 when this path cannot
// handle the request: bad arguments, unsupported state count, a pair whose
// blocks are on different levels, a pair rejected by cuda_can_direct_pack /
// cuda_can_direct_unpack, or a failed flush.
//
// NOTE(review): a -1 can be returned after earlier batches were already
// flushed; the visible caller reacts to a negative result by redoing the
// whole transfer with Parallel::data_packer.
// NOTE(review): sizes and offsets are accumulated in int, matching the int
// return type; very large transfers would overflow - confirm upper bounds.
int cuda_data_packer_device_batched(double *data,
                                    MyList<Parallel::gridseg> *src,
                                    MyList<Parallel::gridseg> *dst,
                                    int rank_in,
                                    int dir,
                                    MyList<var> *VarLists,
                                    MyList<var> *VarListd,
                                    int Symmetry)
{
    (void)Symmetry;
    if (!data || (dir != PACK && dir != UNPACK) || !src || !dst)
        return -1;

    int myrank;
    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);

    const int state_count = cuda_state_var_count(VarLists, VarListd);
    if (!cuda_device_state_count_supported(state_count))
        return -1;

    // dir was validated above, so "not packing" means UNPACK.
    const bool packing = (dir == PACK);
    // Only same-level transfers are supported; this is the transfer-type code
    // passed to the cuda_can_direct_* predicates for that case.
    const int same_level_type = 1;

    int size_out = 0;
    Block *batch_block = 0;
    std::vector<int> batch_meta;
    batch_meta.reserve(64);

    for (; src && dst; src = src->next, dst = dst->next)
    {
        const int src_rank = src->data->Bg->rank;
        const int dst_rank = dst->data->Bg->rank;

        // PACK: we own the source and rank_in owns the destination.
        // UNPACK: rank_in owns the source and we own the destination.
        const bool active = packing ? (dst_rank == rank_in && src_rank == myrank)
                                    : (src_rank == rank_in && dst_rank == myrank);
        if (!active)
            continue;

        // Cross-level pairs are not handled by the batched path.
        if (src->data->Bg->lev != dst->data->Bg->lev)
            return -1;

        const bool direct_ok =
            packing ? cuda_can_direct_pack(src->data, dst->data, same_level_type)
                    : cuda_can_direct_unpack(dst->data, same_level_type);
        if (!direct_ok)
            return -1;

        // The local block is the one we read from (PACK) or write to (UNPACK).
        Block *block = packing ? src->data->Bg : dst->data->Bg;

        // A batch may only reference one block: submit what has been
        // collected so far before starting a batch for a different block.
        if (batch_block && batch_block != block)
        {
            if (!cuda_flush_device_segment_batch(batch_block, data, state_count, batch_meta, dir))
                return -1;
            batch_meta.clear();
        }
        batch_block = block;

        // In both directions the transferred region is described by the
        // destination segment, located relative to the local block.
        const int i0 = cuda_seg_begin(dst->data, block, 0);
        const int j0 = cuda_seg_begin(dst->data, block, 1);
        const int k0 = cuda_seg_begin(dst->data, block, 2);
        const int sx = dst->data->shape[0];
        const int sy = dst->data->shape[1];
        const int sz = dst->data->shape[2];
        const int region_all = sx * sy * sz;

        // Descriptor layout (8 ints): begin indices for the three axes,
        // extents for the three axes, point count, offset of this segment
        // (in doubles) within the transfer buffer.
        const int descriptor[8] = {i0, j0, k0, sx, sy, sz, region_all, size_out};
        batch_meta.insert(batch_meta.end(), descriptor, descriptor + 8);

        size_out += state_count * region_all;
    }

    // Submit the trailing batch, if any segment was active at all.
    if (batch_block &&
        !cuda_flush_device_segment_batch(batch_block, data, state_count, batch_meta, dir))
        return -1;

    return size_out;
}
#endif
bool cuda_segments_same_level(MyList<Parallel::gridseg> *src,
MyList<Parallel::gridseg> *dst,
int rank_in,
@@ -465,6 +573,23 @@ int data_packer_with_device_buffer(double *data,
MyList<var> *VarListd,
int Symmetry)
{
#if USE_CUDA_BSSN
const double batched_t0 = sync_profile_enabled() ? MPI_Wtime() : 0.0;
const int batched = cuda_data_packer_device_batched(data, src, dst, rank_in, dir,
VarLists, VarListd, Symmetry);
if (batched >= 0)
{
if (sync_profile_enabled())
{
const double dt = MPI_Wtime() - batched_t0;
if (dir == PACK)
sync_profile_stats().direct_pack_sec += dt;
else if (dir == UNPACK)
sync_profile_stats().direct_unpack_sec += dt;
}
return batched;
}
#endif
s_cuda_aware_pack_active = true;
int n = Parallel::data_packer(data, src, dst, rank_in, dir, VarLists, VarListd, Symmetry);
s_cuda_aware_pack_active = false;