Stabilize GPU output path and MPI sync

This commit is contained in:
2026-04-09 10:57:49 +08:00
parent 4e3946a4f0
commit 49409645c0
8 changed files with 748 additions and 334 deletions

View File

@@ -135,6 +135,18 @@ struct GpuRhsCache
const double *last_y = nullptr;
const double *last_z = nullptr;
bool meta_uploaded = false;
static const int max_mapped_buffers = 128;
const double *host_buffers[max_mapped_buffers] = {nullptr};
const double *device_buffers[max_mapped_buffers] = {nullptr};
int mapped_buffer_count = 0;
};
struct ExternalBufferRegistry
{
static const int max_mapped_buffers = 256;
const double *host_buffers[max_mapped_buffers] = {nullptr};
const double *device_buffers[max_mapped_buffers] = {nullptr};
int mapped_buffer_count = 0;
};
GpuRhsCache &gpu_rhs_cache()
@@ -143,11 +155,81 @@ GpuRhsCache &gpu_rhs_cache()
return cache;
}
ExternalBufferRegistry &external_buffer_registry()
{
static thread_local ExternalBufferRegistry registry;
return registry;
}
void reset_meta(Meta *meta)
{
memset(meta, 0, sizeof(Meta));
}
void reset_buffer_map(GpuRhsCache &cache)
{
cache.mapped_buffer_count = 0;
for (int i = 0; i < GpuRhsCache::max_mapped_buffers; ++i)
{
cache.host_buffers[i] = nullptr;
cache.device_buffers[i] = nullptr;
}
}
void map_buffer(GpuRhsCache &cache, const double *host_ptr, const double *device_ptr)
{
if (!host_ptr || !device_ptr)
return;
for (int i = 0; i < cache.mapped_buffer_count; ++i)
{
if (cache.host_buffers[i] == host_ptr)
{
cache.device_buffers[i] = device_ptr;
return;
}
}
if (cache.mapped_buffer_count >= GpuRhsCache::max_mapped_buffers)
return;
cache.host_buffers[cache.mapped_buffer_count] = host_ptr;
cache.device_buffers[cache.mapped_buffer_count] = device_ptr;
cache.mapped_buffer_count++;
}
void reset_external_buffer_map(ExternalBufferRegistry &registry)
{
registry.mapped_buffer_count = 0;
for (int i = 0; i < ExternalBufferRegistry::max_mapped_buffers; ++i)
{
registry.host_buffers[i] = nullptr;
registry.device_buffers[i] = nullptr;
}
}
void map_external_buffer(ExternalBufferRegistry &registry, const double *host_ptr, const double *device_ptr)
{
if (!host_ptr || !device_ptr)
return;
for (int i = 0; i < registry.mapped_buffer_count; ++i)
{
if (registry.host_buffers[i] == host_ptr)
{
registry.device_buffers[i] = device_ptr;
return;
}
}
if (registry.mapped_buffer_count >= ExternalBufferRegistry::max_mapped_buffers)
return;
registry.host_buffers[registry.mapped_buffer_count] = host_ptr;
registry.device_buffers[registry.mapped_buffer_count] = device_ptr;
registry.mapped_buffer_count++;
}
bool ensure_device_buffer(double **ptr, size_t count)
{
if (*ptr)
@@ -188,6 +270,45 @@ bool copy_buffers_to_device(const CopySpec *specs, size_t count)
return true;
}
const double *find_external_device_buffer(const ExternalBufferRegistry &registry, const double *host_ptr)
{
if (!host_ptr)
return nullptr;
for (int i = 0; i < registry.mapped_buffer_count; ++i)
{
if (registry.host_buffers[i] == host_ptr)
return registry.device_buffers[i];
}
return nullptr;
}
bool copy_buffers_to_device_preferring_device(const CopySpec *specs, size_t count)
{
for (size_t i = 0; i < count; ++i)
{
const double *device_src = bssn_gpu_find_device_buffer(specs[i].src);
cudaMemcpyKind kind = cudaMemcpyHostToDevice;
const void *copy_src = specs[i].src;
if (device_src)
{
copy_src = device_src;
kind = cudaMemcpyDeviceToDevice;
}
cudaError_t err = cudaMemcpy(specs[i].dst, copy_src,
specs[i].count * sizeof(double),
kind);
if (err != cudaSuccess)
{
cerr << "cudaMemcpy(" << (kind == cudaMemcpyDeviceToDevice ? "D2D" : "H2D")
<< ") failed: " << cudaGetErrorString(err) << endl;
return false;
}
}
return true;
}
bool zero_buffers(const ZeroSpec *specs, size_t count)
{
for (size_t i = 0; i < count; ++i)
@@ -219,6 +340,8 @@ void cleanup_gpu_rhs_cache()
cache.last_x = nullptr;
cache.last_y = nullptr;
cache.last_z = nullptr;
reset_buffer_map(cache);
reset_external_buffer_map(external_buffer_registry());
}
bool register_gpu_rhs_cleanup()
@@ -285,6 +408,7 @@ bool prepare_gpu_rhs_cache(GpuRhsCache &cache, int device, int *ex)
cache.last_y = nullptr;
cache.last_z = nullptr;
cache.meta_uploaded = false;
reset_buffer_map(cache);
Meta *meta = &cache.meta;
const int matrix_size = cache.matrix_size;
@@ -467,6 +591,7 @@ bool prepare_gpu_rhs_cache(GpuRhsCache &cache, int device, int *ex)
cache.last_x = nullptr;
cache.last_y = nullptr;
cache.last_z = nullptr;
reset_buffer_map(cache);
return false;
}
@@ -491,8 +616,56 @@ bool prepare_gpu_rhs_cache(GpuRhsCache &cache, int device, int *ex)
return true;
}
const double *find_mapped_device_buffer(const GpuRhsCache &cache, const double *host_ptr)
{
if (!host_ptr)
return nullptr;
for (int i = 0; i < cache.mapped_buffer_count; ++i)
{
if (cache.host_buffers[i] == host_ptr)
return cache.device_buffers[i];
}
return nullptr;
}
} // namespace
const double *bssn_gpu_find_device_buffer(const double *host_ptr)
{
const double *device_ptr = find_external_device_buffer(external_buffer_registry(), host_ptr);
if (device_ptr)
return device_ptr;
return find_mapped_device_buffer(gpu_rhs_cache(), host_ptr);
}
int bssn_gpu_bind_process_device(int mpi_rank)
{
const int device = select_cuda_device_for_process(mpi_rank);
if (device < 0)
return 1;
cudaError_t err = cudaSetDevice(device);
if (err != cudaSuccess)
{
cerr << "cudaSetDevice(" << device << ") failed: "
<< cudaGetErrorString(err) << endl;
return 1;
}
return 0;
}
void bssn_gpu_clear_cached_device_buffers()
{
reset_external_buffer_map(external_buffer_registry());
reset_buffer_map(gpu_rhs_cache());
}
void bssn_gpu_register_device_buffer(const double *host_ptr, const double *device_ptr)
{
map_external_buffer(external_buffer_registry(), host_ptr, device_ptr);
}
__global__ void test_const_address(double * testd){
int _t = blockIdx.x*blockDim.x+threadIdx.x;
if(_t == 0)
@@ -739,16 +912,12 @@ __global__ void sub_symmetry_bd_partK(int ord,double * func, double * funcc,doub
}
#endif //ifdef Cell
#endif //ifdef Vertex
inline void sub_symmetry_bd(int ord,double * func, double * funcc,double * SoA){
sub_symmetry_bd_partF<<<GRID_DIM,BLOCK_DIM>>>(ord,func,funcc);
cudaThreadSynchronize();
sub_symmetry_bd_partI<<<GRID_DIM,BLOCK_DIM>>>(ord,func,funcc,SoA[0]);
cudaThreadSynchronize();
sub_symmetry_bd_partJ<<<GRID_DIM,BLOCK_DIM>>>(ord,func,funcc,SoA[1]);
cudaThreadSynchronize();
sub_symmetry_bd_partK<<<GRID_DIM,BLOCK_DIM>>>(ord,func,funcc,SoA[2]);
cudaThreadSynchronize();
}
inline void sub_symmetry_bd(int ord,double * func, double * funcc,double * SoA){
sub_symmetry_bd_partF<<<GRID_DIM,BLOCK_DIM>>>(ord,func,funcc);
sub_symmetry_bd_partI<<<GRID_DIM,BLOCK_DIM>>>(ord,func,funcc,SoA[0]);
sub_symmetry_bd_partJ<<<GRID_DIM,BLOCK_DIM>>>(ord,func,funcc,SoA[1]);
sub_symmetry_bd_partK<<<GRID_DIM,BLOCK_DIM>>>(ord,func,funcc,SoA[2]);
}
__global__ void sub_fdderivs_part1(double * f,double *fh,double *fxx,double *fxy,double *fxz,double *fyy,double *fyz,double *fzz)
@@ -837,19 +1006,17 @@ __global__ void sub_fdderivs_part1(double * f,double *fh,double *fxx,double *fxy
__syncthreads();
}
inline void sub_fdderivs(double * f,double *fh,double *fxx,double *fxy,double *fxz,double *fyy,double *fyz,double *fzz,double* SoA)
{
sub_symmetry_bd(2,f,fh,SoA);
cudaMemset(fxx,0,_3D_SIZE[0] * sizeof(double));
cudaMemset(fxy,0,_3D_SIZE[0] * sizeof(double));
cudaMemset(fxz,0,_3D_SIZE[0] * sizeof(double));
cudaMemset(fyy,0,_3D_SIZE[0] * sizeof(double));
cudaMemset(fyz,0,_3D_SIZE[0] * sizeof(double));
cudaMemset(fzz,0,_3D_SIZE[0] * sizeof(double));
cudaThreadSynchronize();
sub_fdderivs_part1<<<GRID_DIM,BLOCK_DIM>>>(f,fh,fxx,fxy,fxz,fyy,fyz,fzz);
cudaThreadSynchronize();
}
inline void sub_fdderivs(double * f,double *fh,double *fxx,double *fxy,double *fxz,double *fyy,double *fyz,double *fzz,double* SoA)
{
sub_symmetry_bd(2,f,fh,SoA);
cudaMemset(fxx,0,_3D_SIZE[0] * sizeof(double));
cudaMemset(fxy,0,_3D_SIZE[0] * sizeof(double));
cudaMemset(fxz,0,_3D_SIZE[0] * sizeof(double));
cudaMemset(fyy,0,_3D_SIZE[0] * sizeof(double));
cudaMemset(fyz,0,_3D_SIZE[0] * sizeof(double));
cudaMemset(fzz,0,_3D_SIZE[0] * sizeof(double));
sub_fdderivs_part1<<<GRID_DIM,BLOCK_DIM>>>(f,fh,fxx,fxy,fxz,fyy,fyz,fzz);
}
__global__ void sub_fderivs_part1(double * f,double * fh,double *fx,double *fy,double *fz )
{
@@ -905,18 +1072,16 @@ __global__ void sub_fderivs_part1(double * f,double * fh,double *fx,double *fy,d
}
}
inline void sub_fderivs(double * f,double * fh,double *fx,double *fy,double *fz,double * SoA)
{
sub_symmetry_bd(2,f,fh,SoA);
cudaMemset(fx,0,_3D_SIZE[0] * sizeof(double));
cudaMemset(fy,0,_3D_SIZE[0] * sizeof(double));
cudaMemset(fz,0,_3D_SIZE[0] * sizeof(double));
cudaThreadSynchronize();
sub_fderivs_part1<<<GRID_DIM,BLOCK_DIM>>>(f,fh,fx,fy,fz);
cudaThreadSynchronize();
}
inline void sub_fderivs(double * f,double * fh,double *fx,double *fy,double *fz,double * SoA)
{
sub_symmetry_bd(2,f,fh,SoA);
cudaMemset(fx,0,_3D_SIZE[0] * sizeof(double));
cudaMemset(fy,0,_3D_SIZE[0] * sizeof(double));
cudaMemset(fz,0,_3D_SIZE[0] * sizeof(double));
sub_fderivs_part1<<<GRID_DIM,BLOCK_DIM>>>(f,fh,fx,fy,fz);
}
__global__ void computeRicci_part1(double * dst)
{
@@ -930,14 +1095,12 @@ __global__ void computeRicci_part1(double * dst)
}
}
inline void computeRicci(double * src,double* dst,double * SoA, Meta* meta)
{
sub_fdderivs(src,Mh_ fh,Mh_ fxx,Mh_ fxy,Mh_ fxz,Mh_ fyy,Mh_ fyz,Mh_ fzz,SoA);
cudaThreadSynchronize();
computeRicci_part1<<<GRID_DIM,BLOCK_DIM>>>(dst);
cudaThreadSynchronize();
}/*Exception*/
inline void computeRicci(double * src,double* dst,double * SoA, Meta* meta)
{
sub_fdderivs(src,Mh_ fh,Mh_ fxx,Mh_ fxy,Mh_ fxz,Mh_ fyy,Mh_ fyz,Mh_ fzz,SoA);
computeRicci_part1<<<GRID_DIM,BLOCK_DIM>>>(dst);
}/*Exception*/
__global__ void sub_kodis_part1(double *f,double *fh,double *f_rhs)
{
@@ -989,13 +1152,11 @@ __global__ void sub_kodis_part1(double *f,double *fh,double *f_rhs)
}
}
inline void sub_kodis(double *f,double *fh,double *f_rhs,double *SoA)
{
sub_symmetry_bd(3,f,fh,SoA);
cudaThreadSynchronize();
sub_kodis_part1<<<GRID_DIM,BLOCK_DIM>>>(f,fh,f_rhs);
cudaThreadSynchronize();
}
inline void sub_kodis(double *f,double *fh,double *f_rhs,double *SoA)
{
sub_symmetry_bd(3,f,fh,SoA);
sub_kodis_part1<<<GRID_DIM,BLOCK_DIM>>>(f,fh,f_rhs);
}
__global__ void sub_lopsided_part1(double *f,double* fh,double *f_rhs,double *Sfx,double *Sfy,double *Sfz)
{
@@ -1083,12 +1244,10 @@ __global__ void sub_lopsided_part1(double *f,double* fh,double *f_rhs,double *S
}
inline void sub_lopsided(double *f,double*fh,double *f_rhs,double *Sfx,double *Sfy,double *Sfz,double *SoA){
sub_symmetry_bd(3,f,fh,SoA);
cudaThreadSynchronize();
sub_lopsided_part1<<<GRID_DIM,BLOCK_DIM>>>(f,fh,f_rhs,Sfx,Sfy,Sfz);
cudaThreadSynchronize();
}
inline void sub_lopsided(double *f,double*fh,double *f_rhs,double *Sfx,double *Sfy,double *Sfz,double *SoA){
sub_symmetry_bd(3,f,fh,SoA);
sub_lopsided_part1<<<GRID_DIM,BLOCK_DIM>>>(f,fh,f_rhs,Sfx,Sfy,Sfz);
}
__global__ void compute_rhs_bssn_part1()
{
@@ -2866,6 +3025,8 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
cache.last_z = Z;
}
reset_buffer_map(cache);
const CopySpec state_copies[] = {
{Mh_ chi, chi, static_cast<size_t>(matrix_size)},
{Mh_ dxx, dxx, static_cast<size_t>(matrix_size)},
@@ -2892,7 +3053,7 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
{Mh_ dtSfy, dtSfy, static_cast<size_t>(matrix_size)},
{Mh_ dtSfz, dtSfz, static_cast<size_t>(matrix_size)},
};
if (!copy_buffers_to_device(state_copies, sizeof(state_copies) / sizeof(state_copies[0])))
if (!copy_buffers_to_device_preferring_device(state_copies, sizeof(state_copies) / sizeof(state_copies[0])))
return 1;
const ZeroSpec zero_specs[] = {
@@ -2977,9 +3138,57 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
};
if (!zero_buffers(zero_specs, sizeof(zero_specs) / sizeof(zero_specs[0])))
return 1;
double sss[3] = {1,1,1};
map_buffer(cache, chi, Mh_ chi);
map_buffer(cache, trK, Mh_ trK);
map_buffer(cache, dxx, Mh_ dxx);
map_buffer(cache, gxy, Mh_ gxy);
map_buffer(cache, gxz, Mh_ gxz);
map_buffer(cache, dyy, Mh_ dyy);
map_buffer(cache, gyz, Mh_ gyz);
map_buffer(cache, dzz, Mh_ dzz);
map_buffer(cache, Axx, Mh_ Axx);
map_buffer(cache, Axy, Mh_ Axy);
map_buffer(cache, Axz, Mh_ Axz);
map_buffer(cache, Ayy, Mh_ Ayy);
map_buffer(cache, Ayz, Mh_ Ayz);
map_buffer(cache, Azz, Mh_ Azz);
map_buffer(cache, Gamx, Mh_ Gamx);
map_buffer(cache, Gamy, Mh_ Gamy);
map_buffer(cache, Gamz, Mh_ Gamz);
map_buffer(cache, Lap, Mh_ Lap);
map_buffer(cache, betax, Mh_ betax);
map_buffer(cache, betay, Mh_ betay);
map_buffer(cache, betaz, Mh_ betaz);
map_buffer(cache, dtSfx, Mh_ dtSfx);
map_buffer(cache, dtSfy, Mh_ dtSfy);
map_buffer(cache, dtSfz, Mh_ dtSfz);
map_buffer(cache, chi_rhs, Mh_ chi_rhs);
map_buffer(cache, trK_rhs, Mh_ trK_rhs);
map_buffer(cache, gxx_rhs, Mh_ gxx_rhs);
map_buffer(cache, gxy_rhs, Mh_ gxy_rhs);
map_buffer(cache, gxz_rhs, Mh_ gxz_rhs);
map_buffer(cache, gyy_rhs, Mh_ gyy_rhs);
map_buffer(cache, gyz_rhs, Mh_ gyz_rhs);
map_buffer(cache, gzz_rhs, Mh_ gzz_rhs);
map_buffer(cache, Axx_rhs, Mh_ Axx_rhs);
map_buffer(cache, Axy_rhs, Mh_ Axy_rhs);
map_buffer(cache, Axz_rhs, Mh_ Axz_rhs);
map_buffer(cache, Ayy_rhs, Mh_ Ayy_rhs);
map_buffer(cache, Ayz_rhs, Mh_ Ayz_rhs);
map_buffer(cache, Azz_rhs, Mh_ Azz_rhs);
map_buffer(cache, Gamx_rhs, Mh_ Gamx_rhs);
map_buffer(cache, Gamy_rhs, Mh_ Gamy_rhs);
map_buffer(cache, Gamz_rhs, Mh_ Gamz_rhs);
map_buffer(cache, Lap_rhs, Mh_ Lap_rhs);
map_buffer(cache, betax_rhs, Mh_ betax_rhs);
map_buffer(cache, betay_rhs, Mh_ betay_rhs);
map_buffer(cache, betaz_rhs, Mh_ betaz_rhs);
map_buffer(cache, dtSfx_rhs, Mh_ dtSfx_rhs);
map_buffer(cache, dtSfy_rhs, Mh_ dtSfy_rhs);
map_buffer(cache, dtSfz_rhs, Mh_ dtSfz_rhs);
double sss[3] = {1,1,1};
double aas[3] = {-1,-1,1};
double asa[3] = {-1,1,-1};
double saa[3] = {1,-1,-1};
@@ -3116,9 +3325,7 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
#endif
//cout<<"GPU meta data ready.\n";
cudaThreadSynchronize();
//--------------test constant memory address & value--------------
//--------------test constant memory address & value--------------
/* double rank = mpi_rank;
cudaMemcpyToSymbol(F1o3,&rank, sizeof(double));
double ctest1 = -1;
@@ -3137,11 +3344,10 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
//#4-----------------------calculate------------------------------
//4.0------enforce_ga---------
//sub_enforce_ga(matrix_size);
//4.1-----compute rhs---------
compute_rhs_bssn_part1<<<GRID_DIM,BLOCK_DIM>>>();
cudaThreadSynchronize();
sub_fderivs(Mh_ betax,Mh_ fh,Mh_ betaxx,Mh_ betaxy,Mh_ betaxz,ass);
//4.1-----compute rhs---------
compute_rhs_bssn_part1<<<GRID_DIM,BLOCK_DIM>>>();
sub_fderivs(Mh_ betax,Mh_ fh,Mh_ betaxx,Mh_ betaxy,Mh_ betaxz,ass);
sub_fderivs(Mh_ betay,Mh_ fh,Mh_ betayx,Mh_ betayy,Mh_ betayz,sas);
sub_fderivs(Mh_ betaz,Mh_ fh,Mh_ betazx,Mh_ betazy,Mh_ betazz,ssa);
sub_fderivs(Mh_ chi,Mh_ fh,Mh_ chix,Mh_ chiy,Mh_ chiz, sss);
@@ -3153,44 +3359,37 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
sub_fderivs(Mh_ gxy,Mh_ fh,Mh_ gxyx,Mh_ gxyy,Mh_ gxyz, aas);
sub_fderivs(Mh_ gxz,Mh_ fh,Mh_ gxzx,Mh_ gxzy,Mh_ gxzz, asa);
sub_fderivs(Mh_ gyz,Mh_ fh,Mh_ gyzx,Mh_ gyzy,Mh_ gyzz, saa);
compute_rhs_bssn_part2<<<GRID_DIM,BLOCK_DIM>>>();
cudaThreadSynchronize();
sub_fdderivs(Mh_ betax,Mh_ fh,Mh_ gxxx,Mh_ gxyx,Mh_ gxzx,Mh_ gyyx,Mh_ gyzx,Mh_ gzzx,ass);
compute_rhs_bssn_part2<<<GRID_DIM,BLOCK_DIM>>>();
sub_fdderivs(Mh_ betax,Mh_ fh,Mh_ gxxx,Mh_ gxyx,Mh_ gxzx,Mh_ gyyx,Mh_ gyzx,Mh_ gzzx,ass);
sub_fdderivs(Mh_ betay,Mh_ fh,Mh_ gxxy,Mh_ gxyy,Mh_ gxzy,Mh_ gyyy,Mh_ gyzy,Mh_ gzzy,sas);
sub_fdderivs(Mh_ betaz,Mh_ fh,Mh_ gxxz,Mh_ gxyz,Mh_ gxzz,Mh_ gyyz,Mh_ gyzz,Mh_ gzzz,ssa);
sub_fderivs( Mh_ Gamx, Mh_ fh,Mh_ Gamxx, Mh_ Gamxy, Mh_ Gamxz,ass);
sub_fderivs( Mh_ Gamy, Mh_ fh,Mh_ Gamyx, Mh_ Gamyy, Mh_ Gamyz,sas);
sub_fderivs( Mh_ Gamz, Mh_ fh,Mh_ Gamzx, Mh_ Gamzy, Mh_ Gamzz,ssa);
compute_rhs_bssn_part3<<<GRID_DIM,BLOCK_DIM>>>();
cudaThreadSynchronize();
computeRicci(Mh_ dxx,Mh_ Rxx,sss, meta);
compute_rhs_bssn_part3<<<GRID_DIM,BLOCK_DIM>>>();
computeRicci(Mh_ dxx,Mh_ Rxx,sss, meta);
computeRicci(Mh_ dyy,Mh_ Ryy,sss, meta);
computeRicci(Mh_ dzz,Mh_ Rzz,sss, meta);
computeRicci(Mh_ gxy,Mh_ Rxy,aas, meta);
computeRicci(Mh_ gxz,Mh_ Rxz,asa, meta);
computeRicci(Mh_ gyz,Mh_ Ryz,saa, meta);
cudaThreadSynchronize();
compute_rhs_bssn_part4<<<GRID_DIM,BLOCK_DIM>>>();
cudaThreadSynchronize();
sub_fdderivs(Mh_ chi,Mh_ fh,Mh_ fxx,Mh_ fxy,Mh_ fxz,Mh_ fyy,Mh_ fyz,Mh_ fzz,sss);
compute_rhs_bssn_part5<<<GRID_DIM,BLOCK_DIM>>>();
cudaThreadSynchronize();
sub_fdderivs(Mh_ Lap,Mh_ fh,Mh_ fxx,Mh_ fxy,Mh_ fxz,Mh_ fyy,Mh_ fyz,Mh_ fzz,sss);
compute_rhs_bssn_part6<<<GRID_DIM,BLOCK_DIM>>>();
cudaThreadSynchronize();
#if (GAUGE == 2 || GAUGE == 3 || GAUGE == 4 || GAUGE == 5)
sub_fderivs(Mh_ chi,Mh_ fh, Mh_ dtSfx_rhs, Mh_ dtSfy_rhs, Mh_ dtSfz_rhs,sss);
compute_rhs_bssn_part4<<<GRID_DIM,BLOCK_DIM>>>();
sub_fdderivs(Mh_ chi,Mh_ fh,Mh_ fxx,Mh_ fxy,Mh_ fxz,Mh_ fyy,Mh_ fyz,Mh_ fzz,sss);
compute_rhs_bssn_part5<<<GRID_DIM,BLOCK_DIM>>>();
sub_fdderivs(Mh_ Lap,Mh_ fh,Mh_ fxx,Mh_ fxy,Mh_ fxz,Mh_ fyy,Mh_ fyz,Mh_ fzz,sss);
compute_rhs_bssn_part6<<<GRID_DIM,BLOCK_DIM>>>();
#if (GAUGE == 2 || GAUGE == 3 || GAUGE == 4 || GAUGE == 5)
sub_fderivs(Mh_ chi,Mh_ fh, Mh_ dtSfx_rhs, Mh_ dtSfy_rhs, Mh_ dtSfz_rhs,sss);
compute_rhs_bssn_part6_gauge<<<GRID_DIM,BLOCK_DIM>>>();
#endif
@@ -3257,19 +3456,17 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
}
if(co == 0){
compute_rhs_bssn_part7<<<GRID_DIM,BLOCK_DIM>>>();
cudaThreadSynchronize();
sub_fderivs(Mh_ Axx,Mh_ fh,Mh_ gxxx,Mh_ gxxy,Mh_ gxxz,sss);
if(co == 0){
compute_rhs_bssn_part7<<<GRID_DIM,BLOCK_DIM>>>();
sub_fderivs(Mh_ Axx,Mh_ fh,Mh_ gxxx,Mh_ gxxy,Mh_ gxxz,sss);
sub_fderivs(Mh_ Axy,Mh_ fh,Mh_ gxyx,Mh_ gxyy,Mh_ gxyz,aas);
sub_fderivs(Mh_ Axz,Mh_ fh,Mh_ gxzx,Mh_ gxzy,Mh_ gxzz,asa);
sub_fderivs(Mh_ Ayy,Mh_ fh,Mh_ gyyx,Mh_ gyyy,Mh_ gyyz,sss);
sub_fderivs(Mh_ Ayz,Mh_ fh,Mh_ gyzx,Mh_ gyzy,Mh_ gyzz,saa);
sub_fderivs(Mh_ Azz,Mh_ fh,Mh_ gzzx,Mh_ gzzy,Mh_ gzzz,sss);
compute_rhs_bssn_part8<<<GRID_DIM,BLOCK_DIM>>>();
cudaThreadSynchronize();
}
sub_fderivs(Mh_ Ayz,Mh_ fh,Mh_ gyzx,Mh_ gyzy,Mh_ gyzz,saa);
sub_fderivs(Mh_ Azz,Mh_ fh,Mh_ gzzx,Mh_ gzzy,Mh_ gzzz,sss);
compute_rhs_bssn_part8<<<GRID_DIM,BLOCK_DIM>>>();
}
#if (ABV == 1)
cout<<"TODO: bssn_gpu.cu::2373 (ABV == 1)"<<endl;
@@ -3283,33 +3480,11 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
cudaMemcpy(betax, Mh_ betax, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(betay, Mh_ betay, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(betaz, Mh_ betaz, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);*/
if(calledby == CALLED_BY_STEP)
{
cudaMemcpy(chi_rhs, Mh_ chi_rhs, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(trK_rhs, Mh_ trK_rhs, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(gxx_rhs, Mh_ gxx_rhs, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(gxy_rhs, Mh_ gxy_rhs, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(gxz_rhs, Mh_ gxz_rhs, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(gyy_rhs, Mh_ gyy_rhs, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(gyz_rhs, Mh_ gyz_rhs, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(gzz_rhs, Mh_ gzz_rhs, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(Axx_rhs, Mh_ Axx_rhs, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(Axy_rhs, Mh_ Axy_rhs, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(Axz_rhs, Mh_ Axz_rhs, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(Ayy_rhs, Mh_ Ayy_rhs, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(Ayz_rhs, Mh_ Ayz_rhs, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(Azz_rhs, Mh_ Azz_rhs, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(Gamx_rhs, Mh_ Gamx_rhs, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(Gamy_rhs, Mh_ Gamy_rhs, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(Gamz_rhs, Mh_ Gamz_rhs, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(Lap_rhs, Mh_ Lap_rhs, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(betax_rhs, Mh_ betax_rhs, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(betay_rhs, Mh_ betay_rhs, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(betaz_rhs, Mh_ betaz_rhs, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(dtSfx_rhs, Mh_ dtSfx_rhs, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(dtSfy_rhs, Mh_ dtSfy_rhs, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(dtSfz_rhs, Mh_ dtSfz_rhs, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
}
if(calledby == CALLED_BY_STEP)
{
// Main GPU evolution path now consumes RHS directly from device caches.
// Avoiding this bulk D2H copy removes one full round-trip per RK stage.
}
else if(calledby == CALLED_BY_CONSTRAINT)
{
cudaMemcpy(Gamxxx, Mh_ Gamxxx, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);