Stabilize GPU output path and MPI sync

2026-04-09 10:57:49 +08:00
parent 4e3946a4f0
commit 49409645c0
8 changed files with 748 additions and 334 deletions
--- a/AMSS_NCKU_source/bssn_gpu.cu
+++ b/AMSS_NCKU_source/bssn_gpu.cu
@@ -135,6 +135,18 @@ struct GpuRhsCache
 	const double *last_y = nullptr;
 	const double *last_z = nullptr;
 	bool meta_uploaded = false;
+	static const int max_mapped_buffers = 128;
+	const double *host_buffers[max_mapped_buffers] = {nullptr};
+	const double *device_buffers[max_mapped_buffers] = {nullptr};
+	int mapped_buffer_count = 0;
+};
+
+struct ExternalBufferRegistry
+{
+	static const int max_mapped_buffers = 256;
+	const double *host_buffers[max_mapped_buffers] = {nullptr};
+	const double *device_buffers[max_mapped_buffers] = {nullptr};
+	int mapped_buffer_count = 0;
 };

 GpuRhsCache &gpu_rhs_cache()
@@ -143,11 +155,81 @@ GpuRhsCache &gpu_rhs_cache()
 	return cache;
 }

+ExternalBufferRegistry &external_buffer_registry()
+{
+	static thread_local ExternalBufferRegistry registry;
+	return registry;
+}
+
 void reset_meta(Meta *meta)
 {
 	memset(meta, 0, sizeof(Meta));
 }

+void reset_buffer_map(GpuRhsCache &cache)
+{
+	cache.mapped_buffer_count = 0;
+	for (int i = 0; i < GpuRhsCache::max_mapped_buffers; ++i)
+	{
+		cache.host_buffers[i] = nullptr;
+		cache.device_buffers[i] = nullptr;
+	}
+}
+
+void map_buffer(GpuRhsCache &cache, const double *host_ptr, const double *device_ptr)
+{
+	if (!host_ptr || !device_ptr)
+		return;
+
+	for (int i = 0; i < cache.mapped_buffer_count; ++i)
+	{
+		if (cache.host_buffers[i] == host_ptr)
+		{
+			cache.device_buffers[i] = device_ptr;
+			return;
+		}
+	}
+
+	if (cache.mapped_buffer_count >= GpuRhsCache::max_mapped_buffers)
+		return;
+
+	cache.host_buffers[cache.mapped_buffer_count] = host_ptr;
+	cache.device_buffers[cache.mapped_buffer_count] = device_ptr;
+	cache.mapped_buffer_count++;
+}
+
+void reset_external_buffer_map(ExternalBufferRegistry &registry)
+{
+	registry.mapped_buffer_count = 0;
+	for (int i = 0; i < ExternalBufferRegistry::max_mapped_buffers; ++i)
+	{
+		registry.host_buffers[i] = nullptr;
+		registry.device_buffers[i] = nullptr;
+	}
+}
+
+void map_external_buffer(ExternalBufferRegistry &registry, const double *host_ptr, const double *device_ptr)
+{
+	if (!host_ptr || !device_ptr)
+		return;
+
+	for (int i = 0; i < registry.mapped_buffer_count; ++i)
+	{
+		if (registry.host_buffers[i] == host_ptr)
+		{
+			registry.device_buffers[i] = device_ptr;
+			return;
+		}
+	}
+
+	if (registry.mapped_buffer_count >= ExternalBufferRegistry::max_mapped_buffers)
+		return;
+
+	registry.host_buffers[registry.mapped_buffer_count] = host_ptr;
+	registry.device_buffers[registry.mapped_buffer_count] = device_ptr;
+	registry.mapped_buffer_count++;
+}
+
 bool ensure_device_buffer(double **ptr, size_t count)
 {
 	if (*ptr)
@@ -188,6 +270,45 @@ bool copy_buffers_to_device(const CopySpec *specs, size_t count)
 	return true;
 }

+const double *find_external_device_buffer(const ExternalBufferRegistry &registry, const double *host_ptr)
+{
+	if (!host_ptr)
+		return nullptr;
+
+	for (int i = 0; i < registry.mapped_buffer_count; ++i)
+	{
+		if (registry.host_buffers[i] == host_ptr)
+			return registry.device_buffers[i];
+	}
+	return nullptr;
+}
+
+bool copy_buffers_to_device_preferring_device(const CopySpec *specs, size_t count)
+{
+	for (size_t i = 0; i < count; ++i)
+	{
+		const double *device_src = bssn_gpu_find_device_buffer(specs[i].src);
+		cudaMemcpyKind kind = cudaMemcpyHostToDevice;
+		const void *copy_src = specs[i].src;
+		if (device_src)
+		{
+			copy_src = device_src;
+			kind = cudaMemcpyDeviceToDevice;
+		}
+
+		cudaError_t err = cudaMemcpy(specs[i].dst, copy_src,
+		                             specs[i].count * sizeof(double),
+		                             kind);
+		if (err != cudaSuccess)
+		{
+			cerr << "cudaMemcpy(" << (kind == cudaMemcpyDeviceToDevice ? "D2D" : "H2D")
+			     << ") failed: " << cudaGetErrorString(err) << endl;
+			return false;
+		}
+	}
+	return true;
+}
+
 bool zero_buffers(const ZeroSpec *specs, size_t count)
 {
 	for (size_t i = 0; i < count; ++i)
@@ -219,6 +340,8 @@ void cleanup_gpu_rhs_cache()
 	cache.last_x = nullptr;
 	cache.last_y = nullptr;
 	cache.last_z = nullptr;
+	reset_buffer_map(cache);
+	reset_external_buffer_map(external_buffer_registry());
 }

 bool register_gpu_rhs_cleanup()
@@ -285,6 +408,7 @@ bool prepare_gpu_rhs_cache(GpuRhsCache &cache, int device, int *ex)
 	cache.last_y = nullptr;
 	cache.last_z = nullptr;
 	cache.meta_uploaded = false;
+	reset_buffer_map(cache);

 	Meta *meta = &cache.meta;
 	const int matrix_size = cache.matrix_size;
@@ -467,6 +591,7 @@ bool prepare_gpu_rhs_cache(GpuRhsCache &cache, int device, int *ex)
 		cache.last_x = nullptr;
 		cache.last_y = nullptr;
 		cache.last_z = nullptr;
+		reset_buffer_map(cache);
 		return false;
 	}

@@ -491,8 +616,56 @@ bool prepare_gpu_rhs_cache(GpuRhsCache &cache, int device, int *ex)
 	return true;
 }

+const double *find_mapped_device_buffer(const GpuRhsCache &cache, const double *host_ptr)
+{
+	if (!host_ptr)
+		return nullptr;
+
+	for (int i = 0; i < cache.mapped_buffer_count; ++i)
+	{
+		if (cache.host_buffers[i] == host_ptr)
+			return cache.device_buffers[i];
+	}
+	return nullptr;
+}
+
 } // namespace

+const double *bssn_gpu_find_device_buffer(const double *host_ptr)
+{
+	const double *device_ptr = find_external_device_buffer(external_buffer_registry(), host_ptr);
+	if (device_ptr)
+		return device_ptr;
+	return find_mapped_device_buffer(gpu_rhs_cache(), host_ptr);
+}
+
+int bssn_gpu_bind_process_device(int mpi_rank)
+{
+	const int device = select_cuda_device_for_process(mpi_rank);
+	if (device < 0)
+		return 1;
+
+	cudaError_t err = cudaSetDevice(device);
+	if (err != cudaSuccess)
+	{
+		cerr << "cudaSetDevice(" << device << ") failed: "
+		     << cudaGetErrorString(err) << endl;
+		return 1;
+	}
+	return 0;
+}
+
+void bssn_gpu_clear_cached_device_buffers()
+{
+	reset_external_buffer_map(external_buffer_registry());
+	reset_buffer_map(gpu_rhs_cache());
+}
+
+void bssn_gpu_register_device_buffer(const double *host_ptr, const double *device_ptr)
+{
+	map_external_buffer(external_buffer_registry(), host_ptr, device_ptr);
+}
+
 __global__ void test_const_address(double * testd){
 	int _t = blockIdx.x*blockDim.x+threadIdx.x;
 	if(_t == 0)
@@ -739,16 +912,12 @@ __global__ void sub_symmetry_bd_partK(int ord,double * func, double * funcc,doub
 }
 #endif //ifdef Cell
 #endif //ifdef Vertex
-inline void sub_symmetry_bd(int ord,double * func, double * funcc,double * SoA){
-	sub_symmetry_bd_partF<<<GRID_DIM,BLOCK_DIM>>>(ord,func,funcc);
-	cudaThreadSynchronize();
-	sub_symmetry_bd_partI<<<GRID_DIM,BLOCK_DIM>>>(ord,func,funcc,SoA[0]);
-	cudaThreadSynchronize();
-	sub_symmetry_bd_partJ<<<GRID_DIM,BLOCK_DIM>>>(ord,func,funcc,SoA[1]);
-	cudaThreadSynchronize();
-	sub_symmetry_bd_partK<<<GRID_DIM,BLOCK_DIM>>>(ord,func,funcc,SoA[2]);
-	cudaThreadSynchronize();
-}
+inline void sub_symmetry_bd(int ord,double * func, double * funcc,double * SoA){
+	sub_symmetry_bd_partF<<<GRID_DIM,BLOCK_DIM>>>(ord,func,funcc);
+	sub_symmetry_bd_partI<<<GRID_DIM,BLOCK_DIM>>>(ord,func,funcc,SoA[0]);
+	sub_symmetry_bd_partJ<<<GRID_DIM,BLOCK_DIM>>>(ord,func,funcc,SoA[1]);
+	sub_symmetry_bd_partK<<<GRID_DIM,BLOCK_DIM>>>(ord,func,funcc,SoA[2]);
+}


 __global__ void sub_fdderivs_part1(double * f,double *fh,double *fxx,double *fxy,double *fxz,double *fyy,double *fyz,double *fzz)
@@ -837,19 +1006,17 @@ __global__ void sub_fdderivs_part1(double * f,double *fh,double *fxx,double *fxy
 	__syncthreads();
 }

-inline void sub_fdderivs(double * f,double *fh,double *fxx,double *fxy,double *fxz,double *fyy,double *fyz,double *fzz,double* SoA)
-{
-	sub_symmetry_bd(2,f,fh,SoA);
-	cudaMemset(fxx,0,_3D_SIZE[0] * sizeof(double));
-	cudaMemset(fxy,0,_3D_SIZE[0] * sizeof(double));
-	cudaMemset(fxz,0,_3D_SIZE[0] * sizeof(double));
-	cudaMemset(fyy,0,_3D_SIZE[0] * sizeof(double));
-	cudaMemset(fyz,0,_3D_SIZE[0] * sizeof(double));
-	cudaMemset(fzz,0,_3D_SIZE[0] * sizeof(double));
-	cudaThreadSynchronize(); 
-	sub_fdderivs_part1<<<GRID_DIM,BLOCK_DIM>>>(f,fh,fxx,fxy,fxz,fyy,fyz,fzz);
-	cudaThreadSynchronize(); 
-}
+inline void sub_fdderivs(double * f,double *fh,double *fxx,double *fxy,double *fxz,double *fyy,double *fyz,double *fzz,double* SoA)
+{
+	sub_symmetry_bd(2,f,fh,SoA);
+	cudaMemset(fxx,0,_3D_SIZE[0] * sizeof(double));
+	cudaMemset(fxy,0,_3D_SIZE[0] * sizeof(double));
+	cudaMemset(fxz,0,_3D_SIZE[0] * sizeof(double));
+	cudaMemset(fyy,0,_3D_SIZE[0] * sizeof(double));
+	cudaMemset(fyz,0,_3D_SIZE[0] * sizeof(double));
+	cudaMemset(fzz,0,_3D_SIZE[0] * sizeof(double));
+	sub_fdderivs_part1<<<GRID_DIM,BLOCK_DIM>>>(f,fh,fxx,fxy,fxz,fyy,fyz,fzz);
+}

 __global__ void sub_fderivs_part1(double * f,double * fh,double *fx,double *fy,double *fz  )
 {	
@@ -905,18 +1072,16 @@ __global__ void sub_fderivs_part1(double * f,double * fh,double *fx,double *fy,d
 	}
 }
 
-inline void sub_fderivs(double * f,double * fh,double *fx,double *fy,double *fz,double * SoA)
-{
-	sub_symmetry_bd(2,f,fh,SoA);
-	
-	cudaMemset(fx,0,_3D_SIZE[0] * sizeof(double));
-	cudaMemset(fy,0,_3D_SIZE[0] * sizeof(double));
-	cudaMemset(fz,0,_3D_SIZE[0] * sizeof(double));
-	
-	cudaThreadSynchronize(); 
-	sub_fderivs_part1<<<GRID_DIM,BLOCK_DIM>>>(f,fh,fx,fy,fz);
-	cudaThreadSynchronize();
-}
+inline void sub_fderivs(double * f,double * fh,double *fx,double *fy,double *fz,double * SoA)
+{
+	sub_symmetry_bd(2,f,fh,SoA);
+	
+	cudaMemset(fx,0,_3D_SIZE[0] * sizeof(double));
+	cudaMemset(fy,0,_3D_SIZE[0] * sizeof(double));
+	cudaMemset(fz,0,_3D_SIZE[0] * sizeof(double));
+	
+	sub_fderivs_part1<<<GRID_DIM,BLOCK_DIM>>>(f,fh,fx,fy,fz);
+}

 __global__ void computeRicci_part1(double * dst)
 {
@@ -930,14 +1095,12 @@ __global__ void computeRicci_part1(double * dst)
 	}
 }

- inline void computeRicci(double * src,double* dst,double * SoA, Meta* meta)
-{
-	sub_fdderivs(src,Mh_ fh,Mh_ fxx,Mh_ fxy,Mh_ fxz,Mh_ fyy,Mh_ fyz,Mh_ fzz,SoA);
-	cudaThreadSynchronize();
-	computeRicci_part1<<<GRID_DIM,BLOCK_DIM>>>(dst);
-	cudaThreadSynchronize();
-	
-}/*Exception*/
+ inline void computeRicci(double * src,double* dst,double * SoA, Meta* meta)
+{
+	sub_fdderivs(src,Mh_ fh,Mh_ fxx,Mh_ fxy,Mh_ fxz,Mh_ fyy,Mh_ fyz,Mh_ fzz,SoA);
+	computeRicci_part1<<<GRID_DIM,BLOCK_DIM>>>(dst);
+	
+}/*Exception*/

 __global__ void sub_kodis_part1(double *f,double *fh,double *f_rhs)
 {
@@ -989,13 +1152,11 @@ __global__ void sub_kodis_part1(double *f,double *fh,double *f_rhs)
 	}
 }

-inline void sub_kodis(double *f,double *fh,double *f_rhs,double *SoA)
-{
-	sub_symmetry_bd(3,f,fh,SoA);
-	cudaThreadSynchronize();
-	sub_kodis_part1<<<GRID_DIM,BLOCK_DIM>>>(f,fh,f_rhs);
-	cudaThreadSynchronize();
-}
+inline void sub_kodis(double *f,double *fh,double *f_rhs,double *SoA)
+{
+	sub_symmetry_bd(3,f,fh,SoA);
+	sub_kodis_part1<<<GRID_DIM,BLOCK_DIM>>>(f,fh,f_rhs);
+}
 
 __global__ void  sub_lopsided_part1(double *f,double* fh,double *f_rhs,double *Sfx,double *Sfy,double *Sfz)
 {
@@ -1083,12 +1244,10 @@ __global__ void  sub_lopsided_part1(double *f,double* fh,double *f_rhs,double *S
 }


-inline void  sub_lopsided(double *f,double*fh,double *f_rhs,double *Sfx,double *Sfy,double *Sfz,double *SoA){
-	sub_symmetry_bd(3,f,fh,SoA);
-	cudaThreadSynchronize(); 
-	sub_lopsided_part1<<<GRID_DIM,BLOCK_DIM>>>(f,fh,f_rhs,Sfx,Sfy,Sfz);
-	cudaThreadSynchronize(); 
-}
+inline void  sub_lopsided(double *f,double*fh,double *f_rhs,double *Sfx,double *Sfy,double *Sfz,double *SoA){
+	sub_symmetry_bd(3,f,fh,SoA);
+	sub_lopsided_part1<<<GRID_DIM,BLOCK_DIM>>>(f,fh,f_rhs,Sfx,Sfy,Sfz);
+}

 __global__ void compute_rhs_bssn_part1() 
 {
@@ -2866,6 +3025,8 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
 		cache.last_z = Z;
 	}

+	reset_buffer_map(cache);
+
 	const CopySpec state_copies[] = {
 		{Mh_ chi, chi, static_cast<size_t>(matrix_size)},
 		{Mh_ dxx, dxx, static_cast<size_t>(matrix_size)},
@@ -2892,7 +3053,7 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
 		{Mh_ dtSfy, dtSfy, static_cast<size_t>(matrix_size)},
 		{Mh_ dtSfz, dtSfz, static_cast<size_t>(matrix_size)},
 	};
-	if (!copy_buffers_to_device(state_copies, sizeof(state_copies) / sizeof(state_copies[0])))
+	if (!copy_buffers_to_device_preferring_device(state_copies, sizeof(state_copies) / sizeof(state_copies[0])))
 		return 1;

 	const ZeroSpec zero_specs[] = {
@@ -2977,9 +3138,57 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
 	};
 	if (!zero_buffers(zero_specs, sizeof(zero_specs) / sizeof(zero_specs[0])))
 		return 1;
-	
-	
-	double sss[3] = {1,1,1};
+	map_buffer(cache, chi, Mh_ chi);
+	map_buffer(cache, trK, Mh_ trK);
+	map_buffer(cache, dxx, Mh_ dxx);
+	map_buffer(cache, gxy, Mh_ gxy);
+	map_buffer(cache, gxz, Mh_ gxz);
+	map_buffer(cache, dyy, Mh_ dyy);
+	map_buffer(cache, gyz, Mh_ gyz);
+	map_buffer(cache, dzz, Mh_ dzz);
+	map_buffer(cache, Axx, Mh_ Axx);
+	map_buffer(cache, Axy, Mh_ Axy);
+	map_buffer(cache, Axz, Mh_ Axz);
+	map_buffer(cache, Ayy, Mh_ Ayy);
+	map_buffer(cache, Ayz, Mh_ Ayz);
+	map_buffer(cache, Azz, Mh_ Azz);
+	map_buffer(cache, Gamx, Mh_ Gamx);
+	map_buffer(cache, Gamy, Mh_ Gamy);
+	map_buffer(cache, Gamz, Mh_ Gamz);
+	map_buffer(cache, Lap, Mh_ Lap);
+	map_buffer(cache, betax, Mh_ betax);
+	map_buffer(cache, betay, Mh_ betay);
+	map_buffer(cache, betaz, Mh_ betaz);
+	map_buffer(cache, dtSfx, Mh_ dtSfx);
+	map_buffer(cache, dtSfy, Mh_ dtSfy);
+	map_buffer(cache, dtSfz, Mh_ dtSfz);
+	map_buffer(cache, chi_rhs, Mh_ chi_rhs);
+	map_buffer(cache, trK_rhs, Mh_ trK_rhs);
+	map_buffer(cache, gxx_rhs, Mh_ gxx_rhs);
+	map_buffer(cache, gxy_rhs, Mh_ gxy_rhs);
+	map_buffer(cache, gxz_rhs, Mh_ gxz_rhs);
+	map_buffer(cache, gyy_rhs, Mh_ gyy_rhs);
+	map_buffer(cache, gyz_rhs, Mh_ gyz_rhs);
+	map_buffer(cache, gzz_rhs, Mh_ gzz_rhs);
+	map_buffer(cache, Axx_rhs, Mh_ Axx_rhs);
+	map_buffer(cache, Axy_rhs, Mh_ Axy_rhs);
+	map_buffer(cache, Axz_rhs, Mh_ Axz_rhs);
+	map_buffer(cache, Ayy_rhs, Mh_ Ayy_rhs);
+	map_buffer(cache, Ayz_rhs, Mh_ Ayz_rhs);
+	map_buffer(cache, Azz_rhs, Mh_ Azz_rhs);
+	map_buffer(cache, Gamx_rhs, Mh_ Gamx_rhs);
+	map_buffer(cache, Gamy_rhs, Mh_ Gamy_rhs);
+	map_buffer(cache, Gamz_rhs, Mh_ Gamz_rhs);
+	map_buffer(cache, Lap_rhs, Mh_ Lap_rhs);
+	map_buffer(cache, betax_rhs, Mh_ betax_rhs);
+	map_buffer(cache, betay_rhs, Mh_ betay_rhs);
+	map_buffer(cache, betaz_rhs, Mh_ betaz_rhs);
+	map_buffer(cache, dtSfx_rhs, Mh_ dtSfx_rhs);
+	map_buffer(cache, dtSfy_rhs, Mh_ dtSfy_rhs);
+	map_buffer(cache, dtSfz_rhs, Mh_ dtSfz_rhs);
+	
+	
+	double sss[3] = {1,1,1};
 	double aas[3] = {-1,-1,1};
 	double asa[3] = {-1,1,-1};
 	double saa[3] = {1,-1,-1};
@@ -3116,9 +3325,7 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
 #endif	
 	//cout<<"GPU meta data ready.\n";
 	
-	cudaThreadSynchronize();
-
-//--------------test constant memory address & value--------------
+	//--------------test constant memory address & value--------------
 /*	double rank = mpi_rank;
 	cudaMemcpyToSymbol(F1o3,&rank, sizeof(double));
 	double ctest1 = -1;
@@ -3137,11 +3344,10 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
 //#4-----------------------calculate------------------------------
 	//4.0------enforce_ga---------
 	//sub_enforce_ga(matrix_size);
-	//4.1-----compute rhs---------
-	compute_rhs_bssn_part1<<<GRID_DIM,BLOCK_DIM>>>();
-	cudaThreadSynchronize();
-
-	sub_fderivs(Mh_ betax,Mh_ fh,Mh_ betaxx,Mh_ betaxy,Mh_ betaxz,ass);
+	//4.1-----compute rhs---------
+	compute_rhs_bssn_part1<<<GRID_DIM,BLOCK_DIM>>>();
+
+	sub_fderivs(Mh_ betax,Mh_ fh,Mh_ betaxx,Mh_ betaxy,Mh_ betaxz,ass);
 	sub_fderivs(Mh_ betay,Mh_ fh,Mh_ betayx,Mh_ betayy,Mh_ betayz,sas);
 	sub_fderivs(Mh_ betaz,Mh_ fh,Mh_ betazx,Mh_ betazy,Mh_ betazz,ssa);
 	sub_fderivs(Mh_ chi,Mh_ fh,Mh_ chix,Mh_ chiy,Mh_ chiz, sss);
@@ -3153,44 +3359,37 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
 	sub_fderivs(Mh_ gxy,Mh_ fh,Mh_ gxyx,Mh_ gxyy,Mh_ gxyz, aas);
 	sub_fderivs(Mh_ gxz,Mh_ fh,Mh_ gxzx,Mh_ gxzy,Mh_ gxzz, asa);
 	sub_fderivs(Mh_ gyz,Mh_ fh,Mh_ gyzx,Mh_ gyzy,Mh_ gyzz, saa);
-  	
-  	compute_rhs_bssn_part2<<<GRID_DIM,BLOCK_DIM>>>();
-	cudaThreadSynchronize();
-	
-	sub_fdderivs(Mh_ betax,Mh_ fh,Mh_ gxxx,Mh_ gxyx,Mh_ gxzx,Mh_ gyyx,Mh_ gyzx,Mh_ gzzx,ass);
+	
+	compute_rhs_bssn_part2<<<GRID_DIM,BLOCK_DIM>>>();
+	
+	sub_fdderivs(Mh_ betax,Mh_ fh,Mh_ gxxx,Mh_ gxyx,Mh_ gxzx,Mh_ gyyx,Mh_ gyzx,Mh_ gzzx,ass);
 	sub_fdderivs(Mh_ betay,Mh_ fh,Mh_ gxxy,Mh_ gxyy,Mh_ gxzy,Mh_ gyyy,Mh_ gyzy,Mh_ gzzy,sas);
 	sub_fdderivs(Mh_ betaz,Mh_ fh,Mh_ gxxz,Mh_ gxyz,Mh_ gxzz,Mh_ gyyz,Mh_ gyzz,Mh_ gzzz,ssa);        
 	sub_fderivs( Mh_ Gamx, Mh_ fh,Mh_ Gamxx, Mh_ Gamxy, Mh_ Gamxz,ass);
 	sub_fderivs( Mh_ Gamy, Mh_ fh,Mh_ Gamyx, Mh_ Gamyy, Mh_ Gamyz,sas);
 	sub_fderivs( Mh_ Gamz, Mh_ fh,Mh_ Gamzx, Mh_ Gamzy, Mh_ Gamzz,ssa);
-	
-	compute_rhs_bssn_part3<<<GRID_DIM,BLOCK_DIM>>>();
-	cudaThreadSynchronize();
-	
-	computeRicci(Mh_ dxx,Mh_ Rxx,sss, meta);
+	
+	compute_rhs_bssn_part3<<<GRID_DIM,BLOCK_DIM>>>();
+	
+	computeRicci(Mh_ dxx,Mh_ Rxx,sss, meta);
 	computeRicci(Mh_ dyy,Mh_ Ryy,sss, meta);
 	computeRicci(Mh_ dzz,Mh_ Rzz,sss, meta);
 	computeRicci(Mh_ gxy,Mh_ Rxy,aas, meta);
 	computeRicci(Mh_ gxz,Mh_ Rxz,asa, meta);
 	computeRicci(Mh_ gyz,Mh_ Ryz,saa, meta);
 	
-	cudaThreadSynchronize();
-	
-	compute_rhs_bssn_part4<<<GRID_DIM,BLOCK_DIM>>>();
-	cudaThreadSynchronize();
-	
-	sub_fdderivs(Mh_ chi,Mh_ fh,Mh_ fxx,Mh_ fxy,Mh_ fxz,Mh_ fyy,Mh_ fyz,Mh_ fzz,sss);
-	
-	compute_rhs_bssn_part5<<<GRID_DIM,BLOCK_DIM>>>();
-	cudaThreadSynchronize();
-	
-	sub_fdderivs(Mh_ Lap,Mh_ fh,Mh_ fxx,Mh_ fxy,Mh_ fxz,Mh_ fyy,Mh_ fyz,Mh_ fzz,sss);
-	
-	compute_rhs_bssn_part6<<<GRID_DIM,BLOCK_DIM>>>();
-	cudaThreadSynchronize();
-	
-#if (GAUGE == 2 || GAUGE == 3 || GAUGE == 4 || GAUGE == 5)
-	sub_fderivs(Mh_ chi,Mh_ fh, Mh_ dtSfx_rhs, Mh_ dtSfy_rhs, Mh_ dtSfz_rhs,sss);
+	compute_rhs_bssn_part4<<<GRID_DIM,BLOCK_DIM>>>();
+	
+	sub_fdderivs(Mh_ chi,Mh_ fh,Mh_ fxx,Mh_ fxy,Mh_ fxz,Mh_ fyy,Mh_ fyz,Mh_ fzz,sss);
+	
+	compute_rhs_bssn_part5<<<GRID_DIM,BLOCK_DIM>>>();
+	
+	sub_fdderivs(Mh_ Lap,Mh_ fh,Mh_ fxx,Mh_ fxy,Mh_ fxz,Mh_ fyy,Mh_ fyz,Mh_ fzz,sss);
+	
+	compute_rhs_bssn_part6<<<GRID_DIM,BLOCK_DIM>>>();
+	
+#if (GAUGE == 2 || GAUGE == 3 || GAUGE == 4 || GAUGE == 5)
+	sub_fderivs(Mh_ chi,Mh_ fh, Mh_ dtSfx_rhs, Mh_ dtSfy_rhs, Mh_ dtSfz_rhs,sss);
 	compute_rhs_bssn_part6_gauge<<<GRID_DIM,BLOCK_DIM>>>();
 #endif	

@@ -3257,19 +3456,17 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,

 	}
 	
-	if(co == 0){
-		compute_rhs_bssn_part7<<<GRID_DIM,BLOCK_DIM>>>();
-		cudaThreadSynchronize();
-
-		sub_fderivs(Mh_ Axx,Mh_ fh,Mh_ gxxx,Mh_ gxxy,Mh_ gxxz,sss);
+		if(co == 0){
+			compute_rhs_bssn_part7<<<GRID_DIM,BLOCK_DIM>>>();
+
+			sub_fderivs(Mh_ Axx,Mh_ fh,Mh_ gxxx,Mh_ gxxy,Mh_ gxxz,sss);
 		sub_fderivs(Mh_ Axy,Mh_ fh,Mh_ gxyx,Mh_ gxyy,Mh_ gxyz,aas);
 		sub_fderivs(Mh_ Axz,Mh_ fh,Mh_ gxzx,Mh_ gxzy,Mh_ gxzz,asa);
 		sub_fderivs(Mh_ Ayy,Mh_ fh,Mh_ gyyx,Mh_ gyyy,Mh_ gyyz,sss);
-		sub_fderivs(Mh_ Ayz,Mh_ fh,Mh_ gyzx,Mh_ gyzy,Mh_ gyzz,saa);
-		sub_fderivs(Mh_ Azz,Mh_ fh,Mh_ gzzx,Mh_ gzzy,Mh_ gzzz,sss);
-		compute_rhs_bssn_part8<<<GRID_DIM,BLOCK_DIM>>>();
-		cudaThreadSynchronize();
-	}
+			sub_fderivs(Mh_ Ayz,Mh_ fh,Mh_ gyzx,Mh_ gyzy,Mh_ gyzz,saa);
+			sub_fderivs(Mh_ Azz,Mh_ fh,Mh_ gzzx,Mh_ gzzy,Mh_ gzzz,sss);
+			compute_rhs_bssn_part8<<<GRID_DIM,BLOCK_DIM>>>();
+		}

 #if (ABV == 1)
 	cout<<"TODO: bssn_gpu.cu::2373 (ABV == 1)"<<endl;
@@ -3283,33 +3480,11 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
 	cudaMemcpy(betax, Mh_ betax, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
 	cudaMemcpy(betay, Mh_ betay, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
 	cudaMemcpy(betaz, Mh_ betaz, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);*/
-	if(calledby == CALLED_BY_STEP)
-	{	
-		cudaMemcpy(chi_rhs, Mh_ chi_rhs, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
-		cudaMemcpy(trK_rhs, Mh_ trK_rhs, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
-		cudaMemcpy(gxx_rhs, Mh_ gxx_rhs, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
-		cudaMemcpy(gxy_rhs, Mh_ gxy_rhs, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
-		cudaMemcpy(gxz_rhs, Mh_ gxz_rhs, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
-		cudaMemcpy(gyy_rhs, Mh_ gyy_rhs, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
-		cudaMemcpy(gyz_rhs, Mh_ gyz_rhs, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
-		cudaMemcpy(gzz_rhs, Mh_ gzz_rhs, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
-		cudaMemcpy(Axx_rhs, Mh_ Axx_rhs, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
-		cudaMemcpy(Axy_rhs, Mh_ Axy_rhs, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
-		cudaMemcpy(Axz_rhs, Mh_ Axz_rhs, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
-		cudaMemcpy(Ayy_rhs, Mh_ Ayy_rhs, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
-		cudaMemcpy(Ayz_rhs, Mh_ Ayz_rhs, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
-		cudaMemcpy(Azz_rhs, Mh_ Azz_rhs, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
-		cudaMemcpy(Gamx_rhs, Mh_ Gamx_rhs, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
-		cudaMemcpy(Gamy_rhs, Mh_ Gamy_rhs, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
-		cudaMemcpy(Gamz_rhs, Mh_ Gamz_rhs, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
-		cudaMemcpy(Lap_rhs, Mh_ Lap_rhs, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
-		cudaMemcpy(betax_rhs, Mh_ betax_rhs, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
-		cudaMemcpy(betay_rhs, Mh_ betay_rhs, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
-		cudaMemcpy(betaz_rhs, Mh_ betaz_rhs, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
-		cudaMemcpy(dtSfx_rhs, Mh_ dtSfx_rhs, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
-		cudaMemcpy(dtSfy_rhs, Mh_ dtSfy_rhs, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
-		cudaMemcpy(dtSfz_rhs, Mh_ dtSfz_rhs, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);
-	}
+	if(calledby == CALLED_BY_STEP)
+	{	
+		// Main GPU evolution path now consumes RHS directly from device caches.
+		// Avoiding this bulk D2H copy removes one full round-trip per RK stage.
+	}
 	else if(calledby == CALLED_BY_CONSTRAINT)
 	{
 		cudaMemcpy(Gamxxx, Mh_ Gamxxx, matrix_size * sizeof(double), cudaMemcpyDeviceToHost);