diff --git a/AMSS_NCKU_source/bssn_cuda_step.C b/AMSS_NCKU_source/bssn_cuda_step.C
index 3950a2e..e4a759a 100644
--- a/AMSS_NCKU_source/bssn_cuda_step.C
+++ b/AMSS_NCKU_source/bssn_cuda_step.C
@@ -243,6 +243,69 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN)
         }
       };
 
+  auto refresh_stage_host_before_sync =
+      [&](MyList<var> *var_list, Parallel::SyncCache &cache) -> bool {
+        if (!cache.valid || !cache.combined_src || myrank < 0 || myrank >= cache.cpusize)
+          return false;
+
+        MyList<Patch> *patch_it = GH->PatL[lev];
+        while (patch_it)
+        {
+          MyList<Block> *block_it = patch_it->data->blb;
+          while (block_it)
+          {
+            Block *cg = block_it->data;
+            if (myrank == cg->rank)
+            {
+              std::vector<Parallel::gridseg *> local_segments;
+              MyList<Parallel::gridseg> *seg = cache.combined_src[myrank];
+              while (seg)
+              {
+                if (seg->data && seg->data->Bg == cg)
+                  local_segments.push_back(seg->data);
+                seg = seg->next;
+              }
+
+              if (!local_segments.empty())
+              {
+                MyList<var> *var_it = var_list;
+                while (var_it)
+                {
+                  double *host_ptr = cg->fgfs[var_it->data->sgfn];
+                  for (size_t i = 0; i < local_segments.size(); ++i)
+                  {
+                    Parallel::gridseg *src_seg = local_segments[i];
+                    if (bssn_gpu_stage_download_region(host_ptr,
+                                                       cg->shape,
+                                                       cg->bbox,
+                                                       cg->bbox + dim,
+                                                       src_seg->shape,
+                                                       src_seg->llb))
+                    {
+                      cerr << "GPU sync region download failure: lev=" << lev
+                           << " var=" << var_it->data->name
+                           << " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << ","
+                           << cg->bbox[1] << ":" << cg->bbox[4] << ","
+                           << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
+                      ERROR = 1;
+                      return true;
+                    }
+                  }
+                  var_it = var_it->next;
+                }
+              }
+            }
+
+            if (block_it == patch_it->data->ble)
+              break;
+            block_it = block_it->next;
+          }
+          patch_it = patch_it->next;
+        }
+
+        return true;
+      };
+
   MyList<Patch> *Pp = GH->PatL[lev];
   while (Pp)
   {
@@ -268,7 +331,7 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN)
                << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
           ERROR = 1;
         }
-        if (!ERROR)
+        if (!ERROR && !sync_cache_pre[lev].valid)
           stage_download_var_list(cg, SynchList_pre);
       }
       if (BP == Pp->data->ble)
@@ -278,6 +341,9 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN)
     Pp = Pp->next;
   }
 
+  if (!ERROR && sync_cache_pre[lev].valid)
+    refresh_stage_host_before_sync(SynchList_pre, sync_cache_pre[lev]);
+
   MPI_Request err_req_pre;
   {
     int erh = ERROR;
@@ -348,15 +414,15 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN)
 
           if (bssn_cuda_lowerbound(cg->shape, cg->fgfs[phi1->sgfn], chitiny, false))
           {
-            cerr << "GPU lowerbound failure: lev=" << lev
-                 << " rk_stage=" << iter_count
-                 << " var=" << phi1->name
+          cerr << "GPU lowerbound failure: lev=" << lev
+               << " rk_stage=" << iter_count
+               << " var=" << phi1->name
                  << " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << ","
                  << cg->bbox[1] << ":" << cg->bbox[4] << ","
                  << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
             ERROR = 1;
           }
-          if (!ERROR)
+          if (!ERROR && (!sync_cache_cor[lev].valid || iter_count == 3))
             stage_download_var_list(cg, SynchList_cor);
         }
 
@@ -367,6 +433,9 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN)
       Pp = Pp->next;
     }
 
+    if (!ERROR && sync_cache_cor[lev].valid && iter_count < 3)
+      refresh_stage_host_before_sync(SynchList_cor, sync_cache_cor[lev]);
+
     MPI_Request err_req_cor;
     {
       int erh = ERROR;
diff --git a/AMSS_NCKU_source/bssn_gpu.cu b/AMSS_NCKU_source/bssn_gpu.cu
index fc99f64..65e17e9 100644
--- a/AMSS_NCKU_source/bssn_gpu.cu
+++ b/AMSS_NCKU_source/bssn_gpu.cu
@@ -914,6 +914,73 @@ int bssn_gpu_stage_upload_region(const double *host_ptr,
 	return 0;
 }
 
+int bssn_gpu_stage_download_region(double *host_ptr,
+                                   const int *full_shape,
+                                   const double *full_llb,
+                                   const double *full_uub,
+                                   const int *region_shape,
+                                   const double *region_llb)
+{
+	if (!host_ptr || !full_shape || !full_llb || !full_uub || !region_shape || !region_llb)
+		return 1;
+
+	const double *device_ptr = bssn_gpu_find_device_buffer(host_ptr);
+	if (!device_ptr)
+		return 1;
+
+	int start[3] = {0, 0, 0};
+	for (int i = 0; i < 3; ++i)
+	{
+		if (full_shape[i] <= 0 || region_shape[i] <= 0)
+			return 1;
+
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+		const double dx = (full_uub[i] - full_llb[i]) / static_cast<double>(full_shape[i] - 1);
+		start[i] = static_cast<int>((region_llb[i] - full_llb[i]) / dx + 0.4);
+#else
+#ifdef Cell
+		const double dx = (full_uub[i] - full_llb[i]) / static_cast<double>(full_shape[i]);
+		start[i] = static_cast<int>((region_llb[i] - full_llb[i]) / dx + 0.4);
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+
+		if (start[i] < 0 || start[i] + region_shape[i] > full_shape[i])
+			return 1;
+	}
+
+	cudaMemcpy3DParms parms = {};
+	parms.srcPtr = make_cudaPitchedPtr(const_cast<double *>(device_ptr),
+	                                   static_cast<size_t>(full_shape[0]) * sizeof(double),
+	                                   static_cast<size_t>(full_shape[0]),
+	                                   static_cast<size_t>(full_shape[1]));
+	parms.dstPtr = make_cudaPitchedPtr(host_ptr,
+	                                   static_cast<size_t>(full_shape[0]) * sizeof(double),
+	                                   static_cast<size_t>(full_shape[0]),
+	                                   static_cast<size_t>(full_shape[1]));
+	parms.srcPos = make_cudaPos(static_cast<size_t>(start[0]) * sizeof(double),
+	                            static_cast<size_t>(start[1]),
+	                            static_cast<size_t>(start[2]));
+	parms.dstPos = parms.srcPos;
+	parms.extent = make_cudaExtent(static_cast<size_t>(region_shape[0]) * sizeof(double),
+	                               static_cast<size_t>(region_shape[1]),
+	                               static_cast<size_t>(region_shape[2]));
+	parms.kind = cudaMemcpyDeviceToHost;
+
+	cudaError_t err = cudaMemcpy3D(&parms);
+	if (err != cudaSuccess)
+	{
+		cerr << "cudaMemcpy3D(D2H region) failed: " << cudaGetErrorString(err) << endl;
+		return 1;
+	}
+
+	return 0;
+}
+
 __global__ void test_const_address(double * testd){
 	int _t = blockIdx.x*blockDim.x+threadIdx.x;
 	if(_t == 0)
diff --git a/AMSS_NCKU_source/bssn_gpu.h b/AMSS_NCKU_source/bssn_gpu.h
index 19333c0..3dd890c 100644
--- a/AMSS_NCKU_source/bssn_gpu.h
+++ b/AMSS_NCKU_source/bssn_gpu.h
@@ -77,6 +77,12 @@ int bssn_gpu_stage_upload_region(const double *host_ptr,
                                  const double *full_uub,
                                  const int *region_shape,
                                  const double *region_llb);
+int bssn_gpu_stage_download_region(double *host_ptr,
+                                   const int *full_shape,
+                                   const double *full_llb,
+                                   const double *full_uub,
+                                   const int *region_shape,
+                                   const double *region_llb);
 
 /** Init GPU side data in GPUMeta. */
 // void init_fluid_meta_gpu(GPUMeta *gpu_meta);