Cache GPU main-path device buffers

2026-04-08 19:43:17 +08:00
parent ea470737db
commit 01ac1f9250
3 changed files with 697 additions and 222 deletions
--- a/AMSS_NCKU_source/bssn_cuda_ops.cu
+++ b/AMSS_NCKU_source/bssn_cuda_ops.cu
@@ -33,15 +33,43 @@ struct DeviceArrays
  double *d = nullptr;
 };

-inline bool copy_to_device(double *&dst, const double *src, size_t bytes)
+struct CachedBuffer
 {
-  cudaError_t err = cudaMalloc(&dst, bytes);
+  double *ptr = nullptr;
+  size_t capacity = 0;
+};
+
+inline bool ensure_capacity(CachedBuffer &buffer, size_t bytes)
+{
+  if (bytes <= buffer.capacity && buffer.ptr)
+    return true;
+
+  if (buffer.ptr)
+  {
+    cudaError_t free_err = cudaFree(buffer.ptr);
+    if (free_err != cudaSuccess)
+      report_cuda_error("cudaFree", free_err);
+    buffer.ptr = nullptr;
+    buffer.capacity = 0;
+  }
+
+  cudaError_t err = cudaMalloc(&buffer.ptr, bytes);
  if (err != cudaSuccess)
  {
    report_cuda_error("cudaMalloc", err);
    return false;
  }
-  err = cudaMemcpy(dst, src, bytes, cudaMemcpyHostToDevice);
+
+  buffer.capacity = bytes;
+  return true;
+}
+
+inline bool copy_to_device(CachedBuffer &dst, const double *src, size_t bytes)
+{
+  if (!ensure_capacity(dst, bytes))
+    return false;
+
+  cudaError_t err = cudaMemcpy(dst.ptr, src, bytes, cudaMemcpyHostToDevice);
  if (err != cudaSuccess)
  {
    report_cuda_error("cudaMemcpy(H2D)", err);
@@ -50,12 +78,6 @@ inline bool copy_to_device(double *&dst, const double *src, size_t bytes)
  return true;
 }

-inline void free_device(double *ptr)
-{
-  if (ptr)
-    cudaFree(ptr);
-}
-
 __global__ void enforce_ga_kernel(int n,
                                  double *dxx, double *gxy, double *gxz,
                                  double *dyy, double *gyz, double *dzz,
@@ -376,31 +398,37 @@ int bssn_cuda_enforce_ga(int *ex,
                         double *Axx, double *Axy, double *Axz,
                         double *Ayy, double *Ayz, double *Azz)
 {
+  struct EnforceGaCache
+  {
+    CachedBuffer dxx, gxy, gxz, dyy, gyz, dzz;
+    CachedBuffer Axx, Axy, Axz, Ayy, Ayz, Azz;
+  };
+  static thread_local EnforceGaCache cache;
+
  int n = count_points(ex);
  const size_t bytes = static_cast<size_t>(n) * sizeof(double);
  dim3 block(256);
  dim3 grid(div_up(n, static_cast<int>(block.x)));

-  double *d_dxx = nullptr, *d_gxy = nullptr, *d_gxz = nullptr;
-  double *d_dyy = nullptr, *d_gyz = nullptr, *d_dzz = nullptr;
-  double *d_Axx = nullptr, *d_Axy = nullptr, *d_Axz = nullptr;
-  double *d_Ayy = nullptr, *d_Ayz = nullptr, *d_Azz = nullptr;
-
-  bool ok = copy_to_device(d_dxx, dxx, bytes) &&
-            copy_to_device(d_gxy, gxy, bytes) &&
-            copy_to_device(d_gxz, gxz, bytes) &&
-            copy_to_device(d_dyy, dyy, bytes) &&
-            copy_to_device(d_gyz, gyz, bytes) &&
-            copy_to_device(d_dzz, dzz, bytes) &&
-            copy_to_device(d_Axx, Axx, bytes) &&
-            copy_to_device(d_Axy, Axy, bytes) &&
-            copy_to_device(d_Axz, Axz, bytes) &&
-            copy_to_device(d_Ayy, Ayy, bytes) &&
-            copy_to_device(d_Ayz, Ayz, bytes) &&
-            copy_to_device(d_Azz, Azz, bytes);
+  bool ok = copy_to_device(cache.dxx, dxx, bytes) &&
+            copy_to_device(cache.gxy, gxy, bytes) &&
+            copy_to_device(cache.gxz, gxz, bytes) &&
+            copy_to_device(cache.dyy, dyy, bytes) &&
+            copy_to_device(cache.gyz, gyz, bytes) &&
+            copy_to_device(cache.dzz, dzz, bytes) &&
+            copy_to_device(cache.Axx, Axx, bytes) &&
+            copy_to_device(cache.Axy, Axy, bytes) &&
+            copy_to_device(cache.Axz, Axz, bytes) &&
+            copy_to_device(cache.Ayy, Ayy, bytes) &&
+            copy_to_device(cache.Ayz, Ayz, bytes) &&
+            copy_to_device(cache.Azz, Azz, bytes);

  if (ok)
  {
+    double *d_dxx = cache.dxx.ptr, *d_gxy = cache.gxy.ptr, *d_gxz = cache.gxz.ptr;
+    double *d_dyy = cache.dyy.ptr, *d_gyz = cache.gyz.ptr, *d_dzz = cache.dzz.ptr;
+    double *d_Axx = cache.Axx.ptr, *d_Axy = cache.Axy.ptr, *d_Axz = cache.Axz.ptr;
+    double *d_Ayy = cache.Ayy.ptr, *d_Ayz = cache.Ayz.ptr, *d_Azz = cache.Azz.ptr;
    void *args[] = {&n, &d_dxx, &d_gxy, &d_gxz, &d_dyy, &d_gyz, &d_dzz,
                    &d_Axx, &d_Axy, &d_Axz, &d_Ayy, &d_Ayz, &d_Azz};
    ok = launch_and_sync(grid, block, (const void *)enforce_ga_kernel, args);
@@ -408,27 +436,22 @@ int bssn_cuda_enforce_ga(int *ex,

  if (ok)
  {
-    cudaError_t err = cudaMemcpy(dxx, d_dxx, bytes, cudaMemcpyDeviceToHost);
+    cudaError_t err = cudaMemcpy(dxx, cache.dxx.ptr, bytes, cudaMemcpyDeviceToHost);
    if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) dxx", err);
    ok = err == cudaSuccess;
-    if (ok) { err = cudaMemcpy(gxy, d_gxy, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) gxy", err); ok = err == cudaSuccess; }
-    if (ok) { err = cudaMemcpy(gxz, d_gxz, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) gxz", err); ok = err == cudaSuccess; }
-    if (ok) { err = cudaMemcpy(dyy, d_dyy, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) dyy", err); ok = err == cudaSuccess; }
-    if (ok) { err = cudaMemcpy(gyz, d_gyz, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) gyz", err); ok = err == cudaSuccess; }
-    if (ok) { err = cudaMemcpy(dzz, d_dzz, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) dzz", err); ok = err == cudaSuccess; }
-    if (ok) { err = cudaMemcpy(Axx, d_Axx, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) Axx", err); ok = err == cudaSuccess; }
-    if (ok) { err = cudaMemcpy(Axy, d_Axy, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) Axy", err); ok = err == cudaSuccess; }
-    if (ok) { err = cudaMemcpy(Axz, d_Axz, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) Axz", err); ok = err == cudaSuccess; }
-    if (ok) { err = cudaMemcpy(Ayy, d_Ayy, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) Ayy", err); ok = err == cudaSuccess; }
-    if (ok) { err = cudaMemcpy(Ayz, d_Ayz, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) Ayz", err); ok = err == cudaSuccess; }
-    if (ok) { err = cudaMemcpy(Azz, d_Azz, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) Azz", err); ok = err == cudaSuccess; }
+    if (ok) { err = cudaMemcpy(gxy, cache.gxy.ptr, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) gxy", err); ok = err == cudaSuccess; }
+    if (ok) { err = cudaMemcpy(gxz, cache.gxz.ptr, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) gxz", err); ok = err == cudaSuccess; }
+    if (ok) { err = cudaMemcpy(dyy, cache.dyy.ptr, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) dyy", err); ok = err == cudaSuccess; }
+    if (ok) { err = cudaMemcpy(gyz, cache.gyz.ptr, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) gyz", err); ok = err == cudaSuccess; }
+    if (ok) { err = cudaMemcpy(dzz, cache.dzz.ptr, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) dzz", err); ok = err == cudaSuccess; }
+    if (ok) { err = cudaMemcpy(Axx, cache.Axx.ptr, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) Axx", err); ok = err == cudaSuccess; }
+    if (ok) { err = cudaMemcpy(Axy, cache.Axy.ptr, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) Axy", err); ok = err == cudaSuccess; }
+    if (ok) { err = cudaMemcpy(Axz, cache.Axz.ptr, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) Axz", err); ok = err == cudaSuccess; }
+    if (ok) { err = cudaMemcpy(Ayy, cache.Ayy.ptr, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) Ayy", err); ok = err == cudaSuccess; }
+    if (ok) { err = cudaMemcpy(Ayz, cache.Ayz.ptr, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) Ayz", err); ok = err == cudaSuccess; }
+    if (ok) { err = cudaMemcpy(Azz, cache.Azz.ptr, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) Azz", err); ok = err == cudaSuccess; }
  }

-  free_device(d_dxx); free_device(d_gxy); free_device(d_gxz);
-  free_device(d_dyy); free_device(d_gyz); free_device(d_dzz);
-  free_device(d_Axx); free_device(d_Axy); free_device(d_Axz);
-  free_device(d_Ayy); free_device(d_Ayz); free_device(d_Azz);
-
  return ok ? 0 : 1;
 }

@@ -446,6 +469,19 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
                               int lev,
                               int rk_stage)
 {
+  struct Rk4BoundaryCache
+  {
+    CachedBuffer X, Y, Z;
+    CachedBuffer state0, boundary, stage, rhs;
+    const double *host_X = nullptr;
+    const double *host_Y = nullptr;
+    const double *host_Z = nullptr;
+    int nx = 0;
+    int ny = 0;
+    int nz = 0;
+  };
+  static thread_local Rk4BoundaryCache cache;
+
  int nx = ex[0];
  int ny = ex[1];
  int nz = ex[2];
@@ -457,23 +493,32 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
  dim3 block(256);
  dim3 grid(div_up(n, static_cast<int>(block.x)));

-  double *d_X = nullptr, *d_Y = nullptr, *d_Z = nullptr;
-  double *d_state0 = nullptr, *d_boundary = nullptr, *d_stage = nullptr, *d_rhs = nullptr;
+  bool ok = true;
+  if (cache.host_X != X || cache.host_Y != Y || cache.host_Z != Z ||
+      cache.nx != nx || cache.ny != ny || cache.nz != nz)
+  {
+    ok = copy_to_device(cache.X, X, bytes_x) &&
+         copy_to_device(cache.Y, Y, bytes_y) &&
+         copy_to_device(cache.Z, Z, bytes_z);
+    if (ok)
+    {
+      cache.host_X = X;
+      cache.host_Y = Y;
+      cache.host_Z = Z;
+      cache.nx = nx;
+      cache.ny = ny;
+      cache.nz = nz;
+    }
+  }

-  bool ok = copy_to_device(d_X, X, bytes_x) &&
-            copy_to_device(d_Y, Y, bytes_y) &&
-            copy_to_device(d_Z, Z, bytes_z) &&
-            copy_to_device(d_state0, state0, bytes) &&
-            copy_to_device(d_boundary, boundary_src, bytes) &&
-            copy_to_device(d_stage, stage_data, bytes) &&
-            copy_to_device(d_rhs, rhs_accum, bytes);
+  ok = ok &&
+            copy_to_device(cache.state0, state0, bytes) &&
+            copy_to_device(cache.boundary, boundary_src, bytes) &&
+            copy_to_device(cache.stage, stage_data, bytes) &&
+            copy_to_device(cache.rhs, rhs_accum, bytes);

  if (!ok)
-  {
-    free_device(d_X); free_device(d_Y); free_device(d_Z);
-    free_device(d_state0); free_device(d_boundary); free_device(d_stage); free_device(d_rhs);
    return 1;
-  }

  double dX = X[1] - X[0];
  double dY = Y[1] - Y[0];
@@ -498,6 +543,9 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
    if (symmetry > eq_symm && std::fabs(X[0]) < dX) imin = 0;
    if (symmetry > eq_symm && std::fabs(Y[0]) < dY) jmin = 0;

+    double *d_X = cache.X.ptr, *d_Y = cache.Y.ptr, *d_Z = cache.Z.ptr;
+    double *d_state0 = cache.state0.ptr, *d_boundary = cache.boundary.ptr;
+    double *d_stage = cache.stage.ptr, *d_rhs = cache.rhs.ptr;
    double *bam_target = (rk_stage == 0) ? d_rhs : d_stage;
    const double *bam_source = (rk_stage == 0) ? d_state0 : d_boundary;
    void *args[] = {&nx, &ny, &nz, &d_X, &d_Y, &d_Z,
@@ -513,12 +561,14 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,

  if (ok)
  {
+    double *d_state0 = cache.state0.ptr, *d_stage = cache.stage.ptr, *d_rhs = cache.rhs.ptr;
    void *args[] = {&n, &dT, &d_state0, &d_stage, &d_rhs, &rk_stage};
    ok = launch_and_sync(grid, block, (const void *)rk4_kernel, args);
  }

  if (ok && lev > 0)
  {
+      double *d_state0 = cache.state0.ptr, *d_stage = cache.stage.ptr;
      void *args[] = {&nx, &ny, &nz,
                      &has_xmin, &has_ymin, &has_zmin,
                      &has_xmax, &has_ymax, &has_zmax,
@@ -528,45 +578,43 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,

  if (ok)
  {
-    cudaError_t err = cudaMemcpy(stage_data, d_stage, bytes, cudaMemcpyDeviceToHost);
+    cudaError_t err = cudaMemcpy(stage_data, cache.stage.ptr, bytes, cudaMemcpyDeviceToHost);
    if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) stage_data", err);
    ok = err == cudaSuccess;
    if (ok)
    {
-      err = cudaMemcpy(rhs_accum, d_rhs, bytes, cudaMemcpyDeviceToHost);
+      err = cudaMemcpy(rhs_accum, cache.rhs.ptr, bytes, cudaMemcpyDeviceToHost);
      if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) rhs_accum", err);
      ok = err == cudaSuccess;
    }
  }

-  free_device(d_X); free_device(d_Y); free_device(d_Z);
-  free_device(d_state0); free_device(d_boundary); free_device(d_stage); free_device(d_rhs);
  return ok ? 0 : 1;
 }

 int bssn_cuda_lowerbound(int *ex, double *chi, double tinny)
 {
+  static thread_local CachedBuffer d_chi;
+
  int n = count_points(ex);
  const size_t bytes = static_cast<size_t>(n) * sizeof(double);
  dim3 block(256);
  dim3 grid(div_up(n, static_cast<int>(block.x)));

-  double *d_chi = nullptr;
  bool ok = copy_to_device(d_chi, chi, bytes);

  if (ok)
  {
-    void *args[] = {&n, &d_chi, &tinny};
+    double *ptr = d_chi.ptr;
+    void *args[] = {&n, &ptr, &tinny};
    ok = launch_and_sync(grid, block, (const void *)lowerbound_kernel, args);
  }

  if (ok)
  {
-    cudaError_t err = cudaMemcpy(chi, d_chi, bytes, cudaMemcpyDeviceToHost);
+    cudaError_t err = cudaMemcpy(chi, d_chi.ptr, bytes, cudaMemcpyDeviceToHost);
    if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) chi", err);
    ok = err == cudaSuccess;
  }
-
-  free_device(d_chi);
  return ok ? 0 : 1;
 }
--- a/AMSS_NCKU_source/bssn_gpu.cu
+++ b/AMSS_NCKU_source/bssn_gpu.cu
@@ -1,23 +1,26 @@
 // includes, system
 #include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <math.h>
-#include <iostream>
-#include <sys/time.h>
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+#include <unistd.h>
+#include <iostream>
+#include <sys/time.h>
 #include <cuda.h>
 //#include "cutil.h"
 #include <nvrtc.h>
 #include <cuda_runtime.h>
-using namespace std;
+using namespace std;

 //includes, bssn
 #include "gpu_mem.h"
-#include "bssn_gpu.h"
-#ifdef RESULT_CHECK
-#include <fstream>
-#endif
-
+#include "bssn_gpu.h"
+#ifdef RESULT_CHECK
+#include <fstream>
+#endif
+
+void destroy_meta(Meta *meta);
+
 void compare_result_gpu(int ftag1,double * datac,int data_num){
 #ifdef RESULT_CHECK
 	double * data = (double*)malloc(sizeof(double)*data_num);
@@ -30,8 +33,426 @@ void compare_result_gpu(int ftag1,double * datac,int data_num){
 	(void)data_num;
 #endif
 }
-
-__global__ void test_const_address(double * testd){
+
+namespace {
+
+int read_local_rank_from_env()
+{
+	const char *keys[] = {
+		"AMSS_NCKU_CUDA_LOCAL_RANK",
+		"I_MPI_LOCAL_RANK",
+		"OMPI_COMM_WORLD_LOCAL_RANK",
+		"MPI_LOCALRANKID",
+		"PMI_LOCAL_RANK",
+		"SLURM_LOCALID"
+	};
+
+	for (size_t i = 0; i < sizeof(keys) / sizeof(keys[0]); ++i)
+	{
+		const char *value = getenv(keys[i]);
+		if (value && *value)
+			return atoi(value);
+	}
+	return -1;
+}
+
+int read_forced_device_from_env()
+{
+	const char *value = getenv("AMSS_NCKU_CUDA_DEVICE");
+	if (value && *value)
+		return atoi(value);
+	return -1;
+}
+
+int select_cuda_device_for_process(int mpi_rank)
+{
+	static int cached_device = -2;
+	if (cached_device >= -1)
+		return cached_device;
+
+	int device_count = 0;
+	cudaError_t err = cudaGetDeviceCount(&device_count);
+	if (err != cudaSuccess || device_count <= 0)
+	{
+		cached_device = -1;
+		return cached_device;
+	}
+
+	int device = read_forced_device_from_env();
+	if (device < 0)
+	{
+		int local_rank = read_local_rank_from_env();
+		if (local_rank < 0)
+			local_rank = mpi_rank;
+		device = local_rank % device_count;
+	}
+
+	if (device < 0)
+		device = 0;
+	if (device >= device_count)
+		device %= device_count;
+
+	err = cudaSetDevice(device);
+	if (err != cudaSuccess)
+	{
+		cerr << "cudaSetDevice(" << device << ") failed: "
+		     << cudaGetErrorString(err) << endl;
+		cached_device = -1;
+		return cached_device;
+	}
+
+	cached_device = device;
+	return cached_device;
+}
+
+struct BufferSpec
+{
+	double **slot;
+	size_t count;
+};
+
+struct CopySpec
+{
+	double *dst;
+	const double *src;
+	size_t count;
+};
+
+struct ZeroSpec
+{
+	double *ptr;
+	size_t count;
+};
+
+struct GpuRhsCache
+{
+	Meta meta{};
+	int ex[3] = {0, 0, 0};
+	int matrix_size = 0;
+	int device = -1;
+	bool allocated = false;
+	const double *last_x = nullptr;
+	const double *last_y = nullptr;
+	const double *last_z = nullptr;
+};
+
+GpuRhsCache &gpu_rhs_cache()
+{
+	static GpuRhsCache cache;
+	return cache;
+}
+
+void reset_meta(Meta *meta)
+{
+	memset(meta, 0, sizeof(Meta));
+}
+
+bool ensure_device_buffer(double **ptr, size_t count)
+{
+	if (*ptr)
+		return true;
+
+	cudaError_t err = cudaMalloc((void **)ptr, count * sizeof(double));
+	if (err != cudaSuccess)
+	{
+		cerr << "cudaMalloc failed: " << cudaGetErrorString(err) << endl;
+		return false;
+	}
+	return true;
+}
+
+bool allocate_buffers(const BufferSpec *specs, size_t count)
+{
+	for (size_t i = 0; i < count; ++i)
+	{
+		if (!ensure_device_buffer(specs[i].slot, specs[i].count))
+			return false;
+	}
+	return true;
+}
+
+bool copy_buffers_to_device(const CopySpec *specs, size_t count)
+{
+	for (size_t i = 0; i < count; ++i)
+	{
+		cudaError_t err = cudaMemcpy(specs[i].dst, specs[i].src,
+		                             specs[i].count * sizeof(double),
+		                             cudaMemcpyHostToDevice);
+		if (err != cudaSuccess)
+		{
+			cerr << "cudaMemcpy(H2D) failed: " << cudaGetErrorString(err) << endl;
+			return false;
+		}
+	}
+	return true;
+}
+
+bool zero_buffers(const ZeroSpec *specs, size_t count)
+{
+	for (size_t i = 0; i < count; ++i)
+	{
+		cudaError_t err = cudaMemset(specs[i].ptr, 0, specs[i].count * sizeof(double));
+		if (err != cudaSuccess)
+		{
+			cerr << "cudaMemset failed: " << cudaGetErrorString(err) << endl;
+			return false;
+		}
+	}
+	return true;
+}
+
+void cleanup_gpu_rhs_cache()
+{
+	GpuRhsCache &cache = gpu_rhs_cache();
+	if (!cache.allocated)
+		return;
+
+	if (cache.device >= 0)
+		cudaSetDevice(cache.device);
+	destroy_meta(&cache.meta);
+	reset_meta(&cache.meta);
+	cache.ex[0] = cache.ex[1] = cache.ex[2] = 0;
+	cache.matrix_size = 0;
+	cache.device = -1;
+	cache.allocated = false;
+	cache.last_x = nullptr;
+	cache.last_y = nullptr;
+	cache.last_z = nullptr;
+}
+
+bool register_gpu_rhs_cleanup()
+{
+	static bool registered = false;
+	if (!registered)
+	{
+		atexit(cleanup_gpu_rhs_cache);
+		registered = true;
+	}
+	return true;
+}
+
+bool prepare_gpu_rhs_cache(GpuRhsCache &cache, int device, int *ex)
+{
+	register_gpu_rhs_cleanup();
+
+	const bool shape_changed =
+		!cache.allocated ||
+		cache.device != device ||
+		cache.ex[0] != ex[0] ||
+		cache.ex[1] != ex[1] ||
+		cache.ex[2] != ex[2];
+
+	if (!shape_changed)
+		return true;
+
+	if (cache.allocated)
+	{
+		if (cache.device >= 0)
+			cudaSetDevice(cache.device);
+		destroy_meta(&cache.meta);
+		reset_meta(&cache.meta);
+	}
+
+	cache.device = device;
+	cache.ex[0] = ex[0];
+	cache.ex[1] = ex[1];
+	cache.ex[2] = ex[2];
+	cache.matrix_size = ex[0] * ex[1] * ex[2];
+	cache.last_x = nullptr;
+	cache.last_y = nullptr;
+	cache.last_z = nullptr;
+
+	Meta *meta = &cache.meta;
+	const int matrix_size = cache.matrix_size;
+	const size_t fh_size = static_cast<size_t>(ex[0] + 2) * static_cast<size_t>(ex[1] + 2) * static_cast<size_t>(ex[2] + 2);
+	const size_t fh2_size = static_cast<size_t>(ex[0] + 3) * static_cast<size_t>(ex[1] + 3) * static_cast<size_t>(ex[2] + 3);
+
+	const BufferSpec buffers[] = {
+		{&meta->X, static_cast<size_t>(ex[0])},
+		{&meta->Y, static_cast<size_t>(ex[1])},
+		{&meta->Z, static_cast<size_t>(ex[2])},
+		{&meta->chi, static_cast<size_t>(matrix_size)},
+		{&meta->dxx, static_cast<size_t>(matrix_size)},
+		{&meta->dyy, static_cast<size_t>(matrix_size)},
+		{&meta->dzz, static_cast<size_t>(matrix_size)},
+		{&meta->trK, static_cast<size_t>(matrix_size)},
+		{&meta->gxy, static_cast<size_t>(matrix_size)},
+		{&meta->gxz, static_cast<size_t>(matrix_size)},
+		{&meta->gyz, static_cast<size_t>(matrix_size)},
+		{&meta->Axx, static_cast<size_t>(matrix_size)},
+		{&meta->Axy, static_cast<size_t>(matrix_size)},
+		{&meta->Axz, static_cast<size_t>(matrix_size)},
+		{&meta->Ayz, static_cast<size_t>(matrix_size)},
+		{&meta->Ayy, static_cast<size_t>(matrix_size)},
+		{&meta->Azz, static_cast<size_t>(matrix_size)},
+		{&meta->Gamx, static_cast<size_t>(matrix_size)},
+		{&meta->Gamy, static_cast<size_t>(matrix_size)},
+		{&meta->Gamz, static_cast<size_t>(matrix_size)},
+		{&meta->Lap, static_cast<size_t>(matrix_size)},
+		{&meta->betax, static_cast<size_t>(matrix_size)},
+		{&meta->betay, static_cast<size_t>(matrix_size)},
+		{&meta->betaz, static_cast<size_t>(matrix_size)},
+		{&meta->dtSfx, static_cast<size_t>(matrix_size)},
+		{&meta->dtSfy, static_cast<size_t>(matrix_size)},
+		{&meta->dtSfz, static_cast<size_t>(matrix_size)},
+		{&meta->chi_rhs, static_cast<size_t>(matrix_size)},
+		{&meta->trK_rhs, static_cast<size_t>(matrix_size)},
+		{&meta->gxx_rhs, static_cast<size_t>(matrix_size)},
+		{&meta->gxy_rhs, static_cast<size_t>(matrix_size)},
+		{&meta->gxz_rhs, static_cast<size_t>(matrix_size)},
+		{&meta->gyy_rhs, static_cast<size_t>(matrix_size)},
+		{&meta->gyz_rhs, static_cast<size_t>(matrix_size)},
+		{&meta->gzz_rhs, static_cast<size_t>(matrix_size)},
+		{&meta->Axx_rhs, static_cast<size_t>(matrix_size)},
+		{&meta->Axy_rhs, static_cast<size_t>(matrix_size)},
+		{&meta->Axz_rhs, static_cast<size_t>(matrix_size)},
+		{&meta->Ayy_rhs, static_cast<size_t>(matrix_size)},
+		{&meta->Ayz_rhs, static_cast<size_t>(matrix_size)},
+		{&meta->Azz_rhs, static_cast<size_t>(matrix_size)},
+		{&meta->Gamx_rhs, static_cast<size_t>(matrix_size)},
+		{&meta->Gamy_rhs, static_cast<size_t>(matrix_size)},
+		{&meta->Gamz_rhs, static_cast<size_t>(matrix_size)},
+		{&meta->Lap_rhs, static_cast<size_t>(matrix_size)},
+		{&meta->betax_rhs, static_cast<size_t>(matrix_size)},
+		{&meta->betay_rhs, static_cast<size_t>(matrix_size)},
+		{&meta->betaz_rhs, static_cast<size_t>(matrix_size)},
+		{&meta->dtSfx_rhs, static_cast<size_t>(matrix_size)},
+		{&meta->dtSfy_rhs, static_cast<size_t>(matrix_size)},
+		{&meta->dtSfz_rhs, static_cast<size_t>(matrix_size)},
+		{&meta->rho, static_cast<size_t>(matrix_size)},
+		{&meta->Sx, static_cast<size_t>(matrix_size)},
+		{&meta->Sy, static_cast<size_t>(matrix_size)},
+		{&meta->Sz, static_cast<size_t>(matrix_size)},
+		{&meta->Sxx, static_cast<size_t>(matrix_size)},
+		{&meta->Sxy, static_cast<size_t>(matrix_size)},
+		{&meta->Sxz, static_cast<size_t>(matrix_size)},
+		{&meta->Syy, static_cast<size_t>(matrix_size)},
+		{&meta->Syz, static_cast<size_t>(matrix_size)},
+		{&meta->Szz, static_cast<size_t>(matrix_size)},
+		{&meta->Gamxxx, static_cast<size_t>(matrix_size)},
+		{&meta->Gamxxy, static_cast<size_t>(matrix_size)},
+		{&meta->Gamxxz, static_cast<size_t>(matrix_size)},
+		{&meta->Gamxyy, static_cast<size_t>(matrix_size)},
+		{&meta->Gamxyz, static_cast<size_t>(matrix_size)},
+		{&meta->Gamxzz, static_cast<size_t>(matrix_size)},
+		{&meta->Gamyxx, static_cast<size_t>(matrix_size)},
+		{&meta->Gamyxy, static_cast<size_t>(matrix_size)},
+		{&meta->Gamyxz, static_cast<size_t>(matrix_size)},
+		{&meta->Gamyyy, static_cast<size_t>(matrix_size)},
+		{&meta->Gamyyz, static_cast<size_t>(matrix_size)},
+		{&meta->Gamyzz, static_cast<size_t>(matrix_size)},
+		{&meta->Gamzxx, static_cast<size_t>(matrix_size)},
+		{&meta->Gamzxy, static_cast<size_t>(matrix_size)},
+		{&meta->Gamzxz, static_cast<size_t>(matrix_size)},
+		{&meta->Gamzyy, static_cast<size_t>(matrix_size)},
+		{&meta->Gamzyz, static_cast<size_t>(matrix_size)},
+		{&meta->Gamzzz, static_cast<size_t>(matrix_size)},
+		{&meta->Rxx, static_cast<size_t>(matrix_size)},
+		{&meta->Rxy, static_cast<size_t>(matrix_size)},
+		{&meta->Rxz, static_cast<size_t>(matrix_size)},
+		{&meta->Ryy, static_cast<size_t>(matrix_size)},
+		{&meta->Ryz, static_cast<size_t>(matrix_size)},
+		{&meta->Rzz, static_cast<size_t>(matrix_size)},
+		{&meta->ham_Res, static_cast<size_t>(matrix_size)},
+		{&meta->movx_Res, static_cast<size_t>(matrix_size)},
+		{&meta->movy_Res, static_cast<size_t>(matrix_size)},
+		{&meta->movz_Res, static_cast<size_t>(matrix_size)},
+		{&meta->Gmx_Res, static_cast<size_t>(matrix_size)},
+		{&meta->Gmy_Res, static_cast<size_t>(matrix_size)},
+		{&meta->Gmz_Res, static_cast<size_t>(matrix_size)},
+		{&meta->gxx, static_cast<size_t>(matrix_size)},
+		{&meta->gyy, static_cast<size_t>(matrix_size)},
+		{&meta->gzz, static_cast<size_t>(matrix_size)},
+		{&meta->chix, static_cast<size_t>(matrix_size)},
+		{&meta->chiy, static_cast<size_t>(matrix_size)},
+		{&meta->chiz, static_cast<size_t>(matrix_size)},
+		{&meta->gxxx, static_cast<size_t>(matrix_size)},
+		{&meta->gxyx, static_cast<size_t>(matrix_size)},
+		{&meta->gxzx, static_cast<size_t>(matrix_size)},
+		{&meta->gyyx, static_cast<size_t>(matrix_size)},
+		{&meta->gyzx, static_cast<size_t>(matrix_size)},
+		{&meta->gzzx, static_cast<size_t>(matrix_size)},
+		{&meta->gxxy, static_cast<size_t>(matrix_size)},
+		{&meta->gxyy, static_cast<size_t>(matrix_size)},
+		{&meta->gxzy, static_cast<size_t>(matrix_size)},
+		{&meta->gyyy, static_cast<size_t>(matrix_size)},
+		{&meta->gyzy, static_cast<size_t>(matrix_size)},
+		{&meta->gzzy, static_cast<size_t>(matrix_size)},
+		{&meta->gxxz, static_cast<size_t>(matrix_size)},
+		{&meta->gxyz, static_cast<size_t>(matrix_size)},
+		{&meta->gxzz, static_cast<size_t>(matrix_size)},
+		{&meta->gyyz, static_cast<size_t>(matrix_size)},
+		{&meta->gyzz, static_cast<size_t>(matrix_size)},
+		{&meta->gzzz, static_cast<size_t>(matrix_size)},
+		{&meta->Lapx, static_cast<size_t>(matrix_size)},
+		{&meta->Lapy, static_cast<size_t>(matrix_size)},
+		{&meta->Lapz, static_cast<size_t>(matrix_size)},
+		{&meta->betaxx, static_cast<size_t>(matrix_size)},
+		{&meta->betaxy, static_cast<size_t>(matrix_size)},
+		{&meta->betaxz, static_cast<size_t>(matrix_size)},
+		{&meta->betayy, static_cast<size_t>(matrix_size)},
+		{&meta->betayz, static_cast<size_t>(matrix_size)},
+		{&meta->betazz, static_cast<size_t>(matrix_size)},
+		{&meta->betayx, static_cast<size_t>(matrix_size)},
+		{&meta->betazy, static_cast<size_t>(matrix_size)},
+		{&meta->betazx, static_cast<size_t>(matrix_size)},
+		{&meta->Kx, static_cast<size_t>(matrix_size)},
+		{&meta->Ky, static_cast<size_t>(matrix_size)},
+		{&meta->Kz, static_cast<size_t>(matrix_size)},
+		{&meta->Gamxx, static_cast<size_t>(matrix_size)},
+		{&meta->Gamxy, static_cast<size_t>(matrix_size)},
+		{&meta->Gamxz, static_cast<size_t>(matrix_size)},
+		{&meta->Gamyy, static_cast<size_t>(matrix_size)},
+		{&meta->Gamyz, static_cast<size_t>(matrix_size)},
+		{&meta->Gamzz, static_cast<size_t>(matrix_size)},
+		{&meta->Gamyx, static_cast<size_t>(matrix_size)},
+		{&meta->Gamzy, static_cast<size_t>(matrix_size)},
+		{&meta->Gamzx, static_cast<size_t>(matrix_size)},
+		{&meta->div_beta, static_cast<size_t>(matrix_size)},
+		{&meta->S, static_cast<size_t>(matrix_size)},
+		{&meta->f, static_cast<size_t>(matrix_size)},
+		{&meta->fxx, static_cast<size_t>(matrix_size)},
+		{&meta->fxy, static_cast<size_t>(matrix_size)},
+		{&meta->fxz, static_cast<size_t>(matrix_size)},
+		{&meta->fyy, static_cast<size_t>(matrix_size)},
+		{&meta->fyz, static_cast<size_t>(matrix_size)},
+		{&meta->fzz, static_cast<size_t>(matrix_size)},
+		{&meta->gupxx, static_cast<size_t>(matrix_size)},
+		{&meta->gupxy, static_cast<size_t>(matrix_size)},
+		{&meta->gupxz, static_cast<size_t>(matrix_size)},
+		{&meta->gupyy, static_cast<size_t>(matrix_size)},
+		{&meta->gupyz, static_cast<size_t>(matrix_size)},
+		{&meta->gupzz, static_cast<size_t>(matrix_size)},
+		{&meta->Gamxa, static_cast<size_t>(matrix_size)},
+		{&meta->Gamya, static_cast<size_t>(matrix_size)},
+		{&meta->Gamza, static_cast<size_t>(matrix_size)},
+		{&meta->alpn1, static_cast<size_t>(matrix_size)},
+		{&meta->chin1, static_cast<size_t>(matrix_size)},
+		{&meta->fh, fh_size},
+		{&meta->fh2, fh2_size},
+#if (GAUGE == 2 || GAUGE == 3 || GAUGE == 4 || GAUGE == 5 || GAUGE == 6 || GAUGE == 7)
+		{&meta->reta, static_cast<size_t>(matrix_size)},
+#endif
+	};
+
+	if (!allocate_buffers(buffers, sizeof(buffers) / sizeof(buffers[0])))
+	{
+		destroy_meta(meta);
+		reset_meta(meta);
+		cache.allocated = false;
+		cache.last_x = nullptr;
+		cache.last_y = nullptr;
+		cache.last_z = nullptr;
+		return false;
+	}
+
+	cache.allocated = true;
+	return true;
+}
+
+} // namespace
+
+__global__ void test_const_address(double * testd){
 	int _t = blockIdx.x*blockDim.x+threadIdx.x;
 	if(_t == 0)
 		testd[0] = F1o3;
@@ -2010,39 +2431,30 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
               double *ham_Res, double *movx_Res, double *movy_Res,double * movz_Res, 
               double * Gmx_Res, double *Gmy_Res,double * Gmz_Res ,
               int & Symmetry,int &Lev, double &eps, int &co)
-{
-	//#1------------init gpu meta data---------------------
-	//cout<<"init GPU meta data\n";
-
-#ifdef DEVICE_ID  
-  	// which device to use
-  	cudaSetDevice(DEVICE_ID);
-#endif
-
-#ifdef DEVICE_ID_BY_PID
-	pid_t pid = getpid();
-	cudaSetDevice(pid % 2);
-	cout<<"My pid= "<<pid<<endl;
-#endif
-
-#ifdef DEVICE_ID_BY_MPI_RANK
-	cudaSetDevice(mpi_rank % 2);
-#endif
-
-#ifdef TIMING  
+{
+	//#1------------init gpu meta data---------------------
+	//cout<<"init GPU meta data\n";
+
+	const int device = select_cuda_device_for_process(mpi_rank);
+	if (device < 0)
+		return 1;
+
+#ifdef TIMING  
  struct timeval tvStart, tvEnd;
  struct timeval tv1, tv2;
  gettimeofday(&tvStart, NULL );
  gettimeofday(&tv1, NULL );
 #endif

-	//int dim = 3;
-	int matrix_size = ex[0] * ex[1] * ex[2];
-	Meta met;
-	Meta * meta = &met;
+	//int dim = 3;
+	int matrix_size = ex[0] * ex[1] * ex[2];
+	GpuRhsCache &cache = gpu_rhs_cache();
+	if (!prepare_gpu_rhs_cache(cache, device, ex))
+		return 1;
+	Meta * meta = &cache.meta;
 	
-	/*
-	//#1--------------------init_gpu_meta(meta,matrix_size)---------------------------
+	/*
+	//#1--------------------init_gpu_meta(meta,matrix_size)---------------------------

 	//1.1 inout
 	CUDA_SAFE_CALL(cudaMalloc((void**)&(Mh_ X), ex[0] * sizeof(double)));
@@ -2212,7 +2624,8 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
 	CUDA_SAFE_CALL(cudaMalloc((void**)&(Mh_ fh2), (ex[0]+3)*(ex[1]+3)*(ex[2]+3) * sizeof(double)));
 	*/
 	
-	//#1--------------------init_gpu_meta(meta,matrix_size)---------------------------
+	#if 0
+	//#1--------------------init_gpu_meta(meta,matrix_size)---------------------------

 	//1.1 inout
 	cudaMalloc((void**)&(Mh_ X), ex[0] * sizeof(double));
@@ -2396,114 +2809,133 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,

 	#endif
 	  
-//2 ----------------Copy Data to Device------------------
-	cudaMemcpy(Mh_ X, X, ex[0] * sizeof(double), cudaMemcpyHostToDevice);
-	cudaMemcpy(Mh_ Y, Y, ex[1] * sizeof(double), cudaMemcpyHostToDevice);
-	cudaMemcpy(Mh_ Z, Z, ex[2] * sizeof(double), cudaMemcpyHostToDevice);
-	cudaMemcpy(Mh_ chi, chi, matrix_size * sizeof(double), cudaMemcpyHostToDevice);
-	cudaMemcpy(Mh_ dxx, dxx, matrix_size * sizeof(double), cudaMemcpyHostToDevice);
-	cudaMemcpy(Mh_ dyy, dyy, matrix_size * sizeof(double), cudaMemcpyHostToDevice);
-	cudaMemcpy(Mh_ dzz, dzz, matrix_size * sizeof(double), cudaMemcpyHostToDevice);
-	cudaMemcpy(Mh_ trK, trK, matrix_size * sizeof(double), cudaMemcpyHostToDevice);
-	cudaMemcpy(Mh_ gxy, gxy, matrix_size * sizeof(double), cudaMemcpyHostToDevice);
-	cudaMemcpy(Mh_ gxz, gxz, matrix_size * sizeof(double), cudaMemcpyHostToDevice);
-	cudaMemcpy(Mh_ gyz, gyz, matrix_size * sizeof(double), cudaMemcpyHostToDevice);
-	cudaMemcpy(Mh_ Axx, Axx, matrix_size * sizeof(double), cudaMemcpyHostToDevice);
-	cudaMemcpy(Mh_ Axy, Axy, matrix_size * sizeof(double), cudaMemcpyHostToDevice);
-	cudaMemcpy(Mh_ Axz, Axz, matrix_size * sizeof(double), cudaMemcpyHostToDevice);
-	cudaMemcpy(Mh_ Ayz, Ayz, matrix_size * sizeof(double), cudaMemcpyHostToDevice);
-	cudaMemcpy(Mh_ Ayy, Ayy, matrix_size * sizeof(double), cudaMemcpyHostToDevice);
-	cudaMemcpy(Mh_ Azz, Azz, matrix_size * sizeof(double), cudaMemcpyHostToDevice);
-	cudaMemcpy(Mh_ Gamx, Gamx, matrix_size * sizeof(double), cudaMemcpyHostToDevice);
-	cudaMemcpy(Mh_ Gamy, Gamy, matrix_size * sizeof(double), cudaMemcpyHostToDevice);
-	cudaMemcpy(Mh_ Gamz, Gamz, matrix_size * sizeof(double), cudaMemcpyHostToDevice);
-	cudaMemcpy(Mh_ betax, betax, matrix_size * sizeof(double), cudaMemcpyHostToDevice);
-	cudaMemcpy(Mh_ betay, betay, matrix_size * sizeof(double), cudaMemcpyHostToDevice);
-	cudaMemcpy(Mh_ betaz, betaz, matrix_size * sizeof(double), cudaMemcpyHostToDevice);
-	cudaMemcpy(Mh_ Lap, Lap, matrix_size * sizeof(double), cudaMemcpyHostToDevice);
-	cudaMemcpy(Mh_ dtSfx, dtSfx, matrix_size * sizeof(double), cudaMemcpyHostToDevice);
-	cudaMemcpy(Mh_ dtSfy, dtSfy, matrix_size * sizeof(double), cudaMemcpyHostToDevice);
-	cudaMemcpy(Mh_ dtSfz, dtSfz, matrix_size * sizeof(double), cudaMemcpyHostToDevice);
-	cudaMemset(Mh_ rho,0,matrix_size * sizeof(double));
-	cudaMemset(Mh_ Sxx,0,matrix_size * sizeof(double));
-	cudaMemset(Mh_ Sxy,0,matrix_size * sizeof(double));
-	cudaMemset(Mh_ Sxz,0,matrix_size * sizeof(double));
-	cudaMemset(Mh_ Syz,0,matrix_size * sizeof(double));
-	cudaMemset(Mh_ Syy,0,matrix_size * sizeof(double));
-	cudaMemset(Mh_ Szz,0,matrix_size * sizeof(double));
-	cudaMemset(Mh_ Sx,0,matrix_size * sizeof(double));
-	cudaMemset(Mh_ Sy,0,matrix_size * sizeof(double));
-	cudaMemset(Mh_ Sz,0,matrix_size * sizeof(double));
-
-	//init local var
-	cudaMemset(Mh_ gxx,0,matrix_size * sizeof(double));
-	cudaMemset(Mh_ gyy,0,matrix_size * sizeof(double));
-	cudaMemset(Mh_ gzz,0,matrix_size * sizeof(double));
-	cudaMemset(Mh_ chix,0,matrix_size * sizeof(double));
-	cudaMemset(Mh_ chiy,0,matrix_size * sizeof(double));
-	cudaMemset(Mh_ chiz,0,matrix_size * sizeof(double));
-	cudaMemset(Mh_ gxxx,0,matrix_size * sizeof(double));
-	cudaMemset(Mh_ gxyx,0,matrix_size * sizeof(double));
-	cudaMemset(Mh_ gxzx,0,matrix_size * sizeof(double));
-	cudaMemset(Mh_ gyyx,0,matrix_size * sizeof(double));
-	cudaMemset(Mh_ gyzx,0,matrix_size * sizeof(double));
-	cudaMemset(Mh_ gzzx,0,matrix_size * sizeof(double));
-	cudaMemset(Mh_ gxxy,0,matrix_size * sizeof(double));
-	cudaMemset(Mh_ gxyy,0,matrix_size * sizeof(double));
-	cudaMemset(Mh_ gxzy,0,matrix_size * sizeof(double));
-	cudaMemset(Mh_ gyyy,0,matrix_size * sizeof(double));
-	cudaMemset(Mh_ gyzy,0,matrix_size * sizeof(double));
-	cudaMemset(Mh_ gzzy,0,matrix_size * sizeof(double));
-	cudaMemset(Mh_ gxxz,0,matrix_size * sizeof(double));
-	cudaMemset(Mh_ gxyz,0,matrix_size * sizeof(double));
-	cudaMemset(Mh_ gxzz,0,matrix_size * sizeof(double));
-	cudaMemset(Mh_ gyyz,0,matrix_size * sizeof(double));
-	cudaMemset(Mh_ gyzz,0,matrix_size * sizeof(double));
-	cudaMemset(Mh_ gzzz,0,matrix_size * sizeof(double));
-	cudaMemset(Mh_ Lapx,0,matrix_size * sizeof(double));
-	cudaMemset(Mh_ Lapy,0,matrix_size * sizeof(double));
-	cudaMemset(Mh_ Lapz,0,matrix_size * sizeof(double));
-	cudaMemset(Mh_ betaxx,0,matrix_size * sizeof(double));
-	cudaMemset(Mh_ betaxy,0,matrix_size * sizeof(double));
-	cudaMemset(Mh_ betaxz,0,matrix_size * sizeof(double));
-	cudaMemset(Mh_ betayy,0,matrix_size * sizeof(double));
-	cudaMemset(Mh_ betayz,0,matrix_size * sizeof(double));
-	cudaMemset(Mh_ betazz,0,matrix_size * sizeof(double));
-	cudaMemset(Mh_ betayx,0,matrix_size * sizeof(double));
-	cudaMemset(Mh_ betazy,0,matrix_size * sizeof(double));
-	cudaMemset(Mh_ betazx,0,matrix_size * sizeof(double));
-	cudaMemset(Mh_ Kx,0,matrix_size * sizeof(double));
-	cudaMemset(Mh_ Ky,0,matrix_size * sizeof(double));
-	cudaMemset(Mh_ Kz,0,matrix_size * sizeof(double));
-	cudaMemset(Mh_ Gamxx,0,matrix_size * sizeof(double));
-	cudaMemset(Mh_ Gamxy,0,matrix_size * sizeof(double));
-	cudaMemset(Mh_ Gamxz,0,matrix_size * sizeof(double));
-	cudaMemset(Mh_ Gamyy,0,matrix_size * sizeof(double));
-	cudaMemset(Mh_ Gamyz,0,matrix_size * sizeof(double));
-	cudaMemset(Mh_ Gamzz,0,matrix_size * sizeof(double));
-	cudaMemset(Mh_ Gamyx,0,matrix_size * sizeof(double));
-	cudaMemset(Mh_ Gamzy,0,matrix_size * sizeof(double));
-	cudaMemset(Mh_ Gamzx,0,matrix_size * sizeof(double));
-	cudaMemset(Mh_ div_beta,0,matrix_size * sizeof(double));
-	cudaMemset(Mh_ S,0,matrix_size * sizeof(double));
-	cudaMemset(Mh_ f,0,matrix_size * sizeof(double));
-	cudaMemset(Mh_ fxx,0,matrix_size * sizeof(double));
-	cudaMemset(Mh_ fxy,0,matrix_size * sizeof(double));
-	cudaMemset(Mh_ fxz,0,matrix_size * sizeof(double));
-	cudaMemset(Mh_ fyy,0,matrix_size * sizeof(double));
-	cudaMemset(Mh_ fyz,0,matrix_size * sizeof(double));
-	cudaMemset(Mh_ fzz,0,matrix_size * sizeof(double));
-	cudaMemset(Mh_ gupxx,0,matrix_size * sizeof(double));
-	cudaMemset(Mh_ gupxy,0,matrix_size * sizeof(double));
-	cudaMemset(Mh_ gupxz,0,matrix_size * sizeof(double));
-	cudaMemset(Mh_ gupyy,0,matrix_size * sizeof(double));
-	cudaMemset(Mh_ gupyz,0,matrix_size * sizeof(double));
-	cudaMemset(Mh_ gupzz,0,matrix_size * sizeof(double));
-	cudaMemset(Mh_ Gamxa,0,matrix_size * sizeof(double));
-	cudaMemset(Mh_ Gamya,0,matrix_size * sizeof(double));
-	cudaMemset(Mh_ Gamza,0,matrix_size * sizeof(double));
-	cudaMemset(Mh_ alpn1,0,matrix_size * sizeof(double));
-	cudaMemset(Mh_ chin1,0,matrix_size * sizeof(double));
+	#endif
+//2 ----------------Copy Data to Device------------------
+	if (cache.last_x != X || cache.last_y != Y || cache.last_z != Z)
+	{
+		const CopySpec coord_copies[] = {
+			{Mh_ X, X, static_cast<size_t>(ex[0])},
+			{Mh_ Y, Y, static_cast<size_t>(ex[1])},
+			{Mh_ Z, Z, static_cast<size_t>(ex[2])},
+		};
+		if (!copy_buffers_to_device(coord_copies, sizeof(coord_copies) / sizeof(coord_copies[0])))
+			return 1;
+		cache.last_x = X;
+		cache.last_y = Y;
+		cache.last_z = Z;
+	}
+
+	const CopySpec state_copies[] = {
+		{Mh_ chi, chi, static_cast<size_t>(matrix_size)},
+		{Mh_ dxx, dxx, static_cast<size_t>(matrix_size)},
+		{Mh_ dyy, dyy, static_cast<size_t>(matrix_size)},
+		{Mh_ dzz, dzz, static_cast<size_t>(matrix_size)},
+		{Mh_ trK, trK, static_cast<size_t>(matrix_size)},
+		{Mh_ gxy, gxy, static_cast<size_t>(matrix_size)},
+		{Mh_ gxz, gxz, static_cast<size_t>(matrix_size)},
+		{Mh_ gyz, gyz, static_cast<size_t>(matrix_size)},
+		{Mh_ Axx, Axx, static_cast<size_t>(matrix_size)},
+		{Mh_ Axy, Axy, static_cast<size_t>(matrix_size)},
+		{Mh_ Axz, Axz, static_cast<size_t>(matrix_size)},
+		{Mh_ Ayz, Ayz, static_cast<size_t>(matrix_size)},
+		{Mh_ Ayy, Ayy, static_cast<size_t>(matrix_size)},
+		{Mh_ Azz, Azz, static_cast<size_t>(matrix_size)},
+		{Mh_ Gamx, Gamx, static_cast<size_t>(matrix_size)},
+		{Mh_ Gamy, Gamy, static_cast<size_t>(matrix_size)},
+		{Mh_ Gamz, Gamz, static_cast<size_t>(matrix_size)},
+		{Mh_ betax, betax, static_cast<size_t>(matrix_size)},
+		{Mh_ betay, betay, static_cast<size_t>(matrix_size)},
+		{Mh_ betaz, betaz, static_cast<size_t>(matrix_size)},
+		{Mh_ Lap, Lap, static_cast<size_t>(matrix_size)},
+		{Mh_ dtSfx, dtSfx, static_cast<size_t>(matrix_size)},
+		{Mh_ dtSfy, dtSfy, static_cast<size_t>(matrix_size)},
+		{Mh_ dtSfz, dtSfz, static_cast<size_t>(matrix_size)},
+	};
+	if (!copy_buffers_to_device(state_copies, sizeof(state_copies) / sizeof(state_copies[0])))
+		return 1;
+
+	const ZeroSpec zero_specs[] = {
+		{Mh_ rho, static_cast<size_t>(matrix_size)},
+		{Mh_ Sxx, static_cast<size_t>(matrix_size)},
+		{Mh_ Sxy, static_cast<size_t>(matrix_size)},
+		{Mh_ Sxz, static_cast<size_t>(matrix_size)},
+		{Mh_ Syz, static_cast<size_t>(matrix_size)},
+		{Mh_ Syy, static_cast<size_t>(matrix_size)},
+		{Mh_ Szz, static_cast<size_t>(matrix_size)},
+		{Mh_ Sx, static_cast<size_t>(matrix_size)},
+		{Mh_ Sy, static_cast<size_t>(matrix_size)},
+		{Mh_ Sz, static_cast<size_t>(matrix_size)},
+		{Mh_ gxx, static_cast<size_t>(matrix_size)},
+		{Mh_ gyy, static_cast<size_t>(matrix_size)},
+		{Mh_ gzz, static_cast<size_t>(matrix_size)},
+		{Mh_ chix, static_cast<size_t>(matrix_size)},
+		{Mh_ chiy, static_cast<size_t>(matrix_size)},
+		{Mh_ chiz, static_cast<size_t>(matrix_size)},
+		{Mh_ gxxx, static_cast<size_t>(matrix_size)},
+		{Mh_ gxyx, static_cast<size_t>(matrix_size)},
+		{Mh_ gxzx, static_cast<size_t>(matrix_size)},
+		{Mh_ gyyx, static_cast<size_t>(matrix_size)},
+		{Mh_ gyzx, static_cast<size_t>(matrix_size)},
+		{Mh_ gzzx, static_cast<size_t>(matrix_size)},
+		{Mh_ gxxy, static_cast<size_t>(matrix_size)},
+		{Mh_ gxyy, static_cast<size_t>(matrix_size)},
+		{Mh_ gxzy, static_cast<size_t>(matrix_size)},
+		{Mh_ gyyy, static_cast<size_t>(matrix_size)},
+		{Mh_ gyzy, static_cast<size_t>(matrix_size)},
+		{Mh_ gzzy, static_cast<size_t>(matrix_size)},
+		{Mh_ gxxz, static_cast<size_t>(matrix_size)},
+		{Mh_ gxyz, static_cast<size_t>(matrix_size)},
+		{Mh_ gxzz, static_cast<size_t>(matrix_size)},
+		{Mh_ gyyz, static_cast<size_t>(matrix_size)},
+		{Mh_ gyzz, static_cast<size_t>(matrix_size)},
+		{Mh_ gzzz, static_cast<size_t>(matrix_size)},
+		{Mh_ Lapx, static_cast<size_t>(matrix_size)},
+		{Mh_ Lapy, static_cast<size_t>(matrix_size)},
+		{Mh_ Lapz, static_cast<size_t>(matrix_size)},
+		{Mh_ betaxx, static_cast<size_t>(matrix_size)},
+		{Mh_ betaxy, static_cast<size_t>(matrix_size)},
+		{Mh_ betaxz, static_cast<size_t>(matrix_size)},
+		{Mh_ betayy, static_cast<size_t>(matrix_size)},
+		{Mh_ betayz, static_cast<size_t>(matrix_size)},
+		{Mh_ betazz, static_cast<size_t>(matrix_size)},
+		{Mh_ betayx, static_cast<size_t>(matrix_size)},
+		{Mh_ betazy, static_cast<size_t>(matrix_size)},
+		{Mh_ betazx, static_cast<size_t>(matrix_size)},
+		{Mh_ Kx, static_cast<size_t>(matrix_size)},
+		{Mh_ Ky, static_cast<size_t>(matrix_size)},
+		{Mh_ Kz, static_cast<size_t>(matrix_size)},
+		{Mh_ Gamxx, static_cast<size_t>(matrix_size)},
+		{Mh_ Gamxy, static_cast<size_t>(matrix_size)},
+		{Mh_ Gamxz, static_cast<size_t>(matrix_size)},
+		{Mh_ Gamyy, static_cast<size_t>(matrix_size)},
+		{Mh_ Gamyz, static_cast<size_t>(matrix_size)},
+		{Mh_ Gamzz, static_cast<size_t>(matrix_size)},
+		{Mh_ Gamyx, static_cast<size_t>(matrix_size)},
+		{Mh_ Gamzy, static_cast<size_t>(matrix_size)},
+		{Mh_ Gamzx, static_cast<size_t>(matrix_size)},
+		{Mh_ div_beta, static_cast<size_t>(matrix_size)},
+		{Mh_ S, static_cast<size_t>(matrix_size)},
+		{Mh_ f, static_cast<size_t>(matrix_size)},
+		{Mh_ fxx, static_cast<size_t>(matrix_size)},
+		{Mh_ fxy, static_cast<size_t>(matrix_size)},
+		{Mh_ fxz, static_cast<size_t>(matrix_size)},
+		{Mh_ fyy, static_cast<size_t>(matrix_size)},
+		{Mh_ fyz, static_cast<size_t>(matrix_size)},
+		{Mh_ fzz, static_cast<size_t>(matrix_size)},
+		{Mh_ gupxx, static_cast<size_t>(matrix_size)},
+		{Mh_ gupxy, static_cast<size_t>(matrix_size)},
+		{Mh_ gupxz, static_cast<size_t>(matrix_size)},
+		{Mh_ gupyy, static_cast<size_t>(matrix_size)},
+		{Mh_ gupyz, static_cast<size_t>(matrix_size)},
+		{Mh_ gupzz, static_cast<size_t>(matrix_size)},
+		{Mh_ Gamxa, static_cast<size_t>(matrix_size)},
+		{Mh_ Gamya, static_cast<size_t>(matrix_size)},
+		{Mh_ Gamza, static_cast<size_t>(matrix_size)},
+		{Mh_ alpn1, static_cast<size_t>(matrix_size)},
+		{Mh_ chin1, static_cast<size_t>(matrix_size)},
+	};
+	if (!zero_buffers(zero_specs, sizeof(zero_specs) / sizeof(zero_specs[0])))
+		return 1;
 	
 	
 	double sss[3] = {1,1,1};
@@ -2907,8 +3339,5 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
 #endif


-	destroy_meta(meta);
-
-	
-	return 0;//TODO return
-}
+	return 0;//TODO return
+}
--- a/AMSS_NCKU_source/bssn_gpu.h
+++ b/AMSS_NCKU_source/bssn_gpu.h
@@ -4,10 +4,8 @@
 #include "bssn_macro.h"
 #include "macrodef.fh"

-#define DEVICE_ID 0
-// #define DEVICE_ID_BY_MPI_RANK
-#define GRID_DIM 256
-#define BLOCK_DIM 128
+#define GRID_DIM 256
+#define BLOCK_DIM 128

 #define _FH2_(i, j, k) fh[(i) + (j) * _1D_SIZE[2] + (k) * _2D_SIZE[2]]
 #define _FH3_(i, j, k) fh[(i) + (j) * _1D_SIZE[3] + (k) * _2D_SIZE[3]]