diff --git a/AMSS_NCKU_source/bssn_rhs_cuda.cu b/AMSS_NCKU_source/bssn_rhs_cuda.cu
index ca6aab8..1a13b41 100644
--- a/AMSS_NCKU_source/bssn_rhs_cuda.cu
+++ b/AMSS_NCKU_source/bssn_rhs_cuda.cu
@@ -12,6 +12,38 @@
 #include "macrodef.h"
 #include "bssn_rhs.h"
 
+/* ------------------------------------------------------------------ */
+/*  Multi-GPU dispatch: distribute ranks across available GPUs          */
+/* ------------------------------------------------------------------ */
+static struct {
+    int  num_gpus;
+    int  my_rank;
+    int  my_device;
+    bool inited;
+} g_dispatch = {0, -1, -1, false};
+
+static void init_gpu_dispatch() {
+    if (g_dispatch.inited) return;
+    cudaGetDeviceCount(&g_dispatch.num_gpus);
+    if (g_dispatch.num_gpus < 1) g_dispatch.num_gpus = 1;
+
+    /* Get MPI rank from environment (set by mpirun/mpiexec). */
+    const char *rank_env = getenv("PMI_RANK");
+    if (!rank_env) rank_env = getenv("OMPI_COMM_WORLD_RANK");
+    if (!rank_env) rank_env = getenv("MV2_COMM_WORLD_RANK");
+    if (!rank_env) rank_env = getenv("SLURM_PROCID");
+    g_dispatch.my_rank = rank_env ? atoi(rank_env) : 0;
+
+    g_dispatch.my_device = g_dispatch.my_rank % g_dispatch.num_gpus;
+    cudaSetDevice(g_dispatch.my_device);
+
+    if (g_dispatch.my_rank == 0) {
+        printf("[AMSS-GPU] %d GPU(s) detected, ranks round-robin across devices\n",
+               g_dispatch.num_gpus);
+    }
+    g_dispatch.inited = true;
+}
+
 /* ------------------------------------------------------------------ */
 /*  Error checking                                                     */
 /* ------------------------------------------------------------------ */
@@ -96,12 +128,13 @@ struct GpuBuffers {
     double *d_mem;          /* single big allocation */
     double *d_fh2;          /* ghost-padded ord=2: (nx+2)*(ny+2)*(nz+2) */
     double *d_fh3;          /* ghost-padded ord=3: (nx+3)*(ny+3)*(nz+3) */
+    double *h_stage;        /* host staging buffer for bulk H2D/D2H */
     double *slot[NUM_SLOTS]; /* pointers into d_mem */
     int     prev_nx, prev_ny, prev_nz;
     bool    initialized;
 };
 
-static GpuBuffers g_buf = { nullptr, nullptr, nullptr, {}, 0, 0, 0, false };
+static GpuBuffers g_buf = { nullptr, nullptr, nullptr, nullptr, {}, 0, 0, 0, false };
 
 /* Slot assignments — INPUT (H2D) */
 enum {
@@ -162,6 +195,7 @@ static void ensure_gpu_buffers(int nx, int ny, int nz) {
     if (g_buf.d_mem) { cudaFree(g_buf.d_mem); g_buf.d_mem = nullptr; }
     if (g_buf.d_fh2) { cudaFree(g_buf.d_fh2); g_buf.d_fh2 = nullptr; }
     if (g_buf.d_fh3) { cudaFree(g_buf.d_fh3); g_buf.d_fh3 = nullptr; }
+    if (g_buf.h_stage) { free(g_buf.h_stage); g_buf.h_stage = nullptr; }
 
     size_t all = (size_t)nx * ny * nz;
     size_t fh2_size = (size_t)(nx+2) * (ny+2) * (nz+2);
@@ -171,6 +205,11 @@ static void ensure_gpu_buffers(int nx, int ny, int nz) {
     CUDA_CHECK(cudaMalloc(&g_buf.d_fh2, fh2_size * sizeof(double)));
     CUDA_CHECK(cudaMalloc(&g_buf.d_fh3, fh3_size * sizeof(double)));
 
+    /* Host staging buffer for bulk H2D/D2H transfers.
+       Size = max(H2D input slots, D2H output slots) * all doubles. */
+    size_t stage_slots = NUM_USED_SLOTS;  /* generous upper bound */
+    g_buf.h_stage = (double *)malloc(stage_slots * all * sizeof(double));
+
     for (int s = 0; s < NUM_USED_SLOTS; ++s)
         g_buf.slot[s] = g_buf.d_mem + s * all;
 
@@ -1946,6 +1985,20 @@ int f_compute_rhs_bssn(int *ex, double &T,
     double *Gmx_Res, double *Gmy_Res, double *Gmz_Res,
     int &Symmetry, int &Lev, double &eps, int &co)
 {
+    /* --- Multi-GPU: select device --- */
+    init_gpu_dispatch();
+    cudaSetDevice(g_dispatch.my_device);
+
+    /* --- Profiling: cudaEvent timers (rank 0 only, first 20 calls) --- */
+    static int prof_call_count = 0;
+    const bool do_prof = (g_dispatch.my_rank == 0 && prof_call_count < 20);
+    cudaEvent_t ev_start, ev_h2d, ev_kern, ev_d2h;
+    if (do_prof) {
+        cudaEventCreate(&ev_start); cudaEventCreate(&ev_h2d);
+        cudaEventCreate(&ev_kern);  cudaEventCreate(&ev_d2h);
+        cudaEventRecord(ev_start);
+    }
+
     const int nx = ex[0], ny = ex[1], nz = ex[2];
     const int all = nx * ny * nz;
     const double dX = X[1]-X[0], dY = Y[1]-Y[0], dZ = Z[1]-Z[0];
@@ -2019,6 +2072,8 @@ int f_compute_rhs_bssn(int *ex, double &T,
     CUDA_CHECK(cudaMemcpy(D(S_Syz),   Syz_m, bytes, cudaMemcpyHostToDevice));
     CUDA_CHECK(cudaMemcpy(D(S_Szz),   Szz,   bytes, cudaMemcpyHostToDevice));
 
+    if (do_prof) cudaEventRecord(ev_h2d);
+
     /* ============================================================ */
     /*  Phase 1: prep — alpn1, chin1, gxx, gyy, gzz                 */
     /* ============================================================ */
@@ -2401,6 +2456,8 @@ int f_compute_rhs_bssn(int *ex, double &T,
             D(S_ham_Res), D(S_movx_Res), D(S_movy_Res), D(S_movz_Res));
     }
 
+    if (do_prof) cudaEventRecord(ev_kern);
+
     /* ============================================================ */
     /*  D2H: copy all output arrays back to host                     */
     /* ============================================================ */
@@ -2428,8 +2485,6 @@ int f_compute_rhs_bssn(int *ex, double &T,
     CUDA_CHECK(cudaMemcpy(dtSfx_rhs, D(S_dtSfx_rhs), bytes, cudaMemcpyDeviceToHost));
     CUDA_CHECK(cudaMemcpy(dtSfy_rhs, D(S_dtSfy_rhs), bytes, cudaMemcpyDeviceToHost));
     CUDA_CHECK(cudaMemcpy(dtSfz_rhs, D(S_dtSfz_rhs), bytes, cudaMemcpyDeviceToHost));
-
-    /* Christoffel symbols */
     CUDA_CHECK(cudaMemcpy(Gamxxx, D(S_Gamxxx), bytes, cudaMemcpyDeviceToHost));
     CUDA_CHECK(cudaMemcpy(Gamxxy, D(S_Gamxxy), bytes, cudaMemcpyDeviceToHost));
     CUDA_CHECK(cudaMemcpy(Gamxxz, D(S_Gamxxz), bytes, cudaMemcpyDeviceToHost));
@@ -2448,16 +2503,12 @@ int f_compute_rhs_bssn(int *ex, double &T,
     CUDA_CHECK(cudaMemcpy(Gamzyy, D(S_Gamzyy), bytes, cudaMemcpyDeviceToHost));
     CUDA_CHECK(cudaMemcpy(Gamzyz, D(S_Gamzyz), bytes, cudaMemcpyDeviceToHost));
     CUDA_CHECK(cudaMemcpy(Gamzzz, D(S_Gamzzz), bytes, cudaMemcpyDeviceToHost));
-
-    /* Ricci tensor */
     CUDA_CHECK(cudaMemcpy(Rxx, D(S_Rxx), bytes, cudaMemcpyDeviceToHost));
     CUDA_CHECK(cudaMemcpy(Rxy, D(S_Rxy), bytes, cudaMemcpyDeviceToHost));
     CUDA_CHECK(cudaMemcpy(Rxz, D(S_Rxz), bytes, cudaMemcpyDeviceToHost));
     CUDA_CHECK(cudaMemcpy(Ryy, D(S_Ryy), bytes, cudaMemcpyDeviceToHost));
     CUDA_CHECK(cudaMemcpy(Ryz, D(S_Ryz), bytes, cudaMemcpyDeviceToHost));
     CUDA_CHECK(cudaMemcpy(Rzz, D(S_Rzz), bytes, cudaMemcpyDeviceToHost));
-
-    /* Constraint residuals (only meaningful when co==0) */
     if (co == 0) {
         CUDA_CHECK(cudaMemcpy(ham_Res,  D(S_ham_Res),  bytes, cudaMemcpyDeviceToHost));
         CUDA_CHECK(cudaMemcpy(movx_Res, D(S_movx_Res), bytes, cudaMemcpyDeviceToHost));
@@ -2468,6 +2519,22 @@ int f_compute_rhs_bssn(int *ex, double &T,
         CUDA_CHECK(cudaMemcpy(Gmz_Res,  D(S_Gmz_Res),  bytes, cudaMemcpyDeviceToHost));
     }
 
+    if (do_prof) {
+        cudaEventRecord(ev_d2h);
+        cudaEventSynchronize(ev_d2h);
+        float t_h2d, t_kern, t_d2h;
+        cudaEventElapsedTime(&t_h2d,  ev_start, ev_h2d);
+        cudaEventElapsedTime(&t_kern, ev_h2d,   ev_kern);
+        cudaEventElapsedTime(&t_d2h,  ev_kern,  ev_d2h);
+        printf("[AMSS-PROF] call#%d nx=%d ny=%d nz=%d(all=%d) "
+               "H2D=%.3fms Kern=%.3fms D2H=%.3fms Total=%.3fms\n",
+               prof_call_count, nx, ny, nz, all,
+               t_h2d, t_kern, t_d2h, t_h2d + t_kern + t_d2h);
+        cudaEventDestroy(ev_start); cudaEventDestroy(ev_h2d);
+        cudaEventDestroy(ev_kern);  cudaEventDestroy(ev_d2h);
+        prof_call_count++;
+    }
+
     #undef D
     return 0;
 }
\ No newline at end of file
diff --git a/AMSS_NCKU_source/makefile.inc b/AMSS_NCKU_source/makefile.inc
index 5f13b89..b72ee4d 100755
--- a/AMSS_NCKU_source/makefile.inc
+++ b/AMSS_NCKU_source/makefile.inc
@@ -42,4 +42,4 @@ CLINKER      = mpiicpx
 Cu = nvcc
 CUDA_LIB_PATH = -L/usr/lib/cuda/lib64 -I/usr/include -I/usr/lib/cuda/include
 #CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -arch compute_13 -code compute_13,sm_13 -Dfortran3 -Dnewc
-CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -Dfortran3 -Dnewc -arch=sm_89
+CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -Dfortran3 -Dnewc -arch=sm_80