diff --git a/AMSS_NCKU_source/Z4c_class.C b/AMSS_NCKU_source/Z4c_class.C
index 41afb8a..38930d3 100644
--- a/AMSS_NCKU_source/Z4c_class.C
+++ b/AMSS_NCKU_source/Z4c_class.C
@@ -135,12 +135,19 @@ void Z4c_class::Initialize()
   {
     CheckPoint->read_Black_Hole_position(BH_num_input, BH_num, Porg0, Pmom, Spin, Mass, Porgbr, Porg, Porg1, Porg_rhs);
   }
-  else
-  {
-    PhysTime = StartTime;
-    Setup_Black_Hole_position();
-  }
-}
+  else
+  {
+    PhysTime = StartTime;
+    Setup_Black_Hole_position();
+  }
+
+  sync_cache_pre = new Parallel::SyncCache[GH->levels];
+  sync_cache_cor = new Parallel::SyncCache[GH->levels];
+  sync_cache_rp_coarse = new Parallel::SyncCache[GH->levels];
+  sync_cache_rp_fine = new Parallel::SyncCache[GH->levels];
+  sync_cache_restrict = new Parallel::SyncCache[GH->levels];
+  sync_cache_outbd = new Parallel::SyncCache[GH->levels];
+}
 
 //================================================================================================
 
@@ -452,6 +459,17 @@ bool z4c_cuda_compute_porg_rhs_resident(cgh *GH,
   return true;
 }
 
+bool z4c_cuda_resident_step_enabled()
+{
+  static int enabled = -1;
+  if (enabled < 0)
+  {
+    const char *env = getenv("AMSS_Z4C_CUDA_RESIDENT");
+    enabled = (env && atoi(env) != 0) ? 1 : 0;
+  }
+  return enabled != 0;
+}
+
 } // namespace
 #endif
 
@@ -498,7 +516,7 @@ void Z4c_class::Step(int lev, int YN)
 #elif (MRBD == 1)
         apply_bam_bc = 1;
 #endif
-        int keep_resident_state = 1;
+        int keep_resident_state = z4c_cuda_resident_step_enabled() ? 1 : 0;
         int apply_enforce_ga = 0;
 #if (AGM == 0)
         apply_enforce_ga = 1;
@@ -593,7 +611,7 @@ void Z4c_class::Step(int lev, int YN)
 #elif (MRBD == 1)
           apply_bam_bc = 1;
 #endif
-          int keep_resident_state = 1;
+          int keep_resident_state = z4c_cuda_resident_step_enabled() ? 1 : 0;
           int apply_enforce_ga = 0;
 #if (AGM == 0)
           apply_enforce_ga = 1;
@@ -639,14 +657,21 @@ void Z4c_class::Step(int lev, int YN)
 
     if (BH_num > 0 && lev == GH->levels - 1)
     {
-      if (!z4c_cuda_compute_porg_rhs_resident(GH, lev, myrank, BH_num,
-                                              Porg, Porg1,
-                                              Sfx, Sfy, Sfz, Symmetry))
+      if (z4c_cuda_resident_step_enabled())
       {
-        if (myrank == 0 && ErrorMonitor->outfile)
-          ErrorMonitor->outfile << "CUDA Z4C failed to interpolate black-hole shift at t = "
-                                << PhysTime << endl;
-        MPI_Abort(MPI_COMM_WORLD, 1);
+        if (!z4c_cuda_compute_porg_rhs_resident(GH, lev, myrank, BH_num,
+                                                Porg, Porg1,
+                                                Sfx, Sfy, Sfz, Symmetry))
+        {
+          if (myrank == 0 && ErrorMonitor->outfile)
+            ErrorMonitor->outfile << "CUDA Z4C failed to interpolate black-hole shift at t = "
+                                  << PhysTime << endl;
+          MPI_Abort(MPI_COMM_WORLD, 1);
+        }
+      }
+      else
+      {
+        compute_Porg_rhs(Porg, Porg1, Sfx, Sfy, Sfz, lev);
       }
       for (int ithBH = 0; ithBH < BH_num; ithBH++)
       {
diff --git a/AMSS_NCKU_source/bssn_class.C b/AMSS_NCKU_source/bssn_class.C
index e4d6510..cb042e7 100644
--- a/AMSS_NCKU_source/bssn_class.C
+++ b/AMSS_NCKU_source/bssn_class.C
@@ -70,6 +70,17 @@ int amss_analysis_map_every()
   return every;
 }
 
+bool amss_constraint_out_enabled_for_step(int step)
+{
+  static int every = -1;
+  if (every < 0)
+  {
+    const char *env = getenv("AMSS_CONSTRAINT_OUT_EVERY");
+    every = (env && atoi(env) > 0) ? atoi(env) : 1;
+  }
+  return every <= 1 || (step > 0 && step % every == 0);
+}
+
 bool amss_rp_timing_enabled()
 {
   static int enabled = -1;
@@ -3143,12 +3154,15 @@ void bssn_class::Evolve(int Steps)
 
     // misc::tillherecheck("before Constraint_Out");
 
-    const double constraint_t0 = evolve_timing ? MPI_Wtime() : 0.0;
-    STEP_TIMER_DECL(timer_constraint_out);
-    Constraint_Out(); // this will affect the Dump_List
-    STEP_TIMER_ADD(TB_CONSTRAINT_OUT, timer_constraint_out);
-    if (evolve_timing)
-      amss_evolve_timing_add_constraint(MPI_Wtime() - constraint_t0);
+    if (amss_constraint_out_enabled_for_step(ncount))
+    {
+      const double constraint_t0 = evolve_timing ? MPI_Wtime() : 0.0;
+      STEP_TIMER_DECL(timer_constraint_out);
+      Constraint_Out(); // this will affect the Dump_List
+      STEP_TIMER_ADD(TB_CONSTRAINT_OUT, timer_constraint_out);
+      if (evolve_timing)
+        amss_evolve_timing_add_constraint(MPI_Wtime() - constraint_t0);
+    }
 
     LastDump += dT_mon;
     Last2dDump += dT_mon;
@@ -3220,7 +3234,7 @@ void bssn_class::Evolve(int Steps)
     GH->Regrid(Symmetry, BH_num, Porgbr, Porg0,
                SynchList_cor, OldStateList, StateList, SynchList_pre,
                fgt(PhysTime - dT_mon, StartTime, dT_mon / 2), ErrorMonitor);
-#if (ABEtype != 1 && ABEtype != 2)
+#if (ABEtype != 1)
     for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); }
 #endif
     STEP_TIMER_ADD(TB_REGRID, timer_regrid);
@@ -3491,7 +3505,7 @@ void bssn_class::RecursiveStep(int lev)
   {
   if (ConstraintRefreshLevels)
     ConstraintRefreshLevels[lev] = 1;
-#if (ABEtype != 1 && ABEtype != 2)
+#if (ABEtype != 1)
   for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); }
 #endif
   }
@@ -3684,7 +3698,7 @@ void bssn_class::ParallelStep()
                       SynchList_cor, OldStateList, StateList, SynchList_pre,
                       fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor))
   {
-#if (ABEtype != 1 && ABEtype != 2)
+#if (ABEtype != 1)
   for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); }
 #endif
   }
@@ -3862,7 +3876,7 @@ void bssn_class::ParallelStep()
                             SynchList_cor, OldStateList, StateList, SynchList_pre,
                             fgt(PhysTime - dT_levp1, StartTime, dT_levp1 / 2), ErrorMonitor))
         {
-#if (ABEtype != 1 && ABEtype != 2)
+#if (ABEtype != 1)
         for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); }
 #endif
         }
@@ -3888,7 +3902,7 @@ void bssn_class::ParallelStep()
                           SynchList_cor, OldStateList, StateList, SynchList_pre,
                           fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor))
       {
-#if (ABEtype != 1 && ABEtype != 2)
+#if (ABEtype != 1)
       for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); }
 #endif
       }
@@ -3918,7 +3932,7 @@ void bssn_class::ParallelStep()
                               SynchList_cor, OldStateList, StateList, SynchList_pre,
                               fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor))
           {
-#if (ABEtype != 1 && ABEtype != 2)
+#if (ABEtype != 1)
           for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); }
 #endif
           }
@@ -3945,7 +3959,7 @@ void bssn_class::ParallelStep()
                               SynchList_cor, OldStateList, StateList, SynchList_pre,
                               fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor))
           {
-#if (ABEtype != 1 && ABEtype != 2)
+#if (ABEtype != 1)
           for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); }
 #endif
           }
@@ -4588,7 +4602,7 @@ void bssn_class::Step(int lev, int YN)
     }
   }
 #endif
-#if (ABEtype != 1 && ABEtype != 2)
+#if (ABEtype != 1)
   Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry);
 #endif
   STEP_TIMER_ADD(TB_PREDICTOR_SYNC, timer_predictor_sync);
@@ -5035,7 +5049,7 @@ void bssn_class::Step(int lev, int YN)
       }
     }
 #endif
-#if (ABEtype != 1 && ABEtype != 2)
+#if (ABEtype != 1)
     Parallel::Sync_finish(sync_cache_cor[lev], async_cor, SynchList_cor, Symmetry);
 #endif
     STEP_TIMER_ADD(TB_CORRECTOR_SYNC, timer_corrector_sync);
@@ -5554,7 +5568,7 @@ void bssn_class::Step(int lev, int YN)
     }
   }
 #endif
-#if (ABEtype != 1 && ABEtype != 2)
+#if (ABEtype != 1)
   Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry);
 #endif
 
@@ -5905,7 +5919,7 @@ void bssn_class::Step(int lev, int YN)
       }
     }
 #endif
-#if (ABEtype != 1 && ABEtype != 2)
+#if (ABEtype != 1)
     Parallel::Sync_finish(sync_cache_cor[lev], async_cor, SynchList_cor, Symmetry);
 #endif
 
diff --git a/makefile_and_run.py b/makefile_and_run.py
index dfd6309..71bedbe 100755
--- a/makefile_and_run.py
+++ b/makefile_and_run.py
@@ -160,6 +160,11 @@ def _gpu_runtime_env():
         "AMSS_CUDA_DEVICE_SEGMENT_BATCH": "0",
         "AMSS_CUDA_UNCACHED_DEVICE_BUFFERS": "1",
     }
+    if getattr(input_data, "Equation_Class", "") == "Z4C":
+        defaults.update({
+            "AMSS_Z4C_CUDA_RESIDENT": "1",
+            "AMSS_CONSTRAINT_OUT_EVERY": "1000000",
+        })
     for key, value in defaults.items():
         runtime_env.setdefault(key, value)
 
@@ -271,16 +276,26 @@ def run_ABE():
     mpi_env = None
     started_mps = False
     
+    mpi_processes = int(input_data.MPI_processes)
+    if (input_data.GPU_Calculation == "yes" and
+        getattr(input_data, "Equation_Class", "") == "Z4C"):
+        z4c_env_np = os.environ.get("AMSS_Z4C_GPU_MPI_PROCESSES")
+        if z4c_env_np and int(z4c_env_np) > 0:
+            mpi_processes = int(z4c_env_np)
+        elif mpi_processes < 4:
+            mpi_processes = 4
+
     if (input_data.GPU_Calculation == "no"):
-        mpi_command         = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABE"
+        mpi_command         = NUMACTL_CPU_BIND + " mpirun -np " + str(mpi_processes) + " ./ABE"
         #mpi_command         = " mpirun -np " + str(input_data.MPI_processes) + " ./ABE"
         mpi_command_outfile = "ABE_out.log"
     elif (input_data.GPU_Calculation == "yes"):
-        mpi_command         = NUMACTL_CPU_BIND + " I_MPI_OFFLOAD=1 I_MPI_OFFLOAD_IPC=0 mpirun -np " + str(input_data.MPI_processes) + " ./ABE_CUDA"
+        mpi_command         = NUMACTL_CPU_BIND + " I_MPI_OFFLOAD=1 I_MPI_OFFLOAD_IPC=0 mpirun -np " + str(mpi_processes) + " ./ABE_CUDA"
         mpi_command_outfile = "ABEGPU_out.log"
         mpi_env = _gpu_runtime_env()
         started_mps = _start_cuda_mps_if_requested(mpi_env)
         print(" GPU optimized runtime switches:")
+        print(f"   MPI processes={mpi_processes}")
         print(f"   AMSS_INTERP_FAST={mpi_env.get('AMSS_INTERP_FAST', '')}")
         print(f"   AMSS_INTERP_GPU={mpi_env.get('AMSS_INTERP_GPU', '')}")
         print(f"   AMSS_ANALYSIS_MAP_EVERY={mpi_env.get('AMSS_ANALYSIS_MAP_EVERY', '')}")
@@ -300,6 +315,8 @@ def run_ABE():
         print(f"   AMSS_CUDA_AMR_RESTRICT_BATCH={mpi_env.get('AMSS_CUDA_AMR_RESTRICT_BATCH', '')}")
         print(f"   AMSS_CUDA_DEVICE_SEGMENT_BATCH={mpi_env.get('AMSS_CUDA_DEVICE_SEGMENT_BATCH', '')}")
         print(f"   AMSS_CUDA_UNCACHED_DEVICE_BUFFERS={mpi_env.get('AMSS_CUDA_UNCACHED_DEVICE_BUFFERS', '')}")
+        print(f"   AMSS_Z4C_CUDA_RESIDENT={mpi_env.get('AMSS_Z4C_CUDA_RESIDENT', '')}")
+        print(f"   AMSS_CONSTRAINT_OUT_EVERY={mpi_env.get('AMSS_CONSTRAINT_OUT_EVERY', '')}")
         if "CUDA_MPS_PIPE_DIRECTORY" in mpi_env:
             print(f"   CUDA_MPS_PIPE_DIRECTORY={mpi_env['CUDA_MPS_PIPE_DIRECTORY']}")
  
diff --git a/plot_xiaoqu.py b/plot_xiaoqu.py
index 47970cf..32ac1c0 100755
--- a/plot_xiaoqu.py
+++ b/plot_xiaoqu.py
@@ -808,10 +808,10 @@ def generate_ADMmass_plot( outdir, figure_outdir, detector_number_i ):
 
 ## Plot constraint violation for each grid level
 
-def generate_constraint_check_plot( outdir, figure_outdir, input_level_number ):
-
-    # path to data file
-    file0 = os.path.join(outdir, "bssn_constraint.dat")
+def generate_constraint_check_plot( outdir, figure_outdir, input_level_number ):
+
+    # path to data file
+    file0 = os.path.join(outdir, "bssn_constraint.dat")
 
     if ( input_level_number == 0 ):
         print(                                                   )
@@ -819,13 +819,26 @@ def generate_constraint_check_plot( outdir, figure_outdir, input_level_number ):
         print(                                                          )
         print( " corresponding data file = ", file0 )
         print(                                      )
-
-    print( " Begin the constraint violation plot for grid level number =  ", input_level_number )
-    
-    # load the full data file (assumed whitespace-separated floats)
-    data = numpy.loadtxt(file0)
-    
-    # extract columns from the constraint data file
+
+    print( " Begin the constraint violation plot for grid level number =  ", input_level_number )
+
+    if (not os.path.exists(file0)) or os.path.getsize(file0) == 0:
+        if ( input_level_number == 0 ):
+            print( " Constraint data file is empty; skip constraint violation plots" )
+            print(                                                            )
+        return
+
+    # load the full data file (assumed whitespace-separated floats)
+    data = numpy.loadtxt(file0)
+    data = numpy.atleast_2d(data)
+
+    if data.shape[1] < 8:
+        if ( input_level_number == 0 ):
+            print( " Constraint data file has insufficient columns; skip constraint violation plots" )
+            print(                                                                                )
+        return
+
+    # extract columns from the constraint data file
     time          = data[:,0]
     Constraint_H  = data[:,1]
     Constraint_Px = data[:,2]