黄老板逆天重写

2026-03-01 05:48:40 +08:00
61 changed files with 99832 additions and 83012 deletions
--- a/2.txt
+++ b/2.txt
--- a/AMSS_NCKU_Input.py
+++ b/AMSS_NCKU_Input.py
@@ -16,7 +16,7 @@ import numpy
 File_directory   = "GW150914"                    ## output file directory
 Output_directory = "binary_output"               ## binary data file directory
                                                 ## The file directory name should not be too long
-MPI_processes    = 64                             ## number of mpi processes used in the simulation
+MPI_processes    = 2                          ## number of mpi processes used in the simulation
 GPU_Calculation  = "no"                          ## Use GPU or not 
                                                 ## (prefer "no" in the current version, because the GPU part may have bugs when integrated in this Python interface)
@@ -50,7 +50,7 @@ Check_Time               = 100.0
 Dump_Time                = 100.0                  ## time inteval dT for dumping binary data
 D2_Dump_Time             = 100.0                  ## dump the ascii data for 2d surface after dT'
 Analysis_Time            = 0.1                    ## dump the puncture position and GW psi4 after dT"
-Evolution_Step_Number    = 10000000               ## stop the calculation after the maximal step number
+Evolution_Step_Number    = 6               ## stop the calculation after the maximal step number
 Courant_Factor           = 0.5                    ## Courant Factor
 Dissipation              = 0.15                   ## Kreiss-Oliger Dissipation Strength
--- a/AMSS_NCKU_Program.py
+++ b/AMSS_NCKU_Program.py
@@ -8,14 +8,6 @@
 ##
 ##################################################################
 ## Guard against re-execution by multiprocessing child processes.
 ## Without this, using 'spawn' or 'forkserver' context would cause every
 ## worker to re-run the entire script, spawning exponentially more
 ## workers (fork bomb).
 if __name__ != '__main__':
    import sys as _sys
    _sys.exit(0)
 ##################################################################
@@ -57,33 +49,32 @@ import time
 File_directory = os.path.join(input_data.File_directory)   
 ## If the specified output directory exists, ask the user whether to continue
-if os.path.exists(File_directory):
+# if os.path.exists(File_directory):
-    print( " Output dictionary has been existed !!!  "                                                              )
+#     print( " Output dictionary has been existed !!!  "                                                              )
-    print( " If you want to overwrite the existing file directory, please input 'continue' in the terminal !! "     ) 
+#     print( " If you want to overwrite the existing file directory, please input 'continue' in the terminal !! "     ) 
-    print( " If you want to retain the existing file directory, please input 'stop' in the terminal to stop the "   ) 
+#     print( " If you want to retain the existing file directory, please input 'stop' in the terminal to stop the "   ) 
-    print( " simulation. Then you can reset the output dictionary in the input script file AMSS_NCKU_Input.py !!! " )
+#     print( " simulation. Then you can reset the output dictionary in the input script file AMSS_NCKU_Input.py !!! " )
-    print(                                                                                                          )
+#     print(                                                                                                          )
-    ## Prompt whether to overwrite the existing directory
+#     ## Prompt whether to overwrite the existing directory
-    while True:
+#     while True:
-        try:
+#         try:
-            ## inputvalue = input()
+#             inputvalue = input()
-            inputvalue = "continue"
+#             ## If the user agrees to overwrite, proceed and remove the existing directory
-            ## If the user agrees to overwrite, proceed and remove the existing directory
+#             if ( inputvalue == "continue" ):
-            if ( inputvalue == "continue" ):
+#                 print( " Continue the calculation !!! " )
-                print( " Continue the calculation !!! " )
+#                 print(                                  )
-                print(                                  )
+#                 break  
-                break  
+#             ## If the user chooses not to overwrite, exit and keep the existing directory
-            ## If the user chooses not to overwrite, exit and keep the existing directory
+#             elif ( inputvalue == "stop" ):
-            elif ( inputvalue == "stop" ):
+#                 print( " Stop the calculation !!! "    )
-                print( " Stop the calculation !!! "    )
+#                 sys.exit() 
-                sys.exit() 
+#             ## If the user input is invalid, prompt again
-            ## If the user input is invalid, prompt again
+#             else:
-            else:
+#                 print( " Please input your choice !!! "                   )
-                print( " Please input your choice !!! "                   )
+#                 print( " Input 'continue' or 'stop' in the terminal !!! " )
-                print( " Input 'continue' or 'stop' in the terminal !!! " )
+#         except ValueError:
-        except ValueError:
+#             print( " Please input your choice !!! "                   )
-            print( " Please input your choice !!! "                   )
+#             print( " Input 'continue' or 'stop' in the terminal !!! " )
            print( " Input 'continue' or 'stop' in the terminal !!! " )
 ## Remove the existing output directory if present
 shutil.rmtree(File_directory, ignore_errors=True)
@@ -433,31 +424,26 @@ print(
 import plot_xiaoqu
 import plot_GW_strain_amplitude_xiaoqu
 from parallel_plot_helper import run_plot_tasks_parallel
 plot_tasks = []
 ## Plot black hole trajectory
-plot_tasks.append( ( plot_xiaoqu.generate_puncture_orbit_plot,   (binary_results_directory, figure_directory) ) )
+plot_xiaoqu.generate_puncture_orbit_plot(   binary_results_directory, figure_directory )
-plot_tasks.append( ( plot_xiaoqu.generate_puncture_orbit_plot3D, (binary_results_directory, figure_directory) ) )
+plot_xiaoqu.generate_puncture_orbit_plot3D( binary_results_directory, figure_directory )
 ## Plot black hole separation vs. time
-plot_tasks.append( ( plot_xiaoqu.generate_puncture_distence_plot, (binary_results_directory, figure_directory) ) )
+plot_xiaoqu.generate_puncture_distence_plot( binary_results_directory, figure_directory )
 ## Plot gravitational waveforms (psi4 and strain amplitude)
 for i in range(input_data.Detector_Number):
-    plot_tasks.append( ( plot_xiaoqu.generate_gravitational_wave_psi4_plot, (binary_results_directory, figure_directory, i) ) )
+    plot_xiaoqu.generate_gravitational_wave_psi4_plot( binary_results_directory, figure_directory, i )
-    plot_tasks.append( ( plot_GW_strain_amplitude_xiaoqu.generate_gravitational_wave_amplitude_plot, (binary_results_directory, figure_directory, i) ) )
+    plot_GW_strain_amplitude_xiaoqu.generate_gravitational_wave_amplitude_plot( binary_results_directory, figure_directory, i )
 ## Plot ADM mass evolution
 for i in range(input_data.Detector_Number):
-    plot_tasks.append( ( plot_xiaoqu.generate_ADMmass_plot, (binary_results_directory, figure_directory, i) ) )
+    plot_xiaoqu.generate_ADMmass_plot( binary_results_directory, figure_directory, i )
 ## Plot Hamiltonian constraint violation over time
 for i in range(input_data.grid_level):
-    plot_tasks.append( ( plot_xiaoqu.generate_constraint_check_plot, (binary_results_directory, figure_directory, i) ) )
+    plot_xiaoqu.generate_constraint_check_plot( binary_results_directory, figure_directory, i )
 run_plot_tasks_parallel(plot_tasks)
 ## Plot stored binary data
 plot_xiaoqu.generate_binary_data_plot( binary_results_directory, figure_directory )
--- a/AMSS_NCKU_source/ABE.C
+++ b/AMSS_NCKU_source/ABE.C
@@ -24,7 +24,7 @@ using namespace std;
 #include "misc.h"
 #include "macrodef.h"
-
+#include <omp.h>
 #ifndef ABEtype
 #error "not define ABEtype"
 #endif
@@ -71,6 +71,7 @@ int main(int argc, char *argv[])
      if (myrank == 0)
      {     
            Begin_clock = MPI_Wtime();
      }
      if (argc > 1)
--- a/AMSS_NCKU_source/MPatch.C
+++ b/AMSS_NCKU_source/MPatch.C
@@ -13,7 +13,7 @@ using namespace std;
 #include "MPatch.h"
 #include "Parallel.h"
 #include "fmisc.h"
-
+#include "xh_global_interp.h"
 Patch::Patch(int DIM, int *shapei, double *bboxi, int levi, bool buflog, int Symmetry) : lev(levi)
 {
@@ -394,7 +394,6 @@ void Patch::Interp_Points(MyList<var> *VarList,
    while (notfind && Bp) // run along Blocks
    {
      Block *BP = Bp->data;
      bool flag = true;
      for (int i = 0; i < dim; i++)
      {
@@ -430,8 +429,10 @@ void Patch::Interp_Points(MyList<var> *VarList,
          int k = 0;
          while (varl) // run along variables
          {
-            f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
+
            xh_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
                            pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
            varl = varl->next;
            k++;
          }
@@ -441,7 +442,9 @@ void Patch::Interp_Points(MyList<var> *VarList,
        break;
      Bp = Bp->next;
    }
  }
  // Replace MPI_Allreduce with per-owner MPI_Bcast:
  // Group consecutive points by owner rank and broadcast each group.
  // Since each point's data is non-zero only on the owner rank,
@@ -506,13 +509,11 @@ void Patch::Interp_Points(MyList<var> *VarList,
  // Targeted point-to-point overload: each owner sends each point only to
  // the one rank that needs it for integration (consumer), reducing
  // communication volume by ~nprocs times compared to the Bcast version.
  /*
  double t_calc_end, t_calc_total = 0;
  double t_calc_start = MPI_Wtime();*/
  int myrank, nprocs;
  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
  MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
-
+// printf("here----\n");
  // int zzz = 0;
  int ordn = 2 * ghost_width;
  MyList<var> *varl;
  int num_var = 0;
@@ -531,30 +532,35 @@ void Patch::Interp_Points(MyList<var> *VarList,
  for (int j = 0; j < NN; j++)
    owner_rank[j] = -1;
-  double DH[dim], llb[dim], uub[dim];
+  double DH[dim];
  for (int i = 0; i < dim; i++)
    DH[i] = getdX(i);
  // --- Interpolation phase (identical to original) ---
  // printf("NN: %d, num_var = %d\n", NN, num_var);
  #pragma omp parallel
  {
  #pragma omp for
  for (int j = 0; j < NN; j++)
  {
-    double pox[dim];
+    double pox[dim], llb[dim], uub[dim];
    MyList<var> *varl1;
    for (int i = 0; i < dim; i++)
    {
      pox[i] = XX[i][j];
-      if (myrank == 0 && (XX[i][j] < bbox[i] + lli[i] * DH[i] || XX[i][j] > bbox[dim + i] - uui[i] * DH[i]))
+      // if (myrank == 0 && (XX[i][j] < bbox[i] + lli[i] * DH[i] || XX[i][j] > bbox[dim + i] - uui[i] * DH[i]))
-      {
+      // {
-        cout << "Patch::Interp_Points: point (";
+      //   cout << "Patch::Interp_Points: point (";
-        for (int k = 0; k < dim; k++)
+      //   for (int k = 0; k < dim; k++)
-        {
+      //   {
-          cout << XX[k][j];
+      //     cout << XX[k][j];
-          if (k < dim - 1)
+      //     if (k < dim - 1)
-            cout << ",";
+      //       cout << ",";
-          else
+      //     else
-            cout << ") is out of current Patch." << endl;
+      //       cout << ") is out of current Patch." << endl;
-        }
+      //   }
-        MPI_Abort(MPI_COMM_WORLD, 1);
+      //   MPI_Abort(MPI_COMM_WORLD, 1);
-      }
+      // }
    }
    MyList<Block> *Bp = blb;
@@ -586,21 +592,23 @@ void Patch::Interp_Points(MyList<var> *VarList,
          break;
        }
      }
-
+      // printf("flag = %d\n", flag);
      if (flag)
      {
        notfind = false;
        owner_rank[j] = BP->rank;
        if (myrank == BP->rank)
        {
-          varl = VarList;
+          varl1 = VarList;
          int k = 0;
-          while (varl)
+          while (varl1)
          {
-            f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
+            
-                            pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
+            xh_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl1->data->sgfn], Shellf[j * num_var + k],
-            varl = varl->next;
+                            pox[0], pox[1], pox[2], ordn, varl1->data->SoA, Symmetry);
            varl1 = varl1->next;
            k++;
            // zzz += 1;
          }
        }
      }
@@ -609,9 +617,8 @@ void Patch::Interp_Points(MyList<var> *VarList,
      Bp = Bp->next;
    }
  }
-  /*
+  }
-      t_calc_end = MPI_Wtime();
+  // printf("Interpolation done, zzz = %d\n", zzz);
      t_calc_total = t_calc_end - t_calc_start;*/
  // --- Error check for unfound points ---
  for (int j = 0; j < NN; j++)
  {
@@ -768,63 +775,6 @@ void Patch::Interp_Points(MyList<var> *VarList,
  delete[] recv_count;
  delete[] consumer_rank;
  delete[] owner_rank;
  /*
  // 4. 汇总并输出真正干活最慢的 Top 4
  struct RankStats {
    int rank;
    double calc_time; // 净计算时间
  };
  // 创建当前进程的统计数据
  RankStats local_stat;
  local_stat.rank = myrank;
  local_stat.calc_time = t_calc_total;
  // 为所有进程的统计数据分配内存
  RankStats *all_stats = nullptr;
  if (myrank == 0) {
    all_stats = new RankStats[nprocs];
  }
  // 使用MPI_Gather收集所有进程的数据到rank 0
  MPI_Gather(&local_stat, sizeof(RankStats), MPI_BYTE,
             all_stats, sizeof(RankStats), MPI_BYTE,
             0, MPI_COMM_WORLD);
  // 准备输出前4个rank的信息（所有rank都参与，确保广播后一致）
  int top10_ranks[10] = { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 };
  double top10_times[10] = { 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 };
  int num_top10 = 0;
  if (myrank == 0) {
    // 按 calc_time（净计算时间）排序
    std::sort(all_stats, all_stats + nprocs, [](const RankStats& a, const RankStats& b) {
        return a.calc_time > b.calc_time;
    });
    // 取前4个
    num_top10 = (nprocs < 10) ? nprocs : 10;
    for (int i = 0; i < num_top10; i++) {
      top10_ranks[i] = all_stats[i].rank;
      top10_times[i] = all_stats[i].calc_time;
    }
    printf("\n--- Top %d Ranks by ACTIVE COMPUTATION (CPU Time) ---\n", num_top10);
    for (int i = 0; i < num_top10; i++) {
      printf("Rank [%4d]: Calc %.6f s\n", top10_ranks[i], top10_times[i]);
    }
    // 清理分配的内存
    delete[] all_stats;
  }
  // 广播前4个rank的信息给所有进程
  MPI_Bcast(&num_top10, 1, MPI_INT, 0, MPI_COMM_WORLD);
  if (num_top10 > 0) {
    MPI_Bcast(top10_ranks, 10, MPI_INT, 0, MPI_COMM_WORLD);
    MPI_Bcast(top10_times, 10, MPI_DOUBLE, 0, MPI_COMM_WORLD);
  }
 */
 }
 void Patch::Interp_Points(MyList<var> *VarList,
                          int NN, double **XX,
@@ -834,7 +784,6 @@ void Patch::Interp_Points(MyList<var> *VarList,
  int myrank, lmyrank;
  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
  MPI_Comm_rank(Comm_here, &lmyrank);
  int ordn = 2 * ghost_width;
  MyList<var> *varl;
  int num_var = 0;
@@ -924,7 +873,7 @@ void Patch::Interp_Points(MyList<var> *VarList,
          int k = 0;
          while (varl) // run along variables
          {
-            f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
+            xh_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
                            pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
            varl = varl->next;
            k++;
@@ -1156,7 +1105,7 @@ bool Patch::Interp_ONE_Point(MyList<var> *VarList, double *XX,
        {
          //              shellf[j*num_var+k] = Parallel::global_interp(dim,BP->shape,BP->X,BP->fgfs[varl->data->sgfn],
          //	  		                                    pox,ordn,varl->data->SoA,Symmetry);
-          f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], shellf[k],
+          xh_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], shellf[k],
                          pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
          varl = varl->next;
          k++;
@@ -1398,7 +1347,7 @@ bool Patch::Interp_ONE_Point(MyList<var> *VarList, double *XX,
        {
          //              shellf[j*num_var+k] = Parallel::global_interp(dim,BP->shape,BP->X,BP->fgfs[varl->data->sgfn],
          //	  		                                    pox,ordn,varl->data->SoA,Symmetry);
-          f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], shellf[k],
+          xh_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], shellf[k],
                          pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
          varl = varl->next;
          k++;
--- a/AMSS_NCKU_source/NullShellPatch.h
+++ b/AMSS_NCKU_source/NullShellPatch.h
@@ -24,7 +24,6 @@ using namespace std;
 #endif
 #include <mpi.h>
 #include <memory.h>
 #include "MyList.h"
 #include "Block.h"
 #include "Parallel.h"
--- a/AMSS_NCKU_source/Parallel.C
+++ b/AMSS_NCKU_source/Parallel.C
@@ -4,8 +4,7 @@
 #include "prolongrestrict.h"
 #include "misc.h"
 #include "parameters.h"
-#include <set>
+#include <omp.h>
 int Parallel::partition1(int &nx, int split_size, int min_width, int cpusize, int shape) // special for 1 diemnsion
 {
  nx = Mymax(1, shape / min_width);
@@ -116,7 +115,7 @@ int Parallel::partition3(int *nxyz, int split_size, int *min_width, int cpusize,
  return nx * ny * nz;
 #undef SEARCH_SIZE
 }
-#elif 0 // Zhihui's idea one on 2013-09-25
+#elif 1 // Zhihui's idea one on 2013-09-25
 {
  int nx, ny, nz;
  int hmin_width;
@@ -151,7 +150,7 @@ int Parallel::partition3(int *nxyz, int split_size, int *min_width, int cpusize,
  return nx * ny * nz;
 }
-#elif 0 // Zhihui's idea two on 2013-09-25
+#elif 1 // Zhihui's idea two on 2013-09-25
 {
  int nx, ny, nz;
  const int hmin_width = 8; // for example we use 8
@@ -501,428 +500,6 @@ MyList<Block> *Parallel::distribute(MyList<Patch> *PatchLIST, int cpusize, int i
  return BlL;
 }
 // Cut every Patch in PatchLIST into blocks (grid chosen by partition3) and
 // collect them in a newly built block list, which is returned.
 //
 // Blocks are numbered by a running counter (current_block_id) that keeps
 // counting across patches. Ids 27, 28, 35 and 36 are treated as hotspots:
 // each is handed to splitHotspotBlock together with a hard-coded pair of
 // rank values (r_l, r_r). Every other block is created by createMappedBlock,
 // which receives the running id.
 //
 // Side effects: sets PP->blb / PP->ble of every patch to the list nodes of its
 // first / last block; under USE_GPU_DIVIDE it may also read the parameter file
 // and insert "cpu part" / "gpu part" into parameters::dou_par.
 //
 //   PatchLIST  patches to distribute; the first node is dereferenced
 //              unconditionally, so the list must not be empty
 //   cpusize    only used as the default for nodes when nodes == 0
 //   ingfsi, fngfsi  forwarded unchanged to the block-creation helpers
 //   periodic   when true the ghost expansion is not clamped to the patch
 //   nodes      divisor used for split_size; 0 selects the default
 MyList<Block> *Parallel::distribute_hard(MyList<Patch> *PatchLIST, int cpusize, int ingfsi, int fngfsi,
                                    bool periodic, int nodes)
 {
 #ifdef USE_GPU_DIVIDE
  double cpu_part, gpu_part;
  map<string, double>::iterator iter;
  // Look up "cpu part" in the parameter cache; on a miss, parse it from the
  // input parameter file and cache it.
  iter = parameters::dou_par.find("cpu part");
  if (iter != parameters::dou_par.end())
  {
    cpu_part = iter->second;
  }
  else
  {
    int myrank;
    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
    // read parameter from file
    const int LEN = 256;
    char pline[LEN];
    string str, sgrp, skey, sval;
    int sind;
    char pname[50];
    {
      map<string, string>::iterator iter = parameters::str_par.find("inputpar");
      if (iter != parameters::str_par.end())
      {
        // NOTE(review): pname holds 50 chars and strcpy does not bound the copy.
        strcpy(pname, (iter->second).c_str());
      }
      else
      {
        cout << "Error inputpar" << endl;
        exit(0);
      }
    }
    ifstream inf(pname, ifstream::in);
    if (!inf.good() && myrank == 0)
    {
      cout << "Can not open parameter file " << pname << endl;
      MPI_Abort(MPI_COMM_WORLD, 1);
    }
    for (int i = 1; inf.good(); i++)
    {
      inf.getline(pline, LEN);
      str = pline;
      int status = misc::parse_parts(str, sgrp, skey, sval, sind);
      if (status == -1)
      {
        cout << "error reading parameter file " << pname << " in line " << i << endl;
        MPI_Abort(MPI_COMM_WORLD, 1);
      }
      else if (status == 0)
        continue;
      if (sgrp == "ABE")
      {
        if (skey == "cpu part")
          cpu_part = atof(sval.c_str());
      }
    }
    inf.close();
    // NOTE(review): cpu_part has no initializer; if the file has no
    // ABE "cpu part" entry, an unassigned value is inserted here.
    parameters::dou_par.insert(map<string, double>::value_type("cpu part", cpu_part));
  }
  // Same lookup-or-parse sequence for "gpu part".
  iter = parameters::dou_par.find("gpu part");
  if (iter != parameters::dou_par.end())
  {
    gpu_part = iter->second;
  }
  else
  {
    int myrank;
    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
    // read parameter from file
    const int LEN = 256;
    char pline[LEN];
    string str, sgrp, skey, sval;
    int sind;
    char pname[50];
    {
      map<string, string>::iterator iter = parameters::str_par.find("inputpar");
      if (iter != parameters::str_par.end())
      {
        // NOTE(review): same unbounded copy into a 50-char buffer as above.
        strcpy(pname, (iter->second).c_str());
      }
      else
      {
        cout << "Error inputpar" << endl;
        exit(0);
      }
    }
    ifstream inf(pname, ifstream::in);
    if (!inf.good() && myrank == 0)
    {
      cout << "Can not open parameter file " << pname << endl;
      MPI_Abort(MPI_COMM_WORLD, 1);
    }
    for (int i = 1; inf.good(); i++)
    {
      inf.getline(pline, LEN);
      str = pline;
      int status = misc::parse_parts(str, sgrp, skey, sval, sind);
      if (status == -1)
      {
        cout << "error reading parameter file " << pname << " in line " << i << endl;
        MPI_Abort(MPI_COMM_WORLD, 1);
      }
      else if (status == 0)
        continue;
      if (sgrp == "ABE")
      {
        if (skey == "gpu part")
          gpu_part = atof(sval.c_str());
      }
    }
    inf.close();
    // NOTE(review): gpu_part may be unassigned here for the same reason as cpu_part.
    parameters::dou_par.insert(map<string, double>::value_type("gpu part", gpu_part));
  }
  // Default node count: half of cpusize in this build configuration.
  if (nodes == 0)
    nodes = cpusize / 2;
 #else
  // Default node count: cpusize.
  if (nodes == 0)
    nodes = cpusize;
 #endif
  if (dim != 3)
  {
    cout << "distrivute: now we only support 3-dimension" << endl;
    MPI_Abort(MPI_COMM_WORLD, 1);
  }
  MyList<Block> *BlL = 0;
  int split_size, min_size, block_size = 0;
  // Smallest allowed block width: twice the larger of ghost_width and buffer_width.
  int min_width = 2 * Mymax(ghost_width, buffer_width);
  int nxyz[dim], mmin_width[dim], min_shape[dim];
  MyList<Patch> *PLi = PatchLIST;
  // min_shape[i] = smallest extent in direction i over all patches.
  for (int i = 0; i < dim; i++)
    min_shape[i] = PLi->data->shape[i];
  int lev = PLi->data->lev;
  PLi = PLi->next;
  while (PLi)
  {
    Patch *PP = PLi->data;
    for (int i = 0; i < dim; i++)
      min_shape[i] = Mymin(min_shape[i], PP->shape[i]);
    // Only a message is printed when patches of different levels are mixed.
    if (lev != PLi->data->lev)
      cout << "Parallel::distribute CAUSTION: meet Patches for different level: " << lev << " and " << PLi->data->lev << endl;
    PLi = PLi->next;
  }
  // Cap the minimum width by the smallest patch extent in each direction.
  for (int i = 0; i < dim; i++)
    mmin_width[i] = Mymin(min_width, min_shape[i]);
  // min_size = point count of the smallest allowed block.
  min_size = mmin_width[0];
  for (int i = 1; i < dim; i++)
    min_size = min_size * mmin_width[i];
  // block_size = total point count of all patches.
  PLi = PatchLIST;
  while (PLi)
  {
    Patch *PP = PLi->data;
    //    PP->checkPatch(true);
    int bs = PP->shape[0];
    for (int i = 1; i < dim; i++)
      bs = bs * PP->shape[i];
    block_size = block_size + bs;
    PLi = PLi->next;
  }
  // Target points per block: an even share of all points, never below min_size or 1.
  split_size = Mymax(min_size, block_size / nodes);
  split_size = Mymax(1, split_size);
  // NOTE(review): n_rank is never read in this function.
  int n_rank = 0;
  PLi = PatchLIST;
  int reacpu = 0;           // sum of the values returned by partition3 over all patches
  int current_block_id = 0; // running block counter, not reset between patches
  while (PLi) {
    Block *ng0, *ng;
    bool first_block_in_patch = true; 
    Patch *PP = PLi->data;
    // partition3 fills nxyz with the block grid used by the loops below.
    reacpu += partition3(nxyz, split_size, mmin_width, nodes, PP->shape);
    for (int i = 0; i < nxyz[0]; i++)
    for (int j = 0; j < nxyz[1]; j++)
    for (int k = 0; k < nxyz[2]; k++)
    {
        // --- 1. Declare local variables ---
        int ibbox_here[6], shape_here[3];
        double bbox_here[6], dd;
        Block *current_ng_start = nullptr; // first (or only) block produced in this iteration
        // --- 2. Main branch ---
        // NOTE(review): the hotspot ids below are hard-coded constants.
        if (current_block_id == 27 || current_block_id == 28 ||
            current_block_id == 35 || current_block_id == 36)
        {
            // A. Compute the raw index range (without ghost zones)
            int ib0 = (PP->shape[0] * i) / nxyz[0];
            int ib3 = (PP->shape[0] * (i + 1)) / nxyz[0] - 1;
            int jb1 = (PP->shape[1] * j) / nxyz[1];
            int jb4 = (PP->shape[1] * (j + 1)) / nxyz[1] - 1;
            int kb2 = (PP->shape[2] * k) / nxyz[2];
            int kb5 = (PP->shape[2] * (k + 1)) / nxyz[2] - 1;
            // Rank pair handed to splitHotspotBlock for this hotspot id.
            int r_l, r_r;
            if(current_block_id == 27)      { r_l = 26; r_r = 27; }
            else if(current_block_id == 28) { r_l = 28; r_r = 29; }
            else if(current_block_id == 35) { r_l = 34; r_r = 35; }
            else                            { r_l = 36; r_r = 37; }
            Block * split_first_block = nullptr;
            Block * split_last_block = nullptr; 
            // Split: the call fills split_first_block and split_last_block
            // through its reference parameters.
            splitHotspotBlock(BlL, dim, ib0, ib3, jb1, jb4, kb2, kb5, 
                              PP, r_l, r_r, ingfsi, fngfsi, periodic,split_first_block,split_last_block);
            current_ng_start = split_first_block;
            ng = split_last_block; 
        }
        else 
        {
            // B. Ordinary block (with ghost-zone expansion)
            ibbox_here[0] = (PP->shape[0] * i) / nxyz[0];
            ibbox_here[3] = (PP->shape[0] * (i + 1)) / nxyz[0] - 1;
            ibbox_here[1] = (PP->shape[1] * j) / nxyz[1];
            ibbox_here[4] = (PP->shape[1] * (j + 1)) / nxyz[1] - 1;
            ibbox_here[2] = (PP->shape[2] * k) / nxyz[2];
            ibbox_here[5] = (PP->shape[2] * (k + 1)) / nxyz[2] - 1;
            if (periodic) {
                // Periodic: widen by ghost_width on both sides without clamping.
                for(int d=0; d<3; d++) {
                    ibbox_here[d] -= ghost_width;
                    ibbox_here[d+3] += ghost_width;
                }
            } else {
                // Non-periodic: widen by ghost_width but stay inside [0, shape-1].
                ibbox_here[0] = Mymax(0, ibbox_here[0] - ghost_width);
                ibbox_here[3] = Mymin(PP->shape[0] - 1, ibbox_here[3] + ghost_width);
                ibbox_here[1] = Mymax(0, ibbox_here[1] - ghost_width);
                ibbox_here[4] = Mymin(PP->shape[1] - 1, ibbox_here[4] + ghost_width);
                ibbox_here[2] = Mymax(0, ibbox_here[2] - ghost_width);
                ibbox_here[5] = Mymin(PP->shape[2] - 1, ibbox_here[5] + ghost_width);
            }
            for(int d=0; d<3; d++) shape_here[d] = ibbox_here[d+3] - ibbox_here[d] + 1;
            // Physical coordinates (formula depends on the Cell/Vertex macro)
 #ifdef Vertex
 #ifdef Cell
 #error Both Cell and Vertex are defined
 #endif
          // 0--4, 5--10
          dd = (PP->bbox[3] - PP->bbox[0]) / (PP->shape[0] - 1);
          bbox_here[0] = PP->bbox[0] + ibbox_here[0] * dd;
          bbox_here[3] = PP->bbox[0] + ibbox_here[3] * dd;
          dd = (PP->bbox[4] - PP->bbox[1]) / (PP->shape[1] - 1);
          bbox_here[1] = PP->bbox[1] + ibbox_here[1] * dd;
          bbox_here[4] = PP->bbox[1] + ibbox_here[4] * dd;
          dd = (PP->bbox[5] - PP->bbox[2]) / (PP->shape[2] - 1);
          bbox_here[2] = PP->bbox[2] + ibbox_here[2] * dd;
          bbox_here[5] = PP->bbox[2] + ibbox_here[5] * dd;
 #else
 #ifdef Cell
          // 0--5, 5--10
          dd = (PP->bbox[3] - PP->bbox[0]) / PP->shape[0];
          bbox_here[0] = PP->bbox[0] + (ibbox_here[0]) * dd;
          bbox_here[3] = PP->bbox[0] + (ibbox_here[3] + 1) * dd;
          dd = (PP->bbox[4] - PP->bbox[1]) / PP->shape[1];
          bbox_here[1] = PP->bbox[1] + (ibbox_here[1]) * dd;
          bbox_here[4] = PP->bbox[1] + (ibbox_here[4] + 1) * dd;
          dd = (PP->bbox[5] - PP->bbox[2]) / PP->shape[2];
          bbox_here[2] = PP->bbox[2] + (ibbox_here[2]) * dd;
          bbox_here[5] = PP->bbox[2] + (ibbox_here[5] + 1) * dd;
 #else
 #error Not define Vertex nor Cell
 #endif
 #endif
            ng = createMappedBlock(BlL, dim, shape_here, bbox_here, current_block_id, ingfsi, fngfsi, PP->lev);
            current_ng_start = ng;
        }
        // --- 3. Record the patch's first block pointer ---
        if (first_block_in_patch) {
            ng0 = current_ng_start;
            // Set PP->blb right away so later iterations cannot overwrite ng0
            MyList<Block> *Bp_start = BlL;
            while (Bp_start && Bp_start->data != ng0) Bp_start = Bp_start->next;
            PP->blb = Bp_start;
            first_block_in_patch = false;
        }
        current_block_id++;
    }
    // --- 4. Record the patch's last block pointer ---
    // NOTE(review): ng is assigned only inside the loops above; it is read
    // here even if no iteration ran.
    MyList<Block> *Bp_end = BlL;
    while (Bp_end && Bp_end->data != ng) Bp_end = Bp_end->next;
    PP->ble = Bp_end;
    PLi = PLi->next;
    first_block_in_patch = true; 
  }
  // Rank 0 prints a notice when the partition count is below two thirds of nodes.
  if (reacpu < nodes * 2 / 3)
  {
    int myrank;
    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
    if (myrank == 0)
      cout << "Parallel::distribute CAUSTION: level#" << lev << " uses essencially " << reacpu << " processors vs " << nodes << " nodes run, your scientific computation scale is not as large as you estimate." << endl;
  }
  return BlL;
 }
 /**
  * @brief Split one block in two along the x index axis and append both
  *        halves to the block list.
  *
  * The un-ghosted x range [ib0_orig, ib3_orig] is cut at its midpoint: the
  * left half covers [ib0_orig, mid], the right half [mid + 1, ib3_orig].
  * The y and z ranges are shared by both halves. Each half is widened by
  * ghost_width (clamped to the patch unless @p periodic is true), converted
  * to physical coordinates and appended to @p BlL as a new Block.
  *
  * @param BlL       block list; created here when it is still null
  * @param _dim      unused, kept for interface compatibility (the Block
  *                  constructor receives the file-level dim)
  * @param ib0_orig  first x index of the block, without ghost zones
  * @param ib3_orig  last x index of the block, without ghost zones
  * @param jb1_orig  first y index, without ghost zones
  * @param jb4_orig  last y index, without ghost zones
  * @param kb2_orig  first z index, without ghost zones
  * @param kb5_orig  last z index, without ghost zones
  * @param PP        patch the block belongs to (shape, bbox and lev are read)
  * @param r_left    rank argument passed to the Block constructor of the left half
  * @param r_right   rank argument passed to the Block constructor of the right half
  * @param ingfsi    forwarded to the Block constructor
  * @param fngfsi    forwarded to the Block constructor
  * @param periodic  when true the ghost expansion is not clamped to the patch
  * @param split_first_block  out: the left half
  * @param split_last_block   out: the right half
  * @return the left half (same pointer as @p split_first_block). The original
  *         body had no return statement although the function returns Block*.
  */
 Block* Parallel::splitHotspotBlock(MyList<Block>* &BlL, int _dim, 
                                 int ib0_orig, int ib3_orig, 
                                 int jb1_orig, int jb4_orig, 
                                 int kb2_orig, int kb5_orig, 
                                 Patch* PP, int r_left, int r_right, 
                                 int ingfsi, int fngfsi, bool periodic,
                                 Block* &split_first_block, Block* &split_last_block)
 {
     // 1. Bisect the raw (ghost-free) x index range.
     const int mid = (ib0_orig + ib3_orig) / 2;
     // Index layout: {x_lo, y_lo, z_lo, x_hi, y_hi, z_hi}.
     int indices_L[6] = {ib0_orig, jb1_orig, kb2_orig, mid, jb4_orig, kb5_orig};
     int indices_R[6] = {mid + 1, jb1_orig, kb2_orig, ib3_orig, jb4_orig, kb5_orig};
     // 2. Build one half: ghost expansion, shape, physical bbox, list insertion.
     auto createSubBlock = [&](int* ib_raw, int target_rank) -> Block* {
         int ib_final[6];
         int sh_here[3];
         double bb_here[6];
         for (int d = 0; d < 3; d++) {
             // Ghost expansion; non-periodic patches clamp to [0, shape - 1].
             if (periodic) {
                 ib_final[d]     = ib_raw[d] - ghost_width;
                 ib_final[d + 3] = ib_raw[d + 3] + ghost_width;
             } else {
                 ib_final[d]     = Mymax(0, ib_raw[d] - ghost_width);
                 ib_final[d + 3] = Mymin(PP->shape[d] - 1, ib_raw[d + 3] + ghost_width);
             }
             sh_here[d] = ib_final[d + 3] - ib_final[d] + 1;
             // Physical coordinates, using the same Vertex/Cell formulas as
             // the ordinary-block branch of distribute_hard.
 #ifdef Vertex
             const double dd = (PP->bbox[d + 3] - PP->bbox[d]) / (PP->shape[d] - 1);
             bb_here[d]     = PP->bbox[d] + ib_final[d] * dd;
             bb_here[d + 3] = PP->bbox[d] + ib_final[d + 3] * dd;
 #else
             const double dd = (PP->bbox[d + 3] - PP->bbox[d]) / PP->shape[d];
             bb_here[d]     = PP->bbox[d] + ib_final[d] * dd;
             bb_here[d + 3] = PP->bbox[d] + (ib_final[d + 3] + 1) * dd;
 #endif
         }
         Block* Bg = new Block(dim, sh_here, bb_here, target_rank, ingfsi, fngfsi, PP->lev);
         if (BlL) BlL->insert(Bg);
         else     BlL = new MyList<Block>(Bg);
         return Bg;
     };
     // 3. Create the left half first so it precedes the right half in the list.
     split_first_block = createSubBlock(indices_L, r_left);
     split_last_block  = createSubBlock(indices_R, r_right);
     return split_first_block;
 }
 /**
  * @brief Allocate a Block for the given block id and append it to the list.
  *
  * The rank argument passed to the Block constructor equals @p block_id,
  * except for four remapped ids: 26 -> 25, 29 -> 30, 34 -> 33, 37 -> 38.
  *
  * @param BlL       block list; created here when it is still null
  * @param _dim      unused (the Block constructor receives the file-level dim)
  * @param shape     forwarded to the Block constructor
  * @param bbox      forwarded to the Block constructor
  * @param block_id  id of the block, used to select the rank argument
  * @param ingfsi    forwarded to the Block constructor
  * @param fngfsi    forwarded to the Block constructor
  * @param lev       forwarded to the Block constructor
  * @return the newly allocated Block
  */
  Block* Parallel::createMappedBlock(MyList<Block>* &BlL, int _dim, int* shape, double* bbox,
                        int block_id, int ingfsi, int fngfsi, int lev)
  {
      // Pick the rank argument: identity except for the four remapped ids.
      int owner;
      switch (block_id)
      {
      case 26: owner = 25; break;
      case 29: owner = 30; break;
      case 34: owner = 33; break;
      case 37: owner = 38; break;
      default: owner = block_id; break;
      }
      Block* created = new Block(dim, shape, bbox, owner, ingfsi, fngfsi, lev);
      // Start the list on first use, otherwise insert into the existing one.
      if (!BlL)
          BlL = new MyList<Block>(created);
      else
          BlL->insert(created);
      return created;
  }
 #elif (PSTR == 1 || PSTR == 2 || PSTR == 3)
 MyList<Block> *Parallel::distribute(MyList<Patch> *PatchLIST, int cpusize, int ingfsi, int fngfsi,
                                    bool periodic, int start_rank, int end_rank, int nodes)
@@ -3761,7 +3338,7 @@ int Parallel::data_packer(double *data, MyList<Parallel::gridseg> *src, MyList<P
 {
  int myrank;
  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
-
+  // double time1 = omp_get_wtime();
  int DIM = dim;
  if (dir != PACK && dir != UNPACK)
@@ -3784,7 +3361,6 @@ int Parallel::data_packer(double *data, MyList<Parallel::gridseg> *src, MyList<P
    varls = varls->next;
    varld = varld->next;
  }
  if (varls || varld)
  {
    cout << "error in short data packer, var lists does not match." << endl;
@@ -3798,7 +3374,6 @@ int Parallel::data_packer(double *data, MyList<Parallel::gridseg> *src, MyList<P
    type = 2;
  else
    type = 3;
  while (src && dst)
  {
    if ((dir == PACK && dst->data->Bg->rank == rank_in && src->data->Bg->rank == myrank) ||
@@ -3808,6 +3383,7 @@ int Parallel::data_packer(double *data, MyList<Parallel::gridseg> *src, MyList<P
      varld = VarListd;
      while (varls && varld)
      {
        if (data)
        {
          if (dir == PACK)
@@ -3828,6 +3404,7 @@ int Parallel::data_packer(double *data, MyList<Parallel::gridseg> *src, MyList<P
              f_prolong3(DIM, src->data->Bg->bbox, src->data->Bg->bbox + dim, src->data->Bg->shape, src->data->Bg->fgfs[varls->data->sgfn],
                         dst->data->llb, dst->data->uub, dst->data->shape, data + size_out,
                         dst->data->llb, dst->data->uub, varls->data->SoA, Symmetry);
            }
          if (dir == UNPACK) // from target data to corresponding grid
            f_copy(DIM, dst->data->Bg->bbox, dst->data->Bg->bbox + dim, dst->data->Bg->shape, dst->data->Bg->fgfs[varld->data->sgfn],
@@ -3841,8 +3418,14 @@ int Parallel::data_packer(double *data, MyList<Parallel::gridseg> *src, MyList<P
    }
    dst = dst->next;
    src = src->next;
  }
  }
  // double time2 = omp_get_wtime();
  // xxx += time2 - time1;
  // if(myrank == 0){
  // printf("prolong3 time = %lf\n", time2 - time1);
  // }
  return size_out;
 }
 int Parallel::data_packermix(double *data, MyList<Parallel::gridseg> *src, MyList<Parallel::gridseg> *dst, int rank_in, int dir,
@@ -3937,7 +3520,7 @@ void Parallel::transfer(MyList<Parallel::gridseg> **src, MyList<Parallel::gridse
  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
  int node;
-
+  // double time1 = omp_get_wtime();
  MPI_Request *reqs;
  MPI_Status *stats;
  reqs = new MPI_Request[2 * cpusize];
@@ -4006,7 +3589,9 @@ void Parallel::transfer(MyList<Parallel::gridseg> **src, MyList<Parallel::gridse
    if (rec_data[node])
      delete[] rec_data[node];
  }
-
+  // double time2 = omp_get_wtime();
  // if (myrank == 0)
  //   printf("transfer time = %lf\n", time2 - time1);
  delete[] reqs;
  delete[] stats;
  delete[] send_data;
@@ -5709,203 +5294,6 @@ void Parallel::OutBdLow2Himix(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
  delete[] transfer_src;
  delete[] transfer_dst;
 }
 // Restrict_cached: transfer VarList1 -> VarList2 using grid segment lists and
 // communication buffers stored in `cache`.
 //
 // While cache.valid is false the per-rank source/destination segment lists are
 // (re)built: destinations come from build_complete_gsl(PatcL), sources from
 // build_owned_gsl(PatfL, rank, 2, Symmetry). The bookkeeping arrays inside the
 // cache are allocated only the first time (cache.combined_src still null).
 // The data movement itself is always delegated to transfer_cached.
 void Parallel::Restrict_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
                               MyList<var> *VarList1, MyList<var> *VarList2,
                               int Symmetry, SyncCache &cache)
 {
  if (!cache.valid)
  {
    int nranks;
    MPI_Comm_size(MPI_COMM_WORLD, &nranks);
    cache.cpusize = nranks;
    if (!cache.combined_src)
    {
      // First use of this cache: one slot per MPI rank for every array.
      cache.combined_src = new MyList<Parallel::gridseg> *[nranks];
      cache.combined_dst = new MyList<Parallel::gridseg> *[nranks];
      cache.send_lengths = new int[nranks];
      cache.recv_lengths = new int[nranks];
      cache.send_bufs = new double *[nranks];
      cache.recv_bufs = new double *[nranks];
      cache.send_buf_caps = new int[nranks];
      cache.recv_buf_caps = new int[nranks];
      // No buffer exists yet, so every pointer is null and every capacity is zero.
      for (int r = 0; r < nranks; ++r)
      {
        cache.send_bufs[r] = 0;
        cache.recv_bufs[r] = 0;
        cache.send_buf_caps[r] = 0;
        cache.recv_buf_caps[r] = 0;
      }
      // At most one send and one receive request per rank.
      cache.max_reqs = 2 * nranks;
      cache.reqs = new MPI_Request[cache.max_reqs];
      cache.stats = new MPI_Status[cache.max_reqs];
    }
    // Destination segments are shared by all ranks; sources are built per rank.
    MyList<Parallel::gridseg> *dst_all = build_complete_gsl(PatcL);
    for (int r = 0; r < nranks; ++r)
    {
      MyList<Parallel::gridseg> *owned = build_owned_gsl(PatfL, r, 2, Symmetry);
      build_gstl(owned, dst_all, &cache.combined_src[r], &cache.combined_dst[r]);
      // The temporary list is no longer needed once the pair lists are built.
      if (owned)
        owned->destroyList();
    }
    if (dst_all)
      dst_all->destroyList();
    cache.valid = true;
  }
  transfer_cached(cache.combined_src, cache.combined_dst, VarList1, VarList2, Symmetry, cache);
 }
 // OutBdLow2Hi_cached: transfer VarList1 -> VarList2 using grid segment lists and
 // communication buffers stored in `cache`.
 //
 // While cache.valid is false the per-rank source/destination segment lists are
 // (re)built: destinations come from build_buffer_gsl(PatfL), sources from
 // build_owned_gsl(PatcL, rank, 4, Symmetry). The bookkeeping arrays inside the
 // cache are allocated only the first time (cache.combined_src still null).
 // The data movement itself is always delegated to transfer_cached.
 void Parallel::OutBdLow2Hi_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
                                  MyList<var> *VarList1, MyList<var> *VarList2,
                                  int Symmetry, SyncCache &cache)
 {
  if (!cache.valid)
  {
    int nranks;
    MPI_Comm_size(MPI_COMM_WORLD, &nranks);
    cache.cpusize = nranks;
    if (!cache.combined_src)
    {
      // First use of this cache: one slot per MPI rank for every array.
      cache.combined_src = new MyList<Parallel::gridseg> *[nranks];
      cache.combined_dst = new MyList<Parallel::gridseg> *[nranks];
      cache.send_lengths = new int[nranks];
      cache.recv_lengths = new int[nranks];
      cache.send_bufs = new double *[nranks];
      cache.recv_bufs = new double *[nranks];
      cache.send_buf_caps = new int[nranks];
      cache.recv_buf_caps = new int[nranks];
      // No buffer exists yet, so every pointer is null and every capacity is zero.
      for (int r = 0; r < nranks; ++r)
      {
        cache.send_bufs[r] = 0;
        cache.recv_bufs[r] = 0;
        cache.send_buf_caps[r] = 0;
        cache.recv_buf_caps[r] = 0;
      }
      // At most one send and one receive request per rank.
      cache.max_reqs = 2 * nranks;
      cache.reqs = new MPI_Request[cache.max_reqs];
      cache.stats = new MPI_Status[cache.max_reqs];
    }
    // Destination segments are shared by all ranks; sources are built per rank.
    MyList<Parallel::gridseg> *dst_all = build_buffer_gsl(PatfL);
    for (int r = 0; r < nranks; ++r)
    {
      MyList<Parallel::gridseg> *owned = build_owned_gsl(PatcL, r, 4, Symmetry);
      build_gstl(owned, dst_all, &cache.combined_src[r], &cache.combined_dst[r]);
      // The temporary list is no longer needed once the pair lists are built.
      if (owned)
        owned->destroyList();
    }
    if (dst_all)
      dst_all->destroyList();
    cache.valid = true;
  }
  transfer_cached(cache.combined_src, cache.combined_dst, VarList1, VarList2, Symmetry, cache);
 }
 // OutBdLow2Himix_cached: same segment-list caching as OutBdLow2Hi_cached, but the
 // exchange is done here with data_packermix instead of going through
 // transfer_cached.
 //
 // Phase 1 (only while cache.valid is false): build the per-rank source and
 // destination segment lists (destinations from build_buffer_gsl(PatfL), sources
 // from build_owned_gsl(PatcL, rank, 4, Symmetry)); allocate the cache arrays on
 // first use.
 // Phase 2 (always): pack and post non-blocking sends/receives for every other
 // rank, pack the local contribution directly into the local receive buffer,
 // wait for all requests, then unpack every non-empty receive buffer.
 void Parallel::OutBdLow2Himix_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
                                     MyList<var> *VarList1, MyList<var> *VarList2,
                                     int Symmetry, SyncCache &cache)
 {
  if (!cache.valid)
  {
    int nranks;
    MPI_Comm_size(MPI_COMM_WORLD, &nranks);
    cache.cpusize = nranks;
    if (!cache.combined_src)
    {
      // First use of this cache: one slot per MPI rank for every array.
      cache.combined_src = new MyList<Parallel::gridseg> *[nranks];
      cache.combined_dst = new MyList<Parallel::gridseg> *[nranks];
      cache.send_lengths = new int[nranks];
      cache.recv_lengths = new int[nranks];
      cache.send_bufs = new double *[nranks];
      cache.recv_bufs = new double *[nranks];
      cache.send_buf_caps = new int[nranks];
      cache.recv_buf_caps = new int[nranks];
      // No buffer exists yet, so every pointer is null and every capacity is zero.
      for (int r = 0; r < nranks; ++r)
      {
        cache.send_bufs[r] = 0;
        cache.recv_bufs[r] = 0;
        cache.send_buf_caps[r] = 0;
        cache.recv_buf_caps[r] = 0;
      }
      // At most one send and one receive request per rank.
      cache.max_reqs = 2 * nranks;
      cache.reqs = new MPI_Request[cache.max_reqs];
      cache.stats = new MPI_Status[cache.max_reqs];
    }
    // Destination segments are shared by all ranks; sources are built per rank.
    MyList<Parallel::gridseg> *dst_all = build_buffer_gsl(PatfL);
    for (int r = 0; r < nranks; ++r)
    {
      MyList<Parallel::gridseg> *owned = build_owned_gsl(PatcL, r, 4, Symmetry);
      build_gstl(owned, dst_all, &cache.combined_src[r], &cache.combined_dst[r]);
      if (owned)
        owned->destroyList();
    }
    if (dst_all)
      dst_all->destroyList();
    cache.valid = true;
  }

  // Mix-mode exchange: data_packermix is used for sizing, packing and unpacking.
  int myrank;
  MPI_Comm_size(MPI_COMM_WORLD, &cache.cpusize);
  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
  const int nranks = cache.cpusize;

  // Grow-only buffer management: a slot is reallocated only when the requested
  // length exceeds its recorded capacity (delete[] on a null pointer is a no-op).
  auto reserve_slot = [](double **bufs, int *caps, int slot, int needed)
  {
    if (needed > caps[slot])
    {
      delete[] bufs[slot];
      bufs[slot] = new double[needed];
      caps[slot] = needed;
    }
  };

  // Everything this rank packs comes from its own source/destination pair lists.
  MyList<Parallel::gridseg> *my_src = cache.combined_src[myrank];
  MyList<Parallel::gridseg> *my_dst = cache.combined_dst[myrank];

  int pending = 0; // number of MPI requests posted so far
  for (int node = 0; node < nranks; node++)
  {
    if (node != myrank)
    {
      // Outgoing data: a call with a null buffer only returns the length.
      const int slength = data_packermix(0, my_src, my_dst, node, PACK, VarList1, VarList2, Symmetry);
      cache.send_lengths[node] = slength;
      if (slength > 0)
      {
        reserve_slot(cache.send_bufs, cache.send_buf_caps, node, slength);
        data_packermix(cache.send_bufs[node], my_src, my_dst, node, PACK, VarList1, VarList2, Symmetry);
        MPI_Isend((void *)cache.send_bufs[node], slength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + pending);
        pending++;
      }
      // Incoming data: size the receive from the segment lists of the sending rank.
      const int rlength = data_packermix(0, cache.combined_src[node], cache.combined_dst[node], node, UNPACK, VarList1, VarList2, Symmetry);
      cache.recv_lengths[node] = rlength;
      if (rlength > 0)
      {
        reserve_slot(cache.recv_bufs, cache.recv_buf_caps, node, rlength);
        MPI_Irecv((void *)cache.recv_bufs[node], rlength, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, cache.reqs + pending);
        pending++;
      }
      continue;
    }

    // Local contribution: packed straight into this rank's receive slot, no MPI.
    const int length = data_packermix(0, my_src, my_dst, node, PACK, VarList1, VarList2, Symmetry);
    cache.recv_lengths[node] = length;
    if (length > 0)
    {
      reserve_slot(cache.recv_bufs, cache.recv_buf_caps, node, length);
      data_packermix(cache.recv_bufs[node], my_src, my_dst, node, PACK, VarList1, VarList2, Symmetry);
    }
  }

  // All sends and receives must complete before any buffer is read.
  MPI_Waitall(pending, cache.reqs, cache.stats);

  // Unpack whatever arrived (including the locally packed data).
  for (int node = 0; node < nranks; node++)
  {
    if (cache.recv_lengths[node] > 0 && cache.recv_bufs[node])
      data_packermix(cache.recv_bufs[node], cache.combined_src[node], cache.combined_dst[node], node, UNPACK, VarList1, VarList2, Symmetry);
  }
 }
 // collect all buffer grid segments or blocks for given patch
 MyList<Parallel::gridseg> *Parallel::build_buffer_gsl(Patch *Pat)
 {
@@ -6905,224 +6293,3 @@ void Parallel::checkpatchlist(MyList<Patch> *PatL, bool buflog)
    PL = PL->next;
  }
 }
 // Check if load balancing is needed based on per-rank timings.
 //
 // A rank is "heavy" when its time exceeds 1.5x the average over all ranks.
 // The (at most) 4 heaviest ranks are written to heavy_ranks, slowest first;
 // equal times are ordered by ascending rank so the result is deterministic.
 //
 //   rank_times   [in]  nprocs timing values, indexed by rank
 //   nprocs       [in]  number of entries in rank_times
 //   num_heavy    [out] number of entries written to heavy_ranks (0..4)
 //   heavy_ranks  [out] caller-provided array with room for at least 4 ints
 //
 // Returns true when at least one heavy rank was found. Returns false (with
 // num_heavy == 0) when none was found or when the input is empty/null.
 bool Parallel::check_load_balance_need(double *rank_times, int nprocs, int &num_heavy, int *heavy_ranks)
 {
  const double kHeavyFactor = 1.5; // threshold relative to the average time
  const int kMaxHeavy = 4;         // upper bound on reported heavy ranks

  num_heavy = 0;
  // Nothing to analyse: also avoids dividing by zero below.
  if (!rank_times || !heavy_ranks || nprocs <= 0)
    return false;

  // Average time over all ranks
  double avg_time = 0;
  for (int r = 0; r < nprocs; r++)
    avg_time += rank_times[r];
  avg_time /= nprocs;

  // Collect (rank, time) for every rank above the threshold
  const double threshold = avg_time * kHeavyFactor;
  std::vector<std::pair<int, double>> heavy;
  for (int r = 0; r < nprocs; r++)
  {
    if (rank_times[r] > threshold)
      heavy.push_back(std::make_pair(r, rank_times[r]));
  }

  // Slowest first; ties broken by rank because std::sort does not preserve
  // the relative order of equal elements.
  std::sort(heavy.begin(), heavy.end(),
            [](const std::pair<int, double>& a, const std::pair<int, double>& b) {
              if (a.second != b.second)
                return a.second > b.second;
              return a.first < b.first;
            });

  num_heavy = std::min(kMaxHeavy, (int)heavy.size());
  for (int i = 0; i < num_heavy; i++)
    heavy_ranks[i] = heavy[i].first;

  return num_heavy > 0;
 }
 // Analyse how blocks of heavy ranks could be split to improve load balancing.
 // Strategy: split each heavy-rank block in half along its longest dimension,
 // and merge pairs of light ranks (up to 8 ranks -> 4) to free ranks that can
 // take the second halves.
 //
 // This routine only reports the plan on stdout from rank 0; it does not modify
 // any Block or Patch. All other ranks return immediately.
 //
 //   PatL         patch list whose blocks are inspected
 //   heavy_ranks  array of num_heavy rank ids considered overloaded
 //   num_heavy    number of entries in heavy_ranks
 //   split_factor, cpusize, ingfsi, fngfsi
 //                currently unused by the analysis
 void Parallel::split_heavy_blocks(MyList<Patch> *PatL, int *heavy_ranks, int num_heavy,
                                  int split_factor, int cpusize, int ingfsi, int fngfsi)
 {
  (void)split_factor;
  (void)cpusize;
  (void)ingfsi;
  (void)fngfsi;

  int myrank;
  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
  if (myrank != 0) return; // Only rank 0 performs the analysis

  // Treat a null array or a negative count as "no heavy ranks".
  const int heavy_count = (heavy_ranks && num_heavy > 0) ? num_heavy : 0;

  cout << "\n=== Load Balancing Strategy ===" << endl;
  cout << "Heavy ranks to split (in half): " << heavy_count << endl;
  for (int i = 0; i < heavy_count; i++)
    cout << "  Heavy rank " << heavy_ranks[i] << endl;

  // Steps 1+2: one pass over all blocks. Blocks owned by a heavy rank are
  // remembered for splitting; every other owner is recorded as a light rank.
  // std::set keeps the light ranks unique and in ascending order.
  const std::set<int> heavy_set(heavy_ranks, heavy_ranks + heavy_count);
  std::set<int> light_set;
  std::vector<Block *> heavy_blocks;
  for (MyList<Patch> *PL = PatL; PL; PL = PL->next)
  {
    for (MyList<Block> *BP = PL->data->blb; BP; BP = BP->next)
    {
      Block *block = BP->data;
      if (heavy_set.count(block->rank))
        heavy_blocks.push_back(block);
      else
        light_set.insert(block->rank);
    }
  }
  const std::vector<int> light_ranks(light_set.begin(), light_set.end());

  cout << "Found " << light_ranks.size() << " light ranks (candidates for merging)" << endl;

  // Step 3: Select up to 8 light ranks to merge.
  // For now the lowest-numbered light ranks are taken; workload is not considered.
  size_t num_to_merge = 8;
  if (light_ranks.size() < num_to_merge)
  {
    cout << "WARNING: Not enough light ranks to merge. Found " << light_ranks.size()
         << ", need " << num_to_merge << endl;
    num_to_merge = light_ranks.size();
  }

  const std::vector<int> ranks_to_merge(light_ranks.begin(), light_ranks.begin() + num_to_merge);

  // Report the real counts: pairs collapse to one rank, an unpaired rank stays.
  cout << "Light ranks to merge (" << num_to_merge << " -> " << (num_to_merge + 1) / 2
       << " merged ranks):" << endl;
  for (size_t i = 0; i < ranks_to_merge.size(); i++)
    cout << "  Rank " << ranks_to_merge[i] << endl;

  // Step 4: Decide where each heavy block would be split
  cout << "\n=== Analyzing blocks for splitting ===" << endl;

  struct BlockSplitInfo {
    Block *original_block;
    int split_dim;
    int split_point;
  };
  std::vector<BlockSplitInfo> blocks_to_split;
  blocks_to_split.reserve(heavy_blocks.size());

  for (size_t b = 0; b < heavy_blocks.size(); b++)
  {
    Block *block = heavy_blocks[b];

    // Longest dimension wins; the first one wins on ties.
    int max_dim = 0;
    int max_size = block->shape[0];
    for (int d = 1; d < dim; d++)
    {
      if (block->shape[d] > max_size)
      {
        max_size = block->shape[d];
        max_dim = d;
      }
    }

    BlockSplitInfo info;
    info.original_block = block;
    info.split_dim = max_dim;
    info.split_point = max_size / 2; // split in the middle
    blocks_to_split.push_back(info);

    cout << "Block at rank " << block->rank << " will be split" << endl;
    cout << "  Shape: [" << block->shape[0] << ", " << block->shape[1] << ", " << block->shape[2] << "]" << endl;
    cout << "  Split along dimension " << info.split_dim << " at index " << info.split_point << endl;
  }

  cout << "\nTotal blocks to split: " << blocks_to_split.size() << endl;

  // Step 5: Calculate new rank assignments
  // Strategy:
  // - For each heavy rank, its blocks are split in half
  // - First half keeps the original rank
  // - Second half gets a new rank (from the freed light ranks)
  // - Light ranks are merged pairwise; the second rank of each pair is freed
  std::vector<int> freed_ranks;
  for (size_t i = 0; i + 1 < ranks_to_merge.size(); i += 2)
  {
    // (ranks_to_merge[i], ranks_to_merge[i+1]) -> ranks_to_merge[i]
    freed_ranks.push_back(ranks_to_merge[i + 1]);
    cout << "Merging ranks " << ranks_to_merge[i] << " and " << ranks_to_merge[i + 1]
         << " -> keeping rank " << ranks_to_merge[i] << ", freeing rank " << ranks_to_merge[i + 1] << endl;
  }

  cout << "\nFreed ranks available for split blocks: ";
  for (size_t i = 0; i < freed_ranks.size(); i++)
    cout << freed_ranks[i] << " ";
  cout << endl;

  // Step 6: Assign one freed rank to the second half of each split block
  size_t freed_idx = 0;
  for (size_t i = 0; i < blocks_to_split.size(); i++)
  {
    if (freed_idx >= freed_ranks.size())
    {
      cout << "WARNING: Not enough freed ranks for all split blocks!" << endl;
      break;
    }

    Block *original = blocks_to_split[i].original_block;
    cout << "\nSplitting block at rank " << original->rank << endl;
    cout << "  First half: keeps rank " << original->rank << endl;
    cout << "  Second half: gets new rank " << freed_ranks[freed_idx] << endl;
    freed_idx++;
  }

  cout << "\n=== Load Balancing Analysis Complete ===" << endl;
  cout << "Next steps:" << endl;
  cout << "  1. Recompose the grid with new rank assignments" << endl;
  cout << "  2. Data migration will be handled by recompose_cgh" << endl;
  cout << "  3. Ghost zone communication will be updated automatically" << endl;
 }
--- a/AMSS_NCKU_source/Parallel.h
+++ b/AMSS_NCKU_source/Parallel.h
@@ -11,7 +11,7 @@
 #include <cmath>
 #include <new>
 using namespace std;
-#include <memory.h>
+
 #include "Parallel_bam.h"
 #include "var.h"
 #include "MPatch.h"
@@ -32,16 +32,6 @@ namespace Parallel
  int partition2(int *nxy, int split_size, int *min_width, int cpusize, int *shape); // special for 2 diemnsions
  int partition3(int *nxyz, int split_size, int *min_width, int cpusize, int *shape);
  MyList<Block> *distribute(MyList<Patch> *PatchLIST, int cpusize, int ingfsi, int fngfs, bool periodic, int nodes = 0); // produce corresponding Blocks
  MyList<Block> *distribute_hard(MyList<Patch> *PatchLIST, int cpusize, int ingfsi, int fngfs, bool periodic, int nodes = 0); // produce corresponding Blocks
  Block* splitHotspotBlock(MyList<Block>* &BlL, int _dim, 
                                 int ib0_orig, int ib3_orig, 
                                 int jb1_orig, int jb4_orig, 
                                 int kb2_orig, int kb5_orig, 
                                 Patch* PP, int r_left, int r_right, 
                                 int ingfsi, int fngfsi, bool periodic,
                                 Block* &split_first_block, Block* &split_last_block);
  Block* createMappedBlock(MyList<Block>* &BlL, int _dim, int* shape, double* bbox,
                        int block_id, int ingfsi, int fngfsi, int lev); 
  void KillBlocks(MyList<Patch> *PatchLIST);
  void setfunction(MyList<Block> *BlL, var *vn, double func(double x, double y, double z));
@@ -140,15 +130,6 @@ namespace Parallel
  void OutBdLow2Himix(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
                      MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
                      int Symmetry);
  void Restrict_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
                       MyList<var> *VarList1, MyList<var> *VarList2,
                       int Symmetry, SyncCache &cache);
  void OutBdLow2Hi_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
                          MyList<var> *VarList1, MyList<var> *VarList2,
                          int Symmetry, SyncCache &cache);
  void OutBdLow2Himix_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
                             MyList<var> *VarList1, MyList<var> *VarList2,
                             int Symmetry, SyncCache &cache);
  void Prolong(Patch *Patc, Patch *Patf,
               MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
               int Symmetry);
@@ -218,18 +199,6 @@ namespace Parallel
 #if (PSTR == 1 || PSTR == 2 || PSTR == 3)
  MyList<Block> *distribute(MyList<Patch> *PatchLIST, int cpusize, int ingfsi, int fngfsi,
                            bool periodic, int start_rank, int end_rank, int nodes = 0);
  // Redistribute blocks with time statistics for load balancing
  MyList<Block> *distribute(MyList<Patch> *PatchLIST, MyList<Block> *OldBlockL,
                            int cpusize, int ingfsi, int fngfsi,
                            bool periodic, int start_rank, int end_rank, int nodes = 0);
 #endif
  // Dynamic load balancing: split blocks for heavy ranks
    void split_heavy_blocks(MyList<Patch> *PatL, int *heavy_ranks, int num_heavy,
                            int split_factor, int cpusize, int ingfsi, int fngfsi);
    // Check if load balancing is needed based on interpolation times
    bool check_load_balance_need(double *rank_times, int nprocs, int &num_heavy, int *heavy_ranks);
 }
 #endif /*PARALLEL_H */
--- a/AMSS_NCKU_source/bssn_class.C
+++ b/AMSS_NCKU_source/bssn_class.C
@@ -40,7 +40,7 @@ using namespace std;
 #include "derivatives.h"
 #include "ricci_gamma.h"
-
+#include "xh_bssn_rhs_compute.h"
 //================================================================================================
 // define bssn_class
@@ -2029,6 +2029,7 @@ void bssn_class::Read_Ansorg()
 void bssn_class::Evolve(int Steps)
 {
  clock_t prev_clock, curr_clock;
  double prev_time, curr_time;
  double LastDump = 0.0, LastCheck = 0.0, Last2dDump = 0.0;
  LastAnas = 0;
 #if 0
@@ -2141,8 +2142,10 @@ void bssn_class::Evolve(int Steps)
    //     if(fabs(Porg0[0][0]-Porg0[1][0])+fabs(Porg0[0][1]-Porg0[1][1])+fabs(Porg0[0][2]-Porg0[1][2])<1e-6) 
    //     { GH->levels=GH->movls; }
-    if (myrank == 0)
+    if (myrank == 0){
      curr_clock = clock();
      curr_time = omp_get_wtime();
    }
 #if (PSTR == 0)
    RecursiveStep(0);
 #elif (PSTR == 1 || PSTR == 2 || PSTR == 3)
@@ -2198,12 +2201,17 @@ void bssn_class::Evolve(int Steps)
    if (myrank == 0)
    {
      prev_clock = curr_clock;
      prev_time = curr_time;
      curr_clock = clock();
      curr_time = omp_get_wtime();
      cout << endl;
      // cout << " Timestep # " << ncount << ": integrating to time: " << PhysTime << "   "
      //      << " Computer used " << (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC) 
      //      << " seconds! " << endl;
      // // cout << endl;
      cout << " Timestep # " << ncount << ": integrating to time: " << PhysTime << "   "
-           << " Computer used " << (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC) 
+            << " Computer used " << (curr_time - prev_time) 
            << " seconds! " << endl;
      // cout << endl;
    }
    if (PhysTime >= TotalTime)
@@ -3092,7 +3100,7 @@ void bssn_class::Step(int lev, int YN)
                     cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn]);
 #endif
-        if (f_compute_rhs_bssn(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
+        if (f_compute_rhs_bssn_xh(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
                               cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn],
                               cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], 
                               cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
@@ -3292,7 +3300,7 @@ void bssn_class::Step(int lev, int YN)
                 << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
            ERROR = 1;
          }
-
+          // cout<<"....................................."<<endl;
          // rk4 substep and boundary
          {
            MyList<var> *varl0 = StateList, *varl = SynchList_pre, *varlrhs = RHSList; 
@@ -3457,7 +3465,7 @@ void bssn_class::Step(int lev, int YN)
                         cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]);
 #endif
-          if (f_compute_rhs_bssn(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
+          if (f_compute_rhs_bssn_xh(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
                                 cg->fgfs[phi->sgfn], cg->fgfs[trK->sgfn],
                                 cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn], 
                                 cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn],
@@ -3970,7 +3978,7 @@ void bssn_class::Step(int lev, int YN)
                     cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn]);
 #endif
-        if (f_compute_rhs_bssn(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
+        if (f_compute_rhs_bssn_xh(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
                               cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn],
                               cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], 
                               cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
@@ -4312,7 +4320,7 @@ void bssn_class::Step(int lev, int YN)
                         cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]);
 #endif
-          if (f_compute_rhs_bssn(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
+          if (f_compute_rhs_bssn_xh(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
                                 cg->fgfs[phi->sgfn], cg->fgfs[trK->sgfn],
                                 cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn], 
                                 cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn],
@@ -4848,7 +4856,7 @@ void bssn_class::Step(int lev, int YN)
                     cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn]);
 #endif
-        if (f_compute_rhs_bssn(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
+        if (f_compute_rhs_bssn_xh(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
                               cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn],
                               cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], 
                               cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
@@ -5048,7 +5056,7 @@ void bssn_class::Step(int lev, int YN)
                         cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]);
 #endif
-          if (f_compute_rhs_bssn(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
+          if (f_compute_rhs_bssn_xh(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
                                 cg->fgfs[phi->sgfn], cg->fgfs[trK->sgfn],
                                 cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn], 
                                 cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn],
@@ -5819,11 +5827,21 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
 #endif
 #if (RPB == 0)
      Ppc = GH->PatL[lev - 1];
      while (Ppc)
      {
        Pp = GH->PatL[lev];
        while (Pp)
        {
 #if (MIXOUTB == 0)
-      Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry);
+          Parallel::OutBdLow2Hi(Ppc->data, Pp->data, SynchList_pre, SL, Symmetry);
 #elif (MIXOUTB == 1)
-      Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry);
+          Parallel::OutBdLow2Himix(Ppc->data, Pp->data, SynchList_pre, SL, Symmetry);
 #endif
          Pp = Pp->next;
        }
        Ppc = Ppc->next;
      }
 #elif (RPB == 1)
      //       Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SynchList_pre,SL,Symmetry);
      Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, GH->bdsul[lev], Symmetry);
@@ -5870,11 +5888,21 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
 #endif
 #if (RPB == 0)
      Ppc = GH->PatL[lev - 1];
      while (Ppc)
      {
        Pp = GH->PatL[lev];
        while (Pp)
        {
 #if (MIXOUTB == 0)
-      Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry);
+          Parallel::OutBdLow2Hi(Ppc->data, Pp->data, SL, SL, Symmetry);
 #elif (MIXOUTB == 1)
-      Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry);
+          Parallel::OutBdLow2Himix(Ppc->data, Pp->data, SL, SL, Symmetry);
 #endif
          Pp = Pp->next;
        }
        Ppc = Ppc->next;
      }
 #elif (RPB == 1)
      //       Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SL,Symmetry);
      Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, GH->bdsul[lev], Symmetry);
@@ -5949,11 +5977,21 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB,
      Parallel::Sync_cached(GH->PatL[lev - 1], SynchList_pre, Symmetry, sync_cache_rp_coarse[lev]);
 #if (RPB == 0)
      Ppc = GH->PatL[lev - 1];
      while (Ppc)
      {
        Pp = GH->PatL[lev];
        while (Pp)
        {
 #if (MIXOUTB == 0)
-      Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry);
+          Parallel::OutBdLow2Hi(Ppc->data, Pp->data, SynchList_pre, SL, Symmetry);
 #elif (MIXOUTB == 1)
-      Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry);
+          Parallel::OutBdLow2Himix(Ppc->data, Pp->data, SynchList_pre, SL, Symmetry);
 #endif
          Pp = Pp->next;
        }
        Ppc = Ppc->next;
      }
 #elif (RPB == 1)
      //       Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SynchList_pre,SL,Symmetry);
      Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, GH->bdsul[lev], Symmetry);
@@ -5971,11 +6009,21 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB,
      Parallel::Sync_cached(GH->PatL[lev - 1], SL, Symmetry, sync_cache_rp_coarse[lev]);
 #if (RPB == 0)
      Ppc = GH->PatL[lev - 1];
      while (Ppc)
      {
        Pp = GH->PatL[lev];
        while (Pp)
        {
 #if (MIXOUTB == 0)
-      Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry);
+          Parallel::OutBdLow2Hi(Ppc->data, Pp->data, SL, SL, Symmetry);
 #elif (MIXOUTB == 1)
-      Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry);
+          Parallel::OutBdLow2Himix(Ppc->data, Pp->data, SL, SL, Symmetry);
 #endif
          Pp = Pp->next;
        }
        Ppc = Ppc->next;
      }
 #elif (RPB == 1)
      //       Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SL,Symmetry);
      Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, GH->bdsul[lev], Symmetry);
@@ -6036,11 +6084,21 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
      Parallel::Sync_cached(GH->PatL[lev - 1], SynchList_pre, Symmetry, sync_cache_rp_coarse[lev]);
 #if (RPB == 0)
      Ppc = GH->PatL[lev - 1];
      while (Ppc)
      {
        Pp = GH->PatL[lev];
        while (Pp)
        {
 #if (MIXOUTB == 0)
-      Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry);
+          Parallel::OutBdLow2Hi(Ppc->data, Pp->data, SynchList_pre, SynchList_cor, Symmetry);
 #elif (MIXOUTB == 1)
-      Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry);
+          Parallel::OutBdLow2Himix(Ppc->data, Pp->data, SynchList_pre, SynchList_cor, Symmetry);
 #endif
          Pp = Pp->next;
        }
        Ppc = Ppc->next;
      }
 #elif (RPB == 1)
      //       Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SynchList_pre,SynchList_cor,Symmetry);
      Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, GH->bdsul[lev], Symmetry);
@@ -6060,11 +6118,21 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
      Parallel::Sync_cached(GH->PatL[lev - 1], StateList, Symmetry, sync_cache_rp_coarse[lev]);
 #if (RPB == 0)
      Ppc = GH->PatL[lev - 1];
      while (Ppc)
      {
        Pp = GH->PatL[lev];
        while (Pp)
        {
 #if (MIXOUTB == 0)
-      Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry);
+          Parallel::OutBdLow2Hi(Ppc->data, Pp->data, StateList, SynchList_cor, Symmetry);
 #elif (MIXOUTB == 1)
-      Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry);
+          Parallel::OutBdLow2Himix(Ppc->data, Pp->data, StateList, SynchList_cor, Symmetry);
 #endif
          Pp = Pp->next;
        }
        Ppc = Ppc->next;
      }
 #elif (RPB == 1)
      //       Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],StateList,SynchList_cor,Symmetry);
      Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, GH->bdsul[lev], Symmetry);
@@ -6101,11 +6169,21 @@ void bssn_class::ProlongRestrict(int lev, int YN, bool BB)
      }
 #if (RPB == 0)
      Ppc = GH->PatL[lev - 1];
      while (Ppc)
      {
        Pp = GH->PatL[lev];
        while (Pp)
        {
 #if (MIXOUTB == 0)
-      Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry);
+          Parallel::OutBdLow2Hi(Ppc->data, Pp->data, SynchList_pre, SynchList_cor, Symmetry);
 #elif (MIXOUTB == 1)
-      Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry);
+          Parallel::OutBdLow2Himix(Ppc->data, Pp->data, SynchList_pre, SynchList_cor, Symmetry);
 #endif
          Pp = Pp->next;
        }
        Ppc = Ppc->next;
      }
 #elif (RPB == 1)
      //       Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SynchList_pre,SynchList_cor,Symmetry);
      Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, GH->bdsul[lev], Symmetry);
@@ -6114,11 +6192,21 @@ void bssn_class::ProlongRestrict(int lev, int YN, bool BB)
    else // no time refinement levels and for all same time levels
    {
 #if (RPB == 0)
      Ppc = GH->PatL[lev - 1];
      while (Ppc)
      {
        Pp = GH->PatL[lev];
        while (Pp)
        {
 #if (MIXOUTB == 0)
-      Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry);
+          Parallel::OutBdLow2Hi(Ppc->data, Pp->data, StateList, SynchList_cor, Symmetry);
 #elif (MIXOUTB == 1)
-      Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry);
+          Parallel::OutBdLow2Himix(Ppc->data, Pp->data, StateList, SynchList_cor, Symmetry);
 #endif
          Pp = Pp->next;
        }
        Ppc = Ppc->next;
      }
 #elif (RPB == 1)
      //       Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],StateList,SynchList_cor,Symmetry);
      Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, GH->bdsul[lev], Symmetry);
@@ -7263,7 +7351,7 @@ void bssn_class::Constraint_Out()
            Block *cg = BP->data;
            if (myrank == cg->rank)
            {
-              f_compute_rhs_bssn(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
+              f_compute_rhs_bssn_xh(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
                                 cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn],
                                 cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], 
                                 cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
@@ -7766,7 +7854,7 @@ void bssn_class::Interp_Constraint(bool infg)
            Block *cg = BP->data;
            if (myrank == cg->rank)
            {
-              f_compute_rhs_bssn(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
+              f_compute_rhs_bssn_xh(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
                                 cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn],
                                 cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], 
                                 cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
@@ -8024,7 +8112,7 @@ void bssn_class::Compute_Constraint()
          Block *cg = BP->data;
          if (myrank == cg->rank)
          {
-            f_compute_rhs_bssn(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
+            f_compute_rhs_bssn_xh(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
                               cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn],
                               cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], 
                               cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
--- a/AMSS_NCKU_source/bssn_rhs.f90
+++ b/AMSS_NCKU_source/bssn_rhs.f90
@@ -106,38 +106,6 @@
  call getpbh(BHN,Porg,Mass)
 #endif
 !!! sanity check (disabled in production builds for performance)
 #ifdef DEBUG
  dX = sum(chi)+sum(trK)+sum(dxx)+sum(gxy)+sum(gxz)+sum(dyy)+sum(gyz)+sum(dzz) &
      +sum(Axx)+sum(Axy)+sum(Axz)+sum(Ayy)+sum(Ayz)+sum(Azz)                   &
      +sum(Gamx)+sum(Gamy)+sum(Gamz)                                           &
      +sum(Lap)+sum(betax)+sum(betay)+sum(betaz)
  if(dX.ne.dX) then
     if(sum(chi).ne.sum(chi))write(*,*)"bssn.f90: find NaN in chi"
     if(sum(trK).ne.sum(trK))write(*,*)"bssn.f90: find NaN in trk"
     if(sum(dxx).ne.sum(dxx))write(*,*)"bssn.f90: find NaN in dxx"
     if(sum(gxy).ne.sum(gxy))write(*,*)"bssn.f90: find NaN in gxy"
     if(sum(gxz).ne.sum(gxz))write(*,*)"bssn.f90: find NaN in gxz"
     if(sum(dyy).ne.sum(dyy))write(*,*)"bssn.f90: find NaN in dyy"
     if(sum(gyz).ne.sum(gyz))write(*,*)"bssn.f90: find NaN in gyz"
     if(sum(dzz).ne.sum(dzz))write(*,*)"bssn.f90: find NaN in dzz"
     if(sum(Axx).ne.sum(Axx))write(*,*)"bssn.f90: find NaN in Axx"
     if(sum(Axy).ne.sum(Axy))write(*,*)"bssn.f90: find NaN in Axy"
     if(sum(Axz).ne.sum(Axz))write(*,*)"bssn.f90: find NaN in Axz"
     if(sum(Ayy).ne.sum(Ayy))write(*,*)"bssn.f90: find NaN in Ayy"
     if(sum(Ayz).ne.sum(Ayz))write(*,*)"bssn.f90: find NaN in Ayz"
     if(sum(Azz).ne.sum(Azz))write(*,*)"bssn.f90: find NaN in Azz"
     if(sum(Gamx).ne.sum(Gamx))write(*,*)"bssn.f90: find NaN in Gamx"
     if(sum(Gamy).ne.sum(Gamy))write(*,*)"bssn.f90: find NaN in Gamy"
     if(sum(Gamz).ne.sum(Gamz))write(*,*)"bssn.f90: find NaN in Gamz"
     if(sum(Lap).ne.sum(Lap))write(*,*)"bssn.f90: find NaN in Lap"
     if(sum(betax).ne.sum(betax))write(*,*)"bssn.f90: find NaN in betax"
     if(sum(betay).ne.sum(betay))write(*,*)"bssn.f90: find NaN in betay"
     if(sum(betaz).ne.sum(betaz))write(*,*)"bssn.f90: find NaN in betaz"
     gont = 1
     return
  endif
 #endif
  PI = dacos(-ONE)
@@ -634,7 +602,7 @@
  gxxx = (gupxx * chix + gupxy * chiy + gupxz * chiz)/chin1
  gxxy = (gupxy * chix + gupyy * chiy + gupyz * chiz)/chin1
  gxxz = (gupxz * chix + gupyz * chiy + gupzz * chiz)/chin1
-! now get physical second kind of connection
+
  Gamxxx = Gamxxx - ( (chix + chix)/chin1 - gxx * gxxx )*HALF
  Gamyxx = Gamyxx - (                     - gxx * gxxy )*HALF
  Gamzxx = Gamzxx - (                     - gxx * gxxz )*HALF
@@ -945,60 +913,103 @@
  SSA(2)=SYM
  SSA(3)=ANTI
-!!!!!!!!!advection term + Kreiss-Oliger dissipation (merged for cache efficiency)
+!!!!!!!!!advection term part
 ! lopsided_kodis shares the symmetry_bd buffer between advection and
 ! dissipation, eliminating redundant full-grid copies. For metric variables
 ! gxx/gyy/gzz (=dxx/dyy/dzz+1): kodis stencil coefficients sum to zero,
 ! so the constant offset has no effect on dissipation.
-  call lopsided_kodis(ex,X,Y,Z,gxx,gxx_rhs,betax,betay,betaz,Symmetry,SSS,eps)
+  call lopsided(ex,X,Y,Z,gxx,gxx_rhs,betax,betay,betaz,Symmetry,SSS)
-  call lopsided_kodis(ex,X,Y,Z,gxy,gxy_rhs,betax,betay,betaz,Symmetry,AAS,eps)
+  call lopsided(ex,X,Y,Z,gxy,gxy_rhs,betax,betay,betaz,Symmetry,AAS)
-  call lopsided_kodis(ex,X,Y,Z,gxz,gxz_rhs,betax,betay,betaz,Symmetry,ASA,eps)
+  call lopsided(ex,X,Y,Z,gxz,gxz_rhs,betax,betay,betaz,Symmetry,ASA)
-  call lopsided_kodis(ex,X,Y,Z,gyy,gyy_rhs,betax,betay,betaz,Symmetry,SSS,eps)
+  call lopsided(ex,X,Y,Z,gyy,gyy_rhs,betax,betay,betaz,Symmetry,SSS)
-  call lopsided_kodis(ex,X,Y,Z,gyz,gyz_rhs,betax,betay,betaz,Symmetry,SAA,eps)
+  call lopsided(ex,X,Y,Z,gyz,gyz_rhs,betax,betay,betaz,Symmetry,SAA)
-  call lopsided_kodis(ex,X,Y,Z,gzz,gzz_rhs,betax,betay,betaz,Symmetry,SSS,eps)
+  call lopsided(ex,X,Y,Z,gzz,gzz_rhs,betax,betay,betaz,Symmetry,SSS)
-  call lopsided_kodis(ex,X,Y,Z,Axx,Axx_rhs,betax,betay,betaz,Symmetry,SSS,eps)
+  call lopsided(ex,X,Y,Z,Axx,Axx_rhs,betax,betay,betaz,Symmetry,SSS)
-  call lopsided_kodis(ex,X,Y,Z,Axy,Axy_rhs,betax,betay,betaz,Symmetry,AAS,eps)
+  call lopsided(ex,X,Y,Z,Axy,Axy_rhs,betax,betay,betaz,Symmetry,AAS)
-  call lopsided_kodis(ex,X,Y,Z,Axz,Axz_rhs,betax,betay,betaz,Symmetry,ASA,eps)
+  call lopsided(ex,X,Y,Z,Axz,Axz_rhs,betax,betay,betaz,Symmetry,ASA)
-  call lopsided_kodis(ex,X,Y,Z,Ayy,Ayy_rhs,betax,betay,betaz,Symmetry,SSS,eps)
+  call lopsided(ex,X,Y,Z,Ayy,Ayy_rhs,betax,betay,betaz,Symmetry,SSS)
-  call lopsided_kodis(ex,X,Y,Z,Ayz,Ayz_rhs,betax,betay,betaz,Symmetry,SAA,eps)
+  call lopsided(ex,X,Y,Z,Ayz,Ayz_rhs,betax,betay,betaz,Symmetry,SAA)
-  call lopsided_kodis(ex,X,Y,Z,Azz,Azz_rhs,betax,betay,betaz,Symmetry,SSS,eps)
+  call lopsided(ex,X,Y,Z,Azz,Azz_rhs,betax,betay,betaz,Symmetry,SSS)
-  call lopsided_kodis(ex,X,Y,Z,chi,chi_rhs,betax,betay,betaz,Symmetry,SSS,eps)
+  call lopsided(ex,X,Y,Z,chi,chi_rhs,betax,betay,betaz,Symmetry,SSS)
-  call lopsided_kodis(ex,X,Y,Z,trK,trK_rhs,betax,betay,betaz,Symmetry,SSS,eps)
+  call lopsided(ex,X,Y,Z,trK,trK_rhs,betax,betay,betaz,Symmetry,SSS)
-  call lopsided_kodis(ex,X,Y,Z,Gamx,Gamx_rhs,betax,betay,betaz,Symmetry,ASS,eps)
+  call lopsided(ex,X,Y,Z,Gamx,Gamx_rhs,betax,betay,betaz,Symmetry,ASS)
-  call lopsided_kodis(ex,X,Y,Z,Gamy,Gamy_rhs,betax,betay,betaz,Symmetry,SAS,eps)
+  call lopsided(ex,X,Y,Z,Gamy,Gamy_rhs,betax,betay,betaz,Symmetry,SAS)
-  call lopsided_kodis(ex,X,Y,Z,Gamz,Gamz_rhs,betax,betay,betaz,Symmetry,SSA,eps)
+  call lopsided(ex,X,Y,Z,Gamz,Gamz_rhs,betax,betay,betaz,Symmetry,SSA)
-
+!!
 #if 1 
 !! bam does not apply dissipation on gauge variables
  call lopsided_kodis(ex,X,Y,Z,Lap,Lap_rhs,betax,betay,betaz,Symmetry,SSS,eps)
 #if (GAUGE == 0 || GAUGE == 1 || GAUGE == 2 || GAUGE == 3 || GAUGE == 4 || GAUGE == 5 || GAUGE == 6 || GAUGE == 7)
  call lopsided_kodis(ex,X,Y,Z,betax,betax_rhs,betax,betay,betaz,Symmetry,ASS,eps)
  call lopsided_kodis(ex,X,Y,Z,betay,betay_rhs,betax,betay,betaz,Symmetry,SAS,eps)
  call lopsided_kodis(ex,X,Y,Z,betaz,betaz_rhs,betax,betay,betaz,Symmetry,SSA,eps)
 #endif
 #if (GAUGE == 0 || GAUGE == 2 || GAUGE == 3 || GAUGE == 6 || GAUGE == 7)
  call lopsided_kodis(ex,X,Y,Z,dtSfx,dtSfx_rhs,betax,betay,betaz,Symmetry,ASS,eps)
  call lopsided_kodis(ex,X,Y,Z,dtSfy,dtSfy_rhs,betax,betay,betaz,Symmetry,SAS,eps)
  call lopsided_kodis(ex,X,Y,Z,dtSfz,dtSfz_rhs,betax,betay,betaz,Symmetry,SSA,eps)
 #endif
 #else
 ! No dissipation on gauge variables (advection only)
  call lopsided(ex,X,Y,Z,Lap,Lap_rhs,betax,betay,betaz,Symmetry,SSS)
 #if (GAUGE == 0 || GAUGE == 1 || GAUGE == 2 || GAUGE == 3 || GAUGE == 4 || GAUGE == 5 || GAUGE == 6 || GAUGE == 7)
  call lopsided(ex,X,Y,Z,betax,betax_rhs,betax,betay,betaz,Symmetry,ASS)
  call lopsided(ex,X,Y,Z,betay,betay_rhs,betax,betay,betaz,Symmetry,SAS)
  call lopsided(ex,X,Y,Z,betaz,betaz_rhs,betax,betay,betaz,Symmetry,SSA)
 #endif
 #if (GAUGE == 0 || GAUGE == 2 || GAUGE == 3 || GAUGE == 6 || GAUGE == 7)
  call lopsided(ex,X,Y,Z,dtSfx,dtSfx_rhs,betax,betay,betaz,Symmetry,ASS)
  call lopsided(ex,X,Y,Z,dtSfy,dtSfy_rhs,betax,betay,betaz,Symmetry,SAS)
  call lopsided(ex,X,Y,Z,dtSfz,dtSfz_rhs,betax,betay,betaz,Symmetry,SSA)
 #endif
  if(eps>0)then 
 ! usual Kreiss-Oliger dissipation      
  call kodis(ex,X,Y,Z,chi,chi_rhs,SSS,Symmetry,eps)
  call kodis(ex,X,Y,Z,trK,trK_rhs,SSS,Symmetry,eps)
  call kodis(ex,X,Y,Z,dxx,gxx_rhs,SSS,Symmetry,eps)
  call kodis(ex,X,Y,Z,gxy,gxy_rhs,AAS,Symmetry,eps)
  call kodis(ex,X,Y,Z,gxz,gxz_rhs,ASA,Symmetry,eps)
  call kodis(ex,X,Y,Z,dyy,gyy_rhs,SSS,Symmetry,eps)
  call kodis(ex,X,Y,Z,gyz,gyz_rhs,SAA,Symmetry,eps)
  call kodis(ex,X,Y,Z,dzz,gzz_rhs,SSS,Symmetry,eps)
 #if 0
 #define i 42
 #define j 40
 #define k 40
 if(Lev == 1)then
 write(*,*) X(i),Y(j),Z(k)
 write(*,*) "before",Axx_rhs(i,j,k)
 endif
 #undef i
 #undef j
 #undef k
 !!stop
 #endif
  call kodis(ex,X,Y,Z,Axx,Axx_rhs,SSS,Symmetry,eps)
 #if 0
 #define i 42
 #define j 40
 #define k 40
 if(Lev == 1)then
 write(*,*) X(i),Y(j),Z(k)
 write(*,*) "after",Axx_rhs(i,j,k)
 endif
 #undef i
 #undef j
 #undef k
 !!stop
 #endif
  call kodis(ex,X,Y,Z,Axy,Axy_rhs,AAS,Symmetry,eps)
  call kodis(ex,X,Y,Z,Axz,Axz_rhs,ASA,Symmetry,eps)
  call kodis(ex,X,Y,Z,Ayy,Ayy_rhs,SSS,Symmetry,eps)
  call kodis(ex,X,Y,Z,Ayz,Ayz_rhs,SAA,Symmetry,eps)
  call kodis(ex,X,Y,Z,Azz,Azz_rhs,SSS,Symmetry,eps)
  call kodis(ex,X,Y,Z,Gamx,Gamx_rhs,ASS,Symmetry,eps)
  call kodis(ex,X,Y,Z,Gamy,Gamy_rhs,SAS,Symmetry,eps)
  call kodis(ex,X,Y,Z,Gamz,Gamz_rhs,SSA,Symmetry,eps)
 #if 1 
 !! bam does not apply dissipation on gauge variables
  call kodis(ex,X,Y,Z,Lap,Lap_rhs,SSS,Symmetry,eps)
  call kodis(ex,X,Y,Z,betax,betax_rhs,ASS,Symmetry,eps)
  call kodis(ex,X,Y,Z,betay,betay_rhs,SAS,Symmetry,eps)
  call kodis(ex,X,Y,Z,betaz,betaz_rhs,SSA,Symmetry,eps)
 #if (GAUGE == 0 || GAUGE == 2 || GAUGE == 3 || GAUGE == 6 || GAUGE == 7)
  call kodis(ex,X,Y,Z,dtSfx,dtSfx_rhs,ASS,Symmetry,eps)
  call kodis(ex,X,Y,Z,dtSfy,dtSfy_rhs,SAS,Symmetry,eps)
  call kodis(ex,X,Y,Z,dtSfz,dtSfz_rhs,SSA,Symmetry,eps)
 #endif
 #endif
  endif
  if(co == 0)then
 ! ham_Res = trR + 2/3 * K^2 - A_ij * A^ij - 16 * PI * rho
--- a/AMSS_NCKU_source/cgh.C
+++ b/AMSS_NCKU_source/cgh.C
@@ -43,14 +43,6 @@ cgh::cgh(int ingfsi, int fngfsi, int Symmetry, char *filename, int checkrun,
  end_rank = 0;
 #endif
  // Initialize load balancing variables
  enable_load_balance = false;
  load_balance_check_interval = 10;  // Check every 10 time steps
  current_time_step = 0;
  rank_interp_times = nullptr;
  heavy_ranks = nullptr;
  num_heavy_ranks = 0;
  if (!checkrun)
  {
    read_bbox(Symmetry, filename);
@@ -121,12 +113,6 @@ cgh::~cgh()
    delete[] Porgls[lev];
  }
  delete[] Porgls;
  // Clean up load balancing memory
  if (rank_interp_times)
    delete[] rank_interp_times;
  if (heavy_ranks)
    delete[] heavy_ranks;
 }
 //================================================================================================
@@ -144,7 +130,7 @@ void cgh::compose_cgh(int nprocs)
  for (int lev = 0; lev < levels; lev++)
  {
    checkPatchList(PatL[lev], false);
-    Parallel::distribute_hard(PatL[lev], nprocs, ingfs, fngfs, false);
+    Parallel::distribute(PatL[lev], nprocs, ingfs, fngfs, false);
 #if (RPB == 1)
    // we need distributed box of PatL[lev] and PatL[lev-1]
    if (lev > 0)
@@ -1719,121 +1705,3 @@ void cgh::settrfls(const int lev)
 {
  trfls = lev;
 }
 //================================================================================================
 // Load Balancing Functions
 //================================================================================================
 // Initialize load balancing: (re)allocate the per-rank timing table
 // (nprocs entries, all zero) and the heavy-rank list, and reset the
 // heavy-rank count.
 void cgh::init_load_balance(int nprocs)
 {
  // delete[] on a null pointer is a no-op, so no explicit null test is needed
  delete[] rank_interp_times;
  delete[] heavy_ranks;
  // value-initialisation zeroes every entry of the timing table
  rank_interp_times = new double[nprocs]();
  heavy_ranks = new int[4]; // Maximum 4 heavy ranks
  num_heavy_ranks = 0;
 }
 // Update interpolation time for a rank: store `time` in slot `rank` of the
 // timing table. Silently ignored when the table has not been allocated or
 // when `rank` is negative.
 // NOTE(review): there is no upper bound check on `rank` here; the caller
 // must keep it below the nprocs passed to init_load_balance.
 void cgh::update_interp_time(int rank, double time)
 {
  if (!rank_interp_times || rank < 0)
    return;
  rank_interp_times[rank] = time;
 }
 // Check and perform load balancing if needed
 bool cgh::check_and_rebalance(int nprocs, int lev,
                               MyList<var> *OldList, MyList<var> *StateList,
                               MyList<var> *FutureList, MyList<var> *tmList,
                               int Symmetry, bool BB)
 {
  int myrank;
  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
  // Only check at specified intervals
  current_time_step++;
  if (current_time_step % load_balance_check_interval != 0)
    return false;
  if (myrank == 0)
  {
    cout << "\n=== Checking load balance at time step " << current_time_step << " ===" << endl;
  }
  // Collect all rank times on rank 0
  double *all_times = nullptr;
  if (myrank == 0)
  {
    all_times = new double[nprocs];
  }
  MPI_Gather(rank_interp_times, 1, MPI_DOUBLE, all_times, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
  bool need_rebalance = false;
  if (myrank == 0)
  {
    // Check if load balancing is needed
    need_rebalance = Parallel::check_load_balance_need(all_times, nprocs, num_heavy_ranks, heavy_ranks);
    if (need_rebalance)
    {
      cout << "=== Load imbalance detected! Need to rebalance ===" << endl;
      cout << "Top " << num_heavy_ranks << " heavy ranks: ";
      for (int i = 0; i < num_heavy_ranks; i++)
      {
        cout << heavy_ranks[i] << " (" << all_times[heavy_ranks[i]] << " s) ";
      }
      cout << endl;
      // Analyze blocks that need to be split
      Parallel::split_heavy_blocks(PatL[lev], heavy_ranks, num_heavy_ranks, 2, nprocs, ingfs, fngfs);
      // Set lev_flag to trigger recompose_cgh
      cout << "=== Triggering recompose_cgh for level " << lev << " ===" << endl;
    }
    else
    {
      cout << "=== Load is balanced, no rebalancing needed ===" << endl;
    }
    delete[] all_times;
  }
  // Broadcast the decision to all ranks
  MPI_Bcast(&need_rebalance, 1, MPI_C_BOOL, 0, MPI_COMM_WORLD);
  if (need_rebalance)
  {
    // Broadcast heavy ranks information
    MPI_Bcast(&num_heavy_ranks, 1, MPI_INT, 0, MPI_COMM_WORLD);
    MPI_Bcast(heavy_ranks, num_heavy_ranks, MPI_INT, 0, MPI_COMM_WORLD);
    // Perform recompose_cgh on the specified level
    if (myrank == 0)
    {
      cout << "=== Performing recompose_cgh ===" << endl;
    }
    // Call recompose_cgh_Onelevel for the specified level
    bool *lev_flag = new bool[1];
    lev_flag[0] = true;
    recompose_cgh_Onelevel(nprocs, lev, OldList, StateList, FutureList, tmList, Symmetry, BB);
    delete[] lev_flag;
    // Reset time counter after rebalancing
    current_time_step = 0;
    return true;
  }
  return false;
 }
--- a/AMSS_NCKU_source/cgh.h
+++ b/AMSS_NCKU_source/cgh.h
@@ -87,21 +87,6 @@ public:
 #if (PSTR == 1 || PSTR == 2 || PSTR == 3)
   void construct_mylev(int nprocs);
 #endif
   // Load balancing support
   bool enable_load_balance;         // Enable load balancing
   int load_balance_check_interval;  // Check interval (in time steps)
   int current_time_step;            // Current time step counter
   double *rank_interp_times;        // Store interpolation times for each rank
   int *heavy_ranks;                 // Store heavy rank numbers
   int num_heavy_ranks;              // Number of heavy ranks
   void init_load_balance(int nprocs);
   void update_interp_time(int rank, double time);
   bool check_and_rebalance(int nprocs, int lev,
                           MyList<var> *OldList, MyList<var> *StateList,
                           MyList<var> *FutureList, MyList<var> *tmList,
                           int Symmetry, bool BB);
 };
 #endif /* CGH_H */
--- a/AMSS_NCKU_source/diff_new.f90
+++ b/AMSS_NCKU_source/diff_new.f90
@@ -69,8 +69,6 @@
  fy = ZEO
  fz = ZEO
 !DIR$ SIMD VECTORLENGTHFOR(KNOWN_INTEGER=8)
 !DIR$ UNROLL PARTIAL(4)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
@@ -373,8 +371,6 @@
  fxz = ZEO
  fyz = ZEO
 !DIR$ SIMD VECTORLENGTHFOR(KNOWN_INTEGER=8)
 !DIR$ UNROLL PARTIAL(4)
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
--- a/AMSS_NCKU_source/extention/include/xh_bssn_rhs_compute.h
+++ b/AMSS_NCKU_source/extention/include/xh_bssn_rhs_compute.h
@@ -0,0 +1,26 @@
 #include "xh_macrodef.h"
 #include "xh_tool.h"
 /*
  * Prototype of the BSSN right-hand-side routine.
  *
  * Argument groups, in order (grouping inferred from the parameter names
  * only -- confirm against the implementation):
  *   ex, T, X/Y/Z                      grid extents, time, coordinate arrays
  *   chi .. dtSfz                      input fields
  *   chi_rhs .. dtSfz_rhs              one *_rhs array per input field
  *   rho, Sx..Sz, Sxx..Szz             source-term arrays
  *   Gamxxx .. Gamzzz, Rxx .. Rzz      connection and Ricci-like arrays
  *   ham_Res .. Gmz_Res                residual arrays
  *   Symmetry, Lev, eps, co            scalar controls, passed by reference
  *
  * NOTE(review): scalars are taken by reference, presumably to stay
  * call-compatible with the Fortran routine of the same name -- confirm.
  * The meaning of the int return value is not visible here.
  */
 int f_compute_rhs_bssn(int *ex, double &T, 
                       double *X, double *Y, double *Z,
                       double *chi, double *trK,
                       double *dxx, double *gxy, double *gxz, double *dyy, double *gyz, double *dzz,
                       double *Axx, double *Axy, double *Axz, double *Ayy, double *Ayz, double *Azz,
                       double *Gamx, double *Gamy, double *Gamz,
                       double *Lap, double *betax, double *betay, double *betaz,
                       double *dtSfx, double *dtSfy, double *dtSfz,
                       double *chi_rhs, double *trK_rhs,
                       double *gxx_rhs, double *gxy_rhs, double *gxz_rhs, double *gyy_rhs, double *gyz_rhs, double *gzz_rhs,
                       double *Axx_rhs, double *Axy_rhs, double *Axz_rhs, double *Ayy_rhs, double *Ayz_rhs, double *Azz_rhs,
                       double *Gamx_rhs, double *Gamy_rhs, double *Gamz_rhs,
                       double *Lap_rhs, double *betax_rhs, double *betay_rhs, double *betaz_rhs,
                       double *dtSfx_rhs, double *dtSfy_rhs, double *dtSfz_rhs,
                       double *rho, double *Sx, double *Sy, double *Sz,
                       double *Sxx, double *Sxy, double *Sxz, double *Syy, double *Syz, double *Szz,
                       double *Gamxxx, double *Gamxxy, double *Gamxxz, double *Gamxyy, double *Gamxyz, double *Gamxzz,
                       double *Gamyxx, double *Gamyxy, double *Gamyxz, double *Gamyyy, double *Gamyyz, double *Gamyzz,
                       double *Gamzxx, double *Gamzxy, double *Gamzxz, double *Gamzyy, double *Gamzyz, double *Gamzzz,
                       double *Rxx, double *Rxy, double *Rxz, double *Ryy, double *Ryz, double *Rzz,
                       double *ham_Res, double *movx_Res, double *movy_Res, double *movz_Res,
                       double *Gmx_Res, double *Gmy_Res, double *Gmz_Res,
                       int &Symmetry, int &Lev, double &eps, int &co
                       ); 
--- a/AMSS_NCKU_source/extention/include/xh_macrodef.h
+++ b/AMSS_NCKU_source/extention/include/xh_macrodef.h
@@ -0,0 +1,66 @@
 /* tetrad notes
   v:r; u: phi; w: theta
   tetradtype 0
   v^a = (x,y,z)
   orthonormal order: v,u,w
   m = (phi - i theta)/sqrt(2) following Frans, Eq.(8) of  PRD 75, 124018(2007)
   tetradtype 1
   orthonormal order: w,u,v
   m = (theta + i phi)/sqrt(2) following Sperhake, Eq.(3.2) of  PRD 85, 124062(2012)
   tetradtype 2
   v_a = (x,y,z)
   orthonormal order: v,u,w
   m = (phi - i theta)/sqrt(2) following Frans, Eq.(8) of  PRD 75, 124018(2007)
 */
 #define tetradtype 2
 /* Cell center or Vertex center */
 #define Cell
 /* ghost_width meaning:
   2nd order: 2
   4th order: 3
   6th order: 4
   8th order: 5
 */
 #define ghost_width 3
 /* use shell or not */
 #define WithShell
 /* use constraint preserving boundary condition or not
   only affect Z4c
 */
 #define CPBC
 /* Gauge condition type
   0: B^i gauge
   1: David's puncture gauge
   2: MB B^i gauge
   3: RIT B^i gauge
   4: MB beta gauge (beta gauge not means Eq.(3) of PRD 84, 124006)
   5: RIT beta gauge (beta gauge not means Eq.(3) of PRD 84, 124006)
   6: MGB1 B^i gauge
   7: MGB2 B^i gauge
 */
 #define GAUGE 2
 /* buffer points for CPBC boundary */
 #define CPBC_ghost_width (ghost_width)
 /* using BSSN variable for constraint violation and psi4 calculation: 0
   using ADM variable for constraint violation and psi4 calculation: 1
 */
 #define ABV 0
 /* Type of Potential and Scalar Distribution in F(R) Scalar-Tensor Theory
   1: Case C of 1112.3928, V=0
   2: shell with a2^2*phi0/(1+a2^2), f(R) = R+a2*R^2 induced V
   3: ground state of Schrodinger-Newton system, f(R) = R+a2*R^2 induced V
   4: a2 = infinity and phi(r) = phi0 * 0.5 * ( tanh((r+r0)/sigma) - tanh((r-r0)/sigma) )
   5: shell with phi(r) = phi0*Exp(-(r-r0)**2/sigma), V = 0
 */
 #define EScalar_CC 2
--- a/AMSS_NCKU_source/extention/include/xh_share_func.h
+++ b/AMSS_NCKU_source/extention/include/xh_share_func.h
@@ -0,0 +1,338 @@
 #ifndef SHARE_FUNC_H
 #define SHARE_FUNC_H
 #include <stdlib.h>
 #include <stddef.h>
 #include <math.h>
 #include <stdio.h>
 #include <omp.h>
 /* Main grid: map 0-based (i0, j0, k0) to a flat offset, i varying fastest. */
 static inline size_t idx_ex(int i0, int j0, int k0, const int ex[3]) {
    const size_t row   = (size_t)ex[0];          /* stride of one j step */
    const size_t plane = row * (size_t)ex[1];    /* stride of one k step */
    return (size_t)i0 + (size_t)j0 * row + (size_t)k0 * plane;
 }
 /*
 * Flat offset into a ghost-padded buffer fh laid out like the Fortran array
 * fh(-1:ex1, -1:ex2, -1:ex3), i.e. ord = 2 and an index shift of 1.
 * iF/jF/kF are Fortran indices (may be -1, 0, 1 .. ex).
 */
 static inline size_t idx_fh_F_ord2(int iF, int jF, int kF, const int ex[3]) {
    enum { GHOST_SHIFT = 1, PADDING = 2 };
    const size_t nx = (size_t)(ex[0] + PADDING);   /* ex1 + ord */
    const size_t ny = (size_t)(ex[1] + PADDING);   /* ex2 + ord */
    const size_t ii = (size_t)(iF + GHOST_SHIFT);  /* 0 .. ex1+1 */
    const size_t jj = (size_t)(jF + GHOST_SHIFT);  /* 0 .. ex2+1 */
    const size_t kk = (size_t)(kF + GHOST_SHIFT);  /* 0 .. ex3+1 */
    return ii + nx * (jj + ny * kk);
 }
 /*
 * Flat offset into a ghost-padded buffer fh laid out like the Fortran array
 * fh(-2:ex1, -2:ex2, -2:ex3), i.e. ord = 3 and an index shift of 2.
 * iF/jF/kF are Fortran indices (may be negative).
 */
 static inline size_t idx_fh_F(int iF, int jF, int kF, const int ex[3]) {
    enum { GHOST_SHIFT = 2, PADDING = 3 };
    const size_t nx = (size_t)(ex[0] + PADDING);   /* ex1 + ord */
    const size_t ny = (size_t)(ex[1] + PADDING);   /* ex2 + ord */
    const size_t ii = (size_t)(iF + GHOST_SHIFT);  /* 0 .. ex1+2 */
    const size_t jj = (size_t)(jF + GHOST_SHIFT);  /* 0 .. ex2+2 */
    const size_t kk = (size_t)(kF + GHOST_SHIFT);  /* 0 .. ex3+2 */
    return ii + nx * (jj + ny * kk);
 }
 /*
 * func:  (1..extc1, 1..extc2, 1..extc3)   1-based in Fortran
 * funcc: (-ord+1..extc1, -ord+1..extc2, -ord+1..extc3) in Fortran
 *
 * On the C side:
 *   func  is addressed 0-based: i0=0..extc1-1, j0=0..extc2-1, k0=0..extc3-1
 *   funcc is stored as a flat array with shifted indices:
 *     iF in [-ord+1..extc1]  -> ii = iF + (ord-1)  in [0..extc1+ord-1]
 *     row length nx = extc1 + ord
 *     likewise ny = extc2 + ord, nz = extc3 + ord
 */
 static inline size_t idx_func0(int i0, int j0, int k0, const int extc[3]) {
    const size_t row   = (size_t)extc[0];
    const size_t plane = row * (size_t)extc[1];
    return (size_t)i0 + (size_t)j0 * row + (size_t)k0 * plane;
 }
 /*
 * Flat offset of Fortran index (iF, jF, kF) inside funcc, whose Fortran
 * bounds are (-ord+1 .. extc) in every direction.
 */
 static inline size_t idx_funcc_F(int iF, int jF, int kF, int ord, const int extc[3]) {
    const int offset = ord - 1;                     /* iF = -offset .. extc1 */
    const size_t nx = (size_t)(extc[0] + ord);      /* extc1 + ord entries per row */
    const size_t ny = (size_t)(extc[1] + ord);
    const size_t ii = (size_t)(iF + offset);        /* 0 .. extc1+offset */
    const size_t jj = (size_t)(jF + offset);        /* 0 .. extc2+offset */
    const size_t kk = (size_t)(kF + offset);        /* 0 .. extc3+offset */
    return ii + nx * (jj + ny * kk);
 }
 /*
 * Copy func into the interior of funcc and fill the ord ghost layers on the
 * low side of each axis by mirroring, scaled by the per-axis factor SoA.
 *
 * Equivalent Fortran:
 * funcc(1:extc1,1:extc2,1:extc3)=func
 * do i=0,ord-1
 *   funcc(-i,1:extc2,1:extc3) = funcc(i+1,1:extc2,1:extc3)*SoA(1)
 * enddo
 * do i=0,ord-1
 *   funcc(:,-i,1:extc3) = funcc(:,i+1,1:extc3)*SoA(2)
 * enddo
 * do i=0,ord-1
 *   funcc(:,:,-i) = funcc(:,:,i+1)*SoA(3)
 * enddo
 */
 static inline void symmetry_bd(int ord,
                 const int extc[3],
                 const double *func,
                 double *funcc,
                 const double SoA[3])
 {
    const int nx = extc[0], ny = extc[1], nz = extc[2];
    const int lowF = 1 - ord;   /* lowest Fortran index stored in funcc */

    /* Interior: Fortran index iF = 1..nx corresponds to C index iF-1 of func. */
    for (int kF = 1; kF <= nz; ++kF)
        for (int jF = 1; jF <= ny; ++jF)
            for (int iF = 1; iF <= nx; ++iF)
                funcc[idx_funcc_F(iF, jF, kF, ord, extc)] =
                    func[idx_func0(iF - 1, jF - 1, kF - 1, extc)];

    /* x ghost layers: plane -g mirrors plane g+1 (interior j and k only). */
    for (int g = 0; g < ord; ++g) {
        for (int kF = 1; kF <= nz; ++kF) {
            for (int jF = 1; jF <= ny; ++jF) {
                const double mirrored = funcc[idx_funcc_F(g + 1, jF, kF, ord, extc)];
                funcc[idx_funcc_F(-g, jF, kF, ord, extc)] = mirrored * SoA[0];
            }
        }
    }

    /* y ghost layers: the Fortran ":" spans every stored i, ghosts included. */
    for (int g = 0; g < ord; ++g) {
        for (int kF = 1; kF <= nz; ++kF) {
            for (int iF = lowF; iF <= nx; ++iF) {
                const double mirrored = funcc[idx_funcc_F(iF, g + 1, kF, ord, extc)];
                funcc[idx_funcc_F(iF, -g, kF, ord, extc)] = mirrored * SoA[1];
            }
        }
    }

    /* z ghost layers: spans every stored i and j, ghosts included. */
    for (int g = 0; g < ord; ++g) {
        for (int jF = lowF; jF <= ny; ++jF) {
            for (int iF = lowF; iF <= nx; ++iF) {
                const double mirrored = funcc[idx_funcc_F(iF, jF, g + 1, ord, extc)];
                funcc[idx_funcc_F(iF, jF, -g, ord, extc)] = mirrored * SoA[2];
            }
        }
    }
 }
 #endif
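 /* Relies on the helpers defined above (idx_ex / idx_fh_F_ord2) and on the fh layout they describe.
  * NOTE(review): the #endif directly above closes the SHARE_FUNC_H include guard, so the
  * definition below sits outside the guard. */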
 static inline void fdderivs_xh(
    int i0, int j0, int k0,
    const int ex[3],
    const double *fh,
    int iminF, int jminF, int kminF,
    int imaxF, int jmaxF, int kmaxF,
    double Fdxdx, double Fdydy, double Fdzdz,
    double Fdxdy, double Fdxdz, double Fdydz,
    double Sdxdx, double Sdydy, double Sdzdz,
    double Sdxdy, double Sdxdz, double Sdydz,
    double *fxx, double *fxy, double *fxz,
    double *fyy, double *fyz, double *fzz
 ){
    const double F8  = 8.0;
    const double F16 = 16.0;
    const double F30 = 30.0;
    const double TWO = 2.0;
    const int iF = i0 + 1;
    const int jF = j0 + 1;
    const int kF = k0 + 1;
    const size_t p = idx_ex(i0, j0, k0, ex);
    /* 高阶分支：i±2,j±2,k±2 都在范围内 */
    if ((iF + 2) <= imaxF && (iF - 2) >= iminF &&
        (jF + 2) <= jmaxF && (jF - 2) >= jminF &&
        (kF + 2) <= kmaxF && (kF - 2) >= kminF)
    {
        fxx[p] = Fdxdx * (
            -fh[idx_fh_F_ord2(iF - 2, jF,     kF,     ex)] +
             F16 * fh[idx_fh_F_ord2(iF - 1, jF,     kF,     ex)] -
             F30 * fh[idx_fh_F_ord2(iF,     jF,     kF,     ex)] -
             fh[idx_fh_F_ord2(iF + 2, jF,     kF,     ex)] +
             F16 * fh[idx_fh_F_ord2(iF + 1, jF,     kF,     ex)]
        );
        fyy[p] = Fdydy * (
            -fh[idx_fh_F_ord2(iF,     jF - 2, kF,     ex)] +
             F16 * fh[idx_fh_F_ord2(iF,     jF - 1, kF,     ex)] -
             F30 * fh[idx_fh_F_ord2(iF,     jF,     kF,     ex)] -
             fh[idx_fh_F_ord2(iF,     jF + 2, kF,     ex)] +
             F16 * fh[idx_fh_F_ord2(iF,     jF + 1, kF,     ex)]
        );
        fzz[p] = Fdzdz * (
            -fh[idx_fh_F_ord2(iF,     jF,     kF - 2, ex)] +
             F16 * fh[idx_fh_F_ord2(iF,     jF,     kF - 1, ex)] -
             F30 * fh[idx_fh_F_ord2(iF,     jF,     kF,     ex)] -
             fh[idx_fh_F_ord2(iF,     jF,     kF + 2, ex)] +
             F16 * fh[idx_fh_F_ord2(iF,     jF,     kF + 1, ex)]
        );
        /* fxy 高阶 */
        {
            const double t_jm2 =
                ( fh[idx_fh_F_ord2(iF - 2, jF - 2, kF, ex)]
                 -F8*fh[idx_fh_F_ord2(iF - 1, jF - 2, kF, ex)]
                 +F8*fh[idx_fh_F_ord2(iF + 1, jF - 2, kF, ex)]
                 -    fh[idx_fh_F_ord2(iF + 2, jF - 2, kF, ex)] );
            const double t_jm1 =
                ( fh[idx_fh_F_ord2(iF - 2, jF - 1, kF, ex)]
                 -F8*fh[idx_fh_F_ord2(iF - 1, jF - 1, kF, ex)]
                 +F8*fh[idx_fh_F_ord2(iF + 1, jF - 1, kF, ex)]
                 -    fh[idx_fh_F_ord2(iF + 2, jF - 1, kF, ex)] );
            const double t_jp1 =
                ( fh[idx_fh_F_ord2(iF - 2, jF + 1, kF, ex)]
                 -F8*fh[idx_fh_F_ord2(iF - 1, jF + 1, kF, ex)]
                 +F8*fh[idx_fh_F_ord2(iF + 1, jF + 1, kF, ex)]
                 -    fh[idx_fh_F_ord2(iF + 2, jF + 1, kF, ex)] );
            const double t_jp2 =
                ( fh[idx_fh_F_ord2(iF - 2, jF + 2, kF, ex)]
                 -F8*fh[idx_fh_F_ord2(iF - 1, jF + 2, kF, ex)]
                 +F8*fh[idx_fh_F_ord2(iF + 1, jF + 2, kF, ex)]
                 -    fh[idx_fh_F_ord2(iF + 2, jF + 2, kF, ex)] );
            fxy[p] = Fdxdy * ( t_jm2 - F8 * t_jm1 + F8 * t_jp1 - t_jp2 );
        }
        /* fxz 高阶 */
        {
            const double t_km2 =
                ( fh[idx_fh_F_ord2(iF - 2, jF, kF - 2, ex)]
                 -F8*fh[idx_fh_F_ord2(iF - 1, jF, kF - 2, ex)]
                 +F8*fh[idx_fh_F_ord2(iF + 1, jF, kF - 2, ex)]
                 -    fh[idx_fh_F_ord2(iF + 2, jF, kF - 2, ex)] );
            const double t_km1 =
                ( fh[idx_fh_F_ord2(iF - 2, jF, kF - 1, ex)]
                 -F8*fh[idx_fh_F_ord2(iF - 1, jF, kF - 1, ex)]
                 +F8*fh[idx_fh_F_ord2(iF + 1, jF, kF - 1, ex)]
                 -    fh[idx_fh_F_ord2(iF + 2, jF, kF - 1, ex)] );
            const double t_kp1 =
                ( fh[idx_fh_F_ord2(iF - 2, jF, kF + 1, ex)]
                 -F8*fh[idx_fh_F_ord2(iF - 1, jF, kF + 1, ex)]
                 +F8*fh[idx_fh_F_ord2(iF + 1, jF, kF + 1, ex)]
                 -    fh[idx_fh_F_ord2(iF + 2, jF, kF + 1, ex)] );
            const double t_kp2 =
                ( fh[idx_fh_F_ord2(iF - 2, jF, kF + 2, ex)]
                 -F8*fh[idx_fh_F_ord2(iF - 1, jF, kF + 2, ex)]
                 +F8*fh[idx_fh_F_ord2(iF + 1, jF, kF + 2, ex)]
                 -    fh[idx_fh_F_ord2(iF + 2, jF, kF + 2, ex)] );
            fxz[p] = Fdxdz * ( t_km2 - F8 * t_km1 + F8 * t_kp1 - t_kp2 );
        }
        /* fyz 高阶 */
        {
            const double t_km2 =
                ( fh[idx_fh_F_ord2(iF, jF - 2, kF - 2, ex)]
                 -F8*fh[idx_fh_F_ord2(iF, jF - 1, kF - 2, ex)]
                 +F8*fh[idx_fh_F_ord2(iF, jF + 1, kF - 2, ex)]
                 -    fh[idx_fh_F_ord2(iF, jF + 2, kF - 2, ex)] );
            const double t_km1 =
                ( fh[idx_fh_F_ord2(iF, jF - 2, kF - 1, ex)]
                 -F8*fh[idx_fh_F_ord2(iF, jF - 1, kF - 1, ex)]
                 +F8*fh[idx_fh_F_ord2(iF, jF + 1, kF - 1, ex)]
                 -    fh[idx_fh_F_ord2(iF, jF + 2, kF - 1, ex)] );
            const double t_kp1 =
                ( fh[idx_fh_F_ord2(iF, jF - 2, kF + 1, ex)]
                 -F8*fh[idx_fh_F_ord2(iF, jF - 1, kF + 1, ex)]
                 +F8*fh[idx_fh_F_ord2(iF, jF + 1, kF + 1, ex)]
                 -    fh[idx_fh_F_ord2(iF, jF + 2, kF + 1, ex)] );
            const double t_kp2 =
                ( fh[idx_fh_F_ord2(iF, jF - 2, kF + 2, ex)]
                 -F8*fh[idx_fh_F_ord2(iF, jF - 1, kF + 2, ex)]
                 +F8*fh[idx_fh_F_ord2(iF, jF + 1, kF + 2, ex)]
                 -    fh[idx_fh_F_ord2(iF, jF + 2, kF + 2, ex)] );
            fyz[p] = Fdydz * ( t_km2 - F8 * t_km1 + F8 * t_kp1 - t_kp2 );
        }
    }
    /* 二阶分支：i±1,j±1,k±1 在范围内 */
    else if ((iF + 1) <= imaxF && (iF - 1) >= iminF &&
             (jF + 1) <= jmaxF && (jF - 1) >= jminF &&
             (kF + 1) <= kmaxF && (kF - 1) >= kminF)
    {
        fxx[p] = Sdxdx * (
            fh[idx_fh_F_ord2(iF - 1, jF,     kF,     ex)] -
            TWO * fh[idx_fh_F_ord2(iF,     jF,     kF,     ex)] +
            fh[idx_fh_F_ord2(iF + 1, jF,     kF,     ex)]
        );
        fyy[p] = Sdydy * (
            fh[idx_fh_F_ord2(iF,     jF - 1, kF,     ex)] -
            TWO * fh[idx_fh_F_ord2(iF,     jF,     kF,     ex)] +
            fh[idx_fh_F_ord2(iF,     jF + 1, kF,     ex)]
        );
        fzz[p] = Sdzdz * (
            fh[idx_fh_F_ord2(iF,     jF,     kF - 1, ex)] -
            TWO * fh[idx_fh_F_ord2(iF,     jF,     kF,     ex)] +
            fh[idx_fh_F_ord2(iF,     jF,     kF + 1, ex)]
        );
        fxy[p] = Sdxdy * (
            fh[idx_fh_F_ord2(iF - 1, jF - 1, kF, ex)] -
            fh[idx_fh_F_ord2(iF + 1, jF - 1, kF, ex)] -
            fh[idx_fh_F_ord2(iF - 1, jF + 1, kF, ex)] +
            fh[idx_fh_F_ord2(iF + 1, jF + 1, kF, ex)]
        );
        fxz[p] = Sdxdz * (
            fh[idx_fh_F_ord2(iF - 1, jF, kF - 1, ex)] -
            fh[idx_fh_F_ord2(iF + 1, jF, kF - 1, ex)] -
            fh[idx_fh_F_ord2(iF - 1, jF, kF + 1, ex)] +
            fh[idx_fh_F_ord2(iF + 1, jF, kF + 1, ex)]
        );
        fyz[p] = Sdydz * (
            fh[idx_fh_F_ord2(iF, jF - 1, kF - 1, ex)] -
            fh[idx_fh_F_ord2(iF, jF + 1, kF - 1, ex)] -
            fh[idx_fh_F_ord2(iF, jF - 1, kF + 1, ex)] +
            fh[idx_fh_F_ord2(iF, jF + 1, kF + 1, ex)]
        );
    }
    else {
        fxx[p] = 0.0; fyy[p] = 0.0; fzz[p] = 0.0;
        fxy[p] = 0.0; fxz[p] = 0.0; fyz[p] = 0.0;
    }
 }
--- a/AMSS_NCKU_source/extention/include/xh_tool.h
+++ b/AMSS_NCKU_source/extention/include/xh_tool.h
@@ -0,0 +1,27 @@
 #include "xh_share_func.h"
 /*
  * Second partial derivatives of the grid function f.
  *   ex            grid extents in x, y, z
  *   f             input field on the ex[0]*ex[1]*ex[2] grid
  *   fxx..fzz      output arrays for the six second derivatives
  *   X, Y, Z       coordinate arrays; the grid spacing is taken from their first two entries
  *   SYM1..SYM3    factors applied to mirrored values when ghost points are filled
  *   Symmetry      selects which lower boundaries may use mirrored ghost points
  *   onoff         accepted for interface compatibility; the visible definition ignores it
  */
 void fdderivs(const int ex[3],
              const double *f,
              double *fxx, double *fxy, double *fxz,
              double *fyy, double *fyz, double *fzz,
              const double *X, const double *Y, const double *Z,
              double SYM1, double SYM2, double SYM3,
              int Symmetry, int onoff);
 /*
  * NOTE(review): definition not visible from this header. By analogy with
  * fdderivs this presumably computes the first derivatives fx, fy, fz of f
  * with the same grid/symmetry arguments -- confirm against the source file.
  */
 void fderivs(const int ex[3],
             const double *f,
             double *fx, double *fy, double *fz,
             const double *X, const double *Y, const double *Z,
             double SYM1, double SYM2, double SYM3,
             int Symmetry, int onoff);
 /*
  * NOTE(review): definition not visible from this header. Presumably a
  * Kreiss-Oliger style dissipation term of strength eps computed from f and
  * applied to f_rhs, with SoA holding the three symmetry factors -- confirm.
  */
 void kodis(const int ex[3],
           const double *X, const double *Y, const double *Z,
           const double *f, double *f_rhs,
           const double SoA[3],
           int Symmetry, double eps);
 /*
  * NOTE(review): definition not visible from this header. Presumably a
  * lopsided (upwind-biased) derivative of f along the vector field
  * (Sfx, Sfy, Sfz) applied to f_rhs -- confirm against the source file.
  */
 void lopsided(const int ex[3],
              const double *X, const double *Y, const double *Z,
              const double *f, double *f_rhs,
              const double *Sfx, const double *Sfy, const double *Sfz,
              int Symmetry, const double SoA[3]);
--- a/AMSS_NCKU_source/extention/src/bssn_rhs
+++ b/AMSS_NCKU_source/extention/src/bssn_rhs
--- a/AMSS_NCKU_source/extention/src/bssn_rhs-fast.c
+++ b/AMSS_NCKU_source/extention/src/bssn_rhs-fast.c
--- a/AMSS_NCKU_source/extention/src/bssn_rhs-try.c
+++ b/AMSS_NCKU_source/extention/src/bssn_rhs-try.c
--- a/AMSS_NCKU_source/extention/src/fdderivs-fast.c
+++ b/AMSS_NCKU_source/extention/src/fdderivs-fast.c
@@ -0,0 +1,311 @@
 #include "../include/tool.h"
 /*
  * fdderivs -- the six second partial derivatives of a grid function.
  *
  * Parameters
  *   ex          grid extents (ex[0], ex[1], ex[2]); at least 2 points per
  *               direction are needed because the spacing is X[1]-X[0] etc.
  *   f           input field, addressed through idx_func0(i0, j0, k0, ex)
  *   fxx..fzz    output arrays, addressed through idx_ex(i0, j0, k0, ex);
  *               every grid point is written on a successful call
  *   X, Y, Z     coordinate arrays; only the first two entries are read
  *   SYM1..SYM3  factors multiplied into the mirrored ghost values in
  *               x, y and z respectively
  *   Symmetry    > 0 allows ghost points below the z boundary when
  *               |Z[0]| < dZ; > 1 additionally allows them in x and y
  *   onoff       unused, kept for interface compatibility
  *
  * At each point the 4th-order stencil is used when all +-2 neighbours are
  * inside the admissible index range, otherwise the 2nd-order stencil when
  * all +-1 neighbours are, otherwise all six outputs are set to zero.
  *
  * The ghost-extended copy of f lives in a thread-local scratch buffer that
  * is kept between calls and only grows. If that buffer cannot be allocated
  * the function returns without touching the outputs.
  */
 void fdderivs(const int ex[3],
              const double *f,
              double *fxx, double *fxy, double *fxz,
              double *fyy, double *fyz, double *fzz,
              const double *X, const double *Y, const double *Z,
              double SYM1, double SYM2, double SYM3,
              int Symmetry, int onoff)
 {
    (void)onoff;
    const int NO_SYMM = 0, EQ_SYMM = 1;
    const double ONE = 1.0, TWO = 2.0;
    const double F1o4   = 2.5e-1;          /* 1/4 */
    const double F8     = 8.0;
    const double F16    = 16.0;
    const double F30    = 30.0;
    const double F1o12  = ONE / 12.0;
    const double F1o144 = ONE / 144.0;
    const int ex1 = ex[0], ex2 = ex[1], ex3 = ex[2];
    const double dX = X[1] - X[0];
    const double dY = Y[1] - Y[0];
    const double dZ = Z[1] - Z[0];
    /* Admissible 1-based (Fortran-style) index range for stencil points. */
    const int imaxF = ex1;
    const int jmaxF = ex2;
    const int kmaxF = ex3;
    int iminF = 1, jminF = 1, kminF = 1;
    /* A lower bound of -1 lets stencils reach the two mirrored ghost layers. */
    if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -1;
    if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -1;
    if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -1;
    /* fh: (ex1+2)*(ex2+2)*(ex3+2) because ord=2 */
    const size_t nx = (size_t)ex1 + 2;
    const size_t ny = (size_t)ex2 + 2;
    const size_t nz = (size_t)ex3 + 2;
    const size_t fh_size = nx * ny * nz;
    /* Stencil prefactors, following the original Fortran expressions. */
    const double Sdxdx = ONE / (dX * dX);
    const double Sdydy = ONE / (dY * dY);
    const double Sdzdz = ONE / (dZ * dZ);
    const double Fdxdx = F1o12 / (dX * dX);
    const double Fdydy = F1o12 / (dY * dY);
    const double Fdzdz = F1o12 / (dZ * dZ);
    const double Sdxdy = F1o4 / (dX * dY);
    const double Sdxdz = F1o4 / (dX * dZ);
    const double Sdydz = F1o4 / (dY * dZ);
    const double Fdxdy = F1o144 / (dX * dY);
    const double Fdxdz = F1o144 / (dX * dZ);
    const double Fdydz = F1o144 / (dY * dZ);
    /* Scratch buffer reused across calls; it is grown on demand, never freed. */
    static thread_local double *fh = NULL;
    static thread_local size_t cap = 0;
    if (fh_size > cap) {
        /* aligned_alloc wants the byte count to be a multiple of the alignment. */
        const size_t align = 64;
        const size_t bytes = (fh_size * sizeof(double) + align - 1) / align * align;
        free(fh);
        fh = (double*)aligned_alloc(align, bytes);
        /* Record the capacity only on success so a later call retries the allocation. */
        cap = fh ? fh_size : 0;
    }
    if (!fh) return;
    /* Inlined equivalent of symmetry_bd(2, ex, f, fh, SoA). */
    const double SoA[3] = { SYM1, SYM2, SYM3 };
    /* 1) copy the interior: funcc(1:ex1, 1:ex2, 1:ex3) = f */
    for (int k0 = 0; k0 < ex[2]; ++k0) {
        for (int j0 = 0; j0 < ex[1]; ++j0) {
            for (int i0 = 0; i0 < ex[0]; ++i0) {
                const int iF = i0 + 1, jF = j0 + 1, kF = k0 + 1;
                fh[idx_funcc_F(iF, jF, kF, 2, ex)] = f[idx_func0(i0, j0, k0, ex)];
            }
        }
    }
    /* 2) do i=0..ord-1: funcc(-i, 1:extc2, 1:extc3) = funcc(i+1, ...)*SoA(1) */
    for (int ii = 0; ii <= 2 - 1; ++ii) {
        const int iF_dst = -ii;       /* 0, -1 */
        const int iF_src = ii + 1;    /* 1,  2 */
        for (int kF = 1; kF <= ex[2]; ++kF) {
            for (int jF = 1; jF <= ex[1]; ++jF) {
                fh[idx_funcc_F(iF_dst, jF, kF, 2, ex)] =
                    fh[idx_funcc_F(iF_src, jF, kF, 2, ex)] * SoA[0];
            }
        }
    }
    /* 3) do i=0..ord-1: funcc(:,-i, 1:extc3) = funcc(:, i+1, 1:extc3)*SoA(2)
     *    The Fortran ":" spans the full first index range (-ord+1..extc1),
     *    so the x ghost layers filled in step 2 are mirrored as well. */
    for (int jj = 0; jj <= 2 - 1; ++jj) {
        const int jF_dst = -jj;
        const int jF_src = jj + 1;
        for (int kF = 1; kF <= ex[2]; ++kF) {
            for (int iF = -2 + 1; iF <= ex[0]; ++iF) {
                fh[idx_funcc_F(iF, jF_dst, kF, 2, ex)] =
                    fh[idx_funcc_F(iF, jF_src, kF, 2, ex)] * SoA[1];
            }
        }
    }
    /* 4) do i=0..ord-1: funcc(:,:,-i) = funcc(:,:, i+1)*SoA(3) */
    for (int kk = 0; kk <= 2 - 1; ++kk) {
        const int kF_dst = -kk;
        const int kF_src = kk + 1;
        for (int jF = -2 + 1; jF <= ex[1]; ++jF) {
            for (int iF = -2 + 1; iF <= ex[0]; ++iF) {
                fh[idx_funcc_F(iF, jF, kF_dst, 2, ex)] =
                    fh[idx_funcc_F(iF, jF, kF_src, 2, ex)] * SoA[2];
            }
        }
    }
    /*
     * The Fortran loop quoted by the original port runs over 1..ex-1 in each
     * direction after clearing the outputs. Here the loop covers the whole
     * grid instead: points on the upper boundary fail both range tests and
     * take the zero branch, so every output element is defined without a
     * separate clearing pass.
     */
    for (int k0 = 0; k0 < ex3; ++k0) {
        const int kF = k0 + 1;
        for (int j0 = 0; j0 < ex2; ++j0) {
            const int jF = j0 + 1;
            for (int i0 = 0; i0 < ex1; ++i0) {
                const int iF = i0 + 1;
                const size_t p = idx_ex(i0, j0, k0, ex);
                /* 4th-order branch: i+-2, j+-2, k+-2 all inside the range */
                if ((iF + 2) <= imaxF && (iF - 2) >= iminF &&
                    (jF + 2) <= jmaxF && (jF - 2) >= jminF &&
                    (kF + 2) <= kmaxF && (kF - 2) >= kminF)
                {
                    fxx[p] = Fdxdx * (
                        -fh[idx_fh_F_ord2(iF - 2, jF,     kF,     ex)] +
                        F16 * fh[idx_fh_F_ord2(iF - 1, jF,     kF,     ex)] -
                        F30 * fh[idx_fh_F_ord2(iF,     jF,     kF,     ex)] -
                        fh[idx_fh_F_ord2(iF + 2, jF,     kF,     ex)] +
                        F16 * fh[idx_fh_F_ord2(iF + 1, jF,     kF,     ex)]
                    );
                    fyy[p] = Fdydy * (
                        -fh[idx_fh_F_ord2(iF,     jF - 2, kF,     ex)] +
                        F16 * fh[idx_fh_F_ord2(iF,     jF - 1, kF,     ex)] -
                        F30 * fh[idx_fh_F_ord2(iF,     jF,     kF,     ex)] -
                        fh[idx_fh_F_ord2(iF,     jF + 2, kF,     ex)] +
                        F16 * fh[idx_fh_F_ord2(iF,     jF + 1, kF,     ex)]
                    );
                    fzz[p] = Fdzdz * (
                        -fh[idx_fh_F_ord2(iF,     jF,     kF - 2, ex)] +
                        F16 * fh[idx_fh_F_ord2(iF,     jF,     kF - 1, ex)] -
                        F30 * fh[idx_fh_F_ord2(iF,     jF,     kF,     ex)] -
                        fh[idx_fh_F_ord2(iF,     jF,     kF + 2, ex)] +
                        F16 * fh[idx_fh_F_ord2(iF,     jF,     kF + 1, ex)]
                    );
                    /* 4th-order fxy: grouping kept exactly as in the Fortran
                     * expression so the floating-point evaluation order matches */
                    {
                        const double t_jm2 =
                            ( fh[idx_fh_F_ord2(iF - 2, jF - 2, kF, ex)]
                             -F8*fh[idx_fh_F_ord2(iF - 1, jF - 2, kF, ex)]
                             +F8*fh[idx_fh_F_ord2(iF + 1, jF - 2, kF, ex)]
                             -    fh[idx_fh_F_ord2(iF + 2, jF - 2, kF, ex)] );
                        const double t_jm1 =
                            ( fh[idx_fh_F_ord2(iF - 2, jF - 1, kF, ex)]
                             -F8*fh[idx_fh_F_ord2(iF - 1, jF - 1, kF, ex)]
                             +F8*fh[idx_fh_F_ord2(iF + 1, jF - 1, kF, ex)]
                             -    fh[idx_fh_F_ord2(iF + 2, jF - 1, kF, ex)] );
                        const double t_jp1 =
                            ( fh[idx_fh_F_ord2(iF - 2, jF + 1, kF, ex)]
                             -F8*fh[idx_fh_F_ord2(iF - 1, jF + 1, kF, ex)]
                             +F8*fh[idx_fh_F_ord2(iF + 1, jF + 1, kF, ex)]
                             -    fh[idx_fh_F_ord2(iF + 2, jF + 1, kF, ex)] );
                        const double t_jp2 =
                            ( fh[idx_fh_F_ord2(iF - 2, jF + 2, kF, ex)]
                             -F8*fh[idx_fh_F_ord2(iF - 1, jF + 2, kF, ex)]
                             +F8*fh[idx_fh_F_ord2(iF + 1, jF + 2, kF, ex)]
                             -    fh[idx_fh_F_ord2(iF + 2, jF + 2, kF, ex)] );
                        fxy[p] = Fdxdy * ( t_jm2 - F8 * t_jm1 + F8 * t_jp1 - t_jp2 );
                    }
                    /* 4th-order fxz */
                    {
                        const double t_km2 =
                            ( fh[idx_fh_F_ord2(iF - 2, jF, kF - 2, ex)]
                             -F8*fh[idx_fh_F_ord2(iF - 1, jF, kF - 2, ex)]
                             +F8*fh[idx_fh_F_ord2(iF + 1, jF, kF - 2, ex)]
                             -    fh[idx_fh_F_ord2(iF + 2, jF, kF - 2, ex)] );
                        const double t_km1 =
                            ( fh[idx_fh_F_ord2(iF - 2, jF, kF - 1, ex)]
                             -F8*fh[idx_fh_F_ord2(iF - 1, jF, kF - 1, ex)]
                             +F8*fh[idx_fh_F_ord2(iF + 1, jF, kF - 1, ex)]
                             -    fh[idx_fh_F_ord2(iF + 2, jF, kF - 1, ex)] );
                        const double t_kp1 =
                            ( fh[idx_fh_F_ord2(iF - 2, jF, kF + 1, ex)]
                             -F8*fh[idx_fh_F_ord2(iF - 1, jF, kF + 1, ex)]
                             +F8*fh[idx_fh_F_ord2(iF + 1, jF, kF + 1, ex)]
                             -    fh[idx_fh_F_ord2(iF + 2, jF, kF + 1, ex)] );
                        const double t_kp2 =
                            ( fh[idx_fh_F_ord2(iF - 2, jF, kF + 2, ex)]
                             -F8*fh[idx_fh_F_ord2(iF - 1, jF, kF + 2, ex)]
                             +F8*fh[idx_fh_F_ord2(iF + 1, jF, kF + 2, ex)]
                             -    fh[idx_fh_F_ord2(iF + 2, jF, kF + 2, ex)] );
                        fxz[p] = Fdxdz * ( t_km2 - F8 * t_km1 + F8 * t_kp1 - t_kp2 );
                    }
                    /* 4th-order fyz */
                    {
                        const double t_km2 =
                            ( fh[idx_fh_F_ord2(iF, jF - 2, kF - 2, ex)]
                             -F8*fh[idx_fh_F_ord2(iF, jF - 1, kF - 2, ex)]
                             +F8*fh[idx_fh_F_ord2(iF, jF + 1, kF - 2, ex)]
                             -    fh[idx_fh_F_ord2(iF, jF + 2, kF - 2, ex)] );
                        const double t_km1 =
                            ( fh[idx_fh_F_ord2(iF, jF - 2, kF - 1, ex)]
                             -F8*fh[idx_fh_F_ord2(iF, jF - 1, kF - 1, ex)]
                             +F8*fh[idx_fh_F_ord2(iF, jF + 1, kF - 1, ex)]
                             -    fh[idx_fh_F_ord2(iF, jF + 2, kF - 1, ex)] );
                        const double t_kp1 =
                            ( fh[idx_fh_F_ord2(iF, jF - 2, kF + 1, ex)]
                             -F8*fh[idx_fh_F_ord2(iF, jF - 1, kF + 1, ex)]
                             +F8*fh[idx_fh_F_ord2(iF, jF + 1, kF + 1, ex)]
                             -    fh[idx_fh_F_ord2(iF, jF + 2, kF + 1, ex)] );
                        const double t_kp2 =
                            ( fh[idx_fh_F_ord2(iF, jF - 2, kF + 2, ex)]
                             -F8*fh[idx_fh_F_ord2(iF, jF - 1, kF + 2, ex)]
                             +F8*fh[idx_fh_F_ord2(iF, jF + 1, kF + 2, ex)]
                             -    fh[idx_fh_F_ord2(iF, jF + 2, kF + 2, ex)] );
                        fyz[p] = Fdydz * ( t_km2 - F8 * t_km1 + F8 * t_kp1 - t_kp2 );
                    }
                }
                /* 2nd-order branch: i+-1, j+-1, k+-1 inside the range */
                else if ((iF + 1) <= imaxF && (iF - 1) >= iminF &&
                         (jF + 1) <= jmaxF && (jF - 1) >= jminF &&
                         (kF + 1) <= kmaxF && (kF - 1) >= kminF)
                {
                    fxx[p] = Sdxdx * (
                        fh[idx_fh_F_ord2(iF - 1, jF,     kF,     ex)] -
                        TWO * fh[idx_fh_F_ord2(iF,     jF,     kF,     ex)] +
                        fh[idx_fh_F_ord2(iF + 1, jF,     kF,     ex)]
                    );
                    fyy[p] = Sdydy * (
                        fh[idx_fh_F_ord2(iF,     jF - 1, kF,     ex)] -
                        TWO * fh[idx_fh_F_ord2(iF,     jF,     kF,     ex)] +
                        fh[idx_fh_F_ord2(iF,     jF + 1, kF,     ex)]
                    );
                    fzz[p] = Sdzdz * (
                        fh[idx_fh_F_ord2(iF,     jF,     kF - 1, ex)] -
                        TWO * fh[idx_fh_F_ord2(iF,     jF,     kF,     ex)] +
                        fh[idx_fh_F_ord2(iF,     jF,     kF + 1, ex)]
                    );
                    fxy[p] = Sdxdy * (
                        fh[idx_fh_F_ord2(iF - 1, jF - 1, kF, ex)] -
                        fh[idx_fh_F_ord2(iF + 1, jF - 1, kF, ex)] -
                        fh[idx_fh_F_ord2(iF - 1, jF + 1, kF, ex)] +
                        fh[idx_fh_F_ord2(iF + 1, jF + 1, kF, ex)]
                    );
                    fxz[p] = Sdxdz * (
                        fh[idx_fh_F_ord2(iF - 1, jF, kF - 1, ex)] -
                        fh[idx_fh_F_ord2(iF + 1, jF, kF - 1, ex)] -
                        fh[idx_fh_F_ord2(iF - 1, jF, kF + 1, ex)] +
                        fh[idx_fh_F_ord2(iF + 1, jF, kF + 1, ex)]
                    );
                    fyz[p] = Sdydz * (
                        fh[idx_fh_F_ord2(iF, jF - 1, kF - 1, ex)] -
                        fh[idx_fh_F_ord2(iF, jF + 1, kF - 1, ex)] -
                        fh[idx_fh_F_ord2(iF, jF - 1, kF + 1, ex)] +
                        fh[idx_fh_F_ord2(iF, jF + 1, kF + 1, ex)]
                    );
                }
                /* No usable stencil (includes every upper-boundary point). */
                else {
                    fxx[p] = 0.0;
                    fyy[p] = 0.0;
                    fzz[p] = 0.0;
                    fxy[p] = 0.0;
                    fxz[p] = 0.0;
                    fyz[p] = 0.0;
                }
            }
        }
    }
 }
--- a/AMSS_NCKU_source/extention/src/main.c
+++ b/AMSS_NCKU_source/extention/src/main.c
@@ -0,0 +1,7 @@
 #include "include/bssn_rhs_compute.h"
 /* Placeholder entry point: performs no work and exits with status 0. */
 int main() {
    // Test code can go here: call f_compute_rhs_bssn to verify its correctness.
    // For example, define a small grid and initial conditions, call the function, and check that the output is reasonable.
    return 0;
 }
--- a/AMSS_NCKU_source/extention/src/new.c
+++ b/AMSS_NCKU_source/extention/src/new.c
@@ -0,0 +1,65 @@
        /*
         * Ghost-extended copy of Lap into fh followed by the second-derivative
         * kernel, written with OpenMP worksharing loops.
         * NOTE(review): presumably runs inside an enclosing `#pragma omp parallel`
         * region that is not part of this fragment -- confirm.
         *
         * Every collapse(3) nest below is perfectly nested: values derived from
         * an outer index are computed in the innermost body, because statements
         * placed between the collapsed `for` levels are not permitted by
         * OpenMP versions before 5.1.
         */
        SoA[0] = SYM, SoA[1] = SYM, SoA[2] = SYM;
        /* 1) copy the interior: funcc(1:ex1, 1:ex2, 1:ex3) = Lap */
        #pragma omp for collapse(3)
        for (int k0 = 0; k0 < ex[2]; ++k0) {
            for (int j0 = 0; j0 < ex[1]; ++j0) {
                for (int i0 = 0; i0 < ex[0]; ++i0) {
                    const int iF = i0 + 1, jF = j0 + 1, kF = k0 + 1;
                    fh[idx_funcc_F(iF, jF, kF, 2, ex)] = Lap[idx_func0(i0, j0, k0, ex)];
                }
            }
        }
        /* 2) do i=0..ord-1: funcc(-i, 1:extc2, 1:extc3) = funcc(i+1, ...)*SoA(1) */
        #pragma omp for collapse(3)
        for (int ii = 0; ii <= 2 - 1; ++ii) {
            for (int kF = 1; kF <= ex[2]; ++kF) {
                for (int jF = 1; jF <= ex[1]; ++jF) {
                    const int iF_dst = -ii;       /* 0, -1 */
                    const int iF_src = ii + 1;    /* 1,  2 */
                    fh[idx_funcc_F(iF_dst, jF, kF, 2, ex)] =
                        fh[idx_funcc_F(iF_src, jF, kF, 2, ex)] * SoA[0];
                }
            }
        }
        /* 3) do i=0..ord-1: funcc(:,-i, 1:extc3) = funcc(:, i+1, 1:extc3)*SoA(2)
         *    The Fortran ":" spans the full first index range (-ord+1..extc1). */
        #pragma omp for collapse(3)
        for (int jj = 0; jj <= 2 - 1; ++jj) {
            for (int kF = 1; kF <= ex[2]; ++kF) {
                for (int iF = -2 + 1; iF <= ex[0]; ++iF) {
                    const int jF_dst = -jj;
                    const int jF_src = jj + 1;
                    fh[idx_funcc_F(iF, jF_dst, kF, 2, ex)] =
                        fh[idx_funcc_F(iF, jF_src, kF, 2, ex)] * SoA[1];
                }
            }
        }
        /* 4) do i=0..ord-1: funcc(:,:,-i) = funcc(:,:, i+1)*SoA(3) */
        #pragma omp for collapse(3)
        for (int kk = 0; kk <= 2 - 1; ++kk) {
            for (int jF = -2 + 1; jF <= ex[1]; ++jF) {
                for (int iF = -2 + 1; iF <= ex[0]; ++iF) {
                    const int kF_dst = -kk;
                    const int kF_src = kk + 1;
                    fh[idx_funcc_F(iF, jF, kF_dst, 2, ex)] =
                        fh[idx_funcc_F(iF, jF, kF_src, 2, ex)] * SoA[2];
                }
            }
        }
        /* 5) evaluate the second derivatives point by point (0-based indices
         *    are passed straight to the kernel). */
        #pragma omp for collapse(3)
        for (int k0 = 0; k0 <= ex3 - 2; ++k0) {
            for (int j0 = 0; j0 <= ex2 - 2; ++j0) {
                for (int i0 = 0; i0 <= ex1 - 2; ++i0) {
                    fdderivs_xh(i0, j0, k0, ex, fh, iminF, jminF, kminF, ex1, ex2, ex3,
                        Fdxdx, Fdydy, Fdzdz, Fdxdy, Fdxdz, Fdydz,
                        Sdxdx, Sdydy, Sdzdz, Sdxdy, Sdxdz, Sdydz,
                            fxx,fxy,fxz,fyy,fyz,fzz
                    );
                }
            }
        }
--- a/AMSS_NCKU_source/extention/src/xh_bssn_rhs.c
+++ b/AMSS_NCKU_source/extention/src/xh_bssn_rhs.c
--- a/AMSS_NCKU_source/extention/src/xh_fdderivs.c
+++ b/AMSS_NCKU_source/extention/src/xh_fdderivs.c
@@ -0,0 +1,311 @@
 #include "xh_tool.h"
 void fdderivs(const int ex[3],
              const double *f,
              double *fxx, double *fxy, double *fxz,
              double *fyy, double *fyz, double *fzz,
              const double *X, const double *Y, const double *Z,
              double SYM1, double SYM2, double SYM3,
              int Symmetry, int onoff)
 {
    (void)onoff;
    const int NO_SYMM = 0, EQ_SYMM = 1;
    const double ZEO = 0.0, ONE = 1.0, TWO = 2.0;
    const double F1o4   = 2.5e-1;          // 1/4
    const double F8     = 8.0;
    const double F16    = 16.0;
    const double F30    = 30.0;
    const double F1o12  = ONE / 12.0;
    const double F1o144 = ONE / 144.0;
    const int ex1 = ex[0], ex2 = ex[1], ex3 = ex[2];
    const double dX = X[1] - X[0];
    const double dY = Y[1] - Y[0];
    const double dZ = Z[1] - Z[0];
    const int imaxF = ex1;
    const int jmaxF = ex2;
    const int kmaxF = ex3;
    int iminF = 1, jminF = 1, kminF = 1;
    if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -1;
    if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -1;
    if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -1;
    /* fh: (ex1+2)*(ex2+2)*(ex3+2) because ord=2 */
    const size_t nx = (size_t)ex1 + 2;
    const size_t ny = (size_t)ex2 + 2;
    const size_t nz = (size_t)ex3 + 2;
    const size_t fh_size = nx * ny * nz;
    /* 系数：按 Fortran 原式 */
    const double Sdxdx = ONE / (dX * dX);
    const double Sdydy = ONE / (dY * dY);
    const double Sdzdz = ONE / (dZ * dZ);
    const double Fdxdx = F1o12 / (dX * dX);
    const double Fdydy = F1o12 / (dY * dY);
    const double Fdzdz = F1o12 / (dZ * dZ);
    const double Sdxdy = F1o4 / (dX * dY);
    const double Sdxdz = F1o4 / (dX * dZ);
    const double Sdydz = F1o4 / (dY * dZ);
    const double Fdxdy = F1o144 / (dX * dY);
    const double Fdxdz = F1o144 / (dX * dZ);
    const double Fdydz = F1o144 / (dY * dZ);
    static thread_local double *fh = NULL;
    static thread_local size_t cap = 0;
    if (fh_size > cap) {
        free(fh);
        fh = (double*)aligned_alloc(64, fh_size * sizeof(double));
        cap = fh_size;
    }
    // double *fh = (double*)malloc(fh_size * sizeof(double));
    if (!fh) return;
    // symmetry_bd(2, ex, f, fh, SoA);
    const double SoA[3] = { SYM1, SYM2, SYM3 };
    for (int k0 = 0; k0 < ex[2]; ++k0) {
        for (int j0 = 0; j0 < ex[1]; ++j0) {
            for (int i0 = 0; i0 < ex[0]; ++i0) {
                const int iF = i0 + 1, jF = j0 + 1, kF = k0 + 1; 
                fh[idx_funcc_F(iF, jF, kF, 2, ex)] = f[idx_func0(i0, j0, k0, ex)];
            }
        }
    }
    // 2) do i=0..ord-1: funcc(-i, 1:extc2, 1:extc3) = funcc(i+1, ...)*SoA(1)
    for (int ii = 0; ii <= 2 - 1; ++ii) {
        const int iF_dst = -ii;       // 0, -1, -2, ...
        const int iF_src = ii + 1;    // 1, 2, 3, ...
        for (int kF = 1; kF <= ex[2]; ++kF) {
            for (int jF = 1; jF <= ex[1]; ++jF) {
                fh[idx_funcc_F(iF_dst, jF, kF, 2, ex)] = 
                    fh[idx_funcc_F(iF_src, jF, kF, 2, ex)] * SoA[0];
            }
        }
    }
    // 3) do i=0..ord-1: funcc(:,-i, 1:extc3) = funcc(:, i+1, 1:extc3)*SoA(2)
    // 注意 Fortran 这里的 ":" 表示 iF 从 (-ord+1..extc1) 全覆盖
    for (int jj = 0; jj <= 2 - 1; ++jj) {
        const int jF_dst = -jj;
        const int jF_src = jj + 1;
        for (int kF = 1; kF <= ex[2]; ++kF) {
            for (int iF = -2 + 1; iF <= ex[0]; ++iF) {
                fh[idx_funcc_F(iF, jF_dst, kF, 2, ex)] =
                    fh[idx_funcc_F(iF, jF_src, kF, 2, ex)] * SoA[1];
            }
        }
    }
    // 4) do i=0..ord-1: funcc(:,:,-i) = funcc(:,:, i+1)*SoA(3)
    for (int kk = 0; kk <= 2 - 1; ++kk) {
        const int kF_dst = -kk;
        const int kF_src = kk + 1;
        for (int jF = -2 + 1; jF <= ex[1]; ++jF) {
            for (int iF = -2 + 1; iF <= ex[0]; ++iF) {
                fh[idx_funcc_F(iF, jF, kF_dst, 2, ex)] =
                    fh[idx_funcc_F(iF, jF, kF_src, 2, ex)] * SoA[2];
            }
        }
    }
    /* 输出清零：fxx,fyy,fzz,fxy,fxz,fyz = 0 */
    // const size_t all = (size_t)ex1 * (size_t)ex2 * (size_t)ex3;
    // for (size_t p = 0; p < all; ++p) {
    //     fxx[p] = ZEO; fyy[p] = ZEO; fzz[p] = ZEO;
    //     fxy[p] = ZEO; fxz[p] = ZEO; fyz[p] = ZEO;
    // }
    /*
     * Fortran:
     * do k=1,ex3-1
     * do j=1,ex2-1
     * do i=1,ex1-1
     */
    for (int k0 = 0; k0 <= ex3 - 2; ++k0) {
        const int kF = k0 + 1;
        for (int j0 = 0; j0 <= ex2 - 2; ++j0) {
            const int jF = j0 + 1;
            for (int i0 = 0; i0 <= ex1 - 2; ++i0) {
                const int iF = i0 + 1;
                const size_t p = idx_ex(i0, j0, k0, ex);
                /* 高阶分支：i±2,j±2,k±2 都在范围内 */
                if ((iF + 2) <= imaxF && (iF - 2) >= iminF &&
                    (jF + 2) <= jmaxF && (jF - 2) >= jminF &&
                    (kF + 2) <= kmaxF && (kF - 2) >= kminF)
                {
                    fxx[p] = Fdxdx * (
                        -fh[idx_fh_F_ord2(iF - 2, jF,     kF,     ex)] +
                        F16 * fh[idx_fh_F_ord2(iF - 1, jF,     kF,     ex)] -
                        F30 * fh[idx_fh_F_ord2(iF,     jF,     kF,     ex)] -
                        fh[idx_fh_F_ord2(iF + 2, jF,     kF,     ex)] +
                        F16 * fh[idx_fh_F_ord2(iF + 1, jF,     kF,     ex)]
                    );
                    fyy[p] = Fdydy * (
                        -fh[idx_fh_F_ord2(iF,     jF - 2, kF,     ex)] +
                        F16 * fh[idx_fh_F_ord2(iF,     jF - 1, kF,     ex)] -
                        F30 * fh[idx_fh_F_ord2(iF,     jF,     kF,     ex)] -
                        fh[idx_fh_F_ord2(iF,     jF + 2, kF,     ex)] +
                        F16 * fh[idx_fh_F_ord2(iF,     jF + 1, kF,     ex)]
                    );
                    fzz[p] = Fdzdz * (
                        -fh[idx_fh_F_ord2(iF,     jF,     kF - 2, ex)] +
                        F16 * fh[idx_fh_F_ord2(iF,     jF,     kF - 1, ex)] -
                        F30 * fh[idx_fh_F_ord2(iF,     jF,     kF,     ex)] -
                        fh[idx_fh_F_ord2(iF,     jF,     kF + 2, ex)] +
                        F16 * fh[idx_fh_F_ord2(iF,     jF,     kF + 1, ex)]
                    );
                    /* fxy 高阶：完全照搬 Fortran 的括号结构 */
                    {
                        const double t_jm2 =
                            ( fh[idx_fh_F_ord2(iF - 2, jF - 2, kF, ex)]
                             -F8*fh[idx_fh_F_ord2(iF - 1, jF - 2, kF, ex)]
                             +F8*fh[idx_fh_F_ord2(iF + 1, jF - 2, kF, ex)]
                             -    fh[idx_fh_F_ord2(iF + 2, jF - 2, kF, ex)] );
                        const double t_jm1 =
                            ( fh[idx_fh_F_ord2(iF - 2, jF - 1, kF, ex)]
                             -F8*fh[idx_fh_F_ord2(iF - 1, jF - 1, kF, ex)]
                             +F8*fh[idx_fh_F_ord2(iF + 1, jF - 1, kF, ex)]
                             -    fh[idx_fh_F_ord2(iF + 2, jF - 1, kF, ex)] );
                        const double t_jp1 =
                            ( fh[idx_fh_F_ord2(iF - 2, jF + 1, kF, ex)]
                             -F8*fh[idx_fh_F_ord2(iF - 1, jF + 1, kF, ex)]
                             +F8*fh[idx_fh_F_ord2(iF + 1, jF + 1, kF, ex)]
                             -    fh[idx_fh_F_ord2(iF + 2, jF + 1, kF, ex)] );
                        const double t_jp2 =
                            ( fh[idx_fh_F_ord2(iF - 2, jF + 2, kF, ex)]
                             -F8*fh[idx_fh_F_ord2(iF - 1, jF + 2, kF, ex)]
                             +F8*fh[idx_fh_F_ord2(iF + 1, jF + 2, kF, ex)]
                             -    fh[idx_fh_F_ord2(iF + 2, jF + 2, kF, ex)] );
                        fxy[p] = Fdxdy * ( t_jm2 - F8 * t_jm1 + F8 * t_jp1 - t_jp2 );
                    }
                    /* fxz 高阶 */
                    {
                        const double t_km2 =
                            ( fh[idx_fh_F_ord2(iF - 2, jF, kF - 2, ex)]
                             -F8*fh[idx_fh_F_ord2(iF - 1, jF, kF - 2, ex)]
                             +F8*fh[idx_fh_F_ord2(iF + 1, jF, kF - 2, ex)]
                             -    fh[idx_fh_F_ord2(iF + 2, jF, kF - 2, ex)] );
                        const double t_km1 =
                            ( fh[idx_fh_F_ord2(iF - 2, jF, kF - 1, ex)]
                             -F8*fh[idx_fh_F_ord2(iF - 1, jF, kF - 1, ex)]
                             +F8*fh[idx_fh_F_ord2(iF + 1, jF, kF - 1, ex)]
                             -    fh[idx_fh_F_ord2(iF + 2, jF, kF - 1, ex)] );
                        const double t_kp1 =
                            ( fh[idx_fh_F_ord2(iF - 2, jF, kF + 1, ex)]
                             -F8*fh[idx_fh_F_ord2(iF - 1, jF, kF + 1, ex)]
                             +F8*fh[idx_fh_F_ord2(iF + 1, jF, kF + 1, ex)]
                             -    fh[idx_fh_F_ord2(iF + 2, jF, kF + 1, ex)] );
                        const double t_kp2 =
                            ( fh[idx_fh_F_ord2(iF - 2, jF, kF + 2, ex)]
                             -F8*fh[idx_fh_F_ord2(iF - 1, jF, kF + 2, ex)]
                             +F8*fh[idx_fh_F_ord2(iF + 1, jF, kF + 2, ex)]
                             -    fh[idx_fh_F_ord2(iF + 2, jF, kF + 2, ex)] );
                        fxz[p] = Fdxdz * ( t_km2 - F8 * t_km1 + F8 * t_kp1 - t_kp2 );
                    }
                    /* fyz 高阶 */
                    {
                        const double t_km2 =
                            ( fh[idx_fh_F_ord2(iF, jF - 2, kF - 2, ex)]
                             -F8*fh[idx_fh_F_ord2(iF, jF - 1, kF - 2, ex)]
                             +F8*fh[idx_fh_F_ord2(iF, jF + 1, kF - 2, ex)]
                             -    fh[idx_fh_F_ord2(iF, jF + 2, kF - 2, ex)] );
                        const double t_km1 =
                            ( fh[idx_fh_F_ord2(iF, jF - 2, kF - 1, ex)]
                             -F8*fh[idx_fh_F_ord2(iF, jF - 1, kF - 1, ex)]
                             +F8*fh[idx_fh_F_ord2(iF, jF + 1, kF - 1, ex)]
                             -    fh[idx_fh_F_ord2(iF, jF + 2, kF - 1, ex)] );
                        const double t_kp1 =
                            ( fh[idx_fh_F_ord2(iF, jF - 2, kF + 1, ex)]
                             -F8*fh[idx_fh_F_ord2(iF, jF - 1, kF + 1, ex)]
                             +F8*fh[idx_fh_F_ord2(iF, jF + 1, kF + 1, ex)]
                             -    fh[idx_fh_F_ord2(iF, jF + 2, kF + 1, ex)] );
                        const double t_kp2 =
                            ( fh[idx_fh_F_ord2(iF, jF - 2, kF + 2, ex)]
                             -F8*fh[idx_fh_F_ord2(iF, jF - 1, kF + 2, ex)]
                             +F8*fh[idx_fh_F_ord2(iF, jF + 1, kF + 2, ex)]
                             -    fh[idx_fh_F_ord2(iF, jF + 2, kF + 2, ex)] );
                        fyz[p] = Fdydz * ( t_km2 - F8 * t_km1 + F8 * t_kp1 - t_kp2 );
                    }
                }
                /* 二阶分支：i±1,j±1,k±1 在范围内 */
                else if ((iF + 1) <= imaxF && (iF - 1) >= iminF &&
                         (jF + 1) <= jmaxF && (jF - 1) >= jminF &&
                         (kF + 1) <= kmaxF && (kF - 1) >= kminF)
                {
                    fxx[p] = Sdxdx * (
                        fh[idx_fh_F_ord2(iF - 1, jF,     kF,     ex)] -
                        TWO * fh[idx_fh_F_ord2(iF,     jF,     kF,     ex)] +
                        fh[idx_fh_F_ord2(iF + 1, jF,     kF,     ex)]
                    );
                    fyy[p] = Sdydy * (
                        fh[idx_fh_F_ord2(iF,     jF - 1, kF,     ex)] -
                        TWO * fh[idx_fh_F_ord2(iF,     jF,     kF,     ex)] +
                        fh[idx_fh_F_ord2(iF,     jF + 1, kF,     ex)]
                    );
                    fzz[p] = Sdzdz * (
                        fh[idx_fh_F_ord2(iF,     jF,     kF - 1, ex)] -
                        TWO * fh[idx_fh_F_ord2(iF,     jF,     kF,     ex)] +
                        fh[idx_fh_F_ord2(iF,     jF,     kF + 1, ex)]
                    );
                    fxy[p] = Sdxdy * (
                        fh[idx_fh_F_ord2(iF - 1, jF - 1, kF, ex)] -
                        fh[idx_fh_F_ord2(iF + 1, jF - 1, kF, ex)] -
                        fh[idx_fh_F_ord2(iF - 1, jF + 1, kF, ex)] +
                        fh[idx_fh_F_ord2(iF + 1, jF + 1, kF, ex)]
                    );
                    fxz[p] = Sdxdz * (
                        fh[idx_fh_F_ord2(iF - 1, jF, kF - 1, ex)] -
                        fh[idx_fh_F_ord2(iF + 1, jF, kF - 1, ex)] -
                        fh[idx_fh_F_ord2(iF - 1, jF, kF + 1, ex)] +
                        fh[idx_fh_F_ord2(iF + 1, jF, kF + 1, ex)]
                    );
                    fyz[p] = Sdydz * (
                        fh[idx_fh_F_ord2(iF, jF - 1, kF - 1, ex)] -
                        fh[idx_fh_F_ord2(iF, jF + 1, kF - 1, ex)] -
                        fh[idx_fh_F_ord2(iF, jF - 1, kF + 1, ex)] +
                        fh[idx_fh_F_ord2(iF, jF + 1, kF + 1, ex)]
                    );
                }else{
                    fxx[p] = 0.0;
                    fyy[p] = 0.0;
                    fzz[p] = 0.0;
                    fxy[p] = 0.0;
                    fxz[p] = 0.0;
                    fyz[p] = 0.0;
                }
            }
        }
    }
    // free(fh);
 }
--- a/AMSS_NCKU_source/extention/src/xh_fderivs.c
+++ b/AMSS_NCKU_source/extention/src/xh_fderivs.c
@@ -0,0 +1,145 @@
 #include "xh_tool.h"
 /*
 * C port of the Fortran routine
 *   subroutine fderivs(ex,f,fx,fy,fz,X,Y,Z,SYM1,SYM2,SYM3,symmetry,onoff)
 *
 * Writes centered finite-difference first derivatives of f into fx, fy, fz.
 *
 * Conventions:
 *   f, fx, fy, fz : ex[0]*ex[1]*ex[2] values each, addressed through idx_ex
 *   X, Y, Z       : coordinate arrays of length ex[0], ex[1], ex[2]; the grid
 *                   spacing is taken from the first two entries of each array
 *   SYM1..SYM3    : per-axis symmetry factors forwarded to symmetry_bd
 *   Symmetry      : > 0 lets the z stencil reach below index 1 when
 *                   |Z[0]| < dZ; > 1 does the same for x and y
 *   onoff         : unused
 *
 * All outputs are first set to zero. A point then receives the five-point
 * (1,-8,8,-1)/12h stencil when +-2 neighbours are available in all three
 * directions, otherwise the three-point (-1,1)/2h stencil when +-1
 * neighbours are available, otherwise it stays zero. The last index along
 * each axis is never visited by the loops and therefore stays zero as well.
 *
 * If the scratch buffer cannot be allocated the function returns with the
 * outputs still zero.
 */
 void fderivs(const int ex[3],
             const double *f,
             double *fx, double *fy, double *fz,
             const double *X, const double *Y, const double *Z,
             double SYM1, double SYM2, double SYM3,
             int Symmetry, int onoff)
 {
    (void)onoff; // not used by the Fortran original either
    const double ZEO = 0.0, ONE = 1.0;
    const double TWO = 2.0, EIT = 8.0;
    const double F12 = 12.0;
    const int NO_SYMM = 0, EQ_SYMM = 1; // OCTANT = 2 is never named explicitly here
    const int ex1 = ex[0], ex2 = ex[1], ex3 = ex[2];
    // fx = fy = fz = 0; done before any early return so the outputs are
    // always defined, even when the scratch allocation below fails
    const size_t all = (size_t)ex1 * (size_t)ex2 * (size_t)ex3;
    for (size_t p = 0; p < all; ++p) {
        fx[p] = ZEO;
        fy[p] = ZEO;
        fz[p] = ZEO;
    }
    // Fortran: dX = X(2)-X(1)  ->  C: X[1]-X[0]
    const double dX = X[1] - X[0];
    const double dY = Y[1] - Y[0];
    const double dZ = Z[1] - Z[0];
    // Lowest Fortran index a stencil may touch; -1 means both ghost layers
    // of fh are usable on that side
    int iminF = 1, jminF = 1, kminF = 1;
    if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -1;
    if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -1;
    if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -1;
    // SoA(1:3) = SYM1,SYM2,SYM3
    const double SoA[3] = { SYM1, SYM2, SYM3 };
    // fh holds (ex1+2)*(ex2+2)*(ex3+2) values because ord = 2
    const size_t nx = (size_t)ex1 + 2;
    const size_t ny = (size_t)ex2 + 2;
    const size_t nz = (size_t)ex3 + 2;
    const size_t fh_size = nx * ny * nz;
    // Per-thread scratch buffer, grown on demand and reused across calls
    static thread_local double *fh = NULL;
    static thread_local size_t cap = 0;
    if (fh_size > cap) {
        const size_t FH_ALIGN = 64;
        // aligned_alloc wants the size to be a multiple of the alignment
        const size_t bytes =
            (fh_size * sizeof(double) + FH_ALIGN - 1) / FH_ALIGN * FH_ALIGN;
        free(fh);
        fh = (double*)aligned_alloc(FH_ALIGN, bytes);
        // Record the capacity only on success; otherwise a later call would
        // skip the retry and keep returning early with fh == NULL
        cap = fh ? fh_size : 0;
    }
    if (!fh) return; // out of memory: outputs stay zero
    // Fortran: call symmetry_bd(2,ex,f,fh,SoA)
    symmetry_bd(2, ex, f, fh, SoA);
    const double d12dx = ONE / F12 / dX;
    const double d12dy = ONE / F12 / dY;
    const double d12dz = ONE / F12 / dZ;
    const double d2dx  = ONE / TWO / dX;
    const double d2dy  = ONE / TWO / dY;
    const double d2dz  = ONE / TWO / dZ;
    /*
     * Fortran loops:
     * do k=1,ex3-1
     * do j=1,ex2-1
     * do i=1,ex1-1
     *
     * C: k0=0..ex3-2, j0=0..ex2-2, i0=0..ex1-2
     * iF/jF/kF are the matching 1-based Fortran indices used to address fh.
     */
    for (int k0 = 0; k0 <= ex3 - 2; ++k0) {
        const int kF = k0 + 1;
        for (int j0 = 0; j0 <= ex2 - 2; ++j0) {
            const int jF = j0 + 1;
            for (int i0 = 0; i0 <= ex1 - 2; ++i0) {
                const int iF = i0 + 1;
                const size_t p = idx_ex(i0, j0, k0, ex);
                // Fortran: if(i+2 <= imax .and. i-2 >= imin ...)
                if ((iF + 2) <= ex1 && (iF - 2) >= iminF &&
                    (jF + 2) <= ex2 && (jF - 2) >= jminF &&
                    (kF + 2) <= ex3 && (kF - 2) >= kminF)
                {
                    fx[p] = d12dx * (
                        fh[idx_fh_F_ord2(iF - 2, jF,     kF,     ex)] -
                        EIT * fh[idx_fh_F_ord2(iF - 1, jF,     kF,     ex)] +
                        EIT * fh[idx_fh_F_ord2(iF + 1, jF,     kF,     ex)] -
                        fh[idx_fh_F_ord2(iF + 2, jF,     kF,     ex)]
                    );
                    fy[p] = d12dy * (
                        fh[idx_fh_F_ord2(iF,     jF - 2, kF,     ex)] -
                        EIT * fh[idx_fh_F_ord2(iF,     jF - 1, kF,     ex)] +
                        EIT * fh[idx_fh_F_ord2(iF,     jF + 1, kF,     ex)] -
                        fh[idx_fh_F_ord2(iF,     jF + 2, kF,     ex)]
                    );
                    fz[p] = d12dz * (
                        fh[idx_fh_F_ord2(iF,     jF,     kF - 2, ex)] -
                        EIT * fh[idx_fh_F_ord2(iF,     jF,     kF - 1, ex)] +
                        EIT * fh[idx_fh_F_ord2(iF,     jF,     kF + 1, ex)] -
                        fh[idx_fh_F_ord2(iF,     jF,     kF + 2, ex)]
                    );
                }
                // Fortran: elseif(i+1 <= imax .and. i-1 >= imin ...)
                else if ((iF + 1) <= ex1 && (iF - 1) >= iminF &&
                         (jF + 1) <= ex2 && (jF - 1) >= jminF &&
                         (kF + 1) <= ex3 && (kF - 1) >= kminF)
                {
                    fx[p] = d2dx * (
                        -fh[idx_fh_F_ord2(iF - 1, jF,     kF,     ex)] +
                         fh[idx_fh_F_ord2(iF + 1, jF,     kF,     ex)]
                    );
                    fy[p] = d2dy * (
                        -fh[idx_fh_F_ord2(iF,     jF - 1, kF,     ex)] +
                         fh[idx_fh_F_ord2(iF,     jF + 1, kF,     ex)]
                    );
                    fz[p] = d2dz * (
                        -fh[idx_fh_F_ord2(iF,     jF,     kF - 1, ex)] +
                         fh[idx_fh_F_ord2(iF,     jF,     kF + 1, ex)]
                    );
                }
            }
        }
    }
    // fh is deliberately not freed: it is kept for the next call on this thread
 }
--- a/AMSS_NCKU_source/extention/src/xh_kodiss.c
+++ b/AMSS_NCKU_source/extention/src/xh_kodiss.c
@@ -0,0 +1,116 @@
 #include "xh_tool.h"
 /*
 * C port of the Fortran routine
 *   subroutine kodis(ex,X,Y,Z,f,f_rhs,SoA,Symmetry,eps)
 *
 * Adds a dissipation term built from f to f_rhs.
 *
 * Conventions:
 *   X, Y, Z  : coordinate arrays of length ex[0], ex[1], ex[2]; the grid
 *              spacing is taken from the first two entries of each array
 *   f, f_rhs : ex[0]*ex[1]*ex[2] values each, addressed through idx_ex
 *   SoA[3]   : per-axis symmetry factors forwarded to symmetry_bd
 *   Symmetry : > 0 lets the z stencil reach down to index -2 when
 *              |Z[0]| < dZ; == 2 does the same for x and y
 *   eps      : dissipation strength; the added term is scaled by eps/64
 *
 * Only points that have three neighbours available on both sides in all
 * three directions are updated; every other entry of f_rhs is left as is.
 * If the scratch buffer cannot be allocated the function returns without
 * touching f_rhs.
 */
 void kodis(const int ex[3],
           const double *X, const double *Y, const double *Z,
           const double *f, double *f_rhs,
           const double SoA[3],
           int Symmetry, double eps)
 {
    const double SIX = 6.0, FIT = 15.0, TWT = 20.0;
    const double cof = 64.0;             // 2^6
    const int NO_SYMM = 0, OCTANT = 2;
    const int ex1 = ex[0], ex2 = ex[1], ex3 = ex[2];
    // Fortran: dX = X(2)-X(1)  ->  C: X[1]-X[0]
    const double dX = X[1] - X[0];
    const double dY = Y[1] - Y[0];
    const double dZ = Z[1] - Z[0];
    // Fortran: imax=ex(1) etc. are 1-based upper bounds
    const int imaxF = ex1;
    const int jmaxF = ex2;
    const int kmaxF = ex3;
    // Fortran: imin=jmin=kmin=1, lowered to -2 in the symmetric cases below
    int iminF = 1, jminF = 1, kminF = 1;
    if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -2;
    if (Symmetry == OCTANT && fabs(X[0]) < dX) iminF = -2;
    if (Symmetry == OCTANT && fabs(Y[0]) < dY) jminF = -2;
    // fh holds (ex1+3)*(ex2+3)*(ex3+3) values, matching ord = 3
    const size_t nx = (size_t)ex1 + 3;
    const size_t ny = (size_t)ex2 + 3;
    const size_t nz = (size_t)ex3 + 3;
    const size_t fh_size = nx * ny * nz;
    // Per-thread scratch buffer, grown on demand and reused across calls
    static thread_local double *fh = NULL;
    static thread_local size_t cap = 0;
    if (fh_size > cap) {
        const size_t FH_ALIGN = 64;
        // aligned_alloc wants the size to be a multiple of the alignment
        const size_t bytes =
            (fh_size * sizeof(double) + FH_ALIGN - 1) / FH_ALIGN * FH_ALIGN;
        free(fh);
        fh = (double*)aligned_alloc(FH_ALIGN, bytes);
        // Record the capacity only on success; otherwise a later call would
        // skip the retry and keep returning early with fh == NULL
        cap = fh ? fh_size : 0;
    }
    if (!fh) return; // out of memory: f_rhs is left unchanged
    // Fortran: call symmetry_bd(3,ex,f,fh,SoA)
    symmetry_bd(3, ex, f, fh, SoA);
    /*
     * Fortran loops:
     * do k=1,ex3
     * do j=1,ex2
     * do i=1,ex1
     *
     * C: k0=0..ex3-1, j0=0..ex2-1, i0=0..ex1-1
     * with the matching Fortran indices iF=i0+1, jF=j0+1, kF=k0+1.
     */
    for (int k0 = 0; k0 < ex3; ++k0) {
        const int kF = k0 + 1;
        for (int j0 = 0; j0 < ex2; ++j0) {
            const int jF = j0 + 1;
            for (int i0 = 0; i0 < ex1; ++i0) {
                const int iF = i0 + 1;
                // Fortran condition (all in Fortran indices):
                // i-3 >= imin .and. i+3 <= imax, likewise for j and k
                if ((iF - 3) >= iminF && (iF + 3) <= imaxF &&
                    (jF - 3) >= jminF && (jF + 3) <= jmaxF &&
                    (kF - 3) >= kminF && (kF + 3) <= kmaxF)
                {
                    const size_t p = idx_ex(i0, j0, k0, ex);
                    // The same symmetric 7-point combination
                    // (1,-6,15,-20,15,-6,1) is applied along each direction
                    const double Dx_term =
                        ( (fh[idx_fh_F(iF - 3, jF, kF, ex)] + fh[idx_fh_F(iF + 3, jF, kF, ex)]) -
                          SIX * (fh[idx_fh_F(iF - 2, jF, kF, ex)] + fh[idx_fh_F(iF + 2, jF, kF, ex)]) +
                          FIT * (fh[idx_fh_F(iF - 1, jF, kF, ex)] + fh[idx_fh_F(iF + 1, jF, kF, ex)]) -
                          TWT *  fh[idx_fh_F(iF    , jF, kF, ex)] ) / dX;
                    const double Dy_term =
                        ( (fh[idx_fh_F(iF, jF - 3, kF, ex)] + fh[idx_fh_F(iF, jF + 3, kF, ex)]) -
                          SIX * (fh[idx_fh_F(iF, jF - 2, kF, ex)] + fh[idx_fh_F(iF, jF + 2, kF, ex)]) +
                          FIT * (fh[idx_fh_F(iF, jF - 1, kF, ex)] + fh[idx_fh_F(iF, jF + 1, kF, ex)]) -
                          TWT *  fh[idx_fh_F(iF, jF    , kF, ex)] ) / dY;
                    const double Dz_term =
                        ( (fh[idx_fh_F(iF, jF, kF - 3, ex)] + fh[idx_fh_F(iF, jF, kF + 3, ex)]) -
                          SIX * (fh[idx_fh_F(iF, jF, kF - 2, ex)] + fh[idx_fh_F(iF, jF, kF + 2, ex)]) +
                          FIT * (fh[idx_fh_F(iF, jF, kF - 1, ex)] + fh[idx_fh_F(iF, jF, kF + 1, ex)]) -
                          TWT *  fh[idx_fh_F(iF, jF, kF    , ex)] ) / dZ;
                    // Fortran:
                    // f_rhs(i,j,k) = f_rhs(i,j,k) + eps/cof*(Dx_term + Dy_term + Dz_term)
                    f_rhs[p] += (eps / cof) * (Dx_term + Dy_term + Dz_term);
                }
            }
        }
    }
    // fh is deliberately not freed: it is kept for the next call on this thread
 }
--- a/AMSS_NCKU_source/extention/src/xh_lopsided.c
+++ b/AMSS_NCKU_source/extention/src/xh_lopsided.c
@@ -0,0 +1,262 @@
 #include "xh_tool.h"
 /*
 * C port of the Fortran routine
 *   subroutine lopsided(ex,X,Y,Z,f,f_rhs,Sfx,Sfy,Sfz,Symmetry,SoA)
 *
 * Adds one-sided ("lopsided") advection terms to f_rhs: for each direction
 * the local sign of Sfx/Sfy/Sfz selects a stencil shifted towards that
 * side, falling back to a centered or oppositely shifted stencil close to
 * the grid boundary.
 *
 * A C implementation of symmetry_bd (or a Fortran routine bound to C) must
 * be available; the Fortran original calls symmetry_bd(3,ex,f,fh,SoA).
 *
 * Conventions:
 *   ex[3]         : {ex1,ex2,ex3}
 *   X, Y, Z       : coordinate arrays; the grid spacing is taken from the
 *                   first two entries of each array
 *   f, f_rhs      : original grid, ex1*ex2*ex3 values, addressed via idx_ex
 *   Sfx, Sfy, Sfz : advection coefficients on the same grid as f
 *   fh (internal) : extended grid with three ghost layers on the low side,
 *                   (ex1+3)*(ex2+3)*(ex3+3) values, matching the Fortran
 *                   bounds (-2:ex1, ...)
 *   Symmetry      : > 0 lets the z stencil reach down to index -2 when
 *                   |Z[0]| < dZ; > 1 does the same for x and y
 *   SoA[3]        : per-axis symmetry factors forwarded to symmetry_bd
 *
 * The last index along each axis is never visited. If the scratch buffer
 * cannot be allocated the function returns without touching f_rhs.
 */
 void lopsided(const int ex[3],
              const double *X, const double *Y, const double *Z,
              const double *f, double *f_rhs,
              const double *Sfx, const double *Sfy, const double *Sfz,
              int Symmetry, const double SoA[3])
 {
    const double ZEO = 0.0, ONE = 1.0, F3 = 3.0;
    const double F6 = 6.0, F18 = 18.0;
    const double F12 = 12.0, F10 = 10.0, EIT = 8.0;
    const int NO_SYMM = 0, EQ_SYMM = 1;
    const int ex1 = ex[0], ex2 = ex[1], ex3 = ex[2];
    // Fortran: dX = X(2)-X(1) (1-based)  ->  C: X[1]-X[0]
    const double dX = X[1] - X[0];
    const double dY = Y[1] - Y[0];
    const double dZ = Z[1] - Z[0];
    const double d12dx = ONE / F12 / dX;
    const double d12dy = ONE / F12 / dY;
    const double d12dz = ONE / F12 / dZ;
    // Fortran: imax = ex(1); jmax = ex(2); kmax = ex(3)
    const int imaxF = ex1;
    const int jmaxF = ex2;
    const int kmaxF = ex3;
    // Fortran: imin=jmin=kmin=1, lowered to -2 when the symmetry test holds
    int iminF = 1, jminF = 1, kminF = 1;
    if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -2;
    if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -2;
    if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -2;
    // fh holds (ex1+3)*(ex2+3)*(ex3+3) values
    const size_t nx = (size_t)ex1 + 3;
    const size_t ny = (size_t)ex2 + 3;
    const size_t nz = (size_t)ex3 + 3;
    const size_t fh_size = nx * ny * nz;
    // Per-thread scratch buffer, grown on demand and reused across calls
    static thread_local double *fh = NULL;
    static thread_local size_t cap = 0;
    if (fh_size > cap) {
        const size_t FH_ALIGN = 64;
        // aligned_alloc wants the size to be a multiple of the alignment
        const size_t bytes =
            (fh_size * sizeof(double) + FH_ALIGN - 1) / FH_ALIGN * FH_ALIGN;
        free(fh);
        fh = (double*)aligned_alloc(FH_ALIGN, bytes);
        // Record the capacity only on success; otherwise a later call would
        // skip the retry and keep returning early with fh == NULL
        cap = fh ? fh_size : 0;
    }
    if (!fh) return; // out of memory: f_rhs is left unchanged
    // Fortran: call symmetry_bd(3,ex,f,fh,SoA)
    symmetry_bd(3, ex, f, fh, SoA);
    /*
     * Fortran main loop:
     * do k=1,ex(3)-1
     * do j=1,ex(2)-1
     * do i=1,ex(1)-1
     *
     * In 0-based C terms:
     * k0 = 0..ex3-2, j0 = 0..ex2-2, i0 = 0..ex1-2
     *
     * fh is still addressed with the Fortran index values
     * iF=i0+1, jF=j0+1, kF=k0+1, and the bounds tests below are written
     * with those indices so they read exactly like the Fortran conditions.
     */
    for (int k0 = 0; k0 <= ex3 - 2; ++k0) {
        const int kF = k0 + 1;
        for (int j0 = 0; j0 <= ex2 - 2; ++j0) {
            const int jF = j0 + 1;
            for (int i0 = 0; i0 <= ex1 - 2; ++i0) {
                const int iF = i0 + 1;
                const size_t p = idx_ex(i0, j0, k0, ex);
                // ---------------- x direction ----------------
                const double sfx = Sfx[p];
                if (sfx > ZEO) {
                    // Fortran: if(i+3 <= imax)
                    if ((iF + 3) <= imaxF) {
                        f_rhs[p] += sfx * d12dx *
                            (-F3  * fh[idx_fh_F(iF - 1, jF, kF, ex)]
                             -F10 * fh[idx_fh_F(iF    , jF, kF, ex)]
                             +F18 * fh[idx_fh_F(iF + 1, jF, kF, ex)]
                             -F6  * fh[idx_fh_F(iF + 2, jF, kF, ex)]
                             +      fh[idx_fh_F(iF + 3, jF, kF, ex)]);
                    }
                    // Fortran: elseif(i+2 <= imax)
                    else if ((iF + 2) <= imaxF) {
                        f_rhs[p] += sfx * d12dx *
                            ( fh[idx_fh_F(iF - 2, jF, kF, ex)]
                             -EIT * fh[idx_fh_F(iF - 1, jF, kF, ex)]
                             +EIT * fh[idx_fh_F(iF + 1, jF, kF, ex)]
                             -      fh[idx_fh_F(iF + 2, jF, kF, ex)]);
                    }
                    // Fortran: elseif(i+1 <= imax); always true inside this loop
                    else if ((iF + 1) <= imaxF) {
                        f_rhs[p] -= sfx * d12dx *
                            (-F3  * fh[idx_fh_F(iF + 1, jF, kF, ex)]
                             -F10 * fh[idx_fh_F(iF    , jF, kF, ex)]
                             +F18 * fh[idx_fh_F(iF - 1, jF, kF, ex)]
                             -F6  * fh[idx_fh_F(iF - 2, jF, kF, ex)]
                             +      fh[idx_fh_F(iF - 3, jF, kF, ex)]);
                    }
                } else if (sfx < ZEO) {
                    // Fortran: if(i-3 >= imin)
                    if ((iF - 3) >= iminF) {
                        f_rhs[p] -= sfx * d12dx *
                            (-F3  * fh[idx_fh_F(iF + 1, jF, kF, ex)]
                             -F10 * fh[idx_fh_F(iF    , jF, kF, ex)]
                             +F18 * fh[idx_fh_F(iF - 1, jF, kF, ex)]
                             -F6  * fh[idx_fh_F(iF - 2, jF, kF, ex)]
                             +      fh[idx_fh_F(iF - 3, jF, kF, ex)]);
                    }
                    // Fortran: elseif(i-2 >= imin)
                    else if ((iF - 2) >= iminF) {
                        f_rhs[p] += sfx * d12dx *
                            ( fh[idx_fh_F(iF - 2, jF, kF, ex)]
                             -EIT * fh[idx_fh_F(iF - 1, jF, kF, ex)]
                             +EIT * fh[idx_fh_F(iF + 1, jF, kF, ex)]
                             -      fh[idx_fh_F(iF + 2, jF, kF, ex)]);
                    }
                    // Fortran: elseif(i-1 >= imin)
                    else if ((iF - 1) >= iminF) {
                        f_rhs[p] += sfx * d12dx *
                            (-F3  * fh[idx_fh_F(iF - 1, jF, kF, ex)]
                             -F10 * fh[idx_fh_F(iF    , jF, kF, ex)]
                             +F18 * fh[idx_fh_F(iF + 1, jF, kF, ex)]
                             -F6  * fh[idx_fh_F(iF + 2, jF, kF, ex)]
                             +      fh[idx_fh_F(iF + 3, jF, kF, ex)]);
                    }
                }
                // ---------------- y direction ----------------
                const double sfy = Sfy[p];
                if (sfy > ZEO) {
                    if ((jF + 3) <= jmaxF) {
                        f_rhs[p] += sfy * d12dy *
                            (-F3  * fh[idx_fh_F(iF, jF - 1, kF, ex)]
                             -F10 * fh[idx_fh_F(iF, jF    , kF, ex)]
                             +F18 * fh[idx_fh_F(iF, jF + 1, kF, ex)]
                             -F6  * fh[idx_fh_F(iF, jF + 2, kF, ex)]
                             +      fh[idx_fh_F(iF, jF + 3, kF, ex)]);
                    } else if ((jF + 2) <= jmaxF) {
                        f_rhs[p] += sfy * d12dy *
                            ( fh[idx_fh_F(iF, jF - 2, kF, ex)]
                             -EIT * fh[idx_fh_F(iF, jF - 1, kF, ex)]
                             +EIT * fh[idx_fh_F(iF, jF + 1, kF, ex)]
                             -      fh[idx_fh_F(iF, jF + 2, kF, ex)]);
                    } else if ((jF + 1) <= jmaxF) {
                        f_rhs[p] -= sfy * d12dy *
                            (-F3  * fh[idx_fh_F(iF, jF + 1, kF, ex)]
                             -F10 * fh[idx_fh_F(iF, jF    , kF, ex)]
                             +F18 * fh[idx_fh_F(iF, jF - 1, kF, ex)]
                             -F6  * fh[idx_fh_F(iF, jF - 2, kF, ex)]
                             +      fh[idx_fh_F(iF, jF - 3, kF, ex)]);
                    }
                } else if (sfy < ZEO) {
                    if ((jF - 3) >= jminF) {
                        f_rhs[p] -= sfy * d12dy *
                            (-F3  * fh[idx_fh_F(iF, jF + 1, kF, ex)]
                             -F10 * fh[idx_fh_F(iF, jF    , kF, ex)]
                             +F18 * fh[idx_fh_F(iF, jF - 1, kF, ex)]
                             -F6  * fh[idx_fh_F(iF, jF - 2, kF, ex)]
                             +      fh[idx_fh_F(iF, jF - 3, kF, ex)]);
                    } else if ((jF - 2) >= jminF) {
                        f_rhs[p] += sfy * d12dy *
                            ( fh[idx_fh_F(iF, jF - 2, kF, ex)]
                             -EIT * fh[idx_fh_F(iF, jF - 1, kF, ex)]
                             +EIT * fh[idx_fh_F(iF, jF + 1, kF, ex)]
                             -      fh[idx_fh_F(iF, jF + 2, kF, ex)]);
                    } else if ((jF - 1) >= jminF) {
                        f_rhs[p] += sfy * d12dy *
                            (-F3  * fh[idx_fh_F(iF, jF - 1, kF, ex)]
                             -F10 * fh[idx_fh_F(iF, jF    , kF, ex)]
                             +F18 * fh[idx_fh_F(iF, jF + 1, kF, ex)]
                             -F6  * fh[idx_fh_F(iF, jF + 2, kF, ex)]
                             +      fh[idx_fh_F(iF, jF + 3, kF, ex)]);
                    }
                }
                // ---------------- z direction ----------------
                const double sfz = Sfz[p];
                if (sfz > ZEO) {
                    if ((kF + 3) <= kmaxF) {
                        f_rhs[p] += sfz * d12dz *
                            (-F3  * fh[idx_fh_F(iF, jF, kF - 1, ex)]
                             -F10 * fh[idx_fh_F(iF, jF, kF    , ex)]
                             +F18 * fh[idx_fh_F(iF, jF, kF + 1, ex)]
                             -F6  * fh[idx_fh_F(iF, jF, kF + 2, ex)]
                             +      fh[idx_fh_F(iF, jF, kF + 3, ex)]);
                    } else if ((kF + 2) <= kmaxF) {
                        f_rhs[p] += sfz * d12dz *
                            ( fh[idx_fh_F(iF, jF, kF - 2, ex)]
                             -EIT * fh[idx_fh_F(iF, jF, kF - 1, ex)]
                             +EIT * fh[idx_fh_F(iF, jF, kF + 1, ex)]
                             -      fh[idx_fh_F(iF, jF, kF + 2, ex)]);
                    } else if ((kF + 1) <= kmaxF) {
                        f_rhs[p] -= sfz * d12dz *
                            (-F3  * fh[idx_fh_F(iF, jF, kF + 1, ex)]
                             -F10 * fh[idx_fh_F(iF, jF, kF    , ex)]
                             +F18 * fh[idx_fh_F(iF, jF, kF - 1, ex)]
                             -F6  * fh[idx_fh_F(iF, jF, kF - 2, ex)]
                             +      fh[idx_fh_F(iF, jF, kF - 3, ex)]);
                    }
                } else if (sfz < ZEO) {
                    if ((kF - 3) >= kminF) {
                        f_rhs[p] -= sfz * d12dz *
                            (-F3  * fh[idx_fh_F(iF, jF, kF + 1, ex)]
                             -F10 * fh[idx_fh_F(iF, jF, kF    , ex)]
                             +F18 * fh[idx_fh_F(iF, jF, kF - 1, ex)]
                             -F6  * fh[idx_fh_F(iF, jF, kF - 2, ex)]
                             +      fh[idx_fh_F(iF, jF, kF - 3, ex)]);
                    } else if ((kF - 2) >= kminF) {
                        f_rhs[p] += sfz * d12dz *
                            ( fh[idx_fh_F(iF, jF, kF - 2, ex)]
                             -EIT * fh[idx_fh_F(iF, jF, kF - 1, ex)]
                             +EIT * fh[idx_fh_F(iF, jF, kF + 1, ex)]
                             -      fh[idx_fh_F(iF, jF, kF + 2, ex)]);
                    } else if ((kF - 1) >= kminF) {
                        f_rhs[p] += sfz * d12dz *
                            (-F3  * fh[idx_fh_F(iF, jF, kF - 1, ex)]
                             -F10 * fh[idx_fh_F(iF, jF, kF    , ex)]
                             +F18 * fh[idx_fh_F(iF, jF, kF + 1, ex)]
                             -F6  * fh[idx_fh_F(iF, jF, kF + 2, ex)]
                             +      fh[idx_fh_F(iF, jF, kF + 3, ex)]);
                    }
                }
            }
        }
    }
    // fh is deliberately not freed: it is kept for the next call on this thread
 }
--- a/AMSS_NCKU_source/fmisc.f90
+++ b/AMSS_NCKU_source/fmisc.f90
@@ -883,17 +883,13 @@ subroutine symmetry_bd(ord,extc,func,funcc,SoA)
  integer::i
 !DIR$ SIMD VECTORLENGTHFOR(KNOWN_INTEGER=8)
  funcc(1:extc(1),1:extc(2),1:extc(3)) = func
 !DIR$ SIMD VECTORLENGTHFOR(KNOWN_INTEGER=8)
   do i=0,ord-1
      funcc(-i,1:extc(2),1:extc(3)) = funcc(i+1,1:extc(2),1:extc(3))*SoA(1)
   enddo
 !DIR$ SIMD VECTORLENGTHFOR(KNOWN_INTEGER=8)
   do i=0,ord-1
      funcc(:,-i,1:extc(3)) = funcc(:,i+1,1:extc(3))*SoA(2)
   enddo
 !DIR$ SIMD VECTORLENGTHFOR(KNOWN_INTEGER=8)
   do i=0,ord-1
      funcc(:,:,-i) = funcc(:,:,i+1)*SoA(3)
   enddo
@@ -1116,7 +1112,6 @@ end subroutine d2dump
 ! Lagrangian polynomial interpolation
 !------------------------------------------------------------------------------
 !DIR$ ATTRIBUTES FORCEINLINE :: polint
  subroutine polint(xa, ya, x, y, dy, ordn)
  implicit none
--- a/AMSS_NCKU_source/kodiss.f90
+++ b/AMSS_NCKU_source/kodiss.f90
@@ -65,8 +65,6 @@ real*8,intent(in) :: eps
 !                       dx^4
 !  note the sign (-1)^r-1, now r=2
 !DIR$ SIMD VECTORLENGTHFOR(KNOWN_INTEGER=8)
 !DIR$ UNROLL PARTIAL(4)
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
--- a/AMSS_NCKU_source/lopsidediff.f90
+++ b/AMSS_NCKU_source/lopsidediff.f90
@@ -487,201 +487,6 @@ subroutine lopsided(ex,X,Y,Z,f,f_rhs,Sfx,Sfy,Sfz,Symmetry,SoA)
  end subroutine lopsided
 !-----------------------------------------------------------------------------
 ! Combined advection (lopsided) + Kreiss-Oliger dissipation (kodis)
 ! Shares the symmetry_bd buffer fh, eliminating one full-grid copy per call.
 ! Mathematically identical to calling lopsided then kodis separately.
 !-----------------------------------------------------------------------------
 subroutine lopsided_kodis(ex,X,Y,Z,f,f_rhs,Sfx,Sfy,Sfz,Symmetry,SoA,eps)
 ! Adds to f_rhs, in this order:
 !   1. one-sided (lopsided) advection terms built from Sfx, Sfy, Sfz and f
 !   2. a dissipation term built from f and scaled by eps/cof (only if eps > 0)
 ! Both parts read the same extended copy fh of f, filled by a single
 ! call to symmetry_bd.
 !
 ! Arguments:
 !   ex          grid extents in the three directions
 !   X, Y, Z     coordinate arrays; only the first two entries of each are
 !               used, to obtain the grid spacing
 !   f           field the stencils are applied to
 !   f_rhs       right hand side, updated in place
 !   Sfx,Sfy,Sfz advection coefficients; their sign selects the stencil side
 !   Symmetry    compared against NO_SYMM / EQ_SYMM to decide whether the
 !               low-side ghost points of fh may be used by the stencils
 !   SoA         per-direction factors passed on to symmetry_bd
 !   eps         dissipation strength
  implicit none
 !~~~~~~> Input parameters:
  integer, intent(in)  :: ex(1:3),Symmetry
  real*8,  intent(in)  :: X(1:ex(1)),Y(1:ex(2)),Z(1:ex(3))
  real*8,dimension(ex(1),ex(2),ex(3)),intent(in)   :: f,Sfx,Sfy,Sfz
  real*8,dimension(ex(1),ex(2),ex(3)),intent(inout):: f_rhs
  real*8,dimension(3),intent(in) ::SoA
  real*8,intent(in) :: eps
 !~~~~~~> local variables:
 ! note index -2,-1,0, so we have 3 extra points
  real*8,dimension(-2:ex(1),-2:ex(2),-2:ex(3))   :: fh
  integer :: imin,jmin,kmin,imax,jmax,kmax,i,j,k
  real*8 :: dX,dY,dZ
  real*8 :: d12dx,d12dy,d12dz,d2dx,d2dy,d2dz
  real*8,  parameter :: ZEO=0.d0,ONE=1.d0, F3=3.d0
  real*8,  parameter :: TWO=2.d0,F6=6.0d0,F18=1.8d1
  real*8,  parameter :: F12=1.2d1, F10=1.d1,EIT=8.d0
  integer, parameter :: NO_SYMM = 0, EQ_SYMM = 1, OCTANT = 2
 ! kodis parameters
  real*8, parameter :: SIX=6.d0,FIT=1.5d1,TWT=2.d1
  real*8, parameter :: cof=6.4d1   ! 2^6
 ! grid spacing from the first two coordinates in each direction
  dX = X(2)-X(1)
  dY = Y(2)-Y(1)
  dZ = Z(2)-Z(1)
 ! prefactors 1/(12 h) for the advection stencils
  d12dx = ONE/F12/dX
  d12dy = ONE/F12/dY
  d12dz = ONE/F12/dZ
 ! 1/(2 h) prefactors: computed here but not referenced again in this routine
  d2dx = ONE/TWO/dX
  d2dy = ONE/TWO/dY
  d2dz = ONE/TWO/dZ
 ! highest index a stencil may touch in each direction
  imax = ex(1)
  jmax = ex(2)
  kmax = ex(3)
 ! lowest index a stencil may touch; lowered to -2 (all three ghost layers
 ! of fh) when the symmetry setting allows it and the first coordinate lies
 ! within one grid spacing of zero
  imin = 1
  jmin = 1
  kmin = 1
  if(Symmetry > NO_SYMM .and. dabs(Z(1)) < dZ) kmin = -2
  if(Symmetry > EQ_SYMM .and. dabs(X(1)) < dX) imin = -2
  if(Symmetry > EQ_SYMM .and. dabs(Y(1)) < dY) jmin = -2
 ! Single symmetry_bd call shared by both advection and dissipation
  call symmetry_bd(3,ex,f,fh,SoA)
 ! ---- Advection (lopsided) loop ----
 ! upper bound set ex-1 only for efficiency, 
 ! the loop body will set ex 0 also
 ! For each direction: positive coefficient -> prefer the stencil reaching
 ! to +3, then the centered +-2 stencil, then the stencil reaching to -3;
 ! negative coefficient -> the mirrored preference order.
  do k=1,ex(3)-1
  do j=1,ex(2)-1
  do i=1,ex(1)-1
 ! x direction   
    if(Sfx(i,j,k) > ZEO)then
      if(i+3 <= imax)then
     f_rhs(i,j,k)=f_rhs(i,j,k)+                                                   &
                  Sfx(i,j,k)*d12dx*(-F3*fh(i-1,j,k)-F10*fh(i,j,k)+F18*fh(i+1,j,k) &
                                    -F6*fh(i+2,j,k)+    fh(i+3,j,k))
     elseif(i+2 <= imax)then
     f_rhs(i,j,k)=f_rhs(i,j,k)+                                                           &
                  Sfx(i,j,k)*d12dx*(fh(i-2,j,k)-EIT*fh(i-1,j,k)+EIT*fh(i+1,j,k)-fh(i+2,j,k))
     elseif(i+1 <= imax)then
     f_rhs(i,j,k)=f_rhs(i,j,k)-                                                   &
                  Sfx(i,j,k)*d12dx*(-F3*fh(i+1,j,k)-F10*fh(i,j,k)+F18*fh(i-1,j,k) &
                                    -F6*fh(i-2,j,k)+    fh(i-3,j,k))
     endif
   elseif(Sfx(i,j,k) < ZEO)then
      if(i-3 >= imin)then
     f_rhs(i,j,k)=f_rhs(i,j,k)-                                                   &
                  Sfx(i,j,k)*d12dx*(-F3*fh(i+1,j,k)-F10*fh(i,j,k)+F18*fh(i-1,j,k) &
                                    -F6*fh(i-2,j,k)+    fh(i-3,j,k))
     elseif(i-2 >= imin)then
     f_rhs(i,j,k)=f_rhs(i,j,k)+                                                           &
                  Sfx(i,j,k)*d12dx*(fh(i-2,j,k)-EIT*fh(i-1,j,k)+EIT*fh(i+1,j,k)-fh(i+2,j,k))
     elseif(i-1 >= imin)then
     f_rhs(i,j,k)=f_rhs(i,j,k)+                                                   &
                  Sfx(i,j,k)*d12dx*(-F3*fh(i-1,j,k)-F10*fh(i,j,k)+F18*fh(i+1,j,k) &
                                    -F6*fh(i+2,j,k)+    fh(i+3,j,k))
     endif
   endif
 ! y direction   
    if(Sfy(i,j,k) > ZEO)then
      if(j+3 <= jmax)then
     f_rhs(i,j,k)=f_rhs(i,j,k)+                                                   &
                  Sfy(i,j,k)*d12dy*(-F3*fh(i,j-1,k)-F10*fh(i,j,k)+F18*fh(i,j+1,k) &
                                    -F6*fh(i,j+2,k)+    fh(i,j+3,k))
     elseif(j+2 <= jmax)then
     f_rhs(i,j,k)=f_rhs(i,j,k)+                                                           &
                  Sfy(i,j,k)*d12dy*(fh(i,j-2,k)-EIT*fh(i,j-1,k)+EIT*fh(i,j+1,k)-fh(i,j+2,k))
     elseif(j+1 <= jmax)then
     f_rhs(i,j,k)=f_rhs(i,j,k)-                                                   &
                  Sfy(i,j,k)*d12dy*(-F3*fh(i,j+1,k)-F10*fh(i,j,k)+F18*fh(i,j-1,k) &
                                    -F6*fh(i,j-2,k)+    fh(i,j-3,k))
     endif
   elseif(Sfy(i,j,k) < ZEO)then
      if(j-3 >= jmin)then
     f_rhs(i,j,k)=f_rhs(i,j,k)-                                                   &
                  Sfy(i,j,k)*d12dy*(-F3*fh(i,j+1,k)-F10*fh(i,j,k)+F18*fh(i,j-1,k) &
                                    -F6*fh(i,j-2,k)+    fh(i,j-3,k))
     elseif(j-2 >= jmin)then
     f_rhs(i,j,k)=f_rhs(i,j,k)+                                                           &
                  Sfy(i,j,k)*d12dy*(fh(i,j-2,k)-EIT*fh(i,j-1,k)+EIT*fh(i,j+1,k)-fh(i,j+2,k))
     elseif(j-1 >= jmin)then
     f_rhs(i,j,k)=f_rhs(i,j,k)+                                                   &
                  Sfy(i,j,k)*d12dy*(-F3*fh(i,j-1,k)-F10*fh(i,j,k)+F18*fh(i,j+1,k) &
                                    -F6*fh(i,j+2,k)+    fh(i,j+3,k))
     endif
   endif
 ! z direction   
    if(Sfz(i,j,k) > ZEO)then
      if(k+3 <= kmax)then
     f_rhs(i,j,k)=f_rhs(i,j,k)+                                                   &
                  Sfz(i,j,k)*d12dz*(-F3*fh(i,j,k-1)-F10*fh(i,j,k)+F18*fh(i,j,k+1) &
                                    -F6*fh(i,j,k+2)+    fh(i,j,k+3))
     elseif(k+2 <= kmax)then
     f_rhs(i,j,k)=f_rhs(i,j,k)+                                                           &
                  Sfz(i,j,k)*d12dz*(fh(i,j,k-2)-EIT*fh(i,j,k-1)+EIT*fh(i,j,k+1)-fh(i,j,k+2))
     elseif(k+1 <= kmax)then
     f_rhs(i,j,k)=f_rhs(i,j,k)-                                                   &
                  Sfz(i,j,k)*d12dz*(-F3*fh(i,j,k+1)-F10*fh(i,j,k)+F18*fh(i,j,k-1) &
                                    -F6*fh(i,j,k-2)+    fh(i,j,k-3))
     endif
   elseif(Sfz(i,j,k) < ZEO)then
      if(k-3 >= kmin)then
     f_rhs(i,j,k)=f_rhs(i,j,k)-                                                   &
                  Sfz(i,j,k)*d12dz*(-F3*fh(i,j,k+1)-F10*fh(i,j,k)+F18*fh(i,j,k-1) &
                                    -F6*fh(i,j,k-2)+    fh(i,j,k-3))
     elseif(k-2 >= kmin)then
     f_rhs(i,j,k)=f_rhs(i,j,k)+                                                           &
                  Sfz(i,j,k)*d12dz*(fh(i,j,k-2)-EIT*fh(i,j,k-1)+EIT*fh(i,j,k+1)-fh(i,j,k+2))
     elseif(k-1 >= kmin)then
     f_rhs(i,j,k)=f_rhs(i,j,k)+                                                   &
                  Sfz(i,j,k)*d12dz*(-F3*fh(i,j,k-1)-F10*fh(i,j,k)+F18*fh(i,j,k+1) &
                                    -F6*fh(i,j,k+2)+    fh(i,j,k+3))
     endif
   endif
  enddo
  enddo
  enddo
 ! ---- Dissipation (kodis) loop ----
 ! Skipped entirely unless eps is strictly positive. Runs over the full
 ! extents, but only points with three neighbours available on both sides
 ! in every direction are updated. Each direction contributes the symmetric
 ! 7-point combination (1,-6,15,-20,15,-6,1) divided by the grid spacing.
  if(eps > ZEO) then
  do k=1,ex(3)
  do j=1,ex(2)
  do i=1,ex(1)
  if(i-3 >= imin .and. i+3 <= imax .and. &
     j-3 >= jmin .and. j+3 <= jmax .and. &
     k-3 >= kmin .and. k+3 <= kmax) then
   f_rhs(i,j,k)       = f_rhs(i,j,k) + eps/cof *( (     &
                              (fh(i-3,j,k)+fh(i+3,j,k)) - &
                          SIX*(fh(i-2,j,k)+fh(i+2,j,k)) + &
                          FIT*(fh(i-1,j,k)+fh(i+1,j,k)) - &
                          TWT* fh(i,j,k)            )/dX + &
                                                  (     &
                              (fh(i,j-3,k)+fh(i,j+3,k)) - &
                          SIX*(fh(i,j-2,k)+fh(i,j+2,k)) + &
                          FIT*(fh(i,j-1,k)+fh(i,j+1,k)) - &
                          TWT* fh(i,j,k)            )/dY + &
                                                  (     &
                              (fh(i,j,k-3)+fh(i,j,k+3)) - &
                          SIX*(fh(i,j,k-2)+fh(i,j,k+2)) + &
                          FIT*(fh(i,j,k-1)+fh(i,j,k+1)) - &
                          TWT* fh(i,j,k)            )/dZ )
  endif
  enddo
  enddo
  enddo
  endif
  return
  end subroutine lopsided_kodis
 #elif (ghost_width == 4)
 ! sixth order code
 ! Compute advection terms in right hand sides of field equations
--- a/AMSS_NCKU_source/makefile
+++ b/AMSS_NCKU_source/makefile
@@ -8,7 +8,7 @@ include makefile.inc
 	$(f90) $(f90appflags) -c $< -o $@
 .C.o:
-	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
+	${CXX} $(CXXAPPFLAGS) -qopenmp -c $< $(filein) -o $@
 .for.o:
 	$(f77) -c $< -o $@
@@ -28,7 +28,8 @@ C++FILES = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\
 	   bssnEScalar_class.o perf.o Z4c_class.o NullShellPatch.o\
 	   bssnEM_class.o cpbc_util.o z4c_rhs_point.o checkpoint.o\
           Parallel_bam.o scalar_class.o transpbh.o NullShellPatch2.o\
-	   NullShellPatch2_Evo.o writefile_f.o
+	   NullShellPatch2_Evo.o writefile_f.o xh_bssn_rhs.o xh_fdderivs.o xh_fderivs.o xh_kodiss.o xh_lopsided.o \
 	   xh_global_interp.o xh_polint3.o
 C++FILES_GPU = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\
           cgh.o surface_integral.o ShellPatch.o\
@@ -72,7 +73,7 @@ $(C++FILES): Block.h enforce_algebra.h fmisc.h initial_puncture.h macrodef.h\
             fadmquantites_bssn.h cpbc.h getnp4.h initial_null.h NullEvol.h\
 	     NullShellPatch.h initial_maxwell.h bssnEM_class.h getnpem2.h\
 	     empart.h NullNews.h kodiss.h Parallel_bam.h ricci_gamma.h\
-             initial_null2.h NullShellPatch2.h 
+             initial_null2.h NullShellPatch2.h xh_bssn_rhs_compute.h xh_global_interp.h
 $(C++FILES_GPU): Block.h enforce_algebra.h fmisc.h initial_puncture.h macrodef.h\
 	     misc.h monitor.h MyList.h Parallel.h MPatch.h prolongrestrict.h\
@@ -96,7 +97,7 @@ misc.o : zbesh.o
 # projects
 ABE: $(C++FILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) 
-	$(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(LDLIBS)
+	$(CLINKER) $(CXXAPPFLAGS) -qopenmp -o $@ $(C++FILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(LDLIBS)
 ABEGPU: $(C++FILES_GPU) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES)
 	$(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES_GPU) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) $(LDLIBS)
--- a/AMSS_NCKU_source/makefile.inc
+++ b/AMSS_NCKU_source/makefile.inc
@@ -13,7 +13,7 @@ LDLIBS  = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore
 ## Aggressive optimization flags + PGO Phase 2 (profile-guided optimization)
 ## -fprofile-instr-use: use collected profile data to guide optimization decisions
 ##   (branch prediction, basic block layout, inlining, loop unrolling)
-PROFDATA     = ../../pgo_profile/default.profdata
+PROFDATA     = /home/hxh/AMSS-NCKU/pgo_profile/default.profdata
 CXXAPPFLAGS  = -O3 -xHost -fp-model fast=2 -fma -ipo \
               -fprofile-instr-use=$(PROFDATA) \
               -Dfortran3 -Dnewc -I${MKLROOT}/include
--- a/AMSS_NCKU_source/surface_integral.C
+++ b/AMSS_NCKU_source/surface_integral.C
@@ -11,8 +11,6 @@
 #include <strstream>
 #include <cmath>
 #include <map>
 #include <vector>
 #include <algorithm>
 using namespace std;
 #else
 #include <iostream.h>
@@ -240,6 +238,9 @@ void surface_integral::surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *
  shellf = new double[n_tot * InList];
  GH->PatL[lev]->data->Interp_Points(DG_List, n_tot, pox, shellf, Symmetry, Nmin, Nmax);
  //|~~~~~> Integrate the dot product of Dphi with the surface normal.
  double *RP_out, *IP_out;
  RP_out = new double[NN];
  IP_out = new double[NN];
@@ -2652,6 +2653,7 @@ void surface_integral::surf_MassPAng(double rex, int lev, cgh *GH, var *chi, var
  // we have assumed there is only one box on this level,
  // so we do not need loop boxes
  GH->PatL[lev]->data->Interp_Points(DG_List, n_tot, pox, shellf, Symmetry, Comm_here);
  double Mass_out = 0;
--- a/AMSS_NCKU_source/xh_bssn_rhs.C
+++ b/AMSS_NCKU_source/xh_bssn_rhs.C
--- a/AMSS_NCKU_source/xh_bssn_rhs_compute.h
+++ b/AMSS_NCKU_source/xh_bssn_rhs_compute.h
@@ -0,0 +1,30 @@
 #include "xh_tool.h"
 extern "C"
 {
 /*
 * C-linkage declaration of f_compute_rhs_bssn_xh.
 *
 * Scalars (T, Symmetry, Lev, eps, co) are passed by reference and every
 * field/coordinate argument is passed as a raw double pointer; ex is an
 * int pointer (presumably the three grid extents, as in the other xh_*
 * routines -- TODO confirm against the definition).
 *
 * NOTE(review): judging only by the parameter names, the arguments group into
 *   - grid description:        ex, T, X, Y, Z
 *   - evolved fields:          chi, trK, dxx..dzz, Axx..Azz, Gamx..Gamz,
 *                              Lap, betax..betaz, dtSfx..dtSfz
 *   - right-hand sides:        the corresponding *_rhs arrays
 *   - matter sources:          rho, Sx..Sz, Sxx..Szz
 *   - connection / curvature:  Gamxxx..Gamzzz, Rxx..Rzz
 *   - constraint residuals:    ham_Res, mov*_Res, Gm*_Res
 * Which arrays are inputs and which are outputs, and the meaning of the int
 * return value, are not visible from this header -- confirm in xh_bssn_rhs.C.
 */
 int f_compute_rhs_bssn_xh(int *ex, double &T, 
                       double *X, double *Y, double *Z,
                       double *chi, double *trK,
                       double *dxx, double *gxy, double *gxz, double *dyy, double *gyz, double *dzz,
                       double *Axx, double *Axy, double *Axz, double *Ayy, double *Ayz, double *Azz,
                       double *Gamx, double *Gamy, double *Gamz,
                       double *Lap, double *betax, double *betay, double *betaz,
                       double *dtSfx, double *dtSfy, double *dtSfz,
                       double *chi_rhs, double *trK_rhs,
                       double *gxx_rhs, double *gxy_rhs, double *gxz_rhs, double *gyy_rhs, double *gyz_rhs, double *gzz_rhs,
                       double *Axx_rhs, double *Axy_rhs, double *Axz_rhs, double *Ayy_rhs, double *Ayz_rhs, double *Azz_rhs,
                       double *Gamx_rhs, double *Gamy_rhs, double *Gamz_rhs,
                       double *Lap_rhs, double *betax_rhs, double *betay_rhs, double *betaz_rhs,
                       double *dtSfx_rhs, double *dtSfy_rhs, double *dtSfz_rhs,
                       double *rho, double *Sx, double *Sy, double *Sz,
                       double *Sxx, double *Sxy, double *Sxz, double *Syy, double *Syz, double *Szz,
                       double *Gamxxx, double *Gamxxy, double *Gamxxz, double *Gamxyy, double *Gamxyz, double *Gamxzz,
                       double *Gamyxx, double *Gamyxy, double *Gamyxz, double *Gamyyy, double *Gamyyz, double *Gamyzz,
                       double *Gamzxx, double *Gamzxy, double *Gamzxz, double *Gamzyy, double *Gamzyz, double *Gamzzz,
                       double *Rxx, double *Rxy, double *Rxz, double *Ryy, double *Ryz, double *Rzz,
                       double *ham_Res, double *movx_Res, double *movy_Res, double *movz_Res,
                       double *Gmx_Res, double *Gmy_Res, double *Gmz_Res,
                       int &Symmetry, int &Lev, double &eps, int &co
                       ); 
 }
--- a/AMSS_NCKU_source/xh_fdderivs.C
+++ b/AMSS_NCKU_source/xh_fdderivs.C
@@ -0,0 +1,311 @@
 #include "xh_tool.h"
 /*
 * C port of the Fortran routine fdderivs: second derivatives of a grid function.
 *
 * Parameters:
 *   ex            grid extents {ex1, ex2, ex3}
 *   f             input field, ex1*ex2*ex3 values addressed through idx_func0 / idx_ex
 *   fxx..fzz      outputs (same layout as f); every entry is written
 *   X, Y, Z       coordinate arrays; only the first two entries of each are
 *                 read, to obtain the (uniform) spacing
 *   SYM1..SYM3    factors applied when mirroring f into the ghost layers in
 *                 x, y and z respectively
 *   Symmetry      symmetry mode; values above NO_SYMM / EQ_SYMM allow the
 *                 stencils to reach into the mirrored ghost layers
 *   onoff         unused, kept for interface compatibility
 *
 * For each point the 5-point (fourth-order) stencils are used when two
 * neighbours are available on both sides in all three directions, the 3-point
 * (second-order) stencils when only one neighbour is available, and zero
 * otherwise.
 *
 * The ghost-extended copy of f lives in a per-thread scratch buffer that is
 * grown on demand and intentionally kept between calls.  If the buffer cannot
 * be allocated the function returns without touching the outputs.
 */
 void fdderivs(const int ex[3],
              const double *f,
              double *fxx, double *fxy, double *fxz,
              double *fyy, double *fyz, double *fzz,
              const double *X, const double *Y, const double *Z,
              double SYM1, double SYM2, double SYM3,
              int Symmetry, int onoff)
 {
    (void)onoff;
    const int NO_SYMM = 0, EQ_SYMM = 1;
    const double ZEO = 0.0, ONE = 1.0, TWO = 2.0;
    const double F1o4   = 2.5e-1;          // 1/4
    const double F8     = 8.0;
    const double F16    = 16.0;
    const double F30    = 30.0;
    const double F1o12  = ONE / 12.0;
    const double F1o144 = ONE / 144.0;
    const int ex1 = ex[0], ex2 = ex[1], ex3 = ex[2];
    const double dX = X[1] - X[0];
    const double dY = Y[1] - Y[0];
    const double dZ = Z[1] - Z[0];
    // Fortran-style (1-based) upper bounds of the usable index range.
    const int imaxF = ex1;
    const int jmaxF = ex2;
    const int kmaxF = ex3;
    // Lower bounds: 1 normally, -1 when the mirrored ghost layers may be used.
    int iminF = 1, jminF = 1, kminF = 1;
    if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -1;
    if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -1;
    if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -1;
    /* fh: (ex1+2)*(ex2+2)*(ex3+2) because ord=2 */
    const size_t nx = (size_t)ex1 + 2;
    const size_t ny = (size_t)ex2 + 2;
    const size_t nz = (size_t)ex3 + 2;
    const size_t fh_size = nx * ny * nz;
    /* Coefficients, following the original Fortran expressions. */
    const double Sdxdx = ONE / (dX * dX);
    const double Sdydy = ONE / (dY * dY);
    const double Sdzdz = ONE / (dZ * dZ);
    const double Fdxdx = F1o12 / (dX * dX);
    const double Fdydy = F1o12 / (dY * dY);
    const double Fdzdz = F1o12 / (dZ * dZ);
    const double Sdxdy = F1o4 / (dX * dY);
    const double Sdxdz = F1o4 / (dX * dZ);
    const double Sdydz = F1o4 / (dY * dZ);
    const double Fdxdy = F1o144 / (dX * dY);
    const double Fdxdz = F1o144 / (dX * dZ);
    const double Fdydz = F1o144 / (dY * dZ);
    // Per-thread scratch buffer, reused across calls and only ever grown.
    static thread_local double *fh = NULL;
    static thread_local size_t cap = 0;
    if (fh_size > cap) {
        // aligned_alloc expects the byte count to be a multiple of the alignment.
        const size_t fh_bytes = ((fh_size * sizeof(double) + 63) / 64) * 64;
        free(fh);
        fh = (double*)aligned_alloc(64, fh_bytes);
        // Record the new capacity only on success so a later call retries
        // the allocation instead of silently reusing a NULL buffer.
        cap = fh ? fh_size : 0;
    }
    if (!fh) return;
    // Inlined equivalent of symmetry_bd(2, ex, f, fh, SoA).
    const double SoA[3] = { SYM1, SYM2, SYM3 };
    // 1) copy the interior: funcc(1:ex1, 1:ex2, 1:ex3) = f
    for (int k0 = 0; k0 < ex[2]; ++k0) {
        for (int j0 = 0; j0 < ex[1]; ++j0) {
            for (int i0 = 0; i0 < ex[0]; ++i0) {
                const int iF = i0 + 1, jF = j0 + 1, kF = k0 + 1; 
                fh[idx_funcc_F(iF, jF, kF, 2, ex)] = f[idx_func0(i0, j0, k0, ex)];
            }
        }
    }
    // 2) do i=0..ord-1: funcc(-i, 1:extc2, 1:extc3) = funcc(i+1, ...)*SoA(1)
    for (int ii = 0; ii <= 2 - 1; ++ii) {
        const int iF_dst = -ii;       // 0, -1, -2, ...
        const int iF_src = ii + 1;    // 1, 2, 3, ...
        for (int kF = 1; kF <= ex[2]; ++kF) {
            for (int jF = 1; jF <= ex[1]; ++jF) {
                fh[idx_funcc_F(iF_dst, jF, kF, 2, ex)] = 
                    fh[idx_funcc_F(iF_src, jF, kF, 2, ex)] * SoA[0];
            }
        }
    }
    // 3) do i=0..ord-1: funcc(:,-i, 1:extc3) = funcc(:, i+1, 1:extc3)*SoA(2)
    // The Fortran ":" covers the full i range (-ord+1..extc1), ghosts included.
    for (int jj = 0; jj <= 2 - 1; ++jj) {
        const int jF_dst = -jj;
        const int jF_src = jj + 1;
        for (int kF = 1; kF <= ex[2]; ++kF) {
            for (int iF = -2 + 1; iF <= ex[0]; ++iF) {
                fh[idx_funcc_F(iF, jF_dst, kF, 2, ex)] =
                    fh[idx_funcc_F(iF, jF_src, kF, 2, ex)] * SoA[1];
            }
        }
    }
    // 4) do i=0..ord-1: funcc(:,:,-i) = funcc(:,:, i+1)*SoA(3)
    for (int kk = 0; kk <= 2 - 1; ++kk) {
        const int kF_dst = -kk;
        const int kF_src = kk + 1;
        for (int jF = -2 + 1; jF <= ex[1]; ++jF) {
            for (int iF = -2 + 1; iF <= ex[0]; ++iF) {
                fh[idx_funcc_F(iF, jF, kF_dst, 2, ex)] =
                    fh[idx_funcc_F(iF, jF, kF_src, 2, ex)] * SoA[2];
            }
        }
    }
    /*
     * The Fortran reference zero-fills all six outputs and then loops over
     * i=1..ex1-1, j=1..ex2-1, k=1..ex3-1.  Here the loops cover the full
     * extent instead: a point on an upper face fails both stencil tests
     * (index+1 exceeds the upper bound) and lands in the zero branch, so
     * every output entry is written without a separate zero-fill pass.
     */
    for (int k0 = 0; k0 < ex3; ++k0) {
        const int kF = k0 + 1;
        for (int j0 = 0; j0 < ex2; ++j0) {
            const int jF = j0 + 1;
            for (int i0 = 0; i0 < ex1; ++i0) {
                const int iF = i0 + 1;
                const size_t p = idx_ex(i0, j0, k0, ex);
                /* High-order branch: i+-2, j+-2, k+-2 are all in range. */
                if ((iF + 2) <= imaxF && (iF - 2) >= iminF &&
                    (jF + 2) <= jmaxF && (jF - 2) >= jminF &&
                    (kF + 2) <= kmaxF && (kF - 2) >= kminF)
                {
                    fxx[p] = Fdxdx * (
                        -fh[idx_fh_F_ord2(iF - 2, jF,     kF,     ex)] +
                        F16 * fh[idx_fh_F_ord2(iF - 1, jF,     kF,     ex)] -
                        F30 * fh[idx_fh_F_ord2(iF,     jF,     kF,     ex)] -
                        fh[idx_fh_F_ord2(iF + 2, jF,     kF,     ex)] +
                        F16 * fh[idx_fh_F_ord2(iF + 1, jF,     kF,     ex)]
                    );
                    fyy[p] = Fdydy * (
                        -fh[idx_fh_F_ord2(iF,     jF - 2, kF,     ex)] +
                        F16 * fh[idx_fh_F_ord2(iF,     jF - 1, kF,     ex)] -
                        F30 * fh[idx_fh_F_ord2(iF,     jF,     kF,     ex)] -
                        fh[idx_fh_F_ord2(iF,     jF + 2, kF,     ex)] +
                        F16 * fh[idx_fh_F_ord2(iF,     jF + 1, kF,     ex)]
                    );
                    fzz[p] = Fdzdz * (
                        -fh[idx_fh_F_ord2(iF,     jF,     kF - 2, ex)] +
                        F16 * fh[idx_fh_F_ord2(iF,     jF,     kF - 1, ex)] -
                        F30 * fh[idx_fh_F_ord2(iF,     jF,     kF,     ex)] -
                        fh[idx_fh_F_ord2(iF,     jF,     kF + 2, ex)] +
                        F16 * fh[idx_fh_F_ord2(iF,     jF,     kF + 1, ex)]
                    );
                    /* fxy, high order: keeps the Fortran grouping of terms exactly. */
                    {
                        const double t_jm2 =
                            ( fh[idx_fh_F_ord2(iF - 2, jF - 2, kF, ex)]
                             -F8*fh[idx_fh_F_ord2(iF - 1, jF - 2, kF, ex)]
                             +F8*fh[idx_fh_F_ord2(iF + 1, jF - 2, kF, ex)]
                             -    fh[idx_fh_F_ord2(iF + 2, jF - 2, kF, ex)] );
                        const double t_jm1 =
                            ( fh[idx_fh_F_ord2(iF - 2, jF - 1, kF, ex)]
                             -F8*fh[idx_fh_F_ord2(iF - 1, jF - 1, kF, ex)]
                             +F8*fh[idx_fh_F_ord2(iF + 1, jF - 1, kF, ex)]
                             -    fh[idx_fh_F_ord2(iF + 2, jF - 1, kF, ex)] );
                        const double t_jp1 =
                            ( fh[idx_fh_F_ord2(iF - 2, jF + 1, kF, ex)]
                             -F8*fh[idx_fh_F_ord2(iF - 1, jF + 1, kF, ex)]
                             +F8*fh[idx_fh_F_ord2(iF + 1, jF + 1, kF, ex)]
                             -    fh[idx_fh_F_ord2(iF + 2, jF + 1, kF, ex)] );
                        const double t_jp2 =
                            ( fh[idx_fh_F_ord2(iF - 2, jF + 2, kF, ex)]
                             -F8*fh[idx_fh_F_ord2(iF - 1, jF + 2, kF, ex)]
                             +F8*fh[idx_fh_F_ord2(iF + 1, jF + 2, kF, ex)]
                             -    fh[idx_fh_F_ord2(iF + 2, jF + 2, kF, ex)] );
                        fxy[p] = Fdxdy * ( t_jm2 - F8 * t_jm1 + F8 * t_jp1 - t_jp2 );
                    }
                    /* fxz, high order */
                    {
                        const double t_km2 =
                            ( fh[idx_fh_F_ord2(iF - 2, jF, kF - 2, ex)]
                             -F8*fh[idx_fh_F_ord2(iF - 1, jF, kF - 2, ex)]
                             +F8*fh[idx_fh_F_ord2(iF + 1, jF, kF - 2, ex)]
                             -    fh[idx_fh_F_ord2(iF + 2, jF, kF - 2, ex)] );
                        const double t_km1 =
                            ( fh[idx_fh_F_ord2(iF - 2, jF, kF - 1, ex)]
                             -F8*fh[idx_fh_F_ord2(iF - 1, jF, kF - 1, ex)]
                             +F8*fh[idx_fh_F_ord2(iF + 1, jF, kF - 1, ex)]
                             -    fh[idx_fh_F_ord2(iF + 2, jF, kF - 1, ex)] );
                        const double t_kp1 =
                            ( fh[idx_fh_F_ord2(iF - 2, jF, kF + 1, ex)]
                             -F8*fh[idx_fh_F_ord2(iF - 1, jF, kF + 1, ex)]
                             +F8*fh[idx_fh_F_ord2(iF + 1, jF, kF + 1, ex)]
                             -    fh[idx_fh_F_ord2(iF + 2, jF, kF + 1, ex)] );
                        const double t_kp2 =
                            ( fh[idx_fh_F_ord2(iF - 2, jF, kF + 2, ex)]
                             -F8*fh[idx_fh_F_ord2(iF - 1, jF, kF + 2, ex)]
                             +F8*fh[idx_fh_F_ord2(iF + 1, jF, kF + 2, ex)]
                             -    fh[idx_fh_F_ord2(iF + 2, jF, kF + 2, ex)] );
                        fxz[p] = Fdxdz * ( t_km2 - F8 * t_km1 + F8 * t_kp1 - t_kp2 );
                    }
                    /* fyz, high order */
                    {
                        const double t_km2 =
                            ( fh[idx_fh_F_ord2(iF, jF - 2, kF - 2, ex)]
                             -F8*fh[idx_fh_F_ord2(iF, jF - 1, kF - 2, ex)]
                             +F8*fh[idx_fh_F_ord2(iF, jF + 1, kF - 2, ex)]
                             -    fh[idx_fh_F_ord2(iF, jF + 2, kF - 2, ex)] );
                        const double t_km1 =
                            ( fh[idx_fh_F_ord2(iF, jF - 2, kF - 1, ex)]
                             -F8*fh[idx_fh_F_ord2(iF, jF - 1, kF - 1, ex)]
                             +F8*fh[idx_fh_F_ord2(iF, jF + 1, kF - 1, ex)]
                             -    fh[idx_fh_F_ord2(iF, jF + 2, kF - 1, ex)] );
                        const double t_kp1 =
                            ( fh[idx_fh_F_ord2(iF, jF - 2, kF + 1, ex)]
                             -F8*fh[idx_fh_F_ord2(iF, jF - 1, kF + 1, ex)]
                             +F8*fh[idx_fh_F_ord2(iF, jF + 1, kF + 1, ex)]
                             -    fh[idx_fh_F_ord2(iF, jF + 2, kF + 1, ex)] );
                        const double t_kp2 =
                            ( fh[idx_fh_F_ord2(iF, jF - 2, kF + 2, ex)]
                             -F8*fh[idx_fh_F_ord2(iF, jF - 1, kF + 2, ex)]
                             +F8*fh[idx_fh_F_ord2(iF, jF + 1, kF + 2, ex)]
                             -    fh[idx_fh_F_ord2(iF, jF + 2, kF + 2, ex)] );
                        fyz[p] = Fdydz * ( t_km2 - F8 * t_km1 + F8 * t_kp1 - t_kp2 );
                    }
                }
                /* Second-order branch: i+-1, j+-1, k+-1 are in range. */
                else if ((iF + 1) <= imaxF && (iF - 1) >= iminF &&
                         (jF + 1) <= jmaxF && (jF - 1) >= jminF &&
                         (kF + 1) <= kmaxF && (kF - 1) >= kminF)
                {
                    fxx[p] = Sdxdx * (
                        fh[idx_fh_F_ord2(iF - 1, jF,     kF,     ex)] -
                        TWO * fh[idx_fh_F_ord2(iF,     jF,     kF,     ex)] +
                        fh[idx_fh_F_ord2(iF + 1, jF,     kF,     ex)]
                    );
                    fyy[p] = Sdydy * (
                        fh[idx_fh_F_ord2(iF,     jF - 1, kF,     ex)] -
                        TWO * fh[idx_fh_F_ord2(iF,     jF,     kF,     ex)] +
                        fh[idx_fh_F_ord2(iF,     jF + 1, kF,     ex)]
                    );
                    fzz[p] = Sdzdz * (
                        fh[idx_fh_F_ord2(iF,     jF,     kF - 1, ex)] -
                        TWO * fh[idx_fh_F_ord2(iF,     jF,     kF,     ex)] +
                        fh[idx_fh_F_ord2(iF,     jF,     kF + 1, ex)]
                    );
                    fxy[p] = Sdxdy * (
                        fh[idx_fh_F_ord2(iF - 1, jF - 1, kF, ex)] -
                        fh[idx_fh_F_ord2(iF + 1, jF - 1, kF, ex)] -
                        fh[idx_fh_F_ord2(iF - 1, jF + 1, kF, ex)] +
                        fh[idx_fh_F_ord2(iF + 1, jF + 1, kF, ex)]
                    );
                    fxz[p] = Sdxdz * (
                        fh[idx_fh_F_ord2(iF - 1, jF, kF - 1, ex)] -
                        fh[idx_fh_F_ord2(iF + 1, jF, kF - 1, ex)] -
                        fh[idx_fh_F_ord2(iF - 1, jF, kF + 1, ex)] +
                        fh[idx_fh_F_ord2(iF + 1, jF, kF + 1, ex)]
                    );
                    fyz[p] = Sdydz * (
                        fh[idx_fh_F_ord2(iF, jF - 1, kF - 1, ex)] -
                        fh[idx_fh_F_ord2(iF, jF + 1, kF - 1, ex)] -
                        fh[idx_fh_F_ord2(iF, jF - 1, kF + 1, ex)] +
                        fh[idx_fh_F_ord2(iF, jF + 1, kF + 1, ex)]
                    );
                }else{
                    /* No usable stencil at this point (this includes every
                       point on an upper face): the derivatives are set to zero. */
                    fxx[p] = ZEO;
                    fyy[p] = ZEO;
                    fzz[p] = ZEO;
                    fxy[p] = ZEO;
                    fxz[p] = ZEO;
                    fyz[p] = ZEO;
                }
            }
        }
    }
    // fh is deliberately not freed: it is the per-thread cache declared above.
 }
--- a/AMSS_NCKU_source/xh_fderivs.C
+++ b/AMSS_NCKU_source/xh_fderivs.C
@@ -0,0 +1,145 @@
 #include "xh_tool.h"
 /*
 * C port of the Fortran routine
 *   subroutine fderivs(ex,f,fx,fy,fz,X,Y,Z,SYM1,SYM2,SYM3,symmetry,onoff)
 * computing the first derivatives of a grid function.
 *
 * Parameters:
 *   ex            grid extents {ex1, ex2, ex3}
 *   f             input field, ex1*ex2*ex3 values laid out as idx_ex
 *   fx, fy, fz    outputs (same layout); zero wherever no stencil fits
 *   X, Y, Z       coordinate arrays of length ex1, ex2, ex3; only the first
 *                 two entries of each are read, to obtain the spacing
 *   SYM1..SYM3    factors handed to symmetry_bd as SoA when the ghost
 *                 layers are filled
 *   Symmetry      symmetry mode; values above NO_SYMM / EQ_SYMM allow the
 *                 stencils to reach into the ghost layers
 *   onoff         unused (also unused in the Fortran original)
 *
 * The ghost-extended copy of f lives in a per-thread scratch buffer that is
 * grown on demand and intentionally kept between calls.  If the buffer cannot
 * be allocated the function returns without touching the outputs.
 */
 void fderivs(const int ex[3],
             const double *f,
             double *fx, double *fy, double *fz,
             const double *X, const double *Y, const double *Z,
             double SYM1, double SYM2, double SYM3,
             int Symmetry, int onoff)
 {
    (void)onoff; // not used in the Fortran original either
    const double ZEO = 0.0, ONE = 1.0;
    const double TWO = 2.0, EIT = 8.0;
    const double F12 = 12.0;
    const int NO_SYMM = 0, EQ_SYMM = 1; // OCTANT=2 is not referenced directly here
    const int ex1 = ex[0], ex2 = ex[1], ex3 = ex[2];
    // dX = X(2)-X(1) -> C: X[1]-X[0]
    const double dX = X[1] - X[0];
    const double dY = Y[1] - Y[0];
    const double dZ = Z[1] - Z[0];
    // Fortran-style lower index bounds: 1 normally, -1 when ghost layers may be used.
    int iminF = 1, jminF = 1, kminF = 1;
    if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -1;
    if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -1;
    if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -1;
    // SoA(1:3) = SYM1,SYM2,SYM3
    const double SoA[3] = { SYM1, SYM2, SYM3 };
    // fh: (ex1+2)*(ex2+2)*(ex3+2) because ord=2
    const size_t nx = (size_t)ex1 + 2;
    const size_t ny = (size_t)ex2 + 2;
    const size_t nz = (size_t)ex3 + 2;
    const size_t fh_size = nx * ny * nz;
    // Per-thread scratch buffer, reused across calls and only ever grown.
    static thread_local double *fh = NULL;
    static thread_local size_t cap = 0;
    if (fh_size > cap) {
        // aligned_alloc expects the byte count to be a multiple of the alignment.
        const size_t fh_bytes = ((fh_size * sizeof(double) + 63) / 64) * 64;
        free(fh);
        fh = (double*)aligned_alloc(64, fh_bytes);
        // Record the new capacity only on success so a later call retries
        // the allocation instead of silently reusing a NULL buffer.
        cap = fh ? fh_size : 0;
    }
    if (!fh) return;
    // call symmetry_bd(2,ex,f,fh,SoA)
    symmetry_bd(2, ex, f, fh, SoA);
    const double d12dx = ONE / F12 / dX;
    const double d12dy = ONE / F12 / dY;
    const double d12dz = ONE / F12 / dZ;
    const double d2dx  = ONE / TWO / dX;
    const double d2dy  = ONE / TWO / dY;
    const double d2dz  = ONE / TWO / dZ;
    // fx = fy = fz = 0
    const size_t all = (size_t)ex1 * (size_t)ex2 * (size_t)ex3;
    for (size_t p = 0; p < all; ++p) {
        fx[p] = ZEO;
        fy[p] = ZEO;
        fz[p] = ZEO;
    }
    /*
     * Fortran loops:
     * do k=1,ex3-1
     * do j=1,ex2-1
     * do i=1,ex1-1
     *
     * C: k0=0..ex3-2, j0=0..ex2-2, i0=0..ex1-2
     */
    for (int k0 = 0; k0 <= ex3 - 2; ++k0) {
        const int kF = k0 + 1;
        for (int j0 = 0; j0 <= ex2 - 2; ++j0) {
            const int jF = j0 + 1;
            for (int i0 = 0; i0 <= ex1 - 2; ++i0) {
                const int iF = i0 + 1;
                const size_t p = idx_ex(i0, j0, k0, ex);
                // if(i+2 <= imax .and. i-2 >= imin ... )  (all Fortran indices)
                if ((iF + 2) <= ex1 && (iF - 2) >= iminF &&
                    (jF + 2) <= ex2 && (jF - 2) >= jminF &&
                    (kF + 2) <= ex3 && (kF - 2) >= kminF)
                {
                    fx[p] = d12dx * (
                        fh[idx_fh_F_ord2(iF - 2, jF,     kF,     ex)] -
                        EIT * fh[idx_fh_F_ord2(iF - 1, jF,     kF,     ex)] +
                        EIT * fh[idx_fh_F_ord2(iF + 1, jF,     kF,     ex)] -
                        fh[idx_fh_F_ord2(iF + 2, jF,     kF,     ex)]
                    );
                    fy[p] = d12dy * (
                        fh[idx_fh_F_ord2(iF,     jF - 2, kF,     ex)] -
                        EIT * fh[idx_fh_F_ord2(iF,     jF - 1, kF,     ex)] +
                        EIT * fh[idx_fh_F_ord2(iF,     jF + 1, kF,     ex)] -
                        fh[idx_fh_F_ord2(iF,     jF + 2, kF,     ex)]
                    );
                    fz[p] = d12dz * (
                        fh[idx_fh_F_ord2(iF,     jF,     kF - 2, ex)] -
                        EIT * fh[idx_fh_F_ord2(iF,     jF,     kF - 1, ex)] +
                        EIT * fh[idx_fh_F_ord2(iF,     jF,     kF + 1, ex)] -
                        fh[idx_fh_F_ord2(iF,     jF,     kF + 2, ex)]
                    );
                }
                // elseif(i+1 <= imax .and. i-1 >= imin ...)
                else if ((iF + 1) <= ex1 && (iF - 1) >= iminF &&
                         (jF + 1) <= ex2 && (jF - 1) >= jminF &&
                         (kF + 1) <= ex3 && (kF - 1) >= kminF)
                {
                    fx[p] = d2dx * (
                        -fh[idx_fh_F_ord2(iF - 1, jF,     kF,     ex)] +
                         fh[idx_fh_F_ord2(iF + 1, jF,     kF,     ex)]
                    );
                    fy[p] = d2dy * (
                        -fh[idx_fh_F_ord2(iF,     jF - 1, kF,     ex)] +
                         fh[idx_fh_F_ord2(iF,     jF + 1, kF,     ex)]
                    );
                    fz[p] = d2dz * (
                        -fh[idx_fh_F_ord2(iF,     jF,     kF - 1, ex)] +
                         fh[idx_fh_F_ord2(iF,     jF,     kF + 1, ex)]
                    );
                }
            }
        }
    }
    // fh is deliberately not freed: it is the per-thread cache declared above.
 }
--- a/AMSS_NCKU_source/xh_global_interp.C
+++ b/AMSS_NCKU_source/xh_global_interp.C
@@ -0,0 +1,143 @@
 #include "xh_global_interp.h"
 /* 你已有的 polin3（由前面 Fortran->C 翻译得到） */
 // void polin3(const double *x1a, const double *x2a, const double *x3a,
 //             const double *ya, double x1, double x2, double x3,
 //             double *y, double *dy, int ordn);
 /*
  你需要提供 decide3d 的实现（这里仅声明）。
  Fortran: decide3d(ex,f,f,cxB,cxT,SoA,ya,ORDN,Symmetry)
  - ex: [3]
  - f: 三维场（列主序）
  - cxB/cxT: 3 维窗口起止（Fortran 1-based，且可能 <=0）
  - SoA: [3]
  - ya: 输出 ORDN^3 的采样块（列主序）
  - return: 0 表示正常；非 0 表示错误（对应 Fortran logical = .true.）
 */
 // int xh_decide3d(const int ex[3],
 //              const double *f_in,
 //              const double *f_in2,   /* Fortran 里传了 f,f；按原样保留 */
 //              const int cxB[3],
 //              const int cxT[3],
 //              const double SoA[3],
 //              double *ya,
 //              int ordn,
 //              int symmetry);
 /* Read a coordinate array at a Fortran-style 1-based index: Fortran X(idxF)
    corresponds to C X[idxF - 1].  The caller must pass an index that maps
    inside the array (expressions such as X(2 - cxB) with cxB <= 0 do). */
 static inline double X_at_FortranIndex(const double *X, int idxF) {
    const int c_index = idxF - 1;
    return X[c_index];
 }
 /* Stand-in for Fortran idint: convert to int by truncating toward zero
    (identical to floor for non-negative arguments). */
 static inline int idint_like(double a) {
    const int truncated = static_cast<int>(a);
    return truncated;
 }
 /*
 * C port of the Fortran routine global_interp.
 *
 * Picks an ORDN-point window per axis around (x1, y1, z1), clamps it to the
 * allowed index range, has xh_decide3d gather the ORDN^3 samples of f for
 * that window, and lets xh_polin3 interpolate them; the result is stored in
 * f_int.
 *
 *   ex          grid extents {ex1, ex2, ex3}
 *   X, Y, Z     coordinate arrays; the spacing is taken from their first two entries
 *   f           field values, f(ex1,ex2,ex3) column-major
 *   f_int       receives the interpolated value
 *   x1, y1, z1  target position
 *   ORDN        number of interpolation points per axis
 *   SoA         forwarded unchanged to xh_decide3d
 *   symmetry    NO_SYMM / EQUATORIAL / OCTANT; decides on which axes the
 *               window may start at a non-positive Fortran index
 *
 * The process is terminated with exit(1) when memory cannot be obtained or
 * when xh_decide3d reports an error.
 */
 void xh_global_interp(const int ex[3],
                   const double *X, const double *Y, const double *Z,
                   const double *f,                 /* f(ex1,ex2,ex3) column-major */
                   double &f_int,
                   double x1, double y1, double z1,
                   int ORDN,
                   const double SoA[3],
                   int symmetry)
 {
    enum { NO_SYMM = 0, EQUATORIAL = 1, OCTANT = 2 };
    /* Treat the three axes uniformly. */
    const double *axis[3]  = { X, Y, Z };
    const double target[3] = { x1, y1, z1 };
    /* Grid spacing per axis: P(2) - P(1) in Fortran indexing. */
    double spacing[3];
    for (int d = 0; d < 3; ++d) {
        spacing[d] = X_at_FortranIndex(axis[d], 2) - X_at_FortranIndex(axis[d], 1);
    }
    /* Interpolation nodes 0, 1, ..., ORDN-1 and room for the ORDN^3 samples. */
    const size_t npts = (size_t)ORDN;
    double *nodes   = (double*)malloc(npts * sizeof(double));
    double *samples = (double*)malloc(npts * npts * npts * sizeof(double));
    if (nodes == NULL || samples == NULL) {
        fprintf(stderr, "global_interp: malloc failed\n");
        exit(1);
    }
    for (int q = 0; q < ORDN; ++q) {
        nodes[q] = (double)q;
    }
    const int half = ORDN / 2;  /* integer division, as in Fortran */
    int lo[3], hi[3];           /* window bounds in Fortran 1-based indexing; lo may be <= 0 */
    double frac[3];             /* target position in units of the spacing, measured from lo */
    for (int d = 0; d < 3; ++d) {
        /* idint((p - P(1))/dP + 0.4) + 1 */
        const int nearest =
            idint_like((target[d] - X_at_FortranIndex(axis[d], 1)) / spacing[d] + 0.4) + 1;
        lo[d] = nearest - half + 1;
        hi[d] = lo[d] + ORDN - 1;
        /* Smallest admissible start index: 1, or -half+2 on an axis whose first
           point lies within one spacing of zero and which the symmetry mode covers
           (x and y need OCTANT, z needs anything but NO_SYMM). */
        const bool axis_covered = (d == 2) ? (symmetry != NO_SYMM) : (symmetry == OCTANT);
        int lowest = 1;
        if (axis_covered && fabs(X_at_FortranIndex(axis[d], 1)) < spacing[d]) {
            lowest = -half + 2;
        }
        const int highest = ex[d];
        /* Clamp the window [lo, hi] into [lowest, highest]. */
        if (lo[d] < lowest) {
            lo[d] = lowest;
            hi[d] = lo[d] + ORDN - 1;
        }
        if (hi[d] > highest) {
            hi[d] = highest;
            lo[d] = hi[d] + 1 - ORDN;
        }
        /* lo > 0: (p - P(lo))/dP ; otherwise: (p + P(2 - lo))/dP */
        if (lo[d] > 0) {
            frac[d] = (target[d] - X_at_FortranIndex(axis[d], lo[d])) / spacing[d];
        } else {
            frac[d] = (target[d] + X_at_FortranIndex(axis[d], 2 - lo[d])) / spacing[d];
        }
    }
    /* Gather the samples for the window; a non-zero return signals failure. */
    const int failed = xh_decide3d(ex, f, f, lo, hi, SoA, samples, ORDN, symmetry);
    if (failed) {
        printf("global_interp position: %g %g %g\n", x1, y1, z1);
        printf("data range: %g %g   %g %g   %g %g\n",
               X_at_FortranIndex(X, 1), X_at_FortranIndex(X, ex[0]),
               X_at_FortranIndex(Y, 1), X_at_FortranIndex(Y, ex[1]),
               X_at_FortranIndex(Z, 1), X_at_FortranIndex(Z, ex[2]));
        exit(1);
    }
    /* polin3(x1a,x1a,x1a,ya,cx(1),cx(2),cx(3),f_int,ddy,ORDN) */
    double err_estimate;
    xh_polin3(nodes, nodes, nodes, samples, frac[0], frac[1], frac[2], f_int, &err_estimate, ORDN);
    free(nodes);
    free(samples);
 }
--- a/AMSS_NCKU_source/xh_global_interp.h
+++ b/AMSS_NCKU_source/xh_global_interp.h
@@ -0,0 +1,12 @@
 #include "xh_po.h"
 extern "C"{
    /*
     * Interpolate the grid function f at the position (x1, y1, z1) using
     * ORDN points per axis and store the result in f_int.
     *
     *   ex          grid extents {ex1, ex2, ex3}
     *   X, Y, Z     coordinate arrays of the grid
     *   f           field values, f(ex1,ex2,ex3) column-major
     *   f_int       output: interpolated value
     *   ORDN        number of interpolation points per axis
     *   SoA         three factors forwarded to the sample-gathering step
     *   symmetry    symmetry mode of the grid
     *
     * NOTE: declared with C linkage although f_int is a C++ reference, so the
     * declaration is only usable from C++ translation units.
     */
    void xh_global_interp(const int ex[3],
                    const double *X, const double *Y, const double *Z,
                    const double *f,                 /* f(ex1,ex2,ex3) column-major */
                    double &f_int,
                    double x1, double y1, double z1,
                    int ORDN,
                    const double SoA[3],
                    int symmetry);
 }
--- a/AMSS_NCKU_source/xh_kodiss.C
+++ b/AMSS_NCKU_source/xh_kodiss.C
@@ -0,0 +1,116 @@
 #include "xh_tool.h"
 /*
 * C port of the Fortran routine
 *   subroutine kodis(ex,X,Y,Z,f,f_rhs,SoA,Symmetry,eps)
 * which adds a dissipation term built from f to f_rhs.
 *
 * Parameters:
 *   ex          grid extents {ex1, ex2, ex3}
 *   X, Y, Z     coordinate arrays of length ex1, ex2, ex3; only the first two
 *               entries of each are read, to obtain the spacing
 *   f           input field, ex1*ex2*ex3 values laid out as idx_ex
 *   f_rhs       right-hand side (same layout), updated in place
 *   SoA         three factors handed to symmetry_bd when the ghost layers
 *               are filled
 *   Symmetry    symmetry mode; decides on which axes the stencil may reach
 *               into the ghost layers
 *   eps         dissipation strength
 *
 * At every point where three neighbours are available on both sides in all
 * three directions, f_rhs receives eps/64 times the sum over the axes of the
 * 7-point combination (1, -6, 15, -20, 15, -6, 1) of f divided by the spacing
 * of that axis.  All other points are left unchanged.
 *
 * The ghost-extended copy of f lives in a per-thread scratch buffer that is
 * grown on demand and intentionally kept between calls.  If the buffer cannot
 * be allocated the function returns without modifying f_rhs.
 */
 void kodis(const int ex[3],
           const double *X, const double *Y, const double *Z,
           const double *f, double *f_rhs,
           const double SoA[3],
           int Symmetry, double eps)
 {
    const double SIX = 6.0, FIT = 15.0, TWT = 20.0;
    const double cof = 64.0;             // 2^6
    const int NO_SYMM = 0, OCTANT = 2;
    const int ex1 = ex[0], ex2 = ex[1], ex3 = ex[2];
    // Fortran: dX = X(2)-X(1) -> C: X[1]-X[0]
    const double dX = X[1] - X[0];
    const double dY = Y[1] - Y[0];
    const double dZ = Z[1] - Z[0];
    // Fortran: imax=ex(1) etc. are 1-based upper bounds
    const int imaxF = ex1;
    const int jmaxF = ex2;
    const int kmaxF = ex3;
    // Fortran: imin=jmin=kmin=1, lowered to -2 in the symmetric cases below
    int iminF = 1, jminF = 1, kminF = 1;
    if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -2;
    if (Symmetry == OCTANT && fabs(X[0]) < dX) iminF = -2;
    if (Symmetry == OCTANT && fabs(Y[0]) < dY) jminF = -2;
    // fh holds (ex1+3)*(ex2+3)*(ex3+3) values, matching ord=3
    const size_t nx = (size_t)ex1 + 3;
    const size_t ny = (size_t)ex2 + 3;
    const size_t nz = (size_t)ex3 + 3;
    const size_t fh_size = nx * ny * nz;
    // Per-thread scratch buffer, reused across calls and only ever grown.
    static thread_local double *fh = NULL;
    static thread_local size_t cap = 0;
    if (fh_size > cap) {
        // aligned_alloc expects the byte count to be a multiple of the alignment.
        const size_t fh_bytes = ((fh_size * sizeof(double) + 63) / 64) * 64;
        free(fh);
        fh = (double*)aligned_alloc(64, fh_bytes);
        // Record the new capacity only on success so a later call retries
        // the allocation instead of silently reusing a NULL buffer.
        cap = fh ? fh_size : 0;
    }
    if (!fh) return;
    // Fortran: call symmetry_bd(3,ex,f,fh,SoA)
    symmetry_bd(3, ex, f, fh, SoA);
    // Loop-invariant prefactor of the dissipation term.
    const double scale = eps / cof;
    /*
     * Fortran loops:
     * do k=1,ex3
     * do j=1,ex2
     * do i=1,ex1
     *
     * C: k0=0..ex3-1, j0=0..ex2-1, i0=0..ex1-1
     * with the Fortran indices iF=i0+1, jF=j0+1, kF=k0+1.
     */
    for (int k0 = 0; k0 < ex3; ++k0) {
        const int kF = k0 + 1;
        for (int j0 = 0; j0 < ex2; ++j0) {
            const int jF = j0 + 1;
            for (int i0 = 0; i0 < ex1; ++i0) {
                const int iF = i0 + 1;
                // Fortran condition: i-3 >= imin .and. i+3 <= imax, likewise for j and k
                if ((iF - 3) >= iminF && (iF + 3) <= imaxF &&
                    (jF - 3) >= jminF && (jF + 3) <= jmaxF &&
                    (kF - 3) >= kminF && (kF + 3) <= kmaxF)
                {
                    const size_t p = idx_ex(i0, j0, k0, ex);
                    // The same symmetric 7-point combination along each axis.
                    const double Dx_term =
                        ( (fh[idx_fh_F(iF - 3, jF, kF, ex)] + fh[idx_fh_F(iF + 3, jF, kF, ex)]) -
                          SIX * (fh[idx_fh_F(iF - 2, jF, kF, ex)] + fh[idx_fh_F(iF + 2, jF, kF, ex)]) +
                          FIT * (fh[idx_fh_F(iF - 1, jF, kF, ex)] + fh[idx_fh_F(iF + 1, jF, kF, ex)]) -
                          TWT *  fh[idx_fh_F(iF    , jF, kF, ex)] ) / dX;
                    const double Dy_term =
                        ( (fh[idx_fh_F(iF, jF - 3, kF, ex)] + fh[idx_fh_F(iF, jF + 3, kF, ex)]) -
                          SIX * (fh[idx_fh_F(iF, jF - 2, kF, ex)] + fh[idx_fh_F(iF, jF + 2, kF, ex)]) +
                          FIT * (fh[idx_fh_F(iF, jF - 1, kF, ex)] + fh[idx_fh_F(iF, jF + 1, kF, ex)]) -
                          TWT *  fh[idx_fh_F(iF, jF    , kF, ex)] ) / dY;
                    const double Dz_term =
                        ( (fh[idx_fh_F(iF, jF, kF - 3, ex)] + fh[idx_fh_F(iF, jF, kF + 3, ex)]) -
                          SIX * (fh[idx_fh_F(iF, jF, kF - 2, ex)] + fh[idx_fh_F(iF, jF, kF + 2, ex)]) +
                          FIT * (fh[idx_fh_F(iF, jF, kF - 1, ex)] + fh[idx_fh_F(iF, jF, kF + 1, ex)]) -
                          TWT *  fh[idx_fh_F(iF, jF, kF    , ex)] ) / dZ;
                    // Fortran:
                    // f_rhs(i,j,k) = f_rhs(i,j,k) + eps/cof*(Dx_term + Dy_term + Dz_term)
                    f_rhs[p] += scale * (Dx_term + Dy_term + Dz_term);
                }
            }
        }
    }
    // fh is deliberately not freed: it is the per-thread cache declared above.
 }
--- a/AMSS_NCKU_source/xh_lopsided.C
+++ b/AMSS_NCKU_source/xh_lopsided.C
@@ -0,0 +1,262 @@
 #include "xh_tool.h"
 /*
  * lopsided: accumulate  Sfx*df/dx + Sfy*df/dy + Sfz*df/dz  into f_rhs using
  * fourth-order upwind-biased ("lopsided") first-derivative stencils.
  * C port of the Fortran subroutine of the same name.
  *
  * Conventions:
  *   ex[3]   = {ex1, ex2, ex3}, number of grid points per direction
  *   X, Y, Z = coordinate arrays; only element 0 and 1 are read (spacing and
  *             distance of the first point from the symmetry plane)
  *   f       = field on the original grid, ex1*ex2*ex3 values, first index fastest
  *   f_rhs   = right-hand side, same layout as f; updated in place
  *   Sfx/y/z = per-point speeds, same layout as f; their sign selects the
  *             upwind direction
  *   Symmetry: 0 = none, 1 = equatorial (z), 2 = octant (x, y and z)
  *   SoA[3]  = multipliers handed to symmetry_bd() for the mirrored ghost layers
  *
  * Internally f is copied into a padded buffer fh of (ex1+3)*(ex2+3)*(ex3+3)
  * values that corresponds to the Fortran array fh(-2:ex1, -2:ex2, -2:ex3).
  * The buffer is thread-local and reused between calls; it is never freed.
  * The last point in every direction (index ex) is not updated, exactly as in
  * the Fortran loops "do i = 1, ex(1)-1".
  */

 /*
  * Biased stencil, equal to 12*h times the first derivative at c[0]:
  *   -3 f(-1) - 10 f(0) + 18 f(+1) - 6 f(+2) + f(+3)
  * s is the element stride of the differentiated direction.  Calling it with
  * -s gives the mirrored stencil; the caller then subtracts instead of adds.
  */
 static inline double lopsided_biased(const double *c, ptrdiff_t s)
 {
    return -3.0 * c[-s] - 10.0 * c[0] + 18.0 * c[s] - 6.0 * c[2 * s] + c[3 * s];
 }

 /* Centred stencil, 12*h times the first derivative: f(-2) - 8 f(-1) + 8 f(+1) - f(+2). */
 static inline double lopsided_centered(const double *c, ptrdiff_t s)
 {
    return c[-2 * s] - 8.0 * c[-s] + 8.0 * c[s] - c[2 * s];
 }

 /*
  * Add the contribution of one direction to *rhs.
  *   c      : pointer to the current point inside fh
  *   s      : element stride of this direction inside fh
  *   speed  : advection speed at the point
  *   inv12h : 1 / (12 * grid spacing)
  *   nF     : Fortran (1-based) index of the point along this direction
  *   nminF / nmaxF : lowest / highest Fortran index that may be read
  * The stencil is shifted towards the side selected by the sign of speed and
  * falls back to a centred or oppositely biased stencil near the edges.
  */
 static inline void lopsided_axis(double *rhs, const double *c, ptrdiff_t s,
                                  double speed, double inv12h,
                                  int nF, int nminF, int nmaxF)
 {
    if (speed > 0.0) {
        if (nF + 3 <= nmaxF)
            *rhs += speed * inv12h * lopsided_biased(c, s);
        else if (nF + 2 <= nmaxF)
            *rhs += speed * inv12h * lopsided_centered(c, s);
        else if (nF + 1 <= nmaxF)
            *rhs -= speed * inv12h * lopsided_biased(c, -s);
    } else if (speed < 0.0) {
        if (nF - 3 >= nminF)
            *rhs -= speed * inv12h * lopsided_biased(c, -s);
        else if (nF - 2 >= nminF)
            *rhs += speed * inv12h * lopsided_centered(c, s);
        else if (nF - 1 >= nminF)
            *rhs += speed * inv12h * lopsided_biased(c, s);
    }
 }

 void lopsided(const int ex[3],
              const double *X, const double *Y, const double *Z,
              const double *f, double *f_rhs,
              const double *Sfx, const double *Sfy, const double *Sfz,
              int Symmetry, const double SoA[3])
 {
    const int NO_SYMM = 0, EQ_SYMM = 1;
    const int ex1 = ex[0], ex2 = ex[1], ex3 = ex[2];

    // Fortran: dX = X(2)-X(1); the grid is treated as uniformly spaced.
    const double dX = X[1] - X[0];
    const double dY = Y[1] - Y[0];
    const double dZ = Z[1] - Z[0];
    const double d12dx = 1.0 / 12.0 / dX;
    const double d12dy = 1.0 / 12.0 / dY;
    const double d12dz = 1.0 / 12.0 / dZ;

    // Lowest readable Fortran index per direction: 1 normally, -2 when the
    // block touches a symmetry plane and the mirrored ghost layers are valid.
    int iminF = 1, jminF = 1, kminF = 1;
    if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -2;
    if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -2;
    if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -2;

    // Padded work buffer, (ex1+3)*(ex2+3)*(ex3+3) doubles.
    const size_t nx = (size_t)ex1 + 3;
    const size_t ny = (size_t)ex2 + 3;
    const size_t nz = (size_t)ex3 + 3;
    const size_t fh_size = nx * ny * nz;
    static thread_local double *fh = NULL;
    static thread_local size_t cap = 0;
    if (fh_size > cap) {
        free(fh);
        // aligned_alloc wants a size that is a multiple of the alignment.
        const size_t bytes = fh_size * sizeof(double);
        fh = (double*)aligned_alloc(64, (bytes + 63) / 64 * 64);
        // Record the capacity only on success so that a later call retries
        // instead of returning forever with a NULL buffer.
        cap = fh ? fh_size : 0;
    }
    if (!fh) {
        fprintf(stderr, "lopsided: failed to allocate work buffer, f_rhs left unchanged\n");
        return;
    }

    // Fortran: call symmetry_bd(3,ex,f,fh,SoA)
    symmetry_bd(3, ex, f, fh, SoA);

    // Element strides of the three directions inside fh.
    const ptrdiff_t sx = 1;
    const ptrdiff_t sy = (ptrdiff_t)nx;
    const ptrdiff_t sz = (ptrdiff_t)(nx * ny);

    // Fortran: do k=1,ex(3)-1 / do j=1,ex(2)-1 / do i=1,ex(1)-1
    for (int k0 = 0; k0 <= ex3 - 2; ++k0) {
        const int kF = k0 + 1;
        for (int j0 = 0; j0 <= ex2 - 2; ++j0) {
            const int jF = j0 + 1;
            for (int i0 = 0; i0 <= ex1 - 2; ++i0) {
                const int iF = i0 + 1;
                const size_t p = idx_ex(i0, j0, k0, ex);
                const double *c = fh + idx_fh_F(iF, jF, kF, ex);
                lopsided_axis(&f_rhs[p], c, sx, Sfx[p], d12dx, iF, iminF, ex1);
                lopsided_axis(&f_rhs[p], c, sy, Sfy[p], d12dy, jF, jminF, ex2);
                lopsided_axis(&f_rhs[p], c, sz, Sfz[p], d12dz, kF, kminF, ex3);
            }
        }
    }
 }
--- a/AMSS_NCKU_source/xh_po.h
+++ b/AMSS_NCKU_source/xh_po.h
@@ -0,0 +1,19 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <math.h>
 #include <omp.h>
 /*
  * Gather the ordn^3 interpolation cube ya from the grid function f.
  * cxB/cxT are the lower/upper Fortran (1-based) indices of the cube and may
  * be <= 0; such indices are mirrored (i -> 2-i) and the value is multiplied
  * by the matching SoA entry.  Returns 0 on success, 1 if the requested range
  * does not fit into the grid described by ex.
  */
 int xh_decide3d(const int ex[3],
             const double *f,
             const double *fpi,   /* unused here; the Fortran original does not use it either */
             const int cxB[3],
             const int cxT[3],
             const double SoA[3],
             double *ya,
             int ordn,
             int Symmetry);
 /*
  * 1D polynomial interpolation through the ordn points (xa[i], ya[i]),
  * evaluated at x.  The value is stored in *y and the last correction added
  * to it in *dy.
  */
 void xh_polint(const double *xa, const double *ya, double x,
                   double *y, double *dy, int ordn);
 /*
  * 3D polynomial interpolation built from repeated xh_polint calls.
  * ya holds ordn*ordn*ordn values with the first index running fastest.
  * The result is returned through the reference y (C++ only) and *dy.
  */
 void xh_polin3(const double *x1a, const double *x2a, const double *x3a,
                   const double *ya, double x1, double x2, double x3,
                   double &y, double *dy, int ordn);
--- a/AMSS_NCKU_source/xh_polint3.C
+++ b/AMSS_NCKU_source/xh_polint3.C
@@ -0,0 +1,258 @@
 #include "xh_po.h"
 /*
  Argument conventions (index values are Fortran style, 1-based):
  ex[0..2]  == Fortran ex(1:3)
  cxB/cxT   == Fortran cxB(1:3), cxT(1:3)  (may be <= 0)
  SoA[0..2] == Fortran SoA(1:3)
  f, fpi    == Fortran f(ex1,ex2,ex3), column-major
  ya        == contiguous block of ordn^3 values that stands for the Fortran
               array ya(cxB1:cxT1, cxB2:cxT2, cxB3:cxT3); element (i,j,k) is
               stored at offset (i-cxB1) + ordn*((j-cxB2) + ordn*(k-cxB3)),
               i.e. column-major, ready to be passed to xh_polin3.
 */
 static inline int imax(int a, int b) { return (b < a) ? a : b; }
 static inline int imin(int a, int b) { return (b > a) ? a : b; }

 /*
  Fill ya with the values of f on the index box [cxB, cxT].
  Indices >= 1 are read directly; indices <= 0 are mirrored to 2-index and the
  value is multiplied by SoA of every mirrored direction (in x, y, z order).
  Returns 0 on success; prints a diagnostic and returns 1 when the direct or
  mirrored range leaves [1, ex] in some direction.
 */
 int xh_decide3d(const int ex[3],
             const double *f,
             const double *fpi,   /* unused here; the Fortran original does not use it either */
             const int cxB[3],
             const int cxT[3],
             const double SoA[3],
             double *ya,
             int ordn,
             int Symmetry)         /* not used directly inside decide3d */
 {
    (void)fpi;
    (void)Symmetry;
    const int n1 = ex[0], n2 = ex[1];
    /* fmin1..fmax1: directly readable part, fmin2..fmax2: mirrored part */
    int fmin1[3], fmin2[3], fmax1[3], fmax2[3];
    int out_of_range = 0;

    for (int d = 0; d < 3; ++d) {
        fmin1[d] = imax(1, cxB[d]);
        fmax1[d] = cxT[d];
        fmin2[d] = cxB[d];
        fmax2[d] = imin(0, cxT[d]);
        const int direct_used   = (fmin1[d] <= fmax1[d]);
        const int mirrored_used = (fmin2[d] <= fmax2[d]);
        if (direct_used && (fmin1[d] < 1 || fmax1[d] > ex[d]))
            out_of_range = 1;
        if (mirrored_used && (2 - fmax2[d] < 1 || 2 - fmin2[d] > ex[d]))
            out_of_range = 1;
    }

    if (out_of_range) {
        printf("error in decide3d\n");
        printf("cxB: %d %d %d   cxT: %d %d %d   ex: %d %d %d\n",
               cxB[0], cxB[1], cxB[2], cxT[0], cxT[1], cxT[2], ex[0], ex[1], ex[2]);
        printf("fmin1: %d %d %d  fmax1: %d %d %d\n",
               fmin1[0], fmin1[1], fmin1[2], fmax1[0], fmax1[1], fmax1[2]);
        printf("fmin2: %d %d %d  fmax2: %d %d %d\n",
               fmin2[0], fmin2[1], fmin2[2], fmax2[0], fmax2[1], fmax2[2]);
        return 1;
    }

    /* The direct and mirrored ranges are disjoint and together cover
       [cxB, cxT], so one sweep over the whole box visits every element once. */
    for (int k = cxB[2]; k <= cxT[2]; ++k) {
        const int ks = (k >= 1) ? k : 2 - k;
        for (int j = cxB[1]; j <= cxT[1]; ++j) {
            const int js = (j >= 1) ? j : 2 - j;
            for (int i = cxB[0]; i <= cxT[0]; ++i) {
                const int is = (i >= 1) ? i : 2 - i;
                double v = f[(is - 1) + n1 * ((js - 1) + n2 * (ks - 1))];
                if (i < 1) v *= SoA[0];
                if (j < 1) v *= SoA[1];
                if (k < 1) v *= SoA[2];
                ya[(i - cxB[0]) + ordn * ((j - cxB[1]) + ordn * (k - cxB[2]))] = v;
            }
        }
    }
    return 0;
 }
 /*
  * Polynomial interpolation through the ordn points (xa[i], ya[i]) evaluated
  * at x, using the c/d difference tableau of Neville's algorithm.
  *
  *   xa, ya : abscissae and ordinates, ordn values each
  *   x      : evaluation point
  *   y      : receives the interpolated value
  *   dy     : receives the last correction that was added to *y
  *   ordn   : number of points (polynomial degree + 1)
  *
  * Terminates the process with exit(1) when memory cannot be allocated or
  * when two abscissae coincide.
  */
 void xh_polint(const double *xa, const double *ya, double x,
                   double *y, double *dy, int ordn)
 {
    int i, m, ns, n_m;
    double dif, dift, hp, h, den_val;
    double *c  = (double*)malloc((size_t)ordn * sizeof(double));
    double *d  = (double*)malloc((size_t)ordn * sizeof(double));
    double *ho = (double*)malloc((size_t)ordn * sizeof(double));
    if (!c || !d || !ho) {
        fprintf(stderr, "polint: malloc failed\n");
        exit(1);
    }
    for (i = 0; i < ordn; i++) {
        c[i]  = ya[i];
        d[i]  = ya[i];
        ho[i] = xa[i] - x;
    }
    // Locate the tabulated point closest to x (0-based index).
    ns  = 0;
    dif = fabs(x - xa[0]);
    for (i = 1; i < ordn; i++) {
        dift = fabs(x - xa[i]);
        if (dift < dif) {
            ns  = i;
            dif = dift;
        }
    }
    *y  = ya[ns];
    ns -= 1;                      // ns is now (Fortran ns) - 1 and may be -1
    for (m = 1; m <= ordn - 1; m++) {
        n_m = ordn - m;           // number of valid c/d entries after this round
        for (i = 0; i < n_m; i++) {
            hp      = ho[i];
            h       = ho[i + m];
            den_val = hp - h;
            if (den_val == 0.0) {
                fprintf(stderr, "failure in polint for point %g\n", x);
                fprintf(stderr, "with input points xa: ");
                for (int t = 0; t < ordn; t++) fprintf(stderr, "%g ", xa[t]);
                fprintf(stderr, "\n");
                exit(1);
            }
            den_val = (c[i + 1] - d[i]) / den_val;
            d[i]    = h  * den_val;
            c[i]    = hp * den_val;
        }
        // Fortran (1-based ns): if (2*ns < n-m) dy=c(ns+1) else dy=d(ns); ns=ns-1
        // Here ns is one less than the Fortran value, so the test must use
        // ns+1.  Testing 2*ns instead picks c[ns+1] beyond the n_m entries
        // refreshed in this round and adds a stale value to *y.
        if (2 * (ns + 1) < n_m) {
            *dy = c[ns + 1];
        } else {
            *dy = d[ns];
            ns -= 1;
        }
        *y += *dy;
    }
    free(c);
    free(d);
    free(ho);
 }
 /*
  * 3D polynomial interpolation by successive 1D interpolations.
  * ya holds ordn*ordn*ordn values, first index fastest (Fortran layout).
  * Stage 1 interpolates every x1-line of ya at x1, stage 2 interpolates the
  * resulting ordn x ordn plane along x2 at x2, and stage 3 interpolates the
  * remaining line along x3 at x3.  The value is returned through y and the
  * correction of the final stage through *dy.
  */
 void xh_polin3(const double *x1a, const double *x2a, const double *x3a,
                   const double *ya, double x1, double x2, double x3,
                   double &y, double *dy, int ordn)
 {
    const size_t n = (size_t)ordn;
    // plane[j + ordn*k] plays the role of the Fortran yatmp(j,k); line[k] of ymtmp(k).
    double *plane = (double*)malloc(n * n * sizeof(double));
    double *line  = (double*)malloc(n * sizeof(double));
    if (plane == NULL || line == NULL) {
        fprintf(stderr, "polin3: malloc failed\n");
        exit(1);
    }
    double stage_err;   // corrections of the first two stages are discarded

    // Stage 1: the x1-line for (j,k) starts at ya[ordn*(j + ordn*k)].
    for (int kk = 0; kk < ordn; ++kk) {
        for (int jj = 0; jj < ordn; ++jj) {
            const int line_no = jj + ordn * kk;
            xh_polint(x1a, ya + ordn * line_no, x1, plane + line_no, &stage_err, ordn);
        }
    }
    // Stage 2: column k of the plane is contiguous and starts at plane[ordn*k].
    for (int kk = 0; kk < ordn; ++kk) {
        xh_polint(x2a, plane + ordn * kk, x2, line + kk, &stage_err, ordn);
    }
    // Stage 3: final interpolation along x3.
    xh_polint(x3a, line, x3, &y, dy, ordn);

    free(plane);
    free(line);
 }
--- a/AMSS_NCKU_source/xh_share_func.h
+++ b/AMSS_NCKU_source/xh_share_func.h
@@ -0,0 +1,338 @@
 #ifndef SHARE_FUNC_H
 #define SHARE_FUNC_H
 #include <stdlib.h>
 #include <stddef.h>
 #include <math.h>
 #include <stdio.h>
 #include <omp.h>
 /* Main grid: flatten a 0-based (i0, j0, k0) triple into a 1D offset, first index fastest. */
 static inline size_t idx_ex(int i0, int j0, int k0, const int ex[3]) {
    const size_t n1 = (size_t)ex[0];
    const size_t n2 = (size_t)ex[1];
    return (size_t)i0 + n1 * ((size_t)j0 + n2 * (size_t)k0);
 }
 /*
  * Offset into a padded array that stands for Fortran fh(-1:ex1, -1:ex2, -1:ex3)
  * (ord = 2, i.e. every index is shifted by 1).
  * iF/jF/kF are Fortran indices and may be -1, 0, 1..ex.
  */
 static inline size_t idx_fh_F_ord2(int iF, int jF, int kF, const int ex[3]) {
    const size_t row   = (size_t)(ex[0] + 2);   /* padded length in x: ex1 + ord */
    const size_t plane = (size_t)(ex[1] + 2);   /* padded length in y: ex2 + ord */
    const size_t a = (size_t)(iF + 1);          /* 0..ex1+1 */
    const size_t b = (size_t)(jF + 1);          /* 0..ex2+1 */
    const size_t c = (size_t)(kF + 1);          /* 0..ex3+1 */
    return a + row * (b + plane * c);
 }
 /*
  * Offset into a padded array that stands for Fortran fh(-2:ex1, -2:ex2, -2:ex3)
  * (ord = 3, i.e. every index is shifted by 2).
  * iF/jF/kF are Fortran indices and may be negative.
  */
 static inline size_t idx_fh_F(int iF, int jF, int kF, const int ex[3]) {
    const size_t row   = (size_t)(ex[0] + 3);   /* padded length in x: ex1 + ord */
    const size_t plane = (size_t)(ex[1] + 3);   /* padded length in y: ex2 + ord */
    const size_t a = (size_t)(iF + 2);          /* 0..ex1+2 */
    const size_t b = (size_t)(jF + 2);          /* 0..ex2+2 */
    const size_t c = (size_t)(kF + 2);          /* 0..ex3+2 */
    return a + row * (b + plane * c);
 }
 /*
  * func:  Fortran (1..extc1, 1..extc2, 1..extc3), 1-based
  * funcc: Fortran (-ord+1..extc1, -ord+1..extc2, -ord+1..extc3)
  *
  * On the C side:
  *   func  is addressed 0-based: i0=0..extc1-1, j0=0..extc2-1, k0=0..extc3-1
  *   funcc is a flat array addressed with shifted indices:
  *     iF in [-ord+1..extc1]  ->  ii = iF + (ord-1)  in [0..extc1+ord-1]
  *     row length nx = extc1 + ord, likewise ny = extc2 + ord, nz = extc3 + ord
  */
 static inline size_t idx_func0(int i0, int j0, int k0, const int extc[3]) {
    const size_t row   = (size_t)extc[0];
    const size_t plane = (size_t)extc[1];
    return (size_t)i0 + row * ((size_t)j0 + plane * (size_t)k0);
 }
 /* Offset of Fortran element funcc(iF, jF, kF) in the padded flat array; every index is shifted by ord-1. */
 static inline size_t idx_funcc_F(int iF, int jF, int kF, int ord, const int extc[3]) {
    const int pad = ord - 1;                        /* valid iF range: -pad .. extc1 */
    const size_t row   = (size_t)(extc[0] + ord);   /* extc1 + ord entries per x-line */
    const size_t plane = (size_t)(extc[1] + ord);
    const size_t a = (size_t)(iF + pad);            /* 0..extc1+pad */
    const size_t b = (size_t)(jF + pad);            /* 0..extc2+pad */
    const size_t c = (size_t)(kF + pad);            /* 0..extc3+pad */
    return a + row * (b + plane * c);
 }
 /*
  * Copy func into the padded array funcc and fill the ord ghost layers on the
  * low side of every direction by mirroring.  Equivalent to the Fortran code
  *
  *   funcc(1:extc1,1:extc2,1:extc3) = func
  *   do i=0,ord-1
  *     funcc(-i,1:extc2,1:extc3) = funcc(i+1,1:extc2,1:extc3)*SoA(1)
  *   enddo
  *   do i=0,ord-1
  *     funcc(:,-i,1:extc3) = funcc(:,i+1,1:extc3)*SoA(2)
  *   enddo
  *   do i=0,ord-1
  *     funcc(:,:,-i) = funcc(:,:,i+1)*SoA(3)
  *   enddo
  *
  * The three mirror passes must run in this order: the y pass reads the x
  * ghosts and the z pass reads both.
  */
 static inline void symmetry_bd(int ord,
                 const int extc[3],
                 const double *func,
                 double *funcc,
                 const double SoA[3])
 {
    const int n1 = extc[0], n2 = extc[1], n3 = extc[2];
    const int lowF = 1 - ord;   /* lowest Fortran index present in funcc */

    /* Interior: Fortran index iF corresponds to the 0-based func index iF-1. */
    for (int kF = 1; kF <= n3; ++kF) {
        for (int jF = 1; jF <= n2; ++jF) {
            for (int iF = 1; iF <= n1; ++iF) {
                funcc[idx_funcc_F(iF, jF, kF, ord, extc)] =
                    func[idx_func0(iF - 1, jF - 1, kF - 1, extc)];
            }
        }
    }

    /* x ghosts: layer -m mirrors layer m+1, interior j and k only. */
    for (int m = 0; m < ord; ++m) {
        for (int kF = 1; kF <= n3; ++kF) {
            for (int jF = 1; jF <= n2; ++jF) {
                const double mirrored = funcc[idx_funcc_F(m + 1, jF, kF, ord, extc)];
                funcc[idx_funcc_F(-m, jF, kF, ord, extc)] = mirrored * SoA[0];
            }
        }
    }

    /* y ghosts: the Fortran ":" covers every iF, x ghosts included. */
    for (int m = 0; m < ord; ++m) {
        for (int kF = 1; kF <= n3; ++kF) {
            for (int iF = lowF; iF <= n1; ++iF) {
                const double mirrored = funcc[idx_funcc_F(iF, m + 1, kF, ord, extc)];
                funcc[idx_funcc_F(iF, -m, kF, ord, extc)] = mirrored * SoA[1];
            }
        }
    }

    /* z ghosts: every iF and jF, ghosts included. */
    for (int m = 0; m < ord; ++m) {
        for (int jF = lowF; jF <= n2; ++jF) {
            for (int iF = lowF; iF <= n1; ++iF) {
                const double mirrored = funcc[idx_funcc_F(iF, jF, m + 1, ord, extc)];
                funcc[idx_funcc_F(iF, jF, -m, ord, extc)] = mirrored * SoA[2];
            }
        }
    }
 }
 /*
  * Point-wise second-derivative kernel.  It relies on idx_ex / idx_fh_F_ord2
  * and on fh being laid out as Fortran fh(-1:ex1, -1:ex2, -1:ex3).
  * It lives inside the SHARE_FUNC_H include guard so that including this
  * header more than once in a translation unit does not redefine it.
  */

 /* 4th-order 1D second-derivative stencil: -f(-2) + 16 f(-1) - 30 f(0) - f(+2) + 16 f(+1). */
 static inline double fdd_xh_d2_4th(const double *c, ptrdiff_t s)
 {
    return -c[-2 * s] + 16.0 * c[-s] - 30.0 * c[0] - c[2 * s] + 16.0 * c[s];
 }

 /* 4th-order 1D first-derivative stencil: f(-2) - 8 f(-1) + 8 f(+1) - f(+2). */
 static inline double fdd_xh_d1_4th(const double *c, ptrdiff_t s)
 {
    return c[-2 * s] - 8.0 * c[-s] + 8.0 * c[s] - c[2 * s];
 }

 /* 4th-order mixed stencil: the first-derivative stencil along s applied to
    the rows -2, -1, +1, +2 along t, combined with the same weights. */
 static inline double fdd_xh_mixed_4th(const double *c, ptrdiff_t s, ptrdiff_t t)
 {
    const double row_m2 = fdd_xh_d1_4th(c - 2 * t, s);
    const double row_m1 = fdd_xh_d1_4th(c - t, s);
    const double row_p1 = fdd_xh_d1_4th(c + t, s);
    const double row_p2 = fdd_xh_d1_4th(c + 2 * t, s);
    return row_m2 - 8.0 * row_m1 + 8.0 * row_p1 - row_p2;
 }

 /* 2nd-order 1D second-derivative stencil: f(-1) - 2 f(0) + f(+1). */
 static inline double fdd_xh_d2_2nd(const double *c, ptrdiff_t s)
 {
    return c[-s] - 2.0 * c[0] + c[s];
 }

 /* 2nd-order mixed stencil: f(-1,-1) - f(+1,-1) - f(-1,+1) + f(+1,+1). */
 static inline double fdd_xh_mixed_2nd(const double *c, ptrdiff_t s, ptrdiff_t t)
 {
    return c[-s - t] - c[s - t] - c[-s + t] + c[s + t];
 }

 /*
  * Compute the six second derivatives of the padded field fh at the 0-based
  * grid point (i0, j0, k0) and store them at the matching offset of
  * fxx, fxy, fxz, fyy, fyz, fzz.
  *
  *   imin/imax etc. : lowest / highest Fortran index that may be read per direction
  *   Fd*            : prefactors applied to the 4th-order stencils
  *   Sd*            : prefactors applied to the 2nd-order stencils
  *
  * The 4th-order stencils are used when two neighbours are available on both
  * sides in all three directions, the 2nd-order ones when one neighbour is
  * available on both sides in all three directions; otherwise all six
  * outputs are set to zero.
  */
 static inline void fdderivs_xh(
    int i0, int j0, int k0,
    const int ex[3],
    const double *fh,
    int iminF, int jminF, int kminF,
    int imaxF, int jmaxF, int kmaxF,
    double Fdxdx, double Fdydy, double Fdzdz,
    double Fdxdy, double Fdxdz, double Fdydz,
    double Sdxdx, double Sdydy, double Sdzdz,
    double Sdxdy, double Sdxdz, double Sdydz,
    double *fxx, double *fxy, double *fxz,
    double *fyy, double *fyz, double *fzz
 ){
    const int iF = i0 + 1;
    const int jF = j0 + 1;
    const int kF = k0 + 1;
    const size_t p = idx_ex(i0, j0, k0, ex);

    /* Centre of the stencil and element strides inside the padded array. */
    const double *c = fh + idx_fh_F_ord2(iF, jF, kF, ex);
    const ptrdiff_t sx = 1;
    const ptrdiff_t sy = (ptrdiff_t)ex[0] + 2;
    const ptrdiff_t sz = sy * ((ptrdiff_t)ex[1] + 2);

    /* 4th-order branch: i+-2, j+-2, k+-2 are all inside the readable range */
    if ((iF + 2) <= imaxF && (iF - 2) >= iminF &&
        (jF + 2) <= jmaxF && (jF - 2) >= jminF &&
        (kF + 2) <= kmaxF && (kF - 2) >= kminF)
    {
        fxx[p] = Fdxdx * fdd_xh_d2_4th(c, sx);
        fyy[p] = Fdydy * fdd_xh_d2_4th(c, sy);
        fzz[p] = Fdzdz * fdd_xh_d2_4th(c, sz);
        fxy[p] = Fdxdy * fdd_xh_mixed_4th(c, sx, sy);
        fxz[p] = Fdxdz * fdd_xh_mixed_4th(c, sx, sz);
        fyz[p] = Fdydz * fdd_xh_mixed_4th(c, sy, sz);
    }
    /* 2nd-order branch: i+-1, j+-1, k+-1 are all inside the readable range */
    else if ((iF + 1) <= imaxF && (iF - 1) >= iminF &&
             (jF + 1) <= jmaxF && (jF - 1) >= jminF &&
             (kF + 1) <= kmaxF && (kF - 1) >= kminF)
    {
        fxx[p] = Sdxdx * fdd_xh_d2_2nd(c, sx);
        fyy[p] = Sdydy * fdd_xh_d2_2nd(c, sy);
        fzz[p] = Sdzdz * fdd_xh_d2_2nd(c, sz);
        fxy[p] = Sdxdy * fdd_xh_mixed_2nd(c, sx, sy);
        fxz[p] = Sdxdz * fdd_xh_mixed_2nd(c, sx, sz);
        fyz[p] = Sdydz * fdd_xh_mixed_2nd(c, sy, sz);
    }
    else {
        fxx[p] = 0.0; fyy[p] = 0.0; fzz[p] = 0.0;
        fxy[p] = 0.0; fxz[p] = 0.0; fyz[p] = 0.0;
    }
 }
 #endif /* SHARE_FUNC_H */
--- a/AMSS_NCKU_source/xh_tool.h
+++ b/AMSS_NCKU_source/xh_tool.h
@@ -0,0 +1,27 @@
 #include "xh_share_func.h"
 /*
  * fdderivs -- prototype only; the definition is not in this header.
  *
  * Read-only inputs (const): ex[3], the field f, and the arrays X, Y, Z.
  * Writable outputs: the six arrays fxx, fxy, fxz, fyy, fyz, fzz.
  * SYM1, SYM2, SYM3, Symmetry and onoff are passed by value.
  *
  * NOTE(review): the output names suggest the six distinct second partial
  * derivatives of f, with ex[] the grid extents and X/Y/Z the coordinate
  * axes -- confirm against the implementation.
  */
 void fdderivs(const int ex[3],
              const double *f,
              double *fxx, double *fxy, double *fxz,
              double *fyy, double *fyz, double *fzz,
              const double *X, const double *Y, const double *Z,
              double SYM1, double SYM2, double SYM3,
              int Symmetry, int onoff);
 /*
  * fderivs -- prototype only; the definition is not in this header.
  *
  * Read-only inputs (const): ex[3], the field f, and the arrays X, Y, Z.
  * Writable outputs: the three arrays fx, fy, fz.
  * SYM1, SYM2, SYM3, Symmetry and onoff are passed by value.
  *
  * NOTE(review): the output names suggest the three first partial
  * derivatives of f -- confirm against the implementation.
  */
 void fderivs(const int ex[3],
             const double *f,
             double *fx, double *fy, double *fz,
             const double *X, const double *Y, const double *Z,
             double SYM1, double SYM2, double SYM3,
             int Symmetry, int onoff);
 /*
  * kodis -- prototype only; the definition is not in this header.
  *
  * Read-only inputs (const): ex[3], the arrays X, Y, Z, the field f and
  * the three-element array SoA.  f_rhs is the only writable array.
  * Symmetry and eps are passed by value.
  *
  * NOTE(review): presumably a Kreiss-Oliger dissipation term of strength
  * eps applied to f_rhs -- confirm against the implementation, including
  * whether f_rhs is accumulated into or overwritten.
  */
 void kodis(const int ex[3],
           const double *X, const double *Y, const double *Z,
           const double *f, double *f_rhs,
           const double SoA[3],
           int Symmetry, double eps);
 /*
  * lopsided -- prototype only; the definition is not in this header.
  *
  * Read-only inputs (const): ex[3], the arrays X, Y, Z, the field f, the
  * arrays Sfx, Sfy, Sfz and the three-element array SoA.  f_rhs is the
  * only writable array.  Symmetry is passed by value.
  *
  * NOTE(review): presumably a lopsided (upwind-biased) advection stencil
  * of f along the vector (Sfx, Sfy, Sfz) -- confirm against the
  * implementation.  Note that SoA and Symmetry appear in the opposite
  * order from kodis().
  */
 void lopsided(const int ex[3],
              const double *X, const double *Y, const double *Z,
              const double *f, double *f_rhs,
              const double *Sfx, const double *Sfy, const double *Sfz,
              int Symmetry, const double SoA[3]);
--- a/makefile_and_run.py
+++ b/makefile_and_run.py
@@ -16,18 +16,18 @@ import time
 ## This forces make and all compiler processes to use only nohz_full cores (4-55, 60-111)
 ## Format: taskset -c 4-55,60-111 ensures processes only run on these cores
 #NUMACTL_CPU_BIND = "taskset -c 0-111"
-NUMACTL_CPU_BIND = "taskset -c 16-47,64-95"
+NUMACTL_CPU_BIND = "taskset -c 0-47"
-
+NUMACTL_CPU_BIND2 = "OMP_NUM_THREADS=48 OMP_PROC_BIND=close OMP_PLACES={0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47} taskset -c 0-47"
 #NUMACTL_CPU_BIND2 = "taskset -c 0-1"
 ## Build parallelism configuration
 ## Use nohz_full cores (4-55, 60-111) for compilation: 52 + 52 = 104 cores
 ## Set make -j to utilize available cores for faster builds
-BUILD_JOBS = 96
+BUILD_JOBS = 32
 ##################################################################
 ##################################################################
 ## Compile the AMSS-NCKU main program ABE
@@ -117,11 +117,12 @@ def run_ABE():
    ## Define the command to run; cast other values to strings as needed
    if (input_data.GPU_Calculation == "no"):
-        mpi_command         = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABE"
+        #mpi_command         = NUMACTL_CPU_BIND2 + " mpirun -np " + str(input_data.MPI_processes) + " ./ABE"
        #mpi_command         = " mpirun -np " + str(input_data.MPI_processes) + " ./ABE"
        mpi_command = """ OMP_NUM_THREADS=48 OMP_PROC_BIND=close OMP_PLACES=cores mpirun -np 1 --cpu-bind=sockets  ./ABE """
        mpi_command_outfile = "ABE_out.log"
    elif (input_data.GPU_Calculation == "yes"):
-        mpi_command         = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABEGPU"
+        mpi_command         = NUMACTL_CPU_BIND2 + " mpirun -np " + str(input_data.MPI_processes) + " ./ABEGPU"
        mpi_command_outfile = "ABEGPU_out.log"
    ## Execute the MPI command and stream output
--- a/parallel_plot_helper.py
+++ b/parallel_plot_helper.py
@@ -1,29 +0,0 @@
 import multiprocessing
 def run_plot_task(task):
    """Run one plotting job described by a ``(function, args_tuple)`` pair.

    Parameters
    ----------
    task : tuple
        Two-element tuple: a callable plotting function followed by the
        tuple of positional arguments to call it with.

    Returns
    -------
    object
        Whatever the plotting function returns.
    """
    plot_func, plot_args = task
    return plot_func(*plot_args)
 def run_plot_tasks_parallel(plot_tasks):
    """Execute a list of independent plotting tasks in parallel.

    Uses the 'fork' context to create worker processes so that the main
    script is NOT re-imported/re-executed in child processes.

    An empty task list returns immediately without forking anything, and
    the pool is never larger than the number of tasks.

    Parameters
    ----------
    plot_tasks : list of tuples
        Each element is (function, args_tuple).

    Returns
    -------
    None
        Results of the individual plotting functions are discarded.
    """
    import os

    ## Materialize once so the tasks can be counted (also accepts iterators).
    tasks = list(plot_tasks)
    if not tasks:
        ## Nothing to plot: do not pay for forking a worker pool.
        return

    ## Same CPU count that Pool() would pick by default on this Python version.
    cpu_limit = getattr(os, "process_cpu_count", os.cpu_count)() or 1
    ## Forking more workers than tasks only wastes memory and startup time.
    worker_count = min(len(tasks), cpu_limit)

    ctx = multiprocessing.get_context('fork')
    with ctx.Pool(processes=worker_count) as pool:
        pool.map(run_plot_task, tasks)
--- a/pgo_profile/default.profdata
+++ b/pgo_profile/default.profdata
--- a/pgo_profile/default.profdata.backup
+++ b/pgo_profile/default.profdata.backup
--- a/pgo_profile/default_15874826282416242821_0_58277.profraw
+++ b/pgo_profile/default_15874826282416242821_0_58277.profraw
--- a/plot_GW_strain_amplitude_xiaoqu.py
+++ b/plot_GW_strain_amplitude_xiaoqu.py
@@ -11,8 +11,6 @@
 import numpy                               ## numpy for array operations
 import scipy                               ## scipy for interpolation and signal processing
 import math
 import matplotlib
 matplotlib.use('Agg')                      ## use non-interactive backend for multiprocessing safety
 import matplotlib.pyplot    as     plt     ## matplotlib for plotting
 import os                                  ## os for system/file operations
--- a/plot_binary_data.py
+++ b/plot_binary_data.py
@@ -8,23 +8,16 @@
 ##
 #################################################
 ## Restrict OpenMP to one thread per process so that running
 ## many workers in parallel does not create an O(workers * BLAS_threads)
 ## thread explosion.  The variable MUST be set before numpy/scipy
 ## are imported, because the BLAS library reads them only at load time.
 import os
 os.environ.setdefault("OMP_NUM_THREADS",        "1")
 import numpy
 import scipy
 import matplotlib
 matplotlib.use('Agg')                      ## use non-interactive backend for multiprocessing safety
 import matplotlib.pyplot    as     plt
 from   matplotlib.colors    import LogNorm
 from   mpl_toolkits.mplot3d import Axes3D
 ## import torch
 import AMSS_NCKU_Input      as input_data
 import os
 #########################################################################################
@@ -199,19 +192,3 @@ def get_data_xy( Rmin, Rmax, n, data0, time, figure_title, figure_outdir ):
 ####################################################################################
 ####################################################################################
 ## Allow this module to be run as a standalone script so that each
 ## binary-data plot can be executed in a fresh subprocess whose BLAS
 ## environment variables (set above) take effect before numpy loads.
 ##
 ## Usage:  python3 plot_binary_data.py <filename> <binary_outdir> <figure_outdir>
 ####################################################################################
 if __name__ == '__main__':
    ## Command-line entry point: expects exactly three positional arguments
    ## (<filename> <binary_outdir> <figure_outdir>) after the script name.
    import sys
    cli_args = sys.argv[1:]
    if len(cli_args) != 3:
        ## Wrong argument count: show the usage line and exit with status 1.
        print(f"Usage: {sys.argv[0]} <filename> <binary_outdir> <figure_outdir>")
        sys.exit(1)
    plot_binary_data(*cli_args)
--- a/plot_xiaoqu.py
+++ b/plot_xiaoqu.py
@@ -8,8 +8,6 @@
 #################################################
 import numpy                               ## numpy for array operations
 import matplotlib
 matplotlib.use('Agg')                      ## use non-interactive backend for multiprocessing safety
 import matplotlib.pyplot    as     plt     ## matplotlib for plotting
 from   mpl_toolkits.mplot3d import Axes3D  ## needed for 3D plots
 import glob
@@ -17,9 +15,6 @@ import os                                  ## operating system utilities
 import plot_binary_data
 import AMSS_NCKU_Input as input_data
 import subprocess
 import sys
 import multiprocessing
 # plt.rcParams['text.usetex'] = True  ## enable LaTeX fonts in plots
@@ -55,40 +50,10 @@ def generate_binary_data_plot( binary_outdir, figure_outdir ):
        file_list.append(x)
        print(x)
-    ## Plot each file in parallel using subprocesses.
+    ## Plot each file in the list
    ## Each subprocess is a fresh Python process where the BLAS thread-count
    ## environment variables (set at the top of plot_binary_data.py) take
    ## effect before numpy is imported.  This avoids the thread explosion
    ## that occurs when multiprocessing.Pool with 'fork' context inherits
    ## already-initialized multi-threaded BLAS from the parent.
    script = os.path.join( os.path.dirname(__file__), "plot_binary_data.py" )
    max_workers = min( multiprocessing.cpu_count(), len(file_list) ) if file_list else 0
    running = []
    failed  = []
    for filename in file_list:
        print(filename)
-        proc = subprocess.Popen(
+        plot_binary_data.plot_binary_data(filename, binary_outdir, figure_outdir)
            [sys.executable, script, filename, binary_outdir, figure_outdir],
        )
        running.append( (proc, filename) )
        ## Keep at most max_workers subprocesses active at a time
        if len(running) >= max_workers:
            p, fn = running.pop(0)
            p.wait()
            if p.returncode != 0:
                failed.append(fn)
    ## Wait for all remaining subprocesses to finish
    for p, fn in running:
        p.wait()
        if p.returncode != 0:
            failed.append(fn)
    if failed:
        print( " WARNING: the following binary data plots failed:" )
        for fn in failed:
            print( "   ", fn )
    print(                        )
    print( " Binary Data Plot Has been Finished " )