diff --git a/AMSS_NCKU_source/MPatch.C b/AMSS_NCKU_source/MPatch.C
index 54652a0..91ead8a 100644
--- a/AMSS_NCKU_source/MPatch.C
+++ b/AMSS_NCKU_source/MPatch.C
@@ -499,6 +499,272 @@ void Patch::Interp_Points(MyList<var> *VarList,
 
   delete[] owner_rank;
 }
+void Patch::Interp_Points(MyList<var> *VarList,
+                          int NN, double **XX,
+                          double *Shellf, int Symmetry,
+                          int Nmin_consumer, int Nmax_consumer)
+{
+  // Targeted point-to-point overload: each owner sends each point only to
+  // the one rank that needs it for integration (consumer), reducing
+  // communication volume by ~nprocs times compared to the Bcast version.
+  int myrank, nprocs;
+  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+  MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+
+  int ordn = 2 * ghost_width;
+  MyList<var> *varl;
+  int num_var = 0;
+  varl = VarList;
+  while (varl)
+  {
+    num_var++;
+    varl = varl->next;
+  }
+
+  memset(Shellf, 0, sizeof(double) * NN * num_var);
+
+  // owner_rank[j] records which MPI rank owns point j
+  int *owner_rank;
+  owner_rank = new int[NN];
+  for (int j = 0; j < NN; j++)
+    owner_rank[j] = -1;
+
+  double DH[dim], llb[dim], uub[dim];
+  for (int i = 0; i < dim; i++)
+    DH[i] = getdX(i);
+
+  // --- Interpolation phase (identical to original) ---
+  for (int j = 0; j < NN; j++)
+  {
+    double pox[dim];
+    for (int i = 0; i < dim; i++)
+    {
+      pox[i] = XX[i][j];
+      if (myrank == 0 && (XX[i][j] < bbox[i] + lli[i] * DH[i] || XX[i][j] > bbox[dim + i] - uui[i] * DH[i]))
+      {
+        cout << "Patch::Interp_Points: point (";
+        for (int k = 0; k < dim; k++)
+        {
+          cout << XX[k][j];
+          if (k < dim - 1)
+            cout << ",";
+          else
+            cout << ") is out of current Patch." << endl;
+        }
+        MPI_Abort(MPI_COMM_WORLD, 1);
+      }
+    }
+
+    MyList<Block> *Bp = blb;
+    bool notfind = true;
+    while (notfind && Bp)
+    {
+      Block *BP = Bp->data;
+
+      bool flag = true;
+      for (int i = 0; i < dim; i++)
+      {
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+        llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + (ghost_width - 0.5) * DH[i];
+        uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - (ghost_width - 0.5) * DH[i];
+#else
+#ifdef Cell
+        llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + ghost_width * DH[i];
+        uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - ghost_width * DH[i];
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+        if (XX[i][j] - llb[i] < -DH[i] / 2 || XX[i][j] - uub[i] > DH[i] / 2)
+        {
+          flag = false;
+          break;
+        }
+      }
+
+      if (flag)
+      {
+        notfind = false;
+        owner_rank[j] = BP->rank;
+        if (myrank == BP->rank)
+        {
+          varl = VarList;
+          int k = 0;
+          while (varl)
+          {
+            f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
+                            pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
+            varl = varl->next;
+            k++;
+          }
+        }
+      }
+      if (Bp == ble)
+        break;
+      Bp = Bp->next;
+    }
+  }
+
+  // --- Error check for unfound points ---
+  for (int j = 0; j < NN; j++)
+  {
+    if (owner_rank[j] < 0 && myrank == 0)
+    {
+      cout << "ERROR: Patch::Interp_Points fails to find point (";
+      for (int d = 0; d < dim; d++)
+      {
+        cout << XX[d][j];
+        if (d < dim - 1)
+          cout << ",";
+        else
+          cout << ")";
+      }
+      cout << " on Patch (";
+      for (int d = 0; d < dim; d++)
+      {
+        cout << bbox[d] << "+" << lli[d] * DH[d];
+        if (d < dim - 1)
+          cout << ",";
+        else
+          cout << ")--";
+      }
+      cout << "(";
+      for (int d = 0; d < dim; d++)
+      {
+        cout << bbox[dim + d] << "-" << uui[d] * DH[d];
+        if (d < dim - 1)
+          cout << ",";
+        else
+          cout << ")" << endl;
+      }
+      MPI_Abort(MPI_COMM_WORLD, 1);
+    }
+  }
+
+  // --- Targeted point-to-point communication phase ---
+  // Compute consumer_rank[j] using the same deterministic formula as surface_integral
+  int *consumer_rank = new int[NN];
+  {
+    int mp = NN / nprocs;
+    int Lp = NN - nprocs * mp;
+    for (int j = 0; j < NN; j++)
+    {
+      if (j < Lp * (mp + 1))
+        consumer_rank[j] = j / (mp + 1);
+      else
+        consumer_rank[j] = Lp + (j - Lp * (mp + 1)) / mp;
+    }
+  }
+
+  // Count sends and recvs per rank
+  int *send_count = new int[nprocs];
+  int *recv_count = new int[nprocs];
+  memset(send_count, 0, sizeof(int) * nprocs);
+  memset(recv_count, 0, sizeof(int) * nprocs);
+
+  for (int j = 0; j < NN; j++)
+  {
+    int own = owner_rank[j];
+    int con = consumer_rank[j];
+    if (own == con)
+      continue; // local — no communication needed
+    if (own == myrank)
+      send_count[con]++;
+    if (con == myrank)
+      recv_count[own]++;
+  }
+
+  // Build send buffers: for each destination rank, pack (index, data) pairs
+  // Each entry: 1 int (point index j) + num_var doubles
+  int total_send = 0, total_recv = 0;
+  int *send_offset = new int[nprocs];
+  int *recv_offset = new int[nprocs];
+  for (int r = 0; r < nprocs; r++)
+  {
+    send_offset[r] = total_send;
+    total_send += send_count[r];
+    recv_offset[r] = total_recv;
+    total_recv += recv_count[r];
+  }
+
+  // Pack send buffers: each message contains (j, data[0..num_var-1]) per point
+  int stride = 1 + num_var; // 1 double for index + num_var doubles for data
+  double *sendbuf = new double[total_send * stride];
+  double *recvbuf = new double[total_recv * stride];
+
+  // Temporary counters for packing
+  int *pack_pos = new int[nprocs];
+  memset(pack_pos, 0, sizeof(int) * nprocs);
+
+  for (int j = 0; j < NN; j++)
+  {
+    int own = owner_rank[j];
+    int con = consumer_rank[j];
+    if (own != myrank || con == myrank)
+      continue;
+    int pos = (send_offset[con] + pack_pos[con]) * stride;
+    sendbuf[pos] = (double)j; // point index
+    for (int v = 0; v < num_var; v++)
+      sendbuf[pos + 1 + v] = Shellf[j * num_var + v];
+    pack_pos[con]++;
+  }
+
+  // Post non-blocking recvs and sends
+  int n_req = 0;
+  for (int r = 0; r < nprocs; r++)
+  {
+    if (recv_count[r] > 0) n_req++;
+    if (send_count[r] > 0) n_req++;
+  }
+
+  MPI_Request *reqs = new MPI_Request[n_req];
+  int req_idx = 0;
+
+  for (int r = 0; r < nprocs; r++)
+  {
+    if (recv_count[r] > 0)
+    {
+      MPI_Irecv(recvbuf + recv_offset[r] * stride,
+                recv_count[r] * stride, MPI_DOUBLE,
+                r, 0, MPI_COMM_WORLD, &reqs[req_idx++]);
+    }
+  }
+  for (int r = 0; r < nprocs; r++)
+  {
+    if (send_count[r] > 0)
+    {
+      MPI_Isend(sendbuf + send_offset[r] * stride,
+                send_count[r] * stride, MPI_DOUBLE,
+                r, 0, MPI_COMM_WORLD, &reqs[req_idx++]);
+    }
+  }
+
+  if (n_req > 0)
+    MPI_Waitall(n_req, reqs, MPI_STATUSES_IGNORE);
+
+  // Unpack recv buffers into Shellf
+  for (int i = 0; i < total_recv; i++)
+  {
+    int pos = i * stride;
+    int j = (int)recvbuf[pos];
+    for (int v = 0; v < num_var; v++)
+      Shellf[j * num_var + v] = recvbuf[pos + 1 + v];
+  }
+
+  delete[] reqs;
+  delete[] sendbuf;
+  delete[] recvbuf;
+  delete[] pack_pos;
+  delete[] send_offset;
+  delete[] recv_offset;
+  delete[] send_count;
+  delete[] recv_count;
+  delete[] consumer_rank;
+  delete[] owner_rank;
+}
 void Patch::Interp_Points(MyList<var> *VarList,
                           int NN, double **XX,
                           double *Shellf, int Symmetry, MPI_Comm Comm_here)
diff --git a/AMSS_NCKU_source/MPatch.h b/AMSS_NCKU_source/MPatch.h
index ca19ca5..b993be6 100644
--- a/AMSS_NCKU_source/MPatch.h
+++ b/AMSS_NCKU_source/MPatch.h
@@ -39,6 +39,10 @@ public:
 
    bool Find_Point(double *XX);
 
+   void Interp_Points(MyList<var> *VarList,
+                      int NN, double **XX,
+                      double *Shellf, int Symmetry,
+                      int Nmin_consumer, int Nmax_consumer);
    void Interp_Points(MyList<var> *VarList,
                       int NN, double **XX,
                       double *Shellf, int Symmetry, MPI_Comm Comm_here);
diff --git a/AMSS_NCKU_source/surface_integral.C b/AMSS_NCKU_source/surface_integral.C
index e725ae0..c2b7b67 100644
--- a/AMSS_NCKU_source/surface_integral.C
+++ b/AMSS_NCKU_source/surface_integral.C
@@ -220,16 +220,9 @@ void surface_integral::surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *
     pox[2][n] = rex * nz_g[n];
   }
 
-  double *shellf;
-  shellf = new double[n_tot * InList];
-
-  GH->PatL[lev]->data->Interp_Points(DG_List, n_tot, pox, shellf, Symmetry);
-
   int mp, Lp, Nmin, Nmax;
-
   mp = n_tot / cpusize;
   Lp = n_tot - cpusize * mp;
-
   if (Lp > myrank)
   {
     Nmin = myrank * mp + myrank;
@@ -241,6 +234,11 @@ void surface_integral::surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *
     Nmax = Nmin + mp - 1;
   }
 
+  double *shellf;
+  shellf = new double[n_tot * InList];
+
+  GH->PatL[lev]->data->Interp_Points(DG_List, n_tot, pox, shellf, Symmetry, Nmin, Nmax);
+
   //|~~~~~> Integrate the dot product of Dphi with the surface normal.
 
   double *RP_out, *IP_out;
@@ -2386,25 +2384,9 @@ void surface_integral::surf_MassPAng(double rex, int lev, cgh *GH, var *chi, var
     pox[2][n] = rex * nz_g[n];
   }
 
-  double *shellf;
-  shellf = new double[n_tot * InList];
-
-  // we have assumed there is only one box on this level,
-  // so we do not need loop boxes
-  GH->PatL[lev]->data->Interp_Points(DG_List, n_tot, pox, shellf, Symmetry);
-
-  double Mass_out = 0;
-  double ang_outx, ang_outy, ang_outz;
-  double p_outx, p_outy, p_outz;
-  ang_outx = ang_outy = ang_outz = 0.0;
-  p_outx = p_outy = p_outz = 0.0;
-  const double f1o8 = 0.125;
-
   int mp, Lp, Nmin, Nmax;
-
   mp = n_tot / cpusize;
   Lp = n_tot - cpusize * mp;
-
   if (Lp > myrank)
   {
     Nmin = myrank * mp + myrank;
@@ -2416,6 +2398,20 @@ void surface_integral::surf_MassPAng(double rex, int lev, cgh *GH, var *chi, var
     Nmax = Nmin + mp - 1;
   }
 
+  double *shellf;
+  shellf = new double[n_tot * InList];
+
+  // we have assumed there is only one box on this level,
+  // so we do not need loop boxes
+  GH->PatL[lev]->data->Interp_Points(DG_List, n_tot, pox, shellf, Symmetry, Nmin, Nmax);
+
+  double Mass_out = 0;
+  double ang_outx, ang_outy, ang_outz;
+  double p_outx, p_outy, p_outz;
+  ang_outx = ang_outy = ang_outz = 0.0;
+  p_outx = p_outy = p_outz = 0.0;
+  const double f1o8 = 0.125;
+
   double Chi, Psi;
   double Gxx, Gxy, Gxz, Gyy, Gyz, Gzz;
   double gupxx, gupxy, gupxz, gupyy, gupyz, gupzz;