Add OpenMP parallelization to BSSN RHS hot-path stencil routines

Enable OpenMP threading for the dominant computational kernels:

- makefile.inc: add -qopenmp to f90appflags
- diff_new.f90: split fderivs/fdderivs into OpenMP interior + serial boundary
- kodiss.f90: split kodis into OpenMP interior + serial boundary
- lopsidediff.f90: add OMP PARALLEL DO COLLAPSE(2) to lopsided
- fmisc.f90: parallelize symmetry_bd bulk array copy
- bssn_rhs.f90: add OMP WORKSHARE to array-syntax operations

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-07 13:58:55 +08:00
parent 09ffdb553d
commit c6e4d4ab71
6 changed files with 162 additions and 56 deletions
--- a/AMSS_NCKU_source/kodiss.f90
+++ b/AMSS_NCKU_source/kodiss.f90
@@ -159,36 +159,12 @@ integer, parameter :: NO_SYMM=0, OCTANT=2

  call symmetry_bd(3,ex,f,fh,SoA)

-  do k=1,ex(3)
-  do j=1,ex(2)
-  do i=1,ex(1)
-
-  if(i-3 >= imin .and. i+3 <= imax .and. &
-     j-3 >= jmin .and. j+3 <= jmax .and. &
-     k-3 >= kmin .and. k+3 <= kmax) then
-#if 0     
-! x direction
-   f_rhs(i,j,k)       = f_rhs(i,j,k) + eps/dX/cof * (     &
-                              (fh(i-3,j,k)+fh(i+3,j,k)) - &
-                          SIX*(fh(i-2,j,k)+fh(i+2,j,k)) + &
-                          FIT*(fh(i-1,j,k)+fh(i+1,j,k)) - &
-                          TWT* fh(i,j,k)            )
-! y direction
-
-   f_rhs(i,j,k)       = f_rhs(i,j,k) + eps/dY/cof * (     &
-                              (fh(i,j-3,k)+fh(i,j+3,k)) - &
-                          SIX*(fh(i,j-2,k)+fh(i,j+2,k)) + &
-                          FIT*(fh(i,j-1,k)+fh(i,j+1,k)) - &
-                          TWT* fh(i,j,k)            )
-! z direction
-
-   f_rhs(i,j,k)       = f_rhs(i,j,k) + eps/dZ/cof * (     &
-                              (fh(i,j,k-3)+fh(i,j,k+3)) - &
-                          SIX*(fh(i,j,k-2)+fh(i,j,k+2)) + &
-                          FIT*(fh(i,j,k-1)+fh(i,j,k+1)) - &
-                          TWT* fh(i,j,k)            )
-#else
-! calculation order if important ?
+! Interior: all stencil points guaranteed in-bounds
+  !$OMP PARALLEL DO COLLAPSE(2) SCHEDULE(static) PRIVATE(i,j,k)
+  do k=4,ex(3)-3
+  do j=4,ex(2)-3
+  !DIR$ IVDEP
+  do i=4,ex(1)-3
   f_rhs(i,j,k)       = f_rhs(i,j,k) + eps/cof *( (     &
                              (fh(i-3,j,k)+fh(i+3,j,k)) - &
                          SIX*(fh(i-2,j,k)+fh(i+2,j,k)) + &
@@ -204,9 +180,37 @@ integer, parameter :: NO_SYMM=0, OCTANT=2
                          SIX*(fh(i,j,k-2)+fh(i,j,k+2)) + &
                          FIT*(fh(i,j,k-1)+fh(i,j,k+1)) - &
                          TWT* fh(i,j,k)            )/dZ )
-#endif
-  endif
+  enddo
+  enddo
+  enddo
+  !$OMP END PARALLEL DO

+! Boundary shell: original branching logic for points near edges
+  do k=1,ex(3)
+  do j=1,ex(2)
+  do i=1,ex(1)
+  if(i >= 4 .and. i <= ex(1)-3 .and. &
+     j >= 4 .and. j <= ex(2)-3 .and. &
+     k >= 4 .and. k <= ex(3)-3) cycle
+  if(i-3 >= imin .and. i+3 <= imax .and. &
+     j-3 >= jmin .and. j+3 <= jmax .and. &
+     k-3 >= kmin .and. k+3 <= kmax) then
+   f_rhs(i,j,k)       = f_rhs(i,j,k) + eps/cof *( (     &
+                              (fh(i-3,j,k)+fh(i+3,j,k)) - &
+                          SIX*(fh(i-2,j,k)+fh(i+2,j,k)) + &
+                          FIT*(fh(i-1,j,k)+fh(i+1,j,k)) - &
+                          TWT* fh(i,j,k)            )/dX + &
+                                                  (     &
+                              (fh(i,j-3,k)+fh(i,j+3,k)) - &
+                          SIX*(fh(i,j-2,k)+fh(i,j+2,k)) + &
+                          FIT*(fh(i,j-1,k)+fh(i,j+1,k)) - &
+                          TWT* fh(i,j,k)            )/dY + &
+                                                  (     &
+                              (fh(i,j,k-3)+fh(i,j,k+3)) - &
+                          SIX*(fh(i,j,k-2)+fh(i,j,k+2)) + &
+                          FIT*(fh(i,j,k-1)+fh(i,j,k+1)) - &
+                          TWT* fh(i,j,k)            )/dZ )
+  endif
  enddo
  enddo
  enddo