Compare commits

..

5 Commits

Author SHA1 Message Date
CGH0S7
75be0968fc feat: port GPU code to CUDA 13 and enable GPU computation
Major changes:
   - Update makefile.inc for CUDA 13.1 with sm_89 architecture (RTX 4050)
   - Replace deprecated cudaThreadSynchronize() with cudaDeviceSynchronize()
   - Add CUDA_SAFE_CALL macro for CUDA 13 compatibility
   - Fix duplicate function definitions (compare_result_gpu, SHStep)
   - Fix syntax error in bssn_step_gpu.C
   - Enable GPU calculation in AMSS_NCKU_Input.py
   - Successfully build ABEGPU executable
2026-01-13 18:15:49 +00:00
CGH0S7
b27e071cde Makefile updated for rocky10 2026-01-14 01:41:31 +08:00
a1125d4c79 try to build gpu version 2026-01-13 23:52:44 +08:00
dcc66588fc gitignore updated 2026-01-13 23:45:49 +08:00
950d448edf fix(build): update LDLIBS to use -lmpi and remove hardcoded paths 2026-01-13 23:40:51 +08:00
16 changed files with 293 additions and 604 deletions

3
.gitignore vendored
View File

@@ -1,6 +1,3 @@
__pycache__
GW150914
GW150914-origin
docs
*.tmp

View File

@@ -16,12 +16,12 @@ import numpy
File_directory = "GW150914" ## output file directory
Output_directory = "binary_output" ## binary data file directory
## The file directory name should not be too long
MPI_processes = 48 ## number of mpi processes used in the simulation
MPI_processes = 96 ## number of mpi processes used in the simulation
GPU_Calculation = "no" ## Use GPU or not
## (prefer "no" in the current version, because the GPU part may have bugs when integrated in this Python interface)
CPU_Part = 1.0
GPU_Part = 0.0
GPU_Calculation = "yes" ## Use GPU or not
## GPU support has been updated for CUDA 13
CPU_Part = 0.0
GPU_Part = 1.0
#################################################

View File

@@ -1,279 +0,0 @@
#!/usr/bin/env python3
"""
AMSS-NCKU GW150914 Simulation Regression Test Script
Verification Requirements:
1. XY-plane trajectory RMS error < 1% (Optimized vs. baseline, max of BH1 and BH2)
2. ADM constraint violation < 2 (Grid Level 0)
RMS Calculation Method:
- Computes trajectory deviation on the XY plane independently for BH1 and BH2
- For each black hole: RMS = sqrt((1/M) * sum((Δr_i / r_i^max)^2)) × 100%
- Final RMS = max(RMS_BH1, RMS_BH2)
Usage: python3 AMSS_NCKU_Verify_ASC26.py [output_dir]
Default: output_dir = GW150914/AMSS_NCKU_output
Reference: GW150914-origin (baseline simulation)
"""
import numpy as np
import sys
import os
# ANSI Color Codes
class Color:
GREEN = '\033[92m'
RED = '\033[91m'
YELLOW = '\033[93m'
BLUE = '\033[94m'
BOLD = '\033[1m'
RESET = '\033[0m'
def get_status_text(passed):
if passed:
return f"{Color.GREEN}{Color.BOLD}PASS{Color.RESET}"
else:
return f"{Color.RED}{Color.BOLD}FAIL{Color.RESET}"
def load_bh_trajectory(filepath):
"""Load black hole trajectory data"""
data = np.loadtxt(filepath)
return {
'time': data[:, 0],
'x1': data[:, 1], 'y1': data[:, 2], 'z1': data[:, 3],
'x2': data[:, 4], 'y2': data[:, 5], 'z2': data[:, 6]
}
def load_constraint_data(filepath):
"""Load constraint violation data"""
data = []
with open(filepath, 'r') as f:
for line in f:
if line.startswith('#'):
continue
parts = line.split()
if len(parts) >= 8:
data.append([float(x) for x in parts[:8]])
return np.array(data)
def calculate_rms_error(bh_data_ref, bh_data_target):
"""
Calculate trajectory-based RMS error on the XY plane between baseline and optimized simulations.
This function computes the RMS error independently for BH1 and BH2 trajectories,
then returns the maximum of the two as the final RMS error metric.
For each black hole, the RMS is calculated as:
RMS = sqrt( (1/M) * sum( (Δr_i / r_i^max)^2 ) ) × 100%
where:
Δr_i = sqrt((x_ref,i - x_new,i)^2 + (y_ref,i - y_new,i)^2)
r_i^max = max(sqrt(x_ref,i^2 + y_ref,i^2), sqrt(x_new,i^2 + y_new,i^2))
Args:
bh_data_ref: Reference (baseline) trajectory data
bh_data_target: Target (optimized) trajectory data
Returns:
rms_value: Final RMS error as a percentage (max of BH1 and BH2)
error: Error message if any
"""
# Align data: truncate to the length of the shorter dataset
M = min(len(bh_data_ref['time']), len(bh_data_target['time']))
if M < 10:
return None, "Insufficient data points for comparison"
# Extract XY coordinates for both black holes
x1_ref = bh_data_ref['x1'][:M]
y1_ref = bh_data_ref['y1'][:M]
x2_ref = bh_data_ref['x2'][:M]
y2_ref = bh_data_ref['y2'][:M]
x1_new = bh_data_target['x1'][:M]
y1_new = bh_data_target['y1'][:M]
x2_new = bh_data_target['x2'][:M]
y2_new = bh_data_target['y2'][:M]
# Calculate RMS for BH1
delta_r1 = np.sqrt((x1_ref - x1_new)**2 + (y1_ref - y1_new)**2)
r1_ref = np.sqrt(x1_ref**2 + y1_ref**2)
r1_new = np.sqrt(x1_new**2 + y1_new**2)
r1_max = np.maximum(r1_ref, r1_new)
# Calculate RMS for BH2
delta_r2 = np.sqrt((x2_ref - x2_new)**2 + (y2_ref - y2_new)**2)
r2_ref = np.sqrt(x2_ref**2 + y2_ref**2)
r2_new = np.sqrt(x2_new**2 + y2_new**2)
r2_max = np.maximum(r2_ref, r2_new)
# Avoid division by zero for BH1
valid_mask1 = r1_max > 1e-15
if np.sum(valid_mask1) < 10:
return None, "Insufficient valid data points for BH1"
terms1 = (delta_r1[valid_mask1] / r1_max[valid_mask1])**2
rms_bh1 = np.sqrt(np.mean(terms1)) * 100
# Avoid division by zero for BH2
valid_mask2 = r2_max > 1e-15
if np.sum(valid_mask2) < 10:
return None, "Insufficient valid data points for BH2"
terms2 = (delta_r2[valid_mask2] / r2_max[valid_mask2])**2
rms_bh2 = np.sqrt(np.mean(terms2)) * 100
# Final RMS is the maximum of BH1 and BH2
rms_final = max(rms_bh1, rms_bh2)
return rms_final, None
def analyze_constraint_violation(constraint_data, n_levels=9):
"""
Analyze ADM constraint violation
Return maximum constraint violation for Grid Level 0
"""
# Extract Grid Level 0 data (first entry for each time step)
level0_data = constraint_data[::n_levels]
# Calculate maximum absolute value for each constraint
results = {
'Ham': np.max(np.abs(level0_data[:, 1])),
'Px': np.max(np.abs(level0_data[:, 2])),
'Py': np.max(np.abs(level0_data[:, 3])),
'Pz': np.max(np.abs(level0_data[:, 4])),
'Gx': np.max(np.abs(level0_data[:, 5])),
'Gy': np.max(np.abs(level0_data[:, 6])),
'Gz': np.max(np.abs(level0_data[:, 7]))
}
results['max_violation'] = max(results.values())
return results
def print_header():
"""Print report header"""
print("\n" + Color.BLUE + Color.BOLD + "=" * 65 + Color.RESET)
print(Color.BOLD + " AMSS-NCKU GW150914 Simulation Regression Test Report" + Color.RESET)
print(Color.BLUE + Color.BOLD + "=" * 65 + Color.RESET)
def print_rms_results(rms_rel, error, threshold=1.0):
"""Print RMS error results"""
print(f"\n{Color.BOLD}1. RMS Error Analysis (Baseline vs Optimized){Color.RESET}")
print("-" * 45)
if error:
print(f" {Color.RED}Error: {error}{Color.RESET}")
return False
passed = rms_rel < threshold
print(f" RMS relative error: {rms_rel:.4f}%")
print(f" Requirement: < {threshold}%")
print(f" Status: {get_status_text(passed)}")
return passed
def print_constraint_results(results, threshold=2.0):
"""Print constraint violation results"""
print(f"\n{Color.BOLD}2. ADM Constraint Violation Analysis (Grid Level 0){Color.RESET}")
print("-" * 45)
names = ['Ham', 'Px', 'Py', 'Pz', 'Gx', 'Gy', 'Gz']
for i, name in enumerate(names):
print(f" Max |{name:3}|: {results[name]:.6f}", end=" ")
if (i + 1) % 2 == 0: print()
if len(names) % 2 != 0: print()
passed = results['max_violation'] < threshold
print(f"\n Maximum violation: {results['max_violation']:.6f}")
print(f" Requirement: < {threshold}")
print(f" Status: {get_status_text(passed)}")
return passed
def print_summary(rms_passed, constraint_passed):
"""Print summary"""
print("\n" + Color.BLUE + Color.BOLD + "=" * 65 + Color.RESET)
print(Color.BOLD + "Verification Summary" + Color.RESET)
print(Color.BLUE + Color.BOLD + "=" * 65 + Color.RESET)
all_passed = rms_passed and constraint_passed
res_rms = get_status_text(rms_passed)
res_con = get_status_text(constraint_passed)
print(f" [1] RMS trajectory check: {res_rms}")
print(f" [2] ADM constraint check: {res_con}")
final_status = f"{Color.GREEN}{Color.BOLD}ALL CHECKS PASSED{Color.RESET}" if all_passed else f"{Color.RED}{Color.BOLD}SOME CHECKS FAILED{Color.RESET}"
print(f"\n Overall result: {final_status}")
print(Color.BLUE + Color.BOLD + "=" * 65 + Color.RESET + "\n")
return all_passed
def main():
# Determine target (optimized) output directory
if len(sys.argv) > 1:
target_dir = sys.argv[1]
else:
script_dir = os.path.dirname(os.path.abspath(__file__))
target_dir = os.path.join(script_dir, "GW150914/AMSS_NCKU_output")
# Determine reference (baseline) directory
script_dir = os.path.dirname(os.path.abspath(__file__))
reference_dir = os.path.join(script_dir, "GW150914-origin/AMSS_NCKU_output")
# Data file paths
bh_file_ref = os.path.join(reference_dir, "bssn_BH.dat")
bh_file_target = os.path.join(target_dir, "bssn_BH.dat")
constraint_file = os.path.join(target_dir, "bssn_constraint.dat")
# Check if files exist
if not os.path.exists(bh_file_ref):
print(f"{Color.RED}{Color.BOLD}Error:{Color.RESET} Baseline trajectory file not found: {bh_file_ref}")
sys.exit(1)
if not os.path.exists(bh_file_target):
print(f"{Color.RED}{Color.BOLD}Error:{Color.RESET} Target trajectory file not found: {bh_file_target}")
sys.exit(1)
if not os.path.exists(constraint_file):
print(f"{Color.RED}{Color.BOLD}Error:{Color.RESET} Constraint data file not found: {constraint_file}")
sys.exit(1)
# Print header
print_header()
print(f"\n{Color.BOLD}Reference (Baseline):{Color.RESET} {Color.BLUE}{reference_dir}{Color.RESET}")
print(f"{Color.BOLD}Target (Optimized): {Color.RESET} {Color.BLUE}{target_dir}{Color.RESET}")
# Load data
bh_data_ref = load_bh_trajectory(bh_file_ref)
bh_data_target = load_bh_trajectory(bh_file_target)
constraint_data = load_constraint_data(constraint_file)
# Calculate RMS error
rms_rel, error = calculate_rms_error(bh_data_ref, bh_data_target)
rms_passed = print_rms_results(rms_rel, error)
# Analyze constraint violation
constraint_results = analyze_constraint_violation(constraint_data)
constraint_passed = print_constraint_results(constraint_results)
# Print summary
all_passed = print_summary(rms_passed, constraint_passed)
# Return exit code
sys.exit(0 if all_passed else 1)
if __name__ == "__main__":
main()

View File

@@ -37,51 +37,57 @@ close(77)
end program checkFFT
#endif
!-------------
! Optimized FFT using Intel oneMKL DFTI
! Mathematical equivalence: Standard DFT definition
! Forward (isign=1): X[k] = sum_{n=0}^{N-1} x[n] * exp(-2*pi*i*k*n/N)
! Backward (isign=-1): X[k] = sum_{n=0}^{N-1} x[n] * exp(+2*pi*i*k*n/N)
! Input/Output: dataa is interleaved complex array [Re(0),Im(0),Re(1),Im(1),...]
!-------------
SUBROUTINE four1(dataa,nn,isign)
use MKL_DFTI
implicit none
INTEGER, intent(in) :: isign, nn
DOUBLE PRECISION, dimension(2*nn), intent(inout) :: dataa
type(DFTI_DESCRIPTOR), pointer :: desc
integer :: status
! Create DFTI descriptor for 1D complex-to-complex transform
status = DftiCreateDescriptor(desc, DFTI_DOUBLE, DFTI_COMPLEX, 1, nn)
if (status /= 0) return
! Set input/output storage as interleaved complex (default)
status = DftiSetValue(desc, DFTI_PLACEMENT, DFTI_INPLACE)
if (status /= 0) then
status = DftiFreeDescriptor(desc)
return
INTEGER::isign,nn
double precision,dimension(2*nn)::dataa
INTEGER::i,istep,j,m,mmax,n
double precision::tempi,tempr
DOUBLE PRECISION::theta,wi,wpi,wpr,wr,wtemp
n=2*nn
j=1
do i=1,n,2
if(j.gt.i)then
tempr=dataa(j)
tempi=dataa(j+1)
dataa(j)=dataa(i)
dataa(j+1)=dataa(i+1)
dataa(i)=tempr
dataa(i+1)=tempi
endif
m=nn
1 if ((m.ge.2).and.(j.gt.m)) then
j=j-m
m=m/2
goto 1
endif
j=j+m
enddo
mmax=2
2 if (n.gt.mmax) then
istep=2*mmax
theta=6.28318530717959d0/(isign*mmax)
wpr=-2.d0*sin(0.5d0*theta)**2
wpi=sin(theta)
wr=1.d0
wi=0.d0
do m=1,mmax,2
do i=m,n,istep
j=i+mmax
tempr=sngl(wr)*dataa(j)-sngl(wi)*dataa(j+1)
tempi=sngl(wr)*dataa(j+1)+sngl(wi)*dataa(j)
dataa(j)=dataa(i)-tempr
dataa(j+1)=dataa(i+1)-tempi
dataa(i)=dataa(i)+tempr
dataa(i+1)=dataa(i+1)+tempi
enddo
wtemp=wr
wr=wr*wpr-wi*wpi+wr
wi=wi*wpr+wtemp*wpi+wi
enddo
mmax=istep
goto 2
endif
! Commit the descriptor
status = DftiCommitDescriptor(desc)
if (status /= 0) then
status = DftiFreeDescriptor(desc)
return
endif
! Execute FFT based on direction
if (isign == 1) then
! Forward FFT: exp(-2*pi*i*k*n/N)
status = DftiComputeForward(desc, dataa)
else
! Backward FFT: exp(+2*pi*i*k*n/N)
status = DftiComputeBackward(desc, dataa)
endif
! Free descriptor
status = DftiFreeDescriptor(desc)
return
END SUBROUTINE four1

View File

@@ -27,7 +27,6 @@ using namespace std;
#endif
#include "TwoPunctures.h"
#include <mkl_cblas.h>
TwoPunctures::TwoPunctures(double mp, double mm, double b,
double P_plusx, double P_plusy, double P_plusz,
@@ -892,17 +891,25 @@ double TwoPunctures::norm1(double *v, int n)
/* -------------------------------------------------------------------------*/
double TwoPunctures::norm2(double *v, int n)
{
// Optimized with oneMKL BLAS DNRM2
// Computes: sqrt(sum(v[i]^2))
return cblas_dnrm2(n, v, 1);
int i;
double result = 0;
for (i = 0; i < n; i++)
result += v[i] * v[i];
return sqrt(result);
}
/* -------------------------------------------------------------------------*/
double TwoPunctures::scalarproduct(double *v, double *w, int n)
{
// Optimized with oneMKL BLAS DDOT
// Computes: sum(v[i] * w[i])
return cblas_ddot(n, v, 1, w, 1);
int i;
double result = 0;
for (i = 0; i < n; i++)
result += v[i] * w[i];
return result;
}
/* -------------------------------------------------------------------------*/

View File

@@ -18,7 +18,7 @@ using namespace std;
#include <fstream>
#endif
void compare_result_gpu(int ftag1,double * datac,int data_num){
static void compare_result_gpu(int ftag1,double * datac,int data_num){
double * data = (double*)malloc(sizeof(double)*data_num);
cudaMemcpy(data, datac, data_num * sizeof(double), cudaMemcpyDeviceToHost);
compare_result(ftag1,data,data_num);
@@ -83,7 +83,7 @@ inline void sub_enforce_ga(int matrix_size){
double * trA = M_ chin1;
enforce_ga<<<GRID_DIM,BLOCK_DIM>>>(trA);
cudaMemset(trA,0,matrix_size * sizeof(double));
cudaThreadSynchronize();
cudaDeviceSynchronize();
//cudaMemset(Mh_ gupxx,0,matrix_size * sizeof(double));
//trA gxx,gyy,gzz gupxx,gupxy,gupxz,gupyy,gupyz,gupzz
@@ -273,13 +273,13 @@ __global__ void sub_symmetry_bd_partK(int ord,double * func, double * funcc,doub
#endif //ifdef Vertex
inline void sub_symmetry_bd(int ord,double * func, double * funcc,double * SoA){
sub_symmetry_bd_partF<<<GRID_DIM,BLOCK_DIM>>>(ord,func,funcc);
cudaThreadSynchronize();
cudaDeviceSynchronize();
sub_symmetry_bd_partI<<<GRID_DIM,BLOCK_DIM>>>(ord,func,funcc,SoA[0]);
cudaThreadSynchronize();
cudaDeviceSynchronize();
sub_symmetry_bd_partJ<<<GRID_DIM,BLOCK_DIM>>>(ord,func,funcc,SoA[1]);
cudaThreadSynchronize();
cudaDeviceSynchronize();
sub_symmetry_bd_partK<<<GRID_DIM,BLOCK_DIM>>>(ord,func,funcc,SoA[2]);
cudaThreadSynchronize();
cudaDeviceSynchronize();
}
@@ -378,9 +378,9 @@ inline void sub_fdderivs(double * f,double *fh,double *fxx,double *fxy,double *f
cudaMemset(fyy,0,_3D_SIZE[0] * sizeof(double));
cudaMemset(fyz,0,_3D_SIZE[0] * sizeof(double));
cudaMemset(fzz,0,_3D_SIZE[0] * sizeof(double));
cudaThreadSynchronize();
cudaDeviceSynchronize();
sub_fdderivs_part1<<<GRID_DIM,BLOCK_DIM>>>(f,fh,fxx,fxy,fxz,fyy,fyz,fzz);
cudaThreadSynchronize();
cudaDeviceSynchronize();
}
__global__ void sub_fderivs_part1(double * f,double * fh,double *fx,double *fy,double *fz )
@@ -445,9 +445,9 @@ inline void sub_fderivs(double * f,double * fh,double *fx,double *fy,double *fz,
cudaMemset(fy,0,_3D_SIZE[0] * sizeof(double));
cudaMemset(fz,0,_3D_SIZE[0] * sizeof(double));
cudaThreadSynchronize();
cudaDeviceSynchronize();
sub_fderivs_part1<<<GRID_DIM,BLOCK_DIM>>>(f,fh,fx,fy,fz);
cudaThreadSynchronize();
cudaDeviceSynchronize();
}
__global__ void computeRicci_part1(double * dst)
@@ -465,9 +465,9 @@ __global__ void computeRicci_part1(double * dst)
inline void computeRicci(double * src,double* dst,double * SoA, Meta* meta)
{
sub_fdderivs(src,Mh_ fh,Mh_ fxx,Mh_ fxy,Mh_ fxz,Mh_ fyy,Mh_ fyz,Mh_ fzz,SoA);
cudaThreadSynchronize();
cudaDeviceSynchronize();
computeRicci_part1<<<GRID_DIM,BLOCK_DIM>>>(dst);
cudaThreadSynchronize();
cudaDeviceSynchronize();
}/*Exception*/
@@ -524,9 +524,9 @@ __global__ void sub_kodis_part1(double *f,double *fh,double *f_rhs)
inline void sub_kodis(double *f,double *fh,double *f_rhs,double *SoA)
{
sub_symmetry_bd(3,f,fh,SoA);
cudaThreadSynchronize();
cudaDeviceSynchronize();
sub_kodis_part1<<<GRID_DIM,BLOCK_DIM>>>(f,fh,f_rhs);
cudaThreadSynchronize();
cudaDeviceSynchronize();
}
__global__ void sub_lopsided_part1(double *f,double* fh,double *f_rhs,double *Sfx,double *Sfy,double *Sfz)
@@ -617,9 +617,9 @@ __global__ void sub_lopsided_part1(double *f,double* fh,double *f_rhs,double *S
inline void sub_lopsided(double *f,double*fh,double *f_rhs,double *Sfx,double *Sfy,double *Sfz,double *SoA){
sub_symmetry_bd(3,f,fh,SoA);
cudaThreadSynchronize();
cudaDeviceSynchronize();
sub_lopsided_part1<<<GRID_DIM,BLOCK_DIM>>>(f,fh,f_rhs,Sfx,Sfy,Sfz);
cudaThreadSynchronize();
cudaDeviceSynchronize();
}
__global__ void compute_rhs_bssn_part1()
@@ -2656,13 +2656,13 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
#ifdef TIMING1
cudaThreadSynchronize();
cudaDeviceSynchronize();
gettimeofday(&tv2, NULL);
cout<<"TIME USED"<<TimeBetween(tv1, tv2)<<endl;
#endif
//cout<<"GPU meta data ready.\n";
cudaThreadSynchronize();
cudaDeviceSynchronize();
//--------------test constant memory address & value--------------
/* double rank = mpi_rank;
@@ -2685,7 +2685,7 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
//sub_enforce_ga(matrix_size);
//4.1-----compute rhs---------
compute_rhs_bssn_part1<<<GRID_DIM,BLOCK_DIM>>>();
cudaThreadSynchronize();
cudaDeviceSynchronize();
sub_fderivs(Mh_ betax,Mh_ fh,Mh_ betaxx,Mh_ betaxy,Mh_ betaxz,ass);
sub_fderivs(Mh_ betay,Mh_ fh,Mh_ betayx,Mh_ betayy,Mh_ betayz,sas);
@@ -2701,7 +2701,7 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
sub_fderivs(Mh_ gyz,Mh_ fh,Mh_ gyzx,Mh_ gyzy,Mh_ gyzz, saa);
compute_rhs_bssn_part2<<<GRID_DIM,BLOCK_DIM>>>();
cudaThreadSynchronize();
cudaDeviceSynchronize();
sub_fdderivs(Mh_ betax,Mh_ fh,Mh_ gxxx,Mh_ gxyx,Mh_ gxzx,Mh_ gyyx,Mh_ gyzx,Mh_ gzzx,ass);
sub_fdderivs(Mh_ betay,Mh_ fh,Mh_ gxxy,Mh_ gxyy,Mh_ gxzy,Mh_ gyyy,Mh_ gyzy,Mh_ gzzy,sas);
@@ -2711,7 +2711,7 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
sub_fderivs( Mh_ Gamz, Mh_ fh,Mh_ Gamzx, Mh_ Gamzy, Mh_ Gamzz,ssa);
compute_rhs_bssn_part3<<<GRID_DIM,BLOCK_DIM>>>();
cudaThreadSynchronize();
cudaDeviceSynchronize();
computeRicci(Mh_ dxx,Mh_ Rxx,sss, meta);
computeRicci(Mh_ dyy,Mh_ Ryy,sss, meta);
@@ -2720,20 +2720,20 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
computeRicci(Mh_ gxz,Mh_ Rxz,asa, meta);
computeRicci(Mh_ gyz,Mh_ Ryz,saa, meta);
cudaThreadSynchronize();
cudaDeviceSynchronize();
compute_rhs_bssn_part4<<<GRID_DIM,BLOCK_DIM>>>();
cudaThreadSynchronize();
cudaDeviceSynchronize();
sub_fdderivs(Mh_ chi,Mh_ fh,Mh_ fxx,Mh_ fxy,Mh_ fxz,Mh_ fyy,Mh_ fyz,Mh_ fzz,sss);
compute_rhs_bssn_part5<<<GRID_DIM,BLOCK_DIM>>>();
cudaThreadSynchronize();
cudaDeviceSynchronize();
sub_fdderivs(Mh_ Lap,Mh_ fh,Mh_ fxx,Mh_ fxy,Mh_ fxz,Mh_ fyy,Mh_ fyz,Mh_ fzz,sss);
compute_rhs_bssn_part6<<<GRID_DIM,BLOCK_DIM>>>();
cudaThreadSynchronize();
cudaDeviceSynchronize();
#if (GAUGE == 2 || GAUGE == 3 || GAUGE == 4 || GAUGE == 5)
sub_fderivs(Mh_ chi,Mh_ fh, Mh_ dtSfx_rhs, Mh_ dtSfy_rhs, Mh_ dtSfz_rhs,sss);
@@ -2805,7 +2805,7 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
if(co == 0){
compute_rhs_bssn_part7<<<GRID_DIM,BLOCK_DIM>>>();
cudaThreadSynchronize();
cudaDeviceSynchronize();
sub_fderivs(Mh_ Axx,Mh_ fh,Mh_ gxxx,Mh_ gxxy,Mh_ gxxz,sss);
sub_fderivs(Mh_ Axy,Mh_ fh,Mh_ gxyx,Mh_ gxyy,Mh_ gxyz,aas);
@@ -2814,7 +2814,7 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
sub_fderivs(Mh_ Ayz,Mh_ fh,Mh_ gyzx,Mh_ gyzy,Mh_ gyzz,saa);
sub_fderivs(Mh_ Azz,Mh_ fh,Mh_ gzzx,Mh_ gzzy,Mh_ gzzz,sss);
compute_rhs_bssn_part8<<<GRID_DIM,BLOCK_DIM>>>();
cudaThreadSynchronize();
cudaDeviceSynchronize();
}
#if (ABV == 1)
@@ -2895,7 +2895,7 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
//-------------------FOR GPU TEST----------------------
//-----------------------------------------------------
#ifdef TIMING
cudaThreadSynchronize();
cudaDeviceSynchronize();
gettimeofday(&tv2, NULL);
cout<<"MPI rank is: "<<mpi_rank<<" GPU TIME is"<<TimeBetween(tv1, tv2)<<" (s)."<<endl;
#endif

View File

@@ -4,6 +4,17 @@
#include "bssn_macro.h"
#include "macrodef.fh"
// CUDA error checking macro for CUDA 13 compatibility
#define CUDA_SAFE_CALL(call) \
do { \
cudaError_t err = call; \
if (err != cudaSuccess) { \
fprintf(stderr, "CUDA error in %s:%d: %s\n", __FILE__, __LINE__, \
cudaGetErrorString(err)); \
exit(EXIT_FAILURE); \
} \
} while(0)
#define DEVICE_ID 0
// #define DEVICE_ID_BY_MPI_RANK
#define GRID_DIM 256

View File

@@ -20,7 +20,7 @@ using namespace std;
__device__ volatile unsigned int global_count = 0;
void compare_result_gpu(int ftag1,double * datac,int data_num){
static void compare_result_gpu(int ftag1,double * datac,int data_num){
double * data = (double*)malloc(sizeof(double)*data_num);
cudaMemcpy(data, datac, data_num * sizeof(double), cudaMemcpyDeviceToHost);
compare_result(ftag1,data,data_num);
@@ -153,11 +153,11 @@ __global__ void sub_symmetry_bd_ss_partJ(int ord,double * func, double * funcc,d
inline void sub_symmetry_bd_ss(int ord,double * func, double * funcc,double * SoA){
sub_symmetry_bd_ss_partF<<<GRID_DIM,BLOCK_DIM>>>(ord,func,funcc);
cudaThreadSynchronize();
cudaDeviceSynchronize();
sub_symmetry_bd_ss_partI<<<GRID_DIM,BLOCK_DIM>>>(ord,func,funcc,SoA[0]);
cudaThreadSynchronize();
cudaDeviceSynchronize();
sub_symmetry_bd_ss_partJ<<<GRID_DIM,BLOCK_DIM>>>(ord,func,funcc,SoA[1]);
cudaThreadSynchronize();
cudaDeviceSynchronize();
}
__global__ void sub_fderivs_shc_part1(double *fx,double *fy,double *fz){
@@ -247,13 +247,13 @@ inline void sub_fderivs_shc(int& sst,double * f,double * fh,double *fx,double *f
//cudaMemset(Msh_ gy,0,h_3D_SIZE[0] * sizeof(double));
//cudaMemset(Msh_ gz,0,h_3D_SIZE[0] * sizeof(double));
sub_symmetry_bd_ss(2,f,fh,SoA1);
cudaThreadSynchronize();
cudaDeviceSynchronize();
//compare_result_gpu(0,fh,h_3D_SIZE[2]);
sub_fderivs_sh<<<GRID_DIM,BLOCK_DIM>>>(fh,Msh_ gx,Msh_ gy,Msh_ gz);
cudaThreadSynchronize();
cudaDeviceSynchronize();
sub_fderivs_shc_part1<<<GRID_DIM,BLOCK_DIM>>>(fx,fy,fz);
cudaThreadSynchronize();
cudaDeviceSynchronize();
//compare_result_gpu(1,fx,h_3D_SIZE[0]);
//compare_result_gpu(2,fy,h_3D_SIZE[0]);
//compare_result_gpu(3,fz,h_3D_SIZE[0]);
@@ -451,17 +451,17 @@ inline void sub_fdderivs_shc(int& sst,double * f,double * fh,
//fderivs_sh
sub_symmetry_bd_ss(2,f,fh,SoA1);
cudaThreadSynchronize();
cudaDeviceSynchronize();
//compare_result_gpu(1,fh,h_3D_SIZE[2]);
sub_fderivs_sh<<<GRID_DIM,BLOCK_DIM>>>(fh,Msh_ gx,Msh_ gy,Msh_ gz);
cudaThreadSynchronize();
cudaDeviceSynchronize();
//fdderivs_sh
sub_symmetry_bd_ss(2,f,fh,SoA1);
cudaThreadSynchronize();
cudaDeviceSynchronize();
//compare_result_gpu(21,fh,h_3D_SIZE[2]);
sub_fdderivs_sh<<<GRID_DIM,BLOCK_DIM>>>(fh,Msh_ gxx,Msh_ gxy,Msh_ gxz,Msh_ gyy,Msh_ gyz,Msh_ gzz);
cudaThreadSynchronize();
cudaDeviceSynchronize();
/*compare_result_gpu(11,Msh_ gx,h_3D_SIZE[0]);
compare_result_gpu(12,Msh_ gy,h_3D_SIZE[0]);
compare_result_gpu(13,Msh_ gz,h_3D_SIZE[0]);
@@ -472,7 +472,7 @@ inline void sub_fdderivs_shc(int& sst,double * f,double * fh,
compare_result_gpu(5,Msh_ gyz,h_3D_SIZE[0]);
compare_result_gpu(6,Msh_ gzz,h_3D_SIZE[0]);*/
sub_fdderivs_shc_part1<<<GRID_DIM,BLOCK_DIM>>>(fxx,fxy,fxz,fyy,fyz,fzz);
cudaThreadSynchronize();
cudaDeviceSynchronize();
/*compare_result_gpu(1,fxx,h_3D_SIZE[0]);
compare_result_gpu(2,fxy,h_3D_SIZE[0]);
compare_result_gpu(3,fxz,h_3D_SIZE[0]);
@@ -496,9 +496,9 @@ __global__ void computeRicci_ss_part1(double * dst)
inline void computeRicci_ss(int &sst,double * src,double* dst,double * SoA, Meta* meta)
{
sub_fdderivs_shc(sst,src,Mh_ fh,Mh_ fxx,Mh_ fxy,Mh_ fxz,Mh_ fyy,Mh_ fyz,Mh_ fzz,SoA);
cudaThreadSynchronize();
cudaDeviceSynchronize();
computeRicci_ss_part1<<<GRID_DIM,BLOCK_DIM>>>(dst);
cudaThreadSynchronize();
cudaDeviceSynchronize();
}
__global__ void sub_lopsided_ss_part1(double * dst)
@@ -516,9 +516,9 @@ __global__ void sub_lopsided_ss_part1(double * dst)
inline void sub_lopsided_ss(int& sst,double *src,double* dst,double *SoA)
{
sub_fderivs_shc(sst,src,Mh_ fh,Mh_ fxx,Mh_ fxy,Mh_ fxz,SoA);
cudaThreadSynchronize();
cudaDeviceSynchronize();
sub_lopsided_ss_part1<<<GRID_DIM,BLOCK_DIM>>>(dst);
cudaThreadSynchronize();
cudaDeviceSynchronize();
}
__global__ void sub_kodis_sh_part1(double *f,double *fh,double *f_rhs)
@@ -590,11 +590,11 @@ inline void sub_kodis_ss(int &sst,double *f,double *fh,double *f_rhs,double *SoA
}
//compare_result_gpu(10,f,h_3D_SIZE[0]);
sub_symmetry_bd_ss(3,f,fh,SoA1);
cudaThreadSynchronize();
cudaDeviceSynchronize();
//compare_result_gpu(0,fh,h_3D_SIZE[3]);
sub_kodis_sh_part1<<<GRID_DIM,BLOCK_DIM>>>(f,fh,f_rhs);
cudaThreadSynchronize();
cudaDeviceSynchronize();
//compare_result_gpu(1,f_rhs,h_3D_SIZE[0]);
}
@@ -2287,13 +2287,13 @@ int gpu_rhs_ss(RHS_SS_PARA)
#ifdef TIMING1
cudaThreadSynchronize();
cudaDeviceSynchronize();
gettimeofday(&tv2, NULL);
cout<<"TIME USED"<<TimeBetween(tv1, tv2)<<endl;
#endif
//cout<<"GPU meta data ready.\n";
cudaThreadSynchronize();
cudaDeviceSynchronize();
//-------------get device info-------------------------------------
@@ -2306,7 +2306,7 @@ int gpu_rhs_ss(RHS_SS_PARA)
//sub_enforce_ga(matrix_size);
//4.1-----compute rhs---------
compute_rhs_ss_part1<<<GRID_DIM,BLOCK_DIM>>>();
cudaThreadSynchronize();
cudaDeviceSynchronize();
sub_fderivs_shc(sst,Mh_ betax,Mh_ fh,Mh_ betaxx,Mh_ betaxy,Mh_ betaxz,ass);
sub_fderivs_shc(sst,Mh_ betay,Mh_ fh,Mh_ betayx,Mh_ betayy,Mh_ betayz,sas);
@@ -2322,7 +2322,7 @@ int gpu_rhs_ss(RHS_SS_PARA)
sub_fderivs_shc(sst,Mh_ gyz,Mh_ fh,Mh_ gyzx,Mh_ gyzy,Mh_ gyzz, saa);
compute_rhs_ss_part2<<<GRID_DIM,BLOCK_DIM>>>();
cudaThreadSynchronize();
cudaDeviceSynchronize();
sub_fdderivs_shc(sst,Mh_ betax,Mh_ fh,Mh_ gxxx,Mh_ gxyx,Mh_ gxzx,Mh_ gyyx,Mh_ gyzx,Mh_ gzzx,ass);
sub_fdderivs_shc(sst,Mh_ betay,Mh_ fh,Mh_ gxxy,Mh_ gxyy,Mh_ gxzy,Mh_ gyyy,Mh_ gyzy,Mh_ gzzy,sas);
@@ -2332,7 +2332,7 @@ int gpu_rhs_ss(RHS_SS_PARA)
sub_fderivs_shc( sst,Mh_ Gamz, Mh_ fh,Mh_ Gamzx, Mh_ Gamzy, Mh_ Gamzz,ssa);
compute_rhs_ss_part3<<<GRID_DIM,BLOCK_DIM>>>();
cudaThreadSynchronize();
cudaDeviceSynchronize();
computeRicci_ss(sst,Mh_ dxx,Mh_ Rxx,sss, meta);
computeRicci_ss(sst,Mh_ dyy,Mh_ Ryy,sss, meta);
@@ -2340,25 +2340,25 @@ int gpu_rhs_ss(RHS_SS_PARA)
computeRicci_ss(sst,Mh_ gxy,Mh_ Rxy,aas, meta);
computeRicci_ss(sst,Mh_ gxz,Mh_ Rxz,asa, meta);
computeRicci_ss(sst,Mh_ gyz,Mh_ Ryz,saa, meta);
cudaThreadSynchronize();
cudaDeviceSynchronize();
compute_rhs_ss_part4<<<GRID_DIM,BLOCK_DIM>>>();
cudaThreadSynchronize();
cudaDeviceSynchronize();
sub_fdderivs_shc(sst,Mh_ chi,Mh_ fh,Mh_ fxx,Mh_ fxy,Mh_ fxz,Mh_ fyy,Mh_ fyz,Mh_ fzz,sss);
//cudaThreadSynchronize();
//cudaDeviceSynchronize();
//compare_result_gpu(0,Mh_ chi,h_3D_SIZE[0]);
//compare_result_gpu(1,Mh_ chi,h_3D_SIZE[0]);
//compare_result_gpu(2,Mh_ fyz,h_3D_SIZE[0]);
compute_rhs_ss_part5<<<GRID_DIM,BLOCK_DIM>>>();
cudaThreadSynchronize();
cudaDeviceSynchronize();
sub_fdderivs_shc(sst,Mh_ Lap,Mh_ fh,Mh_ fxx,Mh_ fxy,Mh_ fxz,Mh_ fyy,Mh_ fyz,Mh_ fzz,sss);
compute_rhs_ss_part6<<<GRID_DIM,BLOCK_DIM>>>();
cudaThreadSynchronize();
cudaDeviceSynchronize();
#if (GAUGE == 2 || GAUGE == 3 || GAUGE == 4 || GAUGE == 5)
sub_fderivs_shc(sst,Mh_ chi,Mh_ fh, Mh_ dtSfx_rhs, Mh_ dtSfy_rhs, Mh_ dtSfz_rhs,sss);
@@ -2423,7 +2423,7 @@ int gpu_rhs_ss(RHS_SS_PARA)
}
if(co == 0){
compute_rhs_ss_part7<<<GRID_DIM,BLOCK_DIM>>>();
cudaThreadSynchronize();
cudaDeviceSynchronize();
sub_fderivs_shc(sst,Mh_ Axx,Mh_ fh,Mh_ gxxx,Mh_ gxxy,Mh_ gxxz,sss);
sub_fderivs_shc(sst,Mh_ Axy,Mh_ fh,Mh_ gxyx,Mh_ gxyy,Mh_ gxyz,aas);
@@ -2432,7 +2432,7 @@ int gpu_rhs_ss(RHS_SS_PARA)
sub_fderivs_shc(sst,Mh_ Ayz,Mh_ fh,Mh_ gyzx,Mh_ gyzy,Mh_ gyzz,saa);
sub_fderivs_shc(sst,Mh_ Azz,Mh_ fh,Mh_ gzzx,Mh_ gzzy,Mh_ gzzz,sss);
compute_rhs_ss_part8<<<GRID_DIM,BLOCK_DIM>>>();
cudaThreadSynchronize();
cudaDeviceSynchronize();
}
#if (ABV == 1)
@@ -2512,7 +2512,7 @@ int gpu_rhs_ss(RHS_SS_PARA)
//test kodis
//sub_kodis_sh(sst,Msh_ drhodx,Mh_ fh2,Msh_ drhody,sss);
#ifdef TIMING
cudaThreadSynchronize();
cudaDeviceSynchronize();
gettimeofday(&tv2, NULL);
cout<<"MPI rank is: "<<mpi_rank<<" GPU TIME is"<<TimeBetween(tv1, tv2)<<" (s)."<<endl;
#endif

View File

@@ -1676,8 +1676,11 @@ void bssn_class::Step_GPU(int lev, int YN)
#endif // PSTR == ?
//--------------------------With Shell--------------------------
// Note: SHStep() implementation is in bssn_gpu_class.C
#ifdef WithShell
#if 0
// This SHStep() implementation has been moved to bssn_gpu_class.C to avoid duplicate definition
void bssn_class::SHStep()
{
int lev = 0;
@@ -1938,5 +1941,5 @@ void bssn_class::SHStep()
sPp = sPp->next;
}
}
d
#endif // #if 0
#endif // withshell

View File

@@ -69,7 +69,6 @@
fy = ZEO
fz = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -152,7 +151,6 @@
fx = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -220,7 +218,6 @@
fy = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -285,7 +282,6 @@
fz = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -375,7 +371,6 @@
fxz = ZEO
fyz = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -474,7 +469,6 @@
fxx = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -537,7 +531,6 @@
fyy = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -601,7 +594,6 @@
fzz = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -665,7 +657,6 @@
fxy = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -728,7 +719,6 @@
fxz = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -790,7 +780,6 @@
fyz = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -877,7 +866,6 @@
fxz = ZEO
fyz = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -1009,7 +997,6 @@
fy = ZEO
fz = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -1164,7 +1151,6 @@
fx = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -1241,7 +1227,6 @@
fy = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -1312,7 +1297,6 @@
fz = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -1417,7 +1401,6 @@
fxz = ZEO
fyz = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -1593,7 +1576,6 @@
fxx = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -1661,7 +1643,6 @@
fyy = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -1731,7 +1712,6 @@
fzz = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -1801,7 +1781,6 @@
fxy = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -1872,7 +1851,6 @@
fxz = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -1941,7 +1919,6 @@
fyz = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -2034,7 +2011,6 @@
fy = ZEO
fz = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -2151,7 +2127,6 @@
fx = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -2237,7 +2212,6 @@
fy = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -2314,7 +2288,6 @@
fz = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -2433,7 +2406,6 @@
fxz = ZEO
fyz = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -2621,7 +2593,6 @@
fxx = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -2694,7 +2665,6 @@
fyy = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -2770,7 +2740,6 @@
fzz = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -2846,7 +2815,6 @@
fxy = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -2927,7 +2895,6 @@
fxz = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -3006,7 +2973,6 @@
fyz = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -3114,7 +3080,6 @@
fy = ZEO
fz = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -3251,7 +3216,6 @@
fx = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -3347,7 +3311,6 @@
fy = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -3432,7 +3395,6 @@
fz = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -3568,7 +3530,6 @@
fxz = ZEO
fyz = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -3841,7 +3802,6 @@
fxx = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -3923,7 +3883,6 @@
fyy = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -4008,7 +3967,6 @@
fzz = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -4093,7 +4051,6 @@
fxy = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -4196,7 +4153,6 @@
fxz = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1
@@ -4297,7 +4253,6 @@
fyz = ZEO
!$omp parallel do collapse(3) schedule(static)
do k=1,ex(3)-1
do j=1,ex(2)-1
do i=1,ex(1)-1

View File

@@ -1259,7 +1259,7 @@ end subroutine d2dump
end subroutine polin3
!--------------------------------------------------------------------------------------
! calculate L2norm
! calculate L2norm
subroutine l2normhelper(ex, X, Y, Z,xmin,ymin,zmin,xmax,ymax,zmax,&
f,f_out,gw)
@@ -1276,9 +1276,7 @@ end subroutine d2dump
real*8 :: dX, dY, dZ
integer::imin,jmin,kmin
integer::imax,jmax,kmax
integer::i,j,k,n_elements
real*8, dimension(:), allocatable :: f_flat
real*8, external :: DDOT
integer::i,j,k
dX = X(2) - X(1)
dY = Y(2) - Y(1)
@@ -1302,12 +1300,7 @@ if(dabs(X(1)-xmin) < dX) imin = 1
if(dabs(Y(1)-ymin) < dY) jmin = 1
if(dabs(Z(1)-zmin) < dZ) kmin = 1
! Optimized with oneMKL BLAS DDOT for dot product
n_elements = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1)
allocate(f_flat(n_elements))
f_flat = reshape(f(imin:imax,jmin:jmax,kmin:kmax), [n_elements])
f_out = DDOT(n_elements, f_flat, 1, f_flat, 1)
deallocate(f_flat)
f_out = sum(f(imin:imax,jmin:jmax,kmin:kmax)*f(imin:imax,jmin:jmax,kmin:kmax))
f_out = f_out*dX*dY*dZ
@@ -1332,9 +1325,7 @@ f_out = f_out*dX*dY*dZ
real*8 :: dX, dY, dZ
integer::imin,jmin,kmin
integer::imax,jmax,kmax
integer::i,j,k,n_elements
real*8, dimension(:), allocatable :: f_flat
real*8, external :: DDOT
integer::i,j,k
real*8 :: PIo4
@@ -1397,12 +1388,7 @@ if(Symmetry==2)then
if(dabs(ymin+gw*dY)<dY.and.Y(1)<0.d0) jmin = gw+1
endif
! Optimized with oneMKL BLAS DDOT for dot product
n_elements = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1)
allocate(f_flat(n_elements))
f_flat = reshape(f(imin:imax,jmin:jmax,kmin:kmax), [n_elements])
f_out = DDOT(n_elements, f_flat, 1, f_flat, 1)
deallocate(f_flat)
f_out = sum(f(imin:imax,jmin:jmax,kmin:kmax)*f(imin:imax,jmin:jmax,kmin:kmax))
f_out = f_out*dX*dY*dZ
@@ -1430,8 +1416,6 @@ f_out = f_out*dX*dY*dZ
integer::imin,jmin,kmin
integer::imax,jmax,kmax
integer::i,j,k
real*8, dimension(:), allocatable :: f_flat
real*8, external :: DDOT
real*8 :: PIo4
@@ -1494,12 +1478,11 @@ if(Symmetry==2)then
if(dabs(ymin+gw*dY)<dY.and.Y(1)<0.d0) jmin = gw+1
endif
! Optimized with oneMKL BLAS DDOT for dot product
f_out = sum(f(imin:imax,jmin:jmax,kmin:kmax)*f(imin:imax,jmin:jmax,kmin:kmax))
f_out = f_out
Nout = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1)
allocate(f_flat(Nout))
f_flat = reshape(f(imin:imax,jmin:jmax,kmin:kmax), [Nout])
f_out = DDOT(Nout, f_flat, 1, f_flat, 1)
deallocate(f_flat)
return
@@ -1697,7 +1680,6 @@ deallocate(f_flat)
real*8, dimension(ORDN,ORDN) :: tmp2
real*8, dimension(ORDN) :: tmp1
real*8, dimension(3) :: SoAh
real*8, external :: DDOT
! +1 because c++ gives 0 for first point
cxB = inds+1
@@ -1733,21 +1715,20 @@ deallocate(f_flat)
ya=fh(cxB(1):cxT(1),cxB(2):cxT(2),cxB(3):cxT(3))
endif
! Optimized with BLAS operations for better performance
! First dimension: z-direction weighted sum
tmp2=0
do m=1,ORDN
tmp2 = tmp2 + coef(2*ORDN+m)*ya(:,:,m)
enddo
! Second dimension: y-direction weighted sum
tmp1=0
do m=1,ORDN
tmp1 = tmp1 + coef(ORDN+m)*tmp2(:,m)
enddo
! Third dimension: x-direction weighted sum using BLAS DDOT
f_int = DDOT(ORDN, coef(1:ORDN), 1, tmp1, 1)
f_int=0
do m=1,ORDN
f_int = f_int + coef(m)*tmp1(m)
enddo
return
@@ -1777,7 +1758,6 @@ deallocate(f_flat)
real*8, dimension(ORDN,ORDN) :: ya
real*8, dimension(ORDN) :: tmp1
real*8, dimension(2) :: SoAh
real*8, external :: DDOT
! +1 because c++ gives 0 for first point
cxB = inds(1:2)+1
@@ -1807,14 +1787,15 @@ deallocate(f_flat)
ya=fh(cxB(1):cxT(1),cxB(2):cxT(2),inds(3))
endif
! Optimized with BLAS operations
tmp1=0
do m=1,ORDN
tmp1 = tmp1 + coef(ORDN+m)*ya(:,m)
enddo
! Use BLAS DDOT for final weighted sum
f_int = DDOT(ORDN, coef(1:ORDN), 1, tmp1, 1)
f_int=0
do m=1,ORDN
f_int = f_int + coef(m)*tmp1(m)
enddo
return
@@ -1845,7 +1826,6 @@ deallocate(f_flat)
real*8, dimension(ORDN) :: ya
real*8 :: SoAh
integer,dimension(3) :: inds
real*8, external :: DDOT
! +1 because c++ gives 0 for first point
inds = indsi + 1
@@ -1906,8 +1886,10 @@ deallocate(f_flat)
write(*,*)"error in global_interpind1d, not recognized dumyd = ",dumyd
endif
! Optimized with BLAS DDOT for weighted sum
f_int = DDOT(ORDN, coef, 1, ya, 1)
f_int=0
do m=1,ORDN
f_int = f_int + coef(m)*ya(m)
enddo
return
@@ -2139,38 +2121,24 @@ deallocate(f_flat)
end function fWigner_d_function
!----------------------------------
! Optimized factorial function using lookup table for small N
! and log-gamma for large N to avoid overflow
function ffact(N) result(gont)
implicit none
integer,intent(in) :: N
real*8 :: gont
integer :: i
! Lookup table for factorials 0! to 20! (precomputed)
real*8, parameter, dimension(0:20) :: fact_table = [ &
1.d0, 1.d0, 2.d0, 6.d0, 24.d0, 120.d0, 720.d0, 5040.d0, 40320.d0, &
362880.d0, 3628800.d0, 39916800.d0, 479001600.d0, 6227020800.d0, &
87178291200.d0, 1307674368000.d0, 20922789888000.d0, &
355687428096000.d0, 6402373705728000.d0, 121645100408832000.d0, &
2432902008176640000.d0 ]
integer :: i
! sanity check
if(N < 0)then
write(*,*) "ffact: error input for factorial"
gont = 1.d0
return
endif
! Use lookup table for small N (fast path)
if(N <= 20)then
gont = fact_table(N)
else
! Use log-gamma function for large N: N! = exp(log_gamma(N+1))
! This avoids overflow and is computed efficiently
gont = exp(log_gamma(dble(N+1)))
endif
gont = 1.d0
do i=1,N
gont = gont*i
enddo
return

View File

@@ -16,66 +16,115 @@ using namespace std;
#include <string.h>
#include <math.h>
#endif
// Intel oneMKL LAPACK interface
#include <mkl_lapacke.h>
/* Linear equation solution using Intel oneMKL LAPACK.
/* Linear equation solution by Gauss-Jordan elimination.
a[0..n-1][0..n-1] is the input matrix. b[0..n-1] is input
containing the right-hand side vectors. On output a is
replaced by its matrix inverse, and b is replaced by the
corresponding set of solution vectors.
Mathematical equivalence:
Solves: A * x = b => x = A^(-1) * b
Original Gauss-Jordan and LAPACK dgesv/dgetri produce identical results
within numerical precision. */
corresponding set of solution vectors */
int gaussj(double *a, double *b, int n)
{
// Allocate pivot array and workspace
lapack_int *ipiv = new lapack_int[n];
lapack_int info;
double swap;
// Make a copy of matrix a for solving (dgesv modifies it to LU form)
double *a_copy = new double[n * n];
for (int i = 0; i < n * n; i++) {
a_copy[i] = a[i];
int *indxc, *indxr, *ipiv;
indxc = new int[n];
indxr = new int[n];
ipiv = new int[n];
int i, icol, irow, j, k, l, ll;
double big, dum, pivinv, temp;
for (j = 0; j < n; j++)
ipiv[j] = 0;
for (i = 0; i < n; i++)
{
big = 0.0;
for (j = 0; j < n; j++)
if (ipiv[j] != 1)
for (k = 0; k < n; k++)
{
if (ipiv[k] == 0)
{
if (fabs(a[j * n + k]) >= big)
{
big = fabs(a[j * n + k]);
irow = j;
icol = k;
}
}
else if (ipiv[k] > 1)
{
cout << "gaussj: Singular Matrix-1" << endl;
for (int ii = 0; ii < n; ii++)
{
for (int jj = 0; jj < n; jj++)
cout << a[ii * n + jj] << " ";
cout << endl;
}
return 1; // error return
}
}
ipiv[icol] = ipiv[icol] + 1;
if (irow != icol)
{
for (l = 0; l < n; l++)
{
swap = a[irow * n + l];
a[irow * n + l] = a[icol * n + l];
a[icol * n + l] = swap;
}
swap = b[irow];
b[irow] = b[icol];
b[icol] = swap;
}
indxr[i] = irow;
indxc[i] = icol;
if (a[icol * n + icol] == 0.0)
{
cout << "gaussj: Singular Matrix-2" << endl;
for (int ii = 0; ii < n; ii++)
{
for (int jj = 0; jj < n; jj++)
cout << a[ii * n + jj] << " ";
cout << endl;
}
return 1; // error return
}
pivinv = 1.0 / a[icol * n + icol];
a[icol * n + icol] = 1.0;
for (l = 0; l < n; l++)
a[icol * n + l] *= pivinv;
b[icol] *= pivinv;
for (ll = 0; ll < n; ll++)
if (ll != icol)
{
dum = a[ll * n + icol];
a[ll * n + icol] = 0.0;
for (l = 0; l < n; l++)
a[ll * n + l] -= a[icol * n + l] * dum;
b[ll] -= b[icol] * dum;
}
}
// Step 1: Solve linear system A*x = b using LU decomposition
// LAPACKE_dgesv uses column-major by default, but we use row-major
info = LAPACKE_dgesv(LAPACK_ROW_MAJOR, n, 1, a_copy, n, ipiv, b, 1);
if (info != 0) {
cout << "gaussj: Singular Matrix (dgesv info=" << info << ")" << endl;
delete[] ipiv;
delete[] a_copy;
return 1;
}
// Step 2: Compute matrix inverse A^(-1) using LU factorization
// First do LU factorization of original matrix a
info = LAPACKE_dgetrf(LAPACK_ROW_MAJOR, n, n, a, n, ipiv);
if (info != 0) {
cout << "gaussj: Singular Matrix (dgetrf info=" << info << ")" << endl;
delete[] ipiv;
delete[] a_copy;
return 1;
}
// Then compute inverse from LU factorization
info = LAPACKE_dgetri(LAPACK_ROW_MAJOR, n, a, n, ipiv);
if (info != 0) {
cout << "gaussj: Singular Matrix (dgetri info=" << info << ")" << endl;
delete[] ipiv;
delete[] a_copy;
return 1;
for (l = n - 1; l >= 0; l--)
{
if (indxr[l] != indxc[l])
for (k = 0; k < n; k++)
{
swap = a[k * n + indxr[l]];
a[k * n + indxr[l]] = a[k * n + indxc[l]];
a[k * n + indxc[l]] = swap;
}
}
delete[] indxc;
delete[] indxr;
delete[] ipiv;
delete[] a_copy;
return 0;
}

View File

@@ -512,10 +512,11 @@
IMPLICIT DOUBLE PRECISION (A-H,O-Z)
DIMENSION V(N),W(N)
! SUBROUTINE TO COMPUTE DOUBLE PRECISION VECTOR DOT PRODUCT.
! Optimized using Intel oneMKL BLAS ddot
! Mathematical equivalence: DGVV = sum_{i=1}^{N} V(i)*W(i)
DOUBLE PRECISION, EXTERNAL :: DDOT
DGVV = DDOT(N, V, 1, W, 1)
SUM = 0.0D0
DO 10 I = 1,N
SUM = SUM + V(I)*W(I)
10 CONTINUE
DGVV = SUM
RETURN
END

View File

@@ -2,7 +2,7 @@
#ifndef MICRODEF_H
#define MICRODEF_H
#include "macrodef.fh"
#include "microdef.fh"
// application parameters

View File

@@ -1,32 +1,22 @@
## GCC version (commented out)
## filein = -I/usr/include -I/usr/lib/x86_64-linux-gnu/mpich/include -I/usr/lib/x86_64-linux-gnu/openmpi/lib/ -I/usr/lib/gcc/x86_64-linux-gnu/11/ -I/usr/include/c++/11/
## filein = -I/usr/include/ -I/usr/include/openmpi-x86_64/ -I/usr/lib/x86_64-linux-gnu/openmpi/include/ -I/usr/lib/x86_64-linux-gnu/openmpi/lib/ -I/usr/lib/gcc/x86_64-linux-gnu/11/ -I/usr/include/c++/11/
## LDLIBS = -L/usr/lib/x86_64-linux-gnu -L/usr/lib64 -L/usr/lib/gcc/x86_64-linux-gnu/11 -lgfortran -lmpi -lgfortran
## Intel oneAPI version with oneMKL (Optimized for performance)
filein = -I/usr/include/ -I${MKLROOT}/include
filein = -I/usr/include -I/usr/include/openmpi-x86_64 -I/usr/lib/gcc/x86_64-linux-gnu/11/ -I/usr/include/c++/11/
## Using sequential MKL (OpenMP disabled for better single-threaded performance)
## Added -lifcore for Intel Fortran runtime and -limf for Intel math library
LDLIBS = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore -limf -lpthread -lm -ldl
##filein = -I/usr/include/ -I/usr/lib/x86_64-linux-gnu/openmpi/include/ -I/usr/lib/x86_64-linux-gnu/openmpi/lib/ -I/usr/lib/gcc/x86_64-linux-gnu/11/ -I/usr/include/c++/11/ -I/usr/lib/cuda/include
## Aggressive optimization flags:
## -O3: Maximum optimization
## -xHost: Optimize for the host CPU architecture (Intel/AMD compatible)
## -fp-model fast=2: Aggressive floating-point optimizations
## -fma: Enable fused multiply-add instructions
## Note: OpenMP enabled for hybrid MPI+OpenMP
CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -qopenmp \
-Dfortran3 -Dnewc -I${MKLROOT}/include
f90appflags = -O3 -xHost -fp-model fast=2 -fma -qopenmp \
-fpp -I${MKLROOT}/include
f90 = ifx
f77 = ifx
CXX = icpx
CC = icx
CLINKER = mpiicpx -qopenmp
LDLIBS = -L/usr/lib64/openmpi/lib -Wl,-rpath,/usr/lib64/openmpi/lib -lmpi -lgfortran -L/usr/local/cuda-13.1/lib64 -Wl,-rpath,/usr/local/cuda-13.1/lib64 -lcudart -lcuda
##LDLIBS = -L/usr/lib/x86_64-linux-gnu -L/usr/lib64 -L/usr/lib/gcc/x86_64-linux-gnu/11 -lgfortran -L/usr/lib/cuda/lib64 -lcudart -lmpi -lgfortran
Cu = nvcc
CUDA_LIB_PATH = -L/usr/lib/cuda/lib64 -I/usr/include -I/usr/lib/cuda/include
CXXAPPFLAGS = -O3 -Wno-deprecated -Dfortran3 -Dnewc
#f90appflags = -O3 -fpp
f90appflags = -O3 -x f95-cpp-input
f90 = gfortran
f77 = gfortran
CXX = g++
CC = gcc
CLINKER = mpic++
Cu = /usr/local/cuda-13.1/bin/nvcc
CUDA_LIB_PATH = -L/usr/local/cuda-13.1/lib64 -I/usr/include -I/usr/local/cuda-13.1/include
#CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -arch compute_13 -code compute_13,sm_13 -Dfortran3 -Dnewc
CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -Dfortran3 -Dnewc
# RTX 4050 uses Ada Lovelace architecture (compute capability 8.9)
CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -arch=sm_89 -Dfortran3 -Dnewc

View File

@@ -11,15 +11,6 @@
import AMSS_NCKU_Input as input_data
import subprocess
## CPU core binding configuration
## Removed hardcoded taskset to allow full utilization of 96 cores via MPI+OpenMP
NUMACTL_CPU_BIND = ""
## Build parallelism configuration
## Use nohz_full cores (4-55, 60-111) for compilation: 52 + 52 = 104 cores
## Set make -j to utilize available cores for faster builds
BUILD_JOBS = 96
##################################################################
@@ -37,9 +28,9 @@ def makefile_ABE():
## Build command
if (input_data.GPU_Calculation == "no"):
makefile_command = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} ABE"
makefile_command = "make -j4" + " ABE"
elif (input_data.GPU_Calculation == "yes"):
makefile_command = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} ABEGPU"
makefile_command = "make -j4" + " ABEGPU"
else:
print( " CPU/GPU numerical calculation setting is wrong " )
print( )
@@ -77,7 +68,7 @@ def makefile_TwoPunctureABE():
print( )
## Build command
makefile_command = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} TwoPunctureABE"
makefile_command = "make" + " TwoPunctureABE"
## Execute the command with subprocess.Popen and stream output
makefile_process = subprocess.Popen(makefile_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
@@ -111,23 +102,13 @@ def run_ABE():
print( " Running the AMSS-NCKU executable file ABE/ABEGPU " )
print( )
## Calculate OMP_NUM_THREADS
## User has 96 cores. Calculate threads per MPI process.
total_physical_cores = 96
omp_num_threads = total_physical_cores // input_data.MPI_processes
if omp_num_threads < 1:
omp_num_threads = 1
print( f" Configuration: {input_data.MPI_processes} MPI processes, {omp_num_threads} OpenMP threads per process." )
print( f" Total cores utilized: {input_data.MPI_processes * omp_num_threads}" )
## Define the command to run; cast other values to strings as needed
if (input_data.GPU_Calculation == "no"):
mpi_command = f"{NUMACTL_CPU_BIND} mpirun -genv OMP_NUM_THREADS {omp_num_threads} -np {input_data.MPI_processes} ./ABE"
mpi_command = "mpirun -np " + str(input_data.MPI_processes) + " ./ABE"
mpi_command_outfile = "ABE_out.log"
elif (input_data.GPU_Calculation == "yes"):
mpi_command = f"{NUMACTL_CPU_BIND} mpirun -genv OMP_NUM_THREADS {omp_num_threads} -np {input_data.MPI_processes} ./ABEGPU"
mpi_command = "mpirun -np " + str(input_data.MPI_processes) + " ./ABEGPU"
mpi_command_outfile = "ABEGPU_out.log"
## Execute the MPI command and stream output
@@ -166,7 +147,7 @@ def run_TwoPunctureABE():
print( )
## Define the command to run
TwoPuncture_command = NUMACTL_CPU_BIND + " ./TwoPunctureABE"
TwoPuncture_command = "./TwoPunctureABE"
TwoPuncture_command_outfile = "TwoPunctureABE_out.log"
## Execute the command with subprocess.Popen and stream output