Compare commits
36 Commits
cjy-oneapi
...
hxh-omp
| Author | SHA1 | Date | |
|---|---|---|---|
| e11363e06e | |||
| f70e90f694 | |||
|
|
75dd5353b0 | ||
|
|
23a82d063b | ||
| 44efb2e08c | |||
| 16013081e0 | |||
| 03416a7b28 | |||
| cca3c16c2b | |||
| e5231849ee | |||
| a766e49ff0 | |||
| 1a518cd3f6 | |||
| 1dc622e516 | |||
| 3046a0ccde | |||
| d4ec69c98a | |||
| 2c0a3055d4 | |||
| 1eba73acbe | |||
| b91cfff301 | |||
| e29ca2dca9 | |||
| 6493101ca0 | |||
| 169986cde1 | |||
| 1fbc213888 | |||
| 6024708a48 | |||
| bc457d981e | |||
| 51dead090e | |||
| 34d6922a66 | |||
| 8010ad27ed | |||
| 38e691f013 | |||
| 808387aa11 | |||
| c2b676abf2 | |||
| 2c60533501 | |||
| 318b5254cc | |||
| 3cee05f262 | |||
| e6329b013d | |||
| 2791d2e225 | |||
| 72ce153e48 | |||
|
|
79af79d471 |
@@ -1,9 +1,13 @@
|
|||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
"""
|
"""
|
||||||
AMSS-NCKU GW150914 Simulation Regression Test Script
|
AMSS-NCKU GW150914 Simulation Regression Test Script (Comprehensive Version)
|
||||||
|
|
||||||
Verification Requirements:
|
Verification Requirements:
|
||||||
1. XY-plane trajectory RMS error < 1% (Optimized vs. baseline, max of BH1 and BH2)
|
1. RMS errors < 1% for:
|
||||||
|
- 3D Vector Total RMS
|
||||||
|
- X Component RMS
|
||||||
|
- Y Component RMS
|
||||||
|
- Z Component RMS
|
||||||
2. ADM constraint violation < 2 (Grid Level 0)
|
2. ADM constraint violation < 2 (Grid Level 0)
|
||||||
|
|
||||||
RMS Calculation Method:
|
RMS Calculation Method:
|
||||||
@@ -57,79 +61,62 @@ def load_constraint_data(filepath):
|
|||||||
data.append([float(x) for x in parts[:8]])
|
data.append([float(x) for x in parts[:8]])
|
||||||
return np.array(data)
|
return np.array(data)
|
||||||
|
|
||||||
|
def calculate_all_rms_errors(bh_data_ref, bh_data_target):
|
||||||
def calculate_rms_error(bh_data_ref, bh_data_target):
|
|
||||||
"""
|
"""
|
||||||
Calculate trajectory-based RMS error on the XY plane between baseline and optimized simulations.
|
Calculate 3D Vector RMS and component-wise RMS (X, Y, Z) independently.
|
||||||
|
Uses r = sqrt(x^2 + y^2) as the denominator for all error normalizations.
|
||||||
This function computes the RMS error independently for BH1 and BH2 trajectories,
|
Returns the maximum error between BH1 and BH2 for each category.
|
||||||
then returns the maximum of the two as the final RMS error metric.
|
|
||||||
|
|
||||||
For each black hole, the RMS is calculated as:
|
|
||||||
RMS = sqrt( (1/M) * sum( (Δr_i / r_i^max)^2 ) ) × 100%
|
|
||||||
|
|
||||||
where:
|
|
||||||
Δr_i = sqrt((x_ref,i - x_new,i)^2 + (y_ref,i - y_new,i)^2)
|
|
||||||
r_i^max = max(sqrt(x_ref,i^2 + y_ref,i^2), sqrt(x_new,i^2 + y_new,i^2))
|
|
||||||
|
|
||||||
Args:
|
|
||||||
bh_data_ref: Reference (baseline) trajectory data
|
|
||||||
bh_data_target: Target (optimized) trajectory data
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
rms_value: Final RMS error as a percentage (max of BH1 and BH2)
|
|
||||||
error: Error message if any
|
|
||||||
"""
|
"""
|
||||||
# Align data: truncate to the length of the shorter dataset
|
|
||||||
M = min(len(bh_data_ref['time']), len(bh_data_target['time']))
|
M = min(len(bh_data_ref['time']), len(bh_data_target['time']))
|
||||||
|
|
||||||
if M < 10:
|
if M < 10:
|
||||||
return None, "Insufficient data points for comparison"
|
return None, "Insufficient data points for comparison"
|
||||||
|
|
||||||
# Extract XY coordinates for both black holes
|
results = {}
|
||||||
x1_ref = bh_data_ref['x1'][:M]
|
|
||||||
y1_ref = bh_data_ref['y1'][:M]
|
|
||||||
x2_ref = bh_data_ref['x2'][:M]
|
|
||||||
y2_ref = bh_data_ref['y2'][:M]
|
|
||||||
|
|
||||||
x1_new = bh_data_target['x1'][:M]
|
for bh in ['1', '2']:
|
||||||
y1_new = bh_data_target['y1'][:M]
|
x_r, y_r, z_r = bh_data_ref[f'x{bh}'][:M], bh_data_ref[f'y{bh}'][:M], bh_data_ref[f'z{bh}'][:M]
|
||||||
x2_new = bh_data_target['x2'][:M]
|
x_n, y_n, z_n = bh_data_target[f'x{bh}'][:M], bh_data_target[f'y{bh}'][:M], bh_data_target[f'z{bh}'][:M]
|
||||||
y2_new = bh_data_target['y2'][:M]
|
|
||||||
|
|
||||||
# Calculate RMS for BH1
|
# 核心修改:根据组委会的邮件指示,分母统一使用 r = sqrt(x^2 + y^2)
|
||||||
delta_r1 = np.sqrt((x1_ref - x1_new)**2 + (y1_ref - y1_new)**2)
|
r_ref = np.sqrt(x_r**2 + y_r**2)
|
||||||
r1_ref = np.sqrt(x1_ref**2 + y1_ref**2)
|
r_new = np.sqrt(x_n**2 + y_n**2)
|
||||||
r1_new = np.sqrt(x1_new**2 + y1_new**2)
|
denom_max = np.maximum(r_ref, r_new)
|
||||||
r1_max = np.maximum(r1_ref, r1_new)
|
|
||||||
|
|
||||||
# Calculate RMS for BH2
|
valid = denom_max > 1e-15
|
||||||
delta_r2 = np.sqrt((x2_ref - x2_new)**2 + (y2_ref - y2_new)**2)
|
if np.sum(valid) < 10:
|
||||||
r2_ref = np.sqrt(x2_ref**2 + y2_ref**2)
|
results[f'BH{bh}'] = { '3D_Vector': 0.0, 'X_Component': 0.0, 'Y_Component': 0.0, 'Z_Component': 0.0 }
|
||||||
r2_new = np.sqrt(x2_new**2 + y2_new**2)
|
continue
|
||||||
r2_max = np.maximum(r2_ref, r2_new)
|
|
||||||
|
|
||||||
# Avoid division by zero for BH1
|
def calc_rms(delta):
|
||||||
valid_mask1 = r1_max > 1e-15
|
# 将对应分量的偏差除以统一的轨道半径分母 denom_max
|
||||||
if np.sum(valid_mask1) < 10:
|
return np.sqrt(np.mean((delta[valid] / denom_max[valid])**2)) * 100
|
||||||
return None, "Insufficient valid data points for BH1"
|
|
||||||
|
|
||||||
terms1 = (delta_r1[valid_mask1] / r1_max[valid_mask1])**2
|
# 1. Total 3D Vector RMS
|
||||||
rms_bh1 = np.sqrt(np.mean(terms1)) * 100
|
delta_vec = np.sqrt((x_r - x_n)**2 + (y_r - y_n)**2 + (z_r - z_n)**2)
|
||||||
|
rms_3d = calc_rms(delta_vec)
|
||||||
|
|
||||||
# Avoid division by zero for BH2
|
# 2. Component-wise RMS (分离计算各轴,但共用半径分母)
|
||||||
valid_mask2 = r2_max > 1e-15
|
rms_x = calc_rms(np.abs(x_r - x_n))
|
||||||
if np.sum(valid_mask2) < 10:
|
rms_y = calc_rms(np.abs(y_r - y_n))
|
||||||
return None, "Insufficient valid data points for BH2"
|
rms_z = calc_rms(np.abs(z_r - z_n))
|
||||||
|
|
||||||
terms2 = (delta_r2[valid_mask2] / r2_max[valid_mask2])**2
|
results[f'BH{bh}'] = {
|
||||||
rms_bh2 = np.sqrt(np.mean(terms2)) * 100
|
'3D_Vector': rms_3d,
|
||||||
|
'X_Component': rms_x,
|
||||||
|
'Y_Component': rms_y,
|
||||||
|
'Z_Component': rms_z
|
||||||
|
}
|
||||||
|
|
||||||
# Final RMS is the maximum of BH1 and BH2
|
# 获取 BH1 和 BH2 中的最大误差
|
||||||
rms_final = max(rms_bh1, rms_bh2)
|
max_rms = {
|
||||||
|
'3D_Vector': max(results['BH1']['3D_Vector'], results['BH2']['3D_Vector']),
|
||||||
return rms_final, None
|
'X_Component': max(results['BH1']['X_Component'], results['BH2']['X_Component']),
|
||||||
|
'Y_Component': max(results['BH1']['Y_Component'], results['BH2']['Y_Component']),
|
||||||
|
'Z_Component': max(results['BH1']['Z_Component'], results['BH2']['Z_Component'])
|
||||||
|
}
|
||||||
|
|
||||||
|
return max_rms, None
|
||||||
|
|
||||||
def analyze_constraint_violation(constraint_data, n_levels=9):
|
def analyze_constraint_violation(constraint_data, n_levels=9):
|
||||||
"""
|
"""
|
||||||
@@ -155,34 +142,32 @@ def analyze_constraint_violation(constraint_data, n_levels=9):
|
|||||||
|
|
||||||
|
|
||||||
def print_header():
|
def print_header():
|
||||||
"""Print report header"""
|
|
||||||
print("\n" + Color.BLUE + Color.BOLD + "=" * 65 + Color.RESET)
|
print("\n" + Color.BLUE + Color.BOLD + "=" * 65 + Color.RESET)
|
||||||
print(Color.BOLD + " AMSS-NCKU GW150914 Simulation Regression Test Report" + Color.RESET)
|
print(Color.BOLD + " AMSS-NCKU GW150914 Comprehensive Regression Test" + Color.RESET)
|
||||||
print(Color.BLUE + Color.BOLD + "=" * 65 + Color.RESET)
|
print(Color.BLUE + Color.BOLD + "=" * 65 + Color.RESET)
|
||||||
|
|
||||||
|
def print_rms_results(rms_dict, error, threshold=1.0):
|
||||||
def print_rms_results(rms_rel, error, threshold=1.0):
|
print(f"\n{Color.BOLD}1. RMS Error Analysis (Maximums of BH1 & BH2){Color.RESET}")
|
||||||
"""Print RMS error results"""
|
print("-" * 65)
|
||||||
print(f"\n{Color.BOLD}1. RMS Error Analysis (Baseline vs Optimized){Color.RESET}")
|
|
||||||
print("-" * 45)
|
|
||||||
|
|
||||||
if error:
|
if error:
|
||||||
print(f" {Color.RED}Error: {error}{Color.RESET}")
|
print(f" {Color.RED}Error: {error}{Color.RESET}")
|
||||||
return False
|
return False
|
||||||
|
|
||||||
passed = rms_rel < threshold
|
all_passed = True
|
||||||
|
print(f" Requirement: < {threshold}%\n")
|
||||||
|
|
||||||
print(f" RMS relative error: {rms_rel:.4f}%")
|
for key, val in rms_dict.items():
|
||||||
print(f" Requirement: < {threshold}%")
|
passed = val < threshold
|
||||||
print(f" Status: {get_status_text(passed)}")
|
all_passed = all_passed and passed
|
||||||
|
status = get_status_text(passed)
|
||||||
return passed
|
print(f" {key:15}: {val:8.4f}% | Status: {status}")
|
||||||
|
|
||||||
|
return all_passed
|
||||||
|
|
||||||
def print_constraint_results(results, threshold=2.0):
|
def print_constraint_results(results, threshold=2.0):
|
||||||
"""Print constraint violation results"""
|
|
||||||
print(f"\n{Color.BOLD}2. ADM Constraint Violation Analysis (Grid Level 0){Color.RESET}")
|
print(f"\n{Color.BOLD}2. ADM Constraint Violation Analysis (Grid Level 0){Color.RESET}")
|
||||||
print("-" * 45)
|
print("-" * 65)
|
||||||
|
|
||||||
names = ['Ham', 'Px', 'Py', 'Pz', 'Gx', 'Gy', 'Gz']
|
names = ['Ham', 'Px', 'Py', 'Pz', 'Gx', 'Gy', 'Gz']
|
||||||
for i, name in enumerate(names):
|
for i, name in enumerate(names):
|
||||||
@@ -200,7 +185,6 @@ def print_constraint_results(results, threshold=2.0):
|
|||||||
|
|
||||||
|
|
||||||
def print_summary(rms_passed, constraint_passed):
|
def print_summary(rms_passed, constraint_passed):
|
||||||
"""Print summary"""
|
|
||||||
print("\n" + Color.BLUE + Color.BOLD + "=" * 65 + Color.RESET)
|
print("\n" + Color.BLUE + Color.BOLD + "=" * 65 + Color.RESET)
|
||||||
print(Color.BOLD + "Verification Summary" + Color.RESET)
|
print(Color.BOLD + "Verification Summary" + Color.RESET)
|
||||||
print(Color.BLUE + Color.BOLD + "=" * 65 + Color.RESET)
|
print(Color.BLUE + Color.BOLD + "=" * 65 + Color.RESET)
|
||||||
@@ -210,7 +194,7 @@ def print_summary(rms_passed, constraint_passed):
|
|||||||
res_rms = get_status_text(rms_passed)
|
res_rms = get_status_text(rms_passed)
|
||||||
res_con = get_status_text(constraint_passed)
|
res_con = get_status_text(constraint_passed)
|
||||||
|
|
||||||
print(f" [1] RMS trajectory check: {res_rms}")
|
print(f" [1] Comprehensive RMS check: {res_rms}")
|
||||||
print(f" [2] ADM constraint check: {res_con}")
|
print(f" [2] ADM constraint check: {res_con}")
|
||||||
|
|
||||||
final_status = f"{Color.GREEN}{Color.BOLD}ALL CHECKS PASSED{Color.RESET}" if all_passed else f"{Color.RED}{Color.BOLD}SOME CHECKS FAILED{Color.RESET}"
|
final_status = f"{Color.GREEN}{Color.BOLD}ALL CHECKS PASSED{Color.RESET}" if all_passed else f"{Color.RED}{Color.BOLD}SOME CHECKS FAILED{Color.RESET}"
|
||||||
@@ -219,61 +203,48 @@ def print_summary(rms_passed, constraint_passed):
|
|||||||
|
|
||||||
return all_passed
|
return all_passed
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
# Determine target (optimized) output directory
|
|
||||||
if len(sys.argv) > 1:
|
if len(sys.argv) > 1:
|
||||||
target_dir = sys.argv[1]
|
target_dir = sys.argv[1]
|
||||||
else:
|
else:
|
||||||
script_dir = os.path.dirname(os.path.abspath(__file__))
|
script_dir = os.path.dirname(os.path.abspath(__file__))
|
||||||
target_dir = os.path.join(script_dir, "GW150914/AMSS_NCKU_output")
|
target_dir = os.path.join(script_dir, "GW150914/AMSS_NCKU_output")
|
||||||
|
|
||||||
# Determine reference (baseline) directory
|
|
||||||
script_dir = os.path.dirname(os.path.abspath(__file__))
|
script_dir = os.path.dirname(os.path.abspath(__file__))
|
||||||
reference_dir = os.path.join(script_dir, "GW150914-origin/AMSS_NCKU_output")
|
reference_dir = os.path.join(script_dir, "GW150914-origin/AMSS_NCKU_output")
|
||||||
|
|
||||||
# Data file paths
|
|
||||||
bh_file_ref = os.path.join(reference_dir, "bssn_BH.dat")
|
bh_file_ref = os.path.join(reference_dir, "bssn_BH.dat")
|
||||||
bh_file_target = os.path.join(target_dir, "bssn_BH.dat")
|
bh_file_target = os.path.join(target_dir, "bssn_BH.dat")
|
||||||
constraint_file = os.path.join(target_dir, "bssn_constraint.dat")
|
constraint_file = os.path.join(target_dir, "bssn_constraint.dat")
|
||||||
|
|
||||||
# Check if files exist
|
|
||||||
if not os.path.exists(bh_file_ref):
|
if not os.path.exists(bh_file_ref):
|
||||||
print(f"{Color.RED}{Color.BOLD}Error:{Color.RESET} Baseline trajectory file not found: {bh_file_ref}")
|
print(f"{Color.RED}{Color.BOLD}Error:{Color.RESET} Baseline trajectory file not found: {bh_file_ref}")
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
if not os.path.exists(bh_file_target):
|
if not os.path.exists(bh_file_target):
|
||||||
print(f"{Color.RED}{Color.BOLD}Error:{Color.RESET} Target trajectory file not found: {bh_file_target}")
|
print(f"{Color.RED}{Color.BOLD}Error:{Color.RESET} Target trajectory file not found: {bh_file_target}")
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
if not os.path.exists(constraint_file):
|
if not os.path.exists(constraint_file):
|
||||||
print(f"{Color.RED}{Color.BOLD}Error:{Color.RESET} Constraint data file not found: {constraint_file}")
|
print(f"{Color.RED}{Color.BOLD}Error:{Color.RESET} Constraint data file not found: {constraint_file}")
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
# Print header
|
|
||||||
print_header()
|
print_header()
|
||||||
print(f"\n{Color.BOLD}Reference (Baseline):{Color.RESET} {Color.BLUE}{reference_dir}{Color.RESET}")
|
print(f"\n{Color.BOLD}Reference (Baseline):{Color.RESET} {Color.BLUE}{reference_dir}{Color.RESET}")
|
||||||
print(f"{Color.BOLD}Target (Optimized): {Color.RESET} {Color.BLUE}{target_dir}{Color.RESET}")
|
print(f"{Color.BOLD}Target (Optimized): {Color.RESET} {Color.BLUE}{target_dir}{Color.RESET}")
|
||||||
|
|
||||||
# Load data
|
|
||||||
bh_data_ref = load_bh_trajectory(bh_file_ref)
|
bh_data_ref = load_bh_trajectory(bh_file_ref)
|
||||||
bh_data_target = load_bh_trajectory(bh_file_target)
|
bh_data_target = load_bh_trajectory(bh_file_target)
|
||||||
constraint_data = load_constraint_data(constraint_file)
|
constraint_data = load_constraint_data(constraint_file)
|
||||||
|
|
||||||
# Calculate RMS error
|
# Output modified RMS results
|
||||||
rms_rel, error = calculate_rms_error(bh_data_ref, bh_data_target)
|
rms_dict, error = calculate_all_rms_errors(bh_data_ref, bh_data_target)
|
||||||
rms_passed = print_rms_results(rms_rel, error)
|
rms_passed = print_rms_results(rms_dict, error)
|
||||||
|
|
||||||
# Analyze constraint violation
|
# Output constraint results
|
||||||
constraint_results = analyze_constraint_violation(constraint_data)
|
constraint_results = analyze_constraint_violation(constraint_data)
|
||||||
constraint_passed = print_constraint_results(constraint_results)
|
constraint_passed = print_constraint_results(constraint_results)
|
||||||
|
|
||||||
# Print summary
|
|
||||||
all_passed = print_summary(rms_passed, constraint_passed)
|
all_passed = print_summary(rms_passed, constraint_passed)
|
||||||
|
|
||||||
# Return exit code
|
|
||||||
sys.exit(0 if all_passed else 1)
|
sys.exit(0 if all_passed else 1)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
main()
|
main()
|
||||||
|
|||||||
@@ -39,7 +39,6 @@ int f_compute_rhs_bssn(int *ex, double &T,
|
|||||||
// printf("nx=%d ny=%d nz=%d all=%d\n", nx, ny, nz, all);
|
// printf("nx=%d ny=%d nz=%d all=%d\n", nx, ny, nz, all);
|
||||||
|
|
||||||
// temp variable
|
// temp variable
|
||||||
double gxx[all],gyy[all],gzz[all];
|
|
||||||
double chix[all],chiy[all],chiz[all];
|
double chix[all],chiy[all],chiz[all];
|
||||||
double gxxx[all],gxyx[all],gxzx[all],gyyx[all],gyzx[all],gzzx[all];
|
double gxxx[all],gxyx[all],gxzx[all],gyyx[all],gyzx[all],gzzx[all];
|
||||||
double gxxy[all],gxyy[all],gxzy[all],gyyy[all],gyzy[all],gzzy[all];
|
double gxxy[all],gxyy[all],gxzy[all],gyyy[all],gyzy[all],gzzy[all];
|
||||||
@@ -51,9 +50,9 @@ int f_compute_rhs_bssn(int *ex, double &T,
|
|||||||
double Gamxx[all],Gamxy[all],Gamxz[all];
|
double Gamxx[all],Gamxy[all],Gamxz[all];
|
||||||
double Gamyx[all],Gamyy[all],Gamyz[all];
|
double Gamyx[all],Gamyy[all],Gamyz[all];
|
||||||
double Gamzx[all],Gamzy[all],Gamzz[all];
|
double Gamzx[all],Gamzy[all],Gamzz[all];
|
||||||
double Kx[all], Ky[all], Kz[all], div_beta[all], S[all];
|
double Kx[all], Ky[all], Kz[all], S[all];
|
||||||
double f[all], fxx[all], fxy[all], fxz[all], fyy[all], fyz[all], fzz[all];
|
double f[all], fxx[all], fxy[all], fxz[all], fyy[all], fyz[all], fzz[all];
|
||||||
double Gamxa[all], Gamya[all], Gamza[all], alpn1[all], chin1[all];
|
double alpn1[all], chin1[all];
|
||||||
double gupxx[all], gupxy[all], gupxz[all];
|
double gupxx[all], gupxy[all], gupxz[all];
|
||||||
double gupyy[all], gupyz[all], gupzz[all];
|
double gupyy[all], gupyz[all], gupzz[all];
|
||||||
double SSS[3] = { 1.0, 1.0, 1.0};
|
double SSS[3] = { 1.0, 1.0, 1.0};
|
||||||
@@ -107,9 +106,6 @@ int f_compute_rhs_bssn(int *ex, double &T,
|
|||||||
for(int i=0;i<all;i+=1){
|
for(int i=0;i<all;i+=1){
|
||||||
alpn1[i] = Lap[i] + 1.0;
|
alpn1[i] = Lap[i] + 1.0;
|
||||||
chin1[i] = chi[i] + 1.0;
|
chin1[i] = chi[i] + 1.0;
|
||||||
gxx[i] = dxx[i] + 1.0;
|
|
||||||
gyy[i] = dyy[i] + 1.0;
|
|
||||||
gzz[i] = dzz[i] + 1.0;
|
|
||||||
}
|
}
|
||||||
// 9ms //
|
// 9ms //
|
||||||
fderivs(ex,betax,betaxx,betaxy,betaxz,X,Y,Z,ANTI, SYM, SYM,Symmetry,Lev);
|
fderivs(ex,betax,betaxx,betaxy,betaxz,X,Y,Z,ANTI, SYM, SYM,Symmetry,Lev);
|
||||||
@@ -127,231 +123,196 @@ int f_compute_rhs_bssn(int *ex, double &T,
|
|||||||
|
|
||||||
// 3ms //
|
// 3ms //
|
||||||
for(int i=0;i<all;i+=1){
|
for(int i=0;i<all;i+=1){
|
||||||
div_beta[i] = betaxx[i] + betayy[i] + betazz[i];
|
const double divb = betaxx[i] + betayy[i] + betazz[i];
|
||||||
chi_rhs[i] = F2o3 * chin1[i] * (alpn1[i] * trK[i] - div_beta[i]);
|
chi_rhs[i] = F2o3 * chin1[i] * (alpn1[i] * trK[i] - divb);
|
||||||
gxx_rhs[i] = -TWO * alpn1[i] * Axx[i] - F2o3 * gxx[i] * div_beta[i] +
|
gxx_rhs[i] = -TWO * alpn1[i] * Axx[i] - F2o3 * (dxx[i] + ONE) * divb +
|
||||||
TWO * (gxx[i] * betaxx[i] + gxy[i] * betayx[i] + gxz[i] * betazx[i]);
|
TWO * ((dxx[i] + ONE) * betaxx[i] + gxy[i] * betayx[i] + gxz[i] * betazx[i]);
|
||||||
gyy_rhs[i] = -TWO * alpn1[i] * Ayy[i] - F2o3 * gyy[i] * div_beta[i] +
|
gyy_rhs[i] = -TWO * alpn1[i] * Ayy[i] - F2o3 * (dyy[i] + ONE) * divb +
|
||||||
TWO * (gxy[i] * betaxy[i] + gyy[i] * betayy[i] + gyz[i] * betazy[i]);
|
TWO * (gxy[i] * betaxy[i] + (dyy[i] + ONE) * betayy[i] + gyz[i] * betazy[i]);
|
||||||
gzz_rhs[i] = -TWO * alpn1[i] * Azz[i] - F2o3 * gzz[i] * div_beta[i] +
|
gzz_rhs[i] = -TWO * alpn1[i] * Azz[i] - F2o3 * (dzz[i] + ONE) * divb +
|
||||||
TWO * (gxz[i] * betaxz[i] + gyz[i] * betayz[i] + gzz[i] * betazz[i]);
|
TWO * (gxz[i] * betaxz[i] + gyz[i] * betayz[i] + (dzz[i] + ONE) * betazz[i]);
|
||||||
gxy_rhs[i] = -TWO * alpn1[i] * Axy[i] + F1o3 * gxy[i] * div_beta[i] +
|
gxy_rhs[i] = -TWO * alpn1[i] * Axy[i] + F1o3 * gxy[i] * divb +
|
||||||
gxx[i] * betaxy[i] + gxz[i] * betazy[i] + gyy[i] * betayx[i]
|
(dxx[i] + ONE) * betaxy[i] + gxz[i] * betazy[i] + (dyy[i] + ONE) * betayx[i]
|
||||||
+ gyz[i] * betazx[i] - gxy[i] * betazz[i];
|
+ gyz[i] * betazx[i] - gxy[i] * betazz[i];
|
||||||
gyz_rhs[i] = -TWO * alpn1[i] * Ayz[i] + F1o3 * gyz[i] * div_beta[i] +
|
gyz_rhs[i] = -TWO * alpn1[i] * Ayz[i] + F1o3 * gyz[i] * divb +
|
||||||
gxy[i] * betaxz[i] + gyy[i] * betayz[i] + gxz[i] * betaxy[i]
|
gxy[i] * betaxz[i] + (dyy[i] + ONE) * betayz[i] + gxz[i] * betaxy[i]
|
||||||
+ gzz[i] * betazy[i] - gyz[i] * betaxx[i];
|
+ (dzz[i] + ONE) * betazy[i] - gyz[i] * betaxx[i];
|
||||||
gxz_rhs[i] = -TWO * alpn1[i] * Axz[i] + F1o3 * gxz[i] * div_beta[i] +
|
gxz_rhs[i] = -TWO * alpn1[i] * Axz[i] + F1o3 * gxz[i] * divb +
|
||||||
gxx[i] * betaxz[i] + gxy[i] * betayz[i] + gyz[i] * betayx[i]
|
(dxx[i] + ONE) * betaxz[i] + gxy[i] * betayz[i] + gyz[i] * betayx[i]
|
||||||
+ gzz[i] * betazx[i] - gxz[i] * betayy[i];
|
+ (dzz[i] + ONE) * betazx[i] - gxz[i] * betayy[i];
|
||||||
}
|
}
|
||||||
// 1ms //
|
// Fused: inverse metric + Gamma constraint + Christoffel (3 loops -> 1)
|
||||||
for(int i=0;i<all;i+=1){
|
for(int i=0;i<all;i+=1){
|
||||||
double det = gxx[i] * gyy[i] * gzz[i] + gxy[i] * gyz[i] * gxz[i] + gxz[i] * gxy[i] * gyz[i] -
|
double det = (dxx[i] + ONE) * (dyy[i] + ONE) * (dzz[i] + ONE) + gxy[i] * gyz[i] * gxz[i] + gxz[i] * gxy[i] * gyz[i] -
|
||||||
gxz[i] * gyy[i] * gxz[i] - gxy[i] * gxy[i] * gzz[i] - gxx[i] * gyz[i] * gyz[i];
|
gxz[i] * (dyy[i] + ONE) * gxz[i] - gxy[i] * gxy[i] * (dzz[i] + ONE) - (dxx[i] + ONE) * gyz[i] * gyz[i];
|
||||||
gupxx[i] = (gyy[i] * gzz[i] - gyz[i] * gyz[i]) / det;
|
double lg_xx = ((dyy[i] + ONE) * (dzz[i] + ONE) - gyz[i] * gyz[i]) / det;
|
||||||
gupxy[i] = -(gxy[i] * gzz[i] - gyz[i] * gxz[i]) / det;
|
double lg_xy = -(gxy[i] * (dzz[i] + ONE) - gyz[i] * gxz[i]) / det;
|
||||||
gupxz[i] = (gxy[i] * gyz[i] - gyy[i] * gxz[i]) / det;
|
double lg_xz = (gxy[i] * gyz[i] - (dyy[i] + ONE) * gxz[i]) / det;
|
||||||
gupyy[i] = (gxx[i] * gzz[i] - gxz[i] * gxz[i]) / det;
|
double lg_yy = ((dxx[i] + ONE) * (dzz[i] + ONE) - gxz[i] * gxz[i]) / det;
|
||||||
gupyz[i] = -(gxx[i] * gyz[i] - gxy[i] * gxz[i]) / det;
|
double lg_yz = -((dxx[i] + ONE) * gyz[i] - gxy[i] * gxz[i]) / det;
|
||||||
gupzz[i] = (gxx[i] * gyy[i] - gxy[i] * gxy[i]) / det;
|
double lg_zz = ((dxx[i] + ONE) * (dyy[i] + ONE) - gxy[i] * gxy[i]) / det;
|
||||||
}
|
gupxx[i] = lg_xx; gupxy[i] = lg_xy; gupxz[i] = lg_xz;
|
||||||
// 2.2ms //
|
gupyy[i] = lg_yy; gupyz[i] = lg_yz; gupzz[i] = lg_zz;
|
||||||
if(co==0){
|
|
||||||
for (int i=0;i<all;i+=1) {
|
if(co==0){
|
||||||
Gmx_Res[i] = Gamx[i] - (
|
Gmx_Res[i] = Gamx[i] - (
|
||||||
gupxx[i] * (gupxx[i]*gxxx[i] + gupxy[i]*gxyx[i] + gupxz[i]*gxzx[i]) +
|
lg_xx * (lg_xx*gxxx[i] + lg_xy*gxyx[i] + lg_xz*gxzx[i]) +
|
||||||
gupxy[i] * (gupxx[i]*gxyx[i] + gupxy[i]*gyyx[i] + gupxz[i]*gyzx[i]) +
|
lg_xy * (lg_xx*gxyx[i] + lg_xy*gyyx[i] + lg_xz*gyzx[i]) +
|
||||||
gupxz[i] * (gupxx[i]*gxzx[i] + gupxy[i]*gyzx[i] + gupxz[i]*gzzx[i]) +
|
lg_xz * (lg_xx*gxzx[i] + lg_xy*gyzx[i] + lg_xz*gzzx[i]) +
|
||||||
|
lg_xx * (lg_xy*gxxy[i] + lg_yy*gxyy[i] + lg_yz*gxzy[i]) +
|
||||||
gupxx[i] * (gupxy[i]*gxxy[i] + gupyy[i]*gxyy[i] + gupyz[i]*gxzy[i]) +
|
lg_xy * (lg_xy*gxyy[i] + lg_yy*gyyy[i] + lg_yz*gyzy[i]) +
|
||||||
gupxy[i] * (gupxy[i]*gxyy[i] + gupyy[i]*gyyy[i] + gupyz[i]*gyzy[i]) +
|
lg_xz * (lg_xy*gxzy[i] + lg_yy*gyzy[i] + lg_yz*gzzy[i]) +
|
||||||
gupxz[i] * (gupxy[i]*gxzy[i] + gupyy[i]*gyzy[i] + gupyz[i]*gzzy[i]) +
|
lg_xx * (lg_xz*gxxz[i] + lg_yz*gxyz[i] + lg_zz*gxzz[i]) +
|
||||||
|
lg_xy * (lg_xz*gxyz[i] + lg_yz*gyyz[i] + lg_zz*gyzz[i]) +
|
||||||
gupxx[i] * (gupxz[i]*gxxz[i] + gupyz[i]*gxyz[i] + gupzz[i]*gxzz[i]) +
|
lg_xz * (lg_xz*gxzz[i] + lg_yz*gyzz[i] + lg_zz*gzzz[i])
|
||||||
gupxy[i] * (gupxz[i]*gxyz[i] + gupyz[i]*gyyz[i] + gupzz[i]*gyzz[i]) +
|
|
||||||
gupxz[i] * (gupxz[i]*gxzz[i] + gupyz[i]*gyzz[i] + gupzz[i]*gzzz[i])
|
|
||||||
);
|
);
|
||||||
|
|
||||||
Gmy_Res[i] = Gamy[i] - (
|
Gmy_Res[i] = Gamy[i] - (
|
||||||
gupxx[i] * (gupxy[i]*gxxx[i] + gupyy[i]*gxyx[i] + gupyz[i]*gxzx[i]) +
|
lg_xx * (lg_xy*gxxx[i] + lg_yy*gxyx[i] + lg_yz*gxzx[i]) +
|
||||||
gupxy[i] * (gupxy[i]*gxyx[i] + gupyy[i]*gyyx[i] + gupyz[i]*gyzx[i]) +
|
lg_xy * (lg_xy*gxyx[i] + lg_yy*gyyx[i] + lg_yz*gyzx[i]) +
|
||||||
gupxz[i] * (gupxy[i]*gxzx[i] + gupyy[i]*gyzx[i] + gupyz[i]*gzzx[i]) +
|
lg_xz * (lg_xy*gxzx[i] + lg_yy*gyzx[i] + lg_yz*gzzx[i]) +
|
||||||
|
lg_xy * (lg_xy*gxxy[i] + lg_yy*gxyy[i] + lg_yz*gxzy[i]) +
|
||||||
gupxy[i] * (gupxy[i]*gxxy[i] + gupyy[i]*gxyy[i] + gupyz[i]*gxzy[i]) +
|
lg_yy * (lg_xy*gxyy[i] + lg_yy*gyyy[i] + lg_yz*gyzy[i]) +
|
||||||
gupyy[i] * (gupxy[i]*gxyy[i] + gupyy[i]*gyyy[i] + gupyz[i]*gyzy[i]) +
|
lg_yz * (lg_xy*gxzy[i] + lg_yy*gyzy[i] + lg_yz*gzzy[i]) +
|
||||||
gupyz[i] * (gupxy[i]*gxzy[i] + gupyy[i]*gyzy[i] + gupyz[i]*gzzy[i]) +
|
lg_xy * (lg_xz*gxxz[i] + lg_yz*gxyz[i] + lg_zz*gxzz[i]) +
|
||||||
|
lg_yy * (lg_xz*gxyz[i] + lg_yz*gyyz[i] + lg_zz*gyzz[i]) +
|
||||||
gupxy[i] * (gupxz[i]*gxxz[i] + gupyz[i]*gxyz[i] + gupzz[i]*gxzz[i]) +
|
lg_yz * (lg_xz*gxzz[i] + lg_yz*gyzz[i] + lg_zz*gzzz[i])
|
||||||
gupyy[i] * (gupxz[i]*gxyz[i] + gupyz[i]*gyyz[i] + gupzz[i]*gyzz[i]) +
|
|
||||||
gupyz[i] * (gupxz[i]*gxzz[i] + gupyz[i]*gyzz[i] + gupzz[i]*gzzz[i])
|
|
||||||
);
|
);
|
||||||
|
|
||||||
Gmz_Res[i] = Gamz[i] - (
|
Gmz_Res[i] = Gamz[i] - (
|
||||||
gupxx[i] * (gupxz[i]*gxxx[i] + gupyz[i]*gxyx[i] + gupzz[i]*gxzx[i]) +
|
lg_xx * (lg_xz*gxxx[i] + lg_yz*gxyx[i] + lg_zz*gxzx[i]) +
|
||||||
gupxy[i] * (gupxz[i]*gxyx[i] + gupyz[i]*gyyx[i] + gupzz[i]*gyzx[i]) +
|
lg_xy * (lg_xz*gxyx[i] + lg_yz*gyyx[i] + lg_zz*gyzx[i]) +
|
||||||
gupxz[i] * (gupxz[i]*gxzx[i] + gupyz[i]*gyzx[i] + gupzz[i]*gzzx[i]) +
|
lg_xz * (lg_xz*gxzx[i] + lg_yz*gyzx[i] + lg_zz*gzzx[i]) +
|
||||||
|
lg_xy * (lg_xz*gxxy[i] + lg_yz*gxyy[i] + lg_zz*gxzy[i]) +
|
||||||
gupxy[i] * (gupxz[i]*gxxy[i] + gupyz[i]*gxyy[i] + gupzz[i]*gxzy[i]) +
|
lg_yy * (lg_xz*gxyy[i] + lg_yz*gyyy[i] + lg_zz*gyzy[i]) +
|
||||||
gupyy[i] * (gupxz[i]*gxyy[i] + gupyz[i]*gyyy[i] + gupzz[i]*gyzy[i]) +
|
lg_yz * (lg_xz*gxzy[i] + lg_yz*gyzy[i] + lg_zz*gzzy[i]) +
|
||||||
gupyz[i] * (gupxz[i]*gxzy[i] + gupyz[i]*gyzy[i] + gupzz[i]*gzzy[i]) +
|
lg_xz * (lg_xz*gxxz[i] + lg_yz*gxyz[i] + lg_zz*gxzz[i]) +
|
||||||
|
lg_yz * (lg_xz*gxyz[i] + lg_yz*gyyz[i] + lg_zz*gyzz[i]) +
|
||||||
gupxz[i] * (gupxz[i]*gxxz[i] + gupyz[i]*gxyz[i] + gupzz[i]*gxzz[i]) +
|
lg_zz * (lg_xz*gxzz[i] + lg_yz*gyzz[i] + lg_zz*gzzz[i])
|
||||||
gupyz[i] * (gupxz[i]*gxyz[i] + gupyz[i]*gyyz[i] + gupzz[i]*gyzz[i]) +
|
|
||||||
gupzz[i] * (gupxz[i]*gxzz[i] + gupyz[i]*gyzz[i] + gupzz[i]*gzzz[i])
|
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Gamxxx[i] = HALF * ( lg_xx*gxxx[i]
|
||||||
|
+ lg_xy*(TWO*gxyx[i] - gxxy[i])
|
||||||
|
+ lg_xz*(TWO*gxzx[i] - gxxz[i]) );
|
||||||
|
Gamyxx[i] = HALF * ( lg_xy*gxxx[i]
|
||||||
|
+ lg_yy*(TWO*gxyx[i] - gxxy[i])
|
||||||
|
+ lg_yz*(TWO*gxzx[i] - gxxz[i]) );
|
||||||
|
Gamzxx[i] = HALF * ( lg_xz*gxxx[i]
|
||||||
|
+ lg_yz*(TWO*gxyx[i] - gxxy[i])
|
||||||
|
+ lg_zz*(TWO*gxzx[i] - gxxz[i]) );
|
||||||
|
Gamxyy[i] = HALF * ( lg_xx*(TWO*gxyy[i] - gyyx[i])
|
||||||
|
+ lg_xy*gyyy[i]
|
||||||
|
+ lg_xz*(TWO*gyzy[i] - gyyz[i]) );
|
||||||
|
Gamyyy[i] = HALF * ( lg_xy*(TWO*gxyy[i] - gyyx[i])
|
||||||
|
+ lg_yy*gyyy[i]
|
||||||
|
+ lg_yz*(TWO*gyzy[i] - gyyz[i]) );
|
||||||
|
Gamzyy[i] = HALF * ( lg_xz*(TWO*gxyy[i] - gyyx[i])
|
||||||
|
+ lg_yz*gyyy[i]
|
||||||
|
+ lg_zz*(TWO*gyzy[i] - gyyz[i]) );
|
||||||
|
Gamxzz[i] = HALF * ( lg_xx*(TWO*gxzz[i] - gzzx[i])
|
||||||
|
+ lg_xy*(TWO*gyzz[i] - gzzy[i])
|
||||||
|
+ lg_xz*gzzz[i] );
|
||||||
|
Gamyzz[i] = HALF * ( lg_xy*(TWO*gxzz[i] - gzzx[i])
|
||||||
|
+ lg_yy*(TWO*gyzz[i] - gzzy[i])
|
||||||
|
+ lg_yz*gzzz[i] );
|
||||||
|
Gamzzz[i] = HALF * ( lg_xz*(TWO*gxzz[i] - gzzx[i])
|
||||||
|
+ lg_yz*(TWO*gyzz[i] - gzzy[i])
|
||||||
|
+ lg_zz*gzzz[i] );
|
||||||
|
Gamxxy[i] = HALF * ( lg_xx*gxxy[i]
|
||||||
|
+ lg_xy*gyyx[i]
|
||||||
|
+ lg_xz*(gxzy[i] + gyzx[i] - gxyz[i]) );
|
||||||
|
Gamyxy[i] = HALF * ( lg_xy*gxxy[i]
|
||||||
|
+ lg_yy*gyyx[i]
|
||||||
|
+ lg_yz*(gxzy[i] + gyzx[i] - gxyz[i]) );
|
||||||
|
Gamzxy[i] = HALF * ( lg_xz*gxxy[i]
|
||||||
|
+ lg_yz*gyyx[i]
|
||||||
|
+ lg_zz*(gxzy[i] + gyzx[i] - gxyz[i]) );
|
||||||
|
Gamxxz[i] = HALF * ( lg_xx*gxxz[i]
|
||||||
|
+ lg_xy*(gxyz[i] + gyzx[i] - gxzy[i])
|
||||||
|
+ lg_xz*gzzx[i] );
|
||||||
|
Gamyxz[i] = HALF * ( lg_xy*gxxz[i]
|
||||||
|
+ lg_yy*(gxyz[i] + gyzx[i] - gxzy[i])
|
||||||
|
+ lg_yz*gzzx[i] );
|
||||||
|
Gamzxz[i] = HALF * ( lg_xz*gxxz[i]
|
||||||
|
+ lg_yz*(gxyz[i] + gyzx[i] - gxzy[i])
|
||||||
|
+ lg_zz*gzzx[i] );
|
||||||
|
Gamxyz[i] = HALF * ( lg_xx*(gxyz[i] + gxzy[i] - gyzx[i])
|
||||||
|
+ lg_xy*gyyz[i]
|
||||||
|
+ lg_xz*gzzy[i] );
|
||||||
|
Gamyyz[i] = HALF * ( lg_xy*(gxyz[i] + gxzy[i] - gyzx[i])
|
||||||
|
+ lg_yy*gyyz[i]
|
||||||
|
+ lg_yz*gzzy[i] );
|
||||||
|
Gamzyz[i] = HALF * ( lg_xz*(gxyz[i] + gxzy[i] - gyzx[i])
|
||||||
|
+ lg_yz*gyyz[i]
|
||||||
|
+ lg_zz*gzzy[i] );
|
||||||
}
|
}
|
||||||
// 5ms //
|
// Fused: A^{ij} raise-index + Gamma_rhs part 1 (2 loops -> 1)
|
||||||
for (int i=0;i<all;i+=1) {
|
for (int i=0;i<all;i+=1) {
|
||||||
|
double axx = gupxx[i]*gupxx[i]*Axx[i]
|
||||||
Gamxxx[i] = HALF * ( gupxx[i]*gxxx[i]
|
|
||||||
+ gupxy[i]*(TWO*gxyx[i] - gxxy[i])
|
|
||||||
+ gupxz[i]*(TWO*gxzx[i] - gxxz[i]) );
|
|
||||||
|
|
||||||
Gamyxx[i] = HALF * ( gupxy[i]*gxxx[i]
|
|
||||||
+ gupyy[i]*(TWO*gxyx[i] - gxxy[i])
|
|
||||||
+ gupyz[i]*(TWO*gxzx[i] - gxxz[i]) );
|
|
||||||
|
|
||||||
Gamzxx[i] = HALF * ( gupxz[i]*gxxx[i]
|
|
||||||
+ gupyz[i]*(TWO*gxyx[i] - gxxy[i])
|
|
||||||
+ gupzz[i]*(TWO*gxzx[i] - gxxz[i]) );
|
|
||||||
|
|
||||||
Gamxyy[i] = HALF * ( gupxx[i]*(TWO*gxyy[i] - gyyx[i])
|
|
||||||
+ gupxy[i]*gyyy[i]
|
|
||||||
+ gupxz[i]*(TWO*gyzy[i] - gyyz[i]) );
|
|
||||||
|
|
||||||
Gamyyy[i] = HALF * ( gupxy[i]*(TWO*gxyy[i] - gyyx[i])
|
|
||||||
+ gupyy[i]*gyyy[i]
|
|
||||||
+ gupyz[i]*(TWO*gyzy[i] - gyyz[i]) );
|
|
||||||
|
|
||||||
Gamzyy[i] = HALF * ( gupxz[i]*(TWO*gxyy[i] - gyyx[i])
|
|
||||||
+ gupyz[i]*gyyy[i]
|
|
||||||
+ gupzz[i]*(TWO*gyzy[i] - gyyz[i]) );
|
|
||||||
|
|
||||||
Gamxzz[i] = HALF * ( gupxx[i]*(TWO*gxzz[i] - gzzx[i])
|
|
||||||
+ gupxy[i]*(TWO*gyzz[i] - gzzy[i])
|
|
||||||
+ gupxz[i]*gzzz[i] );
|
|
||||||
|
|
||||||
Gamyzz[i] = HALF * ( gupxy[i]*(TWO*gxzz[i] - gzzx[i])
|
|
||||||
+ gupyy[i]*(TWO*gyzz[i] - gzzy[i])
|
|
||||||
+ gupyz[i]*gzzz[i] );
|
|
||||||
|
|
||||||
Gamzzz[i] = HALF * ( gupxz[i]*(TWO*gxzz[i] - gzzx[i])
|
|
||||||
+ gupyz[i]*(TWO*gyzz[i] - gzzy[i])
|
|
||||||
+ gupzz[i]*gzzz[i] );
|
|
||||||
|
|
||||||
Gamxxy[i] = HALF * ( gupxx[i]*gxxy[i]
|
|
||||||
+ gupxy[i]*gyyx[i]
|
|
||||||
+ gupxz[i]*(gxzy[i] + gyzx[i] - gxyz[i]) );
|
|
||||||
|
|
||||||
Gamyxy[i] = HALF * ( gupxy[i]*gxxy[i]
|
|
||||||
+ gupyy[i]*gyyx[i]
|
|
||||||
+ gupyz[i]*(gxzy[i] + gyzx[i] - gxyz[i]) );
|
|
||||||
|
|
||||||
Gamzxy[i] = HALF * ( gupxz[i]*gxxy[i]
|
|
||||||
+ gupyz[i]*gyyx[i]
|
|
||||||
+ gupzz[i]*(gxzy[i] + gyzx[i] - gxyz[i]) );
|
|
||||||
|
|
||||||
Gamxxz[i] = HALF * ( gupxx[i]*gxxz[i]
|
|
||||||
+ gupxy[i]*(gxyz[i] + gyzx[i] - gxzy[i])
|
|
||||||
+ gupxz[i]*gzzx[i] );
|
|
||||||
|
|
||||||
Gamyxz[i] = HALF * ( gupxy[i]*gxxz[i]
|
|
||||||
+ gupyy[i]*(gxyz[i] + gyzx[i] - gxzy[i])
|
|
||||||
+ gupyz[i]*gzzx[i] );
|
|
||||||
|
|
||||||
Gamzxz[i] = HALF * ( gupxz[i]*gxxz[i]
|
|
||||||
+ gupyz[i]*(gxyz[i] + gyzx[i] - gxzy[i])
|
|
||||||
+ gupzz[i]*gzzx[i] );
|
|
||||||
|
|
||||||
Gamxyz[i] = HALF * ( gupxx[i]*(gxyz[i] + gxzy[i] - gyzx[i])
|
|
||||||
+ gupxy[i]*gyyz[i]
|
|
||||||
+ gupxz[i]*gzzy[i] );
|
|
||||||
|
|
||||||
Gamyyz[i] = HALF * ( gupxy[i]*(gxyz[i] + gxzy[i] - gyzx[i])
|
|
||||||
+ gupyy[i]*gyyz[i]
|
|
||||||
+ gupyz[i]*gzzy[i] );
|
|
||||||
|
|
||||||
Gamzyz[i] = HALF * ( gupxz[i]*(gxyz[i] + gxzy[i] - gyzx[i])
|
|
||||||
+ gupyz[i]*gyyz[i]
|
|
||||||
+ gupzz[i]*gzzy[i] );
|
|
||||||
|
|
||||||
}
|
|
||||||
// 1.8ms //
|
|
||||||
for (int i=0;i<all;i+=1) {
|
|
||||||
|
|
||||||
Rxx[i] = gupxx[i]*gupxx[i]*Axx[i]
|
|
||||||
+ gupxy[i]*gupxy[i]*Ayy[i]
|
+ gupxy[i]*gupxy[i]*Ayy[i]
|
||||||
+ gupxz[i]*gupxz[i]*Azz[i]
|
+ gupxz[i]*gupxz[i]*Azz[i]
|
||||||
+ TWO * ( gupxx[i]*gupxy[i]*Axy[i]
|
+ TWO * ( gupxx[i]*gupxy[i]*Axy[i]
|
||||||
+ gupxx[i]*gupxz[i]*Axz[i]
|
+ gupxx[i]*gupxz[i]*Axz[i]
|
||||||
+ gupxy[i]*gupxz[i]*Ayz[i] );
|
+ gupxy[i]*gupxz[i]*Ayz[i] );
|
||||||
|
double ayy = gupxy[i]*gupxy[i]*Axx[i]
|
||||||
Ryy[i] = gupxy[i]*gupxy[i]*Axx[i]
|
|
||||||
+ gupyy[i]*gupyy[i]*Ayy[i]
|
+ gupyy[i]*gupyy[i]*Ayy[i]
|
||||||
+ gupyz[i]*gupyz[i]*Azz[i]
|
+ gupyz[i]*gupyz[i]*Azz[i]
|
||||||
+ TWO * ( gupxy[i]*gupyy[i]*Axy[i]
|
+ TWO * ( gupxy[i]*gupyy[i]*Axy[i]
|
||||||
+ gupxy[i]*gupyz[i]*Axz[i]
|
+ gupxy[i]*gupyz[i]*Axz[i]
|
||||||
+ gupyy[i]*gupyz[i]*Ayz[i] );
|
+ gupyy[i]*gupyz[i]*Ayz[i] );
|
||||||
|
double azz = gupxz[i]*gupxz[i]*Axx[i]
|
||||||
Rzz[i] = gupxz[i]*gupxz[i]*Axx[i]
|
|
||||||
+ gupyz[i]*gupyz[i]*Ayy[i]
|
+ gupyz[i]*gupyz[i]*Ayy[i]
|
||||||
+ gupzz[i]*gupzz[i]*Azz[i]
|
+ gupzz[i]*gupzz[i]*Azz[i]
|
||||||
+ TWO * ( gupxz[i]*gupyz[i]*Axy[i]
|
+ TWO * ( gupxz[i]*gupyz[i]*Axy[i]
|
||||||
+ gupxz[i]*gupzz[i]*Axz[i]
|
+ gupxz[i]*gupzz[i]*Axz[i]
|
||||||
+ gupyz[i]*gupzz[i]*Ayz[i] );
|
+ gupyz[i]*gupzz[i]*Ayz[i] );
|
||||||
|
double axy = gupxx[i]*gupxy[i]*Axx[i]
|
||||||
Rxy[i] = gupxx[i]*gupxy[i]*Axx[i]
|
|
||||||
+ gupxy[i]*gupyy[i]*Ayy[i]
|
+ gupxy[i]*gupyy[i]*Ayy[i]
|
||||||
+ gupxz[i]*gupyz[i]*Azz[i]
|
+ gupxz[i]*gupyz[i]*Azz[i]
|
||||||
+ ( gupxx[i]*gupyy[i] + gupxy[i]*gupxy[i] ) * Axy[i]
|
+ ( gupxx[i]*gupyy[i] + gupxy[i]*gupxy[i] ) * Axy[i]
|
||||||
+ ( gupxx[i]*gupyz[i] + gupxz[i]*gupxy[i] ) * Axz[i]
|
+ ( gupxx[i]*gupyz[i] + gupxz[i]*gupxy[i] ) * Axz[i]
|
||||||
+ ( gupxy[i]*gupyz[i] + gupxz[i]*gupyy[i] ) * Ayz[i];
|
+ ( gupxy[i]*gupyz[i] + gupxz[i]*gupyy[i] ) * Ayz[i];
|
||||||
|
double axz = gupxx[i]*gupxz[i]*Axx[i]
|
||||||
Rxz[i] = gupxx[i]*gupxz[i]*Axx[i]
|
|
||||||
+ gupxy[i]*gupyz[i]*Ayy[i]
|
+ gupxy[i]*gupyz[i]*Ayy[i]
|
||||||
+ gupxz[i]*gupzz[i]*Azz[i]
|
+ gupxz[i]*gupzz[i]*Azz[i]
|
||||||
+ ( gupxx[i]*gupyz[i] + gupxy[i]*gupxz[i] ) * Axy[i]
|
+ ( gupxx[i]*gupyz[i] + gupxy[i]*gupxz[i] ) * Axy[i]
|
||||||
+ ( gupxx[i]*gupzz[i] + gupxz[i]*gupxz[i] ) * Axz[i]
|
+ ( gupxx[i]*gupzz[i] + gupxz[i]*gupxz[i] ) * Axz[i]
|
||||||
+ ( gupxy[i]*gupzz[i] + gupxz[i]*gupyz[i] ) * Ayz[i];
|
+ ( gupxy[i]*gupzz[i] + gupxz[i]*gupyz[i] ) * Ayz[i];
|
||||||
|
double ayz = gupxy[i]*gupxz[i]*Axx[i]
|
||||||
Ryz[i] = gupxy[i]*gupxz[i]*Axx[i]
|
|
||||||
+ gupyy[i]*gupyz[i]*Ayy[i]
|
+ gupyy[i]*gupyz[i]*Ayy[i]
|
||||||
+ gupyz[i]*gupzz[i]*Azz[i]
|
+ gupyz[i]*gupzz[i]*Azz[i]
|
||||||
+ ( gupxy[i]*gupyz[i] + gupyy[i]*gupxz[i] ) * Axy[i]
|
+ ( gupxy[i]*gupyz[i] + gupyy[i]*gupxz[i] ) * Axy[i]
|
||||||
+ ( gupxy[i]*gupzz[i] + gupyz[i]*gupxz[i] ) * Axz[i]
|
+ ( gupxy[i]*gupzz[i] + gupyz[i]*gupxz[i] ) * Axz[i]
|
||||||
+ ( gupyy[i]*gupzz[i] + gupyz[i]*gupyz[i] ) * Ayz[i];
|
+ ( gupyy[i]*gupzz[i] + gupyz[i]*gupyz[i] ) * Ayz[i];
|
||||||
}
|
Rxx[i] = axx; Ryy[i] = ayy; Rzz[i] = azz;
|
||||||
// 4ms //
|
Rxy[i] = axy; Rxz[i] = axz; Ryz[i] = ayz;
|
||||||
for(int i=0;i<all;i+=1){
|
|
||||||
Gamx_rhs[i] = - TWO * ( Lapx[i] * Rxx[i] + Lapy[i] * Rxy[i] + Lapz[i] * Rxz[i] ) +
|
|
||||||
TWO * alpn1[i] * (
|
|
||||||
-F3o2/chin1[i] * ( chix[i] * Rxx[i] + chiy[i] * Rxy[i] + chiz[i] * Rxz[i] ) -
|
|
||||||
gupxx[i] * ( F2o3 * Kx[i] + EIGHT * PI * Sx[i] ) -
|
|
||||||
gupxy[i] * ( F2o3 * Ky[i] + EIGHT * PI * Sy[i] ) -
|
|
||||||
gupxz[i] * ( F2o3 * Kz[i] + EIGHT * PI * Sz[i] ) +
|
|
||||||
Gamxxx[i] * Rxx[i] + Gamxyy[i] * Ryy[i] + Gamxzz[i] * Rzz[i] +
|
|
||||||
TWO * ( Gamxxy[i] * Rxy[i] + Gamxxz[i] * Rxz[i] + Gamxyz[i] * Ryz[i] ) );
|
|
||||||
|
|
||||||
Gamy_rhs[i] = -TWO * ( Lapx[i]*Rxy[i] + Lapy[i]*Ryy[i] + Lapz[i]*Ryz[i] )
|
Gamx_rhs[i] = - TWO * ( Lapx[i]*axx + Lapy[i]*axy + Lapz[i]*axz ) +
|
||||||
|
TWO * alpn1[i] * (
|
||||||
|
-F3o2/chin1[i] * ( chix[i]*axx + chiy[i]*axy + chiz[i]*axz ) -
|
||||||
|
gupxx[i] * ( F2o3*Kx[i] + EIGHT*PI*Sx[i] ) -
|
||||||
|
gupxy[i] * ( F2o3*Ky[i] + EIGHT*PI*Sy[i] ) -
|
||||||
|
gupxz[i] * ( F2o3*Kz[i] + EIGHT*PI*Sz[i] ) +
|
||||||
|
Gamxxx[i]*axx + Gamxyy[i]*ayy + Gamxzz[i]*azz +
|
||||||
|
TWO * ( Gamxxy[i]*axy + Gamxxz[i]*axz + Gamxyz[i]*ayz ) );
|
||||||
|
|
||||||
|
Gamy_rhs[i] = -TWO * ( Lapx[i]*axy + Lapy[i]*ayy + Lapz[i]*ayz )
|
||||||
+ TWO * alpn1[i] * (
|
+ TWO * alpn1[i] * (
|
||||||
-F3o2/chin1[i] * ( chix[i]*Rxy[i] + chiy[i]*Ryy[i] + chiz[i]*Ryz[i] )
|
-F3o2/chin1[i] * ( chix[i]*axy + chiy[i]*ayy + chiz[i]*ayz )
|
||||||
- gupxy[i] * ( F2o3*Kx[i] + EIGHT*PI*Sx[i] )
|
- gupxy[i] * ( F2o3*Kx[i] + EIGHT*PI*Sx[i] )
|
||||||
- gupyy[i] * ( F2o3*Ky[i] + EIGHT*PI*Sy[i] )
|
- gupyy[i] * ( F2o3*Ky[i] + EIGHT*PI*Sy[i] )
|
||||||
- gupyz[i] * ( F2o3*Kz[i] + EIGHT*PI*Sz[i] )
|
- gupyz[i] * ( F2o3*Kz[i] + EIGHT*PI*Sz[i] )
|
||||||
+ Gamyxx[i]*Rxx[i] + Gamyyy[i]*Ryy[i] + Gamyzz[i]*Rzz[i]
|
+ Gamyxx[i]*axx + Gamyyy[i]*ayy + Gamyzz[i]*azz
|
||||||
+ TWO * ( Gamyxy[i]*Rxy[i] + Gamyxz[i]*Rxz[i] + Gamyyz[i]*Ryz[i] )
|
+ TWO * ( Gamyxy[i]*axy + Gamyxz[i]*axz + Gamyyz[i]*ayz )
|
||||||
);
|
);
|
||||||
|
|
||||||
Gamz_rhs[i] = -TWO * ( Lapx[i]*Rxz[i] + Lapy[i]*Ryz[i] + Lapz[i]*Rzz[i] )
|
Gamz_rhs[i] = -TWO * ( Lapx[i]*axz + Lapy[i]*ayz + Lapz[i]*azz )
|
||||||
+ TWO * alpn1[i] * (
|
+ TWO * alpn1[i] * (
|
||||||
-F3o2/chin1[i] * ( chix[i]*Rxz[i] + chiy[i]*Ryz[i] + chiz[i]*Rzz[i] )
|
-F3o2/chin1[i] * ( chix[i]*axz + chiy[i]*ayz + chiz[i]*azz )
|
||||||
- gupxz[i] * ( F2o3*Kx[i] + EIGHT*PI*Sx[i] )
|
- gupxz[i] * ( F2o3*Kx[i] + EIGHT*PI*Sx[i] )
|
||||||
- gupyz[i] * ( F2o3*Ky[i] + EIGHT*PI*Sy[i] )
|
- gupyz[i] * ( F2o3*Ky[i] + EIGHT*PI*Sy[i] )
|
||||||
- gupzz[i] * ( F2o3*Kz[i] + EIGHT*PI*Sz[i] )
|
- gupzz[i] * ( F2o3*Kz[i] + EIGHT*PI*Sz[i] )
|
||||||
+ Gamzxx[i]*Rxx[i] + Gamzyy[i]*Ryy[i] + Gamzzz[i]*Rzz[i]
|
+ Gamzxx[i]*axx + Gamzyy[i]*ayy + Gamzzz[i]*azz
|
||||||
+ TWO * ( Gamzxy[i]*Rxy[i] + Gamzxz[i]*Rxz[i] + Gamzyz[i]*Ryz[i] )
|
+ TWO * ( Gamzxy[i]*axy + Gamzxz[i]*axz + Gamzyz[i]*ayz )
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
// 22.3ms //
|
// 22.3ms //
|
||||||
@@ -365,65 +326,63 @@ int f_compute_rhs_bssn(int *ex, double &T,
|
|||||||
fderivs(ex,Gamy,Gamyx,Gamyy,Gamyz,X,Y,Z,SYM ,ANTI,SYM ,Symmetry,Lev);
|
fderivs(ex,Gamy,Gamyx,Gamyy,Gamyz,X,Y,Z,SYM ,ANTI,SYM ,Symmetry,Lev);
|
||||||
fderivs(ex,Gamz,Gamzx,Gamzy,Gamzz,X,Y,Z,SYM ,SYM ,ANTI,Symmetry,Lev);
|
fderivs(ex,Gamz,Gamzx,Gamzy,Gamzz,X,Y,Z,SYM ,SYM ,ANTI,Symmetry,Lev);
|
||||||
|
|
||||||
// 3.5ms //
|
// Fused: fxx/Gamxa + Gamma_rhs part 2 (2 loops -> 1)
|
||||||
for(int i=0;i<all;i+=1){
|
for(int i=0;i<all;i+=1){
|
||||||
fxx[i] = gxxx[i] + gxyy[i] + gxzz[i];
|
const double divb = betaxx[i] + betayy[i] + betazz[i];
|
||||||
fxy[i] = gxyx[i] + gyyy[i] + gyzz[i];
|
double lfxx = gxxx[i] + gxyy[i] + gxzz[i];
|
||||||
fxz[i] = gxzx[i] + gyzy[i] + gzzz[i];
|
double lfxy = gxyx[i] + gyyy[i] + gyzz[i];
|
||||||
Gamxa[i] = gupxx[i]*Gamxxx[i] + gupyy[i]*Gamxyy[i] + gupzz[i]*Gamxzz[i]
|
double lfxz = gxzx[i] + gyzy[i] + gzzz[i];
|
||||||
|
fxx[i] = lfxx; fxy[i] = lfxy; fxz[i] = lfxz;
|
||||||
|
|
||||||
|
double gxa = gupxx[i]*Gamxxx[i] + gupyy[i]*Gamxyy[i] + gupzz[i]*Gamxzz[i]
|
||||||
+ TWO * ( gupxy[i]*Gamxxy[i] + gupxz[i]*Gamxxz[i] + gupyz[i]*Gamxyz[i] );
|
+ TWO * ( gupxy[i]*Gamxxy[i] + gupxz[i]*Gamxxz[i] + gupyz[i]*Gamxyz[i] );
|
||||||
|
double gya = gupxx[i]*Gamyxx[i] + gupyy[i]*Gamyyy[i] + gupzz[i]*Gamyzz[i]
|
||||||
Gamya[i] = gupxx[i]*Gamyxx[i] + gupyy[i]*Gamyyy[i] + gupzz[i]*Gamyzz[i]
|
|
||||||
+ TWO * ( gupxy[i]*Gamyxy[i] + gupxz[i]*Gamyxz[i] + gupyz[i]*Gamyyz[i] );
|
+ TWO * ( gupxy[i]*Gamyxy[i] + gupxz[i]*Gamyxz[i] + gupyz[i]*Gamyyz[i] );
|
||||||
|
double gza = gupxx[i]*Gamzxx[i] + gupyy[i]*Gamzyy[i] + gupzz[i]*Gamzzz[i]
|
||||||
Gamza[i] = gupxx[i]*Gamzxx[i] + gupyy[i]*Gamzyy[i] + gupzz[i]*Gamzzz[i]
|
|
||||||
+ TWO * ( gupxy[i]*Gamzxy[i] + gupxz[i]*Gamzxz[i] + gupyz[i]*Gamzyz[i] );
|
+ TWO * ( gupxy[i]*Gamzxy[i] + gupxz[i]*Gamzxz[i] + gupyz[i]*Gamzyz[i] );
|
||||||
}
|
|
||||||
// 3.9ms //
|
|
||||||
for(int i=0;i<all;i+=1){
|
|
||||||
Gamx_rhs[i] = Gamx_rhs[i]
|
Gamx_rhs[i] = Gamx_rhs[i]
|
||||||
+ F2o3 * Gamxa[i] * div_beta[i]
|
+ F2o3 * gxa * divb
|
||||||
- Gamxa[i] * betaxx[i] - Gamya[i] * betaxy[i] - Gamza[i] * betaxz[i]
|
- gxa * betaxx[i] - gya * betaxy[i] - gza * betaxz[i]
|
||||||
+ F1o3 * ( gupxx[i] * fxx[i] + gupxy[i] * fxy[i] + gupxz[i] * fxz[i] )
|
+ F1o3 * ( gupxx[i] * lfxx + gupxy[i] * lfxy + gupxz[i] * lfxz )
|
||||||
+ gupxx[i] * gxxx[i] + gupyy[i] * gyyx[i] + gupzz[i] * gzzx[i]
|
+ gupxx[i] * gxxx[i] + gupyy[i] * gyyx[i] + gupzz[i] * gzzx[i]
|
||||||
+ TWO * ( gupxy[i] * gxyx[i] + gupxz[i] * gxzx[i] + gupyz[i] * gyzx[i] );
|
+ TWO * ( gupxy[i] * gxyx[i] + gupxz[i] * gxzx[i] + gupyz[i] * gyzx[i] );
|
||||||
|
|
||||||
Gamy_rhs[i] = Gamy_rhs[i]
|
Gamy_rhs[i] = Gamy_rhs[i]
|
||||||
+ F2o3 * Gamya[i] * div_beta[i]
|
+ F2o3 * gya * divb
|
||||||
- Gamxa[i] * betayx[i] - Gamya[i] * betayy[i] - Gamza[i] * betayz[i]
|
- gxa * betayx[i] - gya * betayy[i] - gza * betayz[i]
|
||||||
+ F1o3 * ( gupxy[i] * fxx[i] + gupyy[i] * fxy[i] + gupyz[i] * fxz[i] )
|
+ F1o3 * ( gupxy[i] * lfxx + gupyy[i] * lfxy + gupyz[i] * lfxz )
|
||||||
+ gupxx[i] * gxxy[i] + gupyy[i] * gyyy[i] + gupzz[i] * gzzy[i]
|
+ gupxx[i] * gxxy[i] + gupyy[i] * gyyy[i] + gupzz[i] * gzzy[i]
|
||||||
+ TWO * ( gupxy[i] * gxyy[i] + gupxz[i] * gxzy[i] + gupyz[i] * gyzy[i] );
|
+ TWO * ( gupxy[i] * gxyy[i] + gupxz[i] * gxzy[i] + gupyz[i] * gyzy[i] );
|
||||||
|
|
||||||
Gamz_rhs[i] = Gamz_rhs[i]
|
Gamz_rhs[i] = Gamz_rhs[i]
|
||||||
+ F2o3 * Gamza[i] * div_beta[i]
|
+ F2o3 * gza * divb
|
||||||
- Gamxa[i] * betazx[i] - Gamya[i] * betazy[i] - Gamza[i] * betazz[i]
|
- gxa * betazx[i] - gya * betazy[i] - gza * betazz[i]
|
||||||
+ F1o3 * ( gupxz[i] * fxx[i] + gupyz[i] * fxy[i] + gupzz[i] * fxz[i] )
|
+ F1o3 * ( gupxz[i] * lfxx + gupyz[i] * lfxy + gupzz[i] * lfxz )
|
||||||
+ gupxx[i] * gxxz[i] + gupyy[i] * gyyz[i] + gupzz[i] * gzzz[i]
|
+ gupxx[i] * gxxz[i] + gupyy[i] * gyyz[i] + gupzz[i] * gzzz[i]
|
||||||
+ TWO * ( gupxy[i] * gxyz[i] + gupxz[i] * gxzz[i] + gupyz[i] * gyzz[i] );
|
+ TWO * ( gupxy[i] * gxyz[i] + gupxz[i] * gxzz[i] + gupyz[i] * gyzz[i] );
|
||||||
}
|
}
|
||||||
// 4.4ms //
|
// 4.4ms //
|
||||||
for (int i=0;i<all;i+=1) {
|
for (int i=0;i<all;i+=1) {
|
||||||
gxxx[i] = gxx[i]*Gamxxx[i] + gxy[i]*Gamyxx[i] + gxz[i]*Gamzxx[i];
|
gxxx[i] = (dxx[i] + ONE)*Gamxxx[i] + gxy[i]*Gamyxx[i] + gxz[i]*Gamzxx[i];
|
||||||
gxyx[i] = gxx[i]*Gamxxy[i] + gxy[i]*Gamyxy[i] + gxz[i]*Gamzxy[i];
|
gxyx[i] = (dxx[i] + ONE)*Gamxxy[i] + gxy[i]*Gamyxy[i] + gxz[i]*Gamzxy[i];
|
||||||
gxzx[i] = gxx[i]*Gamxxz[i] + gxy[i]*Gamyxz[i] + gxz[i]*Gamzxz[i];
|
gxzx[i] = (dxx[i] + ONE)*Gamxxz[i] + gxy[i]*Gamyxz[i] + gxz[i]*Gamzxz[i];
|
||||||
gyyx[i] = gxx[i]*Gamxyy[i] + gxy[i]*Gamyyy[i] + gxz[i]*Gamzyy[i];
|
gyyx[i] = (dxx[i] + ONE)*Gamxyy[i] + gxy[i]*Gamyyy[i] + gxz[i]*Gamzyy[i];
|
||||||
gyzx[i] = gxx[i]*Gamxyz[i] + gxy[i]*Gamyyz[i] + gxz[i]*Gamzyz[i];
|
gyzx[i] = (dxx[i] + ONE)*Gamxyz[i] + gxy[i]*Gamyyz[i] + gxz[i]*Gamzyz[i];
|
||||||
gzzx[i] = gxx[i]*Gamxzz[i] + gxy[i]*Gamyzz[i] + gxz[i]*Gamzzz[i];
|
gzzx[i] = (dxx[i] + ONE)*Gamxzz[i] + gxy[i]*Gamyzz[i] + gxz[i]*Gamzzz[i];
|
||||||
|
|
||||||
gxxy[i] = gxy[i]*Gamxxx[i] + gyy[i]*Gamyxx[i] + gyz[i]*Gamzxx[i];
|
gxxy[i] = gxy[i]*Gamxxx[i] + (dyy[i] + ONE)*Gamyxx[i] + gyz[i]*Gamzxx[i];
|
||||||
gxyy[i] = gxy[i]*Gamxxy[i] + gyy[i]*Gamyxy[i] + gyz[i]*Gamzxy[i];
|
gxyy[i] = gxy[i]*Gamxxy[i] + (dyy[i] + ONE)*Gamyxy[i] + gyz[i]*Gamzxy[i];
|
||||||
gxzy[i] = gxy[i]*Gamxxz[i] + gyy[i]*Gamyxz[i] + gyz[i]*Gamzxz[i];
|
gxzy[i] = gxy[i]*Gamxxz[i] + (dyy[i] + ONE)*Gamyxz[i] + gyz[i]*Gamzxz[i];
|
||||||
gyyy[i] = gxy[i]*Gamxyy[i] + gyy[i]*Gamyyy[i] + gyz[i]*Gamzyy[i];
|
gyyy[i] = gxy[i]*Gamxyy[i] + (dyy[i] + ONE)*Gamyyy[i] + gyz[i]*Gamzyy[i];
|
||||||
gyzy[i] = gxy[i]*Gamxyz[i] + gyy[i]*Gamyyz[i] + gyz[i]*Gamzyz[i];
|
gyzy[i] = gxy[i]*Gamxyz[i] + (dyy[i] + ONE)*Gamyyz[i] + gyz[i]*Gamzyz[i];
|
||||||
gzzy[i] = gxy[i]*Gamxzz[i] + gyy[i]*Gamyzz[i] + gyz[i]*Gamzzz[i];
|
gzzy[i] = gxy[i]*Gamxzz[i] + (dyy[i] + ONE)*Gamyzz[i] + gyz[i]*Gamzzz[i];
|
||||||
|
|
||||||
gxxz[i] = gxz[i]*Gamxxx[i] + gyz[i]*Gamyxx[i] + gzz[i]*Gamzxx[i];
|
gxxz[i] = gxz[i]*Gamxxx[i] + gyz[i]*Gamyxx[i] + (dzz[i] + ONE)*Gamzxx[i];
|
||||||
gxyz[i] = gxz[i]*Gamxxy[i] + gyz[i]*Gamyxy[i] + gzz[i]*Gamzxy[i];
|
gxyz[i] = gxz[i]*Gamxxy[i] + gyz[i]*Gamyxy[i] + (dzz[i] + ONE)*Gamzxy[i];
|
||||||
gxzz[i] = gxz[i]*Gamxxz[i] + gyz[i]*Gamyxz[i] + gzz[i]*Gamzxz[i];
|
gxzz[i] = gxz[i]*Gamxxz[i] + gyz[i]*Gamyxz[i] + (dzz[i] + ONE)*Gamzxz[i];
|
||||||
gyyz[i] = gxz[i]*Gamxyy[i] + gyz[i]*Gamyyy[i] + gzz[i]*Gamzyy[i];
|
gyyz[i] = gxz[i]*Gamxyy[i] + gyz[i]*Gamyyy[i] + (dzz[i] + ONE)*Gamzyy[i];
|
||||||
gyzz[i] = gxz[i]*Gamxyz[i] + gyz[i]*Gamyyz[i] + gzz[i]*Gamzyz[i];
|
gyzz[i] = gxz[i]*Gamxyz[i] + gyz[i]*Gamyyz[i] + (dzz[i] + ONE)*Gamzyz[i];
|
||||||
gzzz[i] = gxz[i]*Gamxzz[i] + gyz[i]*Gamyzz[i] + gzz[i]*Gamzzz[i];
|
gzzz[i] = gxz[i]*Gamxzz[i] + gyz[i]*Gamyzz[i] + (dzz[i] + ONE)*Gamzzz[i];
|
||||||
}
|
}
|
||||||
// 22.2ms //
|
// 22.2ms //
|
||||||
fdderivs(ex,dxx,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev);
|
fdderivs(ex,dxx,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev);
|
||||||
@@ -471,10 +430,17 @@ int f_compute_rhs_bssn(int *ex, double &T,
|
|||||||
// 14ms //
|
// 14ms //
|
||||||
/* 假设 all = ex1*ex2*ex3,所有量都是 length=all 的 double 数组(已按同一扁平化规则排布) */
|
/* 假设 all = ex1*ex2*ex3,所有量都是 length=all 的 double 数组(已按同一扁平化规则排布) */
|
||||||
for (int i = 0; i < all; i += 1) {
|
for (int i = 0; i < all; i += 1) {
|
||||||
|
const double gxa = gupxx[i]*Gamxxx[i] + gupyy[i]*Gamxyy[i] + gupzz[i]*Gamxzz[i]
|
||||||
|
+ TWO * ( gupxy[i]*Gamxxy[i] + gupxz[i]*Gamxxz[i] + gupyz[i]*Gamxyz[i] );
|
||||||
|
const double gya = gupxx[i]*Gamyxx[i] + gupyy[i]*Gamyyy[i] + gupzz[i]*Gamyzz[i]
|
||||||
|
+ TWO * ( gupxy[i]*Gamyxy[i] + gupxz[i]*Gamyxz[i] + gupyz[i]*Gamyyz[i] );
|
||||||
|
const double gza = gupxx[i]*Gamzxx[i] + gupyy[i]*Gamzyy[i] + gupzz[i]*Gamzzz[i]
|
||||||
|
+ TWO * ( gupxy[i]*Gamzxy[i] + gupxz[i]*Gamzxz[i] + gupyz[i]*Gamzyz[i] );
|
||||||
|
|
||||||
Rxx[i] =
|
Rxx[i] =
|
||||||
-HALF * Rxx[i]
|
-HALF * Rxx[i]
|
||||||
+ gxx[i] * Gamxx[i] + gxy[i] * Gamyx[i] + gxz[i] * Gamzx[i]
|
+ (dxx[i] + ONE) * Gamxx[i] + gxy[i] * Gamyx[i] + gxz[i] * Gamzx[i]
|
||||||
+ Gamxa[i] * gxxx[i] + Gamya[i] * gxyx[i] + Gamza[i] * gxzx[i]
|
+ gxa * gxxx[i] + gya * gxyx[i] + gza * gxzx[i]
|
||||||
+ gupxx[i] * (
|
+ gupxx[i] * (
|
||||||
TWO * (Gamxxx[i] * gxxx[i] + Gamyxx[i] * gxyx[i] + Gamzxx[i] * gxzx[i]) +
|
TWO * (Gamxxx[i] * gxxx[i] + Gamyxx[i] * gxyx[i] + Gamzxx[i] * gxzx[i]) +
|
||||||
(Gamxxx[i] * gxxx[i] + Gamyxx[i] * gxxy[i] + Gamzxx[i] * gxxz[i])
|
(Gamxxx[i] * gxxx[i] + Gamyxx[i] * gxxy[i] + Gamzxx[i] * gxxz[i])
|
||||||
@@ -508,8 +474,8 @@ int f_compute_rhs_bssn(int *ex, double &T,
|
|||||||
|
|
||||||
Ryy[i] =
|
Ryy[i] =
|
||||||
-HALF * Ryy[i]
|
-HALF * Ryy[i]
|
||||||
+ gxy[i] * Gamxy[i] + gyy[i] * Gamyy[i] + gyz[i] * Gamzy[i]
|
+ gxy[i] * Gamxy[i] + (dyy[i] + ONE) * Gamyy[i] + gyz[i] * Gamzy[i]
|
||||||
+ Gamxa[i] * gxyy[i] + Gamya[i] * gyyy[i] + Gamza[i] * gyzy[i]
|
+ gxa * gxyy[i] + gya * gyyy[i] + gza * gyzy[i]
|
||||||
+ gupxx[i] * (
|
+ gupxx[i] * (
|
||||||
TWO * (Gamxxy[i] * gxxy[i] + Gamyxy[i] * gxyy[i] + Gamzxy[i] * gxzy[i]) +
|
TWO * (Gamxxy[i] * gxxy[i] + Gamyxy[i] * gxyy[i] + Gamzxy[i] * gxzy[i]) +
|
||||||
(Gamxxy[i] * gxyx[i] + Gamyxy[i] * gxyy[i] + Gamzxy[i] * gxyz[i])
|
(Gamxxy[i] * gxyx[i] + Gamyxy[i] * gxyy[i] + Gamzxy[i] * gxyz[i])
|
||||||
@@ -543,8 +509,8 @@ int f_compute_rhs_bssn(int *ex, double &T,
|
|||||||
|
|
||||||
Rzz[i] =
|
Rzz[i] =
|
||||||
-HALF * Rzz[i]
|
-HALF * Rzz[i]
|
||||||
+ gxz[i] * Gamxz[i] + gyz[i] * Gamyz[i] + gzz[i] * Gamzz[i]
|
+ gxz[i] * Gamxz[i] + gyz[i] * Gamyz[i] + (dzz[i] + ONE) * Gamzz[i]
|
||||||
+ Gamxa[i] * gxzz[i] + Gamya[i] * gyzz[i] + Gamza[i] * gzzz[i]
|
+ gxa * gxzz[i] + gya * gyzz[i] + gza * gzzz[i]
|
||||||
+ gupxx[i] * (
|
+ gupxx[i] * (
|
||||||
TWO * (Gamxxz[i] * gxxz[i] + Gamyxz[i] * gxyz[i] + Gamzxz[i] * gxzz[i]) +
|
TWO * (Gamxxz[i] * gxxz[i] + Gamyxz[i] * gxyz[i] + Gamzxz[i] * gxzz[i]) +
|
||||||
(Gamxxz[i] * gxzx[i] + Gamyxz[i] * gxzy[i] + Gamzxz[i] * gxzz[i])
|
(Gamxxz[i] * gxzx[i] + Gamyxz[i] * gxzy[i] + Gamzxz[i] * gxzz[i])
|
||||||
@@ -579,10 +545,10 @@ int f_compute_rhs_bssn(int *ex, double &T,
|
|||||||
Rxy[i] =
|
Rxy[i] =
|
||||||
HALF * (
|
HALF * (
|
||||||
-Rxy[i]
|
-Rxy[i]
|
||||||
+ gxx[i] * Gamxy[i] + gxy[i] * Gamyy[i] + gxz[i] * Gamzy[i]
|
+ (dxx[i] + ONE) * Gamxy[i] + gxy[i] * Gamyy[i] + gxz[i] * Gamzy[i]
|
||||||
+ gxy[i] * Gamxx[i] + gyy[i] * Gamyx[i] + gyz[i] * Gamzx[i]
|
+ gxy[i] * Gamxx[i] + (dyy[i] + ONE) * Gamyx[i] + gyz[i] * Gamzx[i]
|
||||||
+ Gamxa[i] * gxyx[i] + Gamya[i] * gyyx[i] + Gamza[i] * gyzx[i]
|
+ gxa * gxyx[i] + gya * gyyx[i] + gza * gyzx[i]
|
||||||
+ Gamxa[i] * gxxy[i] + Gamya[i] * gxyy[i] + Gamza[i] * gxzy[i]
|
+ gxa * gxxy[i] + gya * gxyy[i] + gza * gxzy[i]
|
||||||
)
|
)
|
||||||
+ gupxx[i] * (
|
+ gupxx[i] * (
|
||||||
Gamxxx[i] * gxxy[i] + Gamyxx[i] * gxyy[i] + Gamzxx[i] * gxzy[i]
|
Gamxxx[i] * gxxy[i] + Gamyxx[i] * gxyy[i] + Gamzxx[i] * gxzy[i]
|
||||||
@@ -627,10 +593,10 @@ int f_compute_rhs_bssn(int *ex, double &T,
|
|||||||
Rxz[i] =
|
Rxz[i] =
|
||||||
HALF * (
|
HALF * (
|
||||||
-Rxz[i]
|
-Rxz[i]
|
||||||
+ gxx[i] * Gamxz[i] + gxy[i] * Gamyz[i] + gxz[i] * Gamzz[i]
|
+ (dxx[i] + ONE) * Gamxz[i] + gxy[i] * Gamyz[i] + gxz[i] * Gamzz[i]
|
||||||
+ gxz[i] * Gamxx[i] + gyz[i] * Gamyx[i] + gzz[i] * Gamzx[i]
|
+ gxz[i] * Gamxx[i] + gyz[i] * Gamyx[i] + (dzz[i] + ONE) * Gamzx[i]
|
||||||
+ Gamxa[i] * gxzx[i] + Gamya[i] * gyzx[i] + Gamza[i] * gzzx[i]
|
+ gxa * gxzx[i] + gya * gyzx[i] + gza * gzzx[i]
|
||||||
+ Gamxa[i] * gxxz[i] + Gamya[i] * gxyz[i] + Gamza[i] * gxzz[i]
|
+ gxa * gxxz[i] + gya * gxyz[i] + gza * gxzz[i]
|
||||||
)
|
)
|
||||||
+ gupxx[i] * (
|
+ gupxx[i] * (
|
||||||
Gamxxx[i] * gxxz[i] + Gamyxx[i] * gxyz[i] + Gamzxx[i] * gxzz[i]
|
Gamxxx[i] * gxxz[i] + Gamyxx[i] * gxyz[i] + Gamzxx[i] * gxzz[i]
|
||||||
@@ -675,10 +641,10 @@ int f_compute_rhs_bssn(int *ex, double &T,
|
|||||||
Ryz[i] =
|
Ryz[i] =
|
||||||
HALF * (
|
HALF * (
|
||||||
-Ryz[i]
|
-Ryz[i]
|
||||||
+ gxy[i] * Gamxz[i] + gyy[i] * Gamyz[i] + gyz[i] * Gamzz[i]
|
+ gxy[i] * Gamxz[i] + (dyy[i] + ONE) * Gamyz[i] + gyz[i] * Gamzz[i]
|
||||||
+ gxz[i] * Gamxy[i] + gyz[i] * Gamyy[i] + gzz[i] * Gamzy[i]
|
+ gxz[i] * Gamxy[i] + gyz[i] * Gamyy[i] + (dzz[i] + ONE) * Gamzy[i]
|
||||||
+ Gamxa[i] * gxzy[i] + Gamya[i] * gyzy[i] + Gamza[i] * gzzy[i]
|
+ gxa * gxzy[i] + gya * gyzy[i] + gza * gzzy[i]
|
||||||
+ Gamxa[i] * gxyz[i] + Gamya[i] * gyyz[i] + Gamza[i] * gyzz[i]
|
+ gxa * gxyz[i] + gya * gyyz[i] + gza * gyzz[i]
|
||||||
)
|
)
|
||||||
+ gupxx[i] * (
|
+ gupxx[i] * (
|
||||||
Gamxxy[i] * gxxz[i] + Gamyxy[i] * gxyz[i] + Gamzxy[i] * gxzz[i]
|
Gamxxy[i] * gxxz[i] + Gamyxy[i] * gxyz[i] + Gamzxy[i] * gxzz[i]
|
||||||
@@ -739,9 +705,9 @@ int f_compute_rhs_bssn(int *ex, double &T,
|
|||||||
+ TWO * gupxy[i] * (fxy[i] - (F3o2 / chin1[i]) * chix[i] * chiy[i])
|
+ TWO * gupxy[i] * (fxy[i] - (F3o2 / chin1[i]) * chix[i] * chiy[i])
|
||||||
+ TWO * gupxz[i] * (fxz[i] - (F3o2 / chin1[i]) * chix[i] * chiz[i])
|
+ TWO * gupxz[i] * (fxz[i] - (F3o2 / chin1[i]) * chix[i] * chiz[i])
|
||||||
+ TWO * gupyz[i] * (fyz[i] - (F3o2 / chin1[i]) * chiy[i] * chiz[i]);
|
+ TWO * gupyz[i] * (fyz[i] - (F3o2 / chin1[i]) * chiy[i] * chiz[i]);
|
||||||
Rxx[i] = Rxx[i] + ( fxx[i] - (chix[i] * chix[i]) / (chin1[i] * TWO) + gxx[i] * f[i] ) / (chin1[i] * TWO);
|
Rxx[i] = Rxx[i] + ( fxx[i] - (chix[i] * chix[i]) / (chin1[i] * TWO) + (dxx[i] + ONE) * f[i] ) / (chin1[i] * TWO);
|
||||||
Ryy[i] = Ryy[i] + ( fyy[i] - (chiy[i] * chiy[i]) / (chin1[i] * TWO) + gyy[i] * f[i] ) / (chin1[i] * TWO);
|
Ryy[i] = Ryy[i] + ( fyy[i] - (chiy[i] * chiy[i]) / (chin1[i] * TWO) + (dyy[i] + ONE) * f[i] ) / (chin1[i] * TWO);
|
||||||
Rzz[i] = Rzz[i] + ( fzz[i] - (chiz[i] * chiz[i]) / (chin1[i] * TWO) + gzz[i] * f[i] ) / (chin1[i] * TWO);
|
Rzz[i] = Rzz[i] + ( fzz[i] - (chiz[i] * chiz[i]) / (chin1[i] * TWO) + (dzz[i] + ONE) * f[i] ) / (chin1[i] * TWO);
|
||||||
|
|
||||||
Rxy[i] = Rxy[i] + ( fxy[i] - (chix[i] * chiy[i]) / (chin1[i] * TWO) + gxy[i] * f[i] ) / (chin1[i] * TWO);
|
Rxy[i] = Rxy[i] + ( fxy[i] - (chix[i] * chiy[i]) / (chin1[i] * TWO) + gxy[i] * f[i] ) / (chin1[i] * TWO);
|
||||||
Rxz[i] = Rxz[i] + ( fxz[i] - (chix[i] * chiz[i]) / (chin1[i] * TWO) + gxz[i] * f[i] ) / (chin1[i] * TWO);
|
Rxz[i] = Rxz[i] + ( fxz[i] - (chix[i] * chiz[i]) / (chin1[i] * TWO) + gxz[i] * f[i] ) / (chin1[i] * TWO);
|
||||||
@@ -760,17 +726,17 @@ int f_compute_rhs_bssn(int *ex, double &T,
|
|||||||
gxxz[i] = (gupxz[i] * chix[i] + gupyz[i] * chiy[i] + gupzz[i] * chiz[i]) / chin1[i];
|
gxxz[i] = (gupxz[i] * chix[i] + gupyz[i] * chiy[i] + gupzz[i] * chiz[i]) / chin1[i];
|
||||||
|
|
||||||
/* Christoffel 修正项 */
|
/* Christoffel 修正项 */
|
||||||
Gamxxx[i] = Gamxxx[i] - ( ((chix[i] + chix[i]) / chin1[i]) - gxx[i] * gxxx[i] ) * HALF;
|
Gamxxx[i] = Gamxxx[i] - ( ((chix[i] + chix[i]) / chin1[i]) - (dxx[i] + ONE) * gxxx[i] ) * HALF;
|
||||||
Gamyxx[i] = Gamyxx[i] - ( 0.0 - gxx[i] * gxxy[i] ) * HALF; /* 原式只有 -gxx*gxxy */
|
Gamyxx[i] = Gamyxx[i] - ( 0.0 - (dxx[i] + ONE) * gxxy[i] ) * HALF; /* 原式只有 -gxx*gxxy */
|
||||||
Gamzxx[i] = Gamzxx[i] - ( 0.0 - gxx[i] * gxxz[i] ) * HALF;
|
Gamzxx[i] = Gamzxx[i] - ( 0.0 - (dxx[i] + ONE) * gxxz[i] ) * HALF;
|
||||||
|
|
||||||
Gamxyy[i] = Gamxyy[i] - ( 0.0 - gyy[i] * gxxx[i] ) * HALF;
|
Gamxyy[i] = Gamxyy[i] - ( 0.0 - (dyy[i] + ONE) * gxxx[i] ) * HALF;
|
||||||
Gamyyy[i] = Gamyyy[i] - ( ((chiy[i] + chiy[i]) / chin1[i]) - gyy[i] * gxxy[i] ) * HALF;
|
Gamyyy[i] = Gamyyy[i] - ( ((chiy[i] + chiy[i]) / chin1[i]) - (dyy[i] + ONE) * gxxy[i] ) * HALF;
|
||||||
Gamzyy[i] = Gamzyy[i] - ( 0.0 - gyy[i] * gxxz[i] ) * HALF;
|
Gamzyy[i] = Gamzyy[i] - ( 0.0 - (dyy[i] + ONE) * gxxz[i] ) * HALF;
|
||||||
|
|
||||||
Gamxzz[i] = Gamxzz[i] - ( 0.0 - gzz[i] * gxxx[i] ) * HALF;
|
Gamxzz[i] = Gamxzz[i] - ( 0.0 - (dzz[i] + ONE) * gxxx[i] ) * HALF;
|
||||||
Gamyzz[i] = Gamyzz[i] - ( 0.0 - gzz[i] * gxxy[i] ) * HALF;
|
Gamyzz[i] = Gamyzz[i] - ( 0.0 - (dzz[i] + ONE) * gxxy[i] ) * HALF;
|
||||||
Gamzzz[i] = Gamzzz[i] - ( ((chiz[i] + chiz[i]) / chin1[i]) - gzz[i] * gxxz[i] ) * HALF;
|
Gamzzz[i] = Gamzzz[i] - ( ((chiz[i] + chiz[i]) / chin1[i]) - (dzz[i] + ONE) * gxxz[i] ) * HALF;
|
||||||
|
|
||||||
Gamxxy[i] = Gamxxy[i] - ( ( chiy[i] / chin1[i]) - gxy[i] * gxxx[i] ) * HALF;
|
Gamxxy[i] = Gamxxy[i] - ( ( chiy[i] / chin1[i]) - gxy[i] * gxxx[i] ) * HALF;
|
||||||
Gamyxy[i] = Gamyxy[i] - ( ( chix[i] / chin1[i]) - gxy[i] * gxxy[i] ) * HALF;
|
Gamyxy[i] = Gamyxy[i] - ( ( chix[i] / chin1[i]) - gxy[i] * gxxy[i] ) * HALF;
|
||||||
@@ -792,14 +758,13 @@ int f_compute_rhs_bssn(int *ex, double &T,
|
|||||||
fxy[i] = fxy[i] - Gamxxy[i] * Lapx[i] - Gamyxy[i] * Lapy[i] - Gamzxy[i] * Lapz[i];
|
fxy[i] = fxy[i] - Gamxxy[i] * Lapx[i] - Gamyxy[i] * Lapy[i] - Gamzxy[i] * Lapz[i];
|
||||||
fxz[i] = fxz[i] - Gamxxz[i] * Lapx[i] - Gamyxz[i] * Lapy[i] - Gamzxz[i] * Lapz[i];
|
fxz[i] = fxz[i] - Gamxxz[i] * Lapx[i] - Gamyxz[i] * Lapy[i] - Gamzxz[i] * Lapz[i];
|
||||||
fyz[i] = fyz[i] - Gamxyz[i] * Lapx[i] - Gamyyz[i] * Lapy[i] - Gamzyz[i] * Lapz[i];
|
fyz[i] = fyz[i] - Gamxyz[i] * Lapx[i] - Gamyyz[i] * Lapy[i] - Gamzyz[i] * Lapz[i];
|
||||||
}
|
|
||||||
// 1ms //
|
|
||||||
for (int i=0;i<all;i+=1) {
|
|
||||||
trK_rhs[i] = gupxx[i] * fxx[i] + gupyy[i] * fyy[i] + gupzz[i] * fzz[i]
|
trK_rhs[i] = gupxx[i] * fxx[i] + gupyy[i] * fyy[i] + gupzz[i] * fzz[i]
|
||||||
+ TWO * ( gupxy[i] * fxy[i] + gupxz[i] * fxz[i] + gupyz[i] * fyz[i] );
|
+ TWO * ( gupxy[i] * fxy[i] + gupxz[i] * fxz[i] + gupyz[i] * fyz[i] );
|
||||||
}
|
}
|
||||||
// 2.5ms //
|
// 2.5ms //
|
||||||
for (int i=0;i<all;i+=1) {
|
for (int i=0;i<all;i+=1) {
|
||||||
|
const double divb = betaxx[i] + betayy[i] + betazz[i];
|
||||||
|
|
||||||
S[i] = chin1[i] * (
|
S[i] = chin1[i] * (
|
||||||
gupxx[i] * Sxx[i] + gupyy[i] * Syy[i] + gupzz[i] * Szz[i]
|
gupxx[i] * Sxx[i] + gupyy[i] * Syy[i] + gupzz[i] * Szz[i]
|
||||||
@@ -850,23 +815,20 @@ int f_compute_rhs_bssn(int *ex, double &T,
|
|||||||
+ (alpn1[i] / chin1[i]) * f[i]
|
+ (alpn1[i] / chin1[i]) * f[i]
|
||||||
);
|
);
|
||||||
|
|
||||||
fxx[i] = alpn1[i] * (Rxx[i] - EIGHT * PI * Sxx[i]) - fxx[i];
|
double l_fxx = alpn1[i] * (Rxx[i] - EIGHT * PI * Sxx[i]) - fxx[i];
|
||||||
fxy[i] = alpn1[i] * (Rxy[i] - EIGHT * PI * Sxy[i]) - fxy[i];
|
double l_fxy = alpn1[i] * (Rxy[i] - EIGHT * PI * Sxy[i]) - fxy[i];
|
||||||
fxz[i] = alpn1[i] * (Rxz[i] - EIGHT * PI * Sxz[i]) - fxz[i];
|
double l_fxz = alpn1[i] * (Rxz[i] - EIGHT * PI * Sxz[i]) - fxz[i];
|
||||||
fyy[i] = alpn1[i] * (Ryy[i] - EIGHT * PI * Syy[i]) - fyy[i];
|
double l_fyy = alpn1[i] * (Ryy[i] - EIGHT * PI * Syy[i]) - fyy[i];
|
||||||
fyz[i] = alpn1[i] * (Ryz[i] - EIGHT * PI * Syz[i]) - fyz[i];
|
double l_fyz = alpn1[i] * (Ryz[i] - EIGHT * PI * Syz[i]) - fyz[i];
|
||||||
fzz[i] = alpn1[i] * (Rzz[i] - EIGHT * PI * Szz[i]) - fzz[i];
|
double l_fzz = alpn1[i] * (Rzz[i] - EIGHT * PI * Szz[i]) - fzz[i];
|
||||||
}
|
|
||||||
// 8ms //
|
|
||||||
for (int i=0;i<all;i+=1) {
|
|
||||||
|
|
||||||
/* Aij_rhs = fij - gij * f */
|
/* Aij_rhs = fij - gij * f */
|
||||||
Axx_rhs[i] = fxx[i] - gxx[i] * f[i];
|
Axx_rhs[i] = l_fxx - (dxx[i] + ONE) * f[i];
|
||||||
Ayy_rhs[i] = fyy[i] - gyy[i] * f[i];
|
Ayy_rhs[i] = l_fyy - (dyy[i] + ONE) * f[i];
|
||||||
Azz_rhs[i] = fzz[i] - gzz[i] * f[i];
|
Azz_rhs[i] = l_fzz - (dzz[i] + ONE) * f[i];
|
||||||
Axy_rhs[i] = fxy[i] - gxy[i] * f[i];
|
Axy_rhs[i] = l_fxy - gxy[i] * f[i];
|
||||||
Axz_rhs[i] = fxz[i] - gxz[i] * f[i];
|
Axz_rhs[i] = l_fxz - gxz[i] * f[i];
|
||||||
Ayz_rhs[i] = fyz[i] - gyz[i] * f[i];
|
Ayz_rhs[i] = l_fyz - gyz[i] * f[i];
|
||||||
|
|
||||||
/* Now: store A_il A^l_j into fij: */
|
/* Now: store A_il A^l_j into fij: */
|
||||||
fxx[i] =
|
fxx[i] =
|
||||||
@@ -928,19 +890,19 @@ int f_compute_rhs_bssn(int *ex, double &T,
|
|||||||
f[i] * Axx_rhs[i]
|
f[i] * Axx_rhs[i]
|
||||||
+ alpn1[i] * ( trK[i] * Axx[i] - TWO * fxx[i] )
|
+ alpn1[i] * ( trK[i] * Axx[i] - TWO * fxx[i] )
|
||||||
+ TWO * ( Axx[i] * betaxx[i] + Axy[i] * betayx[i] + Axz[i] * betazx[i] )
|
+ TWO * ( Axx[i] * betaxx[i] + Axy[i] * betayx[i] + Axz[i] * betazx[i] )
|
||||||
- F2o3 * Axx[i] * div_beta[i];
|
- F2o3 * Axx[i] * divb;
|
||||||
|
|
||||||
Ayy_rhs[i] =
|
Ayy_rhs[i] =
|
||||||
f[i] * Ayy_rhs[i]
|
f[i] * Ayy_rhs[i]
|
||||||
+ alpn1[i] * ( trK[i] * Ayy[i] - TWO * fyy[i] )
|
+ alpn1[i] * ( trK[i] * Ayy[i] - TWO * fyy[i] )
|
||||||
+ TWO * ( Axy[i] * betaxy[i] + Ayy[i] * betayy[i] + Ayz[i] * betazy[i] )
|
+ TWO * ( Axy[i] * betaxy[i] + Ayy[i] * betayy[i] + Ayz[i] * betazy[i] )
|
||||||
- F2o3 * Ayy[i] * div_beta[i];
|
- F2o3 * Ayy[i] * divb;
|
||||||
|
|
||||||
Azz_rhs[i] =
|
Azz_rhs[i] =
|
||||||
f[i] * Azz_rhs[i]
|
f[i] * Azz_rhs[i]
|
||||||
+ alpn1[i] * ( trK[i] * Azz[i] - TWO * fzz[i] )
|
+ alpn1[i] * ( trK[i] * Azz[i] - TWO * fzz[i] )
|
||||||
+ TWO * ( Axz[i] * betaxz[i] + Ayz[i] * betayz[i] + Azz[i] * betazz[i] )
|
+ TWO * ( Axz[i] * betaxz[i] + Ayz[i] * betayz[i] + Azz[i] * betazz[i] )
|
||||||
- F2o3 * Azz[i] * div_beta[i];
|
- F2o3 * Azz[i] * divb;
|
||||||
|
|
||||||
Axy_rhs[i] =
|
Axy_rhs[i] =
|
||||||
f[i] * Axy_rhs[i]
|
f[i] * Axy_rhs[i]
|
||||||
@@ -949,7 +911,7 @@ int f_compute_rhs_bssn(int *ex, double &T,
|
|||||||
+ Axz[i] * betazy[i]
|
+ Axz[i] * betazy[i]
|
||||||
+ Ayy[i] * betayx[i]
|
+ Ayy[i] * betayx[i]
|
||||||
+ Ayz[i] * betazx[i]
|
+ Ayz[i] * betazx[i]
|
||||||
+ F1o3 * Axy[i] * div_beta[i]
|
+ F1o3 * Axy[i] * divb
|
||||||
- Axy[i] * betazz[i];
|
- Axy[i] * betazz[i];
|
||||||
|
|
||||||
Ayz_rhs[i] =
|
Ayz_rhs[i] =
|
||||||
@@ -959,7 +921,7 @@ int f_compute_rhs_bssn(int *ex, double &T,
|
|||||||
+ Ayy[i] * betayz[i]
|
+ Ayy[i] * betayz[i]
|
||||||
+ Axz[i] * betaxy[i]
|
+ Axz[i] * betaxy[i]
|
||||||
+ Azz[i] * betazy[i]
|
+ Azz[i] * betazy[i]
|
||||||
+ F1o3 * Ayz[i] * div_beta[i]
|
+ F1o3 * Ayz[i] * divb
|
||||||
- Ayz[i] * betaxx[i];
|
- Ayz[i] * betaxx[i];
|
||||||
|
|
||||||
Axz_rhs[i] =
|
Axz_rhs[i] =
|
||||||
@@ -969,7 +931,7 @@ int f_compute_rhs_bssn(int *ex, double &T,
|
|||||||
+ Axy[i] * betayz[i]
|
+ Axy[i] * betayz[i]
|
||||||
+ Ayz[i] * betayx[i]
|
+ Ayz[i] * betayx[i]
|
||||||
+ Azz[i] * betazx[i]
|
+ Azz[i] * betazx[i]
|
||||||
+ F1o3 * Axz[i] * div_beta[i]
|
+ F1o3 * Axz[i] * divb
|
||||||
- Axz[i] * betayy[i];
|
- Axz[i] * betayy[i];
|
||||||
|
|
||||||
/* Compute trace of S_ij */
|
/* Compute trace of S_ij */
|
||||||
@@ -1100,58 +1062,31 @@ int f_compute_rhs_bssn(int *ex, double &T,
|
|||||||
dtSfz_rhs[i] = Gamz_rhs[i] - reta[i] * dtSfz[i];
|
dtSfz_rhs[i] = Gamz_rhs[i] - reta[i] * dtSfz[i];
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
// 26ms //
|
// advection + KO dissipation with shared symmetry buffer
|
||||||
lopsided(ex,X,Y,Z,gxx,gxx_rhs,betax,betay,betaz,Symmetry,SSS);
|
lopsided_kodis(ex,X,Y,Z,dxx,gxx_rhs,betax,betay,betaz,Symmetry,SSS,eps);
|
||||||
lopsided(ex,X,Y,Z,Gamz,Gamz_rhs,betax,betay,betaz,Symmetry,SSA);
|
lopsided_kodis(ex,X,Y,Z,Gamz,Gamz_rhs,betax,betay,betaz,Symmetry,SSA,eps);
|
||||||
lopsided(ex,X,Y,Z,gxy,gxy_rhs,betax,betay,betaz,Symmetry,AAS);
|
lopsided_kodis(ex,X,Y,Z,gxy,gxy_rhs,betax,betay,betaz,Symmetry,AAS,eps);
|
||||||
lopsided(ex,X,Y,Z,Lap,Lap_rhs,betax,betay,betaz,Symmetry,SSS);
|
lopsided_kodis(ex,X,Y,Z,Lap,Lap_rhs,betax,betay,betaz,Symmetry,SSS,eps);
|
||||||
lopsided(ex,X,Y,Z,gxz,gxz_rhs,betax,betay,betaz,Symmetry,ASA);
|
lopsided_kodis(ex,X,Y,Z,gxz,gxz_rhs,betax,betay,betaz,Symmetry,ASA,eps);
|
||||||
lopsided(ex,X,Y,Z,betax,betax_rhs,betax,betay,betaz,Symmetry,ASS);
|
lopsided_kodis(ex,X,Y,Z,betax,betax_rhs,betax,betay,betaz,Symmetry,ASS,eps);
|
||||||
lopsided(ex,X,Y,Z,gyy,gyy_rhs,betax,betay,betaz,Symmetry,SSS);
|
lopsided_kodis(ex,X,Y,Z,dyy,gyy_rhs,betax,betay,betaz,Symmetry,SSS,eps);
|
||||||
lopsided(ex,X,Y,Z,betay,betay_rhs,betax,betay,betaz,Symmetry,SAS);
|
lopsided_kodis(ex,X,Y,Z,betay,betay_rhs,betax,betay,betaz,Symmetry,SAS,eps);
|
||||||
lopsided(ex,X,Y,Z,gyz,gyz_rhs,betax,betay,betaz,Symmetry,SAA);
|
lopsided_kodis(ex,X,Y,Z,gyz,gyz_rhs,betax,betay,betaz,Symmetry,SAA,eps);
|
||||||
lopsided(ex,X,Y,Z,betaz,betaz_rhs,betax,betay,betaz,Symmetry,SSA);
|
lopsided_kodis(ex,X,Y,Z,betaz,betaz_rhs,betax,betay,betaz,Symmetry,SSA,eps);
|
||||||
lopsided(ex,X,Y,Z,gzz,gzz_rhs,betax,betay,betaz,Symmetry,SSS);
|
lopsided_kodis(ex,X,Y,Z,dzz,gzz_rhs,betax,betay,betaz,Symmetry,SSS,eps);
|
||||||
lopsided(ex,X,Y,Z,dtSfx,dtSfx_rhs,betax,betay,betaz,Symmetry,ASS);
|
lopsided_kodis(ex,X,Y,Z,dtSfx,dtSfx_rhs,betax,betay,betaz,Symmetry,ASS,eps);
|
||||||
lopsided(ex,X,Y,Z,Axx,Axx_rhs,betax,betay,betaz,Symmetry,SSS);
|
lopsided_kodis(ex,X,Y,Z,Axx,Axx_rhs,betax,betay,betaz,Symmetry,SSS,eps);
|
||||||
lopsided(ex,X,Y,Z,dtSfy,dtSfy_rhs,betax,betay,betaz,Symmetry,SAS);
|
lopsided_kodis(ex,X,Y,Z,dtSfy,dtSfy_rhs,betax,betay,betaz,Symmetry,SAS,eps);
|
||||||
lopsided(ex,X,Y,Z,Axy,Axy_rhs,betax,betay,betaz,Symmetry,AAS);
|
lopsided_kodis(ex,X,Y,Z,Axy,Axy_rhs,betax,betay,betaz,Symmetry,AAS,eps);
|
||||||
lopsided(ex,X,Y,Z,dtSfz,dtSfz_rhs,betax,betay,betaz,Symmetry,SSA);
|
lopsided_kodis(ex,X,Y,Z,dtSfz,dtSfz_rhs,betax,betay,betaz,Symmetry,SSA,eps);
|
||||||
lopsided(ex,X,Y,Z,Axz,Axz_rhs,betax,betay,betaz,Symmetry,ASA);
|
lopsided_kodis(ex,X,Y,Z,Axz,Axz_rhs,betax,betay,betaz,Symmetry,ASA,eps);
|
||||||
lopsided(ex,X,Y,Z,Ayy,Ayy_rhs,betax,betay,betaz,Symmetry,SSS);
|
lopsided_kodis(ex,X,Y,Z,Ayy,Ayy_rhs,betax,betay,betaz,Symmetry,SSS,eps);
|
||||||
lopsided(ex,X,Y,Z,Ayz,Ayz_rhs,betax,betay,betaz,Symmetry,SAA);
|
lopsided_kodis(ex,X,Y,Z,Ayz,Ayz_rhs,betax,betay,betaz,Symmetry,SAA,eps);
|
||||||
lopsided(ex,X,Y,Z,Azz,Azz_rhs,betax,betay,betaz,Symmetry,SSS);
|
lopsided_kodis(ex,X,Y,Z,Azz,Azz_rhs,betax,betay,betaz,Symmetry,SSS,eps);
|
||||||
lopsided(ex,X,Y,Z,chi,chi_rhs,betax,betay,betaz,Symmetry,SSS);
|
lopsided_kodis(ex,X,Y,Z,chi,chi_rhs,betax,betay,betaz,Symmetry,SSS,eps);
|
||||||
lopsided(ex,X,Y,Z,trK,trK_rhs,betax,betay,betaz,Symmetry,SSS);
|
lopsided_kodis(ex,X,Y,Z,trK,trK_rhs,betax,betay,betaz,Symmetry,SSS,eps);
|
||||||
lopsided(ex,X,Y,Z,Gamx,Gamx_rhs,betax,betay,betaz,Symmetry,ASS);
|
lopsided_kodis(ex,X,Y,Z,Gamx,Gamx_rhs,betax,betay,betaz,Symmetry,ASS,eps);
|
||||||
lopsided(ex,X,Y,Z,Gamy,Gamy_rhs,betax,betay,betaz,Symmetry,SAS);
|
lopsided_kodis(ex,X,Y,Z,Gamy,Gamy_rhs,betax,betay,betaz,Symmetry,SAS,eps);
|
||||||
// 20ms //
|
|
||||||
if(eps>0){
|
|
||||||
kodis(ex,X,Y,Z,chi,chi_rhs,SSS,Symmetry,eps);
|
|
||||||
kodis(ex,X,Y,Z,trK,trK_rhs,SSS,Symmetry,eps);
|
|
||||||
kodis(ex,X,Y,Z,dxx,gxx_rhs,SSS,Symmetry,eps);
|
|
||||||
kodis(ex,X,Y,Z,gxy,gxy_rhs,AAS,Symmetry,eps);
|
|
||||||
kodis(ex,X,Y,Z,gxz,gxz_rhs,ASA,Symmetry,eps);
|
|
||||||
kodis(ex,X,Y,Z,dyy,gyy_rhs,SSS,Symmetry,eps);
|
|
||||||
kodis(ex,X,Y,Z,gyz,gyz_rhs,SAA,Symmetry,eps);
|
|
||||||
kodis(ex,X,Y,Z,dzz,gzz_rhs,SSS,Symmetry,eps);
|
|
||||||
kodis(ex,X,Y,Z,Axx,Axx_rhs,SSS,Symmetry,eps);
|
|
||||||
kodis(ex,X,Y,Z,dtSfz,dtSfz_rhs,SSA,Symmetry,eps);
|
|
||||||
kodis(ex,X,Y,Z,Axy,Axy_rhs,AAS,Symmetry,eps);
|
|
||||||
kodis(ex,X,Y,Z,dtSfy,dtSfy_rhs,SAS,Symmetry,eps);
|
|
||||||
kodis(ex,X,Y,Z,Axz,Axz_rhs,ASA,Symmetry,eps);
|
|
||||||
kodis(ex,X,Y,Z,dtSfx,dtSfx_rhs,ASS,Symmetry,eps);
|
|
||||||
kodis(ex,X,Y,Z,Ayy,Ayy_rhs,SSS,Symmetry,eps);
|
|
||||||
kodis(ex,X,Y,Z,betaz,betaz_rhs,SSA,Symmetry,eps);
|
|
||||||
kodis(ex,X,Y,Z,Ayz,Ayz_rhs,SAA,Symmetry,eps);
|
|
||||||
kodis(ex,X,Y,Z,betay,betay_rhs,SAS,Symmetry,eps);
|
|
||||||
kodis(ex,X,Y,Z,Azz,Azz_rhs,SSS,Symmetry,eps);
|
|
||||||
kodis(ex,X,Y,Z,betax,betax_rhs,ASS,Symmetry,eps);
|
|
||||||
kodis(ex,X,Y,Z,Gamx,Gamx_rhs,ASS,Symmetry,eps);
|
|
||||||
kodis(ex,X,Y,Z,Lap,Lap_rhs,SSS,Symmetry,eps);
|
|
||||||
kodis(ex,X,Y,Z,Gamy,Gamy_rhs,SAS,Symmetry,eps);
|
|
||||||
kodis(ex,X,Y,Z,Gamz,Gamz_rhs,SSA,Symmetry,eps);
|
|
||||||
}
|
|
||||||
// 2ms //
|
// 2ms //
|
||||||
if(co==0){
|
if(co==0){
|
||||||
for (int i=0;i<all;i+=1) {
|
for (int i=0;i<all;i+=1) {
|
||||||
|
|||||||
@@ -71,149 +71,99 @@ void fdderivs(const int ex[3],
|
|||||||
const double Fdxdz = F1o144 / (dX * dZ);
|
const double Fdxdz = F1o144 / (dX * dZ);
|
||||||
const double Fdydz = F1o144 / (dY * dZ);
|
const double Fdydz = F1o144 / (dY * dZ);
|
||||||
|
|
||||||
/* 输出清零:fxx,fyy,fzz,fxy,fxz,fyz = 0 */
|
/* 只清零不被主循环覆盖的边界面 */
|
||||||
const size_t all = (size_t)ex1 * (size_t)ex2 * (size_t)ex3;
|
{
|
||||||
for (size_t p = 0; p < all; ++p) {
|
/* 高边界:k0=ex3-1 */
|
||||||
fxx[p] = ZEO; fyy[p] = ZEO; fzz[p] = ZEO;
|
for (int j0 = 0; j0 < ex2; ++j0)
|
||||||
fxy[p] = ZEO; fxz[p] = ZEO; fyz[p] = ZEO;
|
for (int i0 = 0; i0 < ex1; ++i0) {
|
||||||
|
const size_t p = idx_ex(i0, j0, ex3 - 1, ex);
|
||||||
|
fxx[p]=ZEO; fyy[p]=ZEO; fzz[p]=ZEO;
|
||||||
|
fxy[p]=ZEO; fxz[p]=ZEO; fyz[p]=ZEO;
|
||||||
|
}
|
||||||
|
/* 高边界:j0=ex2-1 */
|
||||||
|
for (int k0 = 0; k0 < ex3 - 1; ++k0)
|
||||||
|
for (int i0 = 0; i0 < ex1; ++i0) {
|
||||||
|
const size_t p = idx_ex(i0, ex2 - 1, k0, ex);
|
||||||
|
fxx[p]=ZEO; fyy[p]=ZEO; fzz[p]=ZEO;
|
||||||
|
fxy[p]=ZEO; fxz[p]=ZEO; fyz[p]=ZEO;
|
||||||
|
}
|
||||||
|
/* 高边界:i0=ex1-1 */
|
||||||
|
for (int k0 = 0; k0 < ex3 - 1; ++k0)
|
||||||
|
for (int j0 = 0; j0 < ex2 - 1; ++j0) {
|
||||||
|
const size_t p = idx_ex(ex1 - 1, j0, k0, ex);
|
||||||
|
fxx[p]=ZEO; fyy[p]=ZEO; fzz[p]=ZEO;
|
||||||
|
fxy[p]=ZEO; fxz[p]=ZEO; fyz[p]=ZEO;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* 低边界:当二阶模板也不可用时,对应 i0/j0/k0=0 面 */
|
||||||
|
if (kminF == 1) {
|
||||||
|
for (int j0 = 0; j0 < ex2; ++j0)
|
||||||
|
for (int i0 = 0; i0 < ex1; ++i0) {
|
||||||
|
const size_t p = idx_ex(i0, j0, 0, ex);
|
||||||
|
fxx[p]=ZEO; fyy[p]=ZEO; fzz[p]=ZEO;
|
||||||
|
fxy[p]=ZEO; fxz[p]=ZEO; fyz[p]=ZEO;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (jminF == 1) {
|
||||||
|
for (int k0 = 0; k0 < ex3; ++k0)
|
||||||
|
for (int i0 = 0; i0 < ex1; ++i0) {
|
||||||
|
const size_t p = idx_ex(i0, 0, k0, ex);
|
||||||
|
fxx[p]=ZEO; fyy[p]=ZEO; fzz[p]=ZEO;
|
||||||
|
fxy[p]=ZEO; fxz[p]=ZEO; fyz[p]=ZEO;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (iminF == 1) {
|
||||||
|
for (int k0 = 0; k0 < ex3; ++k0)
|
||||||
|
for (int j0 = 0; j0 < ex2; ++j0) {
|
||||||
|
const size_t p = idx_ex(0, j0, k0, ex);
|
||||||
|
fxx[p]=ZEO; fyy[p]=ZEO; fzz[p]=ZEO;
|
||||||
|
fxy[p]=ZEO; fxz[p]=ZEO; fyz[p]=ZEO;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Fortran:
|
* 两段式:
|
||||||
* do k=1,ex3-1
|
* 1) 二阶可用区域先计算二阶模板
|
||||||
* do j=1,ex2-1
|
* 2) 高阶可用区域再覆盖四阶模板
|
||||||
* do i=1,ex1-1
|
|
||||||
*/
|
*/
|
||||||
|
const int i2_lo = (iminF > 0) ? iminF : 0;
|
||||||
|
const int j2_lo = (jminF > 0) ? jminF : 0;
|
||||||
|
const int k2_lo = (kminF > 0) ? kminF : 0;
|
||||||
|
const int i2_hi = ex1 - 2;
|
||||||
|
const int j2_hi = ex2 - 2;
|
||||||
|
const int k2_hi = ex3 - 2;
|
||||||
|
|
||||||
for (int k0 = 0; k0 <= ex3 - 2; ++k0) {
|
const int i4_lo = (iminF + 1 > 0) ? (iminF + 1) : 0;
|
||||||
const int kF = k0 + 1;
|
const int j4_lo = (jminF + 1 > 0) ? (jminF + 1) : 0;
|
||||||
for (int j0 = 0; j0 <= ex2 - 2; ++j0) {
|
const int k4_lo = (kminF + 1 > 0) ? (kminF + 1) : 0;
|
||||||
const int jF = j0 + 1;
|
const int i4_hi = ex1 - 3;
|
||||||
for (int i0 = 0; i0 <= ex1 - 2; ++i0) {
|
const int j4_hi = ex2 - 3;
|
||||||
const int iF = i0 + 1;
|
const int k4_hi = ex3 - 3;
|
||||||
const size_t p = idx_ex(i0, j0, k0, ex);
|
|
||||||
|
|
||||||
/* 高阶分支:i±2,j±2,k±2 都在范围内 */
|
/*
|
||||||
if ((iF + 2) <= imaxF && (iF - 2) >= iminF &&
|
* Strategy A:
|
||||||
(jF + 2) <= jmaxF && (jF - 2) >= jminF &&
|
* Avoid redundant work in overlap of 2nd/4th-order regions.
|
||||||
(kF + 2) <= kmaxF && (kF - 2) >= kminF)
|
* Only compute 2nd-order on shell points that are NOT overwritten by
|
||||||
{
|
* the 4th-order pass.
|
||||||
fxx[p] = Fdxdx * (
|
*/
|
||||||
-fh[idx_fh_F_ord2(iF - 2, jF, kF, ex)] +
|
const int has4 = (i4_lo <= i4_hi && j4_lo <= j4_hi && k4_lo <= k4_hi);
|
||||||
F16 * fh[idx_fh_F_ord2(iF - 1, jF, kF, ex)] -
|
|
||||||
F30 * fh[idx_fh_F_ord2(iF, jF, kF, ex)] -
|
|
||||||
fh[idx_fh_F_ord2(iF + 2, jF, kF, ex)] +
|
|
||||||
F16 * fh[idx_fh_F_ord2(iF + 1, jF, kF, ex)]
|
|
||||||
);
|
|
||||||
|
|
||||||
fyy[p] = Fdydy * (
|
if (i2_lo <= i2_hi && j2_lo <= j2_hi && k2_lo <= k2_hi) {
|
||||||
-fh[idx_fh_F_ord2(iF, jF - 2, kF, ex)] +
|
for (int k0 = k2_lo; k0 <= k2_hi; ++k0) {
|
||||||
F16 * fh[idx_fh_F_ord2(iF, jF - 1, kF, ex)] -
|
const int kF = k0 + 1;
|
||||||
F30 * fh[idx_fh_F_ord2(iF, jF, kF, ex)] -
|
for (int j0 = j2_lo; j0 <= j2_hi; ++j0) {
|
||||||
fh[idx_fh_F_ord2(iF, jF + 2, kF, ex)] +
|
const int jF = j0 + 1;
|
||||||
F16 * fh[idx_fh_F_ord2(iF, jF + 1, kF, ex)]
|
for (int i0 = i2_lo; i0 <= i2_hi; ++i0) {
|
||||||
);
|
if (has4 &&
|
||||||
|
i0 >= i4_lo && i0 <= i4_hi &&
|
||||||
fzz[p] = Fdzdz * (
|
j0 >= j4_lo && j0 <= j4_hi &&
|
||||||
-fh[idx_fh_F_ord2(iF, jF, kF - 2, ex)] +
|
k0 >= k4_lo && k0 <= k4_hi) {
|
||||||
F16 * fh[idx_fh_F_ord2(iF, jF, kF - 1, ex)] -
|
continue;
|
||||||
F30 * fh[idx_fh_F_ord2(iF, jF, kF, ex)] -
|
|
||||||
fh[idx_fh_F_ord2(iF, jF, kF + 2, ex)] +
|
|
||||||
F16 * fh[idx_fh_F_ord2(iF, jF, kF + 1, ex)]
|
|
||||||
);
|
|
||||||
|
|
||||||
/* fxy 高阶:完全照搬 Fortran 的括号结构 */
|
|
||||||
{
|
|
||||||
const double t_jm2 =
|
|
||||||
( fh[idx_fh_F_ord2(iF - 2, jF - 2, kF, ex)]
|
|
||||||
-F8*fh[idx_fh_F_ord2(iF - 1, jF - 2, kF, ex)]
|
|
||||||
+F8*fh[idx_fh_F_ord2(iF + 1, jF - 2, kF, ex)]
|
|
||||||
- fh[idx_fh_F_ord2(iF + 2, jF - 2, kF, ex)] );
|
|
||||||
|
|
||||||
const double t_jm1 =
|
|
||||||
( fh[idx_fh_F_ord2(iF - 2, jF - 1, kF, ex)]
|
|
||||||
-F8*fh[idx_fh_F_ord2(iF - 1, jF - 1, kF, ex)]
|
|
||||||
+F8*fh[idx_fh_F_ord2(iF + 1, jF - 1, kF, ex)]
|
|
||||||
- fh[idx_fh_F_ord2(iF + 2, jF - 1, kF, ex)] );
|
|
||||||
|
|
||||||
const double t_jp1 =
|
|
||||||
( fh[idx_fh_F_ord2(iF - 2, jF + 1, kF, ex)]
|
|
||||||
-F8*fh[idx_fh_F_ord2(iF - 1, jF + 1, kF, ex)]
|
|
||||||
+F8*fh[idx_fh_F_ord2(iF + 1, jF + 1, kF, ex)]
|
|
||||||
- fh[idx_fh_F_ord2(iF + 2, jF + 1, kF, ex)] );
|
|
||||||
|
|
||||||
const double t_jp2 =
|
|
||||||
( fh[idx_fh_F_ord2(iF - 2, jF + 2, kF, ex)]
|
|
||||||
-F8*fh[idx_fh_F_ord2(iF - 1, jF + 2, kF, ex)]
|
|
||||||
+F8*fh[idx_fh_F_ord2(iF + 1, jF + 2, kF, ex)]
|
|
||||||
- fh[idx_fh_F_ord2(iF + 2, jF + 2, kF, ex)] );
|
|
||||||
|
|
||||||
fxy[p] = Fdxdy * ( t_jm2 - F8 * t_jm1 + F8 * t_jp1 - t_jp2 );
|
|
||||||
}
|
}
|
||||||
|
const int iF = i0 + 1;
|
||||||
|
const size_t p = idx_ex(i0, j0, k0, ex);
|
||||||
|
|
||||||
/* fxz 高阶 */
|
|
||||||
{
|
|
||||||
const double t_km2 =
|
|
||||||
( fh[idx_fh_F_ord2(iF - 2, jF, kF - 2, ex)]
|
|
||||||
-F8*fh[idx_fh_F_ord2(iF - 1, jF, kF - 2, ex)]
|
|
||||||
+F8*fh[idx_fh_F_ord2(iF + 1, jF, kF - 2, ex)]
|
|
||||||
- fh[idx_fh_F_ord2(iF + 2, jF, kF - 2, ex)] );
|
|
||||||
|
|
||||||
const double t_km1 =
|
|
||||||
( fh[idx_fh_F_ord2(iF - 2, jF, kF - 1, ex)]
|
|
||||||
-F8*fh[idx_fh_F_ord2(iF - 1, jF, kF - 1, ex)]
|
|
||||||
+F8*fh[idx_fh_F_ord2(iF + 1, jF, kF - 1, ex)]
|
|
||||||
- fh[idx_fh_F_ord2(iF + 2, jF, kF - 1, ex)] );
|
|
||||||
|
|
||||||
const double t_kp1 =
|
|
||||||
( fh[idx_fh_F_ord2(iF - 2, jF, kF + 1, ex)]
|
|
||||||
-F8*fh[idx_fh_F_ord2(iF - 1, jF, kF + 1, ex)]
|
|
||||||
+F8*fh[idx_fh_F_ord2(iF + 1, jF, kF + 1, ex)]
|
|
||||||
- fh[idx_fh_F_ord2(iF + 2, jF, kF + 1, ex)] );
|
|
||||||
|
|
||||||
const double t_kp2 =
|
|
||||||
( fh[idx_fh_F_ord2(iF - 2, jF, kF + 2, ex)]
|
|
||||||
-F8*fh[idx_fh_F_ord2(iF - 1, jF, kF + 2, ex)]
|
|
||||||
+F8*fh[idx_fh_F_ord2(iF + 1, jF, kF + 2, ex)]
|
|
||||||
- fh[idx_fh_F_ord2(iF + 2, jF, kF + 2, ex)] );
|
|
||||||
|
|
||||||
fxz[p] = Fdxdz * ( t_km2 - F8 * t_km1 + F8 * t_kp1 - t_kp2 );
|
|
||||||
}
|
|
||||||
|
|
||||||
/* fyz 高阶 */
|
|
||||||
{
|
|
||||||
const double t_km2 =
|
|
||||||
( fh[idx_fh_F_ord2(iF, jF - 2, kF - 2, ex)]
|
|
||||||
-F8*fh[idx_fh_F_ord2(iF, jF - 1, kF - 2, ex)]
|
|
||||||
+F8*fh[idx_fh_F_ord2(iF, jF + 1, kF - 2, ex)]
|
|
||||||
- fh[idx_fh_F_ord2(iF, jF + 2, kF - 2, ex)] );
|
|
||||||
|
|
||||||
const double t_km1 =
|
|
||||||
( fh[idx_fh_F_ord2(iF, jF - 2, kF - 1, ex)]
|
|
||||||
-F8*fh[idx_fh_F_ord2(iF, jF - 1, kF - 1, ex)]
|
|
||||||
+F8*fh[idx_fh_F_ord2(iF, jF + 1, kF - 1, ex)]
|
|
||||||
- fh[idx_fh_F_ord2(iF, jF + 2, kF - 1, ex)] );
|
|
||||||
|
|
||||||
const double t_kp1 =
|
|
||||||
( fh[idx_fh_F_ord2(iF, jF - 2, kF + 1, ex)]
|
|
||||||
-F8*fh[idx_fh_F_ord2(iF, jF - 1, kF + 1, ex)]
|
|
||||||
+F8*fh[idx_fh_F_ord2(iF, jF + 1, kF + 1, ex)]
|
|
||||||
- fh[idx_fh_F_ord2(iF, jF + 2, kF + 1, ex)] );
|
|
||||||
|
|
||||||
const double t_kp2 =
|
|
||||||
( fh[idx_fh_F_ord2(iF, jF - 2, kF + 2, ex)]
|
|
||||||
-F8*fh[idx_fh_F_ord2(iF, jF - 1, kF + 2, ex)]
|
|
||||||
+F8*fh[idx_fh_F_ord2(iF, jF + 1, kF + 2, ex)]
|
|
||||||
- fh[idx_fh_F_ord2(iF, jF + 2, kF + 2, ex)] );
|
|
||||||
|
|
||||||
fyz[p] = Fdydz * ( t_km2 - F8 * t_km1 + F8 * t_kp1 - t_kp2 );
|
|
||||||
}
|
|
||||||
}
|
|
||||||
/* 二阶分支:i±1,j±1,k±1 在范围内 */
|
|
||||||
else if ((iF + 1) <= imaxF && (iF - 1) >= iminF &&
|
|
||||||
(jF + 1) <= jmaxF && (jF - 1) >= jminF &&
|
|
||||||
(kF + 1) <= kmaxF && (kF - 1) >= kminF)
|
|
||||||
{
|
|
||||||
fxx[p] = Sdxdx * (
|
fxx[p] = Sdxdx * (
|
||||||
fh[idx_fh_F_ord2(iF - 1, jF, kF, ex)] -
|
fh[idx_fh_F_ord2(iF - 1, jF, kF, ex)] -
|
||||||
TWO * fh[idx_fh_F_ord2(iF, jF, kF, ex)] +
|
TWO * fh[idx_fh_F_ord2(iF, jF, kF, ex)] +
|
||||||
@@ -252,13 +202,127 @@ void fdderivs(const int ex[3],
|
|||||||
fh[idx_fh_F_ord2(iF, jF - 1, kF + 1, ex)] +
|
fh[idx_fh_F_ord2(iF, jF - 1, kF + 1, ex)] +
|
||||||
fh[idx_fh_F_ord2(iF, jF + 1, kF + 1, ex)]
|
fh[idx_fh_F_ord2(iF, jF + 1, kF + 1, ex)]
|
||||||
);
|
);
|
||||||
}else{
|
}
|
||||||
fxx[p] = 0.0;
|
}
|
||||||
fyy[p] = 0.0;
|
}
|
||||||
fzz[p] = 0.0;
|
}
|
||||||
fxy[p] = 0.0;
|
|
||||||
fxz[p] = 0.0;
|
if (has4) {
|
||||||
fyz[p] = 0.0;
|
for (int k0 = k4_lo; k0 <= k4_hi; ++k0) {
|
||||||
|
const int kF = k0 + 1;
|
||||||
|
for (int j0 = j4_lo; j0 <= j4_hi; ++j0) {
|
||||||
|
const int jF = j0 + 1;
|
||||||
|
for (int i0 = i4_lo; i0 <= i4_hi; ++i0) {
|
||||||
|
const int iF = i0 + 1;
|
||||||
|
const size_t p = idx_ex(i0, j0, k0, ex);
|
||||||
|
|
||||||
|
fxx[p] = Fdxdx * (
|
||||||
|
-fh[idx_fh_F_ord2(iF - 2, jF, kF, ex)] +
|
||||||
|
F16 * fh[idx_fh_F_ord2(iF - 1, jF, kF, ex)] -
|
||||||
|
F30 * fh[idx_fh_F_ord2(iF, jF, kF, ex)] -
|
||||||
|
fh[idx_fh_F_ord2(iF + 2, jF, kF, ex)] +
|
||||||
|
F16 * fh[idx_fh_F_ord2(iF + 1, jF, kF, ex)]
|
||||||
|
);
|
||||||
|
|
||||||
|
fyy[p] = Fdydy * (
|
||||||
|
-fh[idx_fh_F_ord2(iF, jF - 2, kF, ex)] +
|
||||||
|
F16 * fh[idx_fh_F_ord2(iF, jF - 1, kF, ex)] -
|
||||||
|
F30 * fh[idx_fh_F_ord2(iF, jF, kF, ex)] -
|
||||||
|
fh[idx_fh_F_ord2(iF, jF + 2, kF, ex)] +
|
||||||
|
F16 * fh[idx_fh_F_ord2(iF, jF + 1, kF, ex)]
|
||||||
|
);
|
||||||
|
|
||||||
|
fzz[p] = Fdzdz * (
|
||||||
|
-fh[idx_fh_F_ord2(iF, jF, kF - 2, ex)] +
|
||||||
|
F16 * fh[idx_fh_F_ord2(iF, jF, kF - 1, ex)] -
|
||||||
|
F30 * fh[idx_fh_F_ord2(iF, jF, kF, ex)] -
|
||||||
|
fh[idx_fh_F_ord2(iF, jF, kF + 2, ex)] +
|
||||||
|
F16 * fh[idx_fh_F_ord2(iF, jF, kF + 1, ex)]
|
||||||
|
);
|
||||||
|
|
||||||
|
{
|
||||||
|
const double t_jm2 =
|
||||||
|
( fh[idx_fh_F_ord2(iF - 2, jF - 2, kF, ex)]
|
||||||
|
-F8*fh[idx_fh_F_ord2(iF - 1, jF - 2, kF, ex)]
|
||||||
|
+F8*fh[idx_fh_F_ord2(iF + 1, jF - 2, kF, ex)]
|
||||||
|
- fh[idx_fh_F_ord2(iF + 2, jF - 2, kF, ex)] );
|
||||||
|
|
||||||
|
const double t_jm1 =
|
||||||
|
( fh[idx_fh_F_ord2(iF - 2, jF - 1, kF, ex)]
|
||||||
|
-F8*fh[idx_fh_F_ord2(iF - 1, jF - 1, kF, ex)]
|
||||||
|
+F8*fh[idx_fh_F_ord2(iF + 1, jF - 1, kF, ex)]
|
||||||
|
- fh[idx_fh_F_ord2(iF + 2, jF - 1, kF, ex)] );
|
||||||
|
|
||||||
|
const double t_jp1 =
|
||||||
|
( fh[idx_fh_F_ord2(iF - 2, jF + 1, kF, ex)]
|
||||||
|
-F8*fh[idx_fh_F_ord2(iF - 1, jF + 1, kF, ex)]
|
||||||
|
+F8*fh[idx_fh_F_ord2(iF + 1, jF + 1, kF, ex)]
|
||||||
|
- fh[idx_fh_F_ord2(iF + 2, jF + 1, kF, ex)] );
|
||||||
|
|
||||||
|
const double t_jp2 =
|
||||||
|
( fh[idx_fh_F_ord2(iF - 2, jF + 2, kF, ex)]
|
||||||
|
-F8*fh[idx_fh_F_ord2(iF - 1, jF + 2, kF, ex)]
|
||||||
|
+F8*fh[idx_fh_F_ord2(iF + 1, jF + 2, kF, ex)]
|
||||||
|
- fh[idx_fh_F_ord2(iF + 2, jF + 2, kF, ex)] );
|
||||||
|
|
||||||
|
fxy[p] = Fdxdy * ( t_jm2 - F8 * t_jm1 + F8 * t_jp1 - t_jp2 );
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
const double t_km2 =
|
||||||
|
( fh[idx_fh_F_ord2(iF - 2, jF, kF - 2, ex)]
|
||||||
|
-F8*fh[idx_fh_F_ord2(iF - 1, jF, kF - 2, ex)]
|
||||||
|
+F8*fh[idx_fh_F_ord2(iF + 1, jF, kF - 2, ex)]
|
||||||
|
- fh[idx_fh_F_ord2(iF + 2, jF, kF - 2, ex)] );
|
||||||
|
|
||||||
|
const double t_km1 =
|
||||||
|
( fh[idx_fh_F_ord2(iF - 2, jF, kF - 1, ex)]
|
||||||
|
-F8*fh[idx_fh_F_ord2(iF - 1, jF, kF - 1, ex)]
|
||||||
|
+F8*fh[idx_fh_F_ord2(iF + 1, jF, kF - 1, ex)]
|
||||||
|
- fh[idx_fh_F_ord2(iF + 2, jF, kF - 1, ex)] );
|
||||||
|
|
||||||
|
const double t_kp1 =
|
||||||
|
( fh[idx_fh_F_ord2(iF - 2, jF, kF + 1, ex)]
|
||||||
|
-F8*fh[idx_fh_F_ord2(iF - 1, jF, kF + 1, ex)]
|
||||||
|
+F8*fh[idx_fh_F_ord2(iF + 1, jF, kF + 1, ex)]
|
||||||
|
- fh[idx_fh_F_ord2(iF + 2, jF, kF + 1, ex)] );
|
||||||
|
|
||||||
|
const double t_kp2 =
|
||||||
|
( fh[idx_fh_F_ord2(iF - 2, jF, kF + 2, ex)]
|
||||||
|
-F8*fh[idx_fh_F_ord2(iF - 1, jF, kF + 2, ex)]
|
||||||
|
+F8*fh[idx_fh_F_ord2(iF + 1, jF, kF + 2, ex)]
|
||||||
|
- fh[idx_fh_F_ord2(iF + 2, jF, kF + 2, ex)] );
|
||||||
|
|
||||||
|
fxz[p] = Fdxdz * ( t_km2 - F8 * t_km1 + F8 * t_kp1 - t_kp2 );
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
const double t_km2 =
|
||||||
|
( fh[idx_fh_F_ord2(iF, jF - 2, kF - 2, ex)]
|
||||||
|
-F8*fh[idx_fh_F_ord2(iF, jF - 1, kF - 2, ex)]
|
||||||
|
+F8*fh[idx_fh_F_ord2(iF, jF + 1, kF - 2, ex)]
|
||||||
|
- fh[idx_fh_F_ord2(iF, jF + 2, kF - 2, ex)] );
|
||||||
|
|
||||||
|
const double t_km1 =
|
||||||
|
( fh[idx_fh_F_ord2(iF, jF - 2, kF - 1, ex)]
|
||||||
|
-F8*fh[idx_fh_F_ord2(iF, jF - 1, kF - 1, ex)]
|
||||||
|
+F8*fh[idx_fh_F_ord2(iF, jF + 1, kF - 1, ex)]
|
||||||
|
- fh[idx_fh_F_ord2(iF, jF + 2, kF - 1, ex)] );
|
||||||
|
|
||||||
|
const double t_kp1 =
|
||||||
|
( fh[idx_fh_F_ord2(iF, jF - 2, kF + 1, ex)]
|
||||||
|
-F8*fh[idx_fh_F_ord2(iF, jF - 1, kF + 1, ex)]
|
||||||
|
+F8*fh[idx_fh_F_ord2(iF, jF + 1, kF + 1, ex)]
|
||||||
|
- fh[idx_fh_F_ord2(iF, jF + 2, kF + 1, ex)] );
|
||||||
|
|
||||||
|
const double t_kp2 =
|
||||||
|
( fh[idx_fh_F_ord2(iF, jF - 2, kF + 2, ex)]
|
||||||
|
-F8*fh[idx_fh_F_ord2(iF, jF - 1, kF + 2, ex)]
|
||||||
|
+F8*fh[idx_fh_F_ord2(iF, jF + 1, kF + 2, ex)]
|
||||||
|
- fh[idx_fh_F_ord2(iF, jF + 2, kF + 2, ex)] );
|
||||||
|
|
||||||
|
fyz[p] = Fdydz * ( t_km2 - F8 * t_km1 + F8 * t_kp1 - t_kp2 );
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -81,26 +81,63 @@ void fderivs(const int ex[3],
|
|||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Fortran loops:
|
* 两段式:
|
||||||
* do k=1,ex3-1
|
* 1) 先在二阶可用区域计算二阶模板
|
||||||
* do j=1,ex2-1
|
* 2) 再在高阶可用区域覆盖为四阶模板
|
||||||
* do i=1,ex1-1
|
|
||||||
*
|
*
|
||||||
* C: k0=0..ex3-2, j0=0..ex2-2, i0=0..ex1-2
|
* 与原 if/elseif 逻辑等价,但减少逐点分支判断。
|
||||||
*/
|
*/
|
||||||
for (int k0 = 0; k0 <= ex3 - 2; ++k0) {
|
const int i2_lo = (iminF > 0) ? iminF : 0;
|
||||||
const int kF = k0 + 1;
|
const int j2_lo = (jminF > 0) ? jminF : 0;
|
||||||
for (int j0 = 0; j0 <= ex2 - 2; ++j0) {
|
const int k2_lo = (kminF > 0) ? kminF : 0;
|
||||||
const int jF = j0 + 1;
|
const int i2_hi = ex1 - 2;
|
||||||
for (int i0 = 0; i0 <= ex1 - 2; ++i0) {
|
const int j2_hi = ex2 - 2;
|
||||||
const int iF = i0 + 1;
|
const int k2_hi = ex3 - 2;
|
||||||
const size_t p = idx_ex(i0, j0, k0, ex);
|
|
||||||
|
const int i4_lo = (iminF + 1 > 0) ? (iminF + 1) : 0;
|
||||||
|
const int j4_lo = (jminF + 1 > 0) ? (jminF + 1) : 0;
|
||||||
|
const int k4_lo = (kminF + 1 > 0) ? (kminF + 1) : 0;
|
||||||
|
const int i4_hi = ex1 - 3;
|
||||||
|
const int j4_hi = ex2 - 3;
|
||||||
|
const int k4_hi = ex3 - 3;
|
||||||
|
|
||||||
|
if (i2_lo <= i2_hi && j2_lo <= j2_hi && k2_lo <= k2_hi) {
|
||||||
|
for (int k0 = k2_lo; k0 <= k2_hi; ++k0) {
|
||||||
|
const int kF = k0 + 1;
|
||||||
|
for (int j0 = j2_lo; j0 <= j2_hi; ++j0) {
|
||||||
|
const int jF = j0 + 1;
|
||||||
|
for (int i0 = i2_lo; i0 <= i2_hi; ++i0) {
|
||||||
|
const int iF = i0 + 1;
|
||||||
|
const size_t p = idx_ex(i0, j0, k0, ex);
|
||||||
|
|
||||||
|
fx[p] = d2dx * (
|
||||||
|
-fh[idx_fh_F_ord2(iF - 1, jF, kF, ex)] +
|
||||||
|
fh[idx_fh_F_ord2(iF + 1, jF, kF, ex)]
|
||||||
|
);
|
||||||
|
|
||||||
|
fy[p] = d2dy * (
|
||||||
|
-fh[idx_fh_F_ord2(iF, jF - 1, kF, ex)] +
|
||||||
|
fh[idx_fh_F_ord2(iF, jF + 1, kF, ex)]
|
||||||
|
);
|
||||||
|
|
||||||
|
fz[p] = d2dz * (
|
||||||
|
-fh[idx_fh_F_ord2(iF, jF, kF - 1, ex)] +
|
||||||
|
fh[idx_fh_F_ord2(iF, jF, kF + 1, ex)]
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (i4_lo <= i4_hi && j4_lo <= j4_hi && k4_lo <= k4_hi) {
|
||||||
|
for (int k0 = k4_lo; k0 <= k4_hi; ++k0) {
|
||||||
|
const int kF = k0 + 1;
|
||||||
|
for (int j0 = j4_lo; j0 <= j4_hi; ++j0) {
|
||||||
|
const int jF = j0 + 1;
|
||||||
|
for (int i0 = i4_lo; i0 <= i4_hi; ++i0) {
|
||||||
|
const int iF = i0 + 1;
|
||||||
|
const size_t p = idx_ex(i0, j0, k0, ex);
|
||||||
|
|
||||||
// if(i+2 <= imax .and. i-2 >= imin ... ) (全是 Fortran 索引)
|
|
||||||
if ((iF + 2) <= imaxF && (iF - 2) >= iminF &&
|
|
||||||
(jF + 2) <= jmaxF && (jF - 2) >= jminF &&
|
|
||||||
(kF + 2) <= kmaxF && (kF - 2) >= kminF)
|
|
||||||
{
|
|
||||||
fx[p] = d12dx * (
|
fx[p] = d12dx * (
|
||||||
fh[idx_fh_F_ord2(iF - 2, jF, kF, ex)] -
|
fh[idx_fh_F_ord2(iF - 2, jF, kF, ex)] -
|
||||||
EIT * fh[idx_fh_F_ord2(iF - 1, jF, kF, ex)] +
|
EIT * fh[idx_fh_F_ord2(iF - 1, jF, kF, ex)] +
|
||||||
@@ -122,26 +159,6 @@ void fderivs(const int ex[3],
|
|||||||
fh[idx_fh_F_ord2(iF, jF, kF + 2, ex)]
|
fh[idx_fh_F_ord2(iF, jF, kF + 2, ex)]
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
// elseif(i+1 <= imax .and. i-1 >= imin ...)
|
|
||||||
else if ((iF + 1) <= imaxF && (iF - 1) >= iminF &&
|
|
||||||
(jF + 1) <= jmaxF && (jF - 1) >= jminF &&
|
|
||||||
(kF + 1) <= kmaxF && (kF - 1) >= kminF)
|
|
||||||
{
|
|
||||||
fx[p] = d2dx * (
|
|
||||||
-fh[idx_fh_F_ord2(iF - 1, jF, kF, ex)] +
|
|
||||||
fh[idx_fh_F_ord2(iF + 1, jF, kF, ex)]
|
|
||||||
);
|
|
||||||
|
|
||||||
fy[p] = d2dy * (
|
|
||||||
-fh[idx_fh_F_ord2(iF, jF - 1, kF, ex)] +
|
|
||||||
fh[idx_fh_F_ord2(iF, jF + 1, kF, ex)]
|
|
||||||
);
|
|
||||||
|
|
||||||
fz[p] = d2dz * (
|
|
||||||
-fh[idx_fh_F_ord2(iF, jF, kF - 1, ex)] +
|
|
||||||
fh[idx_fh_F_ord2(iF, jF, kF + 1, ex)]
|
|
||||||
);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1115,6 +1115,147 @@ end subroutine d2dump
|
|||||||
!------------------------------------------------------------------------------
|
!------------------------------------------------------------------------------
|
||||||
! Lagrangian polynomial interpolation
|
! Lagrangian polynomial interpolation
|
||||||
!------------------------------------------------------------------------------
|
!------------------------------------------------------------------------------
|
||||||
|
#ifndef POLINT6_USE_BARYCENTRIC
|
||||||
|
#define POLINT6_USE_BARYCENTRIC 1
|
||||||
|
#endif
|
||||||
|
|
||||||
|
!DIR$ ATTRIBUTES FORCEINLINE :: polint6_neville
|
||||||
|
subroutine polint6_neville(xa, ya, x, y, dy)
|
||||||
|
implicit none
|
||||||
|
|
||||||
|
real*8, dimension(6), intent(in) :: xa, ya
|
||||||
|
real*8, intent(in) :: x
|
||||||
|
real*8, intent(out) :: y, dy
|
||||||
|
|
||||||
|
integer :: i, m, ns, n_m
|
||||||
|
real*8, dimension(6) :: c, d, ho
|
||||||
|
real*8 :: dif, dift, hp, h, den_val
|
||||||
|
|
||||||
|
c = ya
|
||||||
|
d = ya
|
||||||
|
ho = xa - x
|
||||||
|
|
||||||
|
ns = 1
|
||||||
|
dif = abs(x - xa(1))
|
||||||
|
|
||||||
|
do i = 2, 6
|
||||||
|
dift = abs(x - xa(i))
|
||||||
|
if (dift < dif) then
|
||||||
|
ns = i
|
||||||
|
dif = dift
|
||||||
|
end if
|
||||||
|
end do
|
||||||
|
|
||||||
|
y = ya(ns)
|
||||||
|
ns = ns - 1
|
||||||
|
|
||||||
|
do m = 1, 5
|
||||||
|
n_m = 6 - m
|
||||||
|
do i = 1, n_m
|
||||||
|
hp = ho(i)
|
||||||
|
h = ho(i+m)
|
||||||
|
den_val = hp - h
|
||||||
|
|
||||||
|
if (den_val == 0.0d0) then
|
||||||
|
write(*,*) 'failure in polint for point',x
|
||||||
|
write(*,*) 'with input points: ',xa
|
||||||
|
stop
|
||||||
|
end if
|
||||||
|
|
||||||
|
den_val = (c(i+1) - d(i)) / den_val
|
||||||
|
|
||||||
|
d(i) = h * den_val
|
||||||
|
c(i) = hp * den_val
|
||||||
|
end do
|
||||||
|
|
||||||
|
if (2 * ns < n_m) then
|
||||||
|
dy = c(ns + 1)
|
||||||
|
else
|
||||||
|
dy = d(ns)
|
||||||
|
ns = ns - 1
|
||||||
|
end if
|
||||||
|
y = y + dy
|
||||||
|
end do
|
||||||
|
|
||||||
|
return
|
||||||
|
end subroutine polint6_neville
|
||||||
|
|
||||||
|
!DIR$ ATTRIBUTES FORCEINLINE :: polint6_barycentric
|
||||||
|
subroutine polint6_barycentric(xa, ya, x, y, dy)
|
||||||
|
implicit none
|
||||||
|
|
||||||
|
real*8, dimension(6), intent(in) :: xa, ya
|
||||||
|
real*8, intent(in) :: x
|
||||||
|
real*8, intent(out) :: y, dy
|
||||||
|
|
||||||
|
integer :: i, j
|
||||||
|
logical :: is_uniform
|
||||||
|
real*8, dimension(6) :: lambda
|
||||||
|
real*8 :: dx, den_i, term, num, den, step, tol
|
||||||
|
real*8, parameter :: c_uniform(6) = (/ -1.d0, 5.d0, -10.d0, 10.d0, -5.d0, 1.d0 /)
|
||||||
|
|
||||||
|
do i = 1, 6
|
||||||
|
if (x == xa(i)) then
|
||||||
|
y = ya(i)
|
||||||
|
dy = 0.d0
|
||||||
|
return
|
||||||
|
end if
|
||||||
|
end do
|
||||||
|
|
||||||
|
step = xa(2) - xa(1)
|
||||||
|
is_uniform = (step /= 0.d0)
|
||||||
|
if (is_uniform) then
|
||||||
|
tol = 64.d0 * epsilon(1.d0) * max(1.d0, abs(step))
|
||||||
|
do i = 3, 6
|
||||||
|
if (abs((xa(i) - xa(i-1)) - step) > tol) then
|
||||||
|
is_uniform = .false.
|
||||||
|
exit
|
||||||
|
end if
|
||||||
|
end do
|
||||||
|
end if
|
||||||
|
|
||||||
|
if (is_uniform) then
|
||||||
|
num = 0.d0
|
||||||
|
den = 0.d0
|
||||||
|
do i = 1, 6
|
||||||
|
term = c_uniform(i) / (x - xa(i))
|
||||||
|
num = num + term * ya(i)
|
||||||
|
den = den + term
|
||||||
|
end do
|
||||||
|
y = num / den
|
||||||
|
dy = 0.d0
|
||||||
|
return
|
||||||
|
end if
|
||||||
|
|
||||||
|
do i = 1, 6
|
||||||
|
den_i = 1.d0
|
||||||
|
do j = 1, 6
|
||||||
|
if (j /= i) then
|
||||||
|
dx = xa(i) - xa(j)
|
||||||
|
if (dx == 0.0d0) then
|
||||||
|
write(*,*) 'failure in polint for point',x
|
||||||
|
write(*,*) 'with input points: ',xa
|
||||||
|
stop
|
||||||
|
end if
|
||||||
|
den_i = den_i * dx
|
||||||
|
end if
|
||||||
|
end do
|
||||||
|
lambda(i) = 1.d0 / den_i
|
||||||
|
end do
|
||||||
|
|
||||||
|
num = 0.d0
|
||||||
|
den = 0.d0
|
||||||
|
do i = 1, 6
|
||||||
|
term = lambda(i) / (x - xa(i))
|
||||||
|
num = num + term * ya(i)
|
||||||
|
den = den + term
|
||||||
|
end do
|
||||||
|
|
||||||
|
y = num / den
|
||||||
|
dy = 0.d0
|
||||||
|
|
||||||
|
return
|
||||||
|
end subroutine polint6_barycentric
|
||||||
|
|
||||||
!DIR$ ATTRIBUTES FORCEINLINE :: polint
|
!DIR$ ATTRIBUTES FORCEINLINE :: polint
|
||||||
subroutine polint(xa, ya, x, y, dy, ordn)
|
subroutine polint(xa, ya, x, y, dy, ordn)
|
||||||
@@ -1129,6 +1270,15 @@ end subroutine d2dump
|
|||||||
real*8, dimension(ordn) :: c, d, ho
|
real*8, dimension(ordn) :: c, d, ho
|
||||||
real*8 :: dif, dift, hp, h, den_val
|
real*8 :: dif, dift, hp, h, den_val
|
||||||
|
|
||||||
|
if (ordn == 6) then
|
||||||
|
#if POLINT6_USE_BARYCENTRIC
|
||||||
|
call polint6_barycentric(xa, ya, x, y, dy)
|
||||||
|
#else
|
||||||
|
call polint6_neville(xa, ya, x, y, dy)
|
||||||
|
#endif
|
||||||
|
return
|
||||||
|
end if
|
||||||
|
|
||||||
c = ya
|
c = ya
|
||||||
d = ya
|
d = ya
|
||||||
ho = xa - x
|
ho = xa - x
|
||||||
@@ -1178,6 +1328,41 @@ end subroutine d2dump
|
|||||||
return
|
return
|
||||||
end subroutine polint
|
end subroutine polint
|
||||||
!------------------------------------------------------------------------------
|
!------------------------------------------------------------------------------
|
||||||
|
! Compute Lagrange interpolation basis weights for one target point.
|
||||||
|
!------------------------------------------------------------------------------
|
||||||
|
!DIR$ ATTRIBUTES FORCEINLINE :: polint_lagrange_weights
|
||||||
|
subroutine polint_lagrange_weights(xa, x, w, ordn)
|
||||||
|
implicit none
|
||||||
|
|
||||||
|
integer, intent(in) :: ordn
|
||||||
|
real*8, dimension(1:ordn), intent(in) :: xa
|
||||||
|
real*8, intent(in) :: x
|
||||||
|
real*8, dimension(1:ordn), intent(out) :: w
|
||||||
|
|
||||||
|
integer :: i, j
|
||||||
|
real*8 :: num, den, dx
|
||||||
|
|
||||||
|
do i = 1, ordn
|
||||||
|
num = 1.d0
|
||||||
|
den = 1.d0
|
||||||
|
do j = 1, ordn
|
||||||
|
if (j /= i) then
|
||||||
|
dx = xa(i) - xa(j)
|
||||||
|
if (dx == 0.0d0) then
|
||||||
|
write(*,*) 'failure in polint for point',x
|
||||||
|
write(*,*) 'with input points: ',xa
|
||||||
|
stop
|
||||||
|
end if
|
||||||
|
num = num * (x - xa(j))
|
||||||
|
den = den * dx
|
||||||
|
end if
|
||||||
|
end do
|
||||||
|
w(i) = num / den
|
||||||
|
end do
|
||||||
|
|
||||||
|
return
|
||||||
|
end subroutine polint_lagrange_weights
|
||||||
|
!------------------------------------------------------------------------------
|
||||||
!
|
!
|
||||||
! interpolation in 2 dimensions, follow yx order
|
! interpolation in 2 dimensions, follow yx order
|
||||||
!
|
!
|
||||||
@@ -1248,19 +1433,26 @@ end subroutine d2dump
|
|||||||
end do
|
end do
|
||||||
call polint(x1a,ymtmp,x1,y,dy,ordn)
|
call polint(x1a,ymtmp,x1,y,dy,ordn)
|
||||||
#else
|
#else
|
||||||
integer :: j, k
|
integer :: i, j, k
|
||||||
real*8, dimension(ordn,ordn) :: yatmp
|
real*8, dimension(ordn) :: w1, w2
|
||||||
real*8, dimension(ordn) :: ymtmp
|
real*8, dimension(ordn) :: ymtmp
|
||||||
real*8 :: dy_temp
|
real*8 :: yx_sum, x_sum
|
||||||
|
|
||||||
do k=1,ordn
|
call polint_lagrange_weights(x1a, x1, w1, ordn)
|
||||||
do j=1,ordn
|
call polint_lagrange_weights(x2a, x2, w2, ordn)
|
||||||
call polint(x1a, ya(:,j,k), x1, yatmp(j,k), dy_temp, ordn)
|
|
||||||
|
do k = 1, ordn
|
||||||
|
yx_sum = 0.d0
|
||||||
|
do j = 1, ordn
|
||||||
|
x_sum = 0.d0
|
||||||
|
do i = 1, ordn
|
||||||
|
x_sum = x_sum + w1(i) * ya(i,j,k)
|
||||||
|
end do
|
||||||
|
yx_sum = yx_sum + w2(j) * x_sum
|
||||||
end do
|
end do
|
||||||
|
ymtmp(k) = yx_sum
|
||||||
end do
|
end do
|
||||||
do k=1,ordn
|
|
||||||
call polint(x2a, yatmp(:,k), x2, ymtmp(k), dy_temp, ordn)
|
|
||||||
end do
|
|
||||||
call polint(x3a, ymtmp, x3, y, dy, ordn)
|
call polint(x3a, ymtmp, x3, y, dy, ordn)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
@@ -1609,8 +1801,11 @@ deallocate(f_flat)
|
|||||||
! f=3/8*f_1 + 3/4*f_2 - 1/8*f_3
|
! f=3/8*f_1 + 3/4*f_2 - 1/8*f_3
|
||||||
|
|
||||||
real*8,parameter::C1=3.d0/8.d0,C2=3.d0/4.d0,C3=-1.d0/8.d0
|
real*8,parameter::C1=3.d0/8.d0,C2=3.d0/4.d0,C3=-1.d0/8.d0
|
||||||
|
integer :: i,j,k
|
||||||
|
|
||||||
fout = C1*f1+C2*f2+C3*f3
|
do concurrent (k=1:ext(3), j=1:ext(2), i=1:ext(1))
|
||||||
|
fout(i,j,k) = C1*f1(i,j,k)+C2*f2(i,j,k)+C3*f3(i,j,k)
|
||||||
|
end do
|
||||||
|
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|||||||
@@ -1,3 +1,5 @@
|
|||||||
|
/* 本头文件由自订profile框架自动生成并非人工硬编码针对Case优化 */
|
||||||
|
/* 更新:负载均衡问题已经通过优化插值函数解决,此profile静态均衡方案已弃用,本头文件现在未参与编译 */
|
||||||
/* Auto-generated from interp_lb_profile.bin — do not edit */
|
/* Auto-generated from interp_lb_profile.bin — do not edit */
|
||||||
#ifndef INTERP_LB_PROFILE_DATA_H
|
#ifndef INTERP_LB_PROFILE_DATA_H
|
||||||
#define INTERP_LB_PROFILE_DATA_H
|
#define INTERP_LB_PROFILE_DATA_H
|
||||||
|
|||||||
@@ -63,19 +63,28 @@ void kodis(const int ex[3],
|
|||||||
* C: k0=0..ex3-1, j0=0..ex2-1, i0=0..ex1-1
|
* C: k0=0..ex3-1, j0=0..ex2-1, i0=0..ex1-1
|
||||||
* 并定义 Fortran index: iF=i0+1, ...
|
* 并定义 Fortran index: iF=i0+1, ...
|
||||||
*/
|
*/
|
||||||
for (int k0 = 0; k0 < ex3; ++k0) {
|
// 收紧循环范围:只遍历满足 iF±3/jF±3/kF±3 条件的内部点
|
||||||
|
// iF-3 >= iminF => iF >= iminF+3 => i0 >= iminF+2 (因为 iF=i0+1)
|
||||||
|
// iF+3 <= imaxF => iF <= imaxF-3 => i0 <= imaxF-4
|
||||||
|
const int i0_lo = (iminF + 2 > 0) ? iminF + 2 : 0;
|
||||||
|
const int j0_lo = (jminF + 2 > 0) ? jminF + 2 : 0;
|
||||||
|
const int k0_lo = (kminF + 2 > 0) ? kminF + 2 : 0;
|
||||||
|
const int i0_hi = imaxF - 4; // inclusive
|
||||||
|
const int j0_hi = jmaxF - 4;
|
||||||
|
const int k0_hi = kmaxF - 4;
|
||||||
|
|
||||||
|
if (i0_lo > i0_hi || j0_lo > j0_hi || k0_lo > k0_hi) {
|
||||||
|
free(fh);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int k0 = k0_lo; k0 <= k0_hi; ++k0) {
|
||||||
const int kF = k0 + 1;
|
const int kF = k0 + 1;
|
||||||
for (int j0 = 0; j0 < ex2; ++j0) {
|
for (int j0 = j0_lo; j0 <= j0_hi; ++j0) {
|
||||||
const int jF = j0 + 1;
|
const int jF = j0 + 1;
|
||||||
for (int i0 = 0; i0 < ex1; ++i0) {
|
for (int i0 = i0_lo; i0 <= i0_hi; ++i0) {
|
||||||
const int iF = i0 + 1;
|
const int iF = i0 + 1;
|
||||||
|
|
||||||
// Fortran if 条件:
|
|
||||||
// i-3 >= imin .and. i+3 <= imax 等(都是 Fortran 索引)
|
|
||||||
if ((iF - 3) >= iminF && (iF + 3) <= imaxF &&
|
|
||||||
(jF - 3) >= jminF && (jF + 3) <= jmaxF &&
|
|
||||||
(kF - 3) >= kminF && (kF + 3) <= kmaxF)
|
|
||||||
{
|
|
||||||
const size_t p = idx_ex(i0, j0, k0, ex);
|
const size_t p = idx_ex(i0, j0, k0, ex);
|
||||||
|
|
||||||
// 三个方向各一份同型的 7 点组合(实际上是对称的 6th-order dissipation/filter 核)
|
// 三个方向各一份同型的 7 点组合(实际上是对称的 6th-order dissipation/filter 核)
|
||||||
@@ -100,7 +109,6 @@ void kodis(const int ex[3],
|
|||||||
// Fortran:
|
// Fortran:
|
||||||
// f_rhs(i,j,k) = f_rhs(i,j,k) + eps/cof*(Dx_term + Dy_term + Dz_term)
|
// f_rhs(i,j,k) = f_rhs(i,j,k) + eps/cof*(Dx_term + Dy_term + Dz_term)
|
||||||
f_rhs[p] += (eps / cof) * (Dx_term + Dy_term + Dz_term);
|
f_rhs[p] += (eps / cof) * (Dx_term + Dy_term + Dz_term);
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
248
AMSS_NCKU_source/lopsided_kodis_c.C
Normal file
248
AMSS_NCKU_source/lopsided_kodis_c.C
Normal file
@@ -0,0 +1,248 @@
|
|||||||
|
#include "tool.h"
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Combined advection (lopsided) + KO dissipation (kodis).
|
||||||
|
* Uses one shared symmetry_bd buffer per call.
|
||||||
|
*/
|
||||||
|
void lopsided_kodis(const int ex[3],
|
||||||
|
const double *X, const double *Y, const double *Z,
|
||||||
|
const double *f, double *f_rhs,
|
||||||
|
const double *Sfx, const double *Sfy, const double *Sfz,
|
||||||
|
int Symmetry, const double SoA[3], double eps)
|
||||||
|
{
|
||||||
|
const double ZEO = 0.0, ONE = 1.0, F3 = 3.0;
|
||||||
|
const double F6 = 6.0, F18 = 18.0;
|
||||||
|
const double F12 = 12.0, F10 = 10.0, EIT = 8.0;
|
||||||
|
const double SIX = 6.0, FIT = 15.0, TWT = 20.0;
|
||||||
|
const double cof = 64.0; // 2^6
|
||||||
|
|
||||||
|
const int NO_SYMM = 0, EQ_SYMM = 1;
|
||||||
|
|
||||||
|
const int ex1 = ex[0], ex2 = ex[1], ex3 = ex[2];
|
||||||
|
|
||||||
|
const double dX = X[1] - X[0];
|
||||||
|
const double dY = Y[1] - Y[0];
|
||||||
|
const double dZ = Z[1] - Z[0];
|
||||||
|
|
||||||
|
const double d12dx = ONE / F12 / dX;
|
||||||
|
const double d12dy = ONE / F12 / dY;
|
||||||
|
const double d12dz = ONE / F12 / dZ;
|
||||||
|
|
||||||
|
const int imaxF = ex1;
|
||||||
|
const int jmaxF = ex2;
|
||||||
|
const int kmaxF = ex3;
|
||||||
|
|
||||||
|
int iminF = 1, jminF = 1, kminF = 1;
|
||||||
|
if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -2;
|
||||||
|
if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -2;
|
||||||
|
if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -2;
|
||||||
|
|
||||||
|
// fh for Fortran-style domain (-2:ex1,-2:ex2,-2:ex3)
|
||||||
|
const size_t nx = (size_t)ex1 + 3;
|
||||||
|
const size_t ny = (size_t)ex2 + 3;
|
||||||
|
const size_t nz = (size_t)ex3 + 3;
|
||||||
|
const size_t fh_size = nx * ny * nz;
|
||||||
|
|
||||||
|
double *fh = (double*)malloc(fh_size * sizeof(double));
|
||||||
|
if (!fh) return;
|
||||||
|
|
||||||
|
symmetry_bd(3, ex, f, fh, SoA);
|
||||||
|
|
||||||
|
// Advection (same stencil logic as lopsided_c.C)
|
||||||
|
for (int k0 = 0; k0 <= ex3 - 2; ++k0) {
|
||||||
|
const int kF = k0 + 1;
|
||||||
|
for (int j0 = 0; j0 <= ex2 - 2; ++j0) {
|
||||||
|
const int jF = j0 + 1;
|
||||||
|
for (int i0 = 0; i0 <= ex1 - 2; ++i0) {
|
||||||
|
const int iF = i0 + 1;
|
||||||
|
const size_t p = idx_ex(i0, j0, k0, ex);
|
||||||
|
|
||||||
|
const double sfx = Sfx[p];
|
||||||
|
if (sfx > ZEO) {
|
||||||
|
if (i0 <= ex1 - 4) {
|
||||||
|
f_rhs[p] += sfx * d12dx *
|
||||||
|
(-F3 * fh[idx_fh_F(iF - 1, jF, kF, ex)]
|
||||||
|
-F10 * fh[idx_fh_F(iF , jF, kF, ex)]
|
||||||
|
+F18 * fh[idx_fh_F(iF + 1, jF, kF, ex)]
|
||||||
|
-F6 * fh[idx_fh_F(iF + 2, jF, kF, ex)]
|
||||||
|
+ fh[idx_fh_F(iF + 3, jF, kF, ex)]);
|
||||||
|
} else if (i0 <= ex1 - 3) {
|
||||||
|
f_rhs[p] += sfx * d12dx *
|
||||||
|
( fh[idx_fh_F(iF - 2, jF, kF, ex)]
|
||||||
|
-EIT * fh[idx_fh_F(iF - 1, jF, kF, ex)]
|
||||||
|
+EIT * fh[idx_fh_F(iF + 1, jF, kF, ex)]
|
||||||
|
- fh[idx_fh_F(iF + 2, jF, kF, ex)]);
|
||||||
|
} else if (i0 <= ex1 - 2) {
|
||||||
|
f_rhs[p] -= sfx * d12dx *
|
||||||
|
(-F3 * fh[idx_fh_F(iF + 1, jF, kF, ex)]
|
||||||
|
-F10 * fh[idx_fh_F(iF , jF, kF, ex)]
|
||||||
|
+F18 * fh[idx_fh_F(iF - 1, jF, kF, ex)]
|
||||||
|
-F6 * fh[idx_fh_F(iF - 2, jF, kF, ex)]
|
||||||
|
+ fh[idx_fh_F(iF - 3, jF, kF, ex)]);
|
||||||
|
}
|
||||||
|
} else if (sfx < ZEO) {
|
||||||
|
if ((i0 - 2) >= iminF) {
|
||||||
|
f_rhs[p] -= sfx * d12dx *
|
||||||
|
(-F3 * fh[idx_fh_F(iF + 1, jF, kF, ex)]
|
||||||
|
-F10 * fh[idx_fh_F(iF , jF, kF, ex)]
|
||||||
|
+F18 * fh[idx_fh_F(iF - 1, jF, kF, ex)]
|
||||||
|
-F6 * fh[idx_fh_F(iF - 2, jF, kF, ex)]
|
||||||
|
+ fh[idx_fh_F(iF - 3, jF, kF, ex)]);
|
||||||
|
} else if ((i0 - 1) >= iminF) {
|
||||||
|
f_rhs[p] += sfx * d12dx *
|
||||||
|
( fh[idx_fh_F(iF - 2, jF, kF, ex)]
|
||||||
|
-EIT * fh[idx_fh_F(iF - 1, jF, kF, ex)]
|
||||||
|
+EIT * fh[idx_fh_F(iF + 1, jF, kF, ex)]
|
||||||
|
- fh[idx_fh_F(iF + 2, jF, kF, ex)]);
|
||||||
|
} else if (i0 >= iminF) {
|
||||||
|
f_rhs[p] += sfx * d12dx *
|
||||||
|
(-F3 * fh[idx_fh_F(iF - 1, jF, kF, ex)]
|
||||||
|
-F10 * fh[idx_fh_F(iF , jF, kF, ex)]
|
||||||
|
+F18 * fh[idx_fh_F(iF + 1, jF, kF, ex)]
|
||||||
|
-F6 * fh[idx_fh_F(iF + 2, jF, kF, ex)]
|
||||||
|
+ fh[idx_fh_F(iF + 3, jF, kF, ex)]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const double sfy = Sfy[p];
|
||||||
|
if (sfy > ZEO) {
|
||||||
|
if (j0 <= ex2 - 4) {
|
||||||
|
f_rhs[p] += sfy * d12dy *
|
||||||
|
(-F3 * fh[idx_fh_F(iF, jF - 1, kF, ex)]
|
||||||
|
-F10 * fh[idx_fh_F(iF, jF , kF, ex)]
|
||||||
|
+F18 * fh[idx_fh_F(iF, jF + 1, kF, ex)]
|
||||||
|
-F6 * fh[idx_fh_F(iF, jF + 2, kF, ex)]
|
||||||
|
+ fh[idx_fh_F(iF, jF + 3, kF, ex)]);
|
||||||
|
} else if (j0 <= ex2 - 3) {
|
||||||
|
f_rhs[p] += sfy * d12dy *
|
||||||
|
( fh[idx_fh_F(iF, jF - 2, kF, ex)]
|
||||||
|
-EIT * fh[idx_fh_F(iF, jF - 1, kF, ex)]
|
||||||
|
+EIT * fh[idx_fh_F(iF, jF + 1, kF, ex)]
|
||||||
|
- fh[idx_fh_F(iF, jF + 2, kF, ex)]);
|
||||||
|
} else if (j0 <= ex2 - 2) {
|
||||||
|
f_rhs[p] -= sfy * d12dy *
|
||||||
|
(-F3 * fh[idx_fh_F(iF, jF + 1, kF, ex)]
|
||||||
|
-F10 * fh[idx_fh_F(iF, jF , kF, ex)]
|
||||||
|
+F18 * fh[idx_fh_F(iF, jF - 1, kF, ex)]
|
||||||
|
-F6 * fh[idx_fh_F(iF, jF - 2, kF, ex)]
|
||||||
|
+ fh[idx_fh_F(iF, jF - 3, kF, ex)]);
|
||||||
|
}
|
||||||
|
} else if (sfy < ZEO) {
|
||||||
|
if ((j0 - 2) >= jminF) {
|
||||||
|
f_rhs[p] -= sfy * d12dy *
|
||||||
|
(-F3 * fh[idx_fh_F(iF, jF + 1, kF, ex)]
|
||||||
|
-F10 * fh[idx_fh_F(iF, jF , kF, ex)]
|
||||||
|
+F18 * fh[idx_fh_F(iF, jF - 1, kF, ex)]
|
||||||
|
-F6 * fh[idx_fh_F(iF, jF - 2, kF, ex)]
|
||||||
|
+ fh[idx_fh_F(iF, jF - 3, kF, ex)]);
|
||||||
|
} else if ((j0 - 1) >= jminF) {
|
||||||
|
f_rhs[p] += sfy * d12dy *
|
||||||
|
( fh[idx_fh_F(iF, jF - 2, kF, ex)]
|
||||||
|
-EIT * fh[idx_fh_F(iF, jF - 1, kF, ex)]
|
||||||
|
+EIT * fh[idx_fh_F(iF, jF + 1, kF, ex)]
|
||||||
|
- fh[idx_fh_F(iF, jF + 2, kF, ex)]);
|
||||||
|
} else if (j0 >= jminF) {
|
||||||
|
f_rhs[p] += sfy * d12dy *
|
||||||
|
(-F3 * fh[idx_fh_F(iF, jF - 1, kF, ex)]
|
||||||
|
-F10 * fh[idx_fh_F(iF, jF , kF, ex)]
|
||||||
|
+F18 * fh[idx_fh_F(iF, jF + 1, kF, ex)]
|
||||||
|
-F6 * fh[idx_fh_F(iF, jF + 2, kF, ex)]
|
||||||
|
+ fh[idx_fh_F(iF, jF + 3, kF, ex)]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const double sfz = Sfz[p];
|
||||||
|
if (sfz > ZEO) {
|
||||||
|
if (k0 <= ex3 - 4) {
|
||||||
|
f_rhs[p] += sfz * d12dz *
|
||||||
|
(-F3 * fh[idx_fh_F(iF, jF, kF - 1, ex)]
|
||||||
|
-F10 * fh[idx_fh_F(iF, jF, kF , ex)]
|
||||||
|
+F18 * fh[idx_fh_F(iF, jF, kF + 1, ex)]
|
||||||
|
-F6 * fh[idx_fh_F(iF, jF, kF + 2, ex)]
|
||||||
|
+ fh[idx_fh_F(iF, jF, kF + 3, ex)]);
|
||||||
|
} else if (k0 <= ex3 - 3) {
|
||||||
|
f_rhs[p] += sfz * d12dz *
|
||||||
|
( fh[idx_fh_F(iF, jF, kF - 2, ex)]
|
||||||
|
-EIT * fh[idx_fh_F(iF, jF, kF - 1, ex)]
|
||||||
|
+EIT * fh[idx_fh_F(iF, jF, kF + 1, ex)]
|
||||||
|
- fh[idx_fh_F(iF, jF, kF + 2, ex)]);
|
||||||
|
} else if (k0 <= ex3 - 2) {
|
||||||
|
f_rhs[p] -= sfz * d12dz *
|
||||||
|
(-F3 * fh[idx_fh_F(iF, jF, kF + 1, ex)]
|
||||||
|
-F10 * fh[idx_fh_F(iF, jF, kF , ex)]
|
||||||
|
+F18 * fh[idx_fh_F(iF, jF, kF - 1, ex)]
|
||||||
|
-F6 * fh[idx_fh_F(iF, jF, kF - 2, ex)]
|
||||||
|
+ fh[idx_fh_F(iF, jF, kF - 3, ex)]);
|
||||||
|
}
|
||||||
|
} else if (sfz < ZEO) {
|
||||||
|
if ((k0 - 2) >= kminF) {
|
||||||
|
f_rhs[p] -= sfz * d12dz *
|
||||||
|
(-F3 * fh[idx_fh_F(iF, jF, kF + 1, ex)]
|
||||||
|
-F10 * fh[idx_fh_F(iF, jF, kF , ex)]
|
||||||
|
+F18 * fh[idx_fh_F(iF, jF, kF - 1, ex)]
|
||||||
|
-F6 * fh[idx_fh_F(iF, jF, kF - 2, ex)]
|
||||||
|
+ fh[idx_fh_F(iF, jF, kF - 3, ex)]);
|
||||||
|
} else if ((k0 - 1) >= kminF) {
|
||||||
|
f_rhs[p] += sfz * d12dz *
|
||||||
|
( fh[idx_fh_F(iF, jF, kF - 2, ex)]
|
||||||
|
-EIT * fh[idx_fh_F(iF, jF, kF - 1, ex)]
|
||||||
|
+EIT * fh[idx_fh_F(iF, jF, kF + 1, ex)]
|
||||||
|
- fh[idx_fh_F(iF, jF, kF + 2, ex)]);
|
||||||
|
} else if (k0 >= kminF) {
|
||||||
|
f_rhs[p] += sfz * d12dz *
|
||||||
|
(-F3 * fh[idx_fh_F(iF, jF, kF - 1, ex)]
|
||||||
|
-F10 * fh[idx_fh_F(iF, jF, kF , ex)]
|
||||||
|
+F18 * fh[idx_fh_F(iF, jF, kF + 1, ex)]
|
||||||
|
-F6 * fh[idx_fh_F(iF, jF, kF + 2, ex)]
|
||||||
|
+ fh[idx_fh_F(iF, jF, kF + 3, ex)]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// KO dissipation (same domain restriction as kodiss_c.C)
|
||||||
|
if (eps > ZEO) {
|
||||||
|
const int i0_lo = (iminF + 2 > 0) ? iminF + 2 : 0;
|
||||||
|
const int j0_lo = (jminF + 2 > 0) ? jminF + 2 : 0;
|
||||||
|
const int k0_lo = (kminF + 2 > 0) ? kminF + 2 : 0;
|
||||||
|
const int i0_hi = imaxF - 4; // inclusive
|
||||||
|
const int j0_hi = jmaxF - 4;
|
||||||
|
const int k0_hi = kmaxF - 4;
|
||||||
|
|
||||||
|
if (!(i0_lo > i0_hi || j0_lo > j0_hi || k0_lo > k0_hi)) {
|
||||||
|
for (int k0 = k0_lo; k0 <= k0_hi; ++k0) {
|
||||||
|
const int kF = k0 + 1;
|
||||||
|
for (int j0 = j0_lo; j0 <= j0_hi; ++j0) {
|
||||||
|
const int jF = j0 + 1;
|
||||||
|
for (int i0 = i0_lo; i0 <= i0_hi; ++i0) {
|
||||||
|
const int iF = i0 + 1;
|
||||||
|
const size_t p = idx_ex(i0, j0, k0, ex);
|
||||||
|
|
||||||
|
const double Dx_term =
|
||||||
|
((fh[idx_fh_F(iF - 3, jF, kF, ex)] + fh[idx_fh_F(iF + 3, jF, kF, ex)]) -
|
||||||
|
SIX * (fh[idx_fh_F(iF - 2, jF, kF, ex)] + fh[idx_fh_F(iF + 2, jF, kF, ex)]) +
|
||||||
|
FIT * (fh[idx_fh_F(iF - 1, jF, kF, ex)] + fh[idx_fh_F(iF + 1, jF, kF, ex)]) -
|
||||||
|
TWT * fh[idx_fh_F(iF, jF, kF, ex)]) / dX;
|
||||||
|
|
||||||
|
const double Dy_term =
|
||||||
|
((fh[idx_fh_F(iF, jF - 3, kF, ex)] + fh[idx_fh_F(iF, jF + 3, kF, ex)]) -
|
||||||
|
SIX * (fh[idx_fh_F(iF, jF - 2, kF, ex)] + fh[idx_fh_F(iF, jF + 2, kF, ex)]) +
|
||||||
|
FIT * (fh[idx_fh_F(iF, jF - 1, kF, ex)] + fh[idx_fh_F(iF, jF + 1, kF, ex)]) -
|
||||||
|
TWT * fh[idx_fh_F(iF, jF, kF, ex)]) / dY;
|
||||||
|
|
||||||
|
const double Dz_term =
|
||||||
|
((fh[idx_fh_F(iF, jF, kF - 3, ex)] + fh[idx_fh_F(iF, jF, kF + 3, ex)]) -
|
||||||
|
SIX * (fh[idx_fh_F(iF, jF, kF - 2, ex)] + fh[idx_fh_F(iF, jF, kF + 2, ex)]) +
|
||||||
|
FIT * (fh[idx_fh_F(iF, jF, kF - 1, ex)] + fh[idx_fh_F(iF, jF, kF + 1, ex)]) -
|
||||||
|
TWT * fh[idx_fh_F(iF, jF, kF, ex)]) / dZ;
|
||||||
|
|
||||||
|
f_rhs[p] += (eps / cof) * (Dx_term + Dy_term + Dz_term);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
free(fh);
|
||||||
|
}
|
||||||
@@ -2,6 +2,12 @@
|
|||||||
|
|
||||||
include makefile.inc
|
include makefile.inc
|
||||||
|
|
||||||
|
## polint(ordn=6) kernel selector:
|
||||||
|
## 1 (default): barycentric fast path
|
||||||
|
## 0 : fallback to Neville path
|
||||||
|
POLINT6_USE_BARY ?= 1
|
||||||
|
POLINT6_FLAG = -DPOLINT6_USE_BARYCENTRIC=$(POLINT6_USE_BARY)
|
||||||
|
|
||||||
## ABE build flags selected by PGO_MODE (set in makefile.inc, default: opt)
|
## ABE build flags selected by PGO_MODE (set in makefile.inc, default: opt)
|
||||||
## make -> opt (PGO-guided, maximum performance)
|
## make -> opt (PGO-guided, maximum performance)
|
||||||
## make PGO_MODE=instrument -> instrument (Phase 1: collect fresh profile data)
|
## make PGO_MODE=instrument -> instrument (Phase 1: collect fresh profile data)
|
||||||
@@ -12,15 +18,17 @@ ifeq ($(PGO_MODE),instrument)
|
|||||||
CXXAPPFLAGS = -O3 -xHost -fma -fprofile-instr-generate -ipo \
|
CXXAPPFLAGS = -O3 -xHost -fma -fprofile-instr-generate -ipo \
|
||||||
-Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS)
|
-Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS)
|
||||||
f90appflags = -O3 -xHost -fma -fprofile-instr-generate -ipo \
|
f90appflags = -O3 -xHost -fma -fprofile-instr-generate -ipo \
|
||||||
-align array64byte -fpp -I${MKLROOT}/include
|
-align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG)
|
||||||
else
|
else
|
||||||
## opt (default): maximum performance with PGO profile data
|
## opt (default): maximum performance with PGO profile data -fprofile-instr-use=$(PROFDATA) \
|
||||||
|
## PGO has been turned off, now tested and found to be negative optimization
|
||||||
|
## INTERP_LB_FLAGS has been turned off too, now tested and found to be negative optimization
|
||||||
|
|
||||||
|
|
||||||
CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
|
CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
|
||||||
-fprofile-instr-use=$(PROFDATA) \
|
|
||||||
-Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS)
|
-Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS)
|
||||||
f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \
|
f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \
|
||||||
-fprofile-instr-use=$(PROFDATA) \
|
-align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG)
|
||||||
-align array64byte -fpp -I${MKLROOT}/include
|
|
||||||
endif
|
endif
|
||||||
|
|
||||||
.SUFFIXES: .o .f90 .C .for .cu
|
.SUFFIXES: .o .f90 .C .for .cu
|
||||||
@@ -53,6 +61,9 @@ kodiss_c.o: kodiss_c.C
|
|||||||
lopsided_c.o: lopsided_c.C
|
lopsided_c.o: lopsided_c.C
|
||||||
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||||
|
|
||||||
|
lopsided_kodis_c.o: lopsided_kodis_c.C
|
||||||
|
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||||
|
|
||||||
interp_lb_profile.o: interp_lb_profile.C interp_lb_profile.h
|
interp_lb_profile.o: interp_lb_profile.C interp_lb_profile.h
|
||||||
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||||
|
|
||||||
@@ -76,7 +87,15 @@ ifeq ($(USE_CXX_KERNELS),0)
|
|||||||
CFILES =
|
CFILES =
|
||||||
else
|
else
|
||||||
# C++ mode (default): C rewrite of bssn_rhs and helper kernels
|
# C++ mode (default): C rewrite of bssn_rhs and helper kernels
|
||||||
CFILES = bssn_rhs_c.o fderivs_c.o fdderivs_c.o kodiss_c.o lopsided_c.o
|
CFILES = bssn_rhs_c.o fderivs_c.o fdderivs_c.o kodiss_c.o lopsided_c.o lopsided_kodis_c.o
|
||||||
|
endif
|
||||||
|
|
||||||
|
## RK4 kernel switch (independent from USE_CXX_KERNELS)
|
||||||
|
ifeq ($(USE_CXX_RK4),1)
|
||||||
|
CFILES += rungekutta4_rout_c.o
|
||||||
|
RK4_F90_OBJ =
|
||||||
|
else
|
||||||
|
RK4_F90_OBJ = rungekutta4_rout.o
|
||||||
endif
|
endif
|
||||||
|
|
||||||
C++FILES = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\
|
C++FILES = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\
|
||||||
@@ -96,7 +115,7 @@ C++FILES_GPU = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o
|
|||||||
|
|
||||||
F90FILES_BASE = enforce_algebra.o fmisc.o initial_puncture.o prolongrestrict.o\
|
F90FILES_BASE = enforce_algebra.o fmisc.o initial_puncture.o prolongrestrict.o\
|
||||||
prolongrestrict_cell.o prolongrestrict_vertex.o\
|
prolongrestrict_cell.o prolongrestrict_vertex.o\
|
||||||
rungekutta4_rout.o diff_new.o kodiss.o kodiss_sh.o\
|
$(RK4_F90_OBJ) diff_new.o kodiss.o kodiss_sh.o\
|
||||||
lopsidediff.o sommerfeld_rout.o getnp4.o diff_new_sh.o\
|
lopsidediff.o sommerfeld_rout.o getnp4.o diff_new_sh.o\
|
||||||
shellfunctions.o bssn_rhs_ss.o Set_Rho_ADM.o\
|
shellfunctions.o bssn_rhs_ss.o Set_Rho_ADM.o\
|
||||||
getnp4EScalar.o bssnEScalar_rhs.o bssn_constraint.o ricci_gamma.o\
|
getnp4EScalar.o bssnEScalar_rhs.o bssn_constraint.o ricci_gamma.o\
|
||||||
|
|||||||
@@ -10,6 +10,20 @@ filein = -I/usr/include/ -I${MKLROOT}/include
|
|||||||
## Added -lifcore for Intel Fortran runtime and -limf for Intel math library
|
## Added -lifcore for Intel Fortran runtime and -limf for Intel math library
|
||||||
LDLIBS = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore -limf -lpthread -lm -ldl -liomp5
|
LDLIBS = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore -limf -lpthread -lm -ldl -liomp5
|
||||||
|
|
||||||
|
## Memory allocator switch
|
||||||
|
## 1 (default) : link Intel oneTBB allocator (libtbbmalloc)
|
||||||
|
## 0 : use system default allocator (ptmalloc)
|
||||||
|
USE_TBBMALLOC ?= 1
|
||||||
|
TBBMALLOC_SO ?= /home/intel/oneapi/2025.3/lib/libtbbmalloc.so
|
||||||
|
ifneq ($(wildcard $(TBBMALLOC_SO)),)
|
||||||
|
TBBMALLOC_LIBS = -Wl,--no-as-needed $(TBBMALLOC_SO) -Wl,--as-needed
|
||||||
|
else
|
||||||
|
TBBMALLOC_LIBS = -Wl,--no-as-needed -ltbbmalloc -Wl,--as-needed
|
||||||
|
endif
|
||||||
|
ifeq ($(USE_TBBMALLOC),1)
|
||||||
|
LDLIBS := $(TBBMALLOC_LIBS) $(LDLIBS)
|
||||||
|
endif
|
||||||
|
|
||||||
## PGO build mode switch (ABE only; TwoPunctureABE always uses opt flags)
|
## PGO build mode switch (ABE only; TwoPunctureABE always uses opt flags)
|
||||||
## opt : (default) maximum performance with PGO profile-guided optimization
|
## opt : (default) maximum performance with PGO profile-guided optimization
|
||||||
## instrument : PGO Phase 1 instrumentation to collect fresh profile data
|
## instrument : PGO Phase 1 instrumentation to collect fresh profile data
|
||||||
@@ -33,6 +47,12 @@ endif
|
|||||||
## 1 (default) : use C++ rewrite of bssn_rhs and helper kernels (faster)
|
## 1 (default) : use C++ rewrite of bssn_rhs and helper kernels (faster)
|
||||||
## 0 : fall back to original Fortran kernels
|
## 0 : fall back to original Fortran kernels
|
||||||
USE_CXX_KERNELS ?= 1
|
USE_CXX_KERNELS ?= 1
|
||||||
|
|
||||||
|
## RK4 kernel implementation switch
|
||||||
|
## 1 (default) : use C/C++ rewrite of rungekutta4_rout (for optimization experiments)
|
||||||
|
## 0 : use original Fortran rungekutta4_rout.o
|
||||||
|
USE_CXX_RK4 ?= 1
|
||||||
|
|
||||||
f90 = ifx
|
f90 = ifx
|
||||||
f77 = ifx
|
f77 = ifx
|
||||||
CXX = icpx
|
CXX = icpx
|
||||||
|
|||||||
@@ -1934,18 +1934,32 @@
|
|||||||
! when if=1 -> ic=0, this is different to vertex center grid
|
! when if=1 -> ic=0, this is different to vertex center grid
|
||||||
real*8, dimension(-2:extc(1),-2:extc(2),-2:extc(3)) :: funcc
|
real*8, dimension(-2:extc(1),-2:extc(2),-2:extc(3)) :: funcc
|
||||||
integer,dimension(3) :: cxI
|
integer,dimension(3) :: cxI
|
||||||
integer :: i,j,k,ii,jj,kk
|
integer :: i,j,k,ii,jj,kk,px,py,pz
|
||||||
real*8, dimension(6,6) :: tmp2
|
real*8, dimension(6,6) :: tmp2
|
||||||
real*8, dimension(6) :: tmp1
|
real*8, dimension(6) :: tmp1
|
||||||
|
integer, dimension(extf(1)) :: cix
|
||||||
|
integer, dimension(extf(2)) :: ciy
|
||||||
|
integer, dimension(extf(3)) :: ciz
|
||||||
|
integer, dimension(extf(1)) :: pix
|
||||||
|
integer, dimension(extf(2)) :: piy
|
||||||
|
integer, dimension(extf(3)) :: piz
|
||||||
|
|
||||||
real*8, parameter :: C1=7.7d1/8.192d3,C2=-6.93d2/8.192d3,C3=3.465d3/4.096d3
|
real*8, parameter :: C1=7.7d1/8.192d3,C2=-6.93d2/8.192d3,C3=3.465d3/4.096d3
|
||||||
real*8, parameter :: C6=6.3d1/8.192d3,C5=-4.95d2/8.192d3,C4=1.155d3/4.096d3
|
real*8, parameter :: C6=6.3d1/8.192d3,C5=-4.95d2/8.192d3,C4=1.155d3/4.096d3
|
||||||
|
real*8, dimension(6,2), parameter :: WC = reshape((/&
|
||||||
|
C1,C2,C3,C4,C5,C6,&
|
||||||
|
C6,C5,C4,C3,C2,C1/), (/6,2/))
|
||||||
|
|
||||||
integer::imini,imaxi,jmini,jmaxi,kmini,kmaxi
|
integer::imini,imaxi,jmini,jmaxi,kmini,kmaxi
|
||||||
integer::imino,imaxo,jmino,jmaxo,kmino,kmaxo
|
integer::imino,imaxo,jmino,jmaxo,kmino,kmaxo
|
||||||
|
integer::maxcx,maxcy,maxcz
|
||||||
|
|
||||||
real*8,dimension(3) :: CD,FD
|
real*8,dimension(3) :: CD,FD
|
||||||
|
real*8 :: tmp_yz(extc(1), 6) ! 存储整条 X 线上 6 个 Y 轴偏置的 Z 向插值结果
|
||||||
|
real*8 :: tmp_xyz_line(extc(1)) ! 存储整条 X 线上完成 Y 向融合后的结果
|
||||||
|
real*8 :: v1, v2, v3, v4, v5, v6
|
||||||
|
integer :: ic, jc, kc, ix_offset,ix,iy,iz
|
||||||
|
real*8 :: res_line
|
||||||
if(wei.ne.3)then
|
if(wei.ne.3)then
|
||||||
write(*,*)"prolongrestrict.f90::prolong3: this routine only surport 3 dimension"
|
write(*,*)"prolongrestrict.f90::prolong3: this routine only surport 3 dimension"
|
||||||
write(*,*)"dim = ",wei
|
write(*,*)"dim = ",wei
|
||||||
@@ -2020,144 +2034,117 @@
|
|||||||
return
|
return
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
do i = imino,imaxo
|
||||||
|
ii = i + lbf(1) - 1
|
||||||
|
cix(i) = ii/2 - lbc(1) + 1
|
||||||
|
if(ii/2*2 == ii)then
|
||||||
|
pix(i) = 1
|
||||||
|
else
|
||||||
|
pix(i) = 2
|
||||||
|
endif
|
||||||
|
enddo
|
||||||
|
do j = jmino,jmaxo
|
||||||
|
jj = j + lbf(2) - 1
|
||||||
|
ciy(j) = jj/2 - lbc(2) + 1
|
||||||
|
if(jj/2*2 == jj)then
|
||||||
|
piy(j) = 1
|
||||||
|
else
|
||||||
|
piy(j) = 2
|
||||||
|
endif
|
||||||
|
enddo
|
||||||
|
do k = kmino,kmaxo
|
||||||
|
kk = k + lbf(3) - 1
|
||||||
|
ciz(k) = kk/2 - lbc(3) + 1
|
||||||
|
if(kk/2*2 == kk)then
|
||||||
|
piz(k) = 1
|
||||||
|
else
|
||||||
|
piz(k) = 2
|
||||||
|
endif
|
||||||
|
enddo
|
||||||
|
|
||||||
|
maxcx = maxval(cix(imino:imaxo))
|
||||||
|
maxcy = maxval(ciy(jmino:jmaxo))
|
||||||
|
maxcz = maxval(ciz(kmino:kmaxo))
|
||||||
|
if(maxcx+3 > extc(1) .or. maxcy+3 > extc(2) .or. maxcz+3 > extc(3))then
|
||||||
|
write(*,*)"error in prolong"
|
||||||
|
return
|
||||||
|
endif
|
||||||
|
|
||||||
call symmetry_bd(3,extc,func,funcc,SoA)
|
call symmetry_bd(3,extc,func,funcc,SoA)
|
||||||
|
|
||||||
!~~~~~~> prolongation start...
|
!~~~~~~> prolongation start...
|
||||||
do k = kmino,kmaxo
|
do k = kmino, kmaxo
|
||||||
do j = jmino,jmaxo
|
pz = piz(k)
|
||||||
do i = imino,imaxo
|
kc = ciz(k)
|
||||||
cxI(1) = i
|
|
||||||
cxI(2) = j
|
do j = jmino, jmaxo
|
||||||
cxI(3) = k
|
py = piy(j)
|
||||||
! change to coarse level reference
|
jc = ciy(j)
|
||||||
!|---*--- ---*--- ---*--- ---*--- ---*--- ---*--- ---*--- ---*---|
|
|
||||||
!|=======x===============x===============x===============x=======|
|
! --- 步骤 1 & 2 融合:分段处理 X 轴,提升 Cache 命中率 ---
|
||||||
cxI = (cxI+lbf-1)/2
|
! 我们将 ii 循环逻辑重组,减少对 funcc 的跨行重复访问
|
||||||
! change to array index
|
do ii = 1, extc(1)
|
||||||
cxI = cxI - lbc + 1
|
! 1. 先做 Z 方向的 6 条线插值(针对当前的 ii 和当前的 6 个 iy)
|
||||||
|
! 我们直接在这里把 Y 方向的加权也做了,省去 tmp_yz 数组
|
||||||
|
! 这样 funcc 的数据读进来后立即完成所有维度的贡献,不再写回内存
|
||||||
|
|
||||||
|
res_line = 0.0d0
|
||||||
|
do jj = 1, 6
|
||||||
|
iy = jc - 3 + jj
|
||||||
|
! 这一行代码是核心:一次性完成 Z 插值并加上 Y 的权重
|
||||||
|
! 编译器会把 WC(jj, py) 存在寄存器里
|
||||||
|
res_line = res_line + WC(jj, py) * ( &
|
||||||
|
WC(1, pz) * funcc(ii, iy, kc-2) + &
|
||||||
|
WC(2, pz) * funcc(ii, iy, kc-1) + &
|
||||||
|
WC(3, pz) * funcc(ii, iy, kc ) + &
|
||||||
|
WC(4, pz) * funcc(ii, iy, kc+1) + &
|
||||||
|
WC(5, pz) * funcc(ii, iy, kc+2) + &
|
||||||
|
WC(6, pz) * funcc(ii, iy, kc+3) )
|
||||||
|
end do
|
||||||
|
tmp_xyz_line(ii) = res_line
|
||||||
|
end do
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
if(any(cxI+3 > extc)) write(*,*)"error in prolong"
|
|
||||||
ii=i+lbf(1)-1
|
|
||||||
jj=j+lbf(2)-1
|
|
||||||
kk=k+lbf(3)-1
|
|
||||||
#if 0
|
#if 0
|
||||||
if(ii/2*2==ii)then
|
! 1. 【降维:Z 向】对当前 (j,k) 相关的 6 条 Y 偏置线进行 Z 向插值
|
||||||
if(jj/2*2==jj)then
|
! 结果存入 tmp_yz(x_index, y_offset)
|
||||||
if(kk/2*2==kk)then
|
do jj = 1, 6
|
||||||
tmp2= C1*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+&
|
iy = jc - 3 + jj
|
||||||
C2*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+&
|
do ii = 1, extc(1)
|
||||||
C3*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3) )+&
|
tmp_yz(ii, jj) = WC(1,pz)*funcc(ii, iy, kc-2) + &
|
||||||
C4*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+&
|
WC(2,pz)*funcc(ii, iy, kc-1) + &
|
||||||
C5*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+&
|
WC(3,pz)*funcc(ii, iy, kc ) + &
|
||||||
C6*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3)
|
WC(4,pz)*funcc(ii, iy, kc+1) + &
|
||||||
tmp1= C1*tmp2(:,1)+C2*tmp2(:,2)+C3*tmp2(:,3)+C4*tmp2(:,4)+C5*tmp2(:,5)+C6*tmp2(:,6)
|
WC(5,pz)*funcc(ii, iy, kc+2) + &
|
||||||
funf(i,j,k)= C1*tmp1(1)+C2*tmp1(2)+C3*tmp1(3)+C4*tmp1(4)+C5*tmp1(5)+C6*tmp1(6)
|
WC(6,pz)*funcc(ii, iy, kc+3)
|
||||||
else
|
end do
|
||||||
tmp2= C6*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+&
|
end do
|
||||||
C5*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+&
|
|
||||||
C4*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3) )+&
|
|
||||||
C3*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+&
|
|
||||||
C2*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+&
|
|
||||||
C1*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3)
|
|
||||||
tmp1= C1*tmp2(:,1)+C2*tmp2(:,2)+C3*tmp2(:,3)+C4*tmp2(:,4)+C5*tmp2(:,5)+C6*tmp2(:,6)
|
|
||||||
funf(i,j,k)= C1*tmp1(1)+C2*tmp1(2)+C3*tmp1(3)+C4*tmp1(4)+C5*tmp1(5)+C6*tmp1(6)
|
|
||||||
endif
|
|
||||||
else
|
|
||||||
if(kk/2*2==kk)then
|
|
||||||
tmp2= C1*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+&
|
|
||||||
C2*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+&
|
|
||||||
C3*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3) )+&
|
|
||||||
C4*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+&
|
|
||||||
C5*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+&
|
|
||||||
C6*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3)
|
|
||||||
tmp1= C6*tmp2(:,1)+C5*tmp2(:,2)+C4*tmp2(:,3)+C3*tmp2(:,4)+C2*tmp2(:,5)+C1*tmp2(:,6)
|
|
||||||
funf(i,j,k)= C1*tmp1(1)+C2*tmp1(2)+C3*tmp1(3)+C4*tmp1(4)+C5*tmp1(5)+C6*tmp1(6)
|
|
||||||
else
|
|
||||||
tmp2= C6*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+&
|
|
||||||
C5*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+&
|
|
||||||
C4*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3) )+&
|
|
||||||
C3*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+&
|
|
||||||
C2*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+&
|
|
||||||
C1*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3)
|
|
||||||
tmp1= C6*tmp2(:,1)+C5*tmp2(:,2)+C4*tmp2(:,3)+C3*tmp2(:,4)+C2*tmp2(:,5)+C1*tmp2(:,6)
|
|
||||||
funf(i,j,k)= C1*tmp1(1)+C2*tmp1(2)+C3*tmp1(3)+C4*tmp1(4)+C5*tmp1(5)+C6*tmp1(6)
|
|
||||||
endif
|
|
||||||
endif
|
|
||||||
else
|
|
||||||
if(jj/2*2==jj)then
|
|
||||||
if(kk/2*2==kk)then
|
|
||||||
tmp2= C1*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+&
|
|
||||||
C2*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+&
|
|
||||||
C3*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3) )+&
|
|
||||||
C4*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+&
|
|
||||||
C5*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+&
|
|
||||||
C6*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3)
|
|
||||||
tmp1= C1*tmp2(:,1)+C2*tmp2(:,2)+C3*tmp2(:,3)+C4*tmp2(:,4)+C5*tmp2(:,5)+C6*tmp2(:,6)
|
|
||||||
funf(i,j,k)= C6*tmp1(1)+C5*tmp1(2)+C4*tmp1(3)+C3*tmp1(4)+C2*tmp1(5)+C1*tmp1(6)
|
|
||||||
else
|
|
||||||
tmp2= C6*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+&
|
|
||||||
C5*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+&
|
|
||||||
C4*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3) )+&
|
|
||||||
C3*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+&
|
|
||||||
C2*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+&
|
|
||||||
C1*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3)
|
|
||||||
tmp1= C1*tmp2(:,1)+C2*tmp2(:,2)+C3*tmp2(:,3)+C4*tmp2(:,4)+C5*tmp2(:,5)+C6*tmp2(:,6)
|
|
||||||
funf(i,j,k)= C6*tmp1(1)+C5*tmp1(2)+C4*tmp1(3)+C3*tmp1(4)+C2*tmp1(5)+C1*tmp1(6)
|
|
||||||
endif
|
|
||||||
else
|
|
||||||
if(kk/2*2==kk)then
|
|
||||||
tmp2= C1*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+&
|
|
||||||
C2*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+&
|
|
||||||
C3*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3) )+&
|
|
||||||
C4*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+&
|
|
||||||
C5*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+&
|
|
||||||
C6*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3)
|
|
||||||
tmp1= C6*tmp2(:,1)+C5*tmp2(:,2)+C4*tmp2(:,3)+C3*tmp2(:,4)+C2*tmp2(:,5)+C1*tmp2(:,6)
|
|
||||||
funf(i,j,k)= C6*tmp1(1)+C5*tmp1(2)+C4*tmp1(3)+C3*tmp1(4)+C2*tmp1(5)+C1*tmp1(6)
|
|
||||||
else
|
|
||||||
tmp2= C6*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+&
|
|
||||||
C5*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+&
|
|
||||||
C4*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3) )+&
|
|
||||||
C3*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+&
|
|
||||||
C2*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+&
|
|
||||||
C1*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3)
|
|
||||||
tmp1= C6*tmp2(:,1)+C5*tmp2(:,2)+C4*tmp2(:,3)+C3*tmp2(:,4)+C2*tmp2(:,5)+C1*tmp2(:,6)
|
|
||||||
funf(i,j,k)= C6*tmp1(1)+C5*tmp1(2)+C4*tmp1(3)+C3*tmp1(4)+C2*tmp1(5)+C1*tmp1(6)
|
|
||||||
endif
|
|
||||||
endif
|
|
||||||
endif
|
|
||||||
#else
|
|
||||||
if(kk/2*2==kk)then
|
|
||||||
tmp2= C1*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+&
|
|
||||||
C2*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+&
|
|
||||||
C3*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3) )+&
|
|
||||||
C4*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+&
|
|
||||||
C5*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+&
|
|
||||||
C6*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3)
|
|
||||||
else
|
|
||||||
tmp2= C6*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+&
|
|
||||||
C5*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+&
|
|
||||||
C4*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3) )+&
|
|
||||||
C3*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+&
|
|
||||||
C2*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+&
|
|
||||||
C1*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3)
|
|
||||||
endif
|
|
||||||
|
|
||||||
if(jj/2*2==jj)then
|
! 2. 【降维:Y 向】将 Z 向结果合并,得到整条 X 轴线上的 Y-Z 融合值
|
||||||
tmp1= C1*tmp2(:,1)+C2*tmp2(:,2)+C3*tmp2(:,3)+C4*tmp2(:,4)+C5*tmp2(:,5)+C6*tmp2(:,6)
|
do ii = 1, extc(1)
|
||||||
else
|
tmp_xyz_line(ii) = WC(1,py)*tmp_yz(ii, 1) + WC(2,py)*tmp_yz(ii, 2) + &
|
||||||
tmp1= C6*tmp2(:,1)+C5*tmp2(:,2)+C4*tmp2(:,3)+C3*tmp2(:,4)+C2*tmp2(:,5)+C1*tmp2(:,6)
|
WC(3,py)*tmp_yz(ii, 3) + WC(4,py)*tmp_yz(ii, 4) + &
|
||||||
endif
|
WC(5,py)*tmp_yz(ii, 5) + WC(6,py)*tmp_yz(ii, 6)
|
||||||
|
end do
|
||||||
if(ii/2*2==ii)then
|
|
||||||
funf(i,j,k)= C1*tmp1(1)+C2*tmp1(2)+C3*tmp1(3)+C4*tmp1(4)+C5*tmp1(5)+C6*tmp1(6)
|
|
||||||
else
|
|
||||||
funf(i,j,k)= C6*tmp1(1)+C5*tmp1(2)+C4*tmp1(3)+C3*tmp1(4)+C2*tmp1(5)+C1*tmp1(6)
|
|
||||||
endif
|
|
||||||
#endif
|
#endif
|
||||||
enddo
|
! 3. 【降维:X 向】最后在最内层只处理 X 方向的 6 点加权
|
||||||
enddo
|
! 此时每个点的计算量从原来的 200+ 次乘法降到了仅 6 次
|
||||||
enddo
|
do i = imino, imaxo
|
||||||
|
px = pix(i)
|
||||||
|
ic = cix(i)
|
||||||
|
|
||||||
|
! 直接从预计算好的 line 中读取连续的 6 个点
|
||||||
|
! ic-2 到 ic+3 对应原始 6 点算子
|
||||||
|
funf(i,j,k) = WC(1,px)*tmp_xyz_line(ic-2) + &
|
||||||
|
WC(2,px)*tmp_xyz_line(ic-1) + &
|
||||||
|
WC(3,px)*tmp_xyz_line(ic ) + &
|
||||||
|
WC(4,px)*tmp_xyz_line(ic+1) + &
|
||||||
|
WC(5,px)*tmp_xyz_line(ic+2) + &
|
||||||
|
WC(6,px)*tmp_xyz_line(ic+3)
|
||||||
|
end do
|
||||||
|
end do
|
||||||
|
end do
|
||||||
|
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|||||||
212
AMSS_NCKU_source/rungekutta4_rout_c.C
Normal file
212
AMSS_NCKU_source/rungekutta4_rout_c.C
Normal file
@@ -0,0 +1,212 @@
|
|||||||
|
#include "rungekutta4_rout.h"
|
||||||
|
#include <cstdio>
|
||||||
|
#include <cstdlib>
|
||||||
|
#include <cstddef>
|
||||||
|
#include <complex>
|
||||||
|
#include <immintrin.h>
|
||||||
|
|
||||||
|
namespace {
|
||||||
|
|
||||||
|
inline void rk4_stage0(std::size_t n,
|
||||||
|
const double *__restrict f0,
|
||||||
|
const double *__restrict frhs,
|
||||||
|
double *__restrict f1,
|
||||||
|
double c) {
|
||||||
|
std::size_t i = 0;
|
||||||
|
#if defined(__AVX512F__)
|
||||||
|
const __m512d vc = _mm512_set1_pd(c);
|
||||||
|
for (; i + 7 < n; i += 8) {
|
||||||
|
const __m512d v0 = _mm512_loadu_pd(f0 + i);
|
||||||
|
const __m512d vr = _mm512_loadu_pd(frhs + i);
|
||||||
|
_mm512_storeu_pd(f1 + i, _mm512_fmadd_pd(vc, vr, v0));
|
||||||
|
}
|
||||||
|
#elif defined(__AVX2__)
|
||||||
|
const __m256d vc = _mm256_set1_pd(c);
|
||||||
|
for (; i + 3 < n; i += 4) {
|
||||||
|
const __m256d v0 = _mm256_loadu_pd(f0 + i);
|
||||||
|
const __m256d vr = _mm256_loadu_pd(frhs + i);
|
||||||
|
_mm256_storeu_pd(f1 + i, _mm256_fmadd_pd(vc, vr, v0));
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
#pragma ivdep
|
||||||
|
for (; i < n; ++i) {
|
||||||
|
f1[i] = f0[i] + c * frhs[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
inline void rk4_rhs_accum(std::size_t n,
|
||||||
|
const double *__restrict f1,
|
||||||
|
double *__restrict frhs) {
|
||||||
|
std::size_t i = 0;
|
||||||
|
#if defined(__AVX512F__)
|
||||||
|
const __m512d v2 = _mm512_set1_pd(2.0);
|
||||||
|
for (; i + 7 < n; i += 8) {
|
||||||
|
const __m512d v1 = _mm512_loadu_pd(f1 + i);
|
||||||
|
const __m512d vrhs = _mm512_loadu_pd(frhs + i);
|
||||||
|
_mm512_storeu_pd(frhs + i, _mm512_fmadd_pd(v2, v1, vrhs));
|
||||||
|
}
|
||||||
|
#elif defined(__AVX2__)
|
||||||
|
const __m256d v2 = _mm256_set1_pd(2.0);
|
||||||
|
for (; i + 3 < n; i += 4) {
|
||||||
|
const __m256d v1 = _mm256_loadu_pd(f1 + i);
|
||||||
|
const __m256d vrhs = _mm256_loadu_pd(frhs + i);
|
||||||
|
_mm256_storeu_pd(frhs + i, _mm256_fmadd_pd(v2, v1, vrhs));
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
#pragma ivdep
|
||||||
|
for (; i < n; ++i) {
|
||||||
|
frhs[i] = frhs[i] + 2.0 * f1[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
inline void rk4_f1_from_f0_f1(std::size_t n,
|
||||||
|
const double *__restrict f0,
|
||||||
|
double *__restrict f1,
|
||||||
|
double c) {
|
||||||
|
std::size_t i = 0;
|
||||||
|
#if defined(__AVX512F__)
|
||||||
|
const __m512d vc = _mm512_set1_pd(c);
|
||||||
|
for (; i + 7 < n; i += 8) {
|
||||||
|
const __m512d v0 = _mm512_loadu_pd(f0 + i);
|
||||||
|
const __m512d v1 = _mm512_loadu_pd(f1 + i);
|
||||||
|
_mm512_storeu_pd(f1 + i, _mm512_fmadd_pd(vc, v1, v0));
|
||||||
|
}
|
||||||
|
#elif defined(__AVX2__)
|
||||||
|
const __m256d vc = _mm256_set1_pd(c);
|
||||||
|
for (; i + 3 < n; i += 4) {
|
||||||
|
const __m256d v0 = _mm256_loadu_pd(f0 + i);
|
||||||
|
const __m256d v1 = _mm256_loadu_pd(f1 + i);
|
||||||
|
_mm256_storeu_pd(f1 + i, _mm256_fmadd_pd(vc, v1, v0));
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
#pragma ivdep
|
||||||
|
for (; i < n; ++i) {
|
||||||
|
f1[i] = f0[i] + c * f1[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
inline void rk4_stage3(std::size_t n,
|
||||||
|
const double *__restrict f0,
|
||||||
|
double *__restrict f1,
|
||||||
|
const double *__restrict frhs,
|
||||||
|
double c) {
|
||||||
|
std::size_t i = 0;
|
||||||
|
#if defined(__AVX512F__)
|
||||||
|
const __m512d vc = _mm512_set1_pd(c);
|
||||||
|
for (; i + 7 < n; i += 8) {
|
||||||
|
const __m512d v0 = _mm512_loadu_pd(f0 + i);
|
||||||
|
const __m512d v1 = _mm512_loadu_pd(f1 + i);
|
||||||
|
const __m512d vr = _mm512_loadu_pd(frhs + i);
|
||||||
|
_mm512_storeu_pd(f1 + i, _mm512_fmadd_pd(vc, _mm512_add_pd(v1, vr), v0));
|
||||||
|
}
|
||||||
|
#elif defined(__AVX2__)
|
||||||
|
const __m256d vc = _mm256_set1_pd(c);
|
||||||
|
for (; i + 3 < n; i += 4) {
|
||||||
|
const __m256d v0 = _mm256_loadu_pd(f0 + i);
|
||||||
|
const __m256d v1 = _mm256_loadu_pd(f1 + i);
|
||||||
|
const __m256d vr = _mm256_loadu_pd(frhs + i);
|
||||||
|
_mm256_storeu_pd(f1 + i, _mm256_fmadd_pd(vc, _mm256_add_pd(v1, vr), v0));
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
#pragma ivdep
|
||||||
|
for (; i < n; ++i) {
|
||||||
|
f1[i] = f0[i] + c * (f1[i] + frhs[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace
|
||||||
|
|
||||||
|
extern "C" {
|
||||||
|
|
||||||
|
void f_rungekutta4_scalar(double &dT, double &f0, double &f1, double &f_rhs, int &RK4) {
|
||||||
|
constexpr double F1o6 = 1.0 / 6.0;
|
||||||
|
constexpr double HLF = 0.5;
|
||||||
|
constexpr double TWO = 2.0;
|
||||||
|
|
||||||
|
switch (RK4) {
|
||||||
|
case 0:
|
||||||
|
f1 = f0 + HLF * dT * f_rhs;
|
||||||
|
break;
|
||||||
|
case 1:
|
||||||
|
f_rhs = f_rhs + TWO * f1;
|
||||||
|
f1 = f0 + HLF * dT * f1;
|
||||||
|
break;
|
||||||
|
case 2:
|
||||||
|
f_rhs = f_rhs + TWO * f1;
|
||||||
|
f1 = f0 + dT * f1;
|
||||||
|
break;
|
||||||
|
case 3:
|
||||||
|
f1 = f0 + F1o6 * dT * (f1 + f_rhs);
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
std::fprintf(stderr, "rungekutta4_scalar_c: invalid RK4 stage %d\n", RK4);
|
||||||
|
std::abort();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void rungekutta4_cplxscalar_(double &dT,
|
||||||
|
std::complex<double> &f0,
|
||||||
|
std::complex<double> &f1,
|
||||||
|
std::complex<double> &f_rhs,
|
||||||
|
int &RK4) {
|
||||||
|
constexpr double F1o6 = 1.0 / 6.0;
|
||||||
|
constexpr double HLF = 0.5;
|
||||||
|
constexpr double TWO = 2.0;
|
||||||
|
|
||||||
|
switch (RK4) {
|
||||||
|
case 0:
|
||||||
|
f1 = f0 + HLF * dT * f_rhs;
|
||||||
|
break;
|
||||||
|
case 1:
|
||||||
|
f_rhs = f_rhs + TWO * f1;
|
||||||
|
f1 = f0 + HLF * dT * f1;
|
||||||
|
break;
|
||||||
|
case 2:
|
||||||
|
f_rhs = f_rhs + TWO * f1;
|
||||||
|
f1 = f0 + dT * f1;
|
||||||
|
break;
|
||||||
|
case 3:
|
||||||
|
f1 = f0 + F1o6 * dT * (f1 + f_rhs);
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
std::fprintf(stderr, "rungekutta4_cplxscalar_c: invalid RK4 stage %d\n", RK4);
|
||||||
|
std::abort();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
int f_rungekutta4_rout(int *ex, double &dT,
|
||||||
|
double *f0, double *f1, double *f_rhs,
|
||||||
|
int &RK4) {
|
||||||
|
const std::size_t n = static_cast<std::size_t>(ex[0]) *
|
||||||
|
static_cast<std::size_t>(ex[1]) *
|
||||||
|
static_cast<std::size_t>(ex[2]);
|
||||||
|
const double *const __restrict f0r = f0;
|
||||||
|
double *const __restrict f1r = f1;
|
||||||
|
double *const __restrict frhs = f_rhs;
|
||||||
|
|
||||||
|
if (__builtin_expect(static_cast<unsigned>(RK4) > 3u, 0)) {
|
||||||
|
std::fprintf(stderr, "rungekutta4_rout_c: invalid RK4 stage %d\n", RK4);
|
||||||
|
std::abort();
|
||||||
|
}
|
||||||
|
|
||||||
|
switch (RK4) {
|
||||||
|
case 0:
|
||||||
|
rk4_stage0(n, f0r, frhs, f1r, 0.5 * dT);
|
||||||
|
break;
|
||||||
|
case 1:
|
||||||
|
rk4_rhs_accum(n, f1r, frhs);
|
||||||
|
rk4_f1_from_f0_f1(n, f0r, f1r, 0.5 * dT);
|
||||||
|
break;
|
||||||
|
case 2:
|
||||||
|
rk4_rhs_accum(n, f1r, frhs);
|
||||||
|
rk4_f1_from_f0_f1(n, f0r, f1r, dT);
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
rk4_stage3(n, f0r, f1r, frhs, (1.0 / 6.0) * dT);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
} // extern "C"
|
||||||
@@ -5,6 +5,7 @@
|
|||||||
#include <stddef.h>
|
#include <stddef.h>
|
||||||
#include <math.h>
|
#include <math.h>
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
|
#include <string.h>
|
||||||
/* 主网格:0-based -> 1D */
|
/* 主网格:0-based -> 1D */
|
||||||
static inline size_t idx_ex(int i0, int j0, int k0, const int ex[3]) {
|
static inline size_t idx_ex(int i0, int j0, int k0, const int ex[3]) {
|
||||||
const int ex1 = ex[0], ex2 = ex[1];
|
const int ex1 = ex[0], ex2 = ex[1];
|
||||||
@@ -87,60 +88,159 @@ static inline size_t idx_funcc_F(int iF, int jF, int kF, int ord, const int extc
|
|||||||
* funcc(:,:,-i) = funcc(:,:,i+1)*SoA(3)
|
* funcc(:,:,-i) = funcc(:,:,i+1)*SoA(3)
|
||||||
* enddo
|
* enddo
|
||||||
*/
|
*/
|
||||||
|
static inline void symmetry_bd_impl(int ord,
|
||||||
|
int shift,
|
||||||
|
const int extc[3],
|
||||||
|
const double *__restrict func,
|
||||||
|
double *__restrict funcc,
|
||||||
|
const double SoA[3])
|
||||||
|
{
|
||||||
|
const int extc1 = extc[0], extc2 = extc[1], extc3 = extc[2];
|
||||||
|
const int nx = extc1 + ord;
|
||||||
|
const int ny = extc2 + ord;
|
||||||
|
|
||||||
|
const size_t snx = (size_t)nx;
|
||||||
|
const size_t splane = (size_t)nx * (size_t)ny;
|
||||||
|
const size_t interior_i = (size_t)shift + 1u; /* iF = 1 */
|
||||||
|
const size_t interior_j = ((size_t)shift + 1u) * snx; /* jF = 1 */
|
||||||
|
const size_t interior_k = ((size_t)shift + 1u) * splane; /* kF = 1 */
|
||||||
|
const size_t interior0 = interior_k + interior_j + interior_i;
|
||||||
|
|
||||||
|
/* 1) funcc(1:extc1,1:extc2,1:extc3) = func */
|
||||||
|
for (int k0 = 0; k0 < extc3; ++k0) {
|
||||||
|
const double *src_k = func + (size_t)k0 * (size_t)extc2 * (size_t)extc1;
|
||||||
|
const size_t dst_k0 = interior0 + (size_t)k0 * splane;
|
||||||
|
for (int j0 = 0; j0 < extc2; ++j0) {
|
||||||
|
const double *src = src_k + (size_t)j0 * (size_t)extc1;
|
||||||
|
double *dst = funcc + dst_k0 + (size_t)j0 * snx;
|
||||||
|
memcpy(dst, src, (size_t)extc1 * sizeof(double));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* 2) funcc(-i,1:extc2,1:extc3) = funcc(i+1,1:extc2,1:extc3)*SoA(1) */
|
||||||
|
const double s1 = SoA[0];
|
||||||
|
if (s1 == 1.0) {
|
||||||
|
for (int ii = 0; ii < ord; ++ii) {
|
||||||
|
const size_t dst_i = (size_t)(shift - ii);
|
||||||
|
const size_t src_i = (size_t)(shift + ii + 1);
|
||||||
|
for (int k0 = 0; k0 < extc3; ++k0) {
|
||||||
|
const size_t kbase = interior_k + (size_t)k0 * splane + interior_j;
|
||||||
|
for (int j0 = 0; j0 < extc2; ++j0) {
|
||||||
|
const size_t off = kbase + (size_t)j0 * snx;
|
||||||
|
funcc[off + dst_i] = funcc[off + src_i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else if (s1 == -1.0) {
|
||||||
|
for (int ii = 0; ii < ord; ++ii) {
|
||||||
|
const size_t dst_i = (size_t)(shift - ii);
|
||||||
|
const size_t src_i = (size_t)(shift + ii + 1);
|
||||||
|
for (int k0 = 0; k0 < extc3; ++k0) {
|
||||||
|
const size_t kbase = interior_k + (size_t)k0 * splane + interior_j;
|
||||||
|
for (int j0 = 0; j0 < extc2; ++j0) {
|
||||||
|
const size_t off = kbase + (size_t)j0 * snx;
|
||||||
|
funcc[off + dst_i] = -funcc[off + src_i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
for (int ii = 0; ii < ord; ++ii) {
|
||||||
|
const size_t dst_i = (size_t)(shift - ii);
|
||||||
|
const size_t src_i = (size_t)(shift + ii + 1);
|
||||||
|
for (int k0 = 0; k0 < extc3; ++k0) {
|
||||||
|
const size_t kbase = interior_k + (size_t)k0 * splane + interior_j;
|
||||||
|
for (int j0 = 0; j0 < extc2; ++j0) {
|
||||||
|
const size_t off = kbase + (size_t)j0 * snx;
|
||||||
|
funcc[off + dst_i] = funcc[off + src_i] * s1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* 3) funcc(:,-j,1:extc3) = funcc(:,j+1,1:extc3)*SoA(2) */
|
||||||
|
const double s2 = SoA[1];
|
||||||
|
if (s2 == 1.0) {
|
||||||
|
for (int jj = 0; jj < ord; ++jj) {
|
||||||
|
const size_t dst_j = (size_t)(shift - jj) * snx;
|
||||||
|
const size_t src_j = (size_t)(shift + jj + 1) * snx;
|
||||||
|
for (int k0 = 0; k0 < extc3; ++k0) {
|
||||||
|
const size_t kbase = interior_k + (size_t)k0 * splane;
|
||||||
|
double *dst = funcc + kbase + dst_j;
|
||||||
|
const double *src = funcc + kbase + src_j;
|
||||||
|
for (int i = 0; i < nx; ++i) dst[i] = src[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else if (s2 == -1.0) {
|
||||||
|
for (int jj = 0; jj < ord; ++jj) {
|
||||||
|
const size_t dst_j = (size_t)(shift - jj) * snx;
|
||||||
|
const size_t src_j = (size_t)(shift + jj + 1) * snx;
|
||||||
|
for (int k0 = 0; k0 < extc3; ++k0) {
|
||||||
|
const size_t kbase = interior_k + (size_t)k0 * splane;
|
||||||
|
double *dst = funcc + kbase + dst_j;
|
||||||
|
const double *src = funcc + kbase + src_j;
|
||||||
|
for (int i = 0; i < nx; ++i) dst[i] = -src[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
for (int jj = 0; jj < ord; ++jj) {
|
||||||
|
const size_t dst_j = (size_t)(shift - jj) * snx;
|
||||||
|
const size_t src_j = (size_t)(shift + jj + 1) * snx;
|
||||||
|
for (int k0 = 0; k0 < extc3; ++k0) {
|
||||||
|
const size_t kbase = interior_k + (size_t)k0 * splane;
|
||||||
|
double *dst = funcc + kbase + dst_j;
|
||||||
|
const double *src = funcc + kbase + src_j;
|
||||||
|
for (int i = 0; i < nx; ++i) dst[i] = src[i] * s2;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* 4) funcc(:,:,-k) = funcc(:,:,k+1)*SoA(3) */
|
||||||
|
const double s3 = SoA[2];
|
||||||
|
if (s3 == 1.0) {
|
||||||
|
for (int kk = 0; kk < ord; ++kk) {
|
||||||
|
const size_t dst_k = (size_t)(shift - kk) * splane;
|
||||||
|
const size_t src_k = (size_t)(shift + kk + 1) * splane;
|
||||||
|
double *dst = funcc + dst_k;
|
||||||
|
const double *src = funcc + src_k;
|
||||||
|
for (size_t p = 0; p < splane; ++p) dst[p] = src[p];
|
||||||
|
}
|
||||||
|
} else if (s3 == -1.0) {
|
||||||
|
for (int kk = 0; kk < ord; ++kk) {
|
||||||
|
const size_t dst_k = (size_t)(shift - kk) * splane;
|
||||||
|
const size_t src_k = (size_t)(shift + kk + 1) * splane;
|
||||||
|
double *dst = funcc + dst_k;
|
||||||
|
const double *src = funcc + src_k;
|
||||||
|
for (size_t p = 0; p < splane; ++p) dst[p] = -src[p];
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
for (int kk = 0; kk < ord; ++kk) {
|
||||||
|
const size_t dst_k = (size_t)(shift - kk) * splane;
|
||||||
|
const size_t src_k = (size_t)(shift + kk + 1) * splane;
|
||||||
|
double *dst = funcc + dst_k;
|
||||||
|
const double *src = funcc + src_k;
|
||||||
|
for (size_t p = 0; p < splane; ++p) dst[p] = src[p] * s3;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
static inline void symmetry_bd(int ord,
|
static inline void symmetry_bd(int ord,
|
||||||
const int extc[3],
|
const int extc[3],
|
||||||
const double *func,
|
const double *func,
|
||||||
double *funcc,
|
double *funcc,
|
||||||
const double SoA[3])
|
const double SoA[3])
|
||||||
{
|
{
|
||||||
const int extc1 = extc[0], extc2 = extc[1], extc3 = extc[2];
|
if (ord <= 0) return;
|
||||||
|
|
||||||
// 1) funcc(1:extc1,1:extc2,1:extc3) = func
|
/* Fast paths used by current C kernels: ord=2 (derivs), ord=3 (lopsided/KO). */
|
||||||
// Fortran 的 (iF=1..extc1) 对应 C 的 func(i0=0..extc1-1)
|
if (ord == 2) {
|
||||||
for (int k0 = 0; k0 < extc3; ++k0) {
|
symmetry_bd_impl(2, 1, extc, func, funcc, SoA);
|
||||||
for (int j0 = 0; j0 < extc2; ++j0) {
|
return;
|
||||||
for (int i0 = 0; i0 < extc1; ++i0) {
|
}
|
||||||
const int iF = i0 + 1, jF = j0 + 1, kF = k0 + 1;
|
if (ord == 3) {
|
||||||
funcc[idx_funcc_F(iF, jF, kF, ord, extc)] = func[idx_func0(i0, j0, k0, extc)];
|
symmetry_bd_impl(3, 2, extc, func, funcc, SoA);
|
||||||
}
|
return;
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// 2) do i=0..ord-1: funcc(-i, 1:extc2, 1:extc3) = funcc(i+1, ...)*SoA(1)
|
symmetry_bd_impl(ord, ord - 1, extc, func, funcc, SoA);
|
||||||
for (int ii = 0; ii <= ord - 1; ++ii) {
|
|
||||||
const int iF_dst = -ii; // 0, -1, -2, ...
|
|
||||||
const int iF_src = ii + 1; // 1, 2, 3, ...
|
|
||||||
for (int kF = 1; kF <= extc3; ++kF) {
|
|
||||||
for (int jF = 1; jF <= extc2; ++jF) {
|
|
||||||
funcc[idx_funcc_F(iF_dst, jF, kF, ord, extc)] =
|
|
||||||
funcc[idx_funcc_F(iF_src, jF, kF, ord, extc)] * SoA[0];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// 3) do i=0..ord-1: funcc(:,-i, 1:extc3) = funcc(:, i+1, 1:extc3)*SoA(2)
|
|
||||||
// 注意 Fortran 这里的 ":" 表示 iF 从 (-ord+1..extc1) 全覆盖
|
|
||||||
for (int jj = 0; jj <= ord - 1; ++jj) {
|
|
||||||
const int jF_dst = -jj;
|
|
||||||
const int jF_src = jj + 1;
|
|
||||||
for (int kF = 1; kF <= extc3; ++kF) {
|
|
||||||
for (int iF = -ord + 1; iF <= extc1; ++iF) {
|
|
||||||
funcc[idx_funcc_F(iF, jF_dst, kF, ord, extc)] =
|
|
||||||
funcc[idx_funcc_F(iF, jF_src, kF, ord, extc)] * SoA[1];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// 4) do i=0..ord-1: funcc(:,:,-i) = funcc(:,:, i+1)*SoA(3)
|
|
||||||
for (int kk = 0; kk <= ord - 1; ++kk) {
|
|
||||||
const int kF_dst = -kk;
|
|
||||||
const int kF_src = kk + 1;
|
|
||||||
for (int jF = -ord + 1; jF <= extc2; ++jF) {
|
|
||||||
for (int iF = -ord + 1; iF <= extc1; ++iF) {
|
|
||||||
funcc[idx_funcc_F(iF, jF, kF_dst, ord, extc)] =
|
|
||||||
funcc[idx_funcc_F(iF, jF, kF_src, ord, extc)] * SoA[2];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
@@ -25,3 +25,9 @@ void lopsided(const int ex[3],
|
|||||||
const double *f, double *f_rhs,
|
const double *f, double *f_rhs,
|
||||||
const double *Sfx, const double *Sfy, const double *Sfz,
|
const double *Sfx, const double *Sfy, const double *Sfz,
|
||||||
int Symmetry, const double SoA[3]);
|
int Symmetry, const double SoA[3]);
|
||||||
|
|
||||||
|
void lopsided_kodis(const int ex[3],
|
||||||
|
const double *X, const double *Y, const double *Z,
|
||||||
|
const double *f, double *f_rhs,
|
||||||
|
const double *Sfx, const double *Sfy, const double *Sfz,
|
||||||
|
int Symmetry, const double SoA[3], double eps);
|
||||||
|
|||||||
@@ -43,7 +43,8 @@ def get_last_n_cores_per_socket(n=32):
|
|||||||
cpu_str = ",".join(segments)
|
cpu_str = ",".join(segments)
|
||||||
total = len(segments) * n
|
total = len(segments) * n
|
||||||
print(f" CPU binding: taskset -c {cpu_str} ({total} cores, last {n} per socket)")
|
print(f" CPU binding: taskset -c {cpu_str} ({total} cores, last {n} per socket)")
|
||||||
return f"taskset -c {cpu_str}"
|
#return f"taskset -c {cpu_str}"
|
||||||
|
return f""
|
||||||
|
|
||||||
|
|
||||||
## CPU core binding: dynamically select the last 32 cores of each socket (64 cores total)
|
## CPU core binding: dynamically select the last 32 cores of each socket (64 cores total)
|
||||||
@@ -69,7 +70,7 @@ def makefile_ABE():
|
|||||||
|
|
||||||
## Build command with CPU binding to nohz_full cores
|
## Build command with CPU binding to nohz_full cores
|
||||||
if (input_data.GPU_Calculation == "no"):
|
if (input_data.GPU_Calculation == "no"):
|
||||||
makefile_command = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} INTERP_LB_MODE=optimize ABE"
|
makefile_command = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} INTERP_LB_MODE=off ABE"
|
||||||
elif (input_data.GPU_Calculation == "yes"):
|
elif (input_data.GPU_Calculation == "yes"):
|
||||||
makefile_command = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} ABEGPU"
|
makefile_command = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} ABEGPU"
|
||||||
else:
|
else:
|
||||||
|
|||||||
@@ -1,97 +0,0 @@
|
|||||||
# AMSS-NCKU PGO Profile Analysis Report
|
|
||||||
|
|
||||||
## 1. Profiling Environment
|
|
||||||
|
|
||||||
| Item | Value |
|
|
||||||
|------|-------|
|
|
||||||
| Compiler | Intel oneAPI DPC++/C++ 2025.3.0 (icpx/ifx) |
|
|
||||||
| Instrumentation Flag | `-fprofile-instr-generate` |
|
|
||||||
| Optimization Level (instrumented) | `-O2 -xHost -fma` |
|
|
||||||
| MPI Processes | 1 (single process to avoid MPI+instrumentation deadlock) |
|
|
||||||
| Profile File | `default_9725750769337483397_0.profraw` (327 KB) |
|
|
||||||
| Merged Profile | `default.profdata` (394 KB) |
|
|
||||||
| llvm-profdata | `/home/intel/oneapi/compiler/2025.3/bin/compiler/llvm-profdata` |
|
|
||||||
|
|
||||||
## 2. Reduced Simulation Parameters (for profiling run)
|
|
||||||
|
|
||||||
| Parameter | Production Value | Profiling Value |
|
|
||||||
|-----------|-----------------|-----------------|
|
|
||||||
| MPI_processes | 64 | 1 |
|
|
||||||
| grid_level | 9 | 4 |
|
|
||||||
| static_grid_level | 5 | 3 |
|
|
||||||
| static_grid_number | 96 | 24 |
|
|
||||||
| moving_grid_number | 48 | 16 |
|
|
||||||
| largest_box_xyz_max | 320^3 | 160^3 |
|
|
||||||
| Final_Evolution_Time | 1000.0 | 10.0 |
|
|
||||||
| Evolution_Step_Number | 10,000,000 | 1,000 |
|
|
||||||
| Detector_Number | 12 | 2 |
|
|
||||||
|
|
||||||
## 3. Profile Summary
|
|
||||||
|
|
||||||
| Metric | Value |
|
|
||||||
|--------|-------|
|
|
||||||
| Total instrumented functions | 1,392 |
|
|
||||||
| Functions with non-zero counts | 117 (8.4%) |
|
|
||||||
| Functions with zero counts | 1,275 (91.6%) |
|
|
||||||
| Maximum function entry count | 386,459,248 |
|
|
||||||
| Maximum internal block count | 370,477,680 |
|
|
||||||
| Total block count | 4,198,023,118 |
|
|
||||||
|
|
||||||
## 4. Top 20 Hotspot Functions
|
|
||||||
|
|
||||||
| Rank | Total Count | Max Block Count | Function | Category |
|
|
||||||
|------|------------|-----------------|----------|----------|
|
|
||||||
| 1 | 1,241,601,732 | 370,477,680 | `polint_` | Interpolation |
|
|
||||||
| 2 | 755,994,435 | 230,156,640 | `prolong3_` | Grid prolongation |
|
|
||||||
| 3 | 667,964,095 | 3,697,792 | `compute_rhs_bssn_` | BSSN RHS evolution |
|
|
||||||
| 4 | 539,736,051 | 386,459,248 | `symmetry_bd_` | Symmetry boundary |
|
|
||||||
| 5 | 277,310,808 | 53,170,728 | `lopsided_` | Lopsided FD stencil |
|
|
||||||
| 6 | 155,534,488 | 94,535,040 | `decide3d_` | 3D grid decision |
|
|
||||||
| 7 | 119,267,712 | 19,266,048 | `rungekutta4_rout_` | RK4 time integrator |
|
|
||||||
| 8 | 91,574,616 | 48,824,160 | `kodis_` | Kreiss-Oliger dissipation |
|
|
||||||
| 9 | 67,555,389 | 43,243,680 | `fderivs_` | Finite differences |
|
|
||||||
| 10 | 55,296,000 | 42,246,144 | `misc::fact(int)` | Factorial utility |
|
|
||||||
| 11 | 43,191,071 | 27,663,328 | `fdderivs_` | 2nd-order FD derivatives |
|
|
||||||
| 12 | 36,233,965 | 22,429,440 | `restrict3_` | Grid restriction |
|
|
||||||
| 13 | 24,698,512 | 17,231,520 | `polin3_` | Polynomial interpolation |
|
|
||||||
| 14 | 22,962,942 | 20,968,768 | `copy_` | Data copy |
|
|
||||||
| 15 | 20,135,696 | 17,259,168 | `Ansorg::barycentric(...)` | Spectral interpolation |
|
|
||||||
| 16 | 14,650,224 | 7,224,768 | `Ansorg::barycentric_omega(...)` | Spectral weights |
|
|
||||||
| 17 | 13,242,296 | 2,871,920 | `global_interp_` | Global interpolation |
|
|
||||||
| 18 | 12,672,000 | 7,734,528 | `sommerfeld_rout_` | Sommerfeld boundary |
|
|
||||||
| 19 | 6,872,832 | 1,880,064 | `sommerfeld_routbam_` | Sommerfeld boundary (BAM) |
|
|
||||||
| 20 | 5,709,900 | 2,809,632 | `l2normhelper_` | L2 norm computation |
|
|
||||||
|
|
||||||
## 5. Hotspot Category Breakdown
|
|
||||||
|
|
||||||
Top 20 functions account for ~98% of total execution counts:
|
|
||||||
|
|
||||||
| Category | Functions | Combined Count | Share |
|
|
||||||
|----------|-----------|---------------|-------|
|
|
||||||
| Interpolation / Prolongation / Restriction | polint_, prolong3_, restrict3_, polin3_, global_interp_, Ansorg::* | ~2,093M | ~50% |
|
|
||||||
| BSSN RHS + FD stencils | compute_rhs_bssn_, lopsided_, fderivs_, fdderivs_ | ~1,056M | ~25% |
|
|
||||||
| Boundary conditions | symmetry_bd_, sommerfeld_rout_, sommerfeld_routbam_ | ~559M | ~13% |
|
|
||||||
| Time integration | rungekutta4_rout_ | ~119M | ~3% |
|
|
||||||
| Dissipation | kodis_ | ~92M | ~2% |
|
|
||||||
| Utilities | misc::fact, decide3d_, copy_, l2normhelper_ | ~256M | ~6% |
|
|
||||||
|
|
||||||
## 6. Conclusions
|
|
||||||
|
|
||||||
1. **Profile data is valid**: 1,392 functions instrumented, 117 exercised with ~4.2 billion total counts.
|
|
||||||
2. **Hotspot concentration is high**: Top 5 functions alone account for ~76% of all counts, which is ideal for PGO — the compiler has strong branch/layout optimization targets.
|
|
||||||
3. **Fortran numerical kernels dominate**: `polint_`, `prolong3_`, `compute_rhs_bssn_`, `symmetry_bd_`, `lopsided_` are all Fortran routines in the inner evolution loop. PGO will optimize their branch prediction and basic block layout.
|
|
||||||
4. **91.6% of functions have zero counts**: These are code paths for unused features (GPU, BSSN-EScalar, BSSN-EM, Z4C, etc.). PGO will deprioritize them, improving instruction cache utilization.
|
|
||||||
5. **Profile is representative**: Despite the reduced grid size, the code path coverage matches production — the same kernels (RHS, prolongation, restriction, boundary) are exercised. PGO branch probabilities from this profile will transfer well to full-scale runs.
|
|
||||||
|
|
||||||
## 7. PGO Phase 2 Usage
|
|
||||||
|
|
||||||
To apply the profile, use the following flags in `makefile.inc`:
|
|
||||||
|
|
||||||
```makefile
|
|
||||||
CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
|
|
||||||
-fprofile-instr-use=/home/amss/AMSS-NCKU/pgo_profile/default.profdata \
|
|
||||||
-Dfortran3 -Dnewc -I${MKLROOT}/include
|
|
||||||
f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \
|
|
||||||
-fprofile-instr-use=/home/amss/AMSS-NCKU/pgo_profile/default.profdata \
|
|
||||||
-align array64byte -fpp -I${MKLROOT}/include
|
|
||||||
```
|
|
||||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Reference in New Issue
Block a user