Compare commits

..

4 Commits

Author SHA1 Message Date
jaunatisblue
588fb675a0 尝试划分4block但是效果不好,转为研究访存 2026-02-28 21:17:02 +08:00
aabe74c098 短暂的4划分但是以失败告终 2026-02-28 08:23:30 +08:00
jaunatisblue
f147f79ffa 修改block划分,对负载高的rank所在block进行划分,添加到空rank,空rank是平移得到的 2026-02-26 09:40:46 +08:00
jaunatisblue
8abac8dd88 对rank运行时间统计,两个函数分别在不同的计算中被调用,因此我对两个重载的函数分别进行了mpi实际计算时间的统计,对于第一个PatList_Interp_Points 调用 Interp_points,我取排名前三的rank时间,发现每次只有一个rank时间较长,Rank [ 52]: Calc 0.000012 s
Rank [  20]: Calc 0.000003 s

Rank [  35]: Calc 0.000003 s

Rank [  10]: Calc 0.000010 s

Rank [  17]: Calc 0.000005 s

Rank [  32]: Calc 0.000003 s,而且rank不固定,一般就是rank 10 和 rank 52;
但尽管有很多,比前者时间还是少很多
对于第二个Surf_Wave 调用 Interp_points,我发现前四个rank时间最长,比较固定,就是下面四个rank

Rank [  27]: Calc 0.331978 s

Rank [  35]: Calc 0.242219 s

Rank [  28]: Calc 0.242132 s

Rank [  36]: Calc 0.197024 s
因此下面surf_wave是核心
2026-02-24 14:34:24 +08:00
62 changed files with 20209 additions and 31612 deletions

4
.gitignore vendored
View File

@@ -1,6 +1,6 @@
__pycache__ __pycache__
GW150914 GW150914
GW150914* GW150914-origin
docs docs
*.tmp *.tmp
.codex

View File

@@ -66,7 +66,8 @@ if os.path.exists(File_directory):
## Prompt whether to overwrite the existing directory ## Prompt whether to overwrite the existing directory
while True: while True:
try: try:
inputvalue = input() ## inputvalue = input()
inputvalue = "continue"
## If the user agrees to overwrite, proceed and remove the existing directory ## If the user agrees to overwrite, proceed and remove the existing directory
if ( inputvalue == "continue" ): if ( inputvalue == "continue" ):
print( " Continue the calculation !!! " ) print( " Continue the calculation !!! " )
@@ -177,9 +178,6 @@ print( " AMSS-NCKU macro file macrodef.h has been generated. " )
generate_macrodef.generate_macrodef_fh() generate_macrodef.generate_macrodef_fh()
print( " AMSS-NCKU macro file macrodef.fh has been generated. " ) print( " AMSS-NCKU macro file macrodef.fh has been generated. " )
generate_macrodef.generate_build_config()
print( " AMSS-NCKU build config AMSS_NCKU_build.mk has been generated. " )
################################################################## ##################################################################
@@ -222,11 +220,9 @@ shutil.copytree(AMSS_NCKU_source_path, AMSS_NCKU_source_copy)
macrodef_h_path = os.path.join(File_directory, "macrodef.h") macrodef_h_path = os.path.join(File_directory, "macrodef.h")
macrodef_fh_path = os.path.join(File_directory, "macrodef.fh") macrodef_fh_path = os.path.join(File_directory, "macrodef.fh")
build_config_path = os.path.join(File_directory, "AMSS_NCKU_build.mk")
shutil.copy2(macrodef_h_path, AMSS_NCKU_source_copy) shutil.copy2(macrodef_h_path, AMSS_NCKU_source_copy)
shutil.copy2(macrodef_fh_path, AMSS_NCKU_source_copy) shutil.copy2(macrodef_fh_path, AMSS_NCKU_source_copy)
shutil.copy2(build_config_path, AMSS_NCKU_source_copy)
# Notes on copying files: # Notes on copying files:
# shutil.copy2 preserves file metadata such as modification time. # shutil.copy2 preserves file metadata such as modification time.
@@ -275,12 +271,6 @@ if not os.path.exists( ABE_file ):
## Copy the executable ABE (or ABEGPU) into the run directory ## Copy the executable ABE (or ABEGPU) into the run directory
shutil.copy2(ABE_file, output_directory) shutil.copy2(ABE_file, output_directory)
## Copy interp load balance profile if present (for optimize pass)
interp_lb_profile = os.path.join(AMSS_NCKU_source_copy, "interp_lb_profile.bin")
if os.path.exists(interp_lb_profile):
shutil.copy2(interp_lb_profile, output_directory)
print( " Copied interp_lb_profile.bin to run directory " )
########################### ###########################
## If the initial-data method is TwoPuncture, copy the TwoPunctureABE executable to the run directory ## If the initial-data method is TwoPuncture, copy the TwoPunctureABE executable to the run directory

View File

@@ -1,100 +0,0 @@
##################################################################
##
## AMSS-NCKU Plot-Only Restart Script
## Author: Xiaoqu / Claude
## 2026/05/12
##
## This script checks for existing output data from AMSS_NCKU_Program.py.
## If data exists, it skips all computation and goes directly to plotting,
## saving time when plotting was interrupted.
## If no data is found, it exits with a message.
##
##################################################################
## Guard against re-execution by multiprocessing child processes.
## NOTE(review): under the "spawn" start method the parent script is re-imported
## in every worker with a different __name__; confirm that exiting here does not
## also terminate the workers started by parallel_plot_helper.
if __name__ != '__main__':
    import sys as _sys
    _sys.exit(0)

import os
import sys

import AMSS_NCKU_Input as input_data

##################################################################

## Construct paths from input configuration
File_directory = os.path.join(input_data.File_directory)
output_directory = os.path.join(File_directory, "AMSS_NCKU_output")
binary_results_directory = os.path.join(output_directory, input_data.Output_directory)
figure_directory = os.path.join(File_directory, "figure")

##################################################################

## Check whether the required output data files exist
required_files = [
    os.path.join(binary_results_directory, data_name)
    for data_name in (
        "bssn_BH.dat",
        "bssn_ADMQs.dat",
        "bssn_psi4.dat",
        "bssn_constraint.dat",
    )
]

missing_files = [path for path in required_files if not os.path.exists(path)]

## Without the simulation output there is nothing to plot: report and stop.
if missing_files:
    print(" No existing AMSS_NCKU_Program.py output data found. ")
    print(" The following required files are missing: ")
    for path in missing_files:
        print(f" {path}")
    print()
    print(" Please run AMSS_NCKU_Program.py first to generate the simulation data. ")
    print(" Exiting. ")
    sys.exit(1)

print(" Found existing AMSS_NCKU_Program.py output data. " )
print(" Skipping all computation and going directly to plotting. " )
print()

## Ensure the figure directory exists (it should, but be safe)
os.makedirs(figure_directory, exist_ok=True)

##################################################################

## Plot the AMSS-NCKU program results
## (the plotting modules are imported only after the data check has passed)
import plot_xiaoqu
import plot_GW_strain_amplitude_xiaoqu
from parallel_plot_helper import run_plot_tasks_parallel

## Each task is a (callable, argument tuple) pair handed to the parallel helper.
plot_tasks = []

## Plot black hole trajectory
plot_tasks.append((plot_xiaoqu.generate_puncture_orbit_plot, (binary_results_directory, figure_directory)))
plot_tasks.append((plot_xiaoqu.generate_puncture_orbit_plot3D, (binary_results_directory, figure_directory)))

## Plot black hole separation vs. time
plot_tasks.append((plot_xiaoqu.generate_puncture_distence_plot, (binary_results_directory, figure_directory)))

## Plot gravitational waveforms (psi4 and strain amplitude)
for i in range(input_data.Detector_Number):
    plot_tasks.append((plot_xiaoqu.generate_gravitational_wave_psi4_plot, (binary_results_directory, figure_directory, i)))
    plot_tasks.append((plot_GW_strain_amplitude_xiaoqu.generate_gravitational_wave_amplitude_plot, (binary_results_directory, figure_directory, i)))

## Plot ADM mass evolution
for i in range(input_data.Detector_Number):
    plot_tasks.append((plot_xiaoqu.generate_ADMmass_plot, (binary_results_directory, figure_directory, i)))

## Plot Hamiltonian constraint violation over time
for i in range(input_data.grid_level):
    plot_tasks.append((plot_xiaoqu.generate_constraint_check_plot, (binary_results_directory, figure_directory, i)))

run_plot_tasks_parallel(plot_tasks)

## Plot stored binary data (runs serially, not in the parallel pool)
plot_xiaoqu.generate_binary_data_plot(binary_results_directory, figure_directory)

print()
print(" Plotting completed successfully. ")
print()

View File

@@ -1,19 +1,10 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
""" """
AMSS-NCKU GW150914 Simulation Regression Test Script (Comprehensive Version) AMSS-NCKU GW150914 Simulation Regression Test Script
Verification Requirements: Verification Requirements:
1. RMS errors < 1% for: 1. XY-plane trajectory RMS error < 1% (Optimized vs. baseline, max of BH1 and BH2)
- 3D Vector Total RMS
- X Component RMS
- Y Component RMS
- Z Component RMS
2. ADM constraint violation < 2 (Grid Level 0) 2. ADM constraint violation < 2 (Grid Level 0)
3. The following figure PDFs must match GW150914-origin exactly after rasterization:
- ADM_Constraint_Grid_Level_0.pdf
- BH_Trajectory_21_XY.pdf
- BH_Trajectory_XY.pdf
The script also reports the percentage of differing pixels for each figure.
RMS Calculation Method: RMS Calculation Method:
- Computes trajectory deviation on the XY plane independently for BH1 and BH2 - Computes trajectory deviation on the XY plane independently for BH1 and BH2
@@ -28,10 +19,6 @@ Reference: GW150914-origin (baseline simulation)
import numpy as np import numpy as np
import sys import sys
import os import os
import shutil
import subprocess
import tempfile
from PIL import Image
# ANSI Color Codes # ANSI Color Codes
class Color: class Color:
@@ -71,187 +58,78 @@ def load_constraint_data(filepath):
return np.array(data) return np.array(data)
def resolve_figure_dir(path): def calculate_rms_error(bh_data_ref, bh_data_target):
"""Resolve the sibling figure directory from an output or figure path."""
normalized = os.path.normpath(path)
if os.path.basename(normalized) == "figure":
return normalized
return os.path.join(os.path.dirname(normalized), "figure")
def render_pdf_to_images(pdf_path, dpi=150):
    """Render a PDF to RGB images using Ghostscript.

    Args:
        pdf_path: Path of the PDF file to rasterize.
        dpi: Resolution passed to Ghostscript via ``-r``.

    Returns:
        A list with one ``uint8`` RGB array per rendered page, ordered by the
        sorted names of the page files Ghostscript wrote.

    Raises:
        RuntimeError: If ``gs`` is not in PATH, if Ghostscript exits with an
            error, or if it produced no page files.
    """
    gs_path = shutil.which("gs")
    if gs_path is None:
        raise RuntimeError("Ghostscript executable 'gs' was not found in PATH")

    # The pages are written into a throw-away directory, so they must be
    # loaded into memory before the ``with`` block ends.
    with tempfile.TemporaryDirectory(prefix="amss_verify_pdf_") as temp_dir:
        output_pattern = os.path.join(temp_dir, "page-%03d.ppm")
        cmd = [
            gs_path,
            "-q",
            "-dSAFER",
            "-dBATCH",
            "-dNOPAUSE",
            "-sDEVICE=ppmraw",
            f"-r{dpi}",
            f"-o{output_pattern}",
            pdf_path
        ]
        try:
            subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, text=True)
        except subprocess.CalledProcessError as exc:
            # Prefer Ghostscript's own diagnostics; fall back to the exception text.
            message = (exc.stderr or "").strip() or str(exc)
            raise RuntimeError(f"Failed to render PDF '{pdf_path}': {message}") from exc

        ppm_files = sorted(
            os.path.join(temp_dir, filename)
            for filename in os.listdir(temp_dir)
            if filename.endswith(".ppm")
        )
        if not ppm_files:
            raise RuntimeError(f"No rendered pages were produced for '{pdf_path}'")

        images = []
        for ppm_file in ppm_files:
            with Image.open(ppm_file) as img:
                images.append(np.array(img.convert("RGB"), dtype=np.uint8))

    return images
def compare_rendered_pages(ref_img, target_img):
    """Return (different_pixels, total_pixels) for two rendered RGB pages.

    Args:
        ref_img: Reference page as an array indexed (row, column, channel).
        target_img: Target page in the same layout.

    Returns:
        A tuple ``(different_pixels, total_pixels)``. ``total_pixels`` is the
        area of the smallest canvas that contains both pages. A pixel counts
        as different when any channel differs, or when it lies outside the
        region covered by both pages.
    """
    ref_h, ref_w = ref_img.shape[:2]
    tgt_h, tgt_w = target_img.shape[:2]

    canvas_h, canvas_w = max(ref_h, tgt_h), max(ref_w, tgt_w)
    overlap_h, overlap_w = min(ref_h, tgt_h), min(ref_w, tgt_w)

    # Start from "everything differs", then fill in the real comparison for the
    # overlapping region. For equally sized pages the overlap is the whole canvas.
    diff_mask = np.ones((canvas_h, canvas_w), dtype=bool)
    diff_mask[:overlap_h, :overlap_w] = np.any(
        ref_img[:overlap_h, :overlap_w] != target_img[:overlap_h, :overlap_w],
        axis=2,
    )

    return int(np.count_nonzero(diff_mask)), canvas_h * canvas_w
def compare_pdf_images(ref_pdf, target_pdf, dpi=150, threshold_percent=0.001):
    """Compare two PDFs by rasterizing them and counting differing pixels.

    Args:
        ref_pdf: Path of the reference PDF.
        target_pdf: Path of the PDF under test.
        dpi: Rasterization resolution forwarded to ``render_pdf_to_images``.
        threshold_percent: The comparison passes when the percentage of
            differing pixels is strictly below this value.

    Returns:
        A dict with the keys ``different_pixels``, ``total_pixels``,
        ``diff_percent``, ``pages_ref``, ``pages_target`` and ``passed``.
    """
    ref_pages = render_pdf_to_images(ref_pdf, dpi=dpi)
    target_pages = render_pdf_to_images(target_pdf, dpi=dpi)

    total_pixels = 0
    different_pixels = 0
    max_pages = max(len(ref_pages), len(target_pages))

    for page_idx in range(max_pages):
        if page_idx < len(ref_pages) and page_idx < len(target_pages):
            page_diff, page_total = compare_rendered_pages(ref_pages[page_idx], target_pages[page_idx])
        else:
            # A page present in only one document counts as entirely different.
            existing_page = ref_pages[page_idx] if page_idx < len(ref_pages) else target_pages[page_idx]
            page_total = existing_page.shape[0] * existing_page.shape[1]
            page_diff = page_total

        total_pixels += page_total
        different_pixels += page_diff

    # Guard the division for the (degenerate) case of zero rendered pixels.
    diff_percent = (different_pixels / total_pixels * 100.0) if total_pixels else 0.0

    return {
        "different_pixels": different_pixels,
        "total_pixels": total_pixels,
        "diff_percent": diff_percent,
        "pages_ref": len(ref_pages),
        "pages_target": len(target_pages),
        "passed": diff_percent < threshold_percent
    }
def compare_required_figures(reference_figure_dir, target_figure_dir):
    """Compare the required GW150914 figure PDFs.

    Args:
        reference_figure_dir: Directory holding the reference figures.
        target_figure_dir: Directory holding the figures under test.

    Returns:
        A list of result dicts, one per required figure, as returned by
        ``compare_pdf_images`` with an added ``name`` key.

    Raises:
        FileNotFoundError: If a required figure is missing from either
            directory; the reference directory is checked first.
    """
    figure_names = [
        "ADM_Constraint_Grid_Level_0.pdf",
        "BH_Trajectory_21_XY.pdf",
        "BH_Trajectory_XY.pdf"
    ]

    results = []
    for figure_name in figure_names:
        ref_pdf = os.path.join(reference_figure_dir, figure_name)
        target_pdf = os.path.join(target_figure_dir, figure_name)

        if not os.path.exists(ref_pdf):
            raise FileNotFoundError(f"Reference figure not found: {ref_pdf}")
        if not os.path.exists(target_pdf):
            raise FileNotFoundError(f"Target figure not found: {target_pdf}")

        comparison = compare_pdf_images(ref_pdf, target_pdf)
        comparison["name"] = figure_name
        results.append(comparison)

    return results
def calculate_all_rms_errors(bh_data_ref, bh_data_target):
""" """
Calculate 3D Vector RMS and component-wise RMS (X, Y, Z) independently. Calculate trajectory-based RMS error on the XY plane between baseline and optimized simulations.
Uses r = sqrt(x^2 + y^2) as the denominator for all error normalizations.
Returns the maximum error between BH1 and BH2 for each category. This function computes the RMS error independently for BH1 and BH2 trajectories,
then returns the maximum of the two as the final RMS error metric.
For each black hole, the RMS is calculated as:
RMS = sqrt( (1/M) * sum( (Δr_i / r_i^max)^2 ) ) × 100%
where:
Δr_i = sqrt((x_ref,i - x_new,i)^2 + (y_ref,i - y_new,i)^2)
r_i^max = max(sqrt(x_ref,i^2 + y_ref,i^2), sqrt(x_new,i^2 + y_new,i^2))
Args:
bh_data_ref: Reference (baseline) trajectory data
bh_data_target: Target (optimized) trajectory data
Returns:
rms_value: Final RMS error as a percentage (max of BH1 and BH2)
error: Error message if any
""" """
# Align data: truncate to the length of the shorter dataset
M = min(len(bh_data_ref['time']), len(bh_data_target['time'])) M = min(len(bh_data_ref['time']), len(bh_data_target['time']))
if M < 10: if M < 10:
return None, "Insufficient data points for comparison" return None, "Insufficient data points for comparison"
results = {} # Extract XY coordinates for both black holes
x1_ref = bh_data_ref['x1'][:M]
y1_ref = bh_data_ref['y1'][:M]
x2_ref = bh_data_ref['x2'][:M]
y2_ref = bh_data_ref['y2'][:M]
for bh in ['1', '2']: x1_new = bh_data_target['x1'][:M]
x_r, y_r, z_r = bh_data_ref[f'x{bh}'][:M], bh_data_ref[f'y{bh}'][:M], bh_data_ref[f'z{bh}'][:M] y1_new = bh_data_target['y1'][:M]
x_n, y_n, z_n = bh_data_target[f'x{bh}'][:M], bh_data_target[f'y{bh}'][:M], bh_data_target[f'z{bh}'][:M] x2_new = bh_data_target['x2'][:M]
y2_new = bh_data_target['y2'][:M]
# 核心修改:根据组委会的邮件指示,分母统一使用 r = sqrt(x^2 + y^2) # Calculate RMS for BH1
r_ref = np.sqrt(x_r**2 + y_r**2) delta_r1 = np.sqrt((x1_ref - x1_new)**2 + (y1_ref - y1_new)**2)
r_new = np.sqrt(x_n**2 + y_n**2) r1_ref = np.sqrt(x1_ref**2 + y1_ref**2)
denom_max = np.maximum(r_ref, r_new) r1_new = np.sqrt(x1_new**2 + y1_new**2)
r1_max = np.maximum(r1_ref, r1_new)
valid = denom_max > 1e-15 # Calculate RMS for BH2
if np.sum(valid) < 10: delta_r2 = np.sqrt((x2_ref - x2_new)**2 + (y2_ref - y2_new)**2)
results[f'BH{bh}'] = { '3D_Vector': 0.0, 'X_Component': 0.0, 'Y_Component': 0.0, 'Z_Component': 0.0 } r2_ref = np.sqrt(x2_ref**2 + y2_ref**2)
continue r2_new = np.sqrt(x2_new**2 + y2_new**2)
r2_max = np.maximum(r2_ref, r2_new)
def calc_rms(delta): # Avoid division by zero for BH1
# 将对应分量的偏差除以统一的轨道半径分母 denom_max valid_mask1 = r1_max > 1e-15
return np.sqrt(np.mean((delta[valid] / denom_max[valid])**2)) * 100 if np.sum(valid_mask1) < 10:
return None, "Insufficient valid data points for BH1"
# 1. Total 3D Vector RMS terms1 = (delta_r1[valid_mask1] / r1_max[valid_mask1])**2
delta_vec = np.sqrt((x_r - x_n)**2 + (y_r - y_n)**2 + (z_r - z_n)**2) rms_bh1 = np.sqrt(np.mean(terms1)) * 100
rms_3d = calc_rms(delta_vec)
# 2. Component-wise RMS (分离计算各轴,但共用半径分母) # Avoid division by zero for BH2
rms_x = calc_rms(np.abs(x_r - x_n)) valid_mask2 = r2_max > 1e-15
rms_y = calc_rms(np.abs(y_r - y_n)) if np.sum(valid_mask2) < 10:
rms_z = calc_rms(np.abs(z_r - z_n)) return None, "Insufficient valid data points for BH2"
results[f'BH{bh}'] = { terms2 = (delta_r2[valid_mask2] / r2_max[valid_mask2])**2
'3D_Vector': rms_3d, rms_bh2 = np.sqrt(np.mean(terms2)) * 100
'X_Component': rms_x,
'Y_Component': rms_y,
'Z_Component': rms_z
}
# 获取 BH1 BH2 中的最大误差 # Final RMS is the maximum of BH1 and BH2
max_rms = { rms_final = max(rms_bh1, rms_bh2)
'3D_Vector': max(results['BH1']['3D_Vector'], results['BH2']['3D_Vector']),
'X_Component': max(results['BH1']['X_Component'], results['BH2']['X_Component']), return rms_final, None
'Y_Component': max(results['BH1']['Y_Component'], results['BH2']['Y_Component']),
'Z_Component': max(results['BH1']['Z_Component'], results['BH2']['Z_Component'])
}
return max_rms, None
def analyze_constraint_violation(constraint_data, n_levels=9): def analyze_constraint_violation(constraint_data, n_levels=9):
""" """
@@ -277,32 +155,34 @@ def analyze_constraint_violation(constraint_data, n_levels=9):
def print_header(): def print_header():
"""Print report header"""
print("\n" + Color.BLUE + Color.BOLD + "=" * 65 + Color.RESET) print("\n" + Color.BLUE + Color.BOLD + "=" * 65 + Color.RESET)
print(Color.BOLD + " AMSS-NCKU GW150914 Comprehensive Regression Test" + Color.RESET) print(Color.BOLD + " AMSS-NCKU GW150914 Simulation Regression Test Report" + Color.RESET)
print(Color.BLUE + Color.BOLD + "=" * 65 + Color.RESET) print(Color.BLUE + Color.BOLD + "=" * 65 + Color.RESET)
def print_rms_results(rms_dict, error, threshold=1.0):
print(f"\n{Color.BOLD}1. RMS Error Analysis (Maximums of BH1 & BH2){Color.RESET}") def print_rms_results(rms_rel, error, threshold=1.0):
print("-" * 65) """Print RMS error results"""
print(f"\n{Color.BOLD}1. RMS Error Analysis (Baseline vs Optimized){Color.RESET}")
print("-" * 45)
if error: if error:
print(f" {Color.RED}Error: {error}{Color.RESET}") print(f" {Color.RED}Error: {error}{Color.RESET}")
return False return False
all_passed = True passed = rms_rel < threshold
print(f" Requirement: < {threshold}%\n")
for key, val in rms_dict.items(): print(f" RMS relative error: {rms_rel:.4f}%")
passed = val < threshold print(f" Requirement: < {threshold}%")
all_passed = all_passed and passed print(f" Status: {get_status_text(passed)}")
status = get_status_text(passed)
print(f" {key:15}: {val:8.4f}% | Status: {status}") return passed
return all_passed
def print_constraint_results(results, threshold=2.0): def print_constraint_results(results, threshold=2.0):
"""Print constraint violation results"""
print(f"\n{Color.BOLD}2. ADM Constraint Violation Analysis (Grid Level 0){Color.RESET}") print(f"\n{Color.BOLD}2. ADM Constraint Violation Analysis (Grid Level 0){Color.RESET}")
print("-" * 65) print("-" * 45)
names = ['Ham', 'Px', 'Py', 'Pz', 'Gx', 'Gy', 'Gz'] names = ['Ham', 'Px', 'Py', 'Pz', 'Gx', 'Gy', 'Gz']
for i, name in enumerate(names): for i, name in enumerate(names):
@@ -319,45 +199,19 @@ def print_constraint_results(results, threshold=2.0):
return passed return passed
def print_figure_results(results, threshold_percent=0.001): def print_summary(rms_passed, constraint_passed):
print(f"\n{Color.BOLD}3. Figure Pixel Comparison (PDF Rasterization){Color.RESET}") """Print summary"""
print("-" * 65)
print(f" Requirement: < {threshold_percent:.3f}% differing pixels\n")
all_passed = True
for result in results:
passed = result["passed"]
all_passed = all_passed and passed
status = get_status_text(passed)
print(f" {result['name']:32}: {result['diff_percent']:10.6f}% | Status: {status}")
if result["pages_ref"] != result["pages_target"]:
print(f" {'':32} pages(ref/target): {result['pages_ref']}/{result['pages_target']}")
return all_passed
def print_figure_error(error_message):
    """Print the figure-comparison section header followed by an error line.

    Args:
        error_message: Text describing why the figure comparison could not run.

    Returns:
        Always ``False``, so the result can be used directly as the check status.
    """
    print(f"\n{Color.BOLD}3. Figure Pixel Comparison (PDF Rasterization){Color.RESET}")
    print("-" * 65)
    print(f" {Color.RED}Error: {error_message}{Color.RESET}")
    return False
def print_summary(rms_passed, constraint_passed, figure_passed):
print("\n" + Color.BLUE + Color.BOLD + "=" * 65 + Color.RESET) print("\n" + Color.BLUE + Color.BOLD + "=" * 65 + Color.RESET)
print(Color.BOLD + "Verification Summary" + Color.RESET) print(Color.BOLD + "Verification Summary" + Color.RESET)
print(Color.BLUE + Color.BOLD + "=" * 65 + Color.RESET) print(Color.BLUE + Color.BOLD + "=" * 65 + Color.RESET)
all_passed = rms_passed and constraint_passed and figure_passed all_passed = rms_passed and constraint_passed
res_rms = get_status_text(rms_passed) res_rms = get_status_text(rms_passed)
res_con = get_status_text(constraint_passed) res_con = get_status_text(constraint_passed)
res_fig = get_status_text(figure_passed)
print(f" [1] Comprehensive RMS check: {res_rms}") print(f" [1] RMS trajectory check: {res_rms}")
print(f" [2] ADM constraint check: {res_con}") print(f" [2] ADM constraint check: {res_con}")
print(f" [3] Figure pixel comparison: {res_fig}")
final_status = f"{Color.GREEN}{Color.BOLD}ALL CHECKS PASSED{Color.RESET}" if all_passed else f"{Color.RED}{Color.BOLD}SOME CHECKS FAILED{Color.RESET}" final_status = f"{Color.GREEN}{Color.BOLD}ALL CHECKS PASSED{Color.RESET}" if all_passed else f"{Color.RED}{Color.BOLD}SOME CHECKS FAILED{Color.RESET}"
print(f"\n Overall result: {final_status}") print(f"\n Overall result: {final_status}")
@@ -365,58 +219,61 @@ def print_summary(rms_passed, constraint_passed, figure_passed):
return all_passed return all_passed
def main(): def main():
# Determine target (optimized) output directory
if len(sys.argv) > 1: if len(sys.argv) > 1:
target_dir = sys.argv[1] target_dir = sys.argv[1]
else: else:
script_dir = os.path.dirname(os.path.abspath(__file__)) script_dir = os.path.dirname(os.path.abspath(__file__))
target_dir = os.path.join(script_dir, "GW150914/AMSS_NCKU_output") target_dir = os.path.join(script_dir, "GW150914/AMSS_NCKU_output")
# Determine reference (baseline) directory
script_dir = os.path.dirname(os.path.abspath(__file__)) script_dir = os.path.dirname(os.path.abspath(__file__))
reference_dir = os.path.join(script_dir, "GW150914-origin/AMSS_NCKU_output") reference_dir = os.path.join(script_dir, "GW150914-origin/AMSS_NCKU_output")
target_figure_dir = resolve_figure_dir(target_dir)
reference_figure_dir = os.path.join(script_dir, "GW150914-origin/figure")
# Data file paths
bh_file_ref = os.path.join(reference_dir, "bssn_BH.dat") bh_file_ref = os.path.join(reference_dir, "bssn_BH.dat")
bh_file_target = os.path.join(target_dir, "bssn_BH.dat") bh_file_target = os.path.join(target_dir, "bssn_BH.dat")
constraint_file = os.path.join(target_dir, "bssn_constraint.dat") constraint_file = os.path.join(target_dir, "bssn_constraint.dat")
# Check if files exist
if not os.path.exists(bh_file_ref): if not os.path.exists(bh_file_ref):
print(f"{Color.RED}{Color.BOLD}Error:{Color.RESET} Baseline trajectory file not found: {bh_file_ref}") print(f"{Color.RED}{Color.BOLD}Error:{Color.RESET} Baseline trajectory file not found: {bh_file_ref}")
sys.exit(1) sys.exit(1)
if not os.path.exists(bh_file_target): if not os.path.exists(bh_file_target):
print(f"{Color.RED}{Color.BOLD}Error:{Color.RESET} Target trajectory file not found: {bh_file_target}") print(f"{Color.RED}{Color.BOLD}Error:{Color.RESET} Target trajectory file not found: {bh_file_target}")
sys.exit(1) sys.exit(1)
if not os.path.exists(constraint_file): if not os.path.exists(constraint_file):
print(f"{Color.RED}{Color.BOLD}Error:{Color.RESET} Constraint data file not found: {constraint_file}") print(f"{Color.RED}{Color.BOLD}Error:{Color.RESET} Constraint data file not found: {constraint_file}")
sys.exit(1) sys.exit(1)
# Print header
print_header() print_header()
print(f"\n{Color.BOLD}Reference (Baseline):{Color.RESET} {Color.BLUE}{reference_dir}{Color.RESET}") print(f"\n{Color.BOLD}Reference (Baseline):{Color.RESET} {Color.BLUE}{reference_dir}{Color.RESET}")
print(f"{Color.BOLD}Target (Optimized): {Color.RESET} {Color.BLUE}{target_dir}{Color.RESET}") print(f"{Color.BOLD}Target (Optimized): {Color.RESET} {Color.BLUE}{target_dir}{Color.RESET}")
print(f"{Color.BOLD}Reference Figures: {Color.RESET} {Color.BLUE}{reference_figure_dir}{Color.RESET}")
print(f"{Color.BOLD}Target Figures: {Color.RESET} {Color.BLUE}{target_figure_dir}{Color.RESET}")
# Load data
bh_data_ref = load_bh_trajectory(bh_file_ref) bh_data_ref = load_bh_trajectory(bh_file_ref)
bh_data_target = load_bh_trajectory(bh_file_target) bh_data_target = load_bh_trajectory(bh_file_target)
constraint_data = load_constraint_data(constraint_file) constraint_data = load_constraint_data(constraint_file)
# Output modified RMS results # Calculate RMS error
rms_dict, error = calculate_all_rms_errors(bh_data_ref, bh_data_target) rms_rel, error = calculate_rms_error(bh_data_ref, bh_data_target)
rms_passed = print_rms_results(rms_dict, error) rms_passed = print_rms_results(rms_rel, error)
# Output constraint results # Analyze constraint violation
constraint_results = analyze_constraint_violation(constraint_data) constraint_results = analyze_constraint_violation(constraint_data)
constraint_passed = print_constraint_results(constraint_results) constraint_passed = print_constraint_results(constraint_results)
try: # Print summary
figure_results = compare_required_figures(reference_figure_dir, target_figure_dir) all_passed = print_summary(rms_passed, constraint_passed)
figure_passed = print_figure_results(figure_results)
except (FileNotFoundError, RuntimeError) as exc:
figure_passed = print_figure_error(str(exc))
all_passed = print_summary(rms_passed, constraint_passed, figure_passed) # Return exit code
sys.exit(0 if all_passed else 1) sys.exit(0 if all_passed else 1)
if __name__ == "__main__": if __name__ == "__main__":
main() main()

View File

@@ -7,178 +7,12 @@
#include <string> #include <string>
#include <cmath> #include <cmath>
#include <new> #include <new>
#include <vector>
using namespace std; using namespace std;
#include "misc.h" #include "misc.h"
#include "MPatch.h" #include "MPatch.h"
#include "Parallel.h" #include "Parallel.h"
#include "fmisc.h" #include "fmisc.h"
#ifdef INTERP_LB_PROFILE
#include "interp_lb_profile.h"
#endif
namespace
{
// One Block together with the per-dimension coordinate bounds that the
// point-in-block test compares against (filled in by build_block_bin_index).
struct InterpBlockView
{
// Block this view refers to.
Block *bp;
// Lower coordinate bound in each dimension.
double llb[dim];
// Upper coordinate bound in each dimension.
double uub[dim];
};
// Uniform bin grid over a patch that maps a coordinate to the list of block
// views overlapping that bin, so a point lookup only has to test a few blocks.
struct BlockBinIndex
{
// Number of bins along each dimension.
int bins[dim];
// Coordinate of the lower edge of the bin grid in each dimension.
double lo[dim];
// Bins per unit length in each dimension; 0 means "single bin" (see coord_to_bin).
double inv[dim];
// All block views of the patch; bin entries are indices into this vector.
vector<InterpBlockView> views;
// For every flattened bin index (see bin_loc), the indices of candidate views.
vector<vector<int>> bin_to_blocks;
// True once build_block_bin_index has populated the index with at least one block.
bool valid;
// Default state: one bin per dimension, zero origin/scale, marked invalid.
BlockBinIndex() : valid(false)
{
for (int i = 0; i < dim; i++)
{
bins[i] = 1;
lo[i] = 0.0;
inv[i] = 0.0;
}
}
};
// Limit v to the closed interval [lo, hi]; the lower bound is checked first.
inline int clamp_int(int v, int lo, int hi)
{
  if (v < lo)
    return lo;
  if (v > hi)
    return hi;
  return v;
}
// Map coordinate x to a bin number in [0, nb - 1].
//   lo  : coordinate of the lower edge of bin 0
//   inv : bins per unit length (<= 0 means the axis has a single bin)
//   nb  : number of bins along this axis
// Coordinates below the grid map to bin 0, coordinates above it to nb - 1.
inline int coord_to_bin(double x, double lo, double inv, int nb)
{
  if (nb <= 1 || inv <= 0.0)
    return 0;
  const double t = (x - lo) * inv;
  // Clamp while still in floating point: converting an out-of-range (or NaN)
  // double to int is undefined behaviour. The negated comparison also
  // routes NaN to bin 0.
  if (!(t > 0.0))
    return 0;
  if (t >= double(nb))
    return nb - 1;
  // 0 < t < nb here, so truncation equals floor and the result is in range.
  return int(t);
}
// Flatten a (b0, b1, b2) bin triple into an offset into bin_to_blocks,
// with b0 varying fastest and b2 slowest.
inline int bin_loc(const BlockBinIndex &index, int b0, int b1, int b2)
{
  const int n0 = index.bins[0];
  const int n1 = index.bins[1];
  const int row = b1 + n1 * b2;
  return b0 + n0 * row;
}
// True when point pox lies within the view's [llb, uub] box in every
// dimension, allowing half a grid spacing (DH / 2) of slack on each side.
inline bool point_in_block_view(const InterpBlockView &view, const double *pox, const double *DH)
{
  for (int d = 0; d < dim; d++)
  {
    const bool below = pox[d] - view.llb[d] < -DH[d] / 2;
    if (below)
      return false;
    const bool above = pox[d] - view.uub[d] > DH[d] / 2;
    if (above)
      return false;
  }
  return true;
}
// Build the spatial bin index for all blocks of a patch.
//   patch : patch whose block list (blb .. ble) is indexed
//   DH    : grid spacing per dimension
//   index : output; reset first, left invalid when the patch has no blocks
void build_block_bin_index(Patch *patch, const double *DH, BlockBinIndex &index)
{
// Start from a clean default-constructed index.
index = BlockBinIndex();
// Pass 1: collect one view per block, walking the list from blb up to and including ble.
MyList<Block> *Bp = patch->blb;
while (Bp)
{
Block *BP = Bp->data;
InterpBlockView view;
view.bp = BP;
for (int i = 0; i < dim; i++)
{
// Bounds per dimension: where the block face coincides with the patch face
// (within DH/2) the patch offsets lli/uui are applied; otherwise the face is
// moved inward by the ghost zone (ghost_width - 0.5 cells for Vertex,
// ghost_width cells for Cell).
#ifdef Vertex
#ifdef Cell
#error Both Cell and Vertex are defined
#endif
view.llb[i] = (feq(BP->bbox[i], patch->bbox[i], DH[i] / 2)) ? BP->bbox[i] + patch->lli[i] * DH[i] : BP->bbox[i] + (ghost_width - 0.5) * DH[i];
view.uub[i] = (feq(BP->bbox[dim + i], patch->bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - patch->uui[i] * DH[i] : BP->bbox[dim + i] - (ghost_width - 0.5) * DH[i];
#else
#ifdef Cell
view.llb[i] = (feq(BP->bbox[i], patch->bbox[i], DH[i] / 2)) ? BP->bbox[i] + patch->lli[i] * DH[i] : BP->bbox[i] + ghost_width * DH[i];
view.uub[i] = (feq(BP->bbox[dim + i], patch->bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - patch->uui[i] * DH[i] : BP->bbox[dim + i] - ghost_width * DH[i];
#else
#error Not define Vertex nor Cell
#endif
#endif
}
index.views.push_back(view);
// ble marks the last block belonging to this patch.
if (Bp == patch->ble)
break;
Bp = Bp->next;
}
const int nblocks = int(index.views.size());
// No blocks: leave the index in its invalid default state.
if (nblocks <= 0)
return;
// Bins per axis: cube root of the block count, rounded up and limited to [1, 32].
int bins_1d = int(ceil(pow(double(nblocks), 1.0 / 3.0)));
bins_1d = clamp_int(bins_1d, 1, 32);
for (int i = 0; i < dim; i++)
{
index.bins[i] = bins_1d;
index.lo[i] = patch->bbox[i] + patch->lli[i] * DH[i];
const double hi = patch->bbox[dim + i] - patch->uui[i] * DH[i];
// inv == 0 makes coord_to_bin return bin 0 for a degenerate extent or a single bin.
if (hi > index.lo[i] && bins_1d > 1)
index.inv[i] = bins_1d / (hi - index.lo[i]);
else
index.inv[i] = 0.0;
}
// NOTE(review): bins[0..2] are used explicitly here and below, so this assumes dim == 3.
index.bin_to_blocks.resize(index.bins[0] * index.bins[1] * index.bins[2]);
// Pass 2: register each view in every bin its bounds (widened by DH/2) overlap.
for (int bi = 0; bi < nblocks; bi++)
{
const InterpBlockView &view = index.views[bi];
int bmin[dim], bmax[dim];
for (int d = 0; d < dim; d++)
{
const double low = view.llb[d] - DH[d] / 2;
const double up = view.uub[d] + DH[d] / 2;
bmin[d] = coord_to_bin(low, index.lo[d], index.inv[d], index.bins[d]);
bmax[d] = coord_to_bin(up, index.lo[d], index.inv[d], index.bins[d]);
// Keep the bin range ordered even if the bounds are inverted.
if (bmax[d] < bmin[d])
{
int t = bmin[d];
bmin[d] = bmax[d];
bmax[d] = t;
}
}
for (int bz = bmin[2]; bz <= bmax[2]; bz++)
for (int by = bmin[1]; by <= bmax[1]; by++)
for (int bx = bmin[0]; bx <= bmax[0]; bx++)
index.bin_to_blocks[bin_loc(index, bx, by, bz)].push_back(bi);
}
index.valid = true;
}
int find_block_index_for_point(const BlockBinIndex &index, const double *pox, const double *DH)
{
if (!index.valid)
return -1;
const int bx = coord_to_bin(pox[0], index.lo[0], index.inv[0], index.bins[0]);
const int by = coord_to_bin(pox[1], index.lo[1], index.inv[1], index.bins[1]);
const int bz = coord_to_bin(pox[2], index.lo[2], index.inv[2], index.bins[2]);
const vector<int> &cand = index.bin_to_blocks[bin_loc(index, bx, by, bz)];
for (size_t ci = 0; ci < cand.size(); ci++)
{
const int bi = cand[ci];
if (point_in_block_view(index.views[bi], pox, DH))
return bi;
}
// Fallback to full scan for numerical edge cases around bin boundaries.
for (size_t bi = 0; bi < index.views.size(); bi++)
if (point_in_block_view(index.views[bi], pox, DH))
return int(bi);
return -1;
}
} // namespace
Patch::Patch(int DIM, int *shapei, double *bboxi, int levi, bool buflog, int Symmetry) : lev(levi) Patch::Patch(int DIM, int *shapei, double *bboxi, int levi, bool buflog, int Symmetry) : lev(levi)
{ {
@@ -530,11 +364,9 @@ void Patch::Interp_Points(MyList<var> *VarList,
for (int j = 0; j < NN; j++) for (int j = 0; j < NN; j++)
owner_rank[j] = -1; owner_rank[j] = -1;
double DH[dim]; double DH[dim], llb[dim], uub[dim];
for (int i = 0; i < dim; i++) for (int i = 0; i < dim; i++)
DH[i] = getdX(i); DH[i] = getdX(i);
BlockBinIndex block_index;
build_block_bin_index(this, DH, block_index);
for (int j = 0; j < NN; j++) // run along points for (int j = 0; j < NN; j++) // run along points
{ {
@@ -557,27 +389,59 @@ void Patch::Interp_Points(MyList<var> *VarList,
} }
} }
const int block_i = find_block_index_for_point(block_index, pox, DH); MyList<Block> *Bp = blb;
if (block_i >= 0) bool notfind = true;
while (notfind && Bp) // run along Blocks
{ {
Block *BP = block_index.views[block_i].bp; Block *BP = Bp->data;
owner_rank[j] = BP->rank;
if (myrank == BP->rank) bool flag = true;
for (int i = 0; i < dim; i++)
{ {
//---> interpolation #ifdef Vertex
varl = VarList; #ifdef Cell
int k = 0; #error Both Cell and Vertex are defined
while (varl) // run along variables #endif
llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + (ghost_width - 0.5) * DH[i];
uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - (ghost_width - 0.5) * DH[i];
#else
#ifdef Cell
llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + ghost_width * DH[i];
uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - ghost_width * DH[i];
#else
#error Not define Vertex nor Cell
#endif
#endif
if (XX[i][j] - llb[i] < -DH[i] / 2 || XX[i][j] - uub[i] > DH[i] / 2)
{ {
f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k], flag = false;
pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry); break;
varl = varl->next;
k++;
} }
} }
if (flag)
{
notfind = false;
owner_rank[j] = BP->rank;
if (myrank == BP->rank)
{
//---> interpolation
varl = VarList;
int k = 0;
while (varl) // run along variables
{
f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
varl = varl->next;
k++;
}
}
}
if (Bp == ble)
break;
Bp = Bp->next;
} }
} }
// Replace MPI_Allreduce with per-owner MPI_Bcast: // Replace MPI_Allreduce with per-owner MPI_Bcast:
// Group consecutive points by owner rank and broadcast each group. // Group consecutive points by owner rank and broadcast each group.
// Since each point's data is non-zero only on the owner rank, // Since each point's data is non-zero only on the owner rank,
@@ -642,9 +506,9 @@ void Patch::Interp_Points(MyList<var> *VarList,
// Targeted point-to-point overload: each owner sends each point only to // Targeted point-to-point overload: each owner sends each point only to
// the one rank that needs it for integration (consumer), reducing // the one rank that needs it for integration (consumer), reducing
// communication volume by ~nprocs times compared to the Bcast version. // communication volume by ~nprocs times compared to the Bcast version.
#ifdef INTERP_LB_PROFILE /*
double t_interp_start = MPI_Wtime(); double t_calc_end, t_calc_total = 0;
#endif double t_calc_start = MPI_Wtime();*/
int myrank, nprocs; int myrank, nprocs;
MPI_Comm_rank(MPI_COMM_WORLD, &myrank); MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
MPI_Comm_size(MPI_COMM_WORLD, &nprocs); MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
@@ -667,11 +531,9 @@ void Patch::Interp_Points(MyList<var> *VarList,
for (int j = 0; j < NN; j++) for (int j = 0; j < NN; j++)
owner_rank[j] = -1; owner_rank[j] = -1;
double DH[dim]; double DH[dim], llb[dim], uub[dim];
for (int i = 0; i < dim; i++) for (int i = 0; i < dim; i++)
DH[i] = getdX(i); DH[i] = getdX(i);
BlockBinIndex block_index;
build_block_bin_index(this, DH, block_index);
// --- Interpolation phase (identical to original) --- // --- Interpolation phase (identical to original) ---
for (int j = 0; j < NN; j++) for (int j = 0; j < NN; j++)
@@ -695,36 +557,67 @@ void Patch::Interp_Points(MyList<var> *VarList,
} }
} }
const int block_i = find_block_index_for_point(block_index, pox, DH); MyList<Block> *Bp = blb;
if (block_i >= 0) bool notfind = true;
while (notfind && Bp)
{ {
Block *BP = block_index.views[block_i].bp; Block *BP = Bp->data;
owner_rank[j] = BP->rank;
if (myrank == BP->rank) bool flag = true;
for (int i = 0; i < dim; i++)
{ {
varl = VarList; #ifdef Vertex
int k = 0; #ifdef Cell
while (varl) #error Both Cell and Vertex are defined
#endif
llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + (ghost_width - 0.5) * DH[i];
uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - (ghost_width - 0.5) * DH[i];
#else
#ifdef Cell
llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + ghost_width * DH[i];
uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - ghost_width * DH[i];
#else
#error Not define Vertex nor Cell
#endif
#endif
if (XX[i][j] - llb[i] < -DH[i] / 2 || XX[i][j] - uub[i] > DH[i] / 2)
{ {
f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k], flag = false;
pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry); break;
varl = varl->next;
k++;
} }
} }
if (flag)
{
notfind = false;
owner_rank[j] = BP->rank;
if (myrank == BP->rank)
{
varl = VarList;
int k = 0;
while (varl)
{
f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
varl = varl->next;
k++;
}
}
}
if (Bp == ble)
break;
Bp = Bp->next;
} }
} }
/*
#ifdef INTERP_LB_PROFILE t_calc_end = MPI_Wtime();
double t_interp_end = MPI_Wtime(); t_calc_total = t_calc_end - t_calc_start;*/
double t_interp_local = t_interp_end - t_interp_start;
#endif
// --- Error check for unfound points --- // --- Error check for unfound points ---
for (int j = 0; j < NN; j++) for (int j = 0; j < NN; j++)
{ {
if (owner_rank[j] < 0 && myrank == 0) if (owner_rank[j] < 0 && myrank == 0)
{ {
cout<<owner_rank[j-1]<<endl;
cout << "ERROR: Patch::Interp_Points fails to find point ("; cout << "ERROR: Patch::Interp_Points fails to find point (";
for (int d = 0; d < dim; d++) for (int d = 0; d < dim; d++)
{ {
@@ -876,31 +769,63 @@ void Patch::Interp_Points(MyList<var> *VarList,
delete[] recv_count; delete[] recv_count;
delete[] consumer_rank; delete[] consumer_rank;
delete[] owner_rank; delete[] owner_rank;
/*
	// 4. Gather the per-rank timings and print the ranks with the longest net computation time (top 10)
struct RankStats {
int rank;
double calc_time; // 净计算时间
};
#ifdef INTERP_LB_PROFILE // 创建当前进程的统计数据
{ RankStats local_stat;
static bool profile_written = false; local_stat.rank = myrank;
if (!profile_written) { local_stat.calc_time = t_calc_total;
double *all_times = nullptr;
if (myrank == 0) all_times = new double[nprocs]; // 为所有进程的统计数据分配内存
MPI_Gather(&t_interp_local, 1, MPI_DOUBLE, RankStats *all_stats = nullptr;
all_times, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); if (myrank == 0) {
if (myrank == 0) { all_stats = new RankStats[nprocs];
int heavy[64];
int nh = InterpLBProfile::identify_heavy_ranks(
all_times, nprocs, 2.5, heavy, 64);
InterpLBProfile::write_profile(
"interp_lb_profile.bin", nprocs,
all_times, heavy, nh, 2.5);
printf("[InterpLB] Profile written: %d heavy ranks\n", nh);
for (int i = 0; i < nh; i++)
printf(" Heavy rank %d: %.6f s\n", heavy[i], all_times[heavy[i]]);
delete[] all_times;
}
profile_written = true;
}
} }
#endif
// 使用MPI_Gather收集所有进程的数据到rank 0
MPI_Gather(&local_stat, sizeof(RankStats), MPI_BYTE,
all_stats, sizeof(RankStats), MPI_BYTE,
0, MPI_COMM_WORLD);
	// Prepare the top-10 rank info for output (all ranks participate, so the data is consistent after the broadcast)
int top10_ranks[10] = { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 };
double top10_times[10] = { 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 };
int num_top10 = 0;
if (myrank == 0) {
	// Sort by calc_time (net computation time), largest first
std::sort(all_stats, all_stats + nprocs, [](const RankStats& a, const RankStats& b) {
return a.calc_time > b.calc_time;
});
	// Take the top 10 entries (or all of them when nprocs < 10)
num_top10 = (nprocs < 10) ? nprocs : 10;
for (int i = 0; i < num_top10; i++) {
top10_ranks[i] = all_stats[i].rank;
top10_times[i] = all_stats[i].calc_time;
}
printf("\n--- Top %d Ranks by ACTIVE COMPUTATION (CPU Time) ---\n", num_top10);
for (int i = 0; i < num_top10; i++) {
printf("Rank [%4d]: Calc %.6f s\n", top10_ranks[i], top10_times[i]);
}
// 清理分配的内存
delete[] all_stats;
}
	// Broadcast the top-10 rank info to all processes
MPI_Bcast(&num_top10, 1, MPI_INT, 0, MPI_COMM_WORLD);
if (num_top10 > 0) {
MPI_Bcast(top10_ranks, 10, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Bcast(top10_times, 10, MPI_DOUBLE, 0, MPI_COMM_WORLD);
}
*/
} }
void Patch::Interp_Points(MyList<var> *VarList, void Patch::Interp_Points(MyList<var> *VarList,
int NN, double **XX, int NN, double **XX,
@@ -934,11 +859,9 @@ void Patch::Interp_Points(MyList<var> *VarList,
MPI_Comm_group(MPI_COMM_WORLD, &world_group); MPI_Comm_group(MPI_COMM_WORLD, &world_group);
MPI_Comm_group(Comm_here, &local_group); MPI_Comm_group(Comm_here, &local_group);
double DH[dim]; double DH[dim], llb[dim], uub[dim];
for (int i = 0; i < dim; i++) for (int i = 0; i < dim; i++)
DH[i] = getdX(i); DH[i] = getdX(i);
BlockBinIndex block_index;
build_block_bin_index(this, DH, block_index);
for (int j = 0; j < NN; j++) // run along points for (int j = 0; j < NN; j++) // run along points
{ {
@@ -961,24 +884,57 @@ void Patch::Interp_Points(MyList<var> *VarList,
} }
} }
const int block_i = find_block_index_for_point(block_index, pox, DH); MyList<Block> *Bp = blb;
if (block_i >= 0) bool notfind = true;
while (notfind && Bp) // run along Blocks
{ {
Block *BP = block_index.views[block_i].bp; Block *BP = Bp->data;
owner_rank[j] = BP->rank;
if (myrank == BP->rank) bool flag = true;
for (int i = 0; i < dim; i++)
{ {
//---> interpolation #ifdef Vertex
varl = VarList; #ifdef Cell
int k = 0; #error Both Cell and Vertex are defined
while (varl) // run along variables #endif
llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + (ghost_width - 0.5) * DH[i];
uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - (ghost_width - 0.5) * DH[i];
#else
#ifdef Cell
llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + ghost_width * DH[i];
uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - ghost_width * DH[i];
#else
#error Not define Vertex nor Cell
#endif
#endif
if (XX[i][j] - llb[i] < -DH[i] / 2 || XX[i][j] - uub[i] > DH[i] / 2)
{ {
f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k], flag = false;
pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry); break;
varl = varl->next;
k++;
} }
} }
if (flag)
{
notfind = false;
owner_rank[j] = BP->rank;
if (myrank == BP->rank)
{
//---> interpolation
varl = VarList;
int k = 0;
while (varl) // run along variables
{
f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
varl = varl->next;
k++;
}
}
}
if (Bp == ble)
break;
Bp = Bp->next;
} }
} }

View File

@@ -24,6 +24,7 @@ using namespace std;
#endif #endif
#include <mpi.h> #include <mpi.h>
#include <memory.h>
#include "MyList.h" #include "MyList.h"
#include "Block.h" #include "Block.h"
#include "Parallel.h" #include "Parallel.h"

File diff suppressed because it is too large Load Diff

View File

@@ -11,7 +11,7 @@
#include <cmath> #include <cmath>
#include <new> #include <new>
using namespace std; using namespace std;
#include <memory.h>
#include "Parallel_bam.h" #include "Parallel_bam.h"
#include "var.h" #include "var.h"
#include "MPatch.h" #include "MPatch.h"
@@ -32,16 +32,25 @@ namespace Parallel
int partition2(int *nxy, int split_size, int *min_width, int cpusize, int *shape); // special for 2 diemnsions int partition2(int *nxy, int split_size, int *min_width, int cpusize, int *shape); // special for 2 diemnsions
int partition3(int *nxyz, int split_size, int *min_width, int cpusize, int *shape); int partition3(int *nxyz, int split_size, int *min_width, int cpusize, int *shape);
MyList<Block> *distribute(MyList<Patch> *PatchLIST, int cpusize, int ingfsi, int fngfs, bool periodic, int nodes = 0); // produce corresponding Blocks MyList<Block> *distribute(MyList<Patch> *PatchLIST, int cpusize, int ingfsi, int fngfs, bool periodic, int nodes = 0); // produce corresponding Blocks
MyList<Block> *distribute_optimize(MyList<Patch> *PatchLIST, int cpusize, int ingfsi, int fngfs, bool periodic, int nodes = 0);
MyList<Block> *distribute_hard(MyList<Patch> *PatchLIST, int cpusize, int ingfsi, int fngfs, bool periodic, int nodes = 0); // produce corresponding Blocks
Block* splitHotspotBlock(MyList<Block>* &BlL, int _dim, Block* splitHotspotBlock(MyList<Block>* &BlL, int _dim,
int ib0_orig, int ib3_orig, int ib0_orig, int ib3_orig,
int jb1_orig, int jb4_orig, int jb1_orig, int jb4_orig,
int kb2_orig, int kb5_orig, int kb2_orig, int kb5_orig,
Patch* PP, int r_left, int r_right, Patch* PP, int r_1, int r_2,
int ingfsi, int fngfsi, bool periodic, int ingfsi, int fngfsi, bool periodic,
Block* &split_first_block, Block* &split_last_block); Block* &split_first_block, Block* &split_last_block);
Block* splitHotspotBlock(MyList<Block>* &BlL, int _dim,
int ib0_orig, int ib3_orig,
int jb1_orig, int jb4_orig,
int kb2_orig, int kb5_orig,
Patch* PP, int r_1, int r_2, int r_3, int r_4,
int ingfsi, int fngfsi, bool periodic,
Block* &split_first_block, Block* &split_last_block);
Block* createMappedBlock(MyList<Block>* &BlL, int _dim, int* shape, double* bbox, Block* createMappedBlock(MyList<Block>* &BlL, int _dim, int* shape, double* bbox,
int block_id, int ingfsi, int fngfsi, int lev); int block_id, int ingfsi, int fngfsi, int lev);
void KillBlocks(MyList<Patch> *PatchLIST); void KillBlocks(MyList<Patch> *PatchLIST);
void setfunction(MyList<Block> *BlL, var *vn, double func(double x, double y, double z)); void setfunction(MyList<Block> *BlL, var *vn, double func(double x, double y, double z));
@@ -108,9 +117,6 @@ namespace Parallel
MPI_Status *stats; MPI_Status *stats;
int max_reqs; int max_reqs;
bool lengths_valid; bool lengths_valid;
int *tc_req_node;
int *tc_req_is_recv;
int *tc_completed;
SyncCache(); SyncCache();
void invalidate(); void invalidate();
void destroy(); void destroy();
@@ -124,10 +130,7 @@ namespace Parallel
struct AsyncSyncState { struct AsyncSyncState {
int req_no; int req_no;
bool active; bool active;
int *req_node; AsyncSyncState() : req_no(0), active(false) {}
int *req_is_recv;
int pending_recv;
AsyncSyncState() : req_no(0), active(false), req_node(0), req_is_recv(0), pending_recv(0) {}
}; };
void Sync_start(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry, void Sync_start(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry,
@@ -183,7 +186,6 @@ namespace Parallel
MyList<Parallel::gridseg> **out_src, MyList<Parallel::gridseg> **out_dst); MyList<Parallel::gridseg> **out_src, MyList<Parallel::gridseg> **out_dst);
void PeriodicBD(Patch *Pat, MyList<var> *VarList, int Symmetry); void PeriodicBD(Patch *Pat, MyList<var> *VarList, int Symmetry);
double L2Norm(Patch *Pat, var *vf); double L2Norm(Patch *Pat, var *vf);
void L2Norm7(Patch *Pat, var **vf, double *norms);
void checkgsl(MyList<Parallel::gridseg> *pp, bool first_only); void checkgsl(MyList<Parallel::gridseg> *pp, bool first_only);
void checkvarl(MyList<var> *pp, bool first_only); void checkvarl(MyList<var> *pp, bool first_only);
MyList<Parallel::gridseg> *divide_gsl(MyList<Parallel::gridseg> *p, Patch *Pat); MyList<Parallel::gridseg> *divide_gsl(MyList<Parallel::gridseg> *p, Patch *Pat);
@@ -219,13 +221,24 @@ namespace Parallel
void checkpatchlist(MyList<Patch> *PatL, bool buflog); void checkpatchlist(MyList<Patch> *PatL, bool buflog);
double L2Norm(Patch *Pat, var *vf, MPI_Comm Comm_here); double L2Norm(Patch *Pat, var *vf, MPI_Comm Comm_here);
void L2Norm7(Patch *Pat, var **vf, double *norms, MPI_Comm Comm_here);
bool PatList_Interp_Points(MyList<Patch> *PatL, MyList<var> *VarList, bool PatList_Interp_Points(MyList<Patch> *PatL, MyList<var> *VarList,
int NN, double **XX, int NN, double **XX,
double *Shellf, int Symmetry, MPI_Comm Comm_here); double *Shellf, int Symmetry, MPI_Comm Comm_here);
#if (PSTR == 1 || PSTR == 2 || PSTR == 3) #if (PSTR == 1 || PSTR == 2 || PSTR == 3)
MyList<Block> *distribute(MyList<Patch> *PatchLIST, int cpusize, int ingfsi, int fngfsi, MyList<Block> *distribute(MyList<Patch> *PatchLIST, int cpusize, int ingfsi, int fngfsi,
bool periodic, int start_rank, int end_rank, int nodes = 0); bool periodic, int start_rank, int end_rank, int nodes = 0);
// Redistribute blocks with time statistics for load balancing
MyList<Block> *distribute(MyList<Patch> *PatchLIST, MyList<Block> *OldBlockL,
int cpusize, int ingfsi, int fngfsi,
bool periodic, int start_rank, int end_rank, int nodes = 0);
#endif #endif
}
#endif /*PARALLEL_H */ // Dynamic load balancing: split blocks for heavy ranks
void split_heavy_blocks(MyList<Patch> *PatL, int *heavy_ranks, int num_heavy,
int split_factor, int cpusize, int ingfsi, int fngfsi);
// Check if load balancing is needed based on interpolation times
bool check_load_balance_need(double *rank_times, int nprocs, int &num_heavy, int *heavy_ranks);
}
#endif /*PARALLEL_H */

File diff suppressed because it is too large Load Diff

View File

@@ -102,16 +102,6 @@ public:
//-1: means no dumy dimension at all; 0: means rho; 1: means sigma //-1: means no dumy dimension at all; 0: means rho; 1: means sigma
}; };
// Thread-safe search result (no pointers to shared mutable state)
struct PointSearchResult
{
bool found;
Block *Bg;
double gx, gy, gz; // global Cartesian coordinates
double lx, ly, lz; // local coordinates within the found block
int ssst; // source shell-patch type (-1 = Cartesian)
};
int myrank; int myrank;
int shape[dim]; // for (rho, sigma, R), for rho and sigma means number of points for every pi/2 int shape[dim]; // for (rho, sigma, R), for rho and sigma means number of points for every pi/2
double Rrange[2]; // for Rmin and Rmax double Rrange[2]; // for Rmin and Rmax
@@ -185,12 +175,6 @@ public:
MyList<Patch> *Pp, double CDH[dim], MyList<pointstru> *pss); MyList<Patch> *Pp, double CDH[dim], MyList<pointstru> *pss);
bool prolongpointstru(MyList<pointstru> *&psul, bool ssyn, int tsst, MyList<ss_patch> *sPp, double DH[dim], bool prolongpointstru(MyList<pointstru> *&psul, bool ssyn, int tsst, MyList<ss_patch> *sPp, double DH[dim],
MyList<Patch> *Pp, double CDH[dim], double x, double y, double z, int Symmetry, int rank_in); MyList<Patch> *Pp, double CDH[dim], double x, double y, double z, int Symmetry, int rank_in);
// Read-only point search — thread-safe (no shared mutable state modified)
PointSearchResult prolongpointstru_search(bool ssyn, int tsst, MyList<ss_patch> *sPp, double DH[dim],
MyList<Patch> *Pp, double CDH[dim], double x, double y, double z,
int Symmetry, int rank_in);
// Append a search result to a linked list — use inside omp critical section
void prolongpointstru_append(MyList<pointstru> *&psul, const PointSearchResult &sr, int tsst);
void setupintintstuff(int cpusize, MyList<Patch> *CPatL, int Symmetry); void setupintintstuff(int cpusize, MyList<Patch> *CPatL, int Symmetry);
void intertransfer(MyList<pointstru> **src, MyList<pointstru> **dst, void intertransfer(MyList<pointstru> **src, MyList<pointstru> **dst,
MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /*target */, MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /*target */,
@@ -214,7 +198,6 @@ public:
void write_Pablo_file_ss(int *ext, double xmin, double xmax, double ymin, double ymax, double zmin, double zmax, void write_Pablo_file_ss(int *ext, double xmin, double xmax, double ymin, double ymax, double zmin, double zmax,
char *filename, int sst); char *filename, int sst);
double L2Norm(var *vf); double L2Norm(var *vf);
void L2Norm7(var **vf, double *norms);
void Find_Maximum(MyList<var> *VarList, double *XX, double *Shellf); void Find_Maximum(MyList<var> *VarList, double *XX, double *Shellf);
}; };

View File

@@ -94,31 +94,29 @@
Hcon,Mxcon,Mycon,Mzcon,Gmxcon,Gmycon,Gmzcon, & Hcon,Mxcon,Mycon,Mzcon,Gmxcon,Gmycon,Gmzcon, &
Symmetry,Lev,eps,co) Symmetry,Lev,eps,co)
if (co == 0) then
#if (ABV == 0) #if (ABV == 0)
call ricci_gamma(ex, X, Y, Z, & call ricci_gamma(ex, X, Y, Z, &
chi, & chi, &
dxx , gxy , gxz , dyy , gyz , dzz,& dxx , gxy , gxz , dyy , gyz , dzz,&
Gamx , Gamy , Gamz , & Gamx , Gamy , Gamz , &
Gamxxx,Gamxxy,Gamxxz,Gamxyy,Gamxyz,Gamxzz,& Gamxxx,Gamxxy,Gamxxz,Gamxyy,Gamxyz,Gamxzz,&
Gamyxx,Gamyxy,Gamyxz,Gamyyy,Gamyyz,Gamyzz,& Gamyxx,Gamyxy,Gamyxz,Gamyyy,Gamyyz,Gamyzz,&
Gamzxx,Gamzxy,Gamzxz,Gamzyy,Gamzyz,Gamzzz,& Gamzxx,Gamzxy,Gamzxz,Gamzyy,Gamzyz,Gamzzz,&
Rxx,Rxy,Rxz,Ryy,Ryz,Rzz,& Rxx,Rxy,Rxz,Ryy,Ryz,Rzz,&
Symmetry) Symmetry)
#endif #endif
call constraint_bssn(ex, X, Y, Z,& call constraint_bssn(ex, X, Y, Z,&
chi,trK, & chi,trK, &
dxx,gxy,gxz,dyy,gyz,dzz, & dxx,gxy,gxz,dyy,gyz,dzz, &
Axx,Axy,Axz,Ayy,Ayz,Azz, & Axx,Axy,Axz,Ayy,Ayz,Azz, &
Gamx,Gamy,Gamz,& Gamx,Gamy,Gamz,&
Lap,betax,betay,betaz,rho,Sx,Sy,Sz,& Lap,betax,betay,betaz,rho,Sx,Sy,Sz,&
Gamxxx, Gamxxy, Gamxxz,Gamxyy, Gamxyz, Gamxzz, & Gamxxx, Gamxxy, Gamxxz,Gamxyy, Gamxyz, Gamxzz, &
Gamyxx, Gamyxy, Gamyxz,Gamyyy, Gamyyz, Gamyzz, & Gamyxx, Gamyxy, Gamyxz,Gamyyy, Gamyyz, Gamyzz, &
Gamzxx, Gamzxy, Gamzxz,Gamzyy, Gamzyz, Gamzzz, & Gamzxx, Gamzxy, Gamzxz,Gamzyy, Gamzyz, Gamzzz, &
Rxx,Rxy,Rxz,Ryy,Ryz,Rzz, & Rxx,Rxy,Rxz,Ryy,Ryz,Rzz, &
Hcon,Mxcon,Mycon,Mzcon,Gmxcon,Gmycon,Gmzcon, & Hcon,Mxcon,Mycon,Mzcon,Gmxcon,Gmycon,Gmzcon, &
Symmetry) Symmetry)
endif
return return
@@ -229,7 +227,6 @@
call get_Z4cparameters(kappa1,kappa2,kappa3,FF,eta) call get_Z4cparameters(kappa1,kappa2,kappa3,FF,eta)
!!! sanity check !!! sanity check
#ifdef DEBUG
dX = sum(chi)+sum(trK)+sum(dxx)+sum(gxy)+sum(gxz)+sum(dyy)+sum(gyz)+sum(dzz) & dX = sum(chi)+sum(trK)+sum(dxx)+sum(gxy)+sum(gxz)+sum(dyy)+sum(gyz)+sum(dzz) &
+sum(Axx)+sum(Axy)+sum(Axz)+sum(Ayy)+sum(Ayz)+sum(Azz) & +sum(Axx)+sum(Axy)+sum(Axz)+sum(Ayy)+sum(Ayz)+sum(Azz) &
+sum(Gamx)+sum(Gamy)+sum(Gamz) & +sum(Gamx)+sum(Gamy)+sum(Gamz) &
@@ -264,7 +261,6 @@
gont = 1 gont = 1
return return
endif endif
#endif
PI = dacos(-ONE) PI = dacos(-ONE)
@@ -1267,32 +1263,30 @@
endif endif
if (co == 0) then
#if (ABV == 0) #if (ABV == 0)
call ricci_gamma(ex, X, Y, Z, & call ricci_gamma(ex, X, Y, Z, &
chi, & chi, &
dxx , gxy , gxz , dyy , gyz , dzz,& dxx , gxy , gxz , dyy , gyz , dzz,&
Gamx , Gamy , Gamz , & Gamx , Gamy , Gamz , &
Gamxxx,Gamxxy,Gamxxz,Gamxyy,Gamxyz,Gamxzz,& Gamxxx,Gamxxy,Gamxxz,Gamxyy,Gamxyz,Gamxzz,&
Gamyxx,Gamyxy,Gamyxz,Gamyyy,Gamyyz,Gamyzz,& Gamyxx,Gamyxy,Gamyxz,Gamyyy,Gamyyz,Gamyzz,&
Gamzxx,Gamzxy,Gamzxz,Gamzyy,Gamzyz,Gamzzz,& Gamzxx,Gamzxy,Gamzxz,Gamzyy,Gamzyz,Gamzzz,&
Rxx,Rxy,Rxz,Ryy,Ryz,Rzz,& Rxx,Rxy,Rxz,Ryy,Ryz,Rzz,&
Symmetry) Symmetry)
#endif #endif
call constraint_bssn(ex, X, Y, Z,& call constraint_bssn(ex, X, Y, Z,&
chi,trK, & chi,trK, &
dxx,gxy,gxz,dyy,gyz,dzz, & dxx,gxy,gxz,dyy,gyz,dzz, &
Axx,Axy,Axz,Ayy,Ayz,Azz, & Axx,Axy,Axz,Ayy,Ayz,Azz, &
Gamx,Gamy,Gamz,& Gamx,Gamy,Gamz,&
Lap,betax,betay,betaz,rho,Sx,Sy,Sz,& Lap,betax,betay,betaz,rho,Sx,Sy,Sz,&
Gamxxx, Gamxxy, Gamxxz,Gamxyy, Gamxyz, Gamxzz, & Gamxxx, Gamxxy, Gamxxz,Gamxyy, Gamxyz, Gamxzz, &
Gamyxx, Gamyxy, Gamyxz,Gamyyy, Gamyyz, Gamyzz, & Gamyxx, Gamyxy, Gamyxz,Gamyyy, Gamyyz, Gamyzz, &
Gamzxx, Gamzxy, Gamzxz,Gamzyy, Gamzyz, Gamzzz, & Gamzxx, Gamzxy, Gamzxz,Gamzyy, Gamzyz, Gamzzz, &
Rxx,Rxy,Rxz,Ryy,Ryz,Rzz, & Rxx,Rxy,Rxz,Ryy,Ryz,Rzz, &
Hcon,Mxcon,Mycon,Mzcon,Gmxcon,Gmycon,Gmzcon, & Hcon,Mxcon,Mycon,Mzcon,Gmxcon,Gmycon,Gmzcon, &
Symmetry) Symmetry)
endif
gont = 0 gont = 0

View File

@@ -122,7 +122,6 @@
call get_Z4cparameters(kappa1,kappa2,kappa3,FF,eta) call get_Z4cparameters(kappa1,kappa2,kappa3,FF,eta)
!!! sanity check !!! sanity check
#ifdef DEBUG
dX = sum(chi)+sum(trK)+sum(dxx)+sum(gxy)+sum(gxz)+sum(dyy)+sum(gyz)+sum(dzz) & dX = sum(chi)+sum(trK)+sum(dxx)+sum(gxy)+sum(gxz)+sum(dyy)+sum(gyz)+sum(dzz) &
+sum(Axx)+sum(Axy)+sum(Axz)+sum(Ayy)+sum(Ayz)+sum(Azz) & +sum(Axx)+sum(Axy)+sum(Axz)+sum(Ayy)+sum(Ayz)+sum(Azz) &
+sum(Gamx)+sum(Gamy)+sum(Gamz) & +sum(Gamx)+sum(Gamy)+sum(Gamz) &
@@ -157,7 +156,6 @@
gont = 1 gont = 1
return return
endif endif
#endif
PI = dacos(-ONE) PI = dacos(-ONE)
@@ -1390,43 +1388,41 @@
call kodis_sh(ex,crho,sigma,R,TZ,TZ_rhs,SSS,Symmetry,eps,sst) call kodis_sh(ex,crho,sigma,R,TZ,TZ_rhs,SSS,Symmetry,eps,sst)
endif endif
if (co == 0) then
#if (ABV == 1) #if (ABV == 1)
call ricci_gamma_ss(ex,crho,sigma,R,X, Y, Z, & call ricci_gamma_ss(ex,crho,sigma,R,X, Y, Z, &
drhodx, drhody, drhodz, & drhodx, drhody, drhodz, &
dsigmadx,dsigmady,dsigmadz, & dsigmadx,dsigmady,dsigmadz, &
dRdx,dRdy,dRdz, & dRdx,dRdy,dRdz, &
drhodxx,drhodxy,drhodxz,drhodyy,drhodyz,drhodzz, & drhodxx,drhodxy,drhodxz,drhodyy,drhodyz,drhodzz, &
dsigmadxx,dsigmadxy,dsigmadxz,dsigmadyy,dsigmadyz,dsigmadzz, & dsigmadxx,dsigmadxy,dsigmadxz,dsigmadyy,dsigmadyz,dsigmadzz, &
dRdxx,dRdxy,dRdxz,dRdyy,dRdyz,dRdzz, & dRdxx,dRdxy,dRdxz,dRdyy,dRdyz,dRdzz, &
chi, & chi, &
dxx , gxy , gxz , dyy , gyz , dzz,& dxx , gxy , gxz , dyy , gyz , dzz,&
Gamx , Gamy , Gamz , & Gamx , Gamy , Gamz , &
Gamxxx,Gamxxy,Gamxxz,Gamxyy,Gamxyz,Gamxzz,& Gamxxx,Gamxxy,Gamxxz,Gamxyy,Gamxyz,Gamxzz,&
Gamyxx,Gamyxy,Gamyxz,Gamyyy,Gamyyz,Gamyzz,& Gamyxx,Gamyxy,Gamyxz,Gamyyy,Gamyyz,Gamyzz,&
Gamzxx,Gamzxy,Gamzxz,Gamzyy,Gamzyz,Gamzzz,& Gamzxx,Gamzxy,Gamzxz,Gamzyy,Gamzyz,Gamzzz,&
Rxx,Rxy,Rxz,Ryy,Ryz,Rzz,& Rxx,Rxy,Rxz,Ryy,Ryz,Rzz,&
Symmetry,Lev,sst) Symmetry,Lev,sst)
call constraint_bssn_ss(ex,crho,sigma,R,X, Y, Z, &
drhodx, drhody, drhodz, &
dsigmadx,dsigmady,dsigmadz, &
dRdx,dRdy,dRdz, &
drhodxx,drhodxy,drhodxz,drhodyy,drhodyz,drhodzz, &
dsigmadxx,dsigmadxy,dsigmadxz,dsigmadyy,dsigmadyz,dsigmadzz, &
dRdxx,dRdxy,dRdxz,dRdyy,dRdyz,dRdzz, &
chi,trK, &
dxx,gxy,gxz,dyy,gyz,dzz, &
Axx,Axy,Axz,Ayy,Ayz,Azz, &
Gamx,Gamy,Gamz,&
Lap,betax,betay,betaz,rho,Sx,Sy,Sz,&
Gamxxx, Gamxxy, Gamxxz,Gamxyy, Gamxyz, Gamxzz, &
Gamyxx, Gamyxy, Gamyxz,Gamyyy, Gamyyz, Gamyzz, &
Gamzxx, Gamzxy, Gamzxz,Gamzyy, Gamzyz, Gamzzz, &
Rxx,Rxy,Rxz,Ryy,Ryz,Rzz, &
Hcon,Mxcon,Mycon,Mzcon,Gmxcon,Gmycon,Gmzcon, &
Symmetry,Lev,sst)
#endif #endif
call constraint_bssn_ss(ex,crho,sigma,R,X, Y, Z, &
drhodx, drhody, drhodz, &
dsigmadx,dsigmady,dsigmadz, &
dRdx,dRdy,dRdz, &
drhodxx,drhodxy,drhodxz,drhodyy,drhodyz,drhodzz, &
dsigmadxx,dsigmadxy,dsigmadxz,dsigmadyy,dsigmadyz,dsigmadzz, &
dRdxx,dRdxy,dRdxz,dRdyy,dRdyz,dRdzz, &
chi,trK, &
dxx,gxy,gxz,dyy,gyz,dzz, &
Axx,Axy,Axz,Ayy,Ayz,Azz, &
Gamx,Gamy,Gamz,&
Lap,betax,betay,betaz,rho,Sx,Sy,Sz,&
Gamxxx, Gamxxy, Gamxxz,Gamxyy, Gamxyz, Gamxzz, &
Gamyxx, Gamyxy, Gamyxz,Gamyyy, Gamyyz, Gamyzz, &
Gamzxx, Gamzxy, Gamzxz,Gamzyy, Gamzyz, Gamzzz, &
Rxx,Rxy,Rxz,Ryy,Ryz,Rzz, &
Hcon,Mxcon,Mycon,Mzcon,Gmxcon,Gmycon,Gmzcon, &
Symmetry,Lev,sst)
endif
gont = 0 gont = 0

View File

@@ -258,8 +258,6 @@ void bssnEM_class::Initialize()
PhysTime = StartTime; PhysTime = StartTime;
Setup_Black_Hole_position(); Setup_Black_Hole_position();
} }
setup_transfer_caches();
} }
//================================================================================================ //================================================================================================

View File

@@ -26,12 +26,6 @@ using namespace std;
#include "shellfunctions.h" #include "shellfunctions.h"
#include "parameters.h" #include "parameters.h"
#if BSSN_USE_ESCALAR_C_KERNEL
#define BSSN_ESCALAR_RHS f_compute_rhs_bssn_escalar_c
#else
#define BSSN_ESCALAR_RHS f_compute_rhs_bssn_escalar
#endif
#ifdef With_AHF #ifdef With_AHF
#include "derivatives.h" #include "derivatives.h"
#include "myglobal.h" #include "myglobal.h"
@@ -139,9 +133,6 @@ void bssnEScalar_class::Initialize()
} }
GH = new cgh(0, ngfs, Symmetry, pname, checkrun, ErrorMonitor); GH = new cgh(0, ngfs, Symmetry, pname, checkrun, ErrorMonitor);
ConstraintRefreshLevels = new int[GH->levels];
for (int il = 0; il < GH->levels; il++)
ConstraintRefreshLevels[il] = 0;
if (checkrun) if (checkrun)
CheckPoint->readcheck_cgh(PhysTime, GH, myrank, nprocs, Symmetry); CheckPoint->readcheck_cgh(PhysTime, GH, myrank, nprocs, Symmetry);
else else
@@ -174,8 +165,6 @@ void bssnEScalar_class::Initialize()
PhysTime = StartTime; PhysTime = StartTime;
Setup_Black_Hole_position(); Setup_Black_Hole_position();
} }
setup_transfer_caches();
} }
//================================================================================================ //================================================================================================
@@ -241,9 +230,6 @@ void bssnEScalar_class::Read_Ansorg()
} }
int BH_NM; int BH_NM;
double *Porg_here; double *Porg_here;
double *pmom_local;
double *spin_local;
double *mass_local;
// read parameter from file // read parameter from file
{ {
const int LEN = 256; const int LEN = 256;
@@ -285,9 +271,9 @@ void bssnEScalar_class::Read_Ansorg()
} }
Porg_here = new double[3 * BH_NM]; Porg_here = new double[3 * BH_NM];
pmom_local = new double[3 * BH_NM]; Pmom = new double[3 * BH_NM];
spin_local = new double[3 * BH_NM]; Spin = new double[3 * BH_NM];
mass_local = new double[BH_NM]; Mass = new double[BH_NM];
// read parameter from file // read parameter from file
{ {
const int LEN = 256; const int LEN = 256;
@@ -322,7 +308,7 @@ void bssnEScalar_class::Read_Ansorg()
if (sgrp == "BSSN" && sind < BH_NM) if (sgrp == "BSSN" && sind < BH_NM)
{ {
if (skey == "Mass") if (skey == "Mass")
mass_local[sind] = atof(sval.c_str()); Mass[sind] = atof(sval.c_str());
else if (skey == "Porgx") else if (skey == "Porgx")
Porg_here[sind * 3] = atof(sval.c_str()); Porg_here[sind * 3] = atof(sval.c_str());
else if (skey == "Porgy") else if (skey == "Porgy")
@@ -330,17 +316,17 @@ void bssnEScalar_class::Read_Ansorg()
else if (skey == "Porgz") else if (skey == "Porgz")
Porg_here[sind * 3 + 2] = atof(sval.c_str()); Porg_here[sind * 3 + 2] = atof(sval.c_str());
else if (skey == "Spinx") else if (skey == "Spinx")
spin_local[sind * 3] = atof(sval.c_str()); Spin[sind * 3] = atof(sval.c_str());
else if (skey == "Spiny") else if (skey == "Spiny")
spin_local[sind * 3 + 1] = atof(sval.c_str()); Spin[sind * 3 + 1] = atof(sval.c_str());
else if (skey == "Spinz") else if (skey == "Spinz")
spin_local[sind * 3 + 2] = atof(sval.c_str()); Spin[sind * 3 + 2] = atof(sval.c_str());
else if (skey == "Pmomx") else if (skey == "Pmomx")
pmom_local[sind * 3] = atof(sval.c_str()); Pmom[sind * 3] = atof(sval.c_str());
else if (skey == "Pmomy") else if (skey == "Pmomy")
pmom_local[sind * 3 + 1] = atof(sval.c_str()); Pmom[sind * 3 + 1] = atof(sval.c_str());
else if (skey == "Pmomz") else if (skey == "Pmomz")
pmom_local[sind * 3 + 2] = atof(sval.c_str()); Pmom[sind * 3 + 2] = atof(sval.c_str());
} }
} }
inf.close(); inf.close();
@@ -376,7 +362,7 @@ void bssnEScalar_class::Read_Ansorg()
cg->fgfs[Sfx0->sgfn], cg->fgfs[Sfy0->sgfn], cg->fgfs[Sfz0->sgfn], cg->fgfs[Sfx0->sgfn], cg->fgfs[Sfy0->sgfn], cg->fgfs[Sfz0->sgfn],
cg->fgfs[dtSfx0->sgfn], cg->fgfs[dtSfy0->sgfn], cg->fgfs[dtSfz0->sgfn], cg->fgfs[dtSfx0->sgfn], cg->fgfs[dtSfy0->sgfn], cg->fgfs[dtSfz0->sgfn],
cg->fgfs[Sphi0->sgfn], cg->fgfs[Spi0->sgfn], cg->fgfs[Sphi0->sgfn], cg->fgfs[Spi0->sgfn],
mass_local, Porg_here, pmom_local, spin_local, BH_NM); Mass, Porg_here, Pmom, Spin, BH_NM);
} }
if (BL == Pp->data->ble) if (BL == Pp->data->ble)
break; break;
@@ -418,7 +404,7 @@ void bssnEScalar_class::Read_Ansorg()
cg->fgfs[Sfx0->sgfn], cg->fgfs[Sfy0->sgfn], cg->fgfs[Sfz0->sgfn], cg->fgfs[Sfx0->sgfn], cg->fgfs[Sfy0->sgfn], cg->fgfs[Sfz0->sgfn],
cg->fgfs[dtSfx0->sgfn], cg->fgfs[dtSfy0->sgfn], cg->fgfs[dtSfz0->sgfn], cg->fgfs[dtSfx0->sgfn], cg->fgfs[dtSfy0->sgfn], cg->fgfs[dtSfz0->sgfn],
cg->fgfs[Sphi0->sgfn], cg->fgfs[Spi0->sgfn], cg->fgfs[Sphi0->sgfn], cg->fgfs[Spi0->sgfn],
mass_local, Porg_here, pmom_local, spin_local, BH_NM); Mass, Porg_here, Pmom, Spin, BH_NM);
} }
if (BL == Pp->data->ble) if (BL == Pp->data->ble)
break; break;
@@ -429,9 +415,6 @@ void bssnEScalar_class::Read_Ansorg()
#endif #endif
delete[] Porg_here; delete[] Porg_here;
delete[] pmom_local;
delete[] spin_local;
delete[] mass_local;
// dump read_in initial data // dump read_in initial data
// for(int lev=0;lev<GH->levels;lev++) Parallel::Dump_Data(GH->PatL[lev],StateList,0,PhysTime,dT); // for(int lev=0;lev<GH->levels;lev++) Parallel::Dump_Data(GH->PatL[lev],StateList,0,PhysTime,dT);
} }
@@ -472,9 +455,6 @@ void bssnEScalar_class::Read_Pablo()
} }
int BH_NM; int BH_NM;
double *Porg_here; double *Porg_here;
double *pmom_local;
double *spin_local;
double *mass_local;
// read parameter from file // read parameter from file
{ {
const int LEN = 256; const int LEN = 256;
@@ -516,9 +496,9 @@ void bssnEScalar_class::Read_Pablo()
} }
Porg_here = new double[3 * BH_NM]; Porg_here = new double[3 * BH_NM];
pmom_local = new double[3 * BH_NM]; Pmom = new double[3 * BH_NM];
spin_local = new double[3 * BH_NM]; Spin = new double[3 * BH_NM];
mass_local = new double[BH_NM]; Mass = new double[BH_NM];
// read parameter from file // read parameter from file
{ {
const int LEN = 256; const int LEN = 256;
@@ -553,7 +533,7 @@ void bssnEScalar_class::Read_Pablo()
if (sgrp == "BSSN" && sind < BH_NM) if (sgrp == "BSSN" && sind < BH_NM)
{ {
if (skey == "Mass") if (skey == "Mass")
mass_local[sind] = atof(sval.c_str()); Mass[sind] = atof(sval.c_str());
else if (skey == "Porgx") else if (skey == "Porgx")
Porg_here[sind * 3] = atof(sval.c_str()); Porg_here[sind * 3] = atof(sval.c_str());
else if (skey == "Porgy") else if (skey == "Porgy")
@@ -561,17 +541,17 @@ void bssnEScalar_class::Read_Pablo()
else if (skey == "Porgz") else if (skey == "Porgz")
Porg_here[sind * 3 + 2] = atof(sval.c_str()); Porg_here[sind * 3 + 2] = atof(sval.c_str());
else if (skey == "Spinx") else if (skey == "Spinx")
spin_local[sind * 3] = atof(sval.c_str()); Spin[sind * 3] = atof(sval.c_str());
else if (skey == "Spiny") else if (skey == "Spiny")
spin_local[sind * 3 + 1] = atof(sval.c_str()); Spin[sind * 3 + 1] = atof(sval.c_str());
else if (skey == "Spinz") else if (skey == "Spinz")
spin_local[sind * 3 + 2] = atof(sval.c_str()); Spin[sind * 3 + 2] = atof(sval.c_str());
else if (skey == "Pmomx") else if (skey == "Pmomx")
pmom_local[sind * 3] = atof(sval.c_str()); Pmom[sind * 3] = atof(sval.c_str());
else if (skey == "Pmomy") else if (skey == "Pmomy")
pmom_local[sind * 3 + 1] = atof(sval.c_str()); Pmom[sind * 3 + 1] = atof(sval.c_str());
else if (skey == "Pmomz") else if (skey == "Pmomz")
pmom_local[sind * 3 + 2] = atof(sval.c_str()); Pmom[sind * 3 + 2] = atof(sval.c_str());
} }
} }
inf.close(); inf.close();
@@ -618,7 +598,7 @@ void bssnEScalar_class::Read_Pablo()
cg->fgfs[Sfx0->sgfn], cg->fgfs[Sfy0->sgfn], cg->fgfs[Sfz0->sgfn], cg->fgfs[Sfx0->sgfn], cg->fgfs[Sfy0->sgfn], cg->fgfs[Sfz0->sgfn],
cg->fgfs[dtSfx0->sgfn], cg->fgfs[dtSfy0->sgfn], cg->fgfs[dtSfz0->sgfn], cg->fgfs[dtSfx0->sgfn], cg->fgfs[dtSfy0->sgfn], cg->fgfs[dtSfz0->sgfn],
cg->fgfs[Sphi0->sgfn], cg->fgfs[Spi0->sgfn], cg->fgfs[Sphi0->sgfn], cg->fgfs[Spi0->sgfn],
mass_local, Porg_here, pmom_local, spin_local, BH_NM); Mass, Porg_here, Pmom, Spin, BH_NM);
} }
if (BL == Pp->data->ble) if (BL == Pp->data->ble)
break; break;
@@ -682,7 +662,7 @@ void bssnEScalar_class::Read_Pablo()
cg->fgfs[Sfx0->sgfn], cg->fgfs[Sfy0->sgfn], cg->fgfs[Sfz0->sgfn], cg->fgfs[Sfx0->sgfn], cg->fgfs[Sfy0->sgfn], cg->fgfs[Sfz0->sgfn],
cg->fgfs[dtSfx0->sgfn], cg->fgfs[dtSfy0->sgfn], cg->fgfs[dtSfz0->sgfn], cg->fgfs[dtSfx0->sgfn], cg->fgfs[dtSfy0->sgfn], cg->fgfs[dtSfz0->sgfn],
cg->fgfs[Sphi0->sgfn], cg->fgfs[Spi0->sgfn], cg->fgfs[Sphi0->sgfn], cg->fgfs[Spi0->sgfn],
mass_local, Porg_here, pmom_local, spin_local, BH_NM); Mass, Porg_here, Pmom, Spin, BH_NM);
} }
if (BL == Pp->data->ble) if (BL == Pp->data->ble)
break; break;
@@ -706,9 +686,6 @@ void bssnEScalar_class::Read_Pablo()
#endif #endif
delete[] Porg_here; delete[] Porg_here;
delete[] pmom_local;
delete[] spin_local;
delete[] mass_local;
if (flag && myrank == 0) if (flag && myrank == 0)
MPI_Abort(MPI_COMM_WORLD, 1); MPI_Abort(MPI_COMM_WORLD, 1);
// dump read_in initial data // dump read_in initial data
@@ -762,7 +739,7 @@ void bssnEScalar_class::Step(int lev, int YN)
cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn]); cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn]);
#endif #endif
if (BSSN_ESCALAR_RHS(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2], if (f_compute_rhs_bssn_escalar(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn], cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn],
cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn],
cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn], cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
@@ -1016,8 +993,7 @@ void bssnEScalar_class::Step(int lev, int YN)
} }
#endif #endif
Parallel::AsyncSyncState async_pre; Parallel::Sync(GH->PatL[lev], SynchList_pre, Symmetry);
sync_predictor_start(lev, SynchList_pre, async_pre);
#ifdef WithShell #ifdef WithShell
if (lev == 0) if (lev == 0)
@@ -1036,7 +1012,6 @@ void bssnEScalar_class::Step(int lev, int YN)
} }
} }
#endif #endif
sync_predictor_finish(lev, async_pre, SynchList_pre);
// for black hole position // for black hole position
if (BH_num > 0 && lev == GH->levels - 1) if (BH_num > 0 && lev == GH->levels - 1)
@@ -1106,7 +1081,7 @@ void bssnEScalar_class::Step(int lev, int YN)
cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]); cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]);
#endif #endif
if (BSSN_ESCALAR_RHS(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2], if (f_compute_rhs_bssn_escalar(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
cg->fgfs[phi->sgfn], cg->fgfs[trK->sgfn], cg->fgfs[phi->sgfn], cg->fgfs[trK->sgfn],
cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn], cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn],
cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn], cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn],
@@ -1374,8 +1349,7 @@ void bssnEScalar_class::Step(int lev, int YN)
} }
#endif #endif
Parallel::AsyncSyncState async_cor; Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
sync_corrector_start(lev, SynchList_cor, async_cor);
#ifdef WithShell #ifdef WithShell
if (lev == 0) if (lev == 0)
@@ -1394,7 +1368,6 @@ void bssnEScalar_class::Step(int lev, int YN)
} }
} }
#endif #endif
sync_corrector_finish(lev, async_cor, SynchList_cor);
// for black hole position // for black hole position
if (BH_num > 0 && lev == GH->levels - 1) if (BH_num > 0 && lev == GH->levels - 1)
{ {
@@ -1862,11 +1835,8 @@ void bssnEScalar_class::AnalysisStuff_EScalar(int lev, double dT_lev)
//================================================================================================ //================================================================================================
void bssnEScalar_class::Interp_Constraint(bool infg) void bssnEScalar_class::Interp_Constraint()
{ {
if (!infg)
return;
// we do not support a_lev != 0 yet. // we do not support a_lev != 0 yet.
if (a_lev > 0) if (a_lev > 0)
return; return;
@@ -1888,7 +1858,7 @@ void bssnEScalar_class::Interp_Constraint(bool infg)
if (myrank == cg->rank) if (myrank == cg->rank)
{ {
if (lev > 0) if (lev > 0)
BSSN_ESCALAR_RHS(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2], f_compute_rhs_bssn_escalar(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn], cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn],
cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn],
cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn], cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
@@ -2108,7 +2078,7 @@ void bssnEScalar_class::Constraint_Out()
if (myrank == cg->rank) if (myrank == cg->rank)
{ {
if (lev > 0) if (lev > 0)
BSSN_ESCALAR_RHS(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2], f_compute_rhs_bssn_escalar(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn], cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn],
cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn],
cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn], cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],

View File

@@ -51,7 +51,7 @@ public:
void Compute_Psi4(int lev); void Compute_Psi4(int lev);
void Step(int lev, int YN); void Step(int lev, int YN);
void AnalysisStuff_EScalar(int lev, double dT_lev); void AnalysisStuff_EScalar(int lev, double dT_lev);
void Interp_Constraint(bool infg); void Interp_Constraint();
void Constraint_Out(); void Constraint_Out();
protected: protected:

File diff suppressed because it is too large Load Diff

View File

@@ -33,14 +33,6 @@ using namespace std;
extern void setpbh(int iBHN, double **iPBH, double *iMass, int rBHN); extern void setpbh(int iBHN, double **iPBH, double *iMass, int rBHN);
#ifndef BSSN_USE_TRANSFER_CACHE
#define BSSN_USE_TRANSFER_CACHE 1
#endif
#ifndef BSSN_USE_ESCALAR_C_KERNEL
#define BSSN_USE_ESCALAR_C_KERNEL 1
#endif
class bssn_class class bssn_class
{ {
public: public:
@@ -56,7 +48,6 @@ public:
double StartTime, TotalTime; double StartTime, TotalTime;
double AnasTime, DumpTime, d2DumpTime, CheckTime; double AnasTime, DumpTime, d2DumpTime, CheckTime;
double LastAnas, LastConsOut; double LastAnas, LastConsOut;
int *ConstraintRefreshLevels;
double Courant; double Courant;
double numepss, numepsb, numepsh; double numepss, numepsb, numepsh;
int Symmetry; int Symmetry;
@@ -139,11 +130,9 @@ public:
Parallel::SyncCache *sync_cache_cor; // per-level cache for corrector sync Parallel::SyncCache *sync_cache_cor; // per-level cache for corrector sync
Parallel::SyncCache *sync_cache_rp_coarse; // RestrictProlong sync on PatL[lev-1] Parallel::SyncCache *sync_cache_rp_coarse; // RestrictProlong sync on PatL[lev-1]
Parallel::SyncCache *sync_cache_rp_fine; // RestrictProlong sync on PatL[lev] Parallel::SyncCache *sync_cache_rp_fine; // RestrictProlong sync on PatL[lev]
Parallel::SyncCache *sync_cache_restrict; // cached Restrict in RestrictProlong
Parallel::SyncCache *sync_cache_outbd; // cached OutBdLow2Hi in RestrictProlong
monitor *ErrorMonitor, *Psi4Monitor, *BHMonitor, *MAPMonitor; monitor *ErrorMonitor, *Psi4Monitor, *BHMonitor, *MAPMonitor;
monitor *ConVMonitor, *TimingMonitor; monitor *ConVMonitor;
surface_integral *Waveshell; surface_integral *Waveshell;
checkpoint *CheckPoint; checkpoint *CheckPoint;
@@ -179,17 +168,6 @@ public:
void testOutBd(); void testOutBd();
bool check_Stdin_Abort(); bool check_Stdin_Abort();
bool use_transfer_cache() const;
void setup_transfer_caches();
void invalidate_transfer_caches();
void destroy_transfer_caches();
void sync_predictor_start(int lev, MyList<var> *VarList, Parallel::AsyncSyncState &async_state);
void sync_predictor_finish(int lev, Parallel::AsyncSyncState &async_state, MyList<var> *VarList);
void sync_corrector_start(int lev, MyList<var> *VarList, Parallel::AsyncSyncState &async_state);
void sync_corrector_finish(int lev, Parallel::AsyncSyncState &async_state, MyList<var> *VarList);
void sync_evolution(int lev, MyList<var> *VarList, Parallel::SyncCache *cache_array = 0);
void restrict_evolution(int lev, MyList<var> *src_var_list, MyList<var> *dst_var_list);
void outbdlow2hi_evolution(int lev, MyList<var> *src_var_list, MyList<var> *dst_var_list);
virtual void Setup_Initial_Data_Cao(); virtual void Setup_Initial_Data_Cao();
virtual void Setup_Initial_Data_Lousto(); virtual void Setup_Initial_Data_Lousto();

View File

@@ -1,323 +0,0 @@
#include "macrodef.h"
#include "bssn_rhs.h"
#include "share_func.h"
#include "tool.h"
#include <cstddef>
/*
* C version of the BSSN-EM RHS kernel — replaces empart.f90 + bssn_rhs.f90 for BSSN+Maxwell.
*
* Computes:
* 1. All metric and EM field derivatives
* 2. Physical metric, Christoffel-like terms
* 3. EM field RHS (E, B, Kpsi, Kphi)
* 4. Stress-energy tensor (rho, Si, Sij)
* 5. Calls f_compute_rhs_bssn (C BSSN RHS) with stress-energy
* 6. Advection + KO dissipation for EM fields
* 7. NaN check
*/
/* Hand out the next `count` doubles of a pre-allocated slab and advance the cursor. */
static inline double *em_rhs_take(double *&cursor, size_t count)
{
    double *slice = cursor;
    cursor += count;
    return slice;
}

/*
 * Right-hand side of the coupled BSSN + Maxwell system on one grid block.
 *
 * Inputs
 *   ex               grid extents {nx, ny, nz}; every field array holds nx*ny*nz doubles
 *   T, X, Y, Z       time and coordinate arrays, forwarded to the helper kernels
 *   chi, dxx, dyy, dzz, Lap
 *                    stored offset by one: the code adds ONE before using them
 *   gxy .. betaz, dtSf*, A??, Gam?, trK
 *                    remaining BSSN state, forwarded to f_compute_rhs_bssn
 *   Ex..Bz, Kpsi, Kphi
 *                    electromagnetic fields and the two damping scalars
 *   Jx, Jy, Jz, qchar
 *                    current and charge sources
 *   Symmetry, Lev, eps, co
 *                    forwarded to fderivs / lopsided / kodis / f_compute_rhs_bssn
 *
 * Outputs
 *   *_rhs            time derivatives of all evolved fields
 *   rho, S?, S??     EM stress-energy, filled here and then consumed by f_compute_rhs_bssn
 *   Gam???, R??, *_Res
 *                    filled by f_compute_rhs_bssn
 *
 * Returns 0 on success; non-zero if the workspace allocation fails, if
 * f_compute_rhs_bssn reports an error, or if an EM right-hand side is non-finite.
 */
int f_compute_rhs_bssn_em_c(int *ex, double &T,
                            double *X, double *Y, double *Z,
                            double *chi, double *trK,
                            double *dxx, double *gxy, double *gxz, double *dyy, double *gyz, double *dzz,
                            double *Axx, double *Axy, double *Axz, double *Ayy, double *Ayz, double *Azz,
                            double *Gamx, double *Gamy, double *Gamz,
                            double *Lap, double *betax, double *betay, double *betaz,
                            double *dtSfx, double *dtSfy, double *dtSfz,
                            double *Ex, double *Ey, double *Ez,
                            double *Bx, double *By, double *Bz,
                            double *Kpsi, double *Kphi,
                            double *Jx, double *Jy, double *Jz, double *qchar,
                            double *chi_rhs, double *trK_rhs,
                            double *gxx_rhs, double *gxy_rhs, double *gxz_rhs,
                            double *gyy_rhs, double *gyz_rhs, double *gzz_rhs,
                            double *Axx_rhs, double *Axy_rhs, double *Axz_rhs,
                            double *Ayy_rhs, double *Ayz_rhs, double *Azz_rhs,
                            double *Gamx_rhs, double *Gamy_rhs, double *Gamz_rhs,
                            double *Lap_rhs, double *betax_rhs, double *betay_rhs, double *betaz_rhs,
                            double *dtSfx_rhs, double *dtSfy_rhs, double *dtSfz_rhs,
                            double *Ex_rhs, double *Ey_rhs, double *Ez_rhs,
                            double *Bx_rhs, double *By_rhs, double *Bz_rhs,
                            double *Kpsi_rhs, double *Kphi_rhs,
                            double *rho, double *Sx, double *Sy, double *Sz,
                            double *Sxx, double *Sxy, double *Sxz,
                            double *Syy, double *Syz, double *Szz,
                            double *Gamxxx, double *Gamxxy, double *Gamxxz,
                            double *Gamxyy, double *Gamxyz, double *Gamxzz,
                            double *Gamyxx, double *Gamyxy, double *Gamyxz,
                            double *Gamyyy, double *Gamyyz, double *Gamyzz,
                            double *Gamzxx, double *Gamzxy, double *Gamzxz,
                            double *Gamzyy, double *Gamzyz, double *Gamzzz,
                            double *Rxx, double *Rxy, double *Rxz,
                            double *Ryy, double *Ryz, double *Rzz,
                            double *ham_Res, double *movx_Res, double *movy_Res, double *movz_Res,
                            double *Gmx_Res, double *Gmy_Res, double *Gmz_Res,
                            int &Symmetry, int &Lev, double &eps, int &co)
{
    const int nx = ex[0], ny = ex[1], nz = ex[2];
    const int all = nx * ny * nz;
    const size_t n = (size_t)all;

    const double ZEO = 0.0, ONE = 1.0, TWO = 2.0, FOUR = 4.0, EIT = 8.0;
    const double F3o2 = 1.5, PI = 3.14159265358979323846;
    const double SYM = 1.0, ANTI = -1.0;
    const double kappa = 1.0; /* damping coefficient applied to Kpsi and Kphi */

    /* Per-axis parity triples handed to lopsided/kodis. */
    const double SSS[3] = {SYM, SYM, SYM}, AAS[3] = {ANTI, ANTI, SYM};
    const double ASA[3] = {ANTI, SYM, ANTI}, SAA[3] = {SYM, ANTI, ANTI};
    const double ASS[3] = {ANTI, SYM, SYM}, SAS[3] = {SYM, ANTI, SYM};
    const double SSA[3] = {SYM, SYM, ANTI};

    /* ---- one slab for all 57 derivative arrays: one check, one free ---- */
    const size_t NTMP = 57;
    double *work = (double *)malloc(NTMP * n * sizeof(double));
    if (!work)
        return 1;
    double *cur = work;

    double *chix = em_rhs_take(cur, n), *chiy = em_rhs_take(cur, n), *chiz = em_rhs_take(cur, n);
    double *Exx = em_rhs_take(cur, n), *Exy = em_rhs_take(cur, n), *Exz = em_rhs_take(cur, n);
    double *Eyx = em_rhs_take(cur, n), *Eyy = em_rhs_take(cur, n), *Eyz = em_rhs_take(cur, n);
    double *Ezx = em_rhs_take(cur, n), *Ezy = em_rhs_take(cur, n), *Ezz = em_rhs_take(cur, n);
    double *Bxx = em_rhs_take(cur, n), *Bxy = em_rhs_take(cur, n), *Bxz = em_rhs_take(cur, n);
    double *Byx = em_rhs_take(cur, n), *Byy = em_rhs_take(cur, n), *Byz = em_rhs_take(cur, n);
    double *Bzx = em_rhs_take(cur, n), *Bzy = em_rhs_take(cur, n), *Bzz = em_rhs_take(cur, n);
    double *Kpsix = em_rhs_take(cur, n), *Kpsiy = em_rhs_take(cur, n), *Kpsiz = em_rhs_take(cur, n);
    double *Kphix = em_rhs_take(cur, n), *Kphiy = em_rhs_take(cur, n), *Kphiz = em_rhs_take(cur, n);
    double *Lapx = em_rhs_take(cur, n), *Lapy = em_rhs_take(cur, n), *Lapz = em_rhs_take(cur, n);
    double *betaxx = em_rhs_take(cur, n), *betaxy = em_rhs_take(cur, n), *betaxz = em_rhs_take(cur, n);
    double *betayx = em_rhs_take(cur, n), *betayy = em_rhs_take(cur, n), *betayz = em_rhs_take(cur, n);
    double *betazx = em_rhs_take(cur, n), *betazy = em_rhs_take(cur, n), *betazz = em_rhs_take(cur, n);
    double *gxxx = em_rhs_take(cur, n), *gxxy = em_rhs_take(cur, n), *gxxz = em_rhs_take(cur, n);
    double *gxyx = em_rhs_take(cur, n), *gxyy = em_rhs_take(cur, n), *gxyz = em_rhs_take(cur, n);
    double *gxzx = em_rhs_take(cur, n), *gxzy = em_rhs_take(cur, n), *gxzz = em_rhs_take(cur, n);
    double *gyyx = em_rhs_take(cur, n), *gyyy = em_rhs_take(cur, n), *gyyz = em_rhs_take(cur, n);
    double *gyzx = em_rhs_take(cur, n), *gyzy = em_rhs_take(cur, n), *gyzz = em_rhs_take(cur, n);
    double *gzzx = em_rhs_take(cur, n), *gzzy = em_rhs_take(cur, n), *gzzz = em_rhs_take(cur, n);

    /* ==== 1. First derivatives of gauge, metric and EM fields ==== */
    fderivs(ex, Lap, Lapx, Lapy, Lapz, X, Y, Z, SYM, SYM, SYM, Symmetry, Lev);
    fderivs(ex, betax, betaxx, betaxy, betaxz, X, Y, Z, ANTI, SYM, SYM, Symmetry, Lev);
    fderivs(ex, betay, betayx, betayy, betayz, X, Y, Z, SYM, ANTI, SYM, Symmetry, Lev);
    fderivs(ex, betaz, betazx, betazy, betazz, X, Y, Z, SYM, SYM, ANTI, Symmetry, Lev);
    fderivs(ex, chi, chix, chiy, chiz, X, Y, Z, SYM, SYM, SYM, Symmetry, Lev);
    fderivs(ex, dxx, gxxx, gxxy, gxxz, X, Y, Z, SYM, SYM, SYM, Symmetry, Lev);
    fderivs(ex, gxy, gxyx, gxyy, gxyz, X, Y, Z, ANTI, ANTI, SYM, Symmetry, Lev);
    fderivs(ex, gxz, gxzx, gxzy, gxzz, X, Y, Z, ANTI, SYM, ANTI, Symmetry, Lev);
    fderivs(ex, dyy, gyyx, gyyy, gyyz, X, Y, Z, SYM, SYM, SYM, Symmetry, Lev);
    fderivs(ex, gyz, gyzx, gyzy, gyzz, X, Y, Z, SYM, ANTI, ANTI, Symmetry, Lev);
    fderivs(ex, dzz, gzzx, gzzy, gzzz, X, Y, Z, SYM, SYM, SYM, Symmetry, Lev);

    fderivs(ex, Kpsi, Kpsix, Kpsiy, Kpsiz, X, Y, Z, SYM, SYM, SYM, Symmetry, Lev);
    fderivs(ex, Kphi, Kphix, Kphiy, Kphiz, X, Y, Z, SYM, SYM, SYM, Symmetry, Lev);
    fderivs(ex, Ex, Exx, Exy, Exz, X, Y, Z, ANTI, SYM, SYM, Symmetry, Lev);
    fderivs(ex, Ey, Eyx, Eyy, Eyz, X, Y, Z, SYM, ANTI, SYM, Symmetry, Lev);
    fderivs(ex, Ez, Ezx, Ezy, Ezz, X, Y, Z, SYM, SYM, ANTI, Symmetry, Lev);
    fderivs(ex, Bx, Bxx, Bxy, Bxz, X, Y, Z, SYM, ANTI, ANTI, Symmetry, Lev);
    fderivs(ex, By, Byx, Byy, Byz, X, Y, Z, ANTI, SYM, ANTI, Symmetry, Lev);
    fderivs(ex, Bz, Bzx, Bzy, Bzz, X, Y, Z, ANTI, ANTI, SYM, Symmetry, Lev);

    /* ==== 2. Point-wise EM right-hand sides and stress-energy ==== */
    const double F1o4PI = ONE / (FOUR * PI);
    for (size_t i = 0; i < n; ++i)
    {
        const double alpn1 = Lap[i] + ONE;
        const double chin1 = chi[i] + ONE;
        const double chi3o2 = sqrt(chin1) * chin1; /* (chi+1)^{3/2} */
        const double ichi = ONE / chin1;

        /* physical metric: conformal metric divided by (chi+1) */
        const double pgxx = (dxx[i] + ONE) * ichi;
        const double pgyy = (dyy[i] + ONE) * ichi;
        const double pgzz = (dzz[i] + ONE) * ichi;
        const double pgxy = gxy[i] * ichi;
        const double pgxz = gxz[i] * ichi;
        const double pgyz = gyz[i] * ichi;

        /* inverse physical metric via the cofactor formula */
        const double det = pgxx * pgyy * pgzz + pgxy * pgyz * pgxz + pgxz * pgxy * pgyz
                         - pgxz * pgyy * pgxz - pgxy * pgxy * pgzz - pgxx * pgyz * pgyz;
        const double idet = ONE / det;
        const double upxx = (pgyy * pgzz - pgyz * pgyz) * idet;
        const double upxy = -(pgxy * pgzz - pgyz * pgxz) * idet;
        const double upxz = (pgxy * pgyz - pgyy * pgxz) * idet;
        const double upyy = (pgxx * pgzz - pgxz * pgxz) * idet;
        const double upyz = -(pgxx * pgyz - pgxy * pgxz) * idet;
        const double upzz = (pgxx * pgyy - pgxy * pgxy) * idet;

        /* index-lowered fields: F_i = pg_{ij} F^j */
        const double BxL = pgxx * Bx[i] + pgxy * By[i] + pgxz * Bz[i];
        const double ByL = pgxy * Bx[i] + pgyy * By[i] + pgyz * Bz[i];
        const double BzL = pgxz * Bx[i] + pgyz * By[i] + pgzz * Bz[i];
        const double ExL = pgxx * Ex[i] + pgxy * Ey[i] + pgxz * Ez[i];
        const double EyL = pgxy * Ex[i] + pgyy * Ey[i] + pgyz * Ez[i];
        const double EzL = pgxz * Ex[i] + pgyz * Ey[i] + pgzz * Ez[i];

        /*
         * Curl of (alpha * B_lower), e.g. x-component = d_y(alpha B_z) - d_z(alpha B_y).
         * NOTE(review): the lapse-gradient term is weighted by alpn1*B_L (not B_L), and the
         * metric-derivative term uses the conformal-metric derivatives g??? without the
         * 1/(chi+1) chain-rule correction. Kept exactly as found; verify against empart.f90.
         */
        const double aBx = alpn1 * BxL, aBy = alpn1 * ByL, aBz = alpn1 * BzL;
        const double curlBx = (aBz * Lapy[i] + alpn1 * (pgxz * Bxy[i] + pgyz * Byy[i] + pgzz * Bzy[i]) + alpn1 * (Bx[i] * gxzy[i] + By[i] * gyzy[i] + Bz[i] * gzzy[i]))
                            - (aBy * Lapz[i] + alpn1 * (pgxy * Bxz[i] + pgyy * Byz[i] + pgyz * Bzz[i]) + alpn1 * (Bx[i] * gxyz[i] + By[i] * gyyz[i] + Bz[i] * gyzz[i]));
        const double curlBy = (aBx * Lapz[i] + alpn1 * (pgxx * Bxz[i] + pgxy * Byz[i] + pgxz * Bzz[i]) + alpn1 * (Bx[i] * gxxz[i] + By[i] * gxyz[i] + Bz[i] * gxzz[i]))
                            - (aBz * Lapx[i] + alpn1 * (pgxz * Bxx[i] + pgyz * Byx[i] + pgzz * Bzx[i]) + alpn1 * (Bx[i] * gxzx[i] + By[i] * gyzx[i] + Bz[i] * gzzx[i]));
        const double curlBz = (aBy * Lapx[i] + alpn1 * (pgxy * Bxx[i] + pgyy * Byx[i] + pgyz * Bzx[i]) + alpn1 * (Bx[i] * gxyx[i] + By[i] * gyyx[i] + Bz[i] * gyzx[i]))
                            - (aBx * Lapy[i] + alpn1 * (pgxx * Bxy[i] + pgxy * Byy[i] + pgxz * Bzy[i]) + alpn1 * (Bx[i] * gxxy[i] + By[i] * gxyy[i] + Bz[i] * gxzy[i]));

        /* Same construction for E (see the NOTE(review) above). */
        const double aEx = alpn1 * ExL, aEy = alpn1 * EyL, aEz = alpn1 * EzL;
        const double curlEx = (aEz * Lapy[i] + alpn1 * (pgxz * Exy[i] + pgyz * Eyy[i] + pgzz * Ezy[i]) + alpn1 * (Ex[i] * gxzy[i] + Ey[i] * gyzy[i] + Ez[i] * gzzy[i]))
                            - (aEy * Lapz[i] + alpn1 * (pgxy * Exz[i] + pgyy * Eyz[i] + pgyz * Ezz[i]) + alpn1 * (Ex[i] * gxyz[i] + Ey[i] * gyyz[i] + Ez[i] * gyzz[i]));
        const double curlEy = (aEx * Lapz[i] + alpn1 * (pgxx * Exz[i] + pgxy * Eyz[i] + pgxz * Ezz[i]) + alpn1 * (Ex[i] * gxxz[i] + Ey[i] * gxyz[i] + Ez[i] * gxzz[i]))
                            - (aEz * Lapx[i] + alpn1 * (pgxz * Exx[i] + pgyz * Eyx[i] + pgzz * Ezx[i]) + alpn1 * (Ex[i] * gxzx[i] + Ey[i] * gyzx[i] + Ez[i] * gzzx[i]));
        const double curlEz = (aEy * Lapx[i] + alpn1 * (pgxy * Exx[i] + pgyy * Eyx[i] + pgyz * Ezx[i]) + alpn1 * (Ex[i] * gxyx[i] + Ey[i] * gyyx[i] + Ez[i] * gyzx[i]))
                            - (aEx * Lapy[i] + alpn1 * (pgxx * Exy[i] + pgxy * Eyy[i] + pgxz * Ezy[i]) + alpn1 * (Ex[i] * gxxy[i] + Ey[i] * gxyy[i] + Ez[i] * gxzy[i]));

        /* Shift-gradient term F^j d_j beta^i. The beta^j d_j F^i advection is added in step 4. */
        const double EdBetax = Ex[i] * betaxx[i] + Ey[i] * betaxy[i] + Ez[i] * betaxz[i];
        const double EdBetay = Ex[i] * betayx[i] + Ey[i] * betayy[i] + Ez[i] * betayz[i];
        const double EdBetaz = Ex[i] * betazx[i] + Ey[i] * betazy[i] + Ez[i] * betazz[i];
        const double BdBetax = Bx[i] * betaxx[i] + By[i] * betaxy[i] + Bz[i] * betaxz[i];
        const double BdBetay = Bx[i] * betayx[i] + By[i] * betayy[i] + Bz[i] * betayz[i];
        const double BdBetaz = Bx[i] * betazx[i] + By[i] * betazy[i] + Bz[i] * betazz[i];

        /* gradients of the damping scalars, index raised with the inverse physical metric */
        const double gupKx = upxx * Kpsix[i] + upxy * Kpsiy[i] + upxz * Kpsiz[i];
        const double gupKy = upxy * Kpsix[i] + upyy * Kpsiy[i] + upyz * Kpsiz[i];
        const double gupKz = upxz * Kpsix[i] + upyz * Kpsiy[i] + upzz * Kpsiz[i];
        const double gupKphix = upxx * Kphix[i] + upxy * Kphiy[i] + upxz * Kphiz[i];
        const double gupKphiy = upxy * Kphix[i] + upyy * Kphiy[i] + upyz * Kphiz[i];
        const double gupKphiz = upxz * Kphix[i] + upyz * Kphiy[i] + upzz * Kphiz[i];

        Ex_rhs[i] = alpn1 * trK[i] * Ex[i] - EdBetax - FOUR * PI * alpn1 * Jx[i] - alpn1 * gupKx + chi3o2 * curlBx;
        Ey_rhs[i] = alpn1 * trK[i] * Ey[i] - EdBetay - FOUR * PI * alpn1 * Jy[i] - alpn1 * gupKy + chi3o2 * curlBy;
        Ez_rhs[i] = alpn1 * trK[i] * Ez[i] - EdBetaz - FOUR * PI * alpn1 * Jz[i] - alpn1 * gupKz + chi3o2 * curlBz;

        Bx_rhs[i] = alpn1 * trK[i] * Bx[i] - BdBetax - alpn1 * gupKphix - chi3o2 * curlEx;
        By_rhs[i] = alpn1 * trK[i] * By[i] - BdBetay - alpn1 * gupKphiy - chi3o2 * curlEy;
        Bz_rhs[i] = alpn1 * trK[i] * Bz[i] - BdBetaz - alpn1 * gupKphiz - chi3o2 * curlEz;

        /* Damping-scalar RHS: coordinate divergence minus a chi-gradient correction. */
        const double divE = Exx[i] + Eyy[i] + Ezz[i];
        const double divB = Bxx[i] + Byy[i] + Bzz[i];
        const double chiCont = F3o2 * ichi * (chix[i] * Ex[i] + chiy[i] * Ey[i] + chiz[i] * Ez[i]);
        Kpsi_rhs[i] = FOUR * PI * alpn1 * qchar[i] - alpn1 * kappa * Kpsi[i] - alpn1 * (divE - chiCont);
        Kphi_rhs[i] = -alpn1 * kappa * Kphi[i] - alpn1 * (divB - F3o2 * ichi * (chix[i] * Bx[i] + chiy[i] * By[i] + chiz[i] * Bz[i]));

        /* Stress-energy: energy density, momentum density (E x B), and stress tensor. */
        const double E2 = pgxx * Ex[i] * Ex[i] + pgyy * Ey[i] * Ey[i] + pgzz * Ez[i] * Ez[i]
                        + TWO * (pgxy * Ex[i] * Ey[i] + pgxz * Ex[i] * Ez[i] + pgyz * Ey[i] * Ez[i]);
        const double B2 = pgxx * Bx[i] * Bx[i] + pgyy * By[i] * By[i] + pgzz * Bz[i] * Bz[i]
                        + TWO * (pgxy * Bx[i] * By[i] + pgxz * Bx[i] * Bz[i] + pgyz * By[i] * Bz[i]);
        rho[i] = (E2 + B2) / (EIT * PI);

        const double ichi3o2 = ONE / chi3o2;
        Sx[i] = (Ey[i] * Bz[i] - Ez[i] * By[i]) * F1o4PI * ichi3o2;
        Sy[i] = (Ez[i] * Bx[i] - Ex[i] * Bz[i]) * F1o4PI * ichi3o2;
        Sz[i] = (Ex[i] * By[i] - Ey[i] * Bx[i]) * F1o4PI * ichi3o2;

        Sxx[i] = rho[i] * pgxx - (ExL * ExL + BxL * BxL) * F1o4PI;
        Sxy[i] = rho[i] * pgxy - (ExL * EyL + BxL * ByL) * F1o4PI;
        Sxz[i] = rho[i] * pgxz - (ExL * EzL + BxL * BzL) * F1o4PI;
        Syy[i] = rho[i] * pgyy - (EyL * EyL + ByL * ByL) * F1o4PI;
        Syz[i] = rho[i] * pgyz - (EyL * EzL + ByL * BzL) * F1o4PI;
        Szz[i] = rho[i] * pgzz - (EzL * EzL + BzL * BzL) * F1o4PI;
    }

    /* ==== 3. Geometry RHS, sourced by the EM stress-energy computed above ==== */
    int gont = f_compute_rhs_bssn(ex, T, X, Y, Z,
                                  chi, trK, dxx, gxy, gxz, dyy, gyz, dzz,
                                  Axx, Axy, Axz, Ayy, Ayz, Azz,
                                  Gamx, Gamy, Gamz, Lap, betax, betay, betaz, dtSfx, dtSfy, dtSfz,
                                  chi_rhs, trK_rhs,
                                  gxx_rhs, gxy_rhs, gxz_rhs, gyy_rhs, gyz_rhs, gzz_rhs,
                                  Axx_rhs, Axy_rhs, Axz_rhs, Ayy_rhs, Ayz_rhs, Azz_rhs,
                                  Gamx_rhs, Gamy_rhs, Gamz_rhs, Lap_rhs, betax_rhs, betay_rhs, betaz_rhs,
                                  dtSfx_rhs, dtSfy_rhs, dtSfz_rhs,
                                  rho, Sx, Sy, Sz, Sxx, Sxy, Sxz, Syy, Syz, Szz,
                                  Gamxxx, Gamxxy, Gamxxz, Gamxyy, Gamxyz, Gamxzz,
                                  Gamyxx, Gamyxy, Gamyxz, Gamyyy, Gamyyz, Gamyzz,
                                  Gamzxx, Gamzxy, Gamzxz, Gamzyy, Gamzyz, Gamzzz,
                                  Rxx, Rxy, Rxz, Ryy, Ryz, Rzz,
                                  ham_Res, movx_Res, movy_Res, movz_Res,
                                  Gmx_Res, Gmy_Res, Gmz_Res,
                                  Symmetry, Lev, eps, co);

    if (!gont)
    {
        /* ==== 4. Shift advection of the EM fields (accumulated into *_rhs) ==== */
        lopsided(ex, X, Y, Z, Kpsi, Kpsi_rhs, betax, betay, betaz, Symmetry, SSS);
        lopsided(ex, X, Y, Z, Kphi, Kphi_rhs, betax, betay, betaz, Symmetry, SSS);
        lopsided(ex, X, Y, Z, Ex, Ex_rhs, betax, betay, betaz, Symmetry, ASS);
        lopsided(ex, X, Y, Z, Ey, Ey_rhs, betax, betay, betaz, Symmetry, SAS);
        lopsided(ex, X, Y, Z, Ez, Ez_rhs, betax, betay, betaz, Symmetry, SSA);
        lopsided(ex, X, Y, Z, Bx, Bx_rhs, betax, betay, betaz, Symmetry, SAA);
        lopsided(ex, X, Y, Z, By, By_rhs, betax, betay, betaz, Symmetry, ASA);
        lopsided(ex, X, Y, Z, Bz, Bz_rhs, betax, betay, betaz, Symmetry, AAS);

        /* ==== 5. Kreiss-Oliger dissipation, only when a positive strength is given ==== */
        if (eps > ZEO)
        {
            kodis(ex, X, Y, Z, Kpsi, Kpsi_rhs, SSS, Symmetry, eps);
            kodis(ex, X, Y, Z, Kphi, Kphi_rhs, SSS, Symmetry, eps);
            kodis(ex, X, Y, Z, Ex, Ex_rhs, ASS, Symmetry, eps);
            kodis(ex, X, Y, Z, Ey, Ey_rhs, SAS, Symmetry, eps);
            kodis(ex, X, Y, Z, Ez, Ez_rhs, SSA, Symmetry, eps);
            kodis(ex, X, Y, Z, Bx, Bx_rhs, SAA, Symmetry, eps);
            kodis(ex, X, Y, Z, By, By_rhs, ASA, Symmetry, eps);
            kodis(ex, X, Y, Z, Bz, Bz_rhs, AAS, Symmetry, eps);
        }

        /* ==== 6. Flag the block if the summed EM RHS is NaN/Inf at any point ==== */
        for (size_t i = 0; i < n; ++i)
        {
            if (!isfinite(Ex_rhs[i] + Ey_rhs[i] + Ez_rhs[i] + Bx_rhs[i] + By_rhs[i] + Bz_rhs[i] + Kpsi_rhs[i] + Kphi_rhs[i]))
            {
                gont = 1;
                break;
            }
        }
    }

    free(work);
    return gont;
}

View File

@@ -1,169 +0,0 @@
#include "macrodef.h"
#include "bssn_rhs.h"
#include "share_func.h"
#include "tool.h"
#include <vector>
namespace
{
// Reuse the temporary workspace across block calls to avoid repeated heap churn
// in the EScalar wrapper. MPI ranks execute this path sequentially, so a single
// process-local buffer is sufficient here.
// NOTE(review): this is a file-local global with no locking, so the "sequential"
// assumption above must also hold for any threads running inside one rank
// (e.g. OpenMP regions) -- confirm against the callers.
std::vector<double> g_escalar_tmp_store;
}
// Map the C-side name f_frpotential onto the external symbol name selected by
// the fortran1/fortran2/fortran3 build macros: lowercase, uppercase, or
// lowercase with a trailing underscore.
#ifdef fortran1
#define f_frpotential frpotential
#endif
#ifdef fortran2
#define f_frpotential FRPOTENTIAL
#endif
#ifdef fortran3
#define f_frpotential frpotential_
#endif
// External potential routine with C linkage. In this file it is called as
// f_frpotential(ex, Sphi, V, dVdSphi): grid extents, the scalar field, and two
// output arrays (presumably V(Sphi) and dV/dSphi -- confirm against the callee).
extern "C"
{
void f_frpotential(int *, double *, double *, double *);
}
int f_compute_rhs_bssn_escalar_c(int *ex, double &T,
double *X, double *Y, double *Z,
double *chi, double *trK,
double *dxx, double *gxy, double *gxz, double *dyy, double *gyz, double *dzz,
double *Axx, double *Axy, double *Axz, double *Ayy, double *Ayz, double *Azz,
double *Gamx, double *Gamy, double *Gamz,
double *Lap, double *betax, double *betay, double *betaz,
double *dtSfx, double *dtSfy, double *dtSfz,
double *Sphi, double *Spi,
double *chi_rhs, double *trK_rhs,
double *gxx_rhs, double *gxy_rhs, double *gxz_rhs, double *gyy_rhs, double *gyz_rhs, double *gzz_rhs,
double *Axx_rhs, double *Axy_rhs, double *Axz_rhs, double *Ayy_rhs, double *Ayz_rhs, double *Azz_rhs,
double *Gamx_rhs, double *Gamy_rhs, double *Gamz_rhs,
double *Lap_rhs, double *betax_rhs, double *betay_rhs, double *betaz_rhs,
double *dtSfx_rhs, double *dtSfy_rhs, double *dtSfz_rhs,
double *Sphi_rhs, double *Spi_rhs,
double *rho, double *Sx, double *Sy, double *Sz,
double *Sxx, double *Sxy, double *Sxz, double *Syy, double *Syz, double *Szz,
double *Gamxxx, double *Gamxxy, double *Gamxxz, double *Gamxyy, double *Gamxyz, double *Gamxzz,
double *Gamyxx, double *Gamyxy, double *Gamyxz, double *Gamyyy, double *Gamyyz, double *Gamyzz,
double *Gamzxx, double *Gamzxy, double *Gamzxz, double *Gamzyy, double *Gamzyz, double *Gamzzz,
double *Rxx, double *Rxy, double *Rxz, double *Ryy, double *Ryz, double *Rzz,
double *ham_Res, double *movx_Res, double *movy_Res, double *movz_Res,
double *Gmx_Res, double *Gmy_Res, double *Gmz_Res,
int &Symmetry, int &Lev, double &eps, int &co)
{
const int nx = ex[0], ny = ex[1], nz = ex[2];
const int all = nx * ny * nz;
const size_t workspace_size = size_t(all) * 17;
if (g_escalar_tmp_store.size() < workspace_size)
g_escalar_tmp_store.resize(workspace_size);
double *tmp_ptr = g_escalar_tmp_store.data();
auto alloc_tmp = [&](int n = 1) -> double *
{
double *ptr = tmp_ptr;
tmp_ptr += size_t(all) * n;
return ptr;
};
double *chix = alloc_tmp(), *chiy = alloc_tmp(), *chiz = alloc_tmp();
double *Kx = alloc_tmp(), *Ky = alloc_tmp(), *Kz = alloc_tmp();
double *fxx = alloc_tmp(), *fxy = alloc_tmp(), *fxz = alloc_tmp();
double *fyy = alloc_tmp(), *fyz = alloc_tmp(), *fzz = alloc_tmp();
double *Lapx = alloc_tmp(), *Lapy = alloc_tmp(), *Lapz = alloc_tmp();
double *V = alloc_tmp(), *dVdSphi = alloc_tmp();
const double ZEO = 0.0, ONE = 1.0, TWO = 2.0, HALF = 0.5;
const double SSS[3] = {1.0, 1.0, 1.0};
fderivs(ex, chi, chix, chiy, chiz, X, Y, Z, 1.0, 1.0, 1.0, Symmetry, Lev);
fderivs(ex, Lap, Lapx, Lapy, Lapz, X, Y, Z, 1.0, 1.0, 1.0, Symmetry, Lev);
fderivs(ex, Sphi, Kx, Ky, Kz, X, Y, Z, 1.0, 1.0, 1.0, Symmetry, Lev);
fdderivs(ex, Sphi, fxx, fxy, fxz, fyy, fyz, fzz, X, Y, Z, 1.0, 1.0, 1.0, Symmetry, Lev);
f_frpotential(ex, Sphi, V, dVdSphi);
for (int i = 0; i < all; ++i)
{
const double alpn1 = Lap[i] + ONE;
const double chin1 = chi[i] + ONE;
const double gxx = dxx[i] + ONE;
const double gyy = dyy[i] + ONE;
const double gzz = dzz[i] + ONE;
const double det = gxx * gyy * gzz + gxy[i] * gyz[i] * gxz[i] + gxz[i] * gxy[i] * gyz[i]
- gxz[i] * gyy * gxz[i] - gxy[i] * gxy[i] * gzz - gxx * gyz[i] * gyz[i];
const double gupxx = (gyy * gzz - gyz[i] * gyz[i]) / det;
const double gupxy = -(gxy[i] * gzz - gyz[i] * gxz[i]) / det;
const double gupxz = (gxy[i] * gyz[i] - gyy * gxz[i]) / det;
const double gupyy = (gxx * gzz - gxz[i] * gxz[i]) / det;
const double gupyz = -(gxx * gyz[i] - gxy[i] * gxz[i]) / det;
const double gupzz = (gxx * gyy - gxy[i] * gxy[i]) / det;
Sphi_rhs[i] = alpn1 * Spi[i];
Spi_rhs[i] = gupxx * fxx[i] + gupyy * fyy[i] + gupzz * fzz[i]
+ TWO * (gupxy * fxy[i] + gupxz * fxz[i] + gupyz * fyz[i])
- ((Gamx[i] + (gupxx * chix[i] + gupxy * chiy[i] + gupxz * chiz[i]) / TWO / chin1) * Kx[i]
+ (Gamy[i] + (gupxy * chix[i] + gupyy * chiy[i] + gupyz * chiz[i]) / TWO / chin1) * Ky[i]
+ (Gamz[i] + (gupxz * chix[i] + gupyz * chiy[i] + gupzz * chiz[i]) / TWO / chin1) * Kz[i]);
Spi_rhs[i] = Spi_rhs[i] * alpn1
+ gupxx * Lapx[i] * Kx[i] + gupxy * Lapx[i] * Ky[i] + gupxz * Lapx[i] * Kz[i]
+ gupxy * Lapy[i] * Kx[i] + gupyy * Lapy[i] * Ky[i] + gupyz * Lapy[i] * Kz[i]
+ gupxz * Lapz[i] * Kx[i] + gupyz * Lapz[i] * Ky[i] + gupzz * Lapz[i] * Kz[i];
Spi_rhs[i] = Spi_rhs[i] * chin1 + alpn1 * (trK[i] * Spi[i] - dVdSphi[i]);
rho[i] = chin1 * ((gupxx * Kx[i] * Kx[i] + gupyy * Ky[i] * Ky[i] + gupzz * Kz[i] * Kz[i]) * HALF
+ gupxy * Kx[i] * Ky[i] + gupxz * Kx[i] * Kz[i] + gupyz * Ky[i] * Kz[i])
+ Spi[i] * Spi[i] * HALF + V[i];
Sx[i] = -Spi[i] * Kx[i];
Sy[i] = -Spi[i] * Ky[i];
Sz[i] = -Spi[i] * Kz[i];
const double pressure = (rho[i] - Spi[i] * Spi[i]) / chin1;
Sxx[i] = Kx[i] * Kx[i] - pressure * gxx;
Sxy[i] = Kx[i] * Ky[i] - pressure * gxy[i];
Sxz[i] = Kx[i] * Kz[i] - pressure * gxz[i];
Syy[i] = Ky[i] * Ky[i] - pressure * gyy;
Syz[i] = Ky[i] * Kz[i] - pressure * gyz[i];
Szz[i] = Kz[i] * Kz[i] - pressure * gzz;
}
if (f_compute_rhs_bssn(ex, T, X, Y, Z,
chi, trK,
dxx, gxy, gxz, dyy, gyz, dzz,
Axx, Axy, Axz, Ayy, Ayz, Azz,
Gamx, Gamy, Gamz,
Lap, betax, betay, betaz,
dtSfx, dtSfy, dtSfz,
chi_rhs, trK_rhs,
gxx_rhs, gxy_rhs, gxz_rhs, gyy_rhs, gyz_rhs, gzz_rhs,
Axx_rhs, Axy_rhs, Axz_rhs, Ayy_rhs, Ayz_rhs, Azz_rhs,
Gamx_rhs, Gamy_rhs, Gamz_rhs,
Lap_rhs, betax_rhs, betay_rhs, betaz_rhs,
dtSfx_rhs, dtSfy_rhs, dtSfz_rhs,
rho, Sx, Sy, Sz,
Sxx, Sxy, Sxz, Syy, Syz, Szz,
Gamxxx, Gamxxy, Gamxxz, Gamxyy, Gamxyz, Gamxzz,
Gamyxx, Gamyxy, Gamyxz, Gamyyy, Gamyyz, Gamyzz,
Gamzxx, Gamzxy, Gamzxz, Gamzyy, Gamzyz, Gamzzz,
Rxx, Rxy, Rxz, Ryy, Ryz, Rzz,
ham_Res, movx_Res, movy_Res, movz_Res,
Gmx_Res, Gmy_Res, Gmz_Res,
Symmetry, Lev, eps, co))
return 1;
lopsided_kodis(ex, X, Y, Z, Sphi, Sphi_rhs, betax, betay, betaz, Symmetry, SSS, eps);
lopsided_kodis(ex, X, Y, Z, Spi, Spi_rhs, betax, betay, betaz, Symmetry, SSS, eps);
for (int i = 0; i < all; ++i)
{
if (Sphi_rhs[i] != Sphi_rhs[i] || Spi_rhs[i] != Spi_rhs[i] || rho[i] != rho[i])
return 1;
}
return 0;
}

View File

@@ -62,7 +62,6 @@
real*8, dimension(ex(1),ex(2),ex(3)),intent(inout) :: Gmx_Res, Gmy_Res, Gmz_Res real*8, dimension(ex(1),ex(2),ex(3)),intent(inout) :: Gmx_Res, Gmy_Res, Gmz_Res
! gont = 0: success; gont = 1: something wrong ! gont = 0: success; gont = 1: something wrong
integer::gont integer::gont
integer :: i,j,k
!~~~~~~> Other variables: !~~~~~~> Other variables:
@@ -86,13 +85,6 @@
real*8,dimension(3) ::SSS,AAS,ASA,SAA,ASS,SAS,SSA real*8,dimension(3) ::SSS,AAS,ASA,SAA,ASS,SAS,SSA
real*8 :: dX, dY, dZ, PI real*8 :: dX, dY, dZ, PI
real*8 :: divb_loc,det_loc
real*8 :: gupxx_loc,gupxy_loc,gupxz_loc,gupyy_loc,gupyz_loc,gupzz_loc
real*8 :: Rxx_loc,Rxy_loc,Rxz_loc,Ryy_loc,Ryz_loc,Rzz_loc
real*8 :: fxx_loc,fxy_loc,fxz_loc
real*8 :: Gamxa_loc,Gamya_loc,Gamza_loc
real*8 :: f_loc,chin_loc
real*8 :: l_fxx,l_fxy,l_fxz,l_fyy,l_fyz,l_fzz,S_loc
real*8, parameter :: ZEO = 0.d0,ONE = 1.D0, TWO = 2.D0, FOUR = 4.D0 real*8, parameter :: ZEO = 0.d0,ONE = 1.D0, TWO = 2.D0, FOUR = 4.D0
real*8, parameter :: EIGHT = 8.D0, HALF = 0.5D0, THR = 3.d0 real*8, parameter :: EIGHT = 8.D0, HALF = 0.5D0, THR = 3.d0
real*8, parameter :: SYM = 1.D0, ANTI= - 1.D0 real*8, parameter :: SYM = 1.D0, ANTI= - 1.D0
@@ -105,7 +97,7 @@
#endif #endif
#if (GAUGE == 6 || GAUGE == 7) #if (GAUGE == 6 || GAUGE == 7)
integer :: BHN integer :: BHN,i,j,k
real*8, dimension(9) :: Porg real*8, dimension(9) :: Porg
real*8, dimension(3) :: Mass real*8, dimension(3) :: Mass
real*8 :: r1,r2,M,A,w1,w2,C1,C2 real*8 :: r1,r2,M,A,w1,w2,C1,C2
@@ -153,24 +145,22 @@
dY = Y(2) - Y(1) dY = Y(2) - Y(1)
dZ = Z(2) - Z(1) dZ = Z(2) - Z(1)
do k=1,ex(3) alpn1 = Lap + ONE
do j=1,ex(2) chin1 = chi + ONE
do i=1,ex(1) gxx = dxx + ONE
alpn1(i,j,k) = Lap(i,j,k) + ONE gyy = dyy + ONE
chin1(i,j,k) = chi(i,j,k) + ONE gzz = dzz + ONE
gxx(i,j,k) = dxx(i,j,k) + ONE
gyy(i,j,k) = dyy(i,j,k) + ONE
gzz(i,j,k) = dzz(i,j,k) + ONE
enddo
enddo
enddo
call fderivs(ex,betax,betaxx,betaxy,betaxz,X,Y,Z,ANTI, SYM, SYM,Symmetry,Lev) call fderivs(ex,betax,betaxx,betaxy,betaxz,X,Y,Z,ANTI, SYM, SYM,Symmetry,Lev)
call fderivs(ex,betay,betayx,betayy,betayz,X,Y,Z, SYM,ANTI, SYM,Symmetry,Lev) call fderivs(ex,betay,betayx,betayy,betayz,X,Y,Z, SYM,ANTI, SYM,Symmetry,Lev)
call fderivs(ex,betaz,betazx,betazy,betazz,X,Y,Z, SYM, SYM,ANTI,Symmetry,Lev) call fderivs(ex,betaz,betazx,betazy,betazz,X,Y,Z, SYM, SYM,ANTI,Symmetry,Lev)
div_beta = betaxx + betayy + betazz
call fderivs(ex,chi,chix,chiy,chiz,X,Y,Z,SYM,SYM,SYM,symmetry,Lev) call fderivs(ex,chi,chix,chiy,chiz,X,Y,Z,SYM,SYM,SYM,symmetry,Lev)
chi_rhs = F2o3 *chin1*( alpn1 * trK - div_beta ) !rhs for chi
call fderivs(ex,dxx,gxxx,gxxy,gxxz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev) call fderivs(ex,dxx,gxxx,gxxy,gxxz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev)
call fderivs(ex,gxy,gxyx,gxyy,gxyz,X,Y,Z,ANTI,ANTI,SYM ,Symmetry,Lev) call fderivs(ex,gxy,gxyx,gxyy,gxyz,X,Y,Z,ANTI,ANTI,SYM ,Symmetry,Lev)
call fderivs(ex,gxz,gxzx,gxzy,gxzz,X,Y,Z,ANTI,SYM ,ANTI,Symmetry,Lev) call fderivs(ex,gxz,gxzx,gxzy,gxzz,X,Y,Z,ANTI,SYM ,ANTI,Symmetry,Lev)
@@ -178,179 +168,151 @@
call fderivs(ex,gyz,gyzx,gyzy,gyzz,X,Y,Z,SYM ,ANTI,ANTI,Symmetry,Lev) call fderivs(ex,gyz,gyzx,gyzy,gyzz,X,Y,Z,SYM ,ANTI,ANTI,Symmetry,Lev)
call fderivs(ex,dzz,gzzx,gzzy,gzzz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev) call fderivs(ex,dzz,gzzx,gzzy,gzzz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev)
do k=1,ex(3) gxx_rhs = - TWO * alpn1 * Axx - F2o3 * gxx * div_beta + &
do j=1,ex(2) TWO *( gxx * betaxx + gxy * betayx + gxz * betazx)
do i=1,ex(1)
divb_loc = betaxx(i,j,k) + betayy(i,j,k) + betazz(i,j,k)
div_beta(i,j,k) = divb_loc
chi_rhs(i,j,k) = F2o3 * chin1(i,j,k) * (alpn1(i,j,k) * trK(i,j,k) - divb_loc) gyy_rhs = - TWO * alpn1 * Ayy - F2o3 * gyy * div_beta + &
TWO *( gxy * betaxy + gyy * betayy + gyz * betazy)
gxx_rhs(i,j,k) = - TWO * alpn1(i,j,k) * Axx(i,j,k) - F2o3 * gxx(i,j,k) * divb_loc + & gzz_rhs = - TWO * alpn1 * Azz - F2o3 * gzz * div_beta + &
TWO * ( gxx(i,j,k) * betaxx(i,j,k) + gxy(i,j,k) * betayx(i,j,k) + gxz(i,j,k) * betazx(i,j,k) ) TWO *( gxz * betaxz + gyz * betayz + gzz * betazz)
gyy_rhs(i,j,k) = - TWO * alpn1(i,j,k) * Ayy(i,j,k) - F2o3 * gyy(i,j,k) * divb_loc + & gxy_rhs = - TWO * alpn1 * Axy + F1o3 * gxy * div_beta + &
TWO * ( gxy(i,j,k) * betaxy(i,j,k) + gyy(i,j,k) * betayy(i,j,k) + gyz(i,j,k) * betazy(i,j,k) ) gxx * betaxy + gxz * betazy + &
gyy * betayx + gyz * betazx &
- gxy * betazz
gzz_rhs(i,j,k) = - TWO * alpn1(i,j,k) * Azz(i,j,k) - F2o3 * gzz(i,j,k) * divb_loc + & gyz_rhs = - TWO * alpn1 * Ayz + F1o3 * gyz * div_beta + &
TWO * ( gxz(i,j,k) * betaxz(i,j,k) + gyz(i,j,k) * betayz(i,j,k) + gzz(i,j,k) * betazz(i,j,k) ) gxy * betaxz + gyy * betayz + &
gxz * betaxy + gzz * betazy &
- gyz * betaxx
gxy_rhs(i,j,k) = - TWO * alpn1(i,j,k) * Axy(i,j,k) + F1o3 * gxy(i,j,k) * divb_loc + & gxz_rhs = - TWO * alpn1 * Axz + F1o3 * gxz * div_beta + &
gxx(i,j,k) * betaxy(i,j,k) + gxz(i,j,k) * betazy(i,j,k) + gyy(i,j,k) * betayx(i,j,k) + & gxx * betaxz + gxy * betayz + &
gyz(i,j,k) * betazx(i,j,k) - gxy(i,j,k) * betazz(i,j,k) gyz * betayx + gzz * betazx &
- gxz * betayy !rhs for gij
gyz_rhs(i,j,k) = - TWO * alpn1(i,j,k) * Ayz(i,j,k) + F1o3 * gyz(i,j,k) * divb_loc + & ! invert tilted metric
gxy(i,j,k) * betaxz(i,j,k) + gyy(i,j,k) * betayz(i,j,k) + gxz(i,j,k) * betaxy(i,j,k) + & gupzz = gxx * gyy * gzz + gxy * gyz * gxz + gxz * gxy * gyz - &
gzz(i,j,k) * betazy(i,j,k) - gyz(i,j,k) * betaxx(i,j,k) gxz * gyy * gxz - gxy * gxy * gzz - gxx * gyz * gyz
gupxx = ( gyy * gzz - gyz * gyz ) / gupzz
gupxy = - ( gxy * gzz - gyz * gxz ) / gupzz
gupxz = ( gxy * gyz - gyy * gxz ) / gupzz
gupyy = ( gxx * gzz - gxz * gxz ) / gupzz
gupyz = - ( gxx * gyz - gxy * gxz ) / gupzz
gupzz = ( gxx * gyy - gxy * gxy ) / gupzz
gxz_rhs(i,j,k) = - TWO * alpn1(i,j,k) * Axz(i,j,k) + F1o3 * gxz(i,j,k) * divb_loc + & if(co == 0)then
gxx(i,j,k) * betaxz(i,j,k) + gxy(i,j,k) * betayz(i,j,k) + gyz(i,j,k) * betayx(i,j,k) + & ! Gam^i_Res = Gam^i + gup^ij_,j
gzz(i,j,k) * betazx(i,j,k) - gxz(i,j,k) * betayy(i,j,k) Gmx_Res = Gamx - (gupxx*(gupxx*gxxx+gupxy*gxyx+gupxz*gxzx)&
+gupxy*(gupxx*gxyx+gupxy*gyyx+gupxz*gyzx)&
+gupxz*(gupxx*gxzx+gupxy*gyzx+gupxz*gzzx)&
+gupxx*(gupxy*gxxy+gupyy*gxyy+gupyz*gxzy)&
+gupxy*(gupxy*gxyy+gupyy*gyyy+gupyz*gyzy)&
+gupxz*(gupxy*gxzy+gupyy*gyzy+gupyz*gzzy)&
+gupxx*(gupxz*gxxz+gupyz*gxyz+gupzz*gxzz)&
+gupxy*(gupxz*gxyz+gupyz*gyyz+gupzz*gyzz)&
+gupxz*(gupxz*gxzz+gupyz*gyzz+gupzz*gzzz))
Gmy_Res = Gamy - (gupxx*(gupxy*gxxx+gupyy*gxyx+gupyz*gxzx)&
+gupxy*(gupxy*gxyx+gupyy*gyyx+gupyz*gyzx)&
+gupxz*(gupxy*gxzx+gupyy*gyzx+gupyz*gzzx)&
+gupxy*(gupxy*gxxy+gupyy*gxyy+gupyz*gxzy)&
+gupyy*(gupxy*gxyy+gupyy*gyyy+gupyz*gyzy)&
+gupyz*(gupxy*gxzy+gupyy*gyzy+gupyz*gzzy)&
+gupxy*(gupxz*gxxz+gupyz*gxyz+gupzz*gxzz)&
+gupyy*(gupxz*gxyz+gupyz*gyyz+gupzz*gyzz)&
+gupyz*(gupxz*gxzz+gupyz*gyzz+gupzz*gzzz))
Gmz_Res = Gamz - (gupxx*(gupxz*gxxx+gupyz*gxyx+gupzz*gxzx)&
+gupxy*(gupxz*gxyx+gupyz*gyyx+gupzz*gyzx)&
+gupxz*(gupxz*gxzx+gupyz*gyzx+gupzz*gzzx)&
+gupxy*(gupxz*gxxy+gupyz*gxyy+gupzz*gxzy)&
+gupyy*(gupxz*gxyy+gupyz*gyyy+gupzz*gyzy)&
+gupyz*(gupxz*gxzy+gupyz*gyzy+gupzz*gzzy)&
+gupxz*(gupxz*gxxz+gupyz*gxyz+gupzz*gxzz)&
+gupyz*(gupxz*gxyz+gupyz*gyyz+gupzz*gyzz)&
+gupzz*(gupxz*gxzz+gupyz*gyzz+gupzz*gzzz))
endif
det_loc = gxx(i,j,k) * gyy(i,j,k) * gzz(i,j,k) + gxy(i,j,k) * gyz(i,j,k) * gxz(i,j,k) + & ! second kind of connection
gxz(i,j,k) * gxy(i,j,k) * gyz(i,j,k) - gxz(i,j,k) * gyy(i,j,k) * gxz(i,j,k) - & Gamxxx =HALF*( gupxx*gxxx + gupxy*(TWO*gxyx - gxxy ) + gupxz*(TWO*gxzx - gxxz ))
gxy(i,j,k) * gxy(i,j,k) * gzz(i,j,k) - gxx(i,j,k) * gyz(i,j,k) * gyz(i,j,k) Gamyxx =HALF*( gupxy*gxxx + gupyy*(TWO*gxyx - gxxy ) + gupyz*(TWO*gxzx - gxxz ))
gupxx_loc = ( gyy(i,j,k) * gzz(i,j,k) - gyz(i,j,k) * gyz(i,j,k) ) / det_loc Gamzxx =HALF*( gupxz*gxxx + gupyz*(TWO*gxyx - gxxy ) + gupzz*(TWO*gxzx - gxxz ))
gupxy_loc = - ( gxy(i,j,k) * gzz(i,j,k) - gyz(i,j,k) * gxz(i,j,k) ) / det_loc
gupxz_loc = ( gxy(i,j,k) * gyz(i,j,k) - gyy(i,j,k) * gxz(i,j,k) ) / det_loc
gupyy_loc = ( gxx(i,j,k) * gzz(i,j,k) - gxz(i,j,k) * gxz(i,j,k) ) / det_loc
gupyz_loc = - ( gxx(i,j,k) * gyz(i,j,k) - gxy(i,j,k) * gxz(i,j,k) ) / det_loc
gupzz_loc = ( gxx(i,j,k) * gyy(i,j,k) - gxy(i,j,k) * gxy(i,j,k) ) / det_loc
gupxx(i,j,k) = gupxx_loc
gupxy(i,j,k) = gupxy_loc
gupxz(i,j,k) = gupxz_loc
gupyy(i,j,k) = gupyy_loc
gupyz(i,j,k) = gupyz_loc
gupzz(i,j,k) = gupzz_loc
if(co == 0)then Gamxyy =HALF*( gupxx*(TWO*gxyy - gyyx ) + gupxy*gyyy + gupxz*(TWO*gyzy - gyyz ))
Gmx_Res(i,j,k) = Gamx(i,j,k) - ( & Gamyyy =HALF*( gupxy*(TWO*gxyy - gyyx ) + gupyy*gyyy + gupyz*(TWO*gyzy - gyyz ))
gupxx_loc*(gupxx_loc*gxxx(i,j,k)+gupxy_loc*gxyx(i,j,k)+gupxz_loc*gxzx(i,j,k)) + & Gamzyy =HALF*( gupxz*(TWO*gxyy - gyyx ) + gupyz*gyyy + gupzz*(TWO*gyzy - gyyz ))
gupxy_loc*(gupxx_loc*gxyx(i,j,k)+gupxy_loc*gyyx(i,j,k)+gupxz_loc*gyzx(i,j,k)) + &
gupxz_loc*(gupxx_loc*gxzx(i,j,k)+gupxy_loc*gyzx(i,j,k)+gupxz_loc*gzzx(i,j,k)) + &
gupxx_loc*(gupxy_loc*gxxy(i,j,k)+gupyy_loc*gxyy(i,j,k)+gupyz_loc*gxzy(i,j,k)) + &
gupxy_loc*(gupxy_loc*gxyy(i,j,k)+gupyy_loc*gyyy(i,j,k)+gupyz_loc*gyzy(i,j,k)) + &
gupxz_loc*(gupxy_loc*gxzy(i,j,k)+gupyy_loc*gyzy(i,j,k)+gupyz_loc*gzzy(i,j,k)) + &
gupxx_loc*(gupxz_loc*gxxz(i,j,k)+gupyz_loc*gxyz(i,j,k)+gupzz_loc*gxzz(i,j,k)) + &
gupxy_loc*(gupxz_loc*gxyz(i,j,k)+gupyz_loc*gyyz(i,j,k)+gupzz_loc*gyzz(i,j,k)) + &
gupxz_loc*(gupxz_loc*gxzz(i,j,k)+gupyz_loc*gyzz(i,j,k)+gupzz_loc*gzzz(i,j,k)))
Gmy_Res(i,j,k) = Gamy(i,j,k) - ( &
gupxx_loc*(gupxy_loc*gxxx(i,j,k)+gupyy_loc*gxyx(i,j,k)+gupyz_loc*gxzx(i,j,k)) + &
gupxy_loc*(gupxy_loc*gxyx(i,j,k)+gupyy_loc*gyyx(i,j,k)+gupyz_loc*gyzx(i,j,k)) + &
gupxz_loc*(gupxy_loc*gxzx(i,j,k)+gupyy_loc*gyzx(i,j,k)+gupyz_loc*gzzx(i,j,k)) + &
gupxy_loc*(gupxy_loc*gxxy(i,j,k)+gupyy_loc*gxyy(i,j,k)+gupyz_loc*gxzy(i,j,k)) + &
gupyy_loc*(gupxy_loc*gxyy(i,j,k)+gupyy_loc*gyyy(i,j,k)+gupyz_loc*gyzy(i,j,k)) + &
gupyz_loc*(gupxy_loc*gxzy(i,j,k)+gupyy_loc*gyzy(i,j,k)+gupyz_loc*gzzy(i,j,k)) + &
gupxy_loc*(gupxz_loc*gxxz(i,j,k)+gupyz_loc*gxyz(i,j,k)+gupzz_loc*gxzz(i,j,k)) + &
gupyy_loc*(gupxz_loc*gxyz(i,j,k)+gupyz_loc*gyyz(i,j,k)+gupzz_loc*gyzz(i,j,k)) + &
gupyz_loc*(gupxz_loc*gxzz(i,j,k)+gupyz_loc*gyzz(i,j,k)+gupzz_loc*gzzz(i,j,k)))
Gmz_Res(i,j,k) = Gamz(i,j,k) - ( &
gupxx_loc*(gupxz_loc*gxxx(i,j,k)+gupyz_loc*gxyx(i,j,k)+gupzz_loc*gxzx(i,j,k)) + &
gupxy_loc*(gupxz_loc*gxyx(i,j,k)+gupyz_loc*gyyx(i,j,k)+gupzz_loc*gyzx(i,j,k)) + &
gupxz_loc*(gupxz_loc*gxzx(i,j,k)+gupyz_loc*gyzx(i,j,k)+gupzz_loc*gzzx(i,j,k)) + &
gupxy_loc*(gupxz_loc*gxxy(i,j,k)+gupyz_loc*gxyy(i,j,k)+gupzz_loc*gxzy(i,j,k)) + &
gupyy_loc*(gupxz_loc*gxyy(i,j,k)+gupyz_loc*gyyy(i,j,k)+gupzz_loc*gyzy(i,j,k)) + &
gupyz_loc*(gupxz_loc*gxzy(i,j,k)+gupyz_loc*gyzy(i,j,k)+gupzz_loc*gzzy(i,j,k)) + &
gupxz_loc*(gupxz_loc*gxxz(i,j,k)+gupyz_loc*gxyz(i,j,k)+gupzz_loc*gxzz(i,j,k)) + &
gupyz_loc*(gupxz_loc*gxyz(i,j,k)+gupyz_loc*gyyz(i,j,k)+gupzz_loc*gyzz(i,j,k)) + &
gupzz_loc*(gupxz_loc*gxzz(i,j,k)+gupyz_loc*gyzz(i,j,k)+gupzz_loc*gzzz(i,j,k)))
endif
Gamxxx(i,j,k)=HALF*( gupxx_loc*gxxx(i,j,k) + gupxy_loc*(TWO*gxyx(i,j,k) - gxxy(i,j,k)) + gupxz_loc*(TWO*gxzx(i,j,k) - gxxz(i,j,k))) Gamxzz =HALF*( gupxx*(TWO*gxzz - gzzx ) + gupxy*(TWO*gyzz - gzzy ) + gupxz*gzzz)
Gamyxx(i,j,k)=HALF*( gupxy_loc*gxxx(i,j,k) + gupyy_loc*(TWO*gxyx(i,j,k) - gxxy(i,j,k)) + gupyz_loc*(TWO*gxzx(i,j,k) - gxxz(i,j,k))) Gamyzz =HALF*( gupxy*(TWO*gxzz - gzzx ) + gupyy*(TWO*gyzz - gzzy ) + gupyz*gzzz)
Gamzxx(i,j,k)=HALF*( gupxz_loc*gxxx(i,j,k) + gupyz_loc*(TWO*gxyx(i,j,k) - gxxy(i,j,k)) + gupzz_loc*(TWO*gxzx(i,j,k) - gxxz(i,j,k))) Gamzzz =HALF*( gupxz*(TWO*gxzz - gzzx ) + gupyz*(TWO*gyzz - gzzy ) + gupzz*gzzz)
Gamxyy(i,j,k)=HALF*( gupxx_loc*(TWO*gxyy(i,j,k) - gyyx(i,j,k)) + gupxy_loc*gyyy(i,j,k) + gupxz_loc*(TWO*gyzy(i,j,k) - gyyz(i,j,k))) Gamxxy =HALF*( gupxx*gxxy + gupxy*gyyx + gupxz*( gxzy + gyzx - gxyz ) )
Gamyyy(i,j,k)=HALF*( gupxy_loc*(TWO*gxyy(i,j,k) - gyyx(i,j,k)) + gupyy_loc*gyyy(i,j,k) + gupyz_loc*(TWO*gyzy(i,j,k) - gyyz(i,j,k))) Gamyxy =HALF*( gupxy*gxxy + gupyy*gyyx + gupyz*( gxzy + gyzx - gxyz ) )
Gamzyy(i,j,k)=HALF*( gupxz_loc*(TWO*gxyy(i,j,k) - gyyx(i,j,k)) + gupyz_loc*gyyy(i,j,k) + gupzz_loc*(TWO*gyzy(i,j,k) - gyyz(i,j,k))) Gamzxy =HALF*( gupxz*gxxy + gupyz*gyyx + gupzz*( gxzy + gyzx - gxyz ) )
Gamxzz(i,j,k)=HALF*( gupxx_loc*(TWO*gxzz(i,j,k) - gzzx(i,j,k)) + gupxy_loc*(TWO*gyzz(i,j,k) - gzzy(i,j,k)) + gupxz_loc*gzzz(i,j,k)) Gamxxz =HALF*( gupxx*gxxz + gupxy*( gxyz + gyzx - gxzy ) + gupxz*gzzx )
Gamyzz(i,j,k)=HALF*( gupxy_loc*(TWO*gxzz(i,j,k) - gzzx(i,j,k)) + gupyy_loc*(TWO*gyzz(i,j,k) - gzzy(i,j,k)) + gupyz_loc*gzzz(i,j,k)) Gamyxz =HALF*( gupxy*gxxz + gupyy*( gxyz + gyzx - gxzy ) + gupyz*gzzx )
Gamzzz(i,j,k)=HALF*( gupxz_loc*(TWO*gxzz(i,j,k) - gzzx(i,j,k)) + gupyz_loc*(TWO*gyzz(i,j,k) - gzzy(i,j,k)) + gupzz_loc*gzzz(i,j,k)) Gamzxz =HALF*( gupxz*gxxz + gupyz*( gxyz + gyzx - gxzy ) + gupzz*gzzx )
Gamxxy(i,j,k)=HALF*( gupxx_loc*gxxy(i,j,k) + gupxy_loc*gyyx(i,j,k) + gupxz_loc*(gxzy(i,j,k) + gyzx(i,j,k) - gxyz(i,j,k)) ) Gamxyz =HALF*( gupxx*( gxyz + gxzy - gyzx ) + gupxy*gyyz + gupxz*gzzy )
Gamyxy(i,j,k)=HALF*( gupxy_loc*gxxy(i,j,k) + gupyy_loc*gyyx(i,j,k) + gupyz_loc*(gxzy(i,j,k) + gyzx(i,j,k) - gxyz(i,j,k)) ) Gamyyz =HALF*( gupxy*( gxyz + gxzy - gyzx ) + gupyy*gyyz + gupyz*gzzy )
Gamzxy(i,j,k)=HALF*( gupxz_loc*gxxy(i,j,k) + gupyz_loc*gyyx(i,j,k) + gupzz_loc*(gxzy(i,j,k) + gyzx(i,j,k) - gxyz(i,j,k)) ) Gamzyz =HALF*( gupxz*( gxyz + gxzy - gyzx ) + gupyz*gyyz + gupzz*gzzy )
Gamxxz(i,j,k)=HALF*( gupxx_loc*gxxz(i,j,k) + gupxy_loc*(gxyz(i,j,k) + gyzx(i,j,k) - gxzy(i,j,k)) + gupxz_loc*gzzx(i,j,k) )
Gamyxz(i,j,k)=HALF*( gupxy_loc*gxxz(i,j,k) + gupyy_loc*(gxyz(i,j,k) + gyzx(i,j,k) - gxzy(i,j,k)) + gupyz_loc*gzzx(i,j,k) )
Gamzxz(i,j,k)=HALF*( gupxz_loc*gxxz(i,j,k) + gupyz_loc*(gxyz(i,j,k) + gyzx(i,j,k) - gxzy(i,j,k)) + gupzz_loc*gzzx(i,j,k) )
Gamxyz(i,j,k)=HALF*( gupxx_loc*(gxyz(i,j,k) + gxzy(i,j,k) - gyzx(i,j,k)) + gupxy_loc*gyyz(i,j,k) + gupxz_loc*gzzy(i,j,k) )
Gamyyz(i,j,k)=HALF*( gupxy_loc*(gxyz(i,j,k) + gxzy(i,j,k) - gyzx(i,j,k)) + gupyy_loc*gyyz(i,j,k) + gupyz_loc*gzzy(i,j,k) )
Gamzyz(i,j,k)=HALF*( gupxz_loc*(gxyz(i,j,k) + gxzy(i,j,k) - gyzx(i,j,k)) + gupyz_loc*gyyz(i,j,k) + gupzz_loc*gzzy(i,j,k) )
enddo
enddo
enddo
! Raise indices of \tilde A_{ij} and store in R_ij ! Raise indices of \tilde A_{ij} and store in R_ij
Rxx = gupxx * gupxx * Axx + gupxy * gupxy * Ayy + gupxz * gupxz * Azz + &
TWO*(gupxx * gupxy * Axy + gupxx * gupxz * Axz + gupxy * gupxz * Ayz)
Ryy = gupxy * gupxy * Axx + gupyy * gupyy * Ayy + gupyz * gupyz * Azz + &
TWO*(gupxy * gupyy * Axy + gupxy * gupyz * Axz + gupyy * gupyz * Ayz)
Rzz = gupxz * gupxz * Axx + gupyz * gupyz * Ayy + gupzz * gupzz * Azz + &
TWO*(gupxz * gupyz * Axy + gupxz * gupzz * Axz + gupyz * gupzz * Ayz)
Rxy = gupxx * gupxy * Axx + gupxy * gupyy * Ayy + gupxz * gupyz * Azz + &
(gupxx * gupyy + gupxy * gupxy)* Axy + &
(gupxx * gupyz + gupxz * gupxy)* Axz + &
(gupxy * gupyz + gupxz * gupyy)* Ayz
Rxz = gupxx * gupxz * Axx + gupxy * gupyz * Ayy + gupxz * gupzz * Azz + &
(gupxx * gupyz + gupxy * gupxz)* Axy + &
(gupxx * gupzz + gupxz * gupxz)* Axz + &
(gupxy * gupzz + gupxz * gupyz)* Ayz
Ryz = gupxy * gupxz * Axx + gupyy * gupyz * Ayy + gupyz * gupzz * Azz + &
(gupxy * gupyz + gupyy * gupxz)* Axy + &
(gupxy * gupzz + gupyz * gupxz)* Axz + &
(gupyy * gupzz + gupyz * gupyz)* Ayz
! Right hand side for Gam^i without shift terms... ! Right hand side for Gam^i without shift terms...
call fderivs(ex,Lap,Lapx,Lapy,Lapz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev) call fderivs(ex,Lap,Lapx,Lapy,Lapz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev)
call fderivs(ex,trK,Kx,Ky,Kz,X,Y,Z,SYM,SYM,SYM,symmetry,Lev) call fderivs(ex,trK,Kx,Ky,Kz,X,Y,Z,SYM,SYM,SYM,symmetry,Lev)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
gupxx_loc = gupxx(i,j,k)
gupxy_loc = gupxy(i,j,k)
gupxz_loc = gupxz(i,j,k)
gupyy_loc = gupyy(i,j,k)
gupyz_loc = gupyz(i,j,k)
gupzz_loc = gupzz(i,j,k)
Rxx_loc = gupxx_loc * gupxx_loc * Axx(i,j,k) + gupxy_loc * gupxy_loc * Ayy(i,j,k) + gupxz_loc * gupxz_loc * Azz(i,j,k) + & Gamx_rhs = - TWO * ( Lapx * Rxx + Lapy * Rxy + Lapz * Rxz ) + &
TWO * (gupxx_loc * gupxy_loc * Axy(i,j,k) + gupxx_loc * gupxz_loc * Axz(i,j,k) + gupxy_loc * gupxz_loc * Ayz(i,j,k)) TWO * alpn1 * ( &
Ryy_loc = gupxy_loc * gupxy_loc * Axx(i,j,k) + gupyy_loc * gupyy_loc * Ayy(i,j,k) + gupyz_loc * gupyz_loc * Azz(i,j,k) + & -F3o2/chin1 * ( chix * Rxx + chiy * Rxy + chiz * Rxz ) - &
TWO * (gupxy_loc * gupyy_loc * Axy(i,j,k) + gupxy_loc * gupyz_loc * Axz(i,j,k) + gupyy_loc * gupyz_loc * Ayz(i,j,k)) gupxx * ( F2o3 * Kx + EIGHT * PI * Sx ) - &
Rzz_loc = gupxz_loc * gupxz_loc * Axx(i,j,k) + gupyz_loc * gupyz_loc * Ayy(i,j,k) + gupzz_loc * gupzz_loc * Azz(i,j,k) + & gupxy * ( F2o3 * Ky + EIGHT * PI * Sy ) - &
TWO * (gupxz_loc * gupyz_loc * Axy(i,j,k) + gupxz_loc * gupzz_loc * Axz(i,j,k) + gupyz_loc * gupzz_loc * Ayz(i,j,k)) gupxz * ( F2o3 * Kz + EIGHT * PI * Sz ) + &
Rxy_loc = gupxx_loc * gupxy_loc * Axx(i,j,k) + gupxy_loc * gupyy_loc * Ayy(i,j,k) + gupxz_loc * gupyz_loc * Azz(i,j,k) + & Gamxxx * Rxx + Gamxyy * Ryy + Gamxzz * Rzz + &
(gupxx_loc * gupyy_loc + gupxy_loc * gupxy_loc) * Axy(i,j,k) + & TWO * ( Gamxxy * Rxy + Gamxxz * Rxz + Gamxyz * Ryz ) )
(gupxx_loc * gupyz_loc + gupxz_loc * gupxy_loc) * Axz(i,j,k) + &
(gupxy_loc * gupyz_loc + gupxz_loc * gupyy_loc) * Ayz(i,j,k)
Rxz_loc = gupxx_loc * gupxz_loc * Axx(i,j,k) + gupxy_loc * gupyz_loc * Ayy(i,j,k) + gupxz_loc * gupzz_loc * Azz(i,j,k) + &
(gupxx_loc * gupyz_loc + gupxy_loc * gupxz_loc) * Axy(i,j,k) + &
(gupxx_loc * gupzz_loc + gupxz_loc * gupxz_loc) * Axz(i,j,k) + &
(gupxy_loc * gupzz_loc + gupxz_loc * gupyz_loc) * Ayz(i,j,k)
Ryz_loc = gupxy_loc * gupxz_loc * Axx(i,j,k) + gupyy_loc * gupyz_loc * Ayy(i,j,k) + gupyz_loc * gupzz_loc * Azz(i,j,k) + &
(gupxy_loc * gupyz_loc + gupyy_loc * gupxz_loc) * Axy(i,j,k) + &
(gupxy_loc * gupzz_loc + gupyz_loc * gupxz_loc) * Axz(i,j,k) + &
(gupyy_loc * gupzz_loc + gupyz_loc * gupyz_loc) * Ayz(i,j,k)
Rxx(i,j,k) = Rxx_loc
Ryy(i,j,k) = Ryy_loc
Rzz(i,j,k) = Rzz_loc
Rxy(i,j,k) = Rxy_loc
Rxz(i,j,k) = Rxz_loc
Ryz(i,j,k) = Ryz_loc
Gamx_rhs(i,j,k) = - TWO * (Lapx(i,j,k) * Rxx_loc + Lapy(i,j,k) * Rxy_loc + Lapz(i,j,k) * Rxz_loc) + & Gamy_rhs = - TWO * ( Lapx * Rxy + Lapy * Ryy + Lapz * Ryz ) + &
TWO * alpn1(i,j,k) * ( & TWO * alpn1 * ( &
-F3o2/chin1(i,j,k) * (chix(i,j,k) * Rxx_loc + chiy(i,j,k) * Rxy_loc + chiz(i,j,k) * Rxz_loc) - & -F3o2/chin1 * ( chix * Rxy + chiy * Ryy + chiz * Ryz ) - &
gupxx_loc * (F2o3 * Kx(i,j,k) + EIGHT * PI * Sx(i,j,k)) - & gupxy * ( F2o3 * Kx + EIGHT * PI * Sx ) - &
gupxy_loc * (F2o3 * Ky(i,j,k) + EIGHT * PI * Sy(i,j,k)) - & gupyy * ( F2o3 * Ky + EIGHT * PI * Sy ) - &
gupxz_loc * (F2o3 * Kz(i,j,k) + EIGHT * PI * Sz(i,j,k)) + & gupyz * ( F2o3 * Kz + EIGHT * PI * Sz ) + &
Gamxxx(i,j,k) * Rxx_loc + Gamxyy(i,j,k) * Ryy_loc + Gamxzz(i,j,k) * Rzz_loc + & Gamyxx * Rxx + Gamyyy * Ryy + Gamyzz * Rzz + &
TWO * (Gamxxy(i,j,k) * Rxy_loc + Gamxxz(i,j,k) * Rxz_loc + Gamxyz(i,j,k) * Ryz_loc)) TWO * ( Gamyxy * Rxy + Gamyxz * Rxz + Gamyyz * Ryz ) )
Gamy_rhs(i,j,k) = - TWO * (Lapx(i,j,k) * Rxy_loc + Lapy(i,j,k) * Ryy_loc + Lapz(i,j,k) * Ryz_loc) + & Gamz_rhs = - TWO * ( Lapx * Rxz + Lapy * Ryz + Lapz * Rzz ) + &
TWO * alpn1(i,j,k) * ( & TWO * alpn1 * ( &
-F3o2/chin1(i,j,k) * (chix(i,j,k) * Rxy_loc + chiy(i,j,k) * Ryy_loc + chiz(i,j,k) * Ryz_loc) - & -F3o2/chin1 * ( chix * Rxz + chiy * Ryz + chiz * Rzz ) - &
gupxy_loc * (F2o3 * Kx(i,j,k) + EIGHT * PI * Sx(i,j,k)) - & gupxz * ( F2o3 * Kx + EIGHT * PI * Sx ) - &
gupyy_loc * (F2o3 * Ky(i,j,k) + EIGHT * PI * Sy(i,j,k)) - & gupyz * ( F2o3 * Ky + EIGHT * PI * Sy ) - &
gupyz_loc * (F2o3 * Kz(i,j,k) + EIGHT * PI * Sz(i,j,k)) + & gupzz * ( F2o3 * Kz + EIGHT * PI * Sz ) + &
Gamyxx(i,j,k) * Rxx_loc + Gamyyy(i,j,k) * Ryy_loc + Gamyzz(i,j,k) * Rzz_loc + & Gamzxx * Rxx + Gamzyy * Ryy + Gamzzz * Rzz + &
TWO * (Gamyxy(i,j,k) * Rxy_loc + Gamyxz(i,j,k) * Rxz_loc + Gamyyz(i,j,k) * Ryz_loc)) TWO * ( Gamzxy * Rxy + Gamzxz * Rxz + Gamzyz * Ryz ) )
Gamz_rhs(i,j,k) = - TWO * (Lapx(i,j,k) * Rxz_loc + Lapy(i,j,k) * Ryz_loc + Lapz(i,j,k) * Rzz_loc) + &
TWO * alpn1(i,j,k) * ( &
-F3o2/chin1(i,j,k) * (chix(i,j,k) * Rxz_loc + chiy(i,j,k) * Ryz_loc + chiz(i,j,k) * Rzz_loc) - &
gupxz_loc * (F2o3 * Kx(i,j,k) + EIGHT * PI * Sx(i,j,k)) - &
gupyz_loc * (F2o3 * Ky(i,j,k) + EIGHT * PI * Sy(i,j,k)) - &
gupzz_loc * (F2o3 * Kz(i,j,k) + EIGHT * PI * Sz(i,j,k)) + &
Gamzxx(i,j,k) * Rxx_loc + Gamzyy(i,j,k) * Ryy_loc + Gamzzz(i,j,k) * Rzz_loc + &
TWO * (Gamzxy(i,j,k) * Rxy_loc + Gamzxz(i,j,k) * Rxz_loc + Gamzyz(i,j,k) * Ryz_loc))
enddo
enddo
enddo
call fdderivs(ex,betax,gxxx,gxyx,gxzx,gyyx,gyzx,gzzx,& call fdderivs(ex,betax,gxxx,gxyx,gxzx,gyyx,gyzx,gzzx,&
X,Y,Z,ANTI,SYM, SYM ,Symmetry,Lev) X,Y,Z,ANTI,SYM, SYM ,Symmetry,Lev)
@@ -359,54 +321,38 @@
call fdderivs(ex,betaz,gxxz,gxyz,gxzz,gyyz,gyzz,gzzz,& call fdderivs(ex,betaz,gxxz,gxyz,gxzz,gyyz,gyzz,gzzz,&
X,Y,Z,SYM ,SYM, ANTI,Symmetry,Lev) X,Y,Z,SYM ,SYM, ANTI,Symmetry,Lev)
fxx = gxxx + gxyy + gxzz
fxy = gxyx + gyyy + gyzz
fxz = gxzx + gyzy + gzzz
Gamxa = gupxx * Gamxxx + gupyy * Gamxyy + gupzz * Gamxzz + &
TWO*( gupxy * Gamxxy + gupxz * Gamxxz + gupyz * Gamxyz )
Gamya = gupxx * Gamyxx + gupyy * Gamyyy + gupzz * Gamyzz + &
TWO*( gupxy * Gamyxy + gupxz * Gamyxz + gupyz * Gamyyz )
Gamza = gupxx * Gamzxx + gupyy * Gamzyy + gupzz * Gamzzz + &
TWO*( gupxy * Gamzxy + gupxz * Gamzxz + gupyz * Gamzyz )
call fderivs(ex,Gamx,Gamxx,Gamxy,Gamxz,X,Y,Z,ANTI,SYM ,SYM ,Symmetry,Lev) call fderivs(ex,Gamx,Gamxx,Gamxy,Gamxz,X,Y,Z,ANTI,SYM ,SYM ,Symmetry,Lev)
call fderivs(ex,Gamy,Gamyx,Gamyy,Gamyz,X,Y,Z,SYM ,ANTI,SYM ,Symmetry,Lev) call fderivs(ex,Gamy,Gamyx,Gamyy,Gamyz,X,Y,Z,SYM ,ANTI,SYM ,Symmetry,Lev)
call fderivs(ex,Gamz,Gamzx,Gamzy,Gamzz,X,Y,Z,SYM ,SYM ,ANTI,Symmetry,Lev) call fderivs(ex,Gamz,Gamzx,Gamzy,Gamzz,X,Y,Z,SYM ,SYM ,ANTI,Symmetry,Lev)
do k=1,ex(3)
do j=1,ex(2)
do i=1,ex(1)
divb_loc = div_beta(i,j,k)
fxx_loc = gxxx(i,j,k) + gxyy(i,j,k) + gxzz(i,j,k)
fxy_loc = gxyx(i,j,k) + gyyy(i,j,k) + gyzz(i,j,k)
fxz_loc = gxzx(i,j,k) + gyzy(i,j,k) + gzzz(i,j,k)
gupxx_loc = gupxx(i,j,k) Gamx_rhs = Gamx_rhs + F2o3 * Gamxa * div_beta - &
gupxy_loc = gupxy(i,j,k) Gamxa * betaxx - Gamya * betaxy - Gamza * betaxz + &
gupxz_loc = gupxz(i,j,k) F1o3 * (gupxx * fxx + gupxy * fxy + gupxz * fxz ) + &
gupyy_loc = gupyy(i,j,k) gupxx * gxxx + gupyy * gyyx + gupzz * gzzx + &
gupyz_loc = gupyz(i,j,k) TWO * (gupxy * gxyx + gupxz * gxzx + gupyz * gyzx )
gupzz_loc = gupzz(i,j,k)
Gamxa_loc = gupxx_loc * Gamxxx(i,j,k) + gupyy_loc * Gamxyy(i,j,k) + gupzz_loc * Gamxzz(i,j,k) + & Gamy_rhs = Gamy_rhs + F2o3 * Gamya * div_beta - &
TWO * (gupxy_loc * Gamxxy(i,j,k) + gupxz_loc * Gamxxz(i,j,k) + gupyz_loc * Gamxyz(i,j,k)) Gamxa * betayx - Gamya * betayy - Gamza * betayz + &
Gamya_loc = gupxx_loc * Gamyxx(i,j,k) + gupyy_loc * Gamyyy(i,j,k) + gupzz_loc * Gamyzz(i,j,k) + & F1o3 * (gupxy * fxx + gupyy * fxy + gupyz * fxz ) + &
TWO * (gupxy_loc * Gamyxy(i,j,k) + gupxz_loc * Gamyxz(i,j,k) + gupyz_loc * Gamyyz(i,j,k)) gupxx * gxxy + gupyy * gyyy + gupzz * gzzy + &
Gamza_loc = gupxx_loc * Gamzxx(i,j,k) + gupyy_loc * Gamzyy(i,j,k) + gupzz_loc * Gamzzz(i,j,k) + & TWO * (gupxy * gxyy + gupxz * gxzy + gupyz * gyzy )
TWO * (gupxy_loc * Gamzxy(i,j,k) + gupxz_loc * Gamzxz(i,j,k) + gupyz_loc * Gamzyz(i,j,k))
Gamxa(i,j,k) = Gamxa_loc
Gamya(i,j,k) = Gamya_loc
Gamza(i,j,k) = Gamza_loc
Gamx_rhs(i,j,k) = Gamx_rhs(i,j,k) + F2o3 * Gamxa_loc * divb_loc - & Gamz_rhs = Gamz_rhs + F2o3 * Gamza * div_beta - &
Gamxa_loc * betaxx(i,j,k) - Gamya_loc * betaxy(i,j,k) - Gamza_loc * betaxz(i,j,k) + & Gamxa * betazx - Gamya * betazy - Gamza * betazz + &
F1o3 * (gupxx_loc * fxx_loc + gupxy_loc * fxy_loc + gupxz_loc * fxz_loc) + & F1o3 * (gupxz * fxx + gupyz * fxy + gupzz * fxz ) + &
gupxx_loc * gxxx(i,j,k) + gupyy_loc * gyyx(i,j,k) + gupzz_loc * gzzx(i,j,k) + & gupxx * gxxz + gupyy * gyyz + gupzz * gzzz + &
TWO * (gupxy_loc * gxyx(i,j,k) + gupxz_loc * gxzx(i,j,k) + gupyz_loc * gyzx(i,j,k)) TWO * (gupxy * gxyz + gupxz * gxzz + gupyz * gyzz ) !rhs for Gam^i
Gamy_rhs(i,j,k) = Gamy_rhs(i,j,k) + F2o3 * Gamya_loc * divb_loc - &
Gamxa_loc * betayx(i,j,k) - Gamya_loc * betayy(i,j,k) - Gamza_loc * betayz(i,j,k) + &
F1o3 * (gupxy_loc * fxx_loc + gupyy_loc * fxy_loc + gupyz_loc * fxz_loc) + &
gupxx_loc * gxxy(i,j,k) + gupyy_loc * gyyy(i,j,k) + gupzz_loc * gzzy(i,j,k) + &
TWO * (gupxy_loc * gxyy(i,j,k) + gupxz_loc * gxzy(i,j,k) + gupyz_loc * gyzy(i,j,k))
Gamz_rhs(i,j,k) = Gamz_rhs(i,j,k) + F2o3 * Gamza_loc * divb_loc - &
Gamxa_loc * betazx(i,j,k) - Gamya_loc * betazy(i,j,k) - Gamza_loc * betazz(i,j,k) + &
F1o3 * (gupxz_loc * fxx_loc + gupyz_loc * fxy_loc + gupzz_loc * fxz_loc) + &
gupxx_loc * gxxz(i,j,k) + gupyy_loc * gyyz(i,j,k) + gupzz_loc * gzzz(i,j,k) + &
TWO * (gupxy_loc * gxyz(i,j,k) + gupxz_loc * gxzz(i,j,k) + gupyz_loc * gyzz(i,j,k))
enddo
enddo
enddo
!first kind of connection stored in gij,k !first kind of connection stored in gij,k
gxxx = gxx * Gamxxx + gxy * Gamyxx + gxz * Gamzxx gxxx = gxx * Gamxxx + gxy * Gamyxx + gxz * Gamzxx
@@ -658,187 +604,189 @@
!covariant second derivative of chi respect to tilted metric !covariant second derivative of chi respect to tilted metric
call fdderivs(ex,chi,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev) call fdderivs(ex,chi,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev)
do k=1,ex(3) fxx = fxx - Gamxxx * chix - Gamyxx * chiy - Gamzxx * chiz
do j=1,ex(2) fxy = fxy - Gamxxy * chix - Gamyxy * chiy - Gamzxy * chiz
do i=1,ex(1) fxz = fxz - Gamxxz * chix - Gamyxz * chiy - Gamzxz * chiz
fxx(i,j,k) = fxx(i,j,k) - Gamxxx(i,j,k) * chix(i,j,k) - Gamyxx(i,j,k) * chiy(i,j,k) - Gamzxx(i,j,k) * chiz(i,j,k) fyy = fyy - Gamxyy * chix - Gamyyy * chiy - Gamzyy * chiz
fxy(i,j,k) = fxy(i,j,k) - Gamxxy(i,j,k) * chix(i,j,k) - Gamyxy(i,j,k) * chiy(i,j,k) - Gamzxy(i,j,k) * chiz(i,j,k) fyz = fyz - Gamxyz * chix - Gamyyz * chiy - Gamzyz * chiz
fxz(i,j,k) = fxz(i,j,k) - Gamxxz(i,j,k) * chix(i,j,k) - Gamyxz(i,j,k) * chiy(i,j,k) - Gamzxz(i,j,k) * chiz(i,j,k) fzz = fzz - Gamxzz * chix - Gamyzz * chiy - Gamzzz * chiz
fyy(i,j,k) = fyy(i,j,k) - Gamxyy(i,j,k) * chix(i,j,k) - Gamyyy(i,j,k) * chiy(i,j,k) - Gamzyy(i,j,k) * chiz(i,j,k) ! Store D^l D_l chi - 3/(2*chi) D^l chi D_l chi in f
fyz(i,j,k) = fyz(i,j,k) - Gamxyz(i,j,k) * chix(i,j,k) - Gamyyz(i,j,k) * chiy(i,j,k) - Gamzyz(i,j,k) * chiz(i,j,k)
fzz(i,j,k) = fzz(i,j,k) - Gamxzz(i,j,k) * chix(i,j,k) - Gamyzz(i,j,k) * chiy(i,j,k) - Gamzzz(i,j,k) * chiz(i,j,k)
chin_loc = chin1(i,j,k) f = gupxx * ( fxx - F3o2/chin1 * chix * chix ) + &
f_loc = gupxx(i,j,k) * (fxx(i,j,k) - F3o2/chin_loc * chix(i,j,k) * chix(i,j,k)) + & gupyy * ( fyy - F3o2/chin1 * chiy * chiy ) + &
gupyy(i,j,k) * (fyy(i,j,k) - F3o2/chin_loc * chiy(i,j,k) * chiy(i,j,k)) + & gupzz * ( fzz - F3o2/chin1 * chiz * chiz ) + &
gupzz(i,j,k) * (fzz(i,j,k) - F3o2/chin_loc * chiz(i,j,k) * chiz(i,j,k)) + & TWO * gupxy * ( fxy - F3o2/chin1 * chix * chiy ) + &
TWO * gupxy(i,j,k) * (fxy(i,j,k) - F3o2/chin_loc * chix(i,j,k) * chiy(i,j,k)) + & TWO * gupxz * ( fxz - F3o2/chin1 * chix * chiz ) + &
TWO * gupxz(i,j,k) * (fxz(i,j,k) - F3o2/chin_loc * chix(i,j,k) * chiz(i,j,k)) + & TWO * gupyz * ( fyz - F3o2/chin1 * chiy * chiz )
TWO * gupyz(i,j,k) * (fyz(i,j,k) - F3o2/chin_loc * chiy(i,j,k) * chiz(i,j,k)) ! Add chi part to Ricci tensor:
f(i,j,k) = f_loc
Rxx(i,j,k) = Rxx(i,j,k) + (fxx(i,j,k) - chix(i,j,k)*chix(i,j,k)/chin_loc/TWO + gxx(i,j,k) * f_loc)/chin_loc/TWO Rxx = Rxx + (fxx - chix*chix/chin1/TWO + gxx * f)/chin1/TWO
Ryy(i,j,k) = Ryy(i,j,k) + (fyy(i,j,k) - chiy(i,j,k)*chiy(i,j,k)/chin_loc/TWO + gyy(i,j,k) * f_loc)/chin_loc/TWO Ryy = Ryy + (fyy - chiy*chiy/chin1/TWO + gyy * f)/chin1/TWO
Rzz(i,j,k) = Rzz(i,j,k) + (fzz(i,j,k) - chiz(i,j,k)*chiz(i,j,k)/chin_loc/TWO + gzz(i,j,k) * f_loc)/chin_loc/TWO Rzz = Rzz + (fzz - chiz*chiz/chin1/TWO + gzz * f)/chin1/TWO
Rxy(i,j,k) = Rxy(i,j,k) + (fxy(i,j,k) - chix(i,j,k)*chiy(i,j,k)/chin_loc/TWO + gxy(i,j,k) * f_loc)/chin_loc/TWO Rxy = Rxy + (fxy - chix*chiy/chin1/TWO + gxy * f)/chin1/TWO
Rxz(i,j,k) = Rxz(i,j,k) + (fxz(i,j,k) - chix(i,j,k)*chiz(i,j,k)/chin_loc/TWO + gxz(i,j,k) * f_loc)/chin_loc/TWO Rxz = Rxz + (fxz - chix*chiz/chin1/TWO + gxz * f)/chin1/TWO
Ryz(i,j,k) = Ryz(i,j,k) + (fyz(i,j,k) - chiy(i,j,k)*chiz(i,j,k)/chin_loc/TWO + gyz(i,j,k) * f_loc)/chin_loc/TWO Ryz = Ryz + (fyz - chiy*chiz/chin1/TWO + gyz * f)/chin1/TWO
enddo
enddo
enddo
! covariant second derivatives of the lapse respect to physical metric ! covariant second derivatives of the lapse respect to physical metric
call fdderivs(ex,Lap,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z, & call fdderivs(ex,Lap,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z, &
SYM,SYM,SYM,symmetry,Lev) SYM,SYM,SYM,symmetry,Lev)
do k=1,ex(3) gxxx = (gupxx * chix + gupxy * chiy + gupxz * chiz)/chin1
do j=1,ex(2) gxxy = (gupxy * chix + gupyy * chiy + gupyz * chiz)/chin1
do i=1,ex(1) gxxz = (gupxz * chix + gupyz * chiy + gupzz * chiz)/chin1
chin_loc = chin1(i,j,k) ! now get physical second kind of connection
gxxx(i,j,k) = (gupxx(i,j,k) * chix(i,j,k) + gupxy(i,j,k) * chiy(i,j,k) + gupxz(i,j,k) * chiz(i,j,k)) / chin_loc Gamxxx = Gamxxx - ( (chix + chix)/chin1 - gxx * gxxx )*HALF
gxxy(i,j,k) = (gupxy(i,j,k) * chix(i,j,k) + gupyy(i,j,k) * chiy(i,j,k) + gupyz(i,j,k) * chiz(i,j,k)) / chin_loc Gamyxx = Gamyxx - ( - gxx * gxxy )*HALF
gxxz(i,j,k) = (gupxz(i,j,k) * chix(i,j,k) + gupyz(i,j,k) * chiy(i,j,k) + gupzz(i,j,k) * chiz(i,j,k)) / chin_loc Gamzxx = Gamzxx - ( - gxx * gxxz )*HALF
Gamxyy = Gamxyy - ( - gyy * gxxx )*HALF
Gamyyy = Gamyyy - ( (chiy + chiy)/chin1 - gyy * gxxy )*HALF
Gamzyy = Gamzyy - ( - gyy * gxxz )*HALF
Gamxzz = Gamxzz - ( - gzz * gxxx )*HALF
Gamyzz = Gamyzz - ( - gzz * gxxy )*HALF
Gamzzz = Gamzzz - ( (chiz + chiz)/chin1 - gzz * gxxz )*HALF
Gamxxy = Gamxxy - ( chiy /chin1 - gxy * gxxx )*HALF
Gamyxy = Gamyxy - ( chix /chin1 - gxy * gxxy )*HALF
Gamzxy = Gamzxy - ( - gxy * gxxz )*HALF
Gamxxz = Gamxxz - ( chiz /chin1 - gxz * gxxx )*HALF
Gamyxz = Gamyxz - ( - gxz * gxxy )*HALF
Gamzxz = Gamzxz - ( chix /chin1 - gxz * gxxz )*HALF
Gamxyz = Gamxyz - ( - gyz * gxxx )*HALF
Gamyyz = Gamyyz - ( chiz /chin1 - gyz * gxxy )*HALF
Gamzyz = Gamzyz - ( chiy /chin1 - gyz * gxxz )*HALF
Gamxxx(i,j,k) = Gamxxx(i,j,k) - ( (chix(i,j,k) + chix(i,j,k))/chin_loc - gxx(i,j,k) * gxxx(i,j,k) )*HALF fxx = fxx - Gamxxx*Lapx - Gamyxx*Lapy - Gamzxx*Lapz
Gamyxx(i,j,k) = Gamyxx(i,j,k) - ( - gxx(i,j,k) * gxxy(i,j,k) )*HALF fyy = fyy - Gamxyy*Lapx - Gamyyy*Lapy - Gamzyy*Lapz
Gamzxx(i,j,k) = Gamzxx(i,j,k) - ( - gxx(i,j,k) * gxxz(i,j,k) )*HALF fzz = fzz - Gamxzz*Lapx - Gamyzz*Lapy - Gamzzz*Lapz
Gamxyy(i,j,k) = Gamxyy(i,j,k) - ( - gyy(i,j,k) * gxxx(i,j,k) )*HALF fxy = fxy - Gamxxy*Lapx - Gamyxy*Lapy - Gamzxy*Lapz
Gamyyy(i,j,k) = Gamyyy(i,j,k) - ( (chiy(i,j,k) + chiy(i,j,k))/chin_loc - gyy(i,j,k) * gxxy(i,j,k) )*HALF fxz = fxz - Gamxxz*Lapx - Gamyxz*Lapy - Gamzxz*Lapz
Gamzyy(i,j,k) = Gamzyy(i,j,k) - ( - gyy(i,j,k) * gxxz(i,j,k) )*HALF fyz = fyz - Gamxyz*Lapx - Gamyyz*Lapy - Gamzyz*Lapz
Gamxzz(i,j,k) = Gamxzz(i,j,k) - ( - gzz(i,j,k) * gxxx(i,j,k) )*HALF
Gamyzz(i,j,k) = Gamyzz(i,j,k) - ( - gzz(i,j,k) * gxxy(i,j,k) )*HALF
Gamzzz(i,j,k) = Gamzzz(i,j,k) - ( (chiz(i,j,k) + chiz(i,j,k))/chin_loc - gzz(i,j,k) * gxxz(i,j,k) )*HALF
Gamxxy(i,j,k) = Gamxxy(i,j,k) - ( chiy(i,j,k) /chin_loc - gxy(i,j,k) * gxxx(i,j,k) )*HALF
Gamyxy(i,j,k) = Gamyxy(i,j,k) - ( chix(i,j,k) /chin_loc - gxy(i,j,k) * gxxy(i,j,k) )*HALF
Gamzxy(i,j,k) = Gamzxy(i,j,k) - ( - gxy(i,j,k) * gxxz(i,j,k) )*HALF
Gamxxz(i,j,k) = Gamxxz(i,j,k) - ( chiz(i,j,k) /chin_loc - gxz(i,j,k) * gxxx(i,j,k) )*HALF
Gamyxz(i,j,k) = Gamyxz(i,j,k) - ( - gxz(i,j,k) * gxxy(i,j,k) )*HALF
Gamzxz(i,j,k) = Gamzxz(i,j,k) - ( chix(i,j,k) /chin_loc - gxz(i,j,k) * gxxz(i,j,k) )*HALF
Gamxyz(i,j,k) = Gamxyz(i,j,k) - ( - gyz(i,j,k) * gxxx(i,j,k) )*HALF
Gamyyz(i,j,k) = Gamyyz(i,j,k) - ( chiz(i,j,k) /chin_loc - gyz(i,j,k) * gxxy(i,j,k) )*HALF
Gamzyz(i,j,k) = Gamzyz(i,j,k) - ( chiy(i,j,k) /chin_loc - gyz(i,j,k) * gxxz(i,j,k) )*HALF
fxx(i,j,k) = fxx(i,j,k) - Gamxxx(i,j,k)*Lapx(i,j,k) - Gamyxx(i,j,k)*Lapy(i,j,k) - Gamzxx(i,j,k)*Lapz(i,j,k) ! store D^i D_i Lap in trK_rhs upto chi
fyy(i,j,k) = fyy(i,j,k) - Gamxyy(i,j,k)*Lapx(i,j,k) - Gamyyy(i,j,k)*Lapy(i,j,k) - Gamzyy(i,j,k)*Lapz(i,j,k) trK_rhs = gupxx * fxx + gupyy * fyy + gupzz * fzz + &
fzz(i,j,k) = fzz(i,j,k) - Gamxzz(i,j,k)*Lapx(i,j,k) - Gamyzz(i,j,k)*Lapy(i,j,k) - Gamzzz(i,j,k)*Lapz(i,j,k) TWO* ( gupxy * fxy + gupxz * fxz + gupyz * fyz )
fxy(i,j,k) = fxy(i,j,k) - Gamxxy(i,j,k)*Lapx(i,j,k) - Gamyxy(i,j,k)*Lapy(i,j,k) - Gamzxy(i,j,k)*Lapz(i,j,k) #if 1
fxz(i,j,k) = fxz(i,j,k) - Gamxxz(i,j,k)*Lapx(i,j,k) - Gamyxz(i,j,k)*Lapy(i,j,k) - Gamzxz(i,j,k)*Lapz(i,j,k) !! follow bam code
fyz(i,j,k) = fyz(i,j,k) - Gamxyz(i,j,k)*Lapx(i,j,k) - Gamyyz(i,j,k)*Lapy(i,j,k) - Gamzyz(i,j,k)*Lapz(i,j,k) S = chin1 * ( gupxx * Sxx + gupyy * Syy + gupzz * Szz + &
TWO * ( gupxy * Sxy + gupxz * Sxz + gupyz * Syz ) )
f = F2o3 * trK * trK -(&
gupxx * ( &
gupxx * Axx * Axx + gupyy * Axy * Axy + gupzz * Axz * Axz + &
TWO * (gupxy * Axx * Axy + gupxz * Axx * Axz + gupyz * Axy * Axz) ) + &
gupyy * ( &
gupxx * Axy * Axy + gupyy * Ayy * Ayy + gupzz * Ayz * Ayz + &
TWO * (gupxy * Axy * Ayy + gupxz * Axy * Ayz + gupyz * Ayy * Ayz) ) + &
gupzz * ( &
gupxx * Axz * Axz + gupyy * Ayz * Ayz + gupzz * Azz * Azz + &
TWO * (gupxy * Axz * Ayz + gupxz * Axz * Azz + gupyz * Ayz * Azz) ) + &
TWO * ( &
gupxy * ( &
gupxx * Axx * Axy + gupyy * Axy * Ayy + gupzz * Axz * Ayz + &
gupxy * (Axx * Ayy + Axy * Axy) + &
gupxz * (Axx * Ayz + Axz * Axy) + &
gupyz * (Axy * Ayz + Axz * Ayy) ) + &
gupxz * ( &
gupxx * Axx * Axz + gupyy * Axy * Ayz + gupzz * Axz * Azz + &
gupxy * (Axx * Ayz + Axy * Axz) + &
gupxz * (Axx * Azz + Axz * Axz) + &
gupyz * (Axy * Azz + Axz * Ayz) ) + &
gupyz * ( &
gupxx * Axy * Axz + gupyy * Ayy * Ayz + gupzz * Ayz * Azz + &
gupxy * (Axy * Ayz + Ayy * Axz) + &
gupxz * (Axy * Azz + Ayz * Axz) + &
gupyz * (Ayy * Azz + Ayz * Ayz) ) )) -1.6d1*PI*rho + EIGHT * PI * S
f = - F1o3 *( gupxx * fxx + gupyy * fyy + gupzz * fzz + &
TWO* ( gupxy * fxy + gupxz * fxz + gupyz * fyz ) + alpn1/chin1*f)
trK_rhs(i,j,k) = gupxx(i,j,k) * fxx(i,j,k) + gupyy(i,j,k) * fyy(i,j,k) + gupzz(i,j,k) * fzz(i,j,k) + & fxx = alpn1 * (Rxx - EIGHT * PI * Sxx) - fxx
TWO * (gupxy(i,j,k) * fxy(i,j,k) + gupxz(i,j,k) * fxz(i,j,k) + gupyz(i,j,k) * fyz(i,j,k)) fxy = alpn1 * (Rxy - EIGHT * PI * Sxy) - fxy
enddo fxz = alpn1 * (Rxz - EIGHT * PI * Sxz) - fxz
enddo fyy = alpn1 * (Ryy - EIGHT * PI * Syy) - fyy
enddo fyz = alpn1 * (Ryz - EIGHT * PI * Syz) - fyz
do k=1,ex(3) fzz = alpn1 * (Rzz - EIGHT * PI * Szz) - fzz
do j=1,ex(2) #else
do i=1,ex(1) ! Add lapse and S_ij parts to Ricci tensor:
divb_loc = div_beta(i,j,k)
chin_loc = chin1(i,j,k)
S_loc = chin_loc * ( gupxx(i,j,k) * Sxx(i,j,k) + gupyy(i,j,k) * Syy(i,j,k) + gupzz(i,j,k) * Szz(i,j,k) + & fxx = alpn1 * (Rxx - EIGHT * PI * Sxx) - fxx
TWO * (gupxy(i,j,k) * Sxy(i,j,k) + gupxz(i,j,k) * Sxz(i,j,k) + gupyz(i,j,k) * Syz(i,j,k)) ) fxy = alpn1 * (Rxy - EIGHT * PI * Sxy) - fxy
S(i,j,k) = S_loc fxz = alpn1 * (Rxz - EIGHT * PI * Sxz) - fxz
fyy = alpn1 * (Ryy - EIGHT * PI * Syy) - fyy
fyz = alpn1 * (Ryz - EIGHT * PI * Syz) - fyz
fzz = alpn1 * (Rzz - EIGHT * PI * Szz) - fzz
f_loc = F2o3 * trK(i,j,k) * trK(i,j,k) - ( & ! Compute trace-free part (note: chi^-1 and chi cancel!):
gupxx(i,j,k) * ( gupxx(i,j,k) * Axx(i,j,k) * Axx(i,j,k) + gupyy(i,j,k) * Axy(i,j,k) * Axy(i,j,k) + &
gupzz(i,j,k) * Axz(i,j,k) * Axz(i,j,k) + &
TWO * (gupxy(i,j,k) * Axx(i,j,k) * Axy(i,j,k) + gupxz(i,j,k) * Axx(i,j,k) * Axz(i,j,k) + &
gupyz(i,j,k) * Axy(i,j,k) * Axz(i,j,k)) ) + &
gupyy(i,j,k) * ( gupxx(i,j,k) * Axy(i,j,k) * Axy(i,j,k) + gupyy(i,j,k) * Ayy(i,j,k) * Ayy(i,j,k) + &
gupzz(i,j,k) * Ayz(i,j,k) * Ayz(i,j,k) + &
TWO * (gupxy(i,j,k) * Axy(i,j,k) * Ayy(i,j,k) + gupxz(i,j,k) * Axy(i,j,k) * Ayz(i,j,k) + &
gupyz(i,j,k) * Ayy(i,j,k) * Ayz(i,j,k)) ) + &
gupzz(i,j,k) * ( gupxx(i,j,k) * Axz(i,j,k) * Axz(i,j,k) + gupyy(i,j,k) * Ayz(i,j,k) * Ayz(i,j,k) + &
gupzz(i,j,k) * Azz(i,j,k) * Azz(i,j,k) + &
TWO * (gupxy(i,j,k) * Axz(i,j,k) * Ayz(i,j,k) + gupxz(i,j,k) * Axz(i,j,k) * Azz(i,j,k) + &
gupyz(i,j,k) * Ayz(i,j,k) * Azz(i,j,k)) ) + &
TWO * ( gupxy(i,j,k) * ( gupxx(i,j,k) * Axx(i,j,k) * Axy(i,j,k) + gupyy(i,j,k) * Axy(i,j,k) * Ayy(i,j,k) + &
gupzz(i,j,k) * Axz(i,j,k) * Ayz(i,j,k) + &
gupxy(i,j,k) * (Axx(i,j,k) * Ayy(i,j,k) + Axy(i,j,k) * Axy(i,j,k)) + &
gupxz(i,j,k) * (Axx(i,j,k) * Ayz(i,j,k) + Axz(i,j,k) * Axy(i,j,k)) + &
gupyz(i,j,k) * (Axy(i,j,k) * Ayz(i,j,k) + Axz(i,j,k) * Ayy(i,j,k)) ) + &
gupxz(i,j,k) * ( gupxx(i,j,k) * Axx(i,j,k) * Axz(i,j,k) + gupyy(i,j,k) * Axy(i,j,k) * Ayz(i,j,k) + &
gupzz(i,j,k) * Axz(i,j,k) * Azz(i,j,k) + &
gupxy(i,j,k) * (Axx(i,j,k) * Ayz(i,j,k) + Axy(i,j,k) * Axz(i,j,k)) + &
gupxz(i,j,k) * (Axx(i,j,k) * Azz(i,j,k) + Axz(i,j,k) * Axz(i,j,k)) + &
gupyz(i,j,k) * (Axy(i,j,k) * Azz(i,j,k) + Axz(i,j,k) * Ayz(i,j,k)) ) + &
gupyz(i,j,k) * ( gupxx(i,j,k) * Axy(i,j,k) * Axz(i,j,k) + gupyy(i,j,k) * Ayy(i,j,k) * Ayz(i,j,k) + &
gupzz(i,j,k) * Ayz(i,j,k) * Azz(i,j,k) + &
gupxy(i,j,k) * (Axy(i,j,k) * Ayz(i,j,k) + Ayy(i,j,k) * Axz(i,j,k)) + &
gupxz(i,j,k) * (Axy(i,j,k) * Azz(i,j,k) + Ayz(i,j,k) * Axz(i,j,k)) + &
gupyz(i,j,k) * (Ayy(i,j,k) * Azz(i,j,k) + Ayz(i,j,k) * Ayz(i,j,k)) ) ) ) - &
F16 * PI * rho(i,j,k) + EIGHT * PI * S_loc
f_loc = -F1o3 * ( gupxx(i,j,k) * fxx(i,j,k) + gupyy(i,j,k) * fyy(i,j,k) + gupzz(i,j,k) * fzz(i,j,k) + & f = F1o3 *( gupxx * fxx + gupyy * fyy + gupzz * fzz + &
TWO * (gupxy(i,j,k) * fxy(i,j,k) + gupxz(i,j,k) * fxz(i,j,k) + gupyz(i,j,k) * fyz(i,j,k)) + & TWO* ( gupxy * fxy + gupxz * fxz + gupyz * fyz ) )
alpn1(i,j,k)/chin_loc * f_loc ) #endif
f(i,j,k) = f_loc
l_fxx = alpn1(i,j,k) * (Rxx(i,j,k) - EIGHT * PI * Sxx(i,j,k)) - fxx(i,j,k) Axx_rhs = fxx - gxx * f
l_fxy = alpn1(i,j,k) * (Rxy(i,j,k) - EIGHT * PI * Sxy(i,j,k)) - fxy(i,j,k) Ayy_rhs = fyy - gyy * f
l_fxz = alpn1(i,j,k) * (Rxz(i,j,k) - EIGHT * PI * Sxz(i,j,k)) - fxz(i,j,k) Azz_rhs = fzz - gzz * f
l_fyy = alpn1(i,j,k) * (Ryy(i,j,k) - EIGHT * PI * Syy(i,j,k)) - fyy(i,j,k) Axy_rhs = fxy - gxy * f
l_fyz = alpn1(i,j,k) * (Ryz(i,j,k) - EIGHT * PI * Syz(i,j,k)) - fyz(i,j,k) Axz_rhs = fxz - gxz * f
l_fzz = alpn1(i,j,k) * (Rzz(i,j,k) - EIGHT * PI * Szz(i,j,k)) - fzz(i,j,k) Ayz_rhs = fyz - gyz * f
Axx_rhs(i,j,k) = l_fxx - gxx(i,j,k) * f_loc ! Now: store A_il A^l_j into fij:
Ayy_rhs(i,j,k) = l_fyy - gyy(i,j,k) * f_loc
Azz_rhs(i,j,k) = l_fzz - gzz(i,j,k) * f_loc
Axy_rhs(i,j,k) = l_fxy - gxy(i,j,k) * f_loc
Axz_rhs(i,j,k) = l_fxz - gxz(i,j,k) * f_loc
Ayz_rhs(i,j,k) = l_fyz - gyz(i,j,k) * f_loc
fxx(i,j,k) = gupxx(i,j,k) * Axx(i,j,k) * Axx(i,j,k) + gupyy(i,j,k) * Axy(i,j,k) * Axy(i,j,k) + & fxx = gupxx * Axx * Axx + gupyy * Axy * Axy + gupzz * Axz * Axz + &
gupzz(i,j,k) * Axz(i,j,k) * Axz(i,j,k) + TWO * (gupxy(i,j,k) * Axx(i,j,k) * Axy(i,j,k) + & TWO * (gupxy * Axx * Axy + gupxz * Axx * Axz + gupyz * Axy * Axz)
gupxz(i,j,k) * Axx(i,j,k) * Axz(i,j,k) + gupyz(i,j,k) * Axy(i,j,k) * Axz(i,j,k)) fyy = gupxx * Axy * Axy + gupyy * Ayy * Ayy + gupzz * Ayz * Ayz + &
fyy(i,j,k) = gupxx(i,j,k) * Axy(i,j,k) * Axy(i,j,k) + gupyy(i,j,k) * Ayy(i,j,k) * Ayy(i,j,k) + & TWO * (gupxy * Axy * Ayy + gupxz * Axy * Ayz + gupyz * Ayy * Ayz)
gupzz(i,j,k) * Ayz(i,j,k) * Ayz(i,j,k) + TWO * (gupxy(i,j,k) * Axy(i,j,k) * Ayy(i,j,k) + & fzz = gupxx * Axz * Axz + gupyy * Ayz * Ayz + gupzz * Azz * Azz + &
gupxz(i,j,k) * Axy(i,j,k) * Ayz(i,j,k) + gupyz(i,j,k) * Ayy(i,j,k) * Ayz(i,j,k)) TWO * (gupxy * Axz * Ayz + gupxz * Axz * Azz + gupyz * Ayz * Azz)
fzz(i,j,k) = gupxx(i,j,k) * Axz(i,j,k) * Axz(i,j,k) + gupyy(i,j,k) * Ayz(i,j,k) * Ayz(i,j,k) + & fxy = gupxx * Axx * Axy + gupyy * Axy * Ayy + gupzz * Axz * Ayz + &
gupzz(i,j,k) * Azz(i,j,k) * Azz(i,j,k) + TWO * (gupxy(i,j,k) * Axz(i,j,k) * Ayz(i,j,k) + & gupxy *(Axx * Ayy + Axy * Axy) + &
gupxz(i,j,k) * Axz(i,j,k) * Azz(i,j,k) + gupyz(i,j,k) * Ayz(i,j,k) * Azz(i,j,k)) gupxz *(Axx * Ayz + Axz * Axy) + &
fxy(i,j,k) = gupxx(i,j,k) * Axx(i,j,k) * Axy(i,j,k) + gupyy(i,j,k) * Axy(i,j,k) * Ayy(i,j,k) + & gupyz *(Axy * Ayz + Axz * Ayy)
gupzz(i,j,k) * Axz(i,j,k) * Ayz(i,j,k) + gupxy(i,j,k) * (Axx(i,j,k) * Ayy(i,j,k) + Axy(i,j,k) * Axy(i,j,k)) + & fxz = gupxx * Axx * Axz + gupyy * Axy * Ayz + gupzz * Axz * Azz + &
gupxz(i,j,k) * (Axx(i,j,k) * Ayz(i,j,k) + Axz(i,j,k) * Axy(i,j,k)) + & gupxy *(Axx * Ayz + Axy * Axz) + &
gupyz(i,j,k) * (Axy(i,j,k) * Ayz(i,j,k) + Axz(i,j,k) * Ayy(i,j,k)) gupxz *(Axx * Azz + Axz * Axz) + &
fxz(i,j,k) = gupxx(i,j,k) * Axx(i,j,k) * Axz(i,j,k) + gupyy(i,j,k) * Axy(i,j,k) * Ayz(i,j,k) + & gupyz *(Axy * Azz + Axz * Ayz)
gupzz(i,j,k) * Axz(i,j,k) * Azz(i,j,k) + gupxy(i,j,k) * (Axx(i,j,k) * Ayz(i,j,k) + Axy(i,j,k) * Axz(i,j,k)) + & fyz = gupxx * Axy * Axz + gupyy * Ayy * Ayz + gupzz * Ayz * Azz + &
gupxz(i,j,k) * (Axx(i,j,k) * Azz(i,j,k) + Axz(i,j,k) * Axz(i,j,k)) + & gupxy *(Axy * Ayz + Ayy * Axz) + &
gupyz(i,j,k) * (Axy(i,j,k) * Azz(i,j,k) + Axz(i,j,k) * Ayz(i,j,k)) gupxz *(Axy * Azz + Ayz * Axz) + &
fyz(i,j,k) = gupxx(i,j,k) * Axy(i,j,k) * Axz(i,j,k) + gupyy(i,j,k) * Ayy(i,j,k) * Ayz(i,j,k) + & gupyz *(Ayy * Azz + Ayz * Ayz)
gupzz(i,j,k) * Ayz(i,j,k) * Azz(i,j,k) + gupxy(i,j,k) * (Axy(i,j,k) * Ayz(i,j,k) + Ayy(i,j,k) * Axz(i,j,k)) + &
gupxz(i,j,k) * (Axy(i,j,k) * Azz(i,j,k) + Ayz(i,j,k) * Axz(i,j,k)) + &
gupyz(i,j,k) * (Ayy(i,j,k) * Azz(i,j,k) + Ayz(i,j,k) * Ayz(i,j,k))
trK_rhs(i,j,k) = chin_loc * trK_rhs(i,j,k) f = chin1
! store D^i D_i Lap in trK_rhs
trK_rhs = f*trK_rhs
Axx_rhs(i,j,k) = chin_loc * Axx_rhs(i,j,k) + alpn1(i,j,k) * (trK(i,j,k) * Axx(i,j,k) - TWO * fxx(i,j,k)) + & Axx_rhs = f * Axx_rhs+ alpn1 * (trK * Axx - TWO * fxx) + &
TWO * (Axx(i,j,k) * betaxx(i,j,k) + Axy(i,j,k) * betayx(i,j,k) + Axz(i,j,k) * betazx(i,j,k)) - & TWO * ( Axx * betaxx + Axy * betayx + Axz * betazx )- &
F2o3 * Axx(i,j,k) * divb_loc F2o3 * Axx * div_beta
Ayy_rhs(i,j,k) = chin_loc * Ayy_rhs(i,j,k) + alpn1(i,j,k) * (trK(i,j,k) * Ayy(i,j,k) - TWO * fyy(i,j,k)) + &
TWO * (Axy(i,j,k) * betaxy(i,j,k) + Ayy(i,j,k) * betayy(i,j,k) + Ayz(i,j,k) * betazy(i,j,k)) - &
F2o3 * Ayy(i,j,k) * divb_loc
Azz_rhs(i,j,k) = chin_loc * Azz_rhs(i,j,k) + alpn1(i,j,k) * (trK(i,j,k) * Azz(i,j,k) - TWO * fzz(i,j,k)) + &
TWO * (Axz(i,j,k) * betaxz(i,j,k) + Ayz(i,j,k) * betayz(i,j,k) + Azz(i,j,k) * betazz(i,j,k)) - &
F2o3 * Azz(i,j,k) * divb_loc
Axy_rhs(i,j,k) = chin_loc * Axy_rhs(i,j,k) + alpn1(i,j,k) * (trK(i,j,k) * Axy(i,j,k) - TWO * fxy(i,j,k)) + &
Axx(i,j,k) * betaxy(i,j,k) + Axz(i,j,k) * betazy(i,j,k) + Ayy(i,j,k) * betayx(i,j,k) + &
Ayz(i,j,k) * betazx(i,j,k) + F1o3 * Axy(i,j,k) * divb_loc - Axy(i,j,k) * betazz(i,j,k)
Ayz_rhs(i,j,k) = chin_loc * Ayz_rhs(i,j,k) + alpn1(i,j,k) * (trK(i,j,k) * Ayz(i,j,k) - TWO * fyz(i,j,k)) + &
Axy(i,j,k) * betaxz(i,j,k) + Ayy(i,j,k) * betayz(i,j,k) + Axz(i,j,k) * betaxy(i,j,k) + &
Azz(i,j,k) * betazy(i,j,k) + F1o3 * Ayz(i,j,k) * divb_loc - Ayz(i,j,k) * betaxx(i,j,k)
Axz_rhs(i,j,k) = chin_loc * Axz_rhs(i,j,k) + alpn1(i,j,k) * (trK(i,j,k) * Axz(i,j,k) - TWO * fxz(i,j,k)) + &
Axx(i,j,k) * betaxz(i,j,k) + Axy(i,j,k) * betayz(i,j,k) + Ayz(i,j,k) * betayx(i,j,k) + &
Azz(i,j,k) * betazx(i,j,k) + F1o3 * Axz(i,j,k) * divb_loc - Axz(i,j,k) * betayy(i,j,k)
trK_rhs(i,j,k) = - trK_rhs(i,j,k) + alpn1(i,j,k) * ( F1o3 * trK(i,j,k) * trK(i,j,k) + & Ayy_rhs = f * Ayy_rhs+ alpn1 * (trK * Ayy - TWO * fyy) + &
gupxx(i,j,k) * fxx(i,j,k) + gupyy(i,j,k) * fyy(i,j,k) + gupzz(i,j,k) * fzz(i,j,k) + & TWO * ( Axy * betaxy + Ayy * betayy + Ayz * betazy )- &
TWO * (gupxy(i,j,k) * fxy(i,j,k) + gupxz(i,j,k) * fxz(i,j,k) + gupyz(i,j,k) * fyz(i,j,k)) + & F2o3 * Ayy * div_beta
FOUR * PI * (rho(i,j,k) + S_loc) )
enddo Azz_rhs = f * Azz_rhs+ alpn1 * (trK * Azz - TWO * fzz) + &
enddo TWO * ( Axz * betaxz + Ayz * betayz + Azz * betazz )- &
enddo F2o3 * Azz * div_beta
Axy_rhs = f * Axy_rhs+ alpn1 *( trK * Axy - TWO * fxy )+ &
Axx * betaxy + Axz * betazy + &
Ayy * betayx + Ayz * betazx + &
F1o3 * Axy * div_beta - Axy * betazz
Ayz_rhs = f * Ayz_rhs+ alpn1 *( trK * Ayz - TWO * fyz )+ &
Axy * betaxz + Ayy * betayz + &
Axz * betaxy + Azz * betazy + &
F1o3 * Ayz * div_beta - Ayz * betaxx
Axz_rhs = f * Axz_rhs+ alpn1 *( trK * Axz - TWO * fxz )+ &
Axx * betaxz + Axy * betayz + &
Ayz * betayx + Azz * betazx + &
F1o3 * Axz * div_beta - Axz * betayy !rhs for Aij
! Compute trace of S_ij
S = f * ( gupxx * Sxx + gupyy * Syy + gupzz * Szz + &
TWO * ( gupxy * Sxy + gupxz * Sxz + gupyz * Syz ) )
trK_rhs = - trK_rhs + alpn1 *( F1o3 * trK * trK + &
gupxx * fxx + gupyy * fyy + gupzz * fzz + &
TWO * ( gupxy * fxy + gupxz * fxz + gupyz * fyz ) + &
FOUR * PI * ( rho + S )) !rhs for trK
!!!! gauge variable part !!!! gauge variable part
@@ -1000,15 +948,15 @@
!!!!!!!!!advection term + Kreiss-Oliger dissipation (merged for cache efficiency) !!!!!!!!!advection term + Kreiss-Oliger dissipation (merged for cache efficiency)
! lopsided_kodis shares the symmetry_bd buffer between advection and ! lopsided_kodis shares the symmetry_bd buffer between advection and
! dissipation, eliminating redundant full-grid copies. For metric variables ! dissipation, eliminating redundant full-grid copies. For metric variables
! gxx/gyy/gzz (=dxx/dyy/dzz+1): stencil coefficients sum to zero, ! gxx/gyy/gzz (=dxx/dyy/dzz+1): kodis stencil coefficients sum to zero,
! so the constant offset has no effect on dissipation. ! so the constant offset has no effect on dissipation.
call lopsided_kodis(ex,X,Y,Z,dxx,gxx_rhs,betax,betay,betaz,Symmetry,SSS,eps) call lopsided_kodis(ex,X,Y,Z,gxx,gxx_rhs,betax,betay,betaz,Symmetry,SSS,eps)
call lopsided_kodis(ex,X,Y,Z,gxy,gxy_rhs,betax,betay,betaz,Symmetry,AAS,eps) call lopsided_kodis(ex,X,Y,Z,gxy,gxy_rhs,betax,betay,betaz,Symmetry,AAS,eps)
call lopsided_kodis(ex,X,Y,Z,gxz,gxz_rhs,betax,betay,betaz,Symmetry,ASA,eps) call lopsided_kodis(ex,X,Y,Z,gxz,gxz_rhs,betax,betay,betaz,Symmetry,ASA,eps)
call lopsided_kodis(ex,X,Y,Z,dyy,gyy_rhs,betax,betay,betaz,Symmetry,SSS,eps) call lopsided_kodis(ex,X,Y,Z,gyy,gyy_rhs,betax,betay,betaz,Symmetry,SSS,eps)
call lopsided_kodis(ex,X,Y,Z,gyz,gyz_rhs,betax,betay,betaz,Symmetry,SAA,eps) call lopsided_kodis(ex,X,Y,Z,gyz,gyz_rhs,betax,betay,betaz,Symmetry,SAA,eps)
call lopsided_kodis(ex,X,Y,Z,dzz,gzz_rhs,betax,betay,betaz,Symmetry,SSS,eps) call lopsided_kodis(ex,X,Y,Z,gzz,gzz_rhs,betax,betay,betaz,Symmetry,SSS,eps)
call lopsided_kodis(ex,X,Y,Z,Axx,Axx_rhs,betax,betay,betaz,Symmetry,SSS,eps) call lopsided_kodis(ex,X,Y,Z,Axx,Axx_rhs,betax,betay,betaz,Symmetry,SSS,eps)
call lopsided_kodis(ex,X,Y,Z,Axy,Axy_rhs,betax,betay,betaz,Symmetry,AAS,eps) call lopsided_kodis(ex,X,Y,Z,Axy,Axy_rhs,betax,betay,betaz,Symmetry,AAS,eps)

View File

@@ -32,19 +32,6 @@
#define f_compute_rhs_Z4c_ss compute_rhs_z4c_ss_ #define f_compute_rhs_Z4c_ss compute_rhs_z4c_ss_
#define f_compute_constraint_fr compute_constraint_fr_ #define f_compute_constraint_fr compute_constraint_fr_
#endif #endif
#ifdef __cplusplus
extern "C"
{
#endif
void f_bssn_rhs_kernel_timing_reset();
int f_bssn_rhs_kernel_timing_bucket_count();
const double *f_bssn_rhs_kernel_timing_local_seconds();
const char *f_bssn_rhs_kernel_timing_label(int);
#ifdef __cplusplus
}
#endif
extern "C" extern "C"
{ {
int f_compute_rhs_bssn(int *, double &, double *, double *, double *, // ex,T,X,Y,Z int f_compute_rhs_bssn(int *, double &, double *, double *, double *, // ex,T,X,Y,Z
@@ -67,27 +54,6 @@ extern "C"
int &, int &, double &, int &); int &, int &, double &, int &);
} }
// Prototype of the `_c` variant of the BSSN right-hand-side routine that also
// carries the Sphi/Spi pair (presumably the scalar-field variables -- TODO
// confirm against the definition).
// The trailing comment on each line names the argument group it declares.
// The group list chi,trK / gij / Aij / Gam / Gauge / Sphi,Spi appears twice;
// presumably the first set is the evolved state and the second set receives
// the corresponding right-hand sides -- TODO confirm against the definition.
// NOTE(review): in this view the declaration sits between two extern "C"
// blocks rather than inside one -- confirm the intended linkage.
int f_compute_rhs_bssn_escalar_c(int *, double &, double *, double *, double *, // ex,T,X,Y,Z
double *, double *, // chi, trK
double *, double *, double *, double *, double *, double *, // gij
double *, double *, double *, double *, double *, double *, // Aij
double *, double *, double *, // Gam
double *, double *, double *, double *, double *, double *, double *, // Gauge
double *, double *, // Sphi, Spi
double *, double *, // chi, trK
double *, double *, double *, double *, double *, double *, // gij
double *, double *, double *, double *, double *, double *, // Aij
double *, double *, double *, // Gam
double *, double *, double *, double *, double *, double *, double *, // Gauge
double *, double *, // Sphi, Spi
double *, double *, double *, double *, double *, double *, double *, double *, double *, double *, // stress-energy
double *, double *, double *, double *, double *, double *, // Christoffel
double *, double *, double *, double *, double *, double *, // Christoffel
double *, double *, double *, double *, double *, double *, // Christoffel
double *, double *, double *, double *, double *, double *, // Ricci
double *, double *, double *, double *, double *, double *, double *, // constraint violation
int &, int &, double &, int &);
extern "C" extern "C"
{ {
int f_compute_rhs_bssn_ss(int *, double &, double *, double *, double *, // ex,T,rho,sigma,R int f_compute_rhs_bssn_ss(int *, double &, double *, double *, double *, // ex,T,rho,sigma,R
@@ -262,31 +228,4 @@ extern "C"
double *); double *);
} // FR_cons } // FR_cons
// BSSN-EM C kernel (replaces empart.f90 + bssn_rhs.f90 for BSSN+Maxwell)
// The argument groups are not labelled here. The leading arguments have the
// same types as the `ex,T,X,Y,Z` group of the other RHS prototypes in this
// header, and the trailing `int &, int &, double &, int &` matches their
// closing group; the meaning of the pointer groups in between must be read
// from the definition -- NOTE(review): consider adding per-line group labels.
int f_compute_rhs_bssn_em_c(int *, double &, double *, double *, double *,
double *, double *,
double *, double *, double *, double *, double *, double *,
double *, double *, double *, double *, double *, double *,
double *, double *, double *,
double *, double *, double *, double *, double *, double *, double *,
double *, double *, double *,
double *, double *, double *, double *, double *, double *, double *, double *,
double *, double *, double *,
double *, double *,
double *, double *,
double *, double *, double *, double *, double *, double *,
double *, double *, double *, double *, double *, double *,
double *, double *, double *,
double *, double *, double *, double *, double *, double *, double *,
double *, double *, double *, double *, double *, double *, double *, double *,
double *, double *, double *,
double *, double *, double *, double *,
double *, double *, double *, double *, double *, double *,
double *, double *, double *, double *, double *, double *,
double *, double *, double *, double *, double *, double *,
double *, double *, double *, double *, double *, double *,
double *, double *, double *, double *, double *, double *,
double *, double *, double *, double *, double *, double *,
int &, int &, double &, int &);
#endif /* BSSN_H */ #endif /* BSSN_H */

File diff suppressed because it is too large Load Diff

View File

@@ -43,6 +43,14 @@ cgh::cgh(int ingfsi, int fngfsi, int Symmetry, char *filename, int checkrun,
end_rank = 0; end_rank = 0;
#endif #endif
// Initialize load balancing variables
enable_load_balance = false;
load_balance_check_interval = 10; // Check every 10 time steps
current_time_step = 0;
rank_interp_times = nullptr;
heavy_ranks = nullptr;
num_heavy_ranks = 0;
if (!checkrun) if (!checkrun)
{ {
read_bbox(Symmetry, filename); read_bbox(Symmetry, filename);
@@ -113,6 +121,12 @@ cgh::~cgh()
delete[] Porgls[lev]; delete[] Porgls[lev];
} }
delete[] Porgls; delete[] Porgls;
// Clean up load balancing memory
if (rank_interp_times)
delete[] rank_interp_times;
if (heavy_ranks)
delete[] heavy_ranks;
} }
//================================================================================================ //================================================================================================
@@ -130,11 +144,7 @@ void cgh::compose_cgh(int nprocs)
for (int lev = 0; lev < levels; lev++) for (int lev = 0; lev < levels; lev++)
{ {
checkPatchList(PatL[lev], false); checkPatchList(PatL[lev], false);
#ifdef INTERP_LB_OPTIMIZE Parallel::distribute_hard(PatL[lev], nprocs, ingfs, fngfs, false);
Parallel::distribute_optimize(PatL[lev], nprocs, ingfs, fngfs, false);
#else
Parallel::distribute(PatL[lev], nprocs, ingfs, fngfs, false);
#endif
#if (RPB == 1) #if (RPB == 1)
// we need distributed box of PatL[lev] and PatL[lev-1] // we need distributed box of PatL[lev] and PatL[lev-1]
if (lev > 0) if (lev > 0)
@@ -1305,13 +1315,13 @@ bool cgh::Interp_One_Point(MyList<var> *VarList,
} }
bool cgh::Regrid_Onelevel(int lev, int Symmetry, int BH_num, double **Porgbr, double **Porg0, void cgh::Regrid_Onelevel(int lev, int Symmetry, int BH_num, double **Porgbr, double **Porg0,
MyList<var> *OldList, MyList<var> *StateList, MyList<var> *OldList, MyList<var> *StateList,
MyList<var> *FutureList, MyList<var> *tmList, bool BB, MyList<var> *FutureList, MyList<var> *tmList, bool BB,
monitor *ErrorMonitor) monitor *ErrorMonitor)
{ {
if (lev < movls) if (lev < movls)
return false; return;
#if (0) #if (0)
// #if (PSTR == 1 || PSTR == 2) // #if (PSTR == 1 || PSTR == 2)
@@ -1400,7 +1410,7 @@ bool cgh::Regrid_Onelevel(int lev, int Symmetry, int BH_num, double **Porgbr, do
for (bhi = 0; bhi < BH_num; bhi++) for (bhi = 0; bhi < BH_num; bhi++)
delete[] tmpPorg[bhi]; delete[] tmpPorg[bhi];
delete[] tmpPorg; delete[] tmpPorg;
return false; return;
} }
// x direction // x direction
rr = (Porg0[bhi][0] - handle[lev][grd][0]) / dX; rr = (Porg0[bhi][0] - handle[lev][grd][0]) / dX;
@@ -1504,7 +1514,6 @@ bool cgh::Regrid_Onelevel(int lev, int Symmetry, int BH_num, double **Porgbr, do
for (int bhi = 0; bhi < BH_num; bhi++) for (int bhi = 0; bhi < BH_num; bhi++)
delete[] tmpPorg[bhi]; delete[] tmpPorg[bhi];
delete[] tmpPorg; delete[] tmpPorg;
return tot_flag;
} }
@@ -1710,3 +1719,121 @@ void cgh::settrfls(const int lev)
{ {
trfls = lev; trfls = lev;
} }
//================================================================================================
// Load Balancing Functions
//================================================================================================
// (Re)allocate the load-balancing bookkeeping arrays.
//
// nprocs : number of entries to allocate in rank_interp_times (one per rank).
//
// Any previously allocated buffers are released first, so the function may be
// called more than once. After the call rank_interp_times holds nprocs zeros,
// heavy_ranks has room for 4 entries (contents unspecified) and
// num_heavy_ranks is 0.
void cgh::init_load_balance(int nprocs)
{
  // delete[] on a null pointer is a no-op, so no explicit null test is needed.
  delete[] rank_interp_times;
  delete[] heavy_ranks;

  // The trailing () value-initialises the array, i.e. every entry is 0.0.
  rank_interp_times = new double[nprocs]();
  heavy_ranks = new int[4]; // Maximum 4 heavy ranks
  num_heavy_ranks = 0;
}
// Store the interpolation time measured for one rank.
//
// rank : index into rank_interp_times; negative values are ignored.
// time : value to store (overwrites the previous entry, it is not accumulated).
//
// The call is a no-op while rank_interp_times is unallocated. No upper bound
// is checked here: the caller must pass a rank smaller than the size given to
// init_load_balance.
void cgh::update_interp_time(int rank, double time)
{
  if (!rank_interp_times || rank < 0)
    return;

  rank_interp_times[rank] = time;
}
// Check and perform load balancing if needed
bool cgh::check_and_rebalance(int nprocs, int lev,
MyList<var> *OldList, MyList<var> *StateList,
MyList<var> *FutureList, MyList<var> *tmList,
int Symmetry, bool BB)
{
int myrank;
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
// Only check at specified intervals
current_time_step++;
if (current_time_step % load_balance_check_interval != 0)
return false;
if (myrank == 0)
{
cout << "\n=== Checking load balance at time step " << current_time_step << " ===" << endl;
}
// Collect all rank times on rank 0
double *all_times = nullptr;
if (myrank == 0)
{
all_times = new double[nprocs];
}
MPI_Gather(rank_interp_times, 1, MPI_DOUBLE, all_times, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
bool need_rebalance = false;
if (myrank == 0)
{
// Check if load balancing is needed
need_rebalance = Parallel::check_load_balance_need(all_times, nprocs, num_heavy_ranks, heavy_ranks);
if (need_rebalance)
{
cout << "=== Load imbalance detected! Need to rebalance ===" << endl;
cout << "Top " << num_heavy_ranks << " heavy ranks: ";
for (int i = 0; i < num_heavy_ranks; i++)
{
cout << heavy_ranks[i] << " (" << all_times[heavy_ranks[i]] << " s) ";
}
cout << endl;
// Analyze blocks that need to be split
Parallel::split_heavy_blocks(PatL[lev], heavy_ranks, num_heavy_ranks, 2, nprocs, ingfs, fngfs);
// Set lev_flag to trigger recompose_cgh
cout << "=== Triggering recompose_cgh for level " << lev << " ===" << endl;
}
else
{
cout << "=== Load is balanced, no rebalancing needed ===" << endl;
}
delete[] all_times;
}
// Broadcast the decision to all ranks
MPI_Bcast(&need_rebalance, 1, MPI_C_BOOL, 0, MPI_COMM_WORLD);
if (need_rebalance)
{
// Broadcast heavy ranks information
MPI_Bcast(&num_heavy_ranks, 1, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Bcast(heavy_ranks, num_heavy_ranks, MPI_INT, 0, MPI_COMM_WORLD);
// Perform recompose_cgh on the specified level
if (myrank == 0)
{
cout << "=== Performing recompose_cgh ===" << endl;
}
// Call recompose_cgh_Onelevel for the specified level
bool *lev_flag = new bool[1];
lev_flag[0] = true;
recompose_cgh_Onelevel(nprocs, lev, OldList, StateList, FutureList, tmList, Symmetry, BB);
delete[] lev_flag;
// Reset time counter after rebalancing
current_time_step = 0;
return true;
}
return false;
}

View File

@@ -74,7 +74,7 @@ public:
MyList<var> *OldList, MyList<var> *StateList, MyList<var> *OldList, MyList<var> *StateList,
MyList<var> *FutureList, MyList<var> *tmList, MyList<var> *FutureList, MyList<var> *tmList,
int Symmetry, bool BB); int Symmetry, bool BB);
bool Regrid_Onelevel(int lev, int Symmetry, int BH_num, double **Porgbr, double **Porg0, void Regrid_Onelevel(int lev, int Symmetry, int BH_num, double **Porgbr, double **Porg0,
MyList<var> *OldList, MyList<var> *StateList, MyList<var> *OldList, MyList<var> *StateList,
MyList<var> *FutureList, MyList<var> *tmList, bool BB, MyList<var> *FutureList, MyList<var> *tmList, bool BB,
monitor *ErrorMonitor); monitor *ErrorMonitor);
@@ -87,6 +87,21 @@ public:
#if (PSTR == 1 || PSTR == 2 || PSTR == 3) #if (PSTR == 1 || PSTR == 2 || PSTR == 3)
void construct_mylev(int nprocs); void construct_mylev(int nprocs);
#endif #endif
// Load balancing support
bool enable_load_balance; // Enable load balancing
int load_balance_check_interval; // Check interval (in time steps)
int current_time_step; // Current time step counter
double *rank_interp_times; // Store interpolation times for each rank
int *heavy_ranks; // Store heavy rank numbers
int num_heavy_ranks; // Number of heavy ranks
void init_load_balance(int nprocs);
void update_interp_time(int rank, double time);
bool check_and_rebalance(int nprocs, int lev,
MyList<var> *OldList, MyList<var> *StateList,
MyList<var> *FutureList, MyList<var> *tmList,
int Symmetry, bool BB);
}; };
#endif /* CGH_H */ #endif /* CGH_H */

View File

@@ -1513,7 +1513,6 @@
real*8,dimension(-1:ex(1),-1:ex(2),-1:ex(3)) :: fh real*8,dimension(-1:ex(1),-1:ex(2),-1:ex(3)) :: fh
real*8, dimension(3) :: SoA real*8, dimension(3) :: SoA
integer :: imin,jmin,kmin,imax,jmax,kmax,i,j,k integer :: imin,jmin,kmin,imax,jmax,kmax,i,j,k
integer :: i_core_min,i_core_max,j_core_min,j_core_max,k_core_min,k_core_max
real*8 :: Sdxdx,Sdydy,Sdzdz,Fdxdx,Fdydy,Fdzdz real*8 :: Sdxdx,Sdydy,Sdzdz,Fdxdx,Fdydy,Fdzdz
real*8 :: Sdxdy,Sdxdz,Sdydz,Fdxdy,Fdxdz,Fdydz real*8 :: Sdxdy,Sdxdz,Sdydz,Fdxdy,Fdxdz,Fdydz
integer, parameter :: NO_SYMM = 0, EQ_SYMM = 1, OCTANT = 2 integer, parameter :: NO_SYMM = 0, EQ_SYMM = 1, OCTANT = 2
@@ -1566,47 +1565,9 @@
fxz = ZEO fxz = ZEO
fyz = ZEO fyz = ZEO
i_core_min = max(1, imin+2)
i_core_max = min(ex(1), imax-2)
j_core_min = max(1, jmin+2)
j_core_max = min(ex(2), jmax-2)
k_core_min = max(1, kmin+2)
k_core_max = min(ex(3), kmax-2)
if(i_core_min <= i_core_max .and. j_core_min <= j_core_max .and. k_core_min <= k_core_max)then
do k=k_core_min,k_core_max
do j=j_core_min,j_core_max
do i=i_core_min,i_core_max
! interior points always use 4th-order stencils without branch checks
fxx(i,j,k) = Fdxdx*(-fh(i-2,j,k)+F16*fh(i-1,j,k)-F30*fh(i,j,k) &
-fh(i+2,j,k)+F16*fh(i+1,j,k) )
fyy(i,j,k) = Fdydy*(-fh(i,j-2,k)+F16*fh(i,j-1,k)-F30*fh(i,j,k) &
-fh(i,j+2,k)+F16*fh(i,j+1,k) )
fzz(i,j,k) = Fdzdz*(-fh(i,j,k-2)+F16*fh(i,j,k-1)-F30*fh(i,j,k) &
-fh(i,j,k+2)+F16*fh(i,j,k+1) )
fxy(i,j,k) = Fdxdy*( (fh(i-2,j-2,k)-F8*fh(i-1,j-2,k)+F8*fh(i+1,j-2,k)-fh(i+2,j-2,k)) &
-F8 *(fh(i-2,j-1,k)-F8*fh(i-1,j-1,k)+F8*fh(i+1,j-1,k)-fh(i+2,j-1,k)) &
+F8 *(fh(i-2,j+1,k)-F8*fh(i-1,j+1,k)+F8*fh(i+1,j+1,k)-fh(i+2,j+1,k)) &
- (fh(i-2,j+2,k)-F8*fh(i-1,j+2,k)+F8*fh(i+1,j+2,k)-fh(i+2,j+2,k)))
fxz(i,j,k) = Fdxdz*( (fh(i-2,j,k-2)-F8*fh(i-1,j,k-2)+F8*fh(i+1,j,k-2)-fh(i+2,j,k-2)) &
-F8 *(fh(i-2,j,k-1)-F8*fh(i-1,j,k-1)+F8*fh(i+1,j,k-1)-fh(i+2,j,k-1)) &
+F8 *(fh(i-2,j,k+1)-F8*fh(i-1,j,k+1)+F8*fh(i+1,j,k+1)-fh(i+2,j,k+1)) &
- (fh(i-2,j,k+2)-F8*fh(i-1,j,k+2)+F8*fh(i+1,j,k+2)-fh(i+2,j,k+2)))
fyz(i,j,k) = Fdydz*( (fh(i,j-2,k-2)-F8*fh(i,j-1,k-2)+F8*fh(i,j+1,k-2)-fh(i,j+2,k-2)) &
-F8 *(fh(i,j-2,k-1)-F8*fh(i,j-1,k-1)+F8*fh(i,j+1,k-1)-fh(i,j+2,k-1)) &
+F8 *(fh(i,j-2,k+1)-F8*fh(i,j-1,k+1)+F8*fh(i,j+1,k+1)-fh(i,j+2,k+1)) &
- (fh(i,j-2,k+2)-F8*fh(i,j-1,k+2)+F8*fh(i,j+1,k+2)-fh(i,j+2,k+2)))
enddo
enddo
enddo
endif
do k=1,ex(3) do k=1,ex(3)
do j=1,ex(2) do j=1,ex(2)
do i=1,ex(1) do i=1,ex(1)
if(i>=i_core_min .and. i<=i_core_max .and. &
j>=j_core_min .and. j<=j_core_max .and. &
k>=k_core_min .and. k<=k_core_max) cycle
!~~~~~~ fxx !~~~~~~ fxx
if(i+2 <= imax .and. i-2 >= imin)then if(i+2 <= imax .and. i-2 >= imin)then
! !

View File

@@ -1,894 +0,0 @@
#include "macrodef.h"
#include "tool.h"
/*
* C version of fdderivs — second derivatives d2f/dx2, d2f/dxdy, d2f/dxdz, d2f/dy2, d2f/dydz, d2f/dz2.
*
* Finite difference order selected at compile time via ghost_width macro.
* Multi-pass skip strategy: the lowest-order pass covers the widest region but skips
* the points that the next higher-order pass will handle; each higher-order pass then
* fills in its own region (again skipping the next-higher one), so no point is computed twice.
*/
void fdderivs(const int ex[3],
const double *f,
double *fxx, double *fxy, double *fxz,
double *fyy, double *fyz, double *fzz,
const double *X, const double *Y, const double *Z,
double SYM1, double SYM2, double SYM3,
int Symmetry, int onoff)
{
(void)onoff;
const int NO_SYMM = 0, EQ_SYMM = 1;
const double ZEO = 0.0, ONE = 1.0, TWO = 2.0;
const double F1o4 = 2.5e-1;
const double F8 = 8.0;
const double F16 = 16.0;
const double F30 = 30.0;
const double F1o12 = ONE / 12.0;
const double F1o144 = ONE / 144.0;
const double F9 = 9.0, F45 = 45.0, F60 = 60.0;
const double F27 = 27.0, F270 = 270.0, F490 = 490.0;
const double F1o180 = ONE / 180.0;
const double F1o3600 = ONE / 3600.0;
const double F32 = 32.0, F128 = 128.0, F168 = 168.0, F672 = 672.0;
const double F840 = 840.0, F1008 = 1008.0, F8064 = 8064.0, F14350 = 14350.0;
const double F1o5040 = ONE / 5040.0;
const double F1o705600 = ONE / 705600.0;
const int ex1 = ex[0], ex2 = ex[1], ex3 = ex[2];
const double dX = X[1] - X[0];
const double dY = Y[1] - Y[0];
const double dZ = Z[1] - Z[0];
const int imaxF = ex1, jmaxF = ex2, kmaxF = ex3;
#if (ghost_width == 2)
/* ---- 2nd-order ------------------------------------------------------ */
{
const int ord = 1;
int iminF = 1, jminF = 1, kminF = 1;
if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = 0;
if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = 0;
if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = 0;
const double SoA[3] = { SYM1, SYM2, SYM3 };
const size_t nx = (size_t)ex1 + ord;
const size_t ny = (size_t)ex2 + ord;
const size_t nz = (size_t)ex3 + ord;
const size_t fh_size = nx * ny * nz;
static double *fh_buf = NULL;
static size_t cap = 0;
if (fh_size > cap) {
free(fh_buf);
fh_buf = (double*)aligned_alloc(64, fh_size * sizeof(double));
cap = fh_size;
}
double *fh = fh_buf;
if (!fh) return;
symmetry_bd(ord, ex, f, fh, SoA);
const double Sdxdx = ONE / (dX * dX);
const double Sdydy = ONE / (dY * dY);
const double Sdzdz = ONE / (dZ * dZ);
const double Sdxdy = F1o4 / (dX * dY);
const double Sdxdz = F1o4 / (dX * dZ);
const double Sdydz = F1o4 / (dY * dZ);
const size_t all = (size_t)ex1 * (size_t)ex2 * (size_t)ex3;
for (size_t p = 0; p < all; ++p) {
fxx[p]=ZEO; fyy[p]=ZEO; fzz[p]=ZEO;
fxy[p]=ZEO; fxz[p]=ZEO; fyz[p]=ZEO;
}
const int i2_lo = (iminF > 0) ? iminF : 0;
const int j2_lo = (jminF > 0) ? jminF : 0;
const int k2_lo = (kminF > 0) ? kminF : 0;
const int i2_hi = ex1 - 2;
const int j2_hi = ex2 - 2;
const int k2_hi = ex3 - 2;
if (i2_lo <= i2_hi && j2_lo <= j2_hi && k2_lo <= k2_hi) {
for (int k0 = k2_lo; k0 <= k2_hi; ++k0) {
const int kF = k0 + 1;
for (int j0 = j2_lo; j0 <= j2_hi; ++j0) {
const int jF = j0 + 1;
for (int i0 = i2_lo; i0 <= i2_hi; ++i0) {
const int iF = i0 + 1;
const size_t p = idx_ex(i0, j0, k0, ex);
fxx[p] = Sdxdx * (
fh[idx_fh_F_ord1(iF - 1, jF, kF, ex)] -
TWO * fh[idx_fh_F_ord1(iF, jF, kF, ex)] +
fh[idx_fh_F_ord1(iF + 1, jF, kF, ex)]);
fyy[p] = Sdydy * (
fh[idx_fh_F_ord1(iF, jF - 1, kF, ex)] -
TWO * fh[idx_fh_F_ord1(iF, jF, kF, ex)] +
fh[idx_fh_F_ord1(iF, jF + 1, kF, ex)]);
fzz[p] = Sdzdz * (
fh[idx_fh_F_ord1(iF, jF, kF - 1, ex)] -
TWO * fh[idx_fh_F_ord1(iF, jF, kF, ex)] +
fh[idx_fh_F_ord1(iF, jF, kF + 1, ex)]);
fxy[p] = Sdxdy * (
fh[idx_fh_F_ord1(iF - 1, jF - 1, kF, ex)] -
fh[idx_fh_F_ord1(iF + 1, jF - 1, kF, ex)] -
fh[idx_fh_F_ord1(iF - 1, jF + 1, kF, ex)] +
fh[idx_fh_F_ord1(iF + 1, jF + 1, kF, ex)]);
fxz[p] = Sdxdz * (
fh[idx_fh_F_ord1(iF - 1, jF, kF - 1, ex)] -
fh[idx_fh_F_ord1(iF + 1, jF, kF - 1, ex)] -
fh[idx_fh_F_ord1(iF - 1, jF, kF + 1, ex)] +
fh[idx_fh_F_ord1(iF + 1, jF, kF + 1, ex)]);
fyz[p] = Sdydz * (
fh[idx_fh_F_ord1(iF, jF - 1, kF - 1, ex)] -
fh[idx_fh_F_ord1(iF, jF + 1, kF - 1, ex)] -
fh[idx_fh_F_ord1(iF, jF - 1, kF + 1, ex)] +
fh[idx_fh_F_ord1(iF, jF + 1, kF + 1, ex)]);
}
}
}
}
return;
}
#elif (ghost_width == 3)
/* ---- 4th-order (original code) ------------------------------------ */
{
const int ord = 2;
int iminF = 1, jminF = 1, kminF = 1;
if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -1;
if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -1;
if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -1;
const double SoA[3] = { SYM1, SYM2, SYM3 };
const size_t nx = (size_t)ex1 + ord;
const size_t ny = (size_t)ex2 + ord;
const size_t nz = (size_t)ex3 + ord;
const size_t fh_size = nx * ny * nz;
static double *fh_buf = NULL;
static size_t cap = 0;
if (fh_size > cap) {
free(fh_buf);
fh_buf = (double*)aligned_alloc(64, fh_size * sizeof(double));
cap = fh_size;
}
double *fh = fh_buf;
if (!fh) return;
symmetry_bd(ord, ex, f, fh, SoA);
const double Sdxdx = ONE / (dX * dX);
const double Sdydy = ONE / (dY * dY);
const double Sdzdz = ONE / (dZ * dZ);
const double Fdxdx = F1o12 / (dX * dX);
const double Fdydy = F1o12 / (dY * dY);
const double Fdzdz = F1o12 / (dZ * dZ);
const double Sdxdy = F1o4 / (dX * dY);
const double Sdxdz = F1o4 / (dX * dZ);
const double Sdydz = F1o4 / (dY * dZ);
const double Fdxdy = F1o144 / (dX * dY);
const double Fdxdz = F1o144 / (dX * dZ);
const double Fdydz = F1o144 / (dY * dZ);
/* zero high-boundary faces (points the loops below won't cover) */
for (int j0 = 0; j0 < ex2; ++j0)
for (int i0 = 0; i0 < ex1; ++i0) {
const size_t p = idx_ex(i0, j0, ex3 - 1, ex);
fxx[p]=ZEO; fyy[p]=ZEO; fzz[p]=ZEO;
fxy[p]=ZEO; fxz[p]=ZEO; fyz[p]=ZEO;
}
for (int k0 = 0; k0 < ex3 - 1; ++k0)
for (int i0 = 0; i0 < ex1; ++i0) {
const size_t p = idx_ex(i0, ex2 - 1, k0, ex);
fxx[p]=ZEO; fyy[p]=ZEO; fzz[p]=ZEO;
fxy[p]=ZEO; fxz[p]=ZEO; fyz[p]=ZEO;
}
for (int k0 = 0; k0 < ex3 - 1; ++k0)
for (int j0 = 0; j0 < ex2 - 1; ++j0) {
const size_t p = idx_ex(ex1 - 1, j0, k0, ex);
fxx[p]=ZEO; fyy[p]=ZEO; fzz[p]=ZEO;
fxy[p]=ZEO; fxz[p]=ZEO; fyz[p]=ZEO;
}
if (kminF == 1) {
for (int j0 = 0; j0 < ex2; ++j0)
for (int i0 = 0; i0 < ex1; ++i0) {
const size_t p = idx_ex(i0, j0, 0, ex);
fxx[p]=ZEO; fyy[p]=ZEO; fzz[p]=ZEO;
fxy[p]=ZEO; fxz[p]=ZEO; fyz[p]=ZEO;
}
}
if (jminF == 1) {
for (int k0 = 0; k0 < ex3; ++k0)
for (int i0 = 0; i0 < ex1; ++i0) {
const size_t p = idx_ex(i0, 0, k0, ex);
fxx[p]=ZEO; fyy[p]=ZEO; fzz[p]=ZEO;
fxy[p]=ZEO; fxz[p]=ZEO; fyz[p]=ZEO;
}
}
if (iminF == 1) {
for (int k0 = 0; k0 < ex3; ++k0)
for (int j0 = 0; j0 < ex2; ++j0) {
const size_t p = idx_ex(0, j0, k0, ex);
fxx[p]=ZEO; fyy[p]=ZEO; fzz[p]=ZEO;
fxy[p]=ZEO; fxz[p]=ZEO; fyz[p]=ZEO;
}
}
const int i2_lo = (iminF > 0) ? iminF : 0;
const int j2_lo = (jminF > 0) ? jminF : 0;
const int k2_lo = (kminF > 0) ? kminF : 0;
const int i2_hi = ex1 - 2;
const int j2_hi = ex2 - 2;
const int k2_hi = ex3 - 2;
const int i4_lo = (iminF + 1 > 0) ? (iminF + 1) : 0;
const int j4_lo = (jminF + 1 > 0) ? (jminF + 1) : 0;
const int k4_lo = (kminF + 1 > 0) ? (kminF + 1) : 0;
const int i4_hi = ex1 - 3;
const int j4_hi = ex2 - 3;
const int k4_hi = ex3 - 3;
const int has4 = (i4_lo <= i4_hi && j4_lo <= j4_hi && k4_lo <= k4_hi);
if (i2_lo <= i2_hi && j2_lo <= j2_hi && k2_lo <= k2_hi) {
for (int k0 = k2_lo; k0 <= k2_hi; ++k0) {
const int kF = k0 + 1;
for (int j0 = j2_lo; j0 <= j2_hi; ++j0) {
const int jF = j0 + 1;
for (int i0 = i2_lo; i0 <= i2_hi; ++i0) {
if (has4 &&
i0 >= i4_lo && i0 <= i4_hi &&
j0 >= j4_lo && j0 <= j4_hi &&
k0 >= k4_lo && k0 <= k4_hi) {
continue;
}
const int iF = i0 + 1;
const size_t p = idx_ex(i0, j0, k0, ex);
fxx[p] = Sdxdx * (
fh[idx_fh_F_ord2(iF - 1, jF, kF, ex)] -
TWO * fh[idx_fh_F_ord2(iF, jF, kF, ex)] +
fh[idx_fh_F_ord2(iF + 1, jF, kF, ex)]);
fyy[p] = Sdydy * (
fh[idx_fh_F_ord2(iF, jF - 1, kF, ex)] -
TWO * fh[idx_fh_F_ord2(iF, jF, kF, ex)] +
fh[idx_fh_F_ord2(iF, jF + 1, kF, ex)]);
fzz[p] = Sdzdz * (
fh[idx_fh_F_ord2(iF, jF, kF - 1, ex)] -
TWO * fh[idx_fh_F_ord2(iF, jF, kF, ex)] +
fh[idx_fh_F_ord2(iF, jF, kF + 1, ex)]);
fxy[p] = Sdxdy * (
fh[idx_fh_F_ord2(iF - 1, jF - 1, kF, ex)] -
fh[idx_fh_F_ord2(iF + 1, jF - 1, kF, ex)] -
fh[idx_fh_F_ord2(iF - 1, jF + 1, kF, ex)] +
fh[idx_fh_F_ord2(iF + 1, jF + 1, kF, ex)]);
fxz[p] = Sdxdz * (
fh[idx_fh_F_ord2(iF - 1, jF, kF - 1, ex)] -
fh[idx_fh_F_ord2(iF + 1, jF, kF - 1, ex)] -
fh[idx_fh_F_ord2(iF - 1, jF, kF + 1, ex)] +
fh[idx_fh_F_ord2(iF + 1, jF, kF + 1, ex)]);
fyz[p] = Sdydz * (
fh[idx_fh_F_ord2(iF, jF - 1, kF - 1, ex)] -
fh[idx_fh_F_ord2(iF, jF + 1, kF - 1, ex)] -
fh[idx_fh_F_ord2(iF, jF - 1, kF + 1, ex)] +
fh[idx_fh_F_ord2(iF, jF + 1, kF + 1, ex)]);
}
}
}
}
if (has4) {
for (int k0 = k4_lo; k0 <= k4_hi; ++k0) {
const int kF = k0 + 1;
for (int j0 = j4_lo; j0 <= j4_hi; ++j0) {
const int jF = j0 + 1;
for (int i0 = i4_lo; i0 <= i4_hi; ++i0) {
const int iF = i0 + 1;
const size_t p = idx_ex(i0, j0, k0, ex);
fxx[p] = Fdxdx * (
-fh[idx_fh_F_ord2(iF - 2, jF, kF, ex)] +
F16 * fh[idx_fh_F_ord2(iF - 1, jF, kF, ex)] -
F30 * fh[idx_fh_F_ord2(iF, jF, kF, ex)] -
fh[idx_fh_F_ord2(iF + 2, jF, kF, ex)] +
F16 * fh[idx_fh_F_ord2(iF + 1, jF, kF, ex)]);
fyy[p] = Fdydy * (
-fh[idx_fh_F_ord2(iF, jF - 2, kF, ex)] +
F16 * fh[idx_fh_F_ord2(iF, jF - 1, kF, ex)] -
F30 * fh[idx_fh_F_ord2(iF, jF, kF, ex)] -
fh[idx_fh_F_ord2(iF, jF + 2, kF, ex)] +
F16 * fh[idx_fh_F_ord2(iF, jF + 1, kF, ex)]);
fzz[p] = Fdzdz * (
-fh[idx_fh_F_ord2(iF, jF, kF - 2, ex)] +
F16 * fh[idx_fh_F_ord2(iF, jF, kF - 1, ex)] -
F30 * fh[idx_fh_F_ord2(iF, jF, kF, ex)] -
fh[idx_fh_F_ord2(iF, jF, kF + 2, ex)] +
F16 * fh[idx_fh_F_ord2(iF, jF, kF + 1, ex)]);
/* fxy: 5x5 outer product */
{
const double t_jm2 = (
fh[idx_fh_F_ord2(iF - 2, jF - 2, kF, ex)]
-F8*fh[idx_fh_F_ord2(iF - 1, jF - 2, kF, ex)]
+F8*fh[idx_fh_F_ord2(iF + 1, jF - 2, kF, ex)]
- fh[idx_fh_F_ord2(iF + 2, jF - 2, kF, ex)]);
const double t_jm1 = (
fh[idx_fh_F_ord2(iF - 2, jF - 1, kF, ex)]
-F8*fh[idx_fh_F_ord2(iF - 1, jF - 1, kF, ex)]
+F8*fh[idx_fh_F_ord2(iF + 1, jF - 1, kF, ex)]
- fh[idx_fh_F_ord2(iF + 2, jF - 1, kF, ex)]);
const double t_jp1 = (
fh[idx_fh_F_ord2(iF - 2, jF + 1, kF, ex)]
-F8*fh[idx_fh_F_ord2(iF - 1, jF + 1, kF, ex)]
+F8*fh[idx_fh_F_ord2(iF + 1, jF + 1, kF, ex)]
- fh[idx_fh_F_ord2(iF + 2, jF + 1, kF, ex)]);
const double t_jp2 = (
fh[idx_fh_F_ord2(iF - 2, jF + 2, kF, ex)]
-F8*fh[idx_fh_F_ord2(iF - 1, jF + 2, kF, ex)]
+F8*fh[idx_fh_F_ord2(iF + 1, jF + 2, kF, ex)]
- fh[idx_fh_F_ord2(iF + 2, jF + 2, kF, ex)]);
fxy[p] = Fdxdy * ( t_jm2 - F8 * t_jm1 + F8 * t_jp1 - t_jp2 );
}
/* fxz: 5x5 outer product */
{
const double t_km2 = (
fh[idx_fh_F_ord2(iF - 2, jF, kF - 2, ex)]
-F8*fh[idx_fh_F_ord2(iF - 1, jF, kF - 2, ex)]
+F8*fh[idx_fh_F_ord2(iF + 1, jF, kF - 2, ex)]
- fh[idx_fh_F_ord2(iF + 2, jF, kF - 2, ex)]);
const double t_km1 = (
fh[idx_fh_F_ord2(iF - 2, jF, kF - 1, ex)]
-F8*fh[idx_fh_F_ord2(iF - 1, jF, kF - 1, ex)]
+F8*fh[idx_fh_F_ord2(iF + 1, jF, kF - 1, ex)]
- fh[idx_fh_F_ord2(iF + 2, jF, kF - 1, ex)]);
const double t_kp1 = (
fh[idx_fh_F_ord2(iF - 2, jF, kF + 1, ex)]
-F8*fh[idx_fh_F_ord2(iF - 1, jF, kF + 1, ex)]
+F8*fh[idx_fh_F_ord2(iF + 1, jF, kF + 1, ex)]
- fh[idx_fh_F_ord2(iF + 2, jF, kF + 1, ex)]);
const double t_kp2 = (
fh[idx_fh_F_ord2(iF - 2, jF, kF + 2, ex)]
-F8*fh[idx_fh_F_ord2(iF - 1, jF, kF + 2, ex)]
+F8*fh[idx_fh_F_ord2(iF + 1, jF, kF + 2, ex)]
- fh[idx_fh_F_ord2(iF + 2, jF, kF + 2, ex)]);
fxz[p] = Fdxdz * ( t_km2 - F8 * t_km1 + F8 * t_kp1 - t_kp2 );
}
/* fyz: 5x5 outer product */
{
const double t_km2 = (
fh[idx_fh_F_ord2(iF, jF - 2, kF - 2, ex)]
-F8*fh[idx_fh_F_ord2(iF, jF - 1, kF - 2, ex)]
+F8*fh[idx_fh_F_ord2(iF, jF + 1, kF - 2, ex)]
- fh[idx_fh_F_ord2(iF, jF + 2, kF - 2, ex)]);
const double t_km1 = (
fh[idx_fh_F_ord2(iF, jF - 2, kF - 1, ex)]
-F8*fh[idx_fh_F_ord2(iF, jF - 1, kF - 1, ex)]
+F8*fh[idx_fh_F_ord2(iF, jF + 1, kF - 1, ex)]
- fh[idx_fh_F_ord2(iF, jF + 2, kF - 1, ex)]);
const double t_kp1 = (
fh[idx_fh_F_ord2(iF, jF - 2, kF + 1, ex)]
-F8*fh[idx_fh_F_ord2(iF, jF - 1, kF + 1, ex)]
+F8*fh[idx_fh_F_ord2(iF, jF + 1, kF + 1, ex)]
- fh[idx_fh_F_ord2(iF, jF + 2, kF + 1, ex)]);
const double t_kp2 = (
fh[idx_fh_F_ord2(iF, jF - 2, kF + 2, ex)]
-F8*fh[idx_fh_F_ord2(iF, jF - 1, kF + 2, ex)]
+F8*fh[idx_fh_F_ord2(iF, jF + 1, kF + 2, ex)]
- fh[idx_fh_F_ord2(iF, jF + 2, kF + 2, ex)]);
fyz[p] = Fdydz * ( t_km2 - F8 * t_km1 + F8 * t_kp1 - t_kp2 );
}
}
}
}
}
return;
}
#elif (ghost_width == 4)
/* ---- 6th-order ----------------------------------------------------- */
{
const int ord = 3;
int iminF = 1, jminF = 1, kminF = 1;
if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -2;
if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -2;
if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -2;
const double SoA[3] = { SYM1, SYM2, SYM3 };
const size_t nx = (size_t)ex1 + ord;
const size_t ny = (size_t)ex2 + ord;
const size_t nz = (size_t)ex3 + ord;
const size_t fh_size = nx * ny * nz;
static double *fh_buf = NULL;
static size_t cap = 0;
if (fh_size > cap) {
free(fh_buf);
fh_buf = (double*)aligned_alloc(64, fh_size * sizeof(double));
cap = fh_size;
}
double *fh = fh_buf;
if (!fh) return;
symmetry_bd(ord, ex, f, fh, SoA);
/* Denominators */
const double Sdxdx = ONE / (dX * dX); // 2nd
const double Sdydy = ONE / (dY * dY);
const double Sdzdz = ONE / (dZ * dZ);
const double Fdxdx = F1o12 / (dX * dX); // 4th
const double Fdydy = F1o12 / (dY * dY);
const double Fdzdz = F1o12 / (dZ * dZ);
const double Xdxdx = F1o180 / (dX * dX); // 6th
const double Xdydy = F1o180 / (dY * dY);
const double Xdzdz = F1o180 / (dZ * dZ);
const double Sdxdy = F1o4 / (dX * dY);
const double Sdxdz = F1o4 / (dX * dZ);
const double Sdydz = F1o4 / (dY * dZ);
const double Fdxdy = F1o144 / (dX * dY);
const double Fdxdz = F1o144 / (dX * dZ);
const double Fdydz = F1o144 / (dY * dZ);
const double Xdxdy = F1o3600 / (dX * dY);
const double Xdxdz = F1o3600 / (dX * dZ);
const double Xdydz = F1o3600 / (dY * dZ);
/* zero everything first */
const size_t all = (size_t)ex1 * (size_t)ex2 * (size_t)ex3;
for (size_t p = 0; p < all; ++p) {
fxx[p]=ZEO; fyy[p]=ZEO; fzz[p]=ZEO;
fxy[p]=ZEO; fxz[p]=ZEO; fyz[p]=ZEO;
}
/* loop bounds for each pass (from widest to narrowest) */
const int i2_lo = (iminF > 0) ? iminF : 0;
const int j2_lo = (jminF > 0) ? jminF : 0;
const int k2_lo = (kminF > 0) ? kminF : 0;
const int i2_hi = ex1 - 2, j2_hi = ex2 - 2, k2_hi = ex3 - 2;
const int i4_lo = (iminF + 1 > 0) ? (iminF + 1) : 0;
const int j4_lo = (jminF + 1 > 0) ? (jminF + 1) : 0;
const int k4_lo = (kminF + 1 > 0) ? (kminF + 1) : 0;
const int i4_hi = ex1 - 3, j4_hi = ex2 - 3, k4_hi = ex3 - 3;
const int i6_lo = (iminF + 2 > 0) ? (iminF + 2) : 0;
const int j6_lo = (jminF + 2 > 0) ? (jminF + 2) : 0;
const int k6_lo = (kminF + 2 > 0) ? (kminF + 2) : 0;
const int i6_hi = ex1 - 4, j6_hi = ex2 - 4, k6_hi = ex3 - 4;
const int has4 = (i4_lo <= i4_hi && j4_lo <= j4_hi && k4_lo <= k4_hi);
const int has6 = (i6_lo <= i6_hi && j6_lo <= j6_hi && k6_lo <= k6_hi);
/* 2nd-order: skip 4th+6th overlap */
if (i2_lo <= i2_hi && j2_lo <= j2_hi && k2_lo <= k2_hi) {
for (int k0 = k2_lo; k0 <= k2_hi; ++k0) {
const int kF = k0 + 1;
for (int j0 = j2_lo; j0 <= j2_hi; ++j0) {
const int jF = j0 + 1;
for (int i0 = i2_lo; i0 <= i2_hi; ++i0) {
_Bool in4 = has4 && i0>=i4_lo && i0<=i4_hi && j0>=j4_lo && j0<=j4_hi && k0>=k4_lo && k0<=k4_hi;
if (in4) continue;
const int iF = i0 + 1;
const size_t p = idx_ex(i0, j0, k0, ex);
fxx[p] = Sdxdx * (fh[idx_fh_F(iF - 1, jF, kF, ex)] - TWO*fh[idx_fh_F(iF,jF,kF,ex)] + fh[idx_fh_F(iF + 1, jF, kF, ex)]);
fyy[p] = Sdydy * (fh[idx_fh_F(iF, jF - 1, kF, ex)] - TWO*fh[idx_fh_F(iF,jF,kF,ex)] + fh[idx_fh_F(iF, jF + 1, kF, ex)]);
fzz[p] = Sdzdz * (fh[idx_fh_F(iF, jF, kF - 1, ex)] - TWO*fh[idx_fh_F(iF,jF,kF,ex)] + fh[idx_fh_F(iF, jF, kF + 1, ex)]);
fxy[p] = Sdxdy * (fh[idx_fh_F(iF - 1, jF - 1, kF, ex)] - fh[idx_fh_F(iF + 1, jF - 1, kF, ex)] - fh[idx_fh_F(iF - 1, jF + 1, kF, ex)] + fh[idx_fh_F(iF + 1, jF + 1, kF, ex)]);
fxz[p] = Sdxdz * (fh[idx_fh_F(iF - 1, jF, kF - 1, ex)] - fh[idx_fh_F(iF + 1, jF, kF - 1, ex)] - fh[idx_fh_F(iF - 1, jF, kF + 1, ex)] + fh[idx_fh_F(iF + 1, jF, kF + 1, ex)]);
fyz[p] = Sdydz * (fh[idx_fh_F(iF, jF - 1, kF - 1, ex)] - fh[idx_fh_F(iF, jF + 1, kF - 1, ex)] - fh[idx_fh_F(iF, jF - 1, kF + 1, ex)] + fh[idx_fh_F(iF, jF + 1, kF + 1, ex)]);
}
}
}
}
/* 4th-order: skip 6th overlap */
if (has4) {
for (int k0 = k4_lo; k0 <= k4_hi; ++k0) {
const int kF = k0 + 1;
for (int j0 = j4_lo; j0 <= j4_hi; ++j0) {
const int jF = j0 + 1;
for (int i0 = i4_lo; i0 <= i4_hi; ++i0) {
if (has6 && i0>=i6_lo && i0<=i6_hi && j0>=j6_lo && j0<=j6_hi && k0>=k6_lo && k0<=k6_hi) continue;
const int iF = i0 + 1;
const size_t p = idx_ex(i0, j0, k0, ex);
fxx[p] = Fdxdx * (-fh[idx_fh_F(iF - 2, jF, kF, ex)] + F16*fh[idx_fh_F(iF-1,jF,kF,ex)] - F30*fh[idx_fh_F(iF,jF,kF,ex)] - fh[idx_fh_F(iF+2,jF,kF,ex)] + F16*fh[idx_fh_F(iF+1,jF,kF,ex)]);
fyy[p] = Fdydy * (-fh[idx_fh_F(iF, jF - 2, kF, ex)] + F16*fh[idx_fh_F(iF,jF-1,kF,ex)] - F30*fh[idx_fh_F(iF,jF,kF,ex)] - fh[idx_fh_F(iF,jF+2,kF,ex)] + F16*fh[idx_fh_F(iF,jF+1,kF,ex)]);
fzz[p] = Fdzdz * (-fh[idx_fh_F(iF, jF, kF - 2, ex)] + F16*fh[idx_fh_F(iF,jF,kF-1,ex)] - F30*fh[idx_fh_F(iF,jF,kF,ex)] - fh[idx_fh_F(iF,jF,kF+2,ex)] + F16*fh[idx_fh_F(iF,jF,kF+1,ex)]);
{
const double t_jm2 = (fh[idx_fh_F(iF-2,jF-2,kF,ex)]-F8*fh[idx_fh_F(iF-1,jF-2,kF,ex)]+F8*fh[idx_fh_F(iF+1,jF-2,kF,ex)]-fh[idx_fh_F(iF+2,jF-2,kF,ex)]);
const double t_jm1 = (fh[idx_fh_F(iF-2,jF-1,kF,ex)]-F8*fh[idx_fh_F(iF-1,jF-1,kF,ex)]+F8*fh[idx_fh_F(iF+1,jF-1,kF,ex)]-fh[idx_fh_F(iF+2,jF-1,kF,ex)]);
const double t_jp1 = (fh[idx_fh_F(iF-2,jF+1,kF,ex)]-F8*fh[idx_fh_F(iF-1,jF+1,kF,ex)]+F8*fh[idx_fh_F(iF+1,jF+1,kF,ex)]-fh[idx_fh_F(iF+2,jF+1,kF,ex)]);
const double t_jp2 = (fh[idx_fh_F(iF-2,jF+2,kF,ex)]-F8*fh[idx_fh_F(iF-1,jF+2,kF,ex)]+F8*fh[idx_fh_F(iF+1,jF+2,kF,ex)]-fh[idx_fh_F(iF+2,jF+2,kF,ex)]);
fxy[p] = Fdxdy * (t_jm2 - F8*t_jm1 + F8*t_jp1 - t_jp2);
}
{
const double t_km2 = (fh[idx_fh_F(iF-2,jF,kF-2,ex)]-F8*fh[idx_fh_F(iF-1,jF,kF-2,ex)]+F8*fh[idx_fh_F(iF+1,jF,kF-2,ex)]-fh[idx_fh_F(iF+2,jF,kF-2,ex)]);
const double t_km1 = (fh[idx_fh_F(iF-2,jF,kF-1,ex)]-F8*fh[idx_fh_F(iF-1,jF,kF-1,ex)]+F8*fh[idx_fh_F(iF+1,jF,kF-1,ex)]-fh[idx_fh_F(iF+2,jF,kF-1,ex)]);
const double t_kp1 = (fh[idx_fh_F(iF-2,jF,kF+1,ex)]-F8*fh[idx_fh_F(iF-1,jF,kF+1,ex)]+F8*fh[idx_fh_F(iF+1,jF,kF+1,ex)]-fh[idx_fh_F(iF+2,jF,kF+1,ex)]);
const double t_kp2 = (fh[idx_fh_F(iF-2,jF,kF+2,ex)]-F8*fh[idx_fh_F(iF-1,jF,kF+2,ex)]+F8*fh[idx_fh_F(iF+1,jF,kF+2,ex)]-fh[idx_fh_F(iF+2,jF,kF+2,ex)]);
fxz[p] = Fdxdz * (t_km2 - F8*t_km1 + F8*t_kp1 - t_kp2);
}
{
const double t_km2 = (fh[idx_fh_F(iF,jF-2,kF-2,ex)]-F8*fh[idx_fh_F(iF,jF-1,kF-2,ex)]+F8*fh[idx_fh_F(iF,jF+1,kF-2,ex)]-fh[idx_fh_F(iF,jF+2,kF-2,ex)]);
const double t_km1 = (fh[idx_fh_F(iF,jF-2,kF-1,ex)]-F8*fh[idx_fh_F(iF,jF-1,kF-1,ex)]+F8*fh[idx_fh_F(iF,jF+1,kF-1,ex)]-fh[idx_fh_F(iF,jF+2,kF-1,ex)]);
const double t_kp1 = (fh[idx_fh_F(iF,jF-2,kF+1,ex)]-F8*fh[idx_fh_F(iF,jF-1,kF+1,ex)]+F8*fh[idx_fh_F(iF,jF+1,kF+1,ex)]-fh[idx_fh_F(iF,jF+2,kF+1,ex)]);
const double t_kp2 = (fh[idx_fh_F(iF,jF-2,kF+2,ex)]-F8*fh[idx_fh_F(iF,jF-1,kF+2,ex)]+F8*fh[idx_fh_F(iF,jF+1,kF+2,ex)]-fh[idx_fh_F(iF,jF+2,kF+2,ex)]);
fyz[p] = Fdydz * (t_km2 - F8*t_km1 + F8*t_kp1 - t_kp2);
}
}
}
}
}
/* 6th-order: interior only */
if (has6) {
for (int k0 = k6_lo; k0 <= k6_hi; ++k0) {
const int kF = k0 + 1;
for (int j0 = j6_lo; j0 <= j6_hi; ++j0) {
const int jF = j0 + 1;
for (int i0 = i6_lo; i0 <= i6_hi; ++i0) {
const int iF = i0 + 1;
const size_t p = idx_ex(i0, j0, k0, ex);
/* Diagonal: [+2,-27,+270,-490,+270,-27,+2] / (180*dx^2) */
fxx[p] = Xdxdx * (
TWO * fh[idx_fh_F(iF - 3, jF, kF, ex)] -
F27 * fh[idx_fh_F(iF - 2, jF, kF, ex)] +
F270 * fh[idx_fh_F(iF - 1, jF, kF, ex)] -
F490 * fh[idx_fh_F(iF, jF, kF, ex)] +
F270 * fh[idx_fh_F(iF + 1, jF, kF, ex)] -
F27 * fh[idx_fh_F(iF + 2, jF, kF, ex)] +
TWO * fh[idx_fh_F(iF + 3, jF, kF, ex)]);
fyy[p] = Xdydy * (
TWO * fh[idx_fh_F(iF, jF - 3, kF, ex)] -
F27 * fh[idx_fh_F(iF, jF - 2, kF, ex)] +
F270 * fh[idx_fh_F(iF, jF - 1, kF, ex)] -
F490 * fh[idx_fh_F(iF, jF, kF, ex)] +
F270 * fh[idx_fh_F(iF, jF + 1, kF, ex)] -
F27 * fh[idx_fh_F(iF, jF + 2, kF, ex)] +
TWO * fh[idx_fh_F(iF, jF + 3, kF, ex)]);
fzz[p] = Xdzdz * (
TWO * fh[idx_fh_F(iF, jF, kF - 3, ex)] -
F27 * fh[idx_fh_F(iF, jF, kF - 2, ex)] +
F270 * fh[idx_fh_F(iF, jF, kF - 1, ex)] -
F490 * fh[idx_fh_F(iF, jF, kF, ex)] +
F270 * fh[idx_fh_F(iF, jF, kF + 1, ex)] -
F27 * fh[idx_fh_F(iF, jF, kF + 2, ex)] +
TWO * fh[idx_fh_F(iF, jF, kF + 3, ex)]);
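/* Mixed: 7x7 outer product. Compute the 1D x-stencil at each y (or z) offset,
then combine those values with the same 1D weights [-1,+9,-45,0,+45,-9,+1] along y (or z).
The overall prefactor is 1/(3600*dx*dy) for fxy and 1/(3600*dx*dz) for fxz. */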
{
// x-stencil: -f(i-3)+9f(i-2)-45f(i-1)+45f(i+1)-9f(i+2)+f(i+3)
// XSTEN6 is a local helper macro that evaluates this x-stencil on the row (JF, KF_DUMMY); it is #undef'd right after use
#define XSTEN6(JF, KF_DUMMY) \
(-fh[idx_fh_F(iF-3,JF,KF_DUMMY,ex)] + F9*fh[idx_fh_F(iF-2,JF,KF_DUMMY,ex)] - F45*fh[idx_fh_F(iF-1,JF,KF_DUMMY,ex)] + F45*fh[idx_fh_F(iF+1,JF,KF_DUMMY,ex)] - F9*fh[idx_fh_F(iF+2,JF,KF_DUMMY,ex)] + fh[idx_fh_F(iF+3,JF,KF_DUMMY,ex)])
fxy[p] = Xdxdy * (
-XSTEN6(jF-3, kF) + F9*XSTEN6(jF-2, kF) - F45*XSTEN6(jF-1, kF) + F45*XSTEN6(jF+1, kF) - F9*XSTEN6(jF+2, kF) + XSTEN6(jF+3, kF));
fxz[p] = Xdxdz * (
-XSTEN6(jF, kF-3) + F9*XSTEN6(jF, kF-2) - F45*XSTEN6(jF, kF-1) + F45*XSTEN6(jF, kF+1) - F9*XSTEN6(jF, kF+2) + XSTEN6(jF, kF+3));
#undef XSTEN6
}
/* fyz: apply 1D y-stencil at each z offset */
{
#define YSTEN6(JF, KF_DUMMY) \
(-fh[idx_fh_F(iF,JF-3,KF_DUMMY,ex)] + F9*fh[idx_fh_F(iF,JF-2,KF_DUMMY,ex)] - F45*fh[idx_fh_F(iF,JF-1,KF_DUMMY,ex)] + F45*fh[idx_fh_F(iF,JF+1,KF_DUMMY,ex)] - F9*fh[idx_fh_F(iF,JF+2,KF_DUMMY,ex)] + fh[idx_fh_F(iF,JF+3,KF_DUMMY,ex)])
fyz[p] = Xdydz * (
-YSTEN6(jF, kF-3) + F9*YSTEN6(jF, kF-2) - F45*YSTEN6(jF, kF-1) + F45*YSTEN6(jF, kF+1) - F9*YSTEN6(jF, kF+2) + YSTEN6(jF, kF+3));
#undef YSTEN6
}
}
}
}
}
return;
}
#elif (ghost_width == 5)
/* ---- 8th-order ----------------------------------------------------- */
{
const int ord = 4;
int iminF = 1, jminF = 1, kminF = 1;
if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -3;
if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -3;
if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -3;
const double SoA[3] = { SYM1, SYM2, SYM3 };
const size_t nx = (size_t)ex1 + ord;
const size_t ny = (size_t)ex2 + ord;
const size_t nz = (size_t)ex3 + ord;
const size_t fh_size = nx * ny * nz;
static double *fh_buf = NULL;
static size_t cap = 0;
if (fh_size > cap) {
free(fh_buf);
fh_buf = (double*)aligned_alloc(64, fh_size * sizeof(double));
cap = fh_size;
}
double *fh = fh_buf;
if (!fh) return;
symmetry_bd(ord, ex, f, fh, SoA);
const double Sdxdx = ONE / (dX * dX);
const double Sdydy = ONE / (dY * dY);
const double Sdzdz = ONE / (dZ * dZ);
const double Fdxdx = F1o12 / (dX * dX);
const double Fdydy = F1o12 / (dY * dY);
const double Fdzdz = F1o12 / (dZ * dZ);
const double Xdxdx = F1o180 / (dX * dX);
const double Xdydy = F1o180 / (dY * dY);
const double Xdzdz = F1o180 / (dZ * dZ);
const double Edxdx = F1o5040 / (dX * dX);
const double Edydy = F1o5040 / (dY * dY);
const double Edzdz = F1o5040 / (dZ * dZ);
const double Sdxdy = F1o4 / (dX * dY);
const double Sdxdz = F1o4 / (dX * dZ);
const double Sdydz = F1o4 / (dY * dZ);
const double Fdxdy = F1o144 / (dX * dY);
const double Fdxdz = F1o144 / (dX * dZ);
const double Fdydz = F1o144 / (dY * dZ);
const double Xdxdy = F1o3600 / (dX * dY);
const double Xdxdz = F1o3600 / (dX * dZ);
const double Xdydz = F1o3600 / (dY * dZ);
const double Edxdy = F1o705600 / (dX * dY);
const double Edxdz = F1o705600 / (dX * dZ);
const double Edydz = F1o705600 / (dY * dZ);
const size_t all = (size_t)ex1 * (size_t)ex2 * (size_t)ex3;
for (size_t p = 0; p < all; ++p) {
fxx[p]=ZEO; fyy[p]=ZEO; fzz[p]=ZEO;
fxy[p]=ZEO; fxz[p]=ZEO; fyz[p]=ZEO;
}
/* Loop bounds for each pass */
const int i2_lo = (iminF > 0) ? iminF : 0;
const int j2_lo = (jminF > 0) ? jminF : 0;
const int k2_lo = (kminF > 0) ? kminF : 0;
const int i2_hi = ex1 - 2, j2_hi = ex2 - 2, k2_hi = ex3 - 2;
const int i4_lo = (iminF + 1 > 0) ? (iminF + 1) : 0;
const int j4_lo = (jminF + 1 > 0) ? (jminF + 1) : 0;
const int k4_lo = (kminF + 1 > 0) ? (kminF + 1) : 0;
const int i4_hi = ex1 - 3, j4_hi = ex2 - 3, k4_hi = ex3 - 3;
const int i6_lo = (iminF + 2 > 0) ? (iminF + 2) : 0;
const int j6_lo = (jminF + 2 > 0) ? (jminF + 2) : 0;
const int k6_lo = (kminF + 2 > 0) ? (kminF + 2) : 0;
const int i6_hi = ex1 - 4, j6_hi = ex2 - 4, k6_hi = ex3 - 4;
const int i8_lo = (iminF + 3 > 0) ? (iminF + 3) : 0;
const int j8_lo = (jminF + 3 > 0) ? (jminF + 3) : 0;
const int k8_lo = (kminF + 3 > 0) ? (kminF + 3) : 0;
const int i8_hi = ex1 - 5, j8_hi = ex2 - 5, k8_hi = ex3 - 5;
const int has4 = (i4_lo <= i4_hi && j4_lo <= j4_hi && k4_lo <= k4_hi);
const int has6 = (i6_lo <= i6_hi && j6_lo <= j6_hi && k6_lo <= k6_hi);
const int has8 = (i8_lo <= i8_hi && j8_lo <= j8_hi && k8_lo <= k8_hi);
/* 2nd-order: skip 4th+6th+8th overlap */
if (i2_lo <= i2_hi && j2_lo <= j2_hi && k2_lo <= k2_hi) {
for (int k0 = k2_lo; k0 <= k2_hi; ++k0) {
const int kF = k0 + 1;
for (int j0 = j2_lo; j0 <= j2_hi; ++j0) {
const int jF = j0 + 1;
for (int i0 = i2_lo; i0 <= i2_hi; ++i0) {
_Bool in4 = has4 && i0>=i4_lo && i0<=i4_hi && j0>=j4_lo && j0<=j4_hi && k0>=k4_lo && k0<=k4_hi;
if (in4) continue;
const int iF = i0 + 1;
const size_t p = idx_ex(i0, j0, k0, ex);
fxx[p] = Sdxdx * (fh[idx_fh_F_ord4(iF-1,jF,kF,ex)] - TWO*fh[idx_fh_F_ord4(iF,jF,kF,ex)] + fh[idx_fh_F_ord4(iF+1,jF,kF,ex)]);
fyy[p] = Sdydy * (fh[idx_fh_F_ord4(iF,jF-1,kF,ex)] - TWO*fh[idx_fh_F_ord4(iF,jF,kF,ex)] + fh[idx_fh_F_ord4(iF,jF+1,kF,ex)]);
fzz[p] = Sdzdz * (fh[idx_fh_F_ord4(iF,jF,kF-1,ex)] - TWO*fh[idx_fh_F_ord4(iF,jF,kF,ex)] + fh[idx_fh_F_ord4(iF,jF,kF+1,ex)]);
fxy[p] = Sdxdy * (fh[idx_fh_F_ord4(iF-1,jF-1,kF,ex)] - fh[idx_fh_F_ord4(iF+1,jF-1,kF,ex)] - fh[idx_fh_F_ord4(iF-1,jF+1,kF,ex)] + fh[idx_fh_F_ord4(iF+1,jF+1,kF,ex)]);
fxz[p] = Sdxdz * (fh[idx_fh_F_ord4(iF-1,jF,kF-1,ex)] - fh[idx_fh_F_ord4(iF+1,jF,kF-1,ex)] - fh[idx_fh_F_ord4(iF-1,jF,kF+1,ex)] + fh[idx_fh_F_ord4(iF+1,jF,kF+1,ex)]);
fyz[p] = Sdydz * (fh[idx_fh_F_ord4(iF,jF-1,kF-1,ex)] - fh[idx_fh_F_ord4(iF,jF+1,kF-1,ex)] - fh[idx_fh_F_ord4(iF,jF-1,kF+1,ex)] + fh[idx_fh_F_ord4(iF,jF+1,kF+1,ex)]);
}
}
}
}
/* 4th-order: skip 6th+8th overlap */
if (has4) {
for (int k0 = k4_lo; k0 <= k4_hi; ++k0) {
const int kF = k0 + 1;
for (int j0 = j4_lo; j0 <= j4_hi; ++j0) {
const int jF = j0 + 1;
for (int i0 = i4_lo; i0 <= i4_hi; ++i0) {
_Bool in6 = has6 && i0>=i6_lo && i0<=i6_hi && j0>=j6_lo && j0<=j6_hi && k0>=k6_lo && k0<=k6_hi;
if (in6) continue;
const int iF = i0 + 1;
const size_t p = idx_ex(i0, j0, k0, ex);
fxx[p] = Fdxdx * (-fh[idx_fh_F_ord4(iF-2,jF,kF,ex)] + F16*fh[idx_fh_F_ord4(iF-1,jF,kF,ex)] - F30*fh[idx_fh_F_ord4(iF,jF,kF,ex)] - fh[idx_fh_F_ord4(iF+2,jF,kF,ex)] + F16*fh[idx_fh_F_ord4(iF+1,jF,kF,ex)]);
fyy[p] = Fdydy * (-fh[idx_fh_F_ord4(iF,jF-2,kF,ex)] + F16*fh[idx_fh_F_ord4(iF,jF-1,kF,ex)] - F30*fh[idx_fh_F_ord4(iF,jF,kF,ex)] - fh[idx_fh_F_ord4(iF,jF+2,kF,ex)] + F16*fh[idx_fh_F_ord4(iF,jF+1,kF,ex)]);
fzz[p] = Fdzdz * (-fh[idx_fh_F_ord4(iF,jF,kF-2,ex)] + F16*fh[idx_fh_F_ord4(iF,jF,kF-1,ex)] - F30*fh[idx_fh_F_ord4(iF,jF,kF,ex)] - fh[idx_fh_F_ord4(iF,jF,kF+2,ex)] + F16*fh[idx_fh_F_ord4(iF,jF,kF+1,ex)]);
{
const double t_jm2 = (fh[idx_fh_F_ord4(iF-2,jF-2,kF,ex)]-F8*fh[idx_fh_F_ord4(iF-1,jF-2,kF,ex)]+F8*fh[idx_fh_F_ord4(iF+1,jF-2,kF,ex)]-fh[idx_fh_F_ord4(iF+2,jF-2,kF,ex)]);
const double t_jm1 = (fh[idx_fh_F_ord4(iF-2,jF-1,kF,ex)]-F8*fh[idx_fh_F_ord4(iF-1,jF-1,kF,ex)]+F8*fh[idx_fh_F_ord4(iF+1,jF-1,kF,ex)]-fh[idx_fh_F_ord4(iF+2,jF-1,kF,ex)]);
const double t_jp1 = (fh[idx_fh_F_ord4(iF-2,jF+1,kF,ex)]-F8*fh[idx_fh_F_ord4(iF-1,jF+1,kF,ex)]+F8*fh[idx_fh_F_ord4(iF+1,jF+1,kF,ex)]-fh[idx_fh_F_ord4(iF+2,jF+1,kF,ex)]);
const double t_jp2 = (fh[idx_fh_F_ord4(iF-2,jF+2,kF,ex)]-F8*fh[idx_fh_F_ord4(iF-1,jF+2,kF,ex)]+F8*fh[idx_fh_F_ord4(iF+1,jF+2,kF,ex)]-fh[idx_fh_F_ord4(iF+2,jF+2,kF,ex)]);
fxy[p] = Fdxdy * (t_jm2 - F8*t_jm1 + F8*t_jp1 - t_jp2);
}
{
const double t_km2 = (fh[idx_fh_F_ord4(iF-2,jF,kF-2,ex)]-F8*fh[idx_fh_F_ord4(iF-1,jF,kF-2,ex)]+F8*fh[idx_fh_F_ord4(iF+1,jF,kF-2,ex)]-fh[idx_fh_F_ord4(iF+2,jF,kF-2,ex)]);
const double t_km1 = (fh[idx_fh_F_ord4(iF-2,jF,kF-1,ex)]-F8*fh[idx_fh_F_ord4(iF-1,jF,kF-1,ex)]+F8*fh[idx_fh_F_ord4(iF+1,jF,kF-1,ex)]-fh[idx_fh_F_ord4(iF+2,jF,kF-1,ex)]);
const double t_kp1 = (fh[idx_fh_F_ord4(iF-2,jF,kF+1,ex)]-F8*fh[idx_fh_F_ord4(iF-1,jF,kF+1,ex)]+F8*fh[idx_fh_F_ord4(iF+1,jF,kF+1,ex)]-fh[idx_fh_F_ord4(iF+2,jF,kF+1,ex)]);
const double t_kp2 = (fh[idx_fh_F_ord4(iF-2,jF,kF+2,ex)]-F8*fh[idx_fh_F_ord4(iF-1,jF,kF+2,ex)]+F8*fh[idx_fh_F_ord4(iF+1,jF,kF+2,ex)]-fh[idx_fh_F_ord4(iF+2,jF,kF+2,ex)]);
fxz[p] = Fdxdz * (t_km2 - F8*t_km1 + F8*t_kp1 - t_kp2);
}
{
const double t_km2 = (fh[idx_fh_F_ord4(iF,jF-2,kF-2,ex)]-F8*fh[idx_fh_F_ord4(iF,jF-1,kF-2,ex)]+F8*fh[idx_fh_F_ord4(iF,jF+1,kF-2,ex)]-fh[idx_fh_F_ord4(iF,jF+2,kF-2,ex)]);
const double t_km1 = (fh[idx_fh_F_ord4(iF,jF-2,kF-1,ex)]-F8*fh[idx_fh_F_ord4(iF,jF-1,kF-1,ex)]+F8*fh[idx_fh_F_ord4(iF,jF+1,kF-1,ex)]-fh[idx_fh_F_ord4(iF,jF+2,kF-1,ex)]);
const double t_kp1 = (fh[idx_fh_F_ord4(iF,jF-2,kF+1,ex)]-F8*fh[idx_fh_F_ord4(iF,jF-1,kF+1,ex)]+F8*fh[idx_fh_F_ord4(iF,jF+1,kF+1,ex)]-fh[idx_fh_F_ord4(iF,jF+2,kF+1,ex)]);
const double t_kp2 = (fh[idx_fh_F_ord4(iF,jF-2,kF+2,ex)]-F8*fh[idx_fh_F_ord4(iF,jF-1,kF+2,ex)]+F8*fh[idx_fh_F_ord4(iF,jF+1,kF+2,ex)]-fh[idx_fh_F_ord4(iF,jF+2,kF+2,ex)]);
fyz[p] = Fdydz * (t_km2 - F8*t_km1 + F8*t_kp1 - t_kp2);
}
}
}
}
}
/* 6th-order: skip 8th overlap */
if (has6) {
for (int k0 = k6_lo; k0 <= k6_hi; ++k0) {
const int kF = k0 + 1;
for (int j0 = j6_lo; j0 <= j6_hi; ++j0) {
const int jF = j0 + 1;
for (int i0 = i6_lo; i0 <= i6_hi; ++i0) {
if (has8 && i0>=i8_lo && i0<=i8_hi && j0>=j8_lo && j0<=j8_hi && k0>=k8_lo && k0<=k8_hi) continue;
const int iF = i0 + 1;
const size_t p = idx_ex(i0, j0, k0, ex);
fxx[p] = Xdxdx * (
TWO * fh[idx_fh_F_ord4(iF-3,jF,kF,ex)] - F27*fh[idx_fh_F_ord4(iF-2,jF,kF,ex)] + F270*fh[idx_fh_F_ord4(iF-1,jF,kF,ex)] - F490*fh[idx_fh_F_ord4(iF,jF,kF,ex)] + F270*fh[idx_fh_F_ord4(iF+1,jF,kF,ex)] - F27*fh[idx_fh_F_ord4(iF+2,jF,kF,ex)] + TWO*fh[idx_fh_F_ord4(iF+3,jF,kF,ex)]);
fyy[p] = Xdydy * (
TWO * fh[idx_fh_F_ord4(iF,jF-3,kF,ex)] - F27*fh[idx_fh_F_ord4(iF,jF-2,kF,ex)] + F270*fh[idx_fh_F_ord4(iF,jF-1,kF,ex)] - F490*fh[idx_fh_F_ord4(iF,jF,kF,ex)] + F270*fh[idx_fh_F_ord4(iF,jF+1,kF,ex)] - F27*fh[idx_fh_F_ord4(iF,jF+2,kF,ex)] + TWO*fh[idx_fh_F_ord4(iF,jF+3,kF,ex)]);
fzz[p] = Xdzdz * (
TWO * fh[idx_fh_F_ord4(iF,jF,kF-3,ex)] - F27*fh[idx_fh_F_ord4(iF,jF,kF-2,ex)] + F270*fh[idx_fh_F_ord4(iF,jF,kF-1,ex)] - F490*fh[idx_fh_F_ord4(iF,jF,kF,ex)] + F270*fh[idx_fh_F_ord4(iF,jF,kF+1,ex)] - F27*fh[idx_fh_F_ord4(iF,jF,kF+2,ex)] + TWO*fh[idx_fh_F_ord4(iF,jF,kF+3,ex)]);
{
#define XSTEN6_8(JF, KF_DUMMY) \
(-fh[idx_fh_F_ord4(iF-3,JF,KF_DUMMY,ex)] + F9*fh[idx_fh_F_ord4(iF-2,JF,KF_DUMMY,ex)] - F45*fh[idx_fh_F_ord4(iF-1,JF,KF_DUMMY,ex)] + F45*fh[idx_fh_F_ord4(iF+1,JF,KF_DUMMY,ex)] - F9*fh[idx_fh_F_ord4(iF+2,JF,KF_DUMMY,ex)] + fh[idx_fh_F_ord4(iF+3,JF,KF_DUMMY,ex)])
fxy[p] = Xdxdy * (
-XSTEN6_8(jF-3,kF) + F9*XSTEN6_8(jF-2,kF) - F45*XSTEN6_8(jF-1,kF) + F45*XSTEN6_8(jF+1,kF) - F9*XSTEN6_8(jF+2,kF) + XSTEN6_8(jF+3,kF));
fxz[p] = Xdxdz * (
-XSTEN6_8(jF,kF-3) + F9*XSTEN6_8(jF,kF-2) - F45*XSTEN6_8(jF,kF-1) + F45*XSTEN6_8(jF,kF+1) - F9*XSTEN6_8(jF,kF+2) + XSTEN6_8(jF,kF+3));
#undef XSTEN6_8
}
{
#define YSTEN6_8(JF, KF_DUMMY) \
(-fh[idx_fh_F_ord4(iF,JF-3,KF_DUMMY,ex)] + F9*fh[idx_fh_F_ord4(iF,JF-2,KF_DUMMY,ex)] - F45*fh[idx_fh_F_ord4(iF,JF-1,KF_DUMMY,ex)] + F45*fh[idx_fh_F_ord4(iF,JF+1,KF_DUMMY,ex)] - F9*fh[idx_fh_F_ord4(iF,JF+2,KF_DUMMY,ex)] + fh[idx_fh_F_ord4(iF,JF+3,KF_DUMMY,ex)])
fyz[p] = Xdydz * (
-YSTEN6_8(jF,kF-3) + F9*YSTEN6_8(jF,kF-2) - F45*YSTEN6_8(jF,kF-1) + F45*YSTEN6_8(jF,kF+1) - F9*YSTEN6_8(jF,kF+2) + YSTEN6_8(jF,kF+3));
#undef YSTEN6_8
}
}
}
}
}
/* 8th-order: interior only */
if (has8) {
for (int k0 = k8_lo; k0 <= k8_hi; ++k0) {
const int kF = k0 + 1;
for (int j0 = j8_lo; j0 <= j8_hi; ++j0) {
const int jF = j0 + 1;
for (int i0 = i8_lo; i0 <= i8_hi; ++i0) {
const int iF = i0 + 1;
const size_t p = idx_ex(i0, j0, k0, ex);
/* Diagonal: [-9,+128,-1008,+8064,-14350,+8064,-1008,+128,-9] / (5040*dx^2) */
fxx[p] = Edxdx * (
-(double)9 * fh[idx_fh_F_ord4(iF - 4, jF, kF, ex)] +
F128 * fh[idx_fh_F_ord4(iF - 3, jF, kF, ex)] -
F1008 * fh[idx_fh_F_ord4(iF - 2, jF, kF, ex)] +
F8064 * fh[idx_fh_F_ord4(iF - 1, jF, kF, ex)] -
F14350* fh[idx_fh_F_ord4(iF, jF, kF, ex)] +
F8064 * fh[idx_fh_F_ord4(iF + 1, jF, kF, ex)] -
F1008 * fh[idx_fh_F_ord4(iF + 2, jF, kF, ex)] +
F128 * fh[idx_fh_F_ord4(iF + 3, jF, kF, ex)] -
(double)9 * fh[idx_fh_F_ord4(iF + 4, jF, kF, ex)]);
fyy[p] = Edydy * (
-(double)9 * fh[idx_fh_F_ord4(iF, jF - 4, kF, ex)] +
F128 * fh[idx_fh_F_ord4(iF, jF - 3, kF, ex)] -
F1008 * fh[idx_fh_F_ord4(iF, jF - 2, kF, ex)] +
F8064 * fh[idx_fh_F_ord4(iF, jF - 1, kF, ex)] -
F14350* fh[idx_fh_F_ord4(iF, jF, kF, ex)] +
F8064 * fh[idx_fh_F_ord4(iF, jF + 1, kF, ex)] -
F1008 * fh[idx_fh_F_ord4(iF, jF + 2, kF, ex)] +
F128 * fh[idx_fh_F_ord4(iF, jF + 3, kF, ex)] -
(double)9 * fh[idx_fh_F_ord4(iF, jF + 4, kF, ex)]);
fzz[p] = Edzdz * (
-(double)9 * fh[idx_fh_F_ord4(iF, jF, kF - 4, ex)] +
F128 * fh[idx_fh_F_ord4(iF, jF, kF - 3, ex)] -
F1008 * fh[idx_fh_F_ord4(iF, jF, kF - 2, ex)] +
F8064 * fh[idx_fh_F_ord4(iF, jF, kF - 1, ex)] -
F14350* fh[idx_fh_F_ord4(iF, jF, kF, ex)] +
F8064 * fh[idx_fh_F_ord4(iF, jF, kF + 1, ex)] -
F1008 * fh[idx_fh_F_ord4(iF, jF, kF + 2, ex)] +
F128 * fh[idx_fh_F_ord4(iF, jF, kF + 3, ex)] -
(double)9 * fh[idx_fh_F_ord4(iF, jF, kF + 4, ex)]);
/* Mixed: 9x9 outer product.
x-stencil: +3*f(i-4)-32*f(i-3)+168*f(i-2)-672*f(i-1)+672*f(i+1)-168*f(i+2)+32*f(i+3)-3*f(i+4)
y/z weights: same [+3,-32,+168,-672,+672,-168,+32,-3] / 705600 */
{
#define XSTEN8(JF, KF_DUMMY) \
(+(double)3*fh[idx_fh_F_ord4(iF-4,JF,KF_DUMMY,ex)] - F32*fh[idx_fh_F_ord4(iF-3,JF,KF_DUMMY,ex)] + F168*fh[idx_fh_F_ord4(iF-2,JF,KF_DUMMY,ex)] - F672*fh[idx_fh_F_ord4(iF-1,JF,KF_DUMMY,ex)] + F672*fh[idx_fh_F_ord4(iF+1,JF,KF_DUMMY,ex)] - F168*fh[idx_fh_F_ord4(iF+2,JF,KF_DUMMY,ex)] + F32*fh[idx_fh_F_ord4(iF+3,JF,KF_DUMMY,ex)] - (double)3*fh[idx_fh_F_ord4(iF+4,JF,KF_DUMMY,ex)])
fxy[p] = Edxdy * (
+(double)3*XSTEN8(jF-4,kF) - F32*XSTEN8(jF-3,kF) + F168*XSTEN8(jF-2,kF) - F672*XSTEN8(jF-1,kF) + F672*XSTEN8(jF+1,kF) - F168*XSTEN8(jF+2,kF) + F32*XSTEN8(jF+3,kF) - (double)3*XSTEN8(jF+4,kF));
fxz[p] = Edxdz * (
+(double)3*XSTEN8(jF,kF-4) - F32*XSTEN8(jF,kF-3) + F168*XSTEN8(jF,kF-2) - F672*XSTEN8(jF,kF-1) + F672*XSTEN8(jF,kF+1) - F168*XSTEN8(jF,kF+2) + F32*XSTEN8(jF,kF+3) - (double)3*XSTEN8(jF,kF+4));
#undef XSTEN8
}
{
#define YSTEN8(JF, KF_DUMMY) \
(+(double)3*fh[idx_fh_F_ord4(iF,JF-4,KF_DUMMY,ex)] - F32*fh[idx_fh_F_ord4(iF,JF-3,KF_DUMMY,ex)] + F168*fh[idx_fh_F_ord4(iF,JF-2,KF_DUMMY,ex)] - F672*fh[idx_fh_F_ord4(iF,JF-1,KF_DUMMY,ex)] + F672*fh[idx_fh_F_ord4(iF,JF+1,KF_DUMMY,ex)] - F168*fh[idx_fh_F_ord4(iF,JF+2,KF_DUMMY,ex)] + F32*fh[idx_fh_F_ord4(iF,JF+3,KF_DUMMY,ex)] - (double)3*fh[idx_fh_F_ord4(iF,JF+4,KF_DUMMY,ex)])
fyz[p] = Edydz * (
+(double)3*YSTEN8(jF,kF-4) - F32*YSTEN8(jF,kF-3) + F168*YSTEN8(jF,kF-2) - F672*YSTEN8(jF,kF-1) + F672*YSTEN8(jF,kF+1) - F168*YSTEN8(jF,kF+2) + F32*YSTEN8(jF,kF+3) - (double)3*YSTEN8(jF,kF+4));
#undef YSTEN8
}
}
}
}
}
return;
}
#else
#error "fdderivs_c.C: unsupported ghost_width (must be 2, 3, 4, or 5)"
#endif
}

View File

@@ -1,321 +0,0 @@
#include "macrodef.h"
#include "share_func.h"
/*
* fdderivs_sh — second derivatives on shell patch in (rho, sigma, R) coords.
* Same stencil coefficients as Cartesian fdderivs. Uses symmetry_stbd.
*/
extern "C" void fdderivs_sh_(const int ex[3],
const double *f,
double *fxx, double *fxy, double *fxz,
double *fyy, double *fyz, double *fzz,
const double *X, const double *Y, const double *Z,
double SYM1, double SYM2, double SYM3,
int Symmetry, int onoff, int sst)
{
(void)SYM3; (void)onoff; (void)sst;
const int NO_SYMM=0, EQ_SYMM=1, OCTANT=2;
const double ZEO=0.0, ONE=1.0, TWO=2.0, F1o4=2.5e-1;
const double F8=8.0, F16=16.0, F30=30.0, F1o12=ONE/12.0, F1o144=ONE/144.0;
const double F9=9.0, F45=45.0, F60=60.0, F27=27.0, F270=270.0, F490=490.0;
const double F1o180=ONE/180.0, F1o3600=ONE/3600.0;
const double F32=32.0, F128=128.0, F168=168.0, F672=672.0, F840=840.0;
const double F1008=1008.0, F8064=8064.0, F14350=14350.0;
const double F1o5040=ONE/5040.0, F1o705600=ONE/705600.0;
const int ex1=ex[0], ex2=ex[1], ex3=ex[2];
const double dX=X[1]-X[0], dY=Y[1]-Y[0], dZ=Z[1]-Z[0];
const int imaxF=ex1, jmaxF=ex2, kmaxF=ex3;
const double SoA[2]={SYM1,SYM2};
#if (ghost_width == 2)
{
const int ord=1;
int iminF=1,jminF=1,kminF=1;
if(Symmetry==OCTANT&&fabs(X[0])<dX)iminF=0;
if(Symmetry==OCTANT&&fabs(Y[0])<dY)jminF=0;
if((sst==2||sst==4)&&fabs(Y[0])<dY)jminF=0;
const size_t nx=(size_t)ex1+2*ord,ny=(size_t)ex2+2*ord,nz=(size_t)ex3,fh_size=nx*ny*nz;
static double *fh_buf=NULL;static size_t cap=0;
if(fh_size>cap){free(fh_buf);fh_buf=(double*)aligned_alloc(64,fh_size*sizeof(double));cap=fh_size;}
double *fh=fh_buf;if(!fh)return;
symmetry_stbd(ord,ex,f,fh,SoA);
const double Sdxdx=ONE/(dX*dX),Sdydy=ONE/(dY*dY),Sdzdz=ONE/(dZ*dZ);
const double Sdxdy=F1o4/(dX*dY),Sdxdz=F1o4/(dX*dZ),Sdydz=F1o4/(dY*dZ);
const size_t all=(size_t)ex1*ex2*ex3;
for(size_t p=0;p<all;++p){fxx[p]=fyy[p]=fzz[p]=ZEO;fxy[p]=fxz[p]=fyz[p]=ZEO;}
const int i2_lo=(iminF>0)?iminF:0,j2_lo=(jminF>0)?jminF:0,k2_lo=1,i2_hi=ex1-2,j2_hi=ex2-2,k2_hi=ex3-2;
#define FH(iF,jF,kF) fh[idx_fh_stbd(iF,jF,kF,ord,ex)]
if(i2_lo<=i2_hi&&j2_lo<=j2_hi&&k2_lo<=k2_hi){
for(int k0=k2_lo;k0<=k2_hi;++k0){const int kF=k0+1;
for(int j0=j2_lo;j0<=j2_hi;++j0){const int jF=j0+1;
for(int i0=i2_lo;i0<=i2_hi;++i0){const int iF=i0+1;const size_t p=idx_ex(i0,j0,k0,ex);
fxx[p]=Sdxdx*(FH(iF-1,jF,kF)-TWO*FH(iF,jF,kF)+FH(iF+1,jF,kF));
fyy[p]=Sdydy*(FH(iF,jF-1,kF)-TWO*FH(iF,jF,kF)+FH(iF,jF+1,kF));
fzz[p]=Sdzdz*(FH(iF,jF,kF-1)-TWO*FH(iF,jF,kF)+FH(iF,jF,kF+1));
fxy[p]=Sdxdy*(FH(iF-1,jF-1,kF)-FH(iF+1,jF-1,kF)-FH(iF-1,jF+1,kF)+FH(iF+1,jF+1,kF));
fxz[p]=Sdxdz*(FH(iF-1,jF,kF-1)-FH(iF+1,jF,kF-1)-FH(iF-1,jF,kF+1)+FH(iF+1,jF,kF+1));
fyz[p]=Sdydz*(FH(iF,jF-1,kF-1)-FH(iF,jF+1,kF-1)-FH(iF,jF-1,kF+1)+FH(iF,jF+1,kF+1));
}}}
}
#undef FH
return;
}
#elif (ghost_width == 3)
{
const int ord=2;
int iminF=1,jminF=1,kminF=1;
if(Symmetry==OCTANT&&fabs(X[0])<dX)iminF=-1;
if(Symmetry==OCTANT&&fabs(Y[0])<dY)jminF=-1;
if((sst==2||sst==4)&&fabs(Y[0])<dY)jminF=-1;
const size_t nx=(size_t)ex1+2*ord,ny=(size_t)ex2+2*ord,nz=(size_t)ex3,fh_size=nx*ny*nz;
static double *fh_buf=NULL;static size_t cap=0;
if(fh_size>cap){free(fh_buf);fh_buf=(double*)aligned_alloc(64,fh_size*sizeof(double));cap=fh_size;}
double *fh=fh_buf;if(!fh)return;
symmetry_stbd(ord,ex,f,fh,SoA);
const double Sdxdx=ONE/(dX*dX),Sdydy=ONE/(dY*dY),Sdzdz=ONE/(dZ*dZ);
const double Fdxdx=F1o12/(dX*dX),Fdydy=F1o12/(dY*dY),Fdzdz=F1o12/(dZ*dZ);
const double Sdxdy=F1o4/(dX*dY),Sdxdz=F1o4/(dX*dZ),Sdydz=F1o4/(dY*dZ);
const double Fdxdy=F1o144/(dX*dY),Fdxdz=F1o144/(dX*dZ),Fdydz=F1o144/(dY*dZ);
const size_t all=(size_t)ex1*ex2*ex3;
for(size_t p=0;p<all;++p){fxx[p]=fyy[p]=fzz[p]=fxy[p]=fxz[p]=fyz[p]=ZEO;}
const int i2_lo=(iminF>0)?iminF:0,j2_lo=(jminF>0)?jminF:0,k2_lo=1,i2_hi=ex1-2,j2_hi=ex2-2,k2_hi=ex3-2;
const int i4_lo=(iminF+1>0)?iminF+1:0,j4_lo=(jminF+1>0)?jminF+1:0,k4_lo=2,i4_hi=ex1-3,j4_hi=ex2-3,k4_hi=ex3-3;
const int has4=(i4_lo<=i4_hi&&j4_lo<=j4_hi&&k4_lo<=k4_hi);
#define FH(iF,jF,kF) fh[idx_fh_stbd(iF,jF,kF,ord,ex)]
if(i2_lo<=i2_hi&&j2_lo<=j2_hi&&k2_lo<=k2_hi){
for(int k0=k2_lo;k0<=k2_hi;++k0){const int kF=k0+1;
for(int j0=j2_lo;j0<=j2_hi;++j0){const int jF=j0+1;
for(int i0=i2_lo;i0<=i2_hi;++i0){
if(has4&&i0>=i4_lo&&i0<=i4_hi&&j0>=j4_lo&&j0<=j4_hi&&k0>=k4_lo&&k0<=k4_hi)continue;
const int iF=i0+1;const size_t p=idx_ex(i0,j0,k0,ex);
fxx[p]=Sdxdx*(FH(iF-1,jF,kF)-TWO*FH(iF,jF,kF)+FH(iF+1,jF,kF));
fyy[p]=Sdydy*(FH(iF,jF-1,kF)-TWO*FH(iF,jF,kF)+FH(iF,jF+1,kF));
fzz[p]=Sdzdz*(FH(iF,jF,kF-1)-TWO*FH(iF,jF,kF)+FH(iF,jF,kF+1));
fxy[p]=Sdxdy*(FH(iF-1,jF-1,kF)-FH(iF+1,jF-1,kF)-FH(iF-1,jF+1,kF)+FH(iF+1,jF+1,kF));
fxz[p]=Sdxdz*(FH(iF-1,jF,kF-1)-FH(iF+1,jF,kF-1)-FH(iF-1,jF,kF+1)+FH(iF+1,jF,kF+1));
fyz[p]=Sdydz*(FH(iF,jF-1,kF-1)-FH(iF,jF+1,kF-1)-FH(iF,jF-1,kF+1)+FH(iF,jF+1,kF+1));
}}}
}
if(has4){
for(int k0=k4_lo;k0<=k4_hi;++k0){const int kF=k0+1;
for(int j0=j4_lo;j0<=j4_hi;++j0){const int jF=j0+1;
for(int i0=i4_lo;i0<=i4_hi;++i0){const int iF=i0+1;const size_t p=idx_ex(i0,j0,k0,ex);
fxx[p]=Fdxdx*(-FH(iF-2,jF,kF)+F16*FH(iF-1,jF,kF)-F30*FH(iF,jF,kF)-FH(iF+2,jF,kF)+F16*FH(iF+1,jF,kF));
fyy[p]=Fdydy*(-FH(iF,jF-2,kF)+F16*FH(iF,jF-1,kF)-F30*FH(iF,jF,kF)-FH(iF,jF+2,kF)+F16*FH(iF,jF+1,kF));
fzz[p]=Fdzdz*(-FH(iF,jF,kF-2)+F16*FH(iF,jF,kF-1)-F30*FH(iF,jF,kF)-FH(iF,jF,kF+2)+F16*FH(iF,jF,kF+1));
{const double t_jm2=(FH(iF-2,jF-2,kF)-F8*FH(iF-1,jF-2,kF)+F8*FH(iF+1,jF-2,kF)-FH(iF+2,jF-2,kF));
const double t_jm1=(FH(iF-2,jF-1,kF)-F8*FH(iF-1,jF-1,kF)+F8*FH(iF+1,jF-1,kF)-FH(iF+2,jF-1,kF));
const double t_jp1=(FH(iF-2,jF+1,kF)-F8*FH(iF-1,jF+1,kF)+F8*FH(iF+1,jF+1,kF)-FH(iF+2,jF+1,kF));
const double t_jp2=(FH(iF-2,jF+2,kF)-F8*FH(iF-1,jF+2,kF)+F8*FH(iF+1,jF+2,kF)-FH(iF+2,jF+2,kF));
fxy[p]=Fdxdy*(t_jm2-F8*t_jm1+F8*t_jp1-t_jp2);}
{const double t_km2=(FH(iF-2,jF,kF-2)-F8*FH(iF-1,jF,kF-2)+F8*FH(iF+1,jF,kF-2)-FH(iF+2,jF,kF-2));
const double t_km1=(FH(iF-2,jF,kF-1)-F8*FH(iF-1,jF,kF-1)+F8*FH(iF+1,jF,kF-1)-FH(iF+2,jF,kF-1));
const double t_kp1=(FH(iF-2,jF,kF+1)-F8*FH(iF-1,jF,kF+1)+F8*FH(iF+1,jF,kF+1)-FH(iF+2,jF,kF+1));
const double t_kp2=(FH(iF-2,jF,kF+2)-F8*FH(iF-1,jF,kF+2)+F8*FH(iF+1,jF,kF+2)-FH(iF+2,jF,kF+2));
fxz[p]=Fdxdz*(t_km2-F8*t_km1+F8*t_kp1-t_kp2);}
{const double t_km2=(FH(iF,jF-2,kF-2)-F8*FH(iF,jF-1,kF-2)+F8*FH(iF,jF+1,kF-2)-FH(iF,jF+2,kF-2));
const double t_km1=(FH(iF,jF-2,kF-1)-F8*FH(iF,jF-1,kF-1)+F8*FH(iF,jF+1,kF-1)-FH(iF,jF+2,kF-1));
const double t_kp1=(FH(iF,jF-2,kF+1)-F8*FH(iF,jF-1,kF+1)+F8*FH(iF,jF+1,kF+1)-FH(iF,jF+2,kF+1));
const double t_kp2=(FH(iF,jF-2,kF+2)-F8*FH(iF,jF-1,kF+2)+F8*FH(iF,jF+1,kF+2)-FH(iF,jF+2,kF+2));
fyz[p]=Fdydz*(t_km2-F8*t_km1+F8*t_kp1-t_kp2);}
}}}
}
#undef FH
return;
}
#elif (ghost_width == 4)
{
const int ord=3;
int iminF=1,jminF=1,kminF=1;
if(Symmetry==OCTANT&&fabs(X[0])<dX)iminF=-2;
if(Symmetry==OCTANT&&fabs(Y[0])<dY)jminF=-2;
if((sst==2||sst==4)&&fabs(Y[0])<dY)jminF=-2;
const size_t nx=(size_t)ex1+2*ord,ny=(size_t)ex2+2*ord,nz=(size_t)ex3,fh_size=nx*ny*nz;
static double *fh_buf=NULL;static size_t cap=0;
if(fh_size>cap){free(fh_buf);fh_buf=(double*)aligned_alloc(64,fh_size*sizeof(double));cap=fh_size;}
double *fh=fh_buf;if(!fh)return;
symmetry_stbd(ord,ex,f,fh,SoA);
const double Sdxdx=ONE/(dX*dX),Sdydy=ONE/(dY*dY),Sdzdz=ONE/(dZ*dZ);
const double Fdxdx=F1o12/(dX*dX),Fdydy=F1o12/(dY*dY),Fdzdz=F1o12/(dZ*dZ);
const double Xdxdx=F1o180/(dX*dX),Xdydy=F1o180/(dY*dY),Xdzdz=F1o180/(dZ*dZ);
const double Sdxdy=F1o4/(dX*dY),Sdxdz=F1o4/(dX*dZ),Sdydz=F1o4/(dY*dZ);
const double Fdxdy=F1o144/(dX*dY),Fdxdz=F1o144/(dX*dZ),Fdydz=F1o144/(dY*dZ);
const double Xdxdy=F1o3600/(dX*dY),Xdxdz=F1o3600/(dX*dZ),Xdydz=F1o3600/(dY*dZ);
const size_t all=(size_t)ex1*ex2*ex3;
for(size_t p=0;p<all;++p){fxx[p]=fyy[p]=fzz[p]=fxy[p]=fxz[p]=fyz[p]=ZEO;}
const int i2_lo=(iminF>0)?iminF:0,j2_lo=(jminF>0)?jminF:0,k2_lo=1,i2_hi=ex1-2,j2_hi=ex2-2,k2_hi=ex3-2;
const int i4_lo=(iminF+1>0)?iminF+1:0,j4_lo=(jminF+1>0)?jminF+1:0,k4_lo=2,i4_hi=ex1-3,j4_hi=ex2-3,k4_hi=ex3-3;
const int i6_lo=(iminF+2>0)?iminF+2:0,j6_lo=(jminF+2>0)?jminF+2:0,k6_lo=3,i6_hi=ex1-4,j6_hi=ex2-4,k6_hi=ex3-4;
const int has4=(i4_lo<=i4_hi&&j4_lo<=j4_hi&&k4_lo<=k4_hi),has6=(i6_lo<=i6_hi&&j6_lo<=j6_hi&&k6_lo<=k6_hi);
#define FH(iF,jF,kF) fh[idx_fh_stbd(iF,jF,kF,ord,ex)]
if(i2_lo<=i2_hi&&j2_lo<=j2_hi&&k2_lo<=k2_hi){for(int k0=k2_lo;k0<=k2_hi;++k0){const int kF=k0+1;
for(int j0=j2_lo;j0<=j2_hi;++j0){const int jF=j0+1;
for(int i0=i2_lo;i0<=i2_hi;++i0){_Bool in4=has4&&i0>=i4_lo&&i0<=i4_hi&&j0>=j4_lo&&j0<=j4_hi&&k0>=k4_lo&&k0<=k4_hi;if(in4)continue;
const int iF=i0+1;const size_t p=idx_ex(i0,j0,k0,ex);
fxx[p]=Sdxdx*(FH(iF-1,jF,kF)-TWO*FH(iF,jF,kF)+FH(iF+1,jF,kF));
fyy[p]=Sdydy*(FH(iF,jF-1,kF)-TWO*FH(iF,jF,kF)+FH(iF,jF+1,kF));
fzz[p]=Sdzdz*(FH(iF,jF,kF-1)-TWO*FH(iF,jF,kF)+FH(iF,jF,kF+1));
fxy[p]=Sdxdy*(FH(iF-1,jF-1,kF)-FH(iF+1,jF-1,kF)-FH(iF-1,jF+1,kF)+FH(iF+1,jF+1,kF));
fxz[p]=Sdxdz*(FH(iF-1,jF,kF-1)-FH(iF+1,jF,kF-1)-FH(iF-1,jF,kF+1)+FH(iF+1,jF,kF+1));
fyz[p]=Sdydz*(FH(iF,jF-1,kF-1)-FH(iF,jF+1,kF-1)-FH(iF,jF-1,kF+1)+FH(iF,jF+1,kF+1));
}}}}
if(has4){for(int k0=k4_lo;k0<=k4_hi;++k0){const int kF=k0+1;
for(int j0=j4_lo;j0<=j4_hi;++j0){const int jF=j0+1;
for(int i0=i4_lo;i0<=i4_hi;++i0){if(has6&&i0>=i6_lo&&i0<=i6_hi&&j0>=j6_lo&&j0<=j6_hi&&k0>=k6_lo&&k0<=k6_hi)continue;
const int iF=i0+1;const size_t p=idx_ex(i0,j0,k0,ex);
fxx[p]=Fdxdx*(-FH(iF-2,jF,kF)+F16*FH(iF-1,jF,kF)-F30*FH(iF,jF,kF)-FH(iF+2,jF,kF)+F16*FH(iF+1,jF,kF));
fyy[p]=Fdydy*(-FH(iF,jF-2,kF)+F16*FH(iF,jF-1,kF)-F30*FH(iF,jF,kF)-FH(iF,jF+2,kF)+F16*FH(iF,jF+1,kF));
fzz[p]=Fdzdz*(-FH(iF,jF,kF-2)+F16*FH(iF,jF,kF-1)-F30*FH(iF,jF,kF)-FH(iF,jF,kF+2)+F16*FH(iF,jF,kF+1));
{const double t_jm2=(FH(iF-2,jF-2,kF)-F8*FH(iF-1,jF-2,kF)+F8*FH(iF+1,jF-2,kF)-FH(iF+2,jF-2,kF));
const double t_jm1=(FH(iF-2,jF-1,kF)-F8*FH(iF-1,jF-1,kF)+F8*FH(iF+1,jF-1,kF)-FH(iF+2,jF-1,kF));
const double t_jp1=(FH(iF-2,jF+1,kF)-F8*FH(iF-1,jF+1,kF)+F8*FH(iF+1,jF+1,kF)-FH(iF+2,jF+1,kF));
const double t_jp2=(FH(iF-2,jF+2,kF)-F8*FH(iF-1,jF+2,kF)+F8*FH(iF+1,jF+2,kF)-FH(iF+2,jF+2,kF));
fxy[p]=Fdxdy*(t_jm2-F8*t_jm1+F8*t_jp1-t_jp2);}
{const double t_km2=(FH(iF-2,jF,kF-2)-F8*FH(iF-1,jF,kF-2)+F8*FH(iF+1,jF,kF-2)-FH(iF+2,jF,kF-2));
const double t_km1=(FH(iF-2,jF,kF-1)-F8*FH(iF-1,jF,kF-1)+F8*FH(iF+1,jF,kF-1)-FH(iF+2,jF,kF-1));
const double t_kp1=(FH(iF-2,jF,kF+1)-F8*FH(iF-1,jF,kF+1)+F8*FH(iF+1,jF,kF+1)-FH(iF+2,jF,kF+1));
const double t_kp2=(FH(iF-2,jF,kF+2)-F8*FH(iF-1,jF,kF+2)+F8*FH(iF+1,jF,kF+2)-FH(iF+2,jF,kF+2));
fxz[p]=Fdxdz*(t_km2-F8*t_km1+F8*t_kp1-t_kp2);}
{const double t_km2=(FH(iF,jF-2,kF-2)-F8*FH(iF,jF-1,kF-2)+F8*FH(iF,jF+1,kF-2)-FH(iF,jF+2,kF-2));
const double t_km1=(FH(iF,jF-2,kF-1)-F8*FH(iF,jF-1,kF-1)+F8*FH(iF,jF+1,kF-1)-FH(iF,jF+2,kF-1));
const double t_kp1=(FH(iF,jF-2,kF+1)-F8*FH(iF,jF-1,kF+1)+F8*FH(iF,jF+1,kF+1)-FH(iF,jF+2,kF+1));
const double t_kp2=(FH(iF,jF-2,kF+2)-F8*FH(iF,jF-1,kF+2)+F8*FH(iF,jF+1,kF+2)-FH(iF,jF+2,kF+2));
fyz[p]=Fdydz*(t_km2-F8*t_km1+F8*t_kp1-t_kp2);}
}}}}
if(has6){for(int k0=k6_lo;k0<=k6_hi;++k0){const int kF=k0+1;
for(int j0=j6_lo;j0<=j6_hi;++j0){const int jF=j0+1;
for(int i0=i6_lo;i0<=i6_hi;++i0){const int iF=i0+1;const size_t p=idx_ex(i0,j0,k0,ex);
fxx[p]=Xdxdx*(TWO*FH(iF-3,jF,kF)-F27*FH(iF-2,jF,kF)+F270*FH(iF-1,jF,kF)-F490*FH(iF,jF,kF)+F270*FH(iF+1,jF,kF)-F27*FH(iF+2,jF,kF)+TWO*FH(iF+3,jF,kF));
fyy[p]=Xdydy*(TWO*FH(iF,jF-3,kF)-F27*FH(iF,jF-2,kF)+F270*FH(iF,jF-1,kF)-F490*FH(iF,jF,kF)+F270*FH(iF,jF+1,kF)-F27*FH(iF,jF+2,kF)+TWO*FH(iF,jF+3,kF));
fzz[p]=Xdzdz*(TWO*FH(iF,jF,kF-3)-F27*FH(iF,jF,kF-2)+F270*FH(iF,jF,kF-1)-F490*FH(iF,jF,kF)+F270*FH(iF,jF,kF+1)-F27*FH(iF,jF,kF+2)+TWO*FH(iF,jF,kF+3));
#define XS6(JF,KFDUMMY) (-FH(iF-3,JF,KFDUMMY)+F9*FH(iF-2,JF,KFDUMMY)-F45*FH(iF-1,JF,KFDUMMY)+F45*FH(iF+1,JF,KFDUMMY)-F9*FH(iF+2,JF,KFDUMMY)+FH(iF+3,JF,KFDUMMY))
fxy[p]=Xdxdy*(-XS6(jF-3,kF)+F9*XS6(jF-2,kF)-F45*XS6(jF-1,kF)+F45*XS6(jF+1,kF)-F9*XS6(jF+2,kF)+XS6(jF+3,kF));
fxz[p]=Xdxdz*(-XS6(jF,kF-3)+F9*XS6(jF,kF-2)-F45*XS6(jF,kF-1)+F45*XS6(jF,kF+1)-F9*XS6(jF,kF+2)+XS6(jF,kF+3));
#undef XS6
#define YS6(JF,KFDUMMY) (-FH(iF,JF-3,KFDUMMY)+F9*FH(iF,JF-2,KFDUMMY)-F45*FH(iF,JF-1,KFDUMMY)+F45*FH(iF,JF+1,KFDUMMY)-F9*FH(iF,JF+2,KFDUMMY)+FH(iF,JF+3,KFDUMMY))
fyz[p]=Xdydz*(-YS6(jF,kF-3)+F9*YS6(jF,kF-2)-F45*YS6(jF,kF-1)+F45*YS6(jF,kF+1)-F9*YS6(jF,kF+2)+YS6(jF,kF+3));
#undef YS6
}}}}
#undef FH
return;
}
#elif (ghost_width == 5)
{
/* 8th-order shell second derivatives — inherits 8th-order stencil coeffs from Cartesian */
const int ord=4;
int iminF=1,jminF=1,kminF=1;
if(Symmetry==OCTANT&&fabs(X[0])<dX)iminF=-3;
if(Symmetry==OCTANT&&fabs(Y[0])<dY)jminF=-3;
if((sst==2||sst==4)&&fabs(Y[0])<dY)jminF=-3;
const size_t nx=(size_t)ex1+2*ord,ny=(size_t)ex2+2*ord,nz=(size_t)ex3,fh_size=nx*ny*nz;
static double *fh_buf=NULL;static size_t cap=0;
if(fh_size>cap){free(fh_buf);fh_buf=(double*)aligned_alloc(64,fh_size*sizeof(double));cap=fh_size;}
double *fh=fh_buf;if(!fh)return;
symmetry_stbd(ord,ex,f,fh,SoA);
const double Sdxdx=ONE/(dX*dX),Sdydy=ONE/(dY*dY),Sdzdz=ONE/(dZ*dZ);
const double Fdxdx=F1o12/(dX*dX),Fdydy=F1o12/(dY*dY),Fdzdz=F1o12/(dZ*dZ);
const double Xdxdx=F1o180/(dX*dX),Xdydy=F1o180/(dY*dY),Xdzdz=F1o180/(dZ*dZ);
const double Edxdx=F1o5040/(dX*dX),Edydy=F1o5040/(dY*dY),Edzdz=F1o5040/(dZ*dZ);
const double Sdxdy=F1o4/(dX*dY),Sdxdz=F1o4/(dX*dZ),Sdydz=F1o4/(dY*dZ);
const double Fdxdy=F1o144/(dX*dY),Fdxdz=F1o144/(dX*dZ),Fdydz=F1o144/(dY*dZ);
const double Xdxdy=F1o3600/(dX*dY),Xdxdz=F1o3600/(dX*dZ),Xdydz=F1o3600/(dY*dZ);
const double Edxdy=F1o705600/(dX*dY),Edxdz=F1o705600/(dX*dZ),Edydz=F1o705600/(dY*dZ);
const size_t all=(size_t)ex1*ex2*ex3;
for(size_t p=0;p<all;++p){fxx[p]=fyy[p]=fzz[p]=fxy[p]=fxz[p]=fyz[p]=ZEO;}
const int i2_lo=(iminF>0)?iminF:0,j2_lo=(jminF>0)?jminF:0,k2_lo=1,i2_hi=ex1-2,j2_hi=ex2-2,k2_hi=ex3-2;
const int i4_lo=(iminF+1>0)?iminF+1:0,j4_lo=(jminF+1>0)?jminF+1:0,k4_lo=2,i4_hi=ex1-3,j4_hi=ex2-3,k4_hi=ex3-3;
const int i6_lo=(iminF+2>0)?iminF+2:0,j6_lo=(jminF+2>0)?jminF+2:0,k6_lo=3,i6_hi=ex1-4,j6_hi=ex2-4,k6_hi=ex3-4;
const int i8_lo=(iminF+3>0)?iminF+3:0,j8_lo=(jminF+3>0)?jminF+3:0,k8_lo=4,i8_hi=ex1-5,j8_hi=ex2-5,k8_hi=ex3-5;
const int has4=(i4_lo<=i4_hi&&j4_lo<=j4_hi&&k4_lo<=k4_hi),has6=(i6_lo<=i6_hi&&j6_lo<=j6_hi&&k6_lo<=k6_hi),has8=(i8_lo<=i8_hi&&j8_lo<=j8_hi&&k8_lo<=k8_hi);
#define FH(iF,jF,kF) fh[idx_fh_stbd(iF,jF,kF,ord,ex)]
/* 2nd-order pass */
if(i2_lo<=i2_hi&&j2_lo<=j2_hi&&k2_lo<=k2_hi){for(int k0=k2_lo;k0<=k2_hi;++k0){const int kF=k0+1;
for(int j0=j2_lo;j0<=j2_hi;++j0){const int jF=j0+1;
for(int i0=i2_lo;i0<=i2_hi;++i0){_Bool in4=has4&&i0>=i4_lo&&i0<=i4_hi&&j0>=j4_lo&&j0<=j4_hi&&k0>=k4_lo&&k0<=k4_hi;if(in4)continue;
const int iF=i0+1;const size_t p=idx_ex(i0,j0,k0,ex);
fxx[p]=Sdxdx*(FH(iF-1,jF,kF)-TWO*FH(iF,jF,kF)+FH(iF+1,jF,kF));
fyy[p]=Sdydy*(FH(iF,jF-1,kF)-TWO*FH(iF,jF,kF)+FH(iF,jF+1,kF));
fzz[p]=Sdzdz*(FH(iF,jF,kF-1)-TWO*FH(iF,jF,kF)+FH(iF,jF,kF+1));
fxy[p]=Sdxdy*(FH(iF-1,jF-1,kF)-FH(iF+1,jF-1,kF)-FH(iF-1,jF+1,kF)+FH(iF+1,jF+1,kF));
fxz[p]=Sdxdz*(FH(iF-1,jF,kF-1)-FH(iF+1,jF,kF-1)-FH(iF-1,jF,kF+1)+FH(iF+1,jF,kF+1));
fyz[p]=Sdydz*(FH(iF,jF-1,kF-1)-FH(iF,jF+1,kF-1)-FH(iF,jF-1,kF+1)+FH(iF,jF+1,kF+1));
}}}}
/* 4th-order pass */
if(has4){for(int k0=k4_lo;k0<=k4_hi;++k0){const int kF=k0+1;
for(int j0=j4_lo;j0<=j4_hi;++j0){const int jF=j0+1;
for(int i0=i4_lo;i0<=i4_hi;++i0){_Bool in6=has6&&i0>=i6_lo&&i0<=i6_hi&&j0>=j6_lo&&j0<=j6_hi&&k0>=k6_lo&&k0<=k6_hi;if(in6)continue;
const int iF=i0+1;const size_t p=idx_ex(i0,j0,k0,ex);
fxx[p]=Fdxdx*(-FH(iF-2,jF,kF)+F16*FH(iF-1,jF,kF)-F30*FH(iF,jF,kF)-FH(iF+2,jF,kF)+F16*FH(iF+1,jF,kF));
fyy[p]=Fdydy*(-FH(iF,jF-2,kF)+F16*FH(iF,jF-1,kF)-F30*FH(iF,jF,kF)-FH(iF,jF+2,kF)+F16*FH(iF,jF+1,kF));
fzz[p]=Fdzdz*(-FH(iF,jF,kF-2)+F16*FH(iF,jF,kF-1)-F30*FH(iF,jF,kF)-FH(iF,jF,kF+2)+F16*FH(iF,jF,kF+1));
{const double t_jm2=(FH(iF-2,jF-2,kF)-F8*FH(iF-1,jF-2,kF)+F8*FH(iF+1,jF-2,kF)-FH(iF+2,jF-2,kF));
const double t_jm1=(FH(iF-2,jF-1,kF)-F8*FH(iF-1,jF-1,kF)+F8*FH(iF+1,jF-1,kF)-FH(iF+2,jF-1,kF));
const double t_jp1=(FH(iF-2,jF+1,kF)-F8*FH(iF-1,jF+1,kF)+F8*FH(iF+1,jF+1,kF)-FH(iF+2,jF+1,kF));
const double t_jp2=(FH(iF-2,jF+2,kF)-F8*FH(iF-1,jF+2,kF)+F8*FH(iF+1,jF+2,kF)-FH(iF+2,jF+2,kF));
fxy[p]=Fdxdy*(t_jm2-F8*t_jm1+F8*t_jp1-t_jp2);}
{const double t_km2=(FH(iF-2,jF,kF-2)-F8*FH(iF-1,jF,kF-2)+F8*FH(iF+1,jF,kF-2)-FH(iF+2,jF,kF-2));
const double t_km1=(FH(iF-2,jF,kF-1)-F8*FH(iF-1,jF,kF-1)+F8*FH(iF+1,jF,kF-1)-FH(iF+2,jF,kF-1));
const double t_kp1=(FH(iF-2,jF,kF+1)-F8*FH(iF-1,jF,kF+1)+F8*FH(iF+1,jF,kF+1)-FH(iF+2,jF,kF+1));
const double t_kp2=(FH(iF-2,jF,kF+2)-F8*FH(iF-1,jF,kF+2)+F8*FH(iF+1,jF,kF+2)-FH(iF+2,jF,kF+2));
fxz[p]=Fdxdz*(t_km2-F8*t_km1+F8*t_kp1-t_kp2);}
{const double t_km2=(FH(iF,jF-2,kF-2)-F8*FH(iF,jF-1,kF-2)+F8*FH(iF,jF+1,kF-2)-FH(iF,jF+2,kF-2));
const double t_km1=(FH(iF,jF-2,kF-1)-F8*FH(iF,jF-1,kF-1)+F8*FH(iF,jF+1,kF-1)-FH(iF,jF+2,kF-1));
const double t_kp1=(FH(iF,jF-2,kF+1)-F8*FH(iF,jF-1,kF+1)+F8*FH(iF,jF+1,kF+1)-FH(iF,jF+2,kF+1));
const double t_kp2=(FH(iF,jF-2,kF+2)-F8*FH(iF,jF-1,kF+2)+F8*FH(iF,jF+1,kF+2)-FH(iF,jF+2,kF+2));
fyz[p]=Fdydz*(t_km2-F8*t_km1+F8*t_kp1-t_kp2);}
}}}}
/* 6th-order pass */
if(has6){for(int k0=k6_lo;k0<=k6_hi;++k0){const int kF=k0+1;
for(int j0=j6_lo;j0<=j6_hi;++j0){const int jF=j0+1;
for(int i0=i6_lo;i0<=i6_hi;++i0){if(has8&&i0>=i8_lo&&i0<=i8_hi&&j0>=j8_lo&&j0<=j8_hi&&k0>=k8_lo&&k0<=k8_hi)continue;
const int iF=i0+1;const size_t p=idx_ex(i0,j0,k0,ex);
fxx[p]=Xdxdx*(TWO*FH(iF-3,jF,kF)-F27*FH(iF-2,jF,kF)+F270*FH(iF-1,jF,kF)-F490*FH(iF,jF,kF)+F270*FH(iF+1,jF,kF)-F27*FH(iF+2,jF,kF)+TWO*FH(iF+3,jF,kF));
fyy[p]=Xdydy*(TWO*FH(iF,jF-3,kF)-F27*FH(iF,jF-2,kF)+F270*FH(iF,jF-1,kF)-F490*FH(iF,jF,kF)+F270*FH(iF,jF+1,kF)-F27*FH(iF,jF+2,kF)+TWO*FH(iF,jF+3,kF));
fzz[p]=Xdzdz*(TWO*FH(iF,jF,kF-3)-F27*FH(iF,jF,kF-2)+F270*FH(iF,jF,kF-1)-F490*FH(iF,jF,kF)+F270*FH(iF,jF,kF+1)-F27*FH(iF,jF,kF+2)+TWO*FH(iF,jF,kF+3));
#define XS6_8(JF,KFDUMMY) (-FH(iF-3,JF,KFDUMMY)+F9*FH(iF-2,JF,KFDUMMY)-F45*FH(iF-1,JF,KFDUMMY)+F45*FH(iF+1,JF,KFDUMMY)-F9*FH(iF+2,JF,KFDUMMY)+FH(iF+3,JF,KFDUMMY))
fxy[p]=Xdxdy*(-XS6_8(jF-3,kF)+F9*XS6_8(jF-2,kF)-F45*XS6_8(jF-1,kF)+F45*XS6_8(jF+1,kF)-F9*XS6_8(jF+2,kF)+XS6_8(jF+3,kF));
fxz[p]=Xdxdz*(-XS6_8(jF,kF-3)+F9*XS6_8(jF,kF-2)-F45*XS6_8(jF,kF-1)+F45*XS6_8(jF,kF+1)-F9*XS6_8(jF,kF+2)+XS6_8(jF,kF+3));
#undef XS6_8
#define YS6_8(JF,KFDUMMY) (-FH(iF,JF-3,KFDUMMY)+F9*FH(iF,JF-2,KFDUMMY)-F45*FH(iF,JF-1,KFDUMMY)+F45*FH(iF,JF+1,KFDUMMY)-F9*FH(iF,JF+2,KFDUMMY)+FH(iF,JF+3,KFDUMMY))
fyz[p]=Xdydz*(-YS6_8(jF,kF-3)+F9*YS6_8(jF,kF-2)-F45*YS6_8(jF,kF-1)+F45*YS6_8(jF,kF+1)-F9*YS6_8(jF,kF+2)+YS6_8(jF,kF+3));
#undef YS6_8
}}}}
/* 8th-order pass */
if(has8){for(int k0=k8_lo;k0<=k8_hi;++k0){const int kF=k0+1;
for(int j0=j8_lo;j0<=j8_hi;++j0){const int jF=j0+1;
for(int i0=i8_lo;i0<=i8_hi;++i0){const int iF=i0+1;const size_t p=idx_ex(i0,j0,k0,ex);
fxx[p]=Edxdx*(-(double)9*FH(iF-4,jF,kF)+F128*FH(iF-3,jF,kF)-F1008*FH(iF-2,jF,kF)+F8064*FH(iF-1,jF,kF)-F14350*FH(iF,jF,kF)+F8064*FH(iF+1,jF,kF)-F1008*FH(iF+2,jF,kF)+F128*FH(iF+3,jF,kF)-(double)9*FH(iF+4,jF,kF));
fyy[p]=Edydy*(-(double)9*FH(iF,jF-4,kF)+F128*FH(iF,jF-3,kF)-F1008*FH(iF,jF-2,kF)+F8064*FH(iF,jF-1,kF)-F14350*FH(iF,jF,kF)+F8064*FH(iF,jF+1,kF)-F1008*FH(iF,jF+2,kF)+F128*FH(iF,jF+3,kF)-(double)9*FH(iF,jF+4,kF));
fzz[p]=Edzdz*(-(double)9*FH(iF,jF,kF-4)+F128*FH(iF,jF,kF-3)-F1008*FH(iF,jF,kF-2)+F8064*FH(iF,jF,kF-1)-F14350*FH(iF,jF,kF)+F8064*FH(iF,jF,kF+1)-F1008*FH(iF,jF,kF+2)+F128*FH(iF,jF,kF+3)-(double)9*FH(iF,jF,kF+4));
#define XS8(JF,KFDUMMY) (+(double)3*FH(iF-4,JF,KFDUMMY)-F32*FH(iF-3,JF,KFDUMMY)+F168*FH(iF-2,JF,KFDUMMY)-F672*FH(iF-1,JF,KFDUMMY)+F672*FH(iF+1,JF,KFDUMMY)-F168*FH(iF+2,JF,KFDUMMY)+F32*FH(iF+3,JF,KFDUMMY)-(double)3*FH(iF+4,JF,KFDUMMY))
fxy[p]=Edxdy*(+(double)3*XS8(jF-4,kF)-F32*XS8(jF-3,kF)+F168*XS8(jF-2,kF)-F672*XS8(jF-1,kF)+F672*XS8(jF+1,kF)-F168*XS8(jF+2,kF)+F32*XS8(jF+3,kF)-(double)3*XS8(jF+4,kF));
fxz[p]=Edxdz*(+(double)3*XS8(jF,kF-4)-F32*XS8(jF,kF-3)+F168*XS8(jF,kF-2)-F672*XS8(jF,kF-1)+F672*XS8(jF,kF+1)-F168*XS8(jF,kF+2)+F32*XS8(jF,kF+3)-(double)3*XS8(jF,kF+4));
#undef XS8
#define YS8(JF,KFDUMMY) (+(double)3*FH(iF,JF-4,KFDUMMY)-F32*FH(iF,JF-3,KFDUMMY)+F168*FH(iF,JF-2,KFDUMMY)-F672*FH(iF,JF-1,KFDUMMY)+F672*FH(iF,JF+1,KFDUMMY)-F168*FH(iF,JF+2,KFDUMMY)+F32*FH(iF,JF+3,KFDUMMY)-(double)3*FH(iF,JF+4,KFDUMMY))
fyz[p]=Edydz*(+(double)3*YS8(jF,kF-4)-F32*YS8(jF,kF-3)+F168*YS8(jF,kF-2)-F672*YS8(jF,kF-1)+F672*YS8(jF,kF+1)-F168*YS8(jF,kF+2)+F32*YS8(jF,kF+3)-(double)3*YS8(jF,kF+4));
#undef YS8
}}}}
#undef FH
return;
}
#else
#error "fdderivs_sh_c.C: unsupported ghost_width"
#endif
}

View File

@@ -1,107 +0,0 @@
#include "macrodef.h"
#include "share_func.h"
#include <cstddef>
/* Forward declarations — Fortran-mangled names from shell C kernels */
extern "C" {
void fderivs_sh_(const int ex[3], const double *f,
double *fx, double *fy, double *fz,
const double *X, const double *Y, const double *Z,
double SYM1, double SYM2, double SYM3,
int Symmetry, int onoff, int sst);
void fdderivs_sh_(const int ex[3], const double *f,
double *fxx, double *fxy, double *fxz,
double *fyy, double *fyz, double *fzz,
const double *X, const double *Y, const double *Z,
double SYM1, double SYM2, double SYM3,
int Symmetry, int onoff, int sst);
/*
 * fdderivs_shc_ -- Cartesian second derivatives of f on a shell patch.
 *
 * First and second derivatives of f with respect to the patch coordinates
 * are obtained from fderivs_sh_ / fdderivs_sh_ and then combined point by
 * point with the supplied coordinate derivatives (drho*, dsigma*, dR*):
 *
 *   f_ab = sum_{p,q} (dp/da)(dq/db) f_pq  +  sum_p (d2p/da db) f_p
 *
 * where p, q run over (rho, sigma, R) and a, b over (x, y, z).
 *
 * Lev is forwarded in the `onoff` slot of the two kernels.  If any scratch
 * allocation fails the routine returns without writing the outputs.
 */
void fdderivs_shc_(int *ex,
double *f,
double *fxx, double *fxy, double *fxz,
double *fyy, double *fyz, double *fzz,
double *crho, double *sigma, double *R,
double &SYM1, double &SYM2, double &SYM3,
int &Symmetry, int &Lev, int &sst,
double *drhodx, double *drhody, double *drhodz,
double *dsigmadx, double *dsigmady, double *dsigmadz,
double *dRdx, double *dRdy, double *dRdz,
double *drhodxx, double *drhodxy, double *drhodxz,
double *drhodyy, double *drhodyz, double *drhodzz,
double *dsigmadxx, double *dsigmadxy, double *dsigmadxz,
double *dsigmadyy, double *dsigmadyz, double *dsigmadzz,
double *dRdxx, double *dRdxy, double *dRdxz,
double *dRdyy, double *dRdyz, double *dRdzz)
{
  const int dims[3] = { ex[0], ex[1], ex[2] };
  const size_t npts = (size_t)ex[0] * (size_t)ex[1] * (size_t)ex[2];

  /* Scratch slots: 0..2 hold first derivatives, 3..8 second derivatives,
     in the order (1, 2, 3, 11, 12, 13, 22, 23, 33) of the patch coords. */
  const int kSlots = 9;
  double *tmp[9];
  int failed = 0;
  for (int s = 0; s < kSlots; ++s) {
    tmp[s] = (double*)malloc(npts * sizeof(double));
    if (!tmp[s]) failed = 1;
  }
  if (failed) {
    for (int s = 0; s < kSlots; ++s) free(tmp[s]);
    return;
  }

  fderivs_sh_(dims, f, tmp[0], tmp[1], tmp[2],
              crho, sigma, R, SYM1, SYM2, SYM3, Symmetry, Lev, sst);
  fdderivs_sh_(dims, f, tmp[3], tmp[4], tmp[5], tmp[6], tmp[7], tmp[8],
               crho, sigma, R, SYM1, SYM2, SYM3, Symmetry, Lev, sst);

  size_t q = 0;
  while (q < npts) {
    /* first derivatives of the patch coordinates */
    const double rho_x = drhodx[q],   rho_y = drhody[q],   rho_z = drhodz[q];
    const double sig_x = dsigmadx[q], sig_y = dsigmady[q], sig_z = dsigmadz[q];
    const double rad_x = dRdx[q],     rad_y = dRdy[q],     rad_z = dRdz[q];
    /* second derivatives of the patch coordinates */
    const double rho_xx = drhodxx[q], rho_xy = drhodxy[q], rho_xz = drhodxz[q];
    const double rho_yy = drhodyy[q], rho_yz = drhodyz[q], rho_zz = drhodzz[q];
    const double sig_xx = dsigmadxx[q], sig_xy = dsigmadxy[q], sig_xz = dsigmadxz[q];
    const double sig_yy = dsigmadyy[q], sig_yz = dsigmadyz[q], sig_zz = dsigmadzz[q];
    const double rad_xx = dRdxx[q], rad_xy = dRdxy[q], rad_xz = dRdxz[q];
    const double rad_yy = dRdyy[q], rad_yz = dRdyz[q], rad_zz = dRdzz[q];
    /* derivatives of f in patch coordinates */
    const double d1 = tmp[0][q], d2 = tmp[1][q], d3 = tmp[2][q];
    const double d11 = tmp[3][q], d12 = tmp[4][q], d13 = tmp[5][q];
    const double d22 = tmp[6][q], d23 = tmp[7][q], d33 = tmp[8][q];

    /* diagonal components */
    fxx[q] = rho_x*rho_x*d11 + sig_x*sig_x*d22 + rad_x*rad_x*d33
           + 2.0*(rho_x*sig_x*d12 + rho_x*rad_x*d13 + sig_x*rad_x*d23)
           + rho_xx*d1 + sig_xx*d2 + rad_xx*d3;
    fyy[q] = rho_y*rho_y*d11 + sig_y*sig_y*d22 + rad_y*rad_y*d33
           + 2.0*(rho_y*sig_y*d12 + rho_y*rad_y*d13 + sig_y*rad_y*d23)
           + rho_yy*d1 + sig_yy*d2 + rad_yy*d3;
    fzz[q] = rho_z*rho_z*d11 + sig_z*sig_z*d22 + rad_z*rad_z*d33
           + 2.0*(rho_z*sig_z*d12 + rho_z*rad_z*d13 + sig_z*rad_z*d23)
           + rho_zz*d1 + sig_zz*d2 + rad_zz*d3;

    /* mixed components */
    fxy[q] = rho_x*rho_y*d11 + sig_x*sig_y*d22 + rad_x*rad_y*d33
           + rho_x*sig_y*d12 + rho_y*sig_x*d12 + rho_x*rad_y*d13 + rho_y*rad_x*d13 + sig_x*rad_y*d23 + sig_y*rad_x*d23
           + rho_xy*d1 + sig_xy*d2 + rad_xy*d3;
    fxz[q] = rho_x*rho_z*d11 + sig_x*sig_z*d22 + rad_x*rad_z*d33
           + rho_x*sig_z*d12 + rho_z*sig_x*d12 + rho_x*rad_z*d13 + rho_z*rad_x*d13 + sig_x*rad_z*d23 + sig_z*rad_x*d23
           + rho_xz*d1 + sig_xz*d2 + rad_xz*d3;
    fyz[q] = rho_y*rho_z*d11 + sig_y*sig_z*d22 + rad_y*rad_z*d33
           + rho_y*sig_z*d12 + rho_z*sig_y*d12 + rho_y*rad_z*d13 + rho_z*rad_y*d13 + sig_y*rad_z*d23 + sig_z*rad_y*d23
           + rho_yz*d1 + sig_yz*d2 + rad_yz*d3;

    ++q;
  }

  for (int s = 0; s < kSlots; ++s) free(tmp[s]);
}
} // extern "C"

View File

@@ -1,616 +0,0 @@
#include "macrodef.h"
#include "tool.h"
/*
 * C port of fderivs -- first derivatives df/dx, df/dy, df/dz.
 *
 * The finite-difference order is selected at compile time via the
 * ghost_width macro:
 *   ghost_width = 2  ->  2nd-order
 *   ghost_width = 3  ->  4th-order
 *   ghost_width = 4  ->  6th-order
 *   ghost_width = 5  ->  8th-order
 *
 * Multi-pass overwrite strategy: the lowest-order stencil is applied first on
 * the widest index box, then each higher-order stencil overwrites the smaller
 * interior box in which it fits.  Points that no stencil reaches keep 0.
 *
 * Arguments:
 *   ex          grid extents (ex[0], ex[1], ex[2]).
 *   f           input field, addressed through idx_ex.
 *   fx, fy, fz  outputs with ex[0]*ex[1]*ex[2] entries; fully overwritten.
 *   X, Y, Z     coordinate arrays.  The spacing is taken from the first two
 *               entries; a first entry closer to zero than one spacing allows
 *               the stencil to reach into the ghost layer built by
 *               symmetry_bd (subject to the Symmetry flag).
 *   SYM1..SYM3  forwarded to symmetry_bd as the SoA factors.
 *   Symmetry    > 0 enables the lower-z extension, > 1 also lower x and y.
 *   onoff       unused.
 *
 * The extended copy of f lives in a function-static scratch buffer that only
 * grows, so the routine is not reentrant.  If the buffer cannot be allocated
 * the function returns without writing the outputs; a later call retries.
 */
void fderivs(const int ex[3],
             const double *f,
             double *fx, double *fy, double *fz,
             const double *X, const double *Y, const double *Z,
             double SYM1, double SYM2, double SYM3,
             int Symmetry, int onoff)
{
  (void)onoff;

  /* Pick the fh index helper that matches the ghost depth (ghost_width - 1). */
#if (ghost_width == 2)
#define FDERIVS_FH(i, j, k) fh[idx_fh_F_ord1((i), (j), (k), ex)]
#elif (ghost_width == 3)
#define FDERIVS_FH(i, j, k) fh[idx_fh_F_ord2((i), (j), (k), ex)]
#elif (ghost_width == 4)
#define FDERIVS_FH(i, j, k) fh[idx_fh_F((i), (j), (k), ex)]
#elif (ghost_width == 5)
#define FDERIVS_FH(i, j, k) fh[idx_fh_F_ord4((i), (j), (k), ex)]
#else
#error "fderivs_c.C: unsupported ghost_width (must be 2, 3, 4, or 5)"
#endif

#ifdef FDERIVS_FH
  const int NO_SYMM = 0, EQ_SYMM = 1;
  const int ord = ghost_width - 1;          /* ghost depth given to symmetry_bd */
  const int ex1 = ex[0], ex2 = ex[1], ex3 = ex[2];

  const double dX = X[1] - X[0];
  const double dY = Y[1] - Y[0];
  const double dZ = Z[1] - Z[0];

  /* Lowest index a stencil leg may touch.  Without symmetry it is 1; on a
   * symmetry plane the ord ghost layers become usable: 1 - ord. */
  const int sym_min = 1 - ord;
  int iminF = 1, jminF = 1, kminF = 1;
  if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = sym_min;
  if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = sym_min;
  if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = sym_min;

  const double SoA[3] = { SYM1, SYM2, SYM3 };

  /* Scratch buffer for the symmetry-extended field: ord extra layers per axis. */
  const size_t nx = (size_t)ex1 + ord;
  const size_t ny = (size_t)ex2 + ord;
  const size_t nz = (size_t)ex3 + ord;
  const size_t fh_size = nx * ny * nz;

  static double *fh_buf = NULL;
  static size_t cap = 0;
  if (fh_size > cap) {
    free(fh_buf);
    /* aligned_alloc wants a size that is a multiple of the alignment. */
    const size_t bytes = (fh_size * sizeof(double) + 63) & ~(size_t)63;
    fh_buf = (double*)aligned_alloc(64, bytes);
    /* Record the capacity only on success so that a later call retries. */
    cap = fh_buf ? fh_size : 0;
  }
  double *fh = fh_buf;
  if (!fh) return;

  symmetry_bd(ord, ex, f, fh, SoA);

  const size_t all = (size_t)ex1 * (size_t)ex2 * (size_t)ex3;
  for (size_t p = 0; p < all; ++p) {
    fx[p] = 0.0; fy[p] = 0.0; fz[p] = 0.0;
  }

  /* In every pass i0/j0/k0 are 0-based output indices and iF = i0 + 1 (etc.)
   * is the index form expected by the idx_fh_F* helpers. */

  /* ---- 2nd-order pass: [-1, 0, +1] / (2*dx), widest box ---------------- */
  {
    const double cx = 1.0 / 2.0 / dX;
    const double cy = 1.0 / 2.0 / dY;
    const double cz = 1.0 / 2.0 / dZ;
    const int i_lo = (iminF > 0) ? iminF : 0;
    const int j_lo = (jminF > 0) ? jminF : 0;
    const int k_lo = (kminF > 0) ? kminF : 0;
    for (int k0 = k_lo; k0 <= ex3 - 2; ++k0) {
      const int kF = k0 + 1;
      for (int j0 = j_lo; j0 <= ex2 - 2; ++j0) {
        const int jF = j0 + 1;
        for (int i0 = i_lo; i0 <= ex1 - 2; ++i0) {
          const int iF = i0 + 1;
          const size_t p = idx_ex(i0, j0, k0, ex);
          fx[p] = cx * (-FDERIVS_FH(iF - 1, jF, kF) + FDERIVS_FH(iF + 1, jF, kF));
          fy[p] = cy * (-FDERIVS_FH(iF, jF - 1, kF) + FDERIVS_FH(iF, jF + 1, kF));
          fz[p] = cz * (-FDERIVS_FH(iF, jF, kF - 1) + FDERIVS_FH(iF, jF, kF + 1));
        }
      }
    }
  }

#if (ghost_width >= 3)
  /* ---- 4th-order overwrite: [+1, -8, 0, +8, -1] / (12*dx) -------------- */
  {
    const double cx = 1.0 / 12.0 / dX;
    const double cy = 1.0 / 12.0 / dY;
    const double cz = 1.0 / 12.0 / dZ;
    const int i_lo = (iminF + 1 > 0) ? (iminF + 1) : 0;
    const int j_lo = (jminF + 1 > 0) ? (jminF + 1) : 0;
    const int k_lo = (kminF + 1 > 0) ? (kminF + 1) : 0;
    for (int k0 = k_lo; k0 <= ex3 - 3; ++k0) {
      const int kF = k0 + 1;
      for (int j0 = j_lo; j0 <= ex2 - 3; ++j0) {
        const int jF = j0 + 1;
        for (int i0 = i_lo; i0 <= ex1 - 3; ++i0) {
          const int iF = i0 + 1;
          const size_t p = idx_ex(i0, j0, k0, ex);
          fx[p] = cx * (
                    FDERIVS_FH(iF - 2, jF, kF) -
              8.0 * FDERIVS_FH(iF - 1, jF, kF) +
              8.0 * FDERIVS_FH(iF + 1, jF, kF) -
                    FDERIVS_FH(iF + 2, jF, kF));
          fy[p] = cy * (
                    FDERIVS_FH(iF, jF - 2, kF) -
              8.0 * FDERIVS_FH(iF, jF - 1, kF) +
              8.0 * FDERIVS_FH(iF, jF + 1, kF) -
                    FDERIVS_FH(iF, jF + 2, kF));
          fz[p] = cz * (
                    FDERIVS_FH(iF, jF, kF - 2) -
              8.0 * FDERIVS_FH(iF, jF, kF - 1) +
              8.0 * FDERIVS_FH(iF, jF, kF + 1) -
                    FDERIVS_FH(iF, jF, kF + 2));
        }
      }
    }
  }
#endif

#if (ghost_width >= 4)
  /* ---- 6th-order overwrite: [-1, +9, -45, 0, +45, -9, +1] / (60*dx) ---- */
  {
    const double cx = 1.0 / 60.0 / dX;
    const double cy = 1.0 / 60.0 / dY;
    const double cz = 1.0 / 60.0 / dZ;
    const int i_lo = (iminF + 2 > 0) ? (iminF + 2) : 0;
    const int j_lo = (jminF + 2 > 0) ? (jminF + 2) : 0;
    const int k_lo = (kminF + 2 > 0) ? (kminF + 2) : 0;
    for (int k0 = k_lo; k0 <= ex3 - 4; ++k0) {
      const int kF = k0 + 1;
      for (int j0 = j_lo; j0 <= ex2 - 4; ++j0) {
        const int jF = j0 + 1;
        for (int i0 = i_lo; i0 <= ex1 - 4; ++i0) {
          const int iF = i0 + 1;
          const size_t p = idx_ex(i0, j0, k0, ex);
          fx[p] = cx * (
                    -FDERIVS_FH(iF - 3, jF, kF) +
              9.0  * FDERIVS_FH(iF - 2, jF, kF) -
              45.0 * FDERIVS_FH(iF - 1, jF, kF) +
              45.0 * FDERIVS_FH(iF + 1, jF, kF) -
              9.0  * FDERIVS_FH(iF + 2, jF, kF) +
                     FDERIVS_FH(iF + 3, jF, kF));
          fy[p] = cy * (
                    -FDERIVS_FH(iF, jF - 3, kF) +
              9.0  * FDERIVS_FH(iF, jF - 2, kF) -
              45.0 * FDERIVS_FH(iF, jF - 1, kF) +
              45.0 * FDERIVS_FH(iF, jF + 1, kF) -
              9.0  * FDERIVS_FH(iF, jF + 2, kF) +
                     FDERIVS_FH(iF, jF + 3, kF));
          fz[p] = cz * (
                    -FDERIVS_FH(iF, jF, kF - 3) +
              9.0  * FDERIVS_FH(iF, jF, kF - 2) -
              45.0 * FDERIVS_FH(iF, jF, kF - 1) +
              45.0 * FDERIVS_FH(iF, jF, kF + 1) -
              9.0  * FDERIVS_FH(iF, jF, kF + 2) +
                     FDERIVS_FH(iF, jF, kF + 3));
        }
      }
    }
  }
#endif

#if (ghost_width >= 5)
  /* ---- 8th-order overwrite:
   *      [+3, -32, +168, -672, 0, +672, -168, +32, -3] / (840*dx) -------- */
  {
    const double cx = 1.0 / 840.0 / dX;
    const double cy = 1.0 / 840.0 / dY;
    const double cz = 1.0 / 840.0 / dZ;
    const int i_lo = (iminF + 3 > 0) ? (iminF + 3) : 0;
    const int j_lo = (jminF + 3 > 0) ? (jminF + 3) : 0;
    const int k_lo = (kminF + 3 > 0) ? (kminF + 3) : 0;
    for (int k0 = k_lo; k0 <= ex3 - 5; ++k0) {
      const int kF = k0 + 1;
      for (int j0 = j_lo; j0 <= ex2 - 5; ++j0) {
        const int jF = j0 + 1;
        for (int i0 = i_lo; i0 <= ex1 - 5; ++i0) {
          const int iF = i0 + 1;
          const size_t p = idx_ex(i0, j0, k0, ex);
          fx[p] = cx * (
              3.0   * FDERIVS_FH(iF - 4, jF, kF) -
              32.0  * FDERIVS_FH(iF - 3, jF, kF) +
              168.0 * FDERIVS_FH(iF - 2, jF, kF) -
              672.0 * FDERIVS_FH(iF - 1, jF, kF) +
              672.0 * FDERIVS_FH(iF + 1, jF, kF) -
              168.0 * FDERIVS_FH(iF + 2, jF, kF) +
              32.0  * FDERIVS_FH(iF + 3, jF, kF) -
              3.0   * FDERIVS_FH(iF + 4, jF, kF));
          fy[p] = cy * (
              3.0   * FDERIVS_FH(iF, jF - 4, kF) -
              32.0  * FDERIVS_FH(iF, jF - 3, kF) +
              168.0 * FDERIVS_FH(iF, jF - 2, kF) -
              672.0 * FDERIVS_FH(iF, jF - 1, kF) +
              672.0 * FDERIVS_FH(iF, jF + 1, kF) -
              168.0 * FDERIVS_FH(iF, jF + 2, kF) +
              32.0  * FDERIVS_FH(iF, jF + 3, kF) -
              3.0   * FDERIVS_FH(iF, jF + 4, kF));
          fz[p] = cz * (
              3.0   * FDERIVS_FH(iF, jF, kF - 4) -
              32.0  * FDERIVS_FH(iF, jF, kF - 3) +
              168.0 * FDERIVS_FH(iF, jF, kF - 2) -
              672.0 * FDERIVS_FH(iF, jF, kF - 1) +
              672.0 * FDERIVS_FH(iF, jF, kF + 1) -
              168.0 * FDERIVS_FH(iF, jF, kF + 2) +
              32.0  * FDERIVS_FH(iF, jF, kF + 3) -
              3.0   * FDERIVS_FH(iF, jF, kF + 4));
        }
      }
    }
  }
#endif
#endif /* FDERIVS_FH */

#undef FDERIVS_FH
}

View File

@@ -1,234 +0,0 @@
#include "macrodef.h"
#include "share_func.h"
/*
 * C port of fderivs_sh -- first derivatives on a shell patch in
 * (rho, sigma, R) coordinates.
 *
 * Same stencil coefficients and multi-pass overwrite scheme as the Cartesian
 * fderivs (order chosen by ghost_width: 2/3/4/5 -> 2nd/4th/6th/8th), but:
 *   - the extended field is built by symmetry_stbd;
 *   - the fh buffer carries ord = ghost_width - 1 extra layers on BOTH sides
 *     of x and y and none in z (size (ex1+2*ord)*(ex2+2*ord)*ex3);
 *   - SoA has two entries only (SYM1, SYM2); SYM3 is ignored;
 *   - in z the stencils always stay inside the patch.
 *
 * Arguments:
 *   ex          grid extents.
 *   f           input field.
 *   fx, fy, fz  outputs with ex[0]*ex[1]*ex[2] entries; fully overwritten
 *               (points no stencil reaches are set to 0).
 *   X, Y, Z     coordinate arrays; spacing is taken from the first two entries.
 *   Symmetry    value 2 (OCTANT) lets x and y stencils reach into the lower
 *               ghost layers when the patch starts within one spacing of 0.
 *   sst         values 2 and 4 enable the same lower-y extension regardless
 *               of Symmetry (NOTE(review): presumably an equatorial
 *               reflection for those shell patches -- confirm with caller).
 *   onoff       unused.
 *
 * The extended copy of f lives in a function-static scratch buffer that only
 * grows, so the routine is not reentrant.  If the buffer cannot be allocated
 * the function returns without writing the outputs; a later call retries.
 */
extern "C" void fderivs_sh_(const int ex[3],
                            const double *f,
                            double *fx, double *fy, double *fz,
                            const double *X, const double *Y, const double *Z,
                            double SYM1, double SYM2, double SYM3,
                            int Symmetry, int onoff, int sst)
{
  (void)SYM3; (void)onoff;

#if (ghost_width == 2) || (ghost_width == 3) || (ghost_width == 4) || (ghost_width == 5)
  const int OCTANT = 2;
  const int ord = ghost_width - 1;          /* ghost depth given to symmetry_stbd */
  const int ex1 = ex[0], ex2 = ex[1], ex3 = ex[2];

  const double dX = X[1] - X[0];
  const double dY = Y[1] - Y[0];
  const double dZ = Z[1] - Z[0];

  const double SoA[2] = { SYM1, SYM2 };

  /* Lowest index a stencil leg may touch in x / y: 1 without symmetry,
   * 1 - ord when the lower ghost layers are usable. */
  const int sym_min = 1 - ord;
  int iminF = 1, jminF = 1;
  if (Symmetry == OCTANT && fabs(X[0]) < dX) iminF = sym_min;
  if (Symmetry == OCTANT && fabs(Y[0]) < dY) jminF = sym_min;
  if ((sst == 2 || sst == 4) && fabs(Y[0]) < dY) jminF = sym_min;

  const size_t nx = (size_t)ex1 + 2 * ord;
  const size_t ny = (size_t)ex2 + 2 * ord;
  const size_t nz = (size_t)ex3;
  const size_t fh_size = nx * ny * nz;

  static double *fh_buf = NULL;
  static size_t cap = 0;
  if (fh_size > cap) {
    free(fh_buf);
    /* aligned_alloc wants a size that is a multiple of the alignment. */
    const size_t bytes = (fh_size * sizeof(double) + 63) & ~(size_t)63;
    fh_buf = (double*)aligned_alloc(64, bytes);
    /* Record the capacity only on success so that a later call retries. */
    cap = fh_buf ? fh_size : 0;
  }
  double *fh = fh_buf;
  if (!fh) return;

  symmetry_stbd(ord, ex, f, fh, SoA);

  const size_t all = (size_t)ex1 * ex2 * ex3;
  for (size_t p = 0; p < all; ++p) {
    fx[p] = 0.0; fy[p] = 0.0; fz[p] = 0.0;
  }

#define FH_S(i, j, k) fh[idx_fh_stbd((i), (j), (k), ord, ex)]

  /* In every pass i0/j0/k0 are 0-based output indices and iF = i0 + 1 (etc.)
   * is the index form idx_fh_stbd is called with.  The lower z bound grows by
   * one per order because fh has no ghost layers in z. */

  /* ---- 2nd-order pass: [-1, 0, +1] / (2*dx), widest box ---------------- */
  {
    const double cx = 1.0 / 2.0 / dX;
    const double cy = 1.0 / 2.0 / dY;
    const double cz = 1.0 / 2.0 / dZ;
    const int i_lo = (iminF > 0) ? iminF : 0;
    const int j_lo = (jminF > 0) ? jminF : 0;
    for (int k0 = 1; k0 <= ex3 - 2; ++k0) {
      const int kF = k0 + 1;
      for (int j0 = j_lo; j0 <= ex2 - 2; ++j0) {
        const int jF = j0 + 1;
        for (int i0 = i_lo; i0 <= ex1 - 2; ++i0) {
          const int iF = i0 + 1;
          const size_t p = idx_ex(i0, j0, k0, ex);
          fx[p] = cx * (-FH_S(iF - 1, jF, kF) + FH_S(iF + 1, jF, kF));
          fy[p] = cy * (-FH_S(iF, jF - 1, kF) + FH_S(iF, jF + 1, kF));
          fz[p] = cz * (-FH_S(iF, jF, kF - 1) + FH_S(iF, jF, kF + 1));
        }
      }
    }
  }

#if (ghost_width >= 3)
  /* ---- 4th-order overwrite: [+1, -8, 0, +8, -1] / (12*dx) -------------- */
  {
    const double cx = 1.0 / 12.0 / dX;
    const double cy = 1.0 / 12.0 / dY;
    const double cz = 1.0 / 12.0 / dZ;
    const int i_lo = (iminF + 1 > 0) ? iminF + 1 : 0;
    const int j_lo = (jminF + 1 > 0) ? jminF + 1 : 0;
    for (int k0 = 2; k0 <= ex3 - 3; ++k0) {
      const int kF = k0 + 1;
      for (int j0 = j_lo; j0 <= ex2 - 3; ++j0) {
        const int jF = j0 + 1;
        for (int i0 = i_lo; i0 <= ex1 - 3; ++i0) {
          const int iF = i0 + 1;
          const size_t p = idx_ex(i0, j0, k0, ex);
          fx[p] = cx * (FH_S(iF - 2, jF, kF) - 8.0 * FH_S(iF - 1, jF, kF)
                        + 8.0 * FH_S(iF + 1, jF, kF) - FH_S(iF + 2, jF, kF));
          fy[p] = cy * (FH_S(iF, jF - 2, kF) - 8.0 * FH_S(iF, jF - 1, kF)
                        + 8.0 * FH_S(iF, jF + 1, kF) - FH_S(iF, jF + 2, kF));
          fz[p] = cz * (FH_S(iF, jF, kF - 2) - 8.0 * FH_S(iF, jF, kF - 1)
                        + 8.0 * FH_S(iF, jF, kF + 1) - FH_S(iF, jF, kF + 2));
        }
      }
    }
  }
#endif

#if (ghost_width >= 4)
  /* ---- 6th-order overwrite: [-1, +9, -45, 0, +45, -9, +1] / (60*dx) ---- */
  {
    const double cx = 1.0 / 60.0 / dX;
    const double cy = 1.0 / 60.0 / dY;
    const double cz = 1.0 / 60.0 / dZ;
    const int i_lo = (iminF + 2 > 0) ? iminF + 2 : 0;
    const int j_lo = (jminF + 2 > 0) ? jminF + 2 : 0;
    for (int k0 = 3; k0 <= ex3 - 4; ++k0) {
      const int kF = k0 + 1;
      for (int j0 = j_lo; j0 <= ex2 - 4; ++j0) {
        const int jF = j0 + 1;
        for (int i0 = i_lo; i0 <= ex1 - 4; ++i0) {
          const int iF = i0 + 1;
          const size_t p = idx_ex(i0, j0, k0, ex);
          fx[p] = cx * (-FH_S(iF - 3, jF, kF) + 9.0 * FH_S(iF - 2, jF, kF)
                        - 45.0 * FH_S(iF - 1, jF, kF) + 45.0 * FH_S(iF + 1, jF, kF)
                        - 9.0 * FH_S(iF + 2, jF, kF) + FH_S(iF + 3, jF, kF));
          fy[p] = cy * (-FH_S(iF, jF - 3, kF) + 9.0 * FH_S(iF, jF - 2, kF)
                        - 45.0 * FH_S(iF, jF - 1, kF) + 45.0 * FH_S(iF, jF + 1, kF)
                        - 9.0 * FH_S(iF, jF + 2, kF) + FH_S(iF, jF + 3, kF));
          fz[p] = cz * (-FH_S(iF, jF, kF - 3) + 9.0 * FH_S(iF, jF, kF - 2)
                        - 45.0 * FH_S(iF, jF, kF - 1) + 45.0 * FH_S(iF, jF, kF + 1)
                        - 9.0 * FH_S(iF, jF, kF + 2) + FH_S(iF, jF, kF + 3));
        }
      }
    }
  }
#endif

#if (ghost_width >= 5)
  /* ---- 8th-order overwrite:
   *      [+3, -32, +168, -672, 0, +672, -168, +32, -3] / (840*dx) -------- */
  {
    const double cx = 1.0 / 840.0 / dX;
    const double cy = 1.0 / 840.0 / dY;
    const double cz = 1.0 / 840.0 / dZ;
    const int i_lo = (iminF + 3 > 0) ? iminF + 3 : 0;
    const int j_lo = (jminF + 3 > 0) ? jminF + 3 : 0;
    for (int k0 = 4; k0 <= ex3 - 5; ++k0) {
      const int kF = k0 + 1;
      for (int j0 = j_lo; j0 <= ex2 - 5; ++j0) {
        const int jF = j0 + 1;
        for (int i0 = i_lo; i0 <= ex1 - 5; ++i0) {
          const int iF = i0 + 1;
          const size_t p = idx_ex(i0, j0, k0, ex);
          fx[p] = cx * (3.0 * FH_S(iF - 4, jF, kF) - 32.0 * FH_S(iF - 3, jF, kF)
                        + 168.0 * FH_S(iF - 2, jF, kF) - 672.0 * FH_S(iF - 1, jF, kF)
                        + 672.0 * FH_S(iF + 1, jF, kF) - 168.0 * FH_S(iF + 2, jF, kF)
                        + 32.0 * FH_S(iF + 3, jF, kF) - 3.0 * FH_S(iF + 4, jF, kF));
          fy[p] = cy * (3.0 * FH_S(iF, jF - 4, kF) - 32.0 * FH_S(iF, jF - 3, kF)
                        + 168.0 * FH_S(iF, jF - 2, kF) - 672.0 * FH_S(iF, jF - 1, kF)
                        + 672.0 * FH_S(iF, jF + 1, kF) - 168.0 * FH_S(iF, jF + 2, kF)
                        + 32.0 * FH_S(iF, jF + 3, kF) - 3.0 * FH_S(iF, jF + 4, kF));
          fz[p] = cz * (3.0 * FH_S(iF, jF, kF - 4) - 32.0 * FH_S(iF, jF, kF - 3)
                        + 168.0 * FH_S(iF, jF, kF - 2) - 672.0 * FH_S(iF, jF, kF - 1)
                        + 672.0 * FH_S(iF, jF, kF + 1) - 168.0 * FH_S(iF, jF, kF + 2)
                        + 32.0 * FH_S(iF, jF, kF + 3) - 3.0 * FH_S(iF, jF, kF + 4));
        }
      }
    }
  }
#endif

#undef FH_S
#else
#error "fderivs_sh_c.C: unsupported ghost_width"
#endif
}

View File

@@ -1,54 +0,0 @@
#include "macrodef.h"
#include "share_func.h"
#include <cstddef>
/*
 * fderivs_shc -- first derivatives on a shell patch, returned in Cartesian
 * components.
 *
 * The shell-coordinate derivatives (df/drho, df/dsigma, df/dR) are obtained
 * from fderivs_sh_ and then combined with the supplied Jacobian arrays:
 *   fx = drhodx * df/drho + dsigmadx * df/dsigma + dRdx * df/dR
 *   fy = drhody * df/drho + dsigmady * df/dsigma + dRdy * df/dR
 *   fz = drhodz * df/drho + dsigmadz * df/dsigma + dRdz * df/dR
 *
 * Lev is handed to fderivs_sh_ in its onoff slot.  If the temporary arrays
 * cannot be allocated the function returns without writing fx, fy, fz.
 */
extern "C" {

// Defined in fderivs_sh_c.C.
void fderivs_sh_(const int ex[3], const double *f,
                 double *fx, double *fy, double *fz,
                 const double *X, const double *Y, const double *Z,
                 double SYM1, double SYM2, double SYM3,
                 int Symmetry, int onoff, int sst);

void fderivs_shc_(int *ex,
                  double *f,
                  double *fx, double *fy, double *fz,
                  double *crho, double *sigma, double *R,
                  double &SYM1, double &SYM2, double &SYM3,
                  int &Symmetry, int &Lev, int &sst,
                  double *drhodx, double *drhody, double *drhodz,
                  double *dsigmadx, double *dsigmady, double *dsigmadz,
                  double *dRdx, double *dRdy, double *dRdz)
{
  const int dims[3] = { ex[0], ex[1], ex[2] };
  const size_t npts = (size_t)ex[0] * (size_t)ex[1] * (size_t)ex[2];
  const size_t nbytes = npts * sizeof(double);

  // Scratch storage for df/drho, df/dsigma, df/dR.
  double *d_rho   = (double*)malloc(nbytes);
  double *d_sigma = (double*)malloc(nbytes);
  double *d_R     = (double*)malloc(nbytes);

  if (d_rho && d_sigma && d_R) {
    fderivs_sh_(dims, f, d_rho, d_sigma, d_R, crho, sigma, R,
                SYM1, SYM2, SYM3, Symmetry, Lev, sst);

    // Chain rule: contract the shell gradient with the Jacobian rows.
    for (size_t p = 0; p < npts; ++p) {
      fx[p] = drhodx[p] * d_rho[p] + dsigmadx[p] * d_sigma[p] + dRdx[p] * d_R[p];
      fy[p] = drhody[p] * d_rho[p] + dsigmady[p] * d_sigma[p] + dRdy[p] * d_R[p];
      fz[p] = drhodz[p] * d_rho[p] + dsigmadz[p] * d_sigma[p] + dRdz[p] * d_R[p];
    }
  }

  // free(NULL) is a no-op, so this also covers the partial-allocation case.
  free(d_rho);
  free(d_sigma);
  free(d_R);
}

} // extern "C"

View File

@@ -324,9 +324,6 @@ subroutine symmetry_bd(ord,extc,func,funcc,SoA)
integer::i integer::i
#if USE_FMISC_SAFE_MODE
funcc = 0.d0
#endif
funcc(1:extc(1),1:extc(2),1:extc(3)) = func funcc(1:extc(1),1:extc(2),1:extc(3)) = func
do i=0,ord-1 do i=0,ord-1
funcc(-i,1:extc(2),1:extc(3)) = funcc(i+2,1:extc(2),1:extc(3))*SoA(1) funcc(-i,1:extc(2),1:extc(3)) = funcc(i+2,1:extc(2),1:extc(3))*SoA(1)
@@ -352,9 +349,6 @@ subroutine symmetry_tbd(ord,extc,func,funcc,SoA)
integer::i integer::i
#if USE_FMISC_SAFE_MODE
funcc = 0.d0
#endif
funcc(1:extc(1),1:extc(2),1:extc(3)) = func funcc(1:extc(1),1:extc(2),1:extc(3)) = func
do i=0,ord-1 do i=0,ord-1
funcc(-i,1:extc(2),1:extc(3)) = funcc(i+2,1:extc(2),1:extc(3))*SoA(1) funcc(-i,1:extc(2),1:extc(3)) = funcc(i+2,1:extc(2),1:extc(3))*SoA(1)
@@ -383,9 +377,6 @@ subroutine symmetry_stbd(ord,extc,func,funcc,SoA)
integer::i integer::i
#if USE_FMISC_SAFE_MODE
funcc = 0.d0
#endif
funcc(1:extc(1),1:extc(2),1:extc(3)) = func funcc(1:extc(1),1:extc(2),1:extc(3)) = func
do i=0,ord-1 do i=0,ord-1
funcc(-i,1:extc(2),1:extc(3)) = funcc(i+2,1:extc(2),1:extc(3))*SoA(1) funcc(-i,1:extc(2),1:extc(3)) = funcc(i+2,1:extc(2),1:extc(3))*SoA(1)
@@ -1124,151 +1115,6 @@ end subroutine d2dump
!------------------------------------------------------------------------------ !------------------------------------------------------------------------------
! Lagrangian polynomial interpolation ! Lagrangian polynomial interpolation
!------------------------------------------------------------------------------ !------------------------------------------------------------------------------
! Compile-time defaults.  Both switches are wrapped in #ifndef, so a value
! supplied on the compiler command line takes precedence.
!
! POLINT6_USE_BARYCENTRIC (default 1): presumably chooses polint6_barycentric
! over polint6_neville for 6-point interpolation; the selection site is not
! shown next to these definitions -- confirm where it is tested.
#ifndef POLINT6_USE_BARYCENTRIC
#define POLINT6_USE_BARYCENTRIC 1
#endif
! USE_FMISC_SAFE_MODE (default 0): when nonzero, the symmetry_bd,
! symmetry_tbd and symmetry_stbd routines clear funcc to zero before
! filling it.
#ifndef USE_FMISC_SAFE_MODE
#define USE_FMISC_SAFE_MODE 0
#endif
!DIR$ ATTRIBUTES FORCEINLINE :: polint6_neville
! Six-point polynomial interpolation using a c/d difference tableau
! (the routine name indicates the Neville scheme).
!   xa, ya : the 6 abscissas and the 6 function values at those abscissas
!   x      : target abscissa
!   y      : interpolated value at x
!   dy     : last correction added to y (usable as an error indicator)
! Stops the program if two abscissas entering one tableau step coincide.
subroutine polint6_neville(xa, ya, x, y, dy)
implicit none
real*8, dimension(6), intent(in) :: xa, ya
real*8, intent(in) :: x
real*8, intent(out) :: y, dy
integer :: i, m, ns, n_m
real*8, dimension(6) :: c, d, ho
real*8 :: dif, dift, hp, h, den_val
! Both tableau columns start from the raw samples; ho holds xa - x.
c = ya
d = ya
ho = xa - x
! Find the index ns of the abscissa closest to x.
ns = 1
dif = abs(x - xa(1))
do i = 2, 6
dift = abs(x - xa(i))
if (dift < dif) then
ns = i
dif = dift
end if
end do
! Start from the nearest sample, then add one correction per tableau level.
y = ya(ns)
ns = ns - 1
do m = 1, 5
n_m = 6 - m
do i = 1, n_m
hp = ho(i)
h = ho(i+m)
den_val = hp - h
! hp - h equals xa(i) - xa(i+m); zero means duplicated abscissas.
if (den_val == 0.0d0) then
write(*,*) 'failure in polint for point',x
write(*,*) 'with input points: ',xa
stop
end if
den_val = (c(i+1) - d(i)) / den_val
d(i) = h * den_val
c(i) = hp * den_val
end do
! Choose the c or d correction depending on where ns sits in the
! remaining tableau; taking the d branch also moves ns down by one.
if (2 * ns < n_m) then
dy = c(ns + 1)
else
dy = d(ns)
ns = ns - 1
end if
y = y + dy
end do
return
end subroutine polint6_neville
!DIR$ ATTRIBUTES FORCEINLINE :: polint6_barycentric
! Six-point polynomial interpolation evaluated in barycentric form,
!   y = sum_i( w_i * ya(i) / (x - xa(i)) ) / sum_i( w_i / (x - xa(i)) ).
!   xa, ya : the 6 abscissas and the 6 function values at those abscissas
!   x      : target abscissa
!   y      : interpolated value at x
!   dy     : always set to 0.d0; this routine produces no error estimate
! Stops the program if two abscissas coincide (general, non-uniform path).
subroutine polint6_barycentric(xa, ya, x, y, dy)
implicit none
real*8, dimension(6), intent(in) :: xa, ya
real*8, intent(in) :: x
real*8, intent(out) :: y, dy
integer :: i, j
logical :: is_uniform
real*8, dimension(6) :: lambda
real*8 :: dx, den_i, term, num, den, step, tol
! Fixed weights used when the abscissas are equally spaced
! (alternating-sign binomial coefficients of order 5).
real*8, parameter :: c_uniform(6) = (/ -1.d0, 5.d0, -10.d0, 10.d0, -5.d0, 1.d0 /)
! An exact hit on a node is returned directly; this also avoids the
! division by (x - xa(i)) = 0 in the sums below.
do i = 1, 6
if (x == xa(i)) then
y = ya(i)
dy = 0.d0
return
end if
end do
! Uniformity test: every spacing must match the first one within tol.
! NOTE(review): tol has an absolute floor of 64*epsilon, so a grid whose
! spacing is itself of that size would pass as uniform - confirm such
! grids cannot occur.
step = xa(2) - xa(1)
is_uniform = (step /= 0.d0)
if (is_uniform) then
tol = 64.d0 * epsilon(1.d0) * max(1.d0, abs(step))
do i = 3, 6
if (abs((xa(i) - xa(i-1)) - step) > tol) then
is_uniform = .false.
exit
end if
end do
end if
! Fast path: equally spaced nodes, precomputed weights.
if (is_uniform) then
num = 0.d0
den = 0.d0
do i = 1, 6
term = c_uniform(i) / (x - xa(i))
num = num + term * ya(i)
den = den + term
end do
y = num / den
dy = 0.d0
return
end if
! General path: lambda(i) = 1 / prod_{j /= i} (xa(i) - xa(j)).
do i = 1, 6
den_i = 1.d0
do j = 1, 6
if (j /= i) then
dx = xa(i) - xa(j)
if (dx == 0.0d0) then
write(*,*) 'failure in polint for point',x
write(*,*) 'with input points: ',xa
stop
end if
den_i = den_i * dx
end if
end do
lambda(i) = 1.d0 / den_i
end do
num = 0.d0
den = 0.d0
do i = 1, 6
term = lambda(i) / (x - xa(i))
num = num + term * ya(i)
den = den + term
end do
y = num / den
dy = 0.d0
return
end subroutine polint6_barycentric
!DIR$ ATTRIBUTES FORCEINLINE :: polint !DIR$ ATTRIBUTES FORCEINLINE :: polint
subroutine polint(xa, ya, x, y, dy, ordn) subroutine polint(xa, ya, x, y, dy, ordn)
@@ -1283,17 +1129,6 @@ end subroutine d2dump
real*8, dimension(ordn) :: c, d, ho real*8, dimension(ordn) :: c, d, ho
real*8 :: dif, dift, hp, h, den_val real*8 :: dif, dift, hp, h, den_val
if (ordn == 6) then
#if USE_FMISC_SAFE_MODE
call polint6_neville(xa, ya, x, y, dy)
#elif POLINT6_USE_BARYCENTRIC
call polint6_barycentric(xa, ya, x, y, dy)
#else
call polint6_neville(xa, ya, x, y, dy)
#endif
return
end if
c = ya c = ya
d = ya d = ya
ho = xa - x ho = xa - x
@@ -1343,41 +1178,6 @@ end subroutine d2dump
return return
end subroutine polint end subroutine polint
!------------------------------------------------------------------------------ !------------------------------------------------------------------------------
! Compute Lagrange interpolation basis weights for one target point.
!------------------------------------------------------------------------------
!DIR$ ATTRIBUTES FORCEINLINE :: polint_lagrange_weights
subroutine polint_lagrange_weights(xa, x, w, ordn)
  ! Evaluate the ordn Lagrange basis polynomials built on the nodes xa at
  ! the single point x:
  !   w(i) = prod_{j /= i} (x - xa(j)) / prod_{j /= i} (xa(i) - xa(j))
  ! The program is stopped if two nodes coincide.
  implicit none
  integer, intent(in) :: ordn
  real*8, dimension(1:ordn), intent(in) :: xa
  real*8, intent(in) :: x
  real*8, dimension(1:ordn), intent(out) :: w

  integer :: node, other
  real*8 :: numer, denom, gap

  nodes: do node = 1, ordn
    numer = 1.d0
    denom = 1.d0
    others: do other = 1, ordn
      ! A node contributes no factor to its own basis polynomial.
      if (other == node) cycle others
      gap = xa(node) - xa(other)
      if (gap == 0.0d0) then
        write(*,*) 'failure in polint for point',x
        write(*,*) 'with input points: ',xa
        stop
      end if
      numer = numer * (x - xa(other))
      denom = denom * gap
    end do others
    w(node) = numer / denom
  end do nodes
end subroutine polint_lagrange_weights
!------------------------------------------------------------------------------
! !
! interpolation in 2 dimensions, follow yx order ! interpolation in 2 dimensions, follow yx order
! !
@@ -1391,7 +1191,7 @@ end subroutine d2dump
real*8, intent(in) :: x1,x2 real*8, intent(in) :: x1,x2
real*8, intent(out) :: y,dy real*8, intent(out) :: y,dy
#if USE_FMISC_SAFE_MODE || defined(POLINT_LEGACY_ORDER) #ifdef POLINT_LEGACY_ORDER
integer :: i,m integer :: i,m
real*8, dimension(ordn) :: ymtmp real*8, dimension(ordn) :: ymtmp
real*8, dimension(ordn) :: yntmp real*8, dimension(ordn) :: yntmp
@@ -1429,7 +1229,7 @@ end subroutine d2dump
real*8, intent(in) :: x1,x2,x3 real*8, intent(in) :: x1,x2,x3
real*8, intent(out) :: y,dy real*8, intent(out) :: y,dy
#if USE_FMISC_SAFE_MODE || defined(POLINT_LEGACY_ORDER) #ifdef POLINT_LEGACY_ORDER
integer :: i,j,m,n integer :: i,j,m,n
real*8, dimension(ordn,ordn) :: yatmp real*8, dimension(ordn,ordn) :: yatmp
real*8, dimension(ordn) :: ymtmp real*8, dimension(ordn) :: ymtmp
@@ -1448,26 +1248,19 @@ end subroutine d2dump
end do end do
call polint(x1a,ymtmp,x1,y,dy,ordn) call polint(x1a,ymtmp,x1,y,dy,ordn)
#else #else
integer :: i, j, k integer :: j, k
real*8, dimension(ordn) :: w1, w2 real*8, dimension(ordn,ordn) :: yatmp
real*8, dimension(ordn) :: ymtmp real*8, dimension(ordn) :: ymtmp
real*8 :: yx_sum, x_sum real*8 :: dy_temp
call polint_lagrange_weights(x1a, x1, w1, ordn) do k=1,ordn
call polint_lagrange_weights(x2a, x2, w2, ordn) do j=1,ordn
call polint(x1a, ya(:,j,k), x1, yatmp(j,k), dy_temp, ordn)
do k = 1, ordn
yx_sum = 0.d0
do j = 1, ordn
x_sum = 0.d0
do i = 1, ordn
x_sum = x_sum + w1(i) * ya(i,j,k)
end do
yx_sum = yx_sum + w2(j) * x_sum
end do end do
ymtmp(k) = yx_sum
end do end do
do k=1,ordn
call polint(x2a, yatmp(:,k), x2, ymtmp(k), dy_temp, ordn)
end do
call polint(x3a, ymtmp, x3, y, dy, ordn) call polint(x3a, ymtmp, x3, y, dy, ordn)
#endif #endif
@@ -1517,106 +1310,18 @@ if(dabs(X(1)-xmin) < dX) imin = 1
if(dabs(Y(1)-ymin) < dY) jmin = 1 if(dabs(Y(1)-ymin) < dY) jmin = 1
if(dabs(Z(1)-zmin) < dZ) kmin = 1 if(dabs(Z(1)-zmin) < dZ) kmin = 1
#if USE_FMISC_SAFE_MODE
f_out = 0.d0
do k = kmin, kmax
do j = jmin, jmax
do i = imin, imax
f_out = f_out + f(i,j,k)*f(i,j,k)
end do
end do
end do
#else
! Optimized with oneMKL BLAS DDOT for dot product ! Optimized with oneMKL BLAS DDOT for dot product
n_elements = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1) n_elements = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1)
allocate(f_flat(n_elements)) allocate(f_flat(n_elements))
f_flat = reshape(f(imin:imax,jmin:jmax,kmin:kmax), [n_elements]) f_flat = reshape(f(imin:imax,jmin:jmax,kmin:kmax), [n_elements])
f_out = DDOT(n_elements, f_flat, 1, f_flat, 1) f_out = DDOT(n_elements, f_flat, 1, f_flat, 1)
deallocate(f_flat) deallocate(f_flat)
#endif
f_out = f_out*dX*dY*dZ f_out = f_out*dX*dY*dZ
return return
end subroutine l2normhelper end subroutine l2normhelper
!--------------------------------------------------------------------------------------
! Sum of squares of seven grid functions over one block in a single sweep,
! each scaled by the cell volume dX*dY*dZ.
!   ex            : number of grid points of the block in x, y, z
!   X, Y, Z       : coordinate arrays; spacing is taken from the first two
!                   entries of each
!   xmin ... zmax : bounds compared against the block edges to decide
!                   whether an edge is a patch boundary
!   f1 ... f7     : the seven input fields
!   f_out(1:7)    : sum(f_n**2)*dX*dY*dZ for n = 1..7; no square root is
!                   taken here
!   gw            : ghost-zone width skipped on edges that are not patch
!                   boundaries
subroutine l2normhelper7(ex, X, Y, Z,xmin,ymin,zmin,xmax,ymax,zmax,&
f1,f2,f3,f4,f5,f6,f7,f_out,gw)
implicit none
!~~~~~~> Input parameters:
integer,intent(in ):: ex(1:3)
real*8, intent(in ):: X(1:ex(1)),Y(1:ex(2)),Z(1:ex(3)),xmin,ymin,zmin,xmax,ymax,zmax
integer,intent(in)::gw
real*8, dimension(ex(1),ex(2),ex(3)),intent(in) :: f1,f2,f3,f4,f5,f6,f7
real*8, intent(out) :: f_out(7)
!~~~~~~> Other variables:
real*8 :: dX, dY, dZ
integer::imin,jmin,kmin
integer::imax,jmax,kmax
integer::i,j,k
real*8 :: s1,s2,s3,s4,s5,s6,s7
dX = X(2) - X(1)
dY = Y(2) - Y(1)
dZ = Z(2) - Z(1)
! for ghost zone
! Default index range: drop gw points on every side.
imin = gw+1
jmin = gw+1
kmin = gw+1
imax = ex(1) - gw
jmax = ex(2) - gw
kmax = ex(3) - gw
!for patch boundary (i.e., not ghost boundary)
! An edge lying within one grid spacing of the given bound keeps all of
! its points.
if(dabs(X(ex(1))-xmax) < dX) imax = ex(1)
if(dabs(Y(ex(2))-ymax) < dY) jmax = ex(2)
if(dabs(Z(ex(3))-zmax) < dZ) kmax = ex(3)
if(dabs(X(1)-xmin) < dX) imin = 1
if(dabs(Y(1)-ymin) < dY) jmin = 1
if(dabs(Z(1)-zmin) < dZ) kmin = 1
s1 = 0.d0
s2 = 0.d0
s3 = 0.d0
s4 = 0.d0
s5 = 0.d0
s6 = 0.d0
s7 = 0.d0
do k=kmin,kmax
do j=jmin,jmax
! Outside safe mode the innermost loop carries a SIMD reduction
! directive for the seven accumulators.
#if !USE_FMISC_SAFE_MODE
!DIR$ SIMD REDUCTION(+:s1,s2,s3,s4,s5,s6,s7)
#endif
do i=imin,imax
s1 = s1 + f1(i,j,k)*f1(i,j,k)
s2 = s2 + f2(i,j,k)*f2(i,j,k)
s3 = s3 + f3(i,j,k)*f3(i,j,k)
s4 = s4 + f4(i,j,k)*f4(i,j,k)
s5 = s5 + f5(i,j,k)*f5(i,j,k)
s6 = s6 + f6(i,j,k)*f6(i,j,k)
s7 = s7 + f7(i,j,k)*f7(i,j,k)
enddo
enddo
enddo
! Scale every accumulated sum by the cell volume.
f_out(1) = s1*dX*dY*dZ
f_out(2) = s2*dX*dY*dZ
f_out(3) = s3*dX*dY*dZ
f_out(4) = s4*dX*dY*dZ
f_out(5) = s5*dX*dY*dZ
f_out(6) = s6*dX*dY*dZ
f_out(7) = s7*dX*dY*dZ
return
end subroutine l2normhelper7
!-------------------------------------------------------------------------------------- !--------------------------------------------------------------------------------------
! calculate L2norm especially for shell Blocks ! calculate L2norm especially for shell Blocks
subroutine l2normhelper_sh(ex, X, Y, Z,xmin,ymin,zmin,xmax,ymax,zmax,& subroutine l2normhelper_sh(ex, X, Y, Z,xmin,ymin,zmin,xmax,ymax,zmax,&
@@ -1700,23 +1405,12 @@ if(Symmetry==2)then
if(dabs(ymin+gw*dY)<dY.and.Y(1)<0.d0) jmin = gw+1 if(dabs(ymin+gw*dY)<dY.and.Y(1)<0.d0) jmin = gw+1
endif endif
#if USE_FMISC_SAFE_MODE
f_out = 0.d0
do k = kmin, kmax
do j = jmin, jmax
do i = imin, imax
f_out = f_out + f(i,j,k)*f(i,j,k)
end do
end do
end do
#else
! Optimized with oneMKL BLAS DDOT for dot product ! Optimized with oneMKL BLAS DDOT for dot product
n_elements = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1) n_elements = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1)
allocate(f_flat(n_elements)) allocate(f_flat(n_elements))
f_flat = reshape(f(imin:imax,jmin:jmax,kmin:kmax), [n_elements]) f_flat = reshape(f(imin:imax,jmin:jmax,kmin:kmax), [n_elements])
f_out = DDOT(n_elements, f_flat, 1, f_flat, 1) f_out = DDOT(n_elements, f_flat, 1, f_flat, 1)
deallocate(f_flat) deallocate(f_flat)
#endif
f_out = f_out*dX*dY*dZ f_out = f_out*dX*dY*dZ
@@ -1808,23 +1502,12 @@ if(Symmetry==2)then
if(dabs(ymin+gw*dY)<dY.and.Y(1)<0.d0) jmin = gw+1 if(dabs(ymin+gw*dY)<dY.and.Y(1)<0.d0) jmin = gw+1
endif endif
#if USE_FMISC_SAFE_MODE
f_out = 0.d0
do k = kmin, kmax
do j = jmin, jmax
do i = imin, imax
f_out = f_out + f(i,j,k)*f(i,j,k)
end do
end do
end do
#else
! Optimized with oneMKL BLAS DDOT for dot product ! Optimized with oneMKL BLAS DDOT for dot product
Nout = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1) Nout = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1)
allocate(f_flat(Nout)) allocate(f_flat(Nout))
f_flat = reshape(f(imin:imax,jmin:jmax,kmin:kmax), [Nout]) f_flat = reshape(f(imin:imax,jmin:jmax,kmin:kmax), [Nout])
f_out = DDOT(Nout, f_flat, 1, f_flat, 1) f_out = DDOT(Nout, f_flat, 1, f_flat, 1)
deallocate(f_flat) deallocate(f_flat)
#endif
return return
@@ -1926,21 +1609,8 @@ endif
! f=3/8*f_1 + 3/4*f_2 - 1/8*f_3 ! f=3/8*f_1 + 3/4*f_2 - 1/8*f_3
real*8,parameter::C1=3.d0/8.d0,C2=3.d0/4.d0,C3=-1.d0/8.d0 real*8,parameter::C1=3.d0/8.d0,C2=3.d0/4.d0,C3=-1.d0/8.d0
integer :: i,j,k
#if USE_FMISC_SAFE_MODE fout = C1*f1+C2*f2+C3*f3
do k=1,ext(3)
do j=1,ext(2)
do i=1,ext(1)
fout(i,j,k) = C1*f1(i,j,k)+C2*f2(i,j,k)+C3*f3(i,j,k)
enddo
enddo
enddo
#else
do concurrent (k=1:ext(3), j=1:ext(2), i=1:ext(1))
fout(i,j,k) = C1*f1(i,j,k)+C2*f2(i,j,k)+C3*f3(i,j,k)
end do
#endif
return return
@@ -2084,15 +1754,8 @@ endif
tmp1 = tmp1 + coef(ORDN+m)*tmp2(:,m) tmp1 = tmp1 + coef(ORDN+m)*tmp2(:,m)
enddo enddo
#if USE_FMISC_SAFE_MODE
f_int = 0.d0
do m = 1, ORDN
f_int = f_int + coef(m) * tmp1(m)
end do
#else
! Third dimension: x-direction weighted sum using BLAS DDOT ! Third dimension: x-direction weighted sum using BLAS DDOT
f_int = DDOT(ORDN, coef(1:ORDN), 1, tmp1, 1) f_int = DDOT(ORDN, coef(1:ORDN), 1, tmp1, 1)
#endif
return return
@@ -2158,15 +1821,8 @@ endif
tmp1 = tmp1 + coef(ORDN+m)*ya(:,m) tmp1 = tmp1 + coef(ORDN+m)*ya(:,m)
enddo enddo
#if USE_FMISC_SAFE_MODE
f_int = 0.d0
do m = 1, ORDN
f_int = f_int + coef(m) * tmp1(m)
end do
#else
! Use BLAS DDOT for final weighted sum ! Use BLAS DDOT for final weighted sum
f_int = DDOT(ORDN, coef(1:ORDN), 1, tmp1, 1) f_int = DDOT(ORDN, coef(1:ORDN), 1, tmp1, 1)
#endif
return return
@@ -2258,15 +1914,8 @@ endif
write(*,*)"error in global_interpind1d, not recognized dumyd = ",dumyd write(*,*)"error in global_interpind1d, not recognized dumyd = ",dumyd
endif endif
#if USE_FMISC_SAFE_MODE
f_int = 0.d0
do m = 1, ORDN
f_int = f_int + coef(m) * ya(m)
end do
#else
! Optimized with BLAS DDOT for weighted sum ! Optimized with BLAS DDOT for weighted sum
f_int = DDOT(ORDN, coef, 1, ya, 1) f_int = DDOT(ORDN, coef, 1, ya, 1)
#endif
return return

View File

@@ -13,7 +13,6 @@
#define f_global_interpind2d global_interpind2d #define f_global_interpind2d global_interpind2d
#define f_global_interpind1d global_interpind1d #define f_global_interpind1d global_interpind1d
#define f_l2normhelper l2normhelper #define f_l2normhelper l2normhelper
#define f_l2normhelper7 l2normhelper7
#define f_l2normhelper_sh l2normhelper_sh #define f_l2normhelper_sh l2normhelper_sh
#define f_l2normhelper_sh_rms l2normhelper_sh_rms #define f_l2normhelper_sh_rms l2normhelper_sh_rms
#define f_average average #define f_average average
@@ -43,7 +42,6 @@
#define f_global_interpind2d GLOBAL_INTERPIND2D #define f_global_interpind2d GLOBAL_INTERPIND2D
#define f_global_interpind1d GLOBAL_INTERPIND1D #define f_global_interpind1d GLOBAL_INTERPIND1D
#define f_l2normhelper L2NORMHELPER #define f_l2normhelper L2NORMHELPER
#define f_l2normhelper7 L2NORMHELPER7
#define f_l2normhelper_sh L2NORMHELPER_SH #define f_l2normhelper_sh L2NORMHELPER_SH
#define f_l2normhelper_sh_rms L2NORMHELPER_SH_RMS #define f_l2normhelper_sh_rms L2NORMHELPER_SH_RMS
#define f_average AVERAGE #define f_average AVERAGE
@@ -73,7 +71,6 @@
#define f_global_interpind2d global_interpind2d_ #define f_global_interpind2d global_interpind2d_
#define f_global_interpind1d global_interpind1d_ #define f_global_interpind1d global_interpind1d_
#define f_l2normhelper l2normhelper_ #define f_l2normhelper l2normhelper_
#define f_l2normhelper7 l2normhelper7_
#define f_l2normhelper_sh l2normhelper_sh_ #define f_l2normhelper_sh l2normhelper_sh_
#define f_l2normhelper_sh_rms l2normhelper_sh_rms_ #define f_l2normhelper_sh_rms l2normhelper_sh_rms_
#define f_average average_ #define f_average average_
@@ -167,15 +164,6 @@ extern "C"
double *, double &, int &); double *, double &, int &);
} }
extern "C"
{
void f_l2normhelper7(int *, double *, double *, double *,
double &, double &, double &,
double &, double &, double &,
double *, double *, double *, double *,
double *, double *, double *, double *, int &);
}
extern "C" extern "C"
{ {
void f_l2normhelper_sh(int *, double *, double *, double *, void f_l2normhelper_sh(int *, double *, double *, double *,

View File

@@ -1,107 +0,0 @@
#include "interp_lb_profile.h"
#include <cstdio>
#include <cstring>
#include <algorithm>
namespace InterpLBProfile {
// Write a load-balance profile to `filepath` as raw binary:
//   ProfileHeader, then nprocs doubles (rank_times), then num_heavy ints
//   (heavy_ranks).
// Returns true only if the file was opened, every record was written in
// full and the file was closed without error. Returns false on invalid
// arguments (null path, negative counts, null array with a non-zero count)
// or on any I/O failure.
bool write_profile(const char *filepath, int nprocs,
                   const double *rank_times,
                   const int *heavy_ranks, int num_heavy,
                   double threshold_ratio)
{
    // Negative counts would be converted to huge size_t values by fwrite.
    if (!filepath || nprocs < 0 || num_heavy < 0) return false;
    if ((nprocs > 0 && !rank_times) || (num_heavy > 0 && !heavy_ranks))
        return false;

    FILE *fp = fopen(filepath, "wb");
    if (!fp) return false;

    ProfileHeader hdr;
    // Zero the struct first so padding bytes written to disk are defined.
    memset(&hdr, 0, sizeof(hdr));
    hdr.magic = MAGIC;
    hdr.version = VERSION;
    hdr.nprocs = nprocs;
    hdr.num_heavy = num_heavy;
    hdr.threshold_ratio = threshold_ratio;

    const size_t n_times = (size_t)nprocs;
    const size_t n_heavy = (size_t)num_heavy;

    bool ok = fwrite(&hdr, sizeof(hdr), 1, fp) == 1;
    if (ok && n_times > 0)
        ok = fwrite(rank_times, sizeof(double), n_times, fp) == n_times;
    if (ok && n_heavy > 0)
        ok = fwrite(heavy_ranks, sizeof(int), n_heavy, fp) == n_heavy;

    // fclose flushes buffered data; a failure here means the file is short.
    if (fclose(fp) != 0) ok = false;
    return ok;
}
// Read a profile written by write_profile and distribute it to all ranks.
//   filepath       : profile file, opened on rank 0 of `comm` only
//   current_nprocs : the file is accepted only if its header stores this
//                    process count
//   heavy_ranks    : receives hdr.num_heavy rank ids (<= current_nprocs)
//                    NOTE(review): the buffer capacity is not passed in;
//                    callers presumably size it to at least current_nprocs
//                    - confirm.
//   num_heavy      : receives the number of entries stored in heavy_ranks
//   rank_times     : receives current_nprocs timings
// Collective over `comm`: every rank must call it. Returns the same value
// on all ranks: true if the file was found, passed validation and was read
// in full. On failure rank 0's rank_times may have been partly overwritten.
bool read_profile(const char *filepath, int current_nprocs,
                  int *heavy_ranks, int &num_heavy,
                  double *rank_times, MPI_Comm comm)
{
    int myrank;
    MPI_Comm_rank(comm, &myrank);

    int valid = 0;

    if (myrank == 0) {
        FILE *fp = fopen(filepath, "rb");
        if (fp) {
            ProfileHeader hdr;
            memset(&hdr, 0, sizeof(hdr));

            const bool header_ok =
                fread(&hdr, sizeof(hdr), 1, fp) == 1 &&
                hdr.magic == MAGIC && hdr.version == VERSION &&
                hdr.nprocs == current_nprocs;

            if (!header_ok) {
                printf("[InterpLB] Profile rejected: magic=0x%X version=%u "
                       "nprocs=%d (current=%d)\n",
                       hdr.magic, hdr.version, hdr.nprocs, current_nprocs);
            } else if (hdr.num_heavy < 0 || hdr.num_heavy > current_nprocs) {
                // The count comes from the file: never use it as a read
                // length before range-checking it, or a corrupt file can
                // overflow heavy_ranks.
                printf("[InterpLB] Profile rejected: num_heavy=%d out of "
                       "range [0, %d]\n",
                       hdr.num_heavy, current_nprocs);
            } else if (fread(rank_times, sizeof(double), current_nprocs, fp)
                           == (size_t)current_nprocs &&
                       fread(heavy_ranks, sizeof(int), hdr.num_heavy, fp)
                           == (size_t)hdr.num_heavy) {
                num_heavy = hdr.num_heavy;
                valid = 1;
            } else {
                printf("[InterpLB] Profile rejected: file is truncated\n");
            }
            fclose(fp);
        }
    }

    // Every rank learns the outcome first so that all of them either take
    // part in the remaining broadcasts or return together.
    MPI_Bcast(&valid, 1, MPI_INT, 0, comm);
    if (!valid) return false;

    MPI_Bcast(&num_heavy, 1, MPI_INT, 0, comm);
    MPI_Bcast(heavy_ranks, num_heavy, MPI_INT, 0, comm);
    MPI_Bcast(rank_times, current_nprocs, MPI_DOUBLE, 0, comm);
    return true;
}
// Select the ranks whose time exceeds threshold_ratio * mean(rank_times).
//   rank_times      : nprocs timings, indexed by rank
//   threshold_ratio : multiple of the mean above which a rank is "heavy"
//   heavy_ranks     : output, filled with at most max_heavy rank ids ordered
//                     by decreasing time (ties broken by increasing rank so
//                     the result is deterministic)
// Returns the number of ids written; 0 when the inputs are empty or invalid.
int identify_heavy_ranks(const double *rank_times, int nprocs,
                         double threshold_ratio,
                         int *heavy_ranks, int max_heavy)
{
    // Guard against a division by zero in the mean, a negative array
    // length, and a negative return count.
    if (!rank_times || !heavy_ranks || nprocs <= 0 || max_heavy <= 0)
        return 0;

    double sum = 0;
    for (int i = 0; i < nprocs; i++) sum += rank_times[i];
    const double mean = sum / nprocs;
    const double threshold = threshold_ratio * mean;

    // Collect candidates
    struct RankTime { int rank; double time; };
    RankTime *candidates = new RankTime[nprocs];
    int ncand = 0;
    for (int i = 0; i < nprocs; i++) {
        if (rank_times[i] > threshold)
            candidates[ncand++] = {i, rank_times[i]};
    }

    // Sort descending by time; equal times keep ascending rank order.
    std::sort(candidates, candidates + ncand,
              [](const RankTime &a, const RankTime &b) {
                  if (a.time != b.time) return a.time > b.time;
                  return a.rank < b.rank;
              });

    const int count = (ncand < max_heavy) ? ncand : max_heavy;
    for (int i = 0; i < count; i++)
        heavy_ranks[i] = candidates[i].rank;

    delete[] candidates;
    return count;
}
} // namespace InterpLBProfile

View File

@@ -1,38 +0,0 @@
#ifndef INTERP_LB_PROFILE_H
#define INTERP_LB_PROFILE_H
#include <mpi.h>
// Declarations for storing and loading a per-rank timing profile.
namespace InterpLBProfile {
// File signature: the bytes 0x49 0x4C 0x42 0x50 spell "ILBP".
static const unsigned int MAGIC = 0x494C4250; // "ILBP"
// Format version stored in every profile header.
static const unsigned int VERSION = 1;
// Fixed-size record describing a profile file.
struct ProfileHeader {
unsigned int magic; // expected to equal MAGIC
unsigned int version; // expected to equal VERSION
int nprocs; // number of entries in the per-rank timing array
int num_heavy; // number of entries in the heavy-rank list
double threshold_ratio; // ratio passed by the writer of the profile
};
// Write profile file (rank 0 only)
// NOTE(review): "rank 0 only" is a contract on the caller; the signature
// takes no communicator, so the function cannot check it.
bool write_profile(const char *filepath, int nprocs,
const double *rank_times,
const int *heavy_ranks, int num_heavy,
double threshold_ratio);
// Read profile file (rank 0 reads, then broadcasts to all)
// Returns true if file found and valid for current nprocs
// heavy_ranks, num_heavy and rank_times are outputs.
bool read_profile(const char *filepath, int current_nprocs,
int *heavy_ranks, int &num_heavy,
double *rank_times, MPI_Comm comm);
// Identify heavy ranks: those with time > threshold_ratio * mean
// Writes at most max_heavy rank ids into heavy_ranks and returns how many.
int identify_heavy_ranks(const double *rank_times, int nprocs,
double threshold_ratio,
int *heavy_ranks, int max_heavy);
} // namespace InterpLBProfile
#endif /* INTERP_LB_PROFILE_H */

View File

@@ -1,29 +0,0 @@
/* This header is generated automatically by the project's own profiling
 * framework; the values are not hand-written, case-specific tuning. */
/* Update: the load-balancing problem has since been solved by optimizing
 * the interpolation routine, so this profile-based static balancing scheme
 * is deprecated and this header is no longer part of the build. */
/* Auto-generated from interp_lb_profile.bin — do not edit */
#ifndef INTERP_LB_PROFILE_DATA_H
#define INTERP_LB_PROFILE_DATA_H
/* Process count the profile was generated for. */
#define INTERP_LB_NPROCS 64
/* Number of entries in interp_lb_heavy_blocks and interp_lb_splits. */
#define INTERP_LB_NUM_HEAVY 4
/* Ids of the blocks flagged as heavy by the profile. */
static const int interp_lb_heavy_blocks[4] = {27, 35, 28, 36};
/* Split table: {block_id, r_left, r_right} */
static const int interp_lb_splits[4][3] = {
{27, 26, 27},
{35, 34, 35},
{28, 28, 29},
{36, 36, 37},
};
/* Rank remap for displaced neighbor blocks */
/* NOTE(review): each pair is presumably {original rank, new rank}; the
 * order is not stated here - confirm against the generator. */
static const int interp_lb_num_remaps = 4;
static const int interp_lb_remaps[][2] = {
{26, 25},
{29, 30},
{34, 33},
{37, 38},
};
#endif /* INTERP_LB_PROFILE_DATA_H */

View File

@@ -1,321 +0,0 @@
#include "macrodef.h"
#include "tool.h"
/*
* C version of kodis — Kreiss-Oliger numerical dissipation (Cartesian patches).
*
* The KO operator is (D₊D₋)^r applied to f_rhs with alternating sign (-1)^(r-1).
*
* FD order → r → cof=2^(2r) mapping:
* ghost_width=2 (2nd) → r=2, cof=16, sign=-
* ghost_width=3 (4th) → r=3, cof=64, sign=+
* ghost_width=4 (6th) → r=4, cof=256, sign=-
* ghost_width=5 (8th) → r=5, cof=1024,sign=+
*
* Parameters:
*   ex       number of grid points in x, y, z
*   X, Y, Z  coordinate arrays; spacing is taken from their first two entries
*   f        field the stencil is applied to (read only)
*   f_rhs    right-hand side, updated in place at the points where the
*            full stencil fits
*   SoA      three factors forwarded unchanged to symmetry_bd
*   Symmetry symmetry mode, compared against the NO_SYMM/EQ_SYMM/OCTANT
*            constants defined in each branch
*   eps      dissipation strength; the update is scaled by eps/cof
*
* Exactly one branch below is compiled, selected by the ghost_width macro.
* NOTE(review): when malloc fails the function returns without touching
* f_rhs and without reporting the failure.
*/
void kodis(const int ex[3],
const double *X, const double *Y, const double *Z,
const double *f, double *f_rhs,
const double SoA[3],
int Symmetry, double eps)
{
const double ZEO = 0.0;
const int ex1 = ex[0], ex2 = ex[1], ex3 = ex[2];
const double dX = X[1] - X[0];
const double dY = Y[1] - Y[0];
const double dZ = Z[1] - Z[0];
const int imaxF = ex1, jmaxF = ex2, kmaxF = ex3;
#if (ghost_width == 2)
/* ---- r=2, cof=16, sign=-, 5pt stencil ----------------------------- */
{
const int ord = 2;
const int r = 2;
const double cof = 16.0;
const double F4 = 4.0, F6 = 6.0;
const int NO_SYMM = 0, EQ_SYMM = 1;
/* Lowest 1-based index available per direction; it drops below 1 only
   where a symmetry is active and the block starts within one spacing
   of the coordinate origin. */
int iminF = 1, jminF = 1, kminF = 1;
if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -1;
if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -1;
if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -1;
/* Work array with ord extra points per direction. */
const size_t nx = (size_t)ex1 + ord;
const size_t ny = (size_t)ex2 + ord;
const size_t nz = (size_t)ex3 + ord;
const size_t fh_size = nx * ny * nz;
double *fh = (double*)malloc(fh_size * sizeof(double));
if (!fh) return;
/* presumably copies f into fh and fills the extra points by symmetry —
   confirm against symmetry_bd */
symmetry_bd(ord, ex, f, fh, SoA);
/* i±2 must be valid: i-2 >= iminF && i+2 <= imaxF
C 0-based: i0 >= iminF+1, i0 <= ex1-3 */
const int i0_lo = (iminF + 1 > 0) ? (iminF + 1) : 0;
const int j0_lo = (jminF + 1 > 0) ? (jminF + 1) : 0;
const int k0_lo = (kminF + 1 > 0) ? (kminF + 1) : 0;
const int i0_hi = imaxF - 3;
const int j0_hi = jmaxF - 3;
const int k0_hi = kmaxF - 3;
if (!(i0_lo > i0_hi || j0_lo > j0_hi || k0_lo > k0_hi)) {
for (int k0 = k0_lo; k0 <= k0_hi; ++k0) {
const int kF = k0 + 1;
for (int j0 = j0_lo; j0 <= j0_hi; ++j0) {
const int jF = j0 + 1;
for (int i0 = i0_lo; i0 <= i0_hi; ++i0) {
/* iF/jF/kF are the 1-based counterparts of i0/j0/k0 and are used
   to index fh; p indexes f_rhs. */
const int iF = i0 + 1;
const size_t p = idx_ex(i0, j0, k0, ex);
const double Dx = (
(fh[idx_fh_F_ord2(iF - 2, jF, kF, ex)] + fh[idx_fh_F_ord2(iF + 2, jF, kF, ex)]) -
F4 * (fh[idx_fh_F_ord2(iF - 1, jF, kF, ex)] + fh[idx_fh_F_ord2(iF + 1, jF, kF, ex)]) +
F6 * fh[idx_fh_F_ord2(iF, jF, kF, ex)]
) / dX;
const double Dy = (
(fh[idx_fh_F_ord2(iF, jF - 2, kF, ex)] + fh[idx_fh_F_ord2(iF, jF + 2, kF, ex)]) -
F4 * (fh[idx_fh_F_ord2(iF, jF - 1, kF, ex)] + fh[idx_fh_F_ord2(iF, jF + 1, kF, ex)]) +
F6 * fh[idx_fh_F_ord2(iF, jF, kF, ex)]
) / dY;
const double Dz = (
(fh[idx_fh_F_ord2(iF, jF, kF - 2, ex)] + fh[idx_fh_F_ord2(iF, jF, kF + 2, ex)]) -
F4 * (fh[idx_fh_F_ord2(iF, jF, kF - 1, ex)] + fh[idx_fh_F_ord2(iF, jF, kF + 1, ex)]) +
F6 * fh[idx_fh_F_ord2(iF, jF, kF, ex)]
) / dZ;
f_rhs[p] -= (eps / cof) * (Dx + Dy + Dz); /* sign=- */
}
}
}
}
free(fh);
return;
}
#elif (ghost_width == 3)
/* ---- r=3, cof=64, sign=+, 7pt stencil (current default) ---------- */
{
const int ord = 3;
const int r = 3;
const double cof = 64.0;
const double SIX = 6.0, FIT = 15.0, TWT = 20.0;
const int NO_SYMM = 0, OCTANT = 2;
/* This branch tests Symmetry == OCTANT for x and y, where the other
   branches test Symmetry > EQ_SYMM. */
int iminF = 1, jminF = 1, kminF = 1;
if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -2;
if (Symmetry == OCTANT && fabs(X[0]) < dX) iminF = -2;
if (Symmetry == OCTANT && fabs(Y[0]) < dY) jminF = -2;
const size_t nx = (size_t)ex1 + ord;
const size_t ny = (size_t)ex2 + ord;
const size_t nz = (size_t)ex3 + ord;
const size_t fh_size = nx * ny * nz;
double *fh = (double*)malloc(fh_size * sizeof(double));
if (!fh) return;
symmetry_bd(ord, ex, f, fh, SoA);
/* i±3 must be valid: i0 >= iminF+2, i0 <= ex1-4 */
const int i0_lo = (iminF + 2 > 0) ? iminF + 2 : 0;
const int j0_lo = (jminF + 2 > 0) ? jminF + 2 : 0;
const int k0_lo = (kminF + 2 > 0) ? kminF + 2 : 0;
const int i0_hi = imaxF - 4;
const int j0_hi = jmaxF - 4;
const int k0_hi = kmaxF - 4;
if (!(i0_lo > i0_hi || j0_lo > j0_hi || k0_lo > k0_hi)) {
for (int k0 = k0_lo; k0 <= k0_hi; ++k0) {
const int kF = k0 + 1;
for (int j0 = j0_lo; j0 <= j0_hi; ++j0) {
const int jF = j0 + 1;
for (int i0 = i0_lo; i0 <= i0_hi; ++i0) {
const int iF = i0 + 1;
const size_t p = idx_ex(i0, j0, k0, ex);
/* Stencil: [1,-6,15,-20,15,-6,1] */
const double Dx = (
(fh[idx_fh_F(iF - 3, jF, kF, ex)] + fh[idx_fh_F(iF + 3, jF, kF, ex)]) -
SIX * (fh[idx_fh_F(iF - 2, jF, kF, ex)] + fh[idx_fh_F(iF + 2, jF, kF, ex)]) +
FIT * (fh[idx_fh_F(iF - 1, jF, kF, ex)] + fh[idx_fh_F(iF + 1, jF, kF, ex)]) -
TWT * fh[idx_fh_F(iF, jF, kF, ex)]
) / dX;
const double Dy = (
(fh[idx_fh_F(iF, jF - 3, kF, ex)] + fh[idx_fh_F(iF, jF + 3, kF, ex)]) -
SIX * (fh[idx_fh_F(iF, jF - 2, kF, ex)] + fh[idx_fh_F(iF, jF + 2, kF, ex)]) +
FIT * (fh[idx_fh_F(iF, jF - 1, kF, ex)] + fh[idx_fh_F(iF, jF + 1, kF, ex)]) -
TWT * fh[idx_fh_F(iF, jF, kF, ex)]
) / dY;
const double Dz = (
(fh[idx_fh_F(iF, jF, kF - 3, ex)] + fh[idx_fh_F(iF, jF, kF + 3, ex)]) -
SIX * (fh[idx_fh_F(iF, jF, kF - 2, ex)] + fh[idx_fh_F(iF, jF, kF + 2, ex)]) +
FIT * (fh[idx_fh_F(iF, jF, kF - 1, ex)] + fh[idx_fh_F(iF, jF, kF + 1, ex)]) -
TWT * fh[idx_fh_F(iF, jF, kF, ex)]
) / dZ;
f_rhs[p] += (eps / cof) * (Dx + Dy + Dz); /* sign=+ */
}
}
}
}
free(fh);
return;
}
#elif (ghost_width == 4)
/* ---- r=4, cof=256, sign=-, 9pt stencil ---------------------------- */
{
const int ord = 4;
const int r = 4;
const double cof = 256.0;
const double F8 = 8.0, F28 = 28.0, F56 = 56.0, F70 = 70.0;
const int NO_SYMM = 0, EQ_SYMM = 1;
int iminF = 1, jminF = 1, kminF = 1;
if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -3;
if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -3;
if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -3;
const size_t nx = (size_t)ex1 + ord;
const size_t ny = (size_t)ex2 + ord;
const size_t nz = (size_t)ex3 + ord;
const size_t fh_size = nx * ny * nz;
double *fh = (double*)malloc(fh_size * sizeof(double));
if (!fh) return;
symmetry_bd(ord, ex, f, fh, SoA);
/* i±4 valid: i-4>=iminF → i0>=iminF+3, i+4<=imaxF → i0<=ex1-5 */
const int i0_lo = (iminF + 3 > 0) ? iminF + 3 : 0;
const int j0_lo = (jminF + 3 > 0) ? jminF + 3 : 0;
const int k0_lo = (kminF + 3 > 0) ? kminF + 3 : 0;
const int i0_hi = imaxF - 5;
const int j0_hi = jmaxF - 5;
const int k0_hi = kmaxF - 5;
if (!(i0_lo > i0_hi || j0_lo > j0_hi || k0_lo > k0_hi)) {
for (int k0 = k0_lo; k0 <= k0_hi; ++k0) {
const int kF = k0 + 1;
for (int j0 = j0_lo; j0 <= j0_hi; ++j0) {
const int jF = j0 + 1;
for (int i0 = i0_lo; i0 <= i0_hi; ++i0) {
const int iF = i0 + 1;
const size_t p = idx_ex(i0, j0, k0, ex);
/* Stencil: [1,-8,28,-56,70,-56,28,-8,1] */
const double Dx = (
(fh[idx_fh_F_ord4(iF - 4, jF, kF, ex)] + fh[idx_fh_F_ord4(iF + 4, jF, kF, ex)]) -
F8 * (fh[idx_fh_F_ord4(iF - 3, jF, kF, ex)] + fh[idx_fh_F_ord4(iF + 3, jF, kF, ex)]) +
F28* (fh[idx_fh_F_ord4(iF - 2, jF, kF, ex)] + fh[idx_fh_F_ord4(iF + 2, jF, kF, ex)]) -
F56* (fh[idx_fh_F_ord4(iF - 1, jF, kF, ex)] + fh[idx_fh_F_ord4(iF + 1, jF, kF, ex)]) +
F70* fh[idx_fh_F_ord4(iF, jF, kF, ex)]
) / dX;
const double Dy = (
(fh[idx_fh_F_ord4(iF, jF - 4, kF, ex)] + fh[idx_fh_F_ord4(iF, jF + 4, kF, ex)]) -
F8 * (fh[idx_fh_F_ord4(iF, jF - 3, kF, ex)] + fh[idx_fh_F_ord4(iF, jF + 3, kF, ex)]) +
F28* (fh[idx_fh_F_ord4(iF, jF - 2, kF, ex)] + fh[idx_fh_F_ord4(iF, jF + 2, kF, ex)]) -
F56* (fh[idx_fh_F_ord4(iF, jF - 1, kF, ex)] + fh[idx_fh_F_ord4(iF, jF + 1, kF, ex)]) +
F70* fh[idx_fh_F_ord4(iF, jF, kF, ex)]
) / dY;
const double Dz = (
(fh[idx_fh_F_ord4(iF, jF, kF - 4, ex)] + fh[idx_fh_F_ord4(iF, jF, kF + 4, ex)]) -
F8 * (fh[idx_fh_F_ord4(iF, jF, kF - 3, ex)] + fh[idx_fh_F_ord4(iF, jF, kF + 3, ex)]) +
F28* (fh[idx_fh_F_ord4(iF, jF, kF - 2, ex)] + fh[idx_fh_F_ord4(iF, jF, kF + 2, ex)]) -
F56* (fh[idx_fh_F_ord4(iF, jF, kF - 1, ex)] + fh[idx_fh_F_ord4(iF, jF, kF + 1, ex)]) +
F70* fh[idx_fh_F_ord4(iF, jF, kF, ex)]
) / dZ;
f_rhs[p] -= (eps / cof) * (Dx + Dy + Dz); /* sign=- */
}
}
}
}
free(fh);
return;
}
#elif (ghost_width == 5)
/* ---- r=5, cof=1024, sign=+, 11pt stencil ------------------------- */
{
const int ord = 5;
const int r = 5;
const double cof = 1024.0;
const double F10 = 10.0, F45 = 45.0, F120 = 120.0;
const double F210 = 210.0, F252 = 252.0;
const int NO_SYMM = 0, EQ_SYMM = 1;
int iminF = 1, jminF = 1, kminF = 1;
if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -4;
if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -4;
if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -4;
const size_t nx = (size_t)ex1 + ord;
const size_t ny = (size_t)ex2 + ord;
const size_t nz = (size_t)ex3 + ord;
const size_t fh_size = nx * ny * nz;
double *fh = (double*)malloc(fh_size * sizeof(double));
if (!fh) return;
symmetry_bd(ord, ex, f, fh, SoA);
/* i±5 valid: i0>=iminF+4, i0<=ex1-6 */
const int i0_lo = (iminF + 4 > 0) ? iminF + 4 : 0;
const int j0_lo = (jminF + 4 > 0) ? jminF + 4 : 0;
const int k0_lo = (kminF + 4 > 0) ? kminF + 4 : 0;
const int i0_hi = imaxF - 6;
const int j0_hi = jmaxF - 6;
const int k0_hi = kmaxF - 6;
if (!(i0_lo > i0_hi || j0_lo > j0_hi || k0_lo > k0_hi)) {
for (int k0 = k0_lo; k0 <= k0_hi; ++k0) {
const int kF = k0 + 1;
for (int j0 = j0_lo; j0 <= j0_hi; ++j0) {
const int jF = j0 + 1;
for (int i0 = i0_lo; i0 <= i0_hi; ++i0) {
const int iF = i0 + 1;
const size_t p = idx_ex(i0, j0, k0, ex);
/* Stencil: [1,-10,45,-120,210,-252,210,-120,45,-10,1] */
const double Dx = (
(fh[idx_fh_F_ord5(iF - 5, jF, kF, ex)] + fh[idx_fh_F_ord5(iF + 5, jF, kF, ex)]) -
F10 * (fh[idx_fh_F_ord5(iF - 4, jF, kF, ex)] + fh[idx_fh_F_ord5(iF + 4, jF, kF, ex)]) +
F45 * (fh[idx_fh_F_ord5(iF - 3, jF, kF, ex)] + fh[idx_fh_F_ord5(iF + 3, jF, kF, ex)]) -
F120* (fh[idx_fh_F_ord5(iF - 2, jF, kF, ex)] + fh[idx_fh_F_ord5(iF + 2, jF, kF, ex)]) +
F210* (fh[idx_fh_F_ord5(iF - 1, jF, kF, ex)] + fh[idx_fh_F_ord5(iF + 1, jF, kF, ex)]) -
F252* fh[idx_fh_F_ord5(iF, jF, kF, ex)]
) / dX;
const double Dy = (
(fh[idx_fh_F_ord5(iF, jF - 5, kF, ex)] + fh[idx_fh_F_ord5(iF, jF + 5, kF, ex)]) -
F10 * (fh[idx_fh_F_ord5(iF, jF - 4, kF, ex)] + fh[idx_fh_F_ord5(iF, jF + 4, kF, ex)]) +
F45 * (fh[idx_fh_F_ord5(iF, jF - 3, kF, ex)] + fh[idx_fh_F_ord5(iF, jF + 3, kF, ex)]) -
F120* (fh[idx_fh_F_ord5(iF, jF - 2, kF, ex)] + fh[idx_fh_F_ord5(iF, jF + 2, kF, ex)]) +
F210* (fh[idx_fh_F_ord5(iF, jF - 1, kF, ex)] + fh[idx_fh_F_ord5(iF, jF + 1, kF, ex)]) -
F252* fh[idx_fh_F_ord5(iF, jF, kF, ex)]
) / dY;
const double Dz = (
(fh[idx_fh_F_ord5(iF, jF, kF - 5, ex)] + fh[idx_fh_F_ord5(iF, jF, kF + 5, ex)]) -
F10 * (fh[idx_fh_F_ord5(iF, jF, kF - 4, ex)] + fh[idx_fh_F_ord5(iF, jF, kF + 4, ex)]) +
F45 * (fh[idx_fh_F_ord5(iF, jF, kF - 3, ex)] + fh[idx_fh_F_ord5(iF, jF, kF + 3, ex)]) -
F120* (fh[idx_fh_F_ord5(iF, jF, kF - 2, ex)] + fh[idx_fh_F_ord5(iF, jF, kF + 2, ex)]) +
F210* (fh[idx_fh_F_ord5(iF, jF, kF - 1, ex)] + fh[idx_fh_F_ord5(iF, jF, kF + 1, ex)]) -
F252* fh[idx_fh_F_ord5(iF, jF, kF, ex)]
) / dZ;
f_rhs[p] += (eps / cof) * (Dx + Dy + Dz); /* sign=+ */
}
}
}
}
free(fh);
return;
}
#else
#error "kodiss_c.C: unsupported ghost_width (must be 2, 3, 4, or 5)"
#endif
}

View File

@@ -1,136 +0,0 @@
#include "macrodef.h"
#include "share_func.h"
/*
* kodis_sh — Kreiss-Oliger dissipation on shell patches.
* Same stencil coefficients as Cartesian kodis. Uses symmetry_stbd.
*/
extern "C" void kodis_sh_(const int ex[3],
const double *X, const double *Y, const double *Z,
const double *f, double *f_rhs,
const double SoAi[2],
int Symmetry, double eps, int sst)
{
(void)sst;
const double ZEO=0.0;
const int ex1=ex[0], ex2=ex[1], ex3=ex[2];
const double dX=X[1]-X[0], dY=Y[1]-Y[0], dZ=Z[1]-Z[0];
const int imaxF=ex1, jmaxF=ex2, kmaxF=ex3;
const double SoA[2]={SoAi[0],SoAi[1]};
#if (ghost_width == 2)
{
const int ord=2, r=2;
const double cof=16.0, F4=4.0, F6=6.0;
const int NO_SYMM=0, OCTANT=2;
int iminF=1,jminF=1,kminF=1;
if(Symmetry==OCTANT&&fabs(X[0])<dX)iminF=-1;
if(Symmetry==OCTANT&&fabs(Y[0])<dY)jminF=-1;
if((sst==2||sst==4)&&fabs(Y[0])<dY)jminF=-1;
const size_t nx=(size_t)ex1+2*ord,ny=(size_t)ex2+2*ord,nz=(size_t)ex3,fh_size=nx*ny*nz;
double *fh=(double*)malloc(fh_size*sizeof(double));if(!fh)return;
symmetry_stbd(ord,ex,f,fh,SoA);
const int i0_lo=(iminF+1>0)?iminF+1:0,j0_lo=(jminF+1>0)?jminF+1:0,k0_lo=2;
const int i0_hi=imaxF-3,j0_hi=jmaxF-3,k0_hi=kmaxF-3;
if(!(i0_lo>i0_hi||j0_lo>j0_hi||k0_lo>k0_hi)){
for(int k0=k0_lo;k0<=k0_hi;++k0){const int kF=k0+1;
for(int j0=j0_lo;j0<=j0_hi;++j0){const int jF=j0+1;
for(int i0=i0_lo;i0<=i0_hi;++i0){const int iF=i0+1;const size_t p=idx_ex(i0,j0,k0,ex);
const double Dx=((fh[idx_fh_stbd(iF-2,jF,kF,ord,ex)]+fh[idx_fh_stbd(iF+2,jF,kF,ord,ex)])-F4*(fh[idx_fh_stbd(iF-1,jF,kF,ord,ex)]+fh[idx_fh_stbd(iF+1,jF,kF,ord,ex)])+F6*fh[idx_fh_stbd(iF,jF,kF,ord,ex)])/dX;
const double Dy=((fh[idx_fh_stbd(iF,jF-2,kF,ord,ex)]+fh[idx_fh_stbd(iF,jF+2,kF,ord,ex)])-F4*(fh[idx_fh_stbd(iF,jF-1,kF,ord,ex)]+fh[idx_fh_stbd(iF,jF+1,kF,ord,ex)])+F6*fh[idx_fh_stbd(iF,jF,kF,ord,ex)])/dY;
const double Dz=((fh[idx_fh_stbd(iF,jF,kF-2,ord,ex)]+fh[idx_fh_stbd(iF,jF,kF+2,ord,ex)])-F4*(fh[idx_fh_stbd(iF,jF,kF-1,ord,ex)]+fh[idx_fh_stbd(iF,jF,kF+1,ord,ex)])+F6*fh[idx_fh_stbd(iF,jF,kF,ord,ex)])/dZ;
f_rhs[p]-=(eps/cof)*(Dx+Dy+Dz);
}}}
}
free(fh);return;
}
#elif (ghost_width == 3)
{
const int ord=3, r=3;
const double cof=64.0,SIX=6.0,FIT=15.0,TWT=20.0;
const int NO_SYMM=0,OCTANT=2;
int iminF=1,jminF=1,kminF=1;
if(Symmetry==OCTANT&&fabs(X[0])<dX)iminF=-2;
if(Symmetry==OCTANT&&fabs(Y[0])<dY)jminF=-2;
if((sst==2||sst==4)&&fabs(Y[0])<dY)jminF=-2;
const size_t nx=(size_t)ex1+2*ord,ny=(size_t)ex2+2*ord,nz=(size_t)ex3,fh_size=nx*ny*nz;
double *fh=(double*)malloc(fh_size*sizeof(double));if(!fh)return;
symmetry_stbd(ord,ex,f,fh,SoA);
const int i0_lo=(iminF+2>0)?iminF+2:0,j0_lo=(jminF+2>0)?jminF+2:0,k0_lo=3;
const int i0_hi=imaxF-4,j0_hi=jmaxF-4,k0_hi=kmaxF-4;
if(!(i0_lo>i0_hi||j0_lo>j0_hi||k0_lo>k0_hi)){
for(int k0=k0_lo;k0<=k0_hi;++k0){const int kF=k0+1;
for(int j0=j0_lo;j0<=j0_hi;++j0){const int jF=j0+1;
for(int i0=i0_lo;i0<=i0_hi;++i0){const int iF=i0+1;const size_t p=idx_ex(i0,j0,k0,ex);
const double Dx=((fh[idx_fh_stbd(iF-3,jF,kF,ord,ex)]+fh[idx_fh_stbd(iF+3,jF,kF,ord,ex)])-SIX*(fh[idx_fh_stbd(iF-2,jF,kF,ord,ex)]+fh[idx_fh_stbd(iF+2,jF,kF,ord,ex)])+FIT*(fh[idx_fh_stbd(iF-1,jF,kF,ord,ex)]+fh[idx_fh_stbd(iF+1,jF,kF,ord,ex)])-TWT*fh[idx_fh_stbd(iF,jF,kF,ord,ex)])/dX;
const double Dy=((fh[idx_fh_stbd(iF,jF-3,kF,ord,ex)]+fh[idx_fh_stbd(iF,jF+3,kF,ord,ex)])-SIX*(fh[idx_fh_stbd(iF,jF-2,kF,ord,ex)]+fh[idx_fh_stbd(iF,jF+2,kF,ord,ex)])+FIT*(fh[idx_fh_stbd(iF,jF-1,kF,ord,ex)]+fh[idx_fh_stbd(iF,jF+1,kF,ord,ex)])-TWT*fh[idx_fh_stbd(iF,jF,kF,ord,ex)])/dY;
const double Dz=((fh[idx_fh_stbd(iF,jF,kF-3,ord,ex)]+fh[idx_fh_stbd(iF,jF,kF+3,ord,ex)])-SIX*(fh[idx_fh_stbd(iF,jF,kF-2,ord,ex)]+fh[idx_fh_stbd(iF,jF,kF+2,ord,ex)])+FIT*(fh[idx_fh_stbd(iF,jF,kF-1,ord,ex)]+fh[idx_fh_stbd(iF,jF,kF+1,ord,ex)])-TWT*fh[idx_fh_stbd(iF,jF,kF,ord,ex)])/dZ;
f_rhs[p]+=(eps/cof)*(Dx+Dy+Dz);
}}}
}
free(fh);return;
}
#elif (ghost_width == 4)
{
const int ord=4, r=4;
const double cof=256.0,F8=8.0,F28=28.0,F56=56.0,F70=70.0;
const int NO_SYMM=0,OCTANT=2;
int iminF=1,jminF=1,kminF=1;
if(Symmetry==OCTANT&&fabs(X[0])<dX)iminF=-3;
if(Symmetry==OCTANT&&fabs(Y[0])<dY)jminF=-3;
if((sst==2||sst==4)&&fabs(Y[0])<dY)jminF=-3;
const size_t nx=(size_t)ex1+2*ord,ny=(size_t)ex2+2*ord,nz=(size_t)ex3,fh_size=nx*ny*nz;
double *fh=(double*)malloc(fh_size*sizeof(double));if(!fh)return;
symmetry_stbd(ord,ex,f,fh,SoA);
const int i0_lo=(iminF+3>0)?iminF+3:0,j0_lo=(jminF+3>0)?jminF+3:0,k0_lo=4;
const int i0_hi=imaxF-5,j0_hi=jmaxF-5,k0_hi=kmaxF-5;
if(!(i0_lo>i0_hi||j0_lo>j0_hi||k0_lo>k0_hi)){
for(int k0=k0_lo;k0<=k0_hi;++k0){const int kF=k0+1;
for(int j0=j0_lo;j0<=j0_hi;++j0){const int jF=j0+1;
for(int i0=i0_lo;i0<=i0_hi;++i0){const int iF=i0+1;const size_t p=idx_ex(i0,j0,k0,ex);
const double Dx=((fh[idx_fh_stbd(iF-4,jF,kF,ord,ex)]+fh[idx_fh_stbd(iF+4,jF,kF,ord,ex)])-F8*(fh[idx_fh_stbd(iF-3,jF,kF,ord,ex)]+fh[idx_fh_stbd(iF+3,jF,kF,ord,ex)])+F28*(fh[idx_fh_stbd(iF-2,jF,kF,ord,ex)]+fh[idx_fh_stbd(iF+2,jF,kF,ord,ex)])-F56*(fh[idx_fh_stbd(iF-1,jF,kF,ord,ex)]+fh[idx_fh_stbd(iF+1,jF,kF,ord,ex)])+F70*fh[idx_fh_stbd(iF,jF,kF,ord,ex)])/dX;
const double Dy=((fh[idx_fh_stbd(iF,jF-4,kF,ord,ex)]+fh[idx_fh_stbd(iF,jF+4,kF,ord,ex)])-F8*(fh[idx_fh_stbd(iF,jF-3,kF,ord,ex)]+fh[idx_fh_stbd(iF,jF+3,kF,ord,ex)])+F28*(fh[idx_fh_stbd(iF,jF-2,kF,ord,ex)]+fh[idx_fh_stbd(iF,jF+2,kF,ord,ex)])-F56*(fh[idx_fh_stbd(iF,jF-1,kF,ord,ex)]+fh[idx_fh_stbd(iF,jF+1,kF,ord,ex)])+F70*fh[idx_fh_stbd(iF,jF,kF,ord,ex)])/dY;
const double Dz=((fh[idx_fh_stbd(iF,jF,kF-4,ord,ex)]+fh[idx_fh_stbd(iF,jF,kF+4,ord,ex)])-F8*(fh[idx_fh_stbd(iF,jF,kF-3,ord,ex)]+fh[idx_fh_stbd(iF,jF,kF+3,ord,ex)])+F28*(fh[idx_fh_stbd(iF,jF,kF-2,ord,ex)]+fh[idx_fh_stbd(iF,jF,kF+2,ord,ex)])-F56*(fh[idx_fh_stbd(iF,jF,kF-1,ord,ex)]+fh[idx_fh_stbd(iF,jF,kF+1,ord,ex)])+F70*fh[idx_fh_stbd(iF,jF,kF,ord,ex)])/dZ;
f_rhs[p]-=(eps/cof)*(Dx+Dy+Dz);
}}}
}
free(fh);return;
}
#elif (ghost_width == 5)
{
const int ord=5, r=5;
const double cof=1024.0,F10=10.0,F45k=45.0,F120=120.0,F210=210.0,F252=252.0;
const int NO_SYMM=0,OCTANT=2;
int iminF=1,jminF=1,kminF=1;
if(Symmetry==OCTANT&&fabs(X[0])<dX)iminF=-4;
if(Symmetry==OCTANT&&fabs(Y[0])<dY)jminF=-4;
if((sst==2||sst==4)&&fabs(Y[0])<dY)jminF=-4;
const size_t nx=(size_t)ex1+2*ord,ny=(size_t)ex2+2*ord,nz=(size_t)ex3,fh_size=nx*ny*nz;
double *fh=(double*)malloc(fh_size*sizeof(double));if(!fh)return;
symmetry_stbd(ord,ex,f,fh,SoA);
const int i0_lo=(iminF+4>0)?iminF+4:0,j0_lo=(jminF+4>0)?jminF+4:0,k0_lo=5;
const int i0_hi=imaxF-6,j0_hi=jmaxF-6,k0_hi=kmaxF-6;
if(!(i0_lo>i0_hi||j0_lo>j0_hi||k0_lo>k0_hi)){
for(int k0=k0_lo;k0<=k0_hi;++k0){const int kF=k0+1;
for(int j0=j0_lo;j0<=j0_hi;++j0){const int jF=j0+1;
for(int i0=i0_lo;i0<=i0_hi;++i0){const int iF=i0+1;const size_t p=idx_ex(i0,j0,k0,ex);
const double Dx=((fh[idx_fh_stbd(iF-5,jF,kF,ord,ex)]+fh[idx_fh_stbd(iF+5,jF,kF,ord,ex)])-F10*(fh[idx_fh_stbd(iF-4,jF,kF,ord,ex)]+fh[idx_fh_stbd(iF+4,jF,kF,ord,ex)])+F45k*(fh[idx_fh_stbd(iF-3,jF,kF,ord,ex)]+fh[idx_fh_stbd(iF+3,jF,kF,ord,ex)])-F120*(fh[idx_fh_stbd(iF-2,jF,kF,ord,ex)]+fh[idx_fh_stbd(iF+2,jF,kF,ord,ex)])+F210*(fh[idx_fh_stbd(iF-1,jF,kF,ord,ex)]+fh[idx_fh_stbd(iF+1,jF,kF,ord,ex)])-F252*fh[idx_fh_stbd(iF,jF,kF,ord,ex)])/dX;
const double Dy=((fh[idx_fh_stbd(iF,jF-5,kF,ord,ex)]+fh[idx_fh_stbd(iF,jF+5,kF,ord,ex)])-F10*(fh[idx_fh_stbd(iF,jF-4,kF,ord,ex)]+fh[idx_fh_stbd(iF,jF+4,kF,ord,ex)])+F45k*(fh[idx_fh_stbd(iF,jF-3,kF,ord,ex)]+fh[idx_fh_stbd(iF,jF+3,kF,ord,ex)])-F120*(fh[idx_fh_stbd(iF,jF-2,kF,ord,ex)]+fh[idx_fh_stbd(iF,jF+2,kF,ord,ex)])+F210*(fh[idx_fh_stbd(iF,jF-1,kF,ord,ex)]+fh[idx_fh_stbd(iF,jF+1,kF,ord,ex)])-F252*fh[idx_fh_stbd(iF,jF,kF,ord,ex)])/dY;
const double Dz=((fh[idx_fh_stbd(iF,jF,kF-5,ord,ex)]+fh[idx_fh_stbd(iF,jF,kF+5,ord,ex)])-F10*(fh[idx_fh_stbd(iF,jF,kF-4,ord,ex)]+fh[idx_fh_stbd(iF,jF,kF+4,ord,ex)])+F45k*(fh[idx_fh_stbd(iF,jF,kF-3,ord,ex)]+fh[idx_fh_stbd(iF,jF,kF+3,ord,ex)])-F120*(fh[idx_fh_stbd(iF,jF,kF-2,ord,ex)]+fh[idx_fh_stbd(iF,jF,kF+2,ord,ex)])+F210*(fh[idx_fh_stbd(iF,jF,kF-1,ord,ex)]+fh[idx_fh_stbd(iF,jF,kF+1,ord,ex)])-F252*fh[idx_fh_stbd(iF,jF,kF,ord,ex)])/dZ;
f_rhs[p]+=(eps/cof)*(Dx+Dy+Dz);
}}}
}
free(fh);return;
}
#else
#error "kodiss_sh_c.C: unsupported ghost_width"
#endif
}

View File

@@ -1,591 +0,0 @@
#include "macrodef.h"
#include "tool.h"
/*
* C version of lopsided — upwind (lopsided) advection derivatives.
*
* Adds advection terms to f_rhs for all three spatial directions.
* Uses sign-biased (one-sided) stencils with centered fallbacks.
*
* For lopsided, symmetry_bd ord = ghost_width (same as kodiss).
*/
void lopsided(const int ex[3],
const double *X, const double *Y, const double *Z,
const double *f, double *f_rhs,
const double *Sfx, const double *Sfy, const double *Sfz,
int Symmetry, const double SoA[3])
{
const double ZEO = 0.0, ONE = 1.0;
const double TWO = 2.0, F6 = 6.0, EIT = 8.0;
const double F3 = 3.0, F4 = 4.0, F5 = 5.0, F10 = 10.0, F12 = 12.0, F18 = 18.0;
const double F9 = 9.0, F45 = 45.0, F60 = 60.0;
const double F2 = 2.0, F15 = 15.0, F24 = 24.0, F30 = 30.0, F35 = 35.0;
const double F50 = 50.0, F77 = 77.0, F80 = 80.0, F100 = 100.0, F150 = 150.0;
const double F32 = 32.0, F168 = 168.0, F672 = 672.0, F840 = 840.0;
const double F140=140.0, F378=378.0, F420=420.0, F1050=1050.0;
const int NO_SYMM = 0, EQ_SYMM = 1;
const int ex1 = ex[0], ex2 = ex[1], ex3 = ex[2];
const double dX = X[1] - X[0];
const double dY = Y[1] - Y[0];
const double dZ = Z[1] - Z[0];
#if (ghost_width == 2)
/* ---- 2nd-order lopsided --------------------------------------------- */
{
const int ord = 2;
int iminF = 1, jminF = 1, kminF = 1;
if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -1;
if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -1;
if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -1;
const size_t nx = (size_t)ex1 + ord;
const size_t ny = (size_t)ex2 + ord;
const size_t nz = (size_t)ex3 + ord;
const size_t fh_size = nx * ny * nz;
double *fh = (double*)malloc(fh_size * sizeof(double));
if (!fh) return;
symmetry_bd(ord, ex, f, fh, SoA);
const double d2dx = ONE / TWO / dX;
const double d2dy = ONE / TWO / dY;
const double d2dz = ONE / TWO / dZ;
const int imaxF = ex1, jmaxF = ex2, kmaxF = ex3;
for (int k0 = 0; k0 <= ex3 - 2; ++k0) {
const int kF = k0 + 1;
for (int j0 = 0; j0 <= ex2 - 2; ++j0) {
const int jF = j0 + 1;
for (int i0 = 0; i0 <= ex1 - 2; ++i0) {
const int iF = i0 + 1;
const size_t p = idx_ex(i0, j0, k0, ex);
/* x-direction */
const double sfx = Sfx[p];
if (sfx > ZEO) {
if (i0 <= ex1 - 3) // i+2 <= imax
f_rhs[p] += sfx * d2dx * (
-F3*fh[idx_fh_F_ord2(iF, jF, kF, ex)] +
F4*fh[idx_fh_F_ord2(iF+1, jF, kF, ex)] -
fh[idx_fh_F_ord2(iF+2, jF, kF, ex)]);
else if (i0 <= ex1 - 2) // i+1 <= imax
f_rhs[p] += sfx * d2dx * (
-fh[idx_fh_F_ord2(iF, jF, kF, ex)] +
fh[idx_fh_F_ord2(iF+1, jF, kF, ex)]);
} else if (sfx < ZEO) {
if ((i0 - 1) >= iminF) // i-2 >= imin
f_rhs[p] -= sfx * d2dx * (
-F3*fh[idx_fh_F_ord2(iF, jF, kF, ex)] +
F4*fh[idx_fh_F_ord2(iF-1, jF, kF, ex)] -
fh[idx_fh_F_ord2(iF-2, jF, kF, ex)]);
else if (i0 >= iminF) // i-1 >= imin
f_rhs[p] -= sfx * d2dx * (
-fh[idx_fh_F_ord2(iF, jF, kF, ex)] +
fh[idx_fh_F_ord2(iF-1, jF, kF, ex)]);
}
/* y-direction */
const double sfy = Sfy[p];
if (sfy > ZEO) {
if (j0 <= ex2-3)
f_rhs[p] += sfy * d2dy * (
-F3*fh[idx_fh_F_ord2(iF, jF, kF, ex)] +
F4*fh[idx_fh_F_ord2(iF, jF+1, kF, ex)] -
fh[idx_fh_F_ord2(iF, jF+2, kF, ex)]);
else if (j0 <= ex2-2)
f_rhs[p] += sfy * d2dy * (
-fh[idx_fh_F_ord2(iF, jF, kF, ex)] +
fh[idx_fh_F_ord2(iF, jF+1, kF, ex)]);
} else if (sfy < ZEO) {
if ((j0-1) >= jminF)
f_rhs[p] -= sfy * d2dy * (
-F3*fh[idx_fh_F_ord2(iF, jF, kF, ex)] +
F4*fh[idx_fh_F_ord2(iF, jF-1, kF, ex)] -
fh[idx_fh_F_ord2(iF, jF-2, kF, ex)]);
else if (j0 >= jminF)
f_rhs[p] -= sfy * d2dy * (
-fh[idx_fh_F_ord2(iF, jF, kF, ex)] +
fh[idx_fh_F_ord2(iF, jF-1, kF, ex)]);
}
/* z-direction */
const double sfz = Sfz[p];
if (sfz > ZEO) {
if (k0 <= ex3-3)
f_rhs[p] += sfz * d2dz * (
-F3*fh[idx_fh_F_ord2(iF, jF, kF, ex)] +
F4*fh[idx_fh_F_ord2(iF, jF, kF+1, ex)] -
fh[idx_fh_F_ord2(iF, jF, kF+2, ex)]);
else if (k0 <= ex3-2)
f_rhs[p] += sfz * d2dz * (
-fh[idx_fh_F_ord2(iF, jF, kF, ex)] +
fh[idx_fh_F_ord2(iF, jF, kF+1, ex)]);
} else if (sfz < ZEO) {
if ((k0-1) >= kminF)
f_rhs[p] -= sfz * d2dz * (
-F3*fh[idx_fh_F_ord2(iF, jF, kF, ex)] +
F4*fh[idx_fh_F_ord2(iF, jF, kF-1, ex)] -
fh[idx_fh_F_ord2(iF, jF, kF-2, ex)]);
else if (k0 >= kminF)
f_rhs[p] -= sfz * d2dz * (
-fh[idx_fh_F_ord2(iF, jF, kF, ex)] +
fh[idx_fh_F_ord2(iF, jF, kF-1, ex)]);
}
}
}
}
free(fh);
return;
}
#elif (ghost_width == 3)
/* ---- 4th-order lopsided (original code) ---------------------------- */
{
const int ord = 3;
int iminF = 1, jminF = 1, kminF = 1;
if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -2;
if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -2;
if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -2;
const size_t nx = (size_t)ex1 + ord;
const size_t ny = (size_t)ex2 + ord;
const size_t nz = (size_t)ex3 + ord;
const size_t fh_size = nx * ny * nz;
double *fh = (double*)malloc(fh_size * sizeof(double));
if (!fh) return;
symmetry_bd(ord, ex, f, fh, SoA);
const double d12dx = ONE / F12 / dX;
const double d12dy = ONE / F12 / dY;
const double d12dz = ONE / F12 / dZ;
const int imaxF = ex1, jmaxF = ex2, kmaxF = ex3;
for (int k0 = 0; k0 <= ex3 - 2; ++k0) {
const int kF = k0 + 1;
for (int j0 = 0; j0 <= ex2 - 2; ++j0) {
const int jF = j0 + 1;
for (int i0 = 0; i0 <= ex1 - 2; ++i0) {
const int iF = i0 + 1;
const size_t p = idx_ex(i0, j0, k0, ex);
const double sfx = Sfx[p];
if (sfx > ZEO) {
if (i0 <= ex1 - 4) // i+3 <= imax
f_rhs[p] += sfx * d12dx * (
-F3 *fh[idx_fh_F(iF-1, jF, kF, ex)]
-F10*fh[idx_fh_F(iF, jF, kF, ex)]
+F18*fh[idx_fh_F(iF+1, jF, kF, ex)]
-F6 *fh[idx_fh_F(iF+2, jF, kF, ex)]
+ fh[idx_fh_F(iF+3, jF, kF, ex)]);
else if (i0 <= ex1 - 3) // i+2 <= imax
f_rhs[p] += sfx * d12dx * (
fh[idx_fh_F(iF-2, jF, kF, ex)]
-EIT*fh[idx_fh_F(iF-1, jF, kF, ex)]
+EIT*fh[idx_fh_F(iF+1, jF, kF, ex)]
- fh[idx_fh_F(iF+2, jF, kF, ex)]);
else if (i0 <= ex1 - 2) // i+1 <= imax → mirrored
f_rhs[p] -= sfx * d12dx * (
-F3 *fh[idx_fh_F(iF+1, jF, kF, ex)]
-F10*fh[idx_fh_F(iF, jF, kF, ex)]
+F18*fh[idx_fh_F(iF-1, jF, kF, ex)]
-F6 *fh[idx_fh_F(iF-2, jF, kF, ex)]
+ fh[idx_fh_F(iF-3, jF, kF, ex)]);
} else if (sfx < ZEO) {
if ((i0 - 2) >= iminF) // i-3 >= imin
f_rhs[p] -= sfx * d12dx * (
-F3 *fh[idx_fh_F(iF+1, jF, kF, ex)]
-F10*fh[idx_fh_F(iF, jF, kF, ex)]
+F18*fh[idx_fh_F(iF-1, jF, kF, ex)]
-F6 *fh[idx_fh_F(iF-2, jF, kF, ex)]
+ fh[idx_fh_F(iF-3, jF, kF, ex)]);
else if ((i0 - 1) >= iminF) // i-2 >= imin
f_rhs[p] += sfx * d12dx * (
fh[idx_fh_F(iF-2, jF, kF, ex)]
-EIT*fh[idx_fh_F(iF-1, jF, kF, ex)]
+EIT*fh[idx_fh_F(iF+1, jF, kF, ex)]
- fh[idx_fh_F(iF+2, jF, kF, ex)]);
else if (i0 >= iminF) // i-1 >= imin → mirrored
f_rhs[p] += sfx * d12dx * (
-F3 *fh[idx_fh_F(iF-1, jF, kF, ex)]
-F10*fh[idx_fh_F(iF, jF, kF, ex)]
+F18*fh[idx_fh_F(iF+1, jF, kF, ex)]
-F6 *fh[idx_fh_F(iF+2, jF, kF, ex)]
+ fh[idx_fh_F(iF+3, jF, kF, ex)]);
}
const double sfy = Sfy[p];
if (sfy > ZEO) {
if (j0 <= ex2-4)
f_rhs[p] += sfy * d12dy * (
-F3*fh[idx_fh_F(iF,jF-1,kF,ex)]-F10*fh[idx_fh_F(iF,jF,kF,ex)]
+F18*fh[idx_fh_F(iF,jF+1,kF,ex)]-F6*fh[idx_fh_F(iF,jF+2,kF,ex)]
+fh[idx_fh_F(iF,jF+3,kF,ex)]);
else if (j0 <= ex2-3)
f_rhs[p] += sfy * d12dy * (fh[idx_fh_F(iF,jF-2,kF,ex)]-EIT*fh[idx_fh_F(iF,jF-1,kF,ex)]+EIT*fh[idx_fh_F(iF,jF+1,kF,ex)]-fh[idx_fh_F(iF,jF+2,kF,ex)]);
else if (j0 <= ex2-2)
f_rhs[p] -= sfy * d12dy * (
-F3*fh[idx_fh_F(iF,jF+1,kF,ex)]-F10*fh[idx_fh_F(iF,jF,kF,ex)]
+F18*fh[idx_fh_F(iF,jF-1,kF,ex)]-F6*fh[idx_fh_F(iF,jF-2,kF,ex)]
+fh[idx_fh_F(iF,jF-3,kF,ex)]);
} else if (sfy < ZEO) {
if ((j0-2) >= jminF)
f_rhs[p] -= sfy * d12dy * (
-F3*fh[idx_fh_F(iF,jF+1,kF,ex)]-F10*fh[idx_fh_F(iF,jF,kF,ex)]
+F18*fh[idx_fh_F(iF,jF-1,kF,ex)]-F6*fh[idx_fh_F(iF,jF-2,kF,ex)]
+fh[idx_fh_F(iF,jF-3,kF,ex)]);
else if ((j0-1) >= jminF)
f_rhs[p] += sfy * d12dy * (fh[idx_fh_F(iF,jF-2,kF,ex)]-EIT*fh[idx_fh_F(iF,jF-1,kF,ex)]+EIT*fh[idx_fh_F(iF,jF+1,kF,ex)]-fh[idx_fh_F(iF,jF+2,kF,ex)]);
else if (j0 >= jminF)
f_rhs[p] += sfy * d12dy * (
-F3*fh[idx_fh_F(iF,jF-1,kF,ex)]-F10*fh[idx_fh_F(iF,jF,kF,ex)]
+F18*fh[idx_fh_F(iF,jF+1,kF,ex)]-F6*fh[idx_fh_F(iF,jF+2,kF,ex)]
+fh[idx_fh_F(iF,jF+3,kF,ex)]);
}
const double sfz = Sfz[p];
if (sfz > ZEO) {
if (k0 <= ex3-4)
f_rhs[p] += sfz * d12dz * (
-F3*fh[idx_fh_F(iF,jF,kF-1,ex)]-F10*fh[idx_fh_F(iF,jF,kF,ex)]
+F18*fh[idx_fh_F(iF,jF,kF+1,ex)]-F6*fh[idx_fh_F(iF,jF,kF+2,ex)]
+fh[idx_fh_F(iF,jF,kF+3,ex)]);
else if (k0 <= ex3-3)
f_rhs[p] += sfz * d12dz * (fh[idx_fh_F(iF,jF,kF-2,ex)]-EIT*fh[idx_fh_F(iF,jF,kF-1,ex)]+EIT*fh[idx_fh_F(iF,jF,kF+1,ex)]-fh[idx_fh_F(iF,jF,kF+2,ex)]);
else if (k0 <= ex3-2)
f_rhs[p] -= sfz * d12dz * (
-F3*fh[idx_fh_F(iF,jF,kF+1,ex)]-F10*fh[idx_fh_F(iF,jF,kF,ex)]
+F18*fh[idx_fh_F(iF,jF,kF-1,ex)]-F6*fh[idx_fh_F(iF,jF,kF-2,ex)]
+fh[idx_fh_F(iF,jF,kF-3,ex)]);
} else if (sfz < ZEO) {
if ((k0-2) >= kminF)
f_rhs[p] -= sfz * d12dz * (
-F3*fh[idx_fh_F(iF,jF,kF+1,ex)]-F10*fh[idx_fh_F(iF,jF,kF,ex)]
+F18*fh[idx_fh_F(iF,jF,kF-1,ex)]-F6*fh[idx_fh_F(iF,jF,kF-2,ex)]
+fh[idx_fh_F(iF,jF,kF-3,ex)]);
else if ((k0-1) >= kminF)
f_rhs[p] += sfz * d12dz * (fh[idx_fh_F(iF,jF,kF-2,ex)]-EIT*fh[idx_fh_F(iF,jF,kF-1,ex)]+EIT*fh[idx_fh_F(iF,jF,kF+1,ex)]-fh[idx_fh_F(iF,jF,kF+2,ex)]);
else if (k0 >= kminF)
f_rhs[p] += sfz * d12dz * (
-F3*fh[idx_fh_F(iF,jF,kF-1,ex)]-F10*fh[idx_fh_F(iF,jF,kF,ex)]
+F18*fh[idx_fh_F(iF,jF,kF+1,ex)]-F6*fh[idx_fh_F(iF,jF,kF+2,ex)]
+fh[idx_fh_F(iF,jF,kF+3,ex)]);
}
}
}
}
free(fh);
return;
}
#elif (ghost_width == 4)
/* ---- 6th-order lopsided --------------------------------------------- */
{
const int ord = 4;
int iminF = 1, jminF = 1, kminF = 1;
if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -3;
if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -3;
if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -3;
const size_t nx = (size_t)ex1 + ord;
const size_t ny = (size_t)ex2 + ord;
const size_t nz = (size_t)ex3 + ord;
const size_t fh_size = nx * ny * nz;
double *fh = (double*)malloc(fh_size * sizeof(double));
if (!fh) return;
symmetry_bd(ord, ex, f, fh, SoA);
const double d60dx = ONE / F60 / dX;
const double d60dy = ONE / F60 / dY;
const double d60dz = ONE / F60 / dZ;
const double d12dx = ONE / F12 / dX;
const double d12dy = ONE / F12 / dY;
const double d12dz = ONE / F12 / dZ;
const double d2dx = ONE / TWO / dX;
const double d2dy = ONE / TWO / dY;
const double d2dz = ONE / TWO / dZ;
const int imaxF = ex1, jmaxF = ex2, kmaxF = ex3;
for (int k0 = 0; k0 <= ex3 - 2; ++k0) {
const int kF = k0 + 1;
for (int j0 = 0; j0 <= ex2 - 2; ++j0) {
const int jF = j0 + 1;
for (int i0 = 0; i0 <= ex1 - 2; ++i0) {
const int iF = i0 + 1;
const size_t p = idx_ex(i0, j0, k0, ex);
/* ---- x-direction ---- */
const double sfx = Sfx[p];
if (sfx > ZEO) {
/* Primary biased: 2*f(i-2)-24*f(i-1)-35*f(i)+80*f(i+1)-30*f(i+2)+8*f(i+3)-f(i+4) */
if (i0 <= ex1-5 && (i0-1)>=iminF) // i+4<=imax && i-2>=imin
f_rhs[p] += sfx * d60dx * (
+F2*fh[idx_fh_F_ord4(iF-2,jF,kF,ex)]-F24*fh[idx_fh_F_ord4(iF-1,jF,kF,ex)]
-F35*fh[idx_fh_F_ord4(iF,jF,kF,ex)]+F80*fh[idx_fh_F_ord4(iF+1,jF,kF,ex)]
-F30*fh[idx_fh_F_ord4(iF+2,jF,kF,ex)]+EIT*fh[idx_fh_F_ord4(iF+3,jF,kF,ex)]
-fh[idx_fh_F_ord4(iF+4,jF,kF,ex)]);
/* Boundary-adapted: -10*f(i-1)-77*f(i)+150*f(i+1)-100*f(i+2)+50*f(i+3)-15*f(i+4)+2*f(i+5) */
else if (i0 <= ex1-6 && i0 >= iminF) // i+5<=imax && i-1>=imin
f_rhs[p] += sfx * d60dx * (
-F10*fh[idx_fh_F_ord4(iF-1,jF,kF,ex)]-F77*fh[idx_fh_F_ord4(iF,jF,kF,ex)]
+F150*fh[idx_fh_F_ord4(iF+1,jF,kF,ex)]-F100*fh[idx_fh_F_ord4(iF+2,jF,kF,ex)]
+F50*fh[idx_fh_F_ord4(iF+3,jF,kF,ex)]-F15*fh[idx_fh_F_ord4(iF+4,jF,kF,ex)]
+F2*fh[idx_fh_F_ord4(iF+5,jF,kF,ex)]);
/* Centered fallbacks */
else if (i0 <= ex1-4 && (i0-2)>=iminF) // 6th: i+3<=imax && i-3>=imin
f_rhs[p] += sfx * d60dx * (
-fh[idx_fh_F_ord4(iF-3,jF,kF,ex)]+F9*fh[idx_fh_F_ord4(iF-2,jF,kF,ex)]
-F45*fh[idx_fh_F_ord4(iF-1,jF,kF,ex)]+F45*fh[idx_fh_F_ord4(iF+1,jF,kF,ex)]
-F9*fh[idx_fh_F_ord4(iF+2,jF,kF,ex)]+fh[idx_fh_F_ord4(iF+3,jF,kF,ex)]);
else if (i0 <= ex1-3 && (i0-1)>=iminF) // 4th
f_rhs[p] += sfx * d12dx * (
fh[idx_fh_F_ord4(iF-2,jF,kF,ex)]-EIT*fh[idx_fh_F_ord4(iF-1,jF,kF,ex)]
+EIT*fh[idx_fh_F_ord4(iF+1,jF,kF,ex)]-fh[idx_fh_F_ord4(iF+2,jF,kF,ex)]);
else if (i0 <= ex1-2 && i0>=iminF) // 2nd
f_rhs[p] += sfx * d2dx * (
-fh[idx_fh_F_ord4(iF-1,jF,kF,ex)]+fh[idx_fh_F_ord4(iF+1,jF,kF,ex)]);
} else if (sfx < ZEO) {
if ((i0-4)>=iminF && i0<=ex1-2) // i-4>=imin && i+2<=imax
f_rhs[p] -= sfx * d60dx * (
+F2*fh[idx_fh_F_ord4(iF+2,jF,kF,ex)]-F24*fh[idx_fh_F_ord4(iF+1,jF,kF,ex)]
-F35*fh[idx_fh_F_ord4(iF,jF,kF,ex)]+F80*fh[idx_fh_F_ord4(iF-1,jF,kF,ex)]
-F30*fh[idx_fh_F_ord4(iF-2,jF,kF,ex)]+EIT*fh[idx_fh_F_ord4(iF-3,jF,kF,ex)]
-fh[idx_fh_F_ord4(iF-4,jF,kF,ex)]);
else if ((i0-5)>=iminF && i0<=ex1-2) // i-5>=imin && i+1<=imax
f_rhs[p] -= sfx * d60dx * (
-F10*fh[idx_fh_F_ord4(iF+1,jF,kF,ex)]-F77*fh[idx_fh_F_ord4(iF,jF,kF,ex)]
+F150*fh[idx_fh_F_ord4(iF-1,jF,kF,ex)]-F100*fh[idx_fh_F_ord4(iF-2,jF,kF,ex)]
+F50*fh[idx_fh_F_ord4(iF-3,jF,kF,ex)]-F15*fh[idx_fh_F_ord4(iF-4,jF,kF,ex)]
+F2*fh[idx_fh_F_ord4(iF-5,jF,kF,ex)]);
else if ((i0-3)>=iminF && i0<=ex1-2) // 6th centered
f_rhs[p] -= sfx * d60dx * (
-fh[idx_fh_F_ord4(iF-3,jF,kF,ex)]+F9*fh[idx_fh_F_ord4(iF-2,jF,kF,ex)]
-F45*fh[idx_fh_F_ord4(iF-1,jF,kF,ex)]+F45*fh[idx_fh_F_ord4(iF+1,jF,kF,ex)]
-F9*fh[idx_fh_F_ord4(iF+2,jF,kF,ex)]+fh[idx_fh_F_ord4(iF+3,jF,kF,ex)]);
else if ((i0-2)>=iminF && i0<=ex1-2) // 4th
f_rhs[p] -= sfx * d12dx * (
fh[idx_fh_F_ord4(iF-2,jF,kF,ex)]-EIT*fh[idx_fh_F_ord4(iF-1,jF,kF,ex)]
+EIT*fh[idx_fh_F_ord4(iF+1,jF,kF,ex)]-fh[idx_fh_F_ord4(iF+2,jF,kF,ex)]);
else if ((i0-1)>=iminF && i0<=ex1-2) // 2nd
f_rhs[p] -= sfx * d2dx * (
-fh[idx_fh_F_ord4(iF-1,jF,kF,ex)]+fh[idx_fh_F_ord4(iF+1,jF,kF,ex)]);
}
/* ---- y-direction ---- */
const double sfy = Sfy[p];
if (sfy > ZEO) {
if (j0<=ex2-5 && (j0-1)>=jminF)
f_rhs[p] += sfy * d60dy*(F2*fh[idx_fh_F_ord4(iF,jF-2,kF,ex)]-F24*fh[idx_fh_F_ord4(iF,jF-1,kF,ex)]-F35*fh[idx_fh_F_ord4(iF,jF,kF,ex)]+F80*fh[idx_fh_F_ord4(iF,jF+1,kF,ex)]-F30*fh[idx_fh_F_ord4(iF,jF+2,kF,ex)]+EIT*fh[idx_fh_F_ord4(iF,jF+3,kF,ex)]-fh[idx_fh_F_ord4(iF,jF+4,kF,ex)]);
else if (j0<=ex2-6 && j0>=jminF)
f_rhs[p] += sfy * d60dy*(-F10*fh[idx_fh_F_ord4(iF,jF-1,kF,ex)]-F77*fh[idx_fh_F_ord4(iF,jF,kF,ex)]+F150*fh[idx_fh_F_ord4(iF,jF+1,kF,ex)]-F100*fh[idx_fh_F_ord4(iF,jF+2,kF,ex)]+F50*fh[idx_fh_F_ord4(iF,jF+3,kF,ex)]-F15*fh[idx_fh_F_ord4(iF,jF+4,kF,ex)]+F2*fh[idx_fh_F_ord4(iF,jF+5,kF,ex)]);
else if (j0<=ex2-4 && (j0-2)>=jminF)
f_rhs[p] += sfy * d60dy*(-fh[idx_fh_F_ord4(iF,jF-3,kF,ex)]+F9*fh[idx_fh_F_ord4(iF,jF-2,kF,ex)]-F45*fh[idx_fh_F_ord4(iF,jF-1,kF,ex)]+F45*fh[idx_fh_F_ord4(iF,jF+1,kF,ex)]-F9*fh[idx_fh_F_ord4(iF,jF+2,kF,ex)]+fh[idx_fh_F_ord4(iF,jF+3,kF,ex)]);
else if (j0<=ex2-3 && (j0-1)>=jminF)
f_rhs[p] += sfy * d12dy*(fh[idx_fh_F_ord4(iF,jF-2,kF,ex)]-EIT*fh[idx_fh_F_ord4(iF,jF-1,kF,ex)]+EIT*fh[idx_fh_F_ord4(iF,jF+1,kF,ex)]-fh[idx_fh_F_ord4(iF,jF+2,kF,ex)]);
else if (j0<=ex2-2 && j0>=jminF)
f_rhs[p] += sfy * d2dy*(-fh[idx_fh_F_ord4(iF,jF-1,kF,ex)]+fh[idx_fh_F_ord4(iF,jF+1,kF,ex)]);
} else if (sfy < ZEO) {
if ((j0-4)>=jminF && j0<=ex2-2)
f_rhs[p] -= sfy * d60dy*(F2*fh[idx_fh_F_ord4(iF,jF+2,kF,ex)]-F24*fh[idx_fh_F_ord4(iF,jF+1,kF,ex)]-F35*fh[idx_fh_F_ord4(iF,jF,kF,ex)]+F80*fh[idx_fh_F_ord4(iF,jF-1,kF,ex)]-F30*fh[idx_fh_F_ord4(iF,jF-2,kF,ex)]+EIT*fh[idx_fh_F_ord4(iF,jF-3,kF,ex)]-fh[idx_fh_F_ord4(iF,jF-4,kF,ex)]);
else if ((j0-5)>=jminF && j0<=ex2-2)
f_rhs[p] -= sfy * d60dy*(-F10*fh[idx_fh_F_ord4(iF,jF+1,kF,ex)]-F77*fh[idx_fh_F_ord4(iF,jF,kF,ex)]+F150*fh[idx_fh_F_ord4(iF,jF-1,kF,ex)]-F100*fh[idx_fh_F_ord4(iF,jF-2,kF,ex)]+F50*fh[idx_fh_F_ord4(iF,jF-3,kF,ex)]-F15*fh[idx_fh_F_ord4(iF,jF-4,kF,ex)]+F2*fh[idx_fh_F_ord4(iF,jF-5,kF,ex)]);
else if ((j0-3)>=jminF && j0<=ex2-2)
f_rhs[p] -= sfy * d60dy*(-fh[idx_fh_F_ord4(iF,jF-3,kF,ex)]+F9*fh[idx_fh_F_ord4(iF,jF-2,kF,ex)]-F45*fh[idx_fh_F_ord4(iF,jF-1,kF,ex)]+F45*fh[idx_fh_F_ord4(iF,jF+1,kF,ex)]-F9*fh[idx_fh_F_ord4(iF,jF+2,kF,ex)]+fh[idx_fh_F_ord4(iF,jF+3,kF,ex)]);
else if ((j0-2)>=jminF && j0<=ex2-2)
f_rhs[p] -= sfy * d12dy*(fh[idx_fh_F_ord4(iF,jF-2,kF,ex)]-EIT*fh[idx_fh_F_ord4(iF,jF-1,kF,ex)]+EIT*fh[idx_fh_F_ord4(iF,jF+1,kF,ex)]-fh[idx_fh_F_ord4(iF,jF+2,kF,ex)]);
else if ((j0-1)>=jminF && j0<=ex2-2)
f_rhs[p] -= sfy * d2dy*(-fh[idx_fh_F_ord4(iF,jF-1,kF,ex)]+fh[idx_fh_F_ord4(iF,jF+1,kF,ex)]);
}
/* ---- z-direction ---- */
const double sfz = Sfz[p];
if (sfz > ZEO) {
if (k0<=ex3-5 && (k0-1)>=kminF)
f_rhs[p] += sfz * d60dz*(F2*fh[idx_fh_F_ord4(iF,jF,kF-2,ex)]-F24*fh[idx_fh_F_ord4(iF,jF,kF-1,ex)]-F35*fh[idx_fh_F_ord4(iF,jF,kF,ex)]+F80*fh[idx_fh_F_ord4(iF,jF,kF+1,ex)]-F30*fh[idx_fh_F_ord4(iF,jF,kF+2,ex)]+EIT*fh[idx_fh_F_ord4(iF,jF,kF+3,ex)]-fh[idx_fh_F_ord4(iF,jF,kF+4,ex)]);
else if (k0<=ex3-6 && k0>=kminF)
f_rhs[p] += sfz * d60dz*(-F10*fh[idx_fh_F_ord4(iF,jF,kF-1,ex)]-F77*fh[idx_fh_F_ord4(iF,jF,kF,ex)]+F150*fh[idx_fh_F_ord4(iF,jF,kF+1,ex)]-F100*fh[idx_fh_F_ord4(iF,jF,kF+2,ex)]+F50*fh[idx_fh_F_ord4(iF,jF,kF+3,ex)]-F15*fh[idx_fh_F_ord4(iF,jF,kF+4,ex)]+F2*fh[idx_fh_F_ord4(iF,jF,kF+5,ex)]);
else if (k0<=ex3-4 && (k0-2)>=kminF)
f_rhs[p] += sfz * d60dz*(-fh[idx_fh_F_ord4(iF,jF,kF-3,ex)]+F9*fh[idx_fh_F_ord4(iF,jF,kF-2,ex)]-F45*fh[idx_fh_F_ord4(iF,jF,kF-1,ex)]+F45*fh[idx_fh_F_ord4(iF,jF,kF+1,ex)]-F9*fh[idx_fh_F_ord4(iF,jF,kF+2,ex)]+fh[idx_fh_F_ord4(iF,jF,kF+3,ex)]);
else if (k0<=ex3-3 && (k0-1)>=kminF)
f_rhs[p] += sfz * d12dz*(fh[idx_fh_F_ord4(iF,jF,kF-2,ex)]-EIT*fh[idx_fh_F_ord4(iF,jF,kF-1,ex)]+EIT*fh[idx_fh_F_ord4(iF,jF,kF+1,ex)]-fh[idx_fh_F_ord4(iF,jF,kF+2,ex)]);
else if (k0<=ex3-2 && k0>=kminF)
f_rhs[p] += sfz * d2dz*(-fh[idx_fh_F_ord4(iF,jF,kF-1,ex)]+fh[idx_fh_F_ord4(iF,jF,kF+1,ex)]);
} else if (sfz < ZEO) {
if ((k0-4)>=kminF && k0<=ex3-2)
f_rhs[p] -= sfz * d60dz*(F2*fh[idx_fh_F_ord4(iF,jF,kF+2,ex)]-F24*fh[idx_fh_F_ord4(iF,jF,kF+1,ex)]-F35*fh[idx_fh_F_ord4(iF,jF,kF,ex)]+F80*fh[idx_fh_F_ord4(iF,jF,kF-1,ex)]-F30*fh[idx_fh_F_ord4(iF,jF,kF-2,ex)]+EIT*fh[idx_fh_F_ord4(iF,jF,kF-3,ex)]-fh[idx_fh_F_ord4(iF,jF,kF-4,ex)]);
else if ((k0-5)>=kminF && k0<=ex3-2)
f_rhs[p] -= sfz * d60dz*(-F10*fh[idx_fh_F_ord4(iF,jF,kF+1,ex)]-F77*fh[idx_fh_F_ord4(iF,jF,kF,ex)]+F150*fh[idx_fh_F_ord4(iF,jF,kF-1,ex)]-F100*fh[idx_fh_F_ord4(iF,jF,kF-2,ex)]+F50*fh[idx_fh_F_ord4(iF,jF,kF-3,ex)]-F15*fh[idx_fh_F_ord4(iF,jF,kF-4,ex)]+F2*fh[idx_fh_F_ord4(iF,jF,kF-5,ex)]);
else if ((k0-3)>=kminF && k0<=ex3-2)
f_rhs[p] -= sfz * d60dz*(-fh[idx_fh_F_ord4(iF,jF,kF-3,ex)]+F9*fh[idx_fh_F_ord4(iF,jF,kF-2,ex)]-F45*fh[idx_fh_F_ord4(iF,jF,kF-1,ex)]+F45*fh[idx_fh_F_ord4(iF,jF,kF+1,ex)]-F9*fh[idx_fh_F_ord4(iF,jF,kF+2,ex)]+fh[idx_fh_F_ord4(iF,jF,kF+3,ex)]);
else if ((k0-2)>=kminF && k0<=ex3-2)
f_rhs[p] -= sfz * d12dz*(fh[idx_fh_F_ord4(iF,jF,kF-2,ex)]-EIT*fh[idx_fh_F_ord4(iF,jF,kF-1,ex)]+EIT*fh[idx_fh_F_ord4(iF,jF,kF+1,ex)]-fh[idx_fh_F_ord4(iF,jF,kF+2,ex)]);
else if ((k0-1)>=kminF && k0<=ex3-2)
f_rhs[p] -= sfz * d2dz*(-fh[idx_fh_F_ord4(iF,jF,kF-1,ex)]+fh[idx_fh_F_ord4(iF,jF,kF+1,ex)]);
}
}
}
}
free(fh);
return;
}
#elif (ghost_width == 5)
/* ---- 8th-order lopsided --------------------------------------------- */
{
const int ord = 5;
int iminF = 1, jminF = 1, kminF = 1;
if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -4;
if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -4;
if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -4;
const size_t nx = (size_t)ex1 + ord;
const size_t ny = (size_t)ex2 + ord;
const size_t nz = (size_t)ex3 + ord;
const size_t fh_size = nx * ny * nz;
double *fh = (double*)malloc(fh_size * sizeof(double));
if (!fh) return;
symmetry_bd(ord, ex, f, fh, SoA);
const double d840dx = ONE / F840 / dX;
const double d840dy = ONE / F840 / dY;
const double d840dz = ONE / F840 / dZ;
const double d60dx = ONE / F60 / dX;
const double d60dy = ONE / F60 / dY;
const double d60dz = ONE / F60 / dZ;
const double d12dx = ONE / F12 / dX;
const double d12dy = ONE / F12 / dY;
const double d12dz = ONE / F12 / dZ;
const double d2dx = ONE / TWO / dX;
const double d2dy = ONE / TWO / dY;
const double d2dz = ONE / TWO / dZ;
const int imaxF = ex1, jmaxF = ex2, kmaxF = ex3;
for (int k0 = 0; k0 <= ex3 - 2; ++k0) {
const int kF = k0 + 1;
for (int j0 = 0; j0 <= ex2 - 2; ++j0) {
const int jF = j0 + 1;
for (int i0 = 0; i0 <= ex1 - 2; ++i0) {
const int iF = i0 + 1;
const size_t p = idx_ex(i0, j0, k0, ex);
const double sfx = Sfx[p];
if (sfx > ZEO) {
/* 8th biased: -5*f(i-3)+60*f(i-2)-420*f(i-1)-378*f(i)+1050*f(i+1)-420*f(i+2)+140*f(i+3)-30*f(i+4)+3*f(i+5) */
if (i0 <= ex1-6 && (i0-2)>=iminF) // i+5<=imax && i-3>=imin
f_rhs[p] += sfx * d840dx * (
-F5*fh[idx_fh_F_ord5(iF-3,jF,kF,ex)]+F60*fh[idx_fh_F_ord5(iF-2,jF,kF,ex)]
-F420*fh[idx_fh_F_ord5(iF-1,jF,kF,ex)]-F378*fh[idx_fh_F_ord5(iF,jF,kF,ex)]
+F1050*fh[idx_fh_F_ord5(iF+1,jF,kF,ex)]-F420*fh[idx_fh_F_ord5(iF+2,jF,kF,ex)]
+F140*fh[idx_fh_F_ord5(iF+3,jF,kF,ex)]-F30*fh[idx_fh_F_ord5(iF+4,jF,kF,ex)]
+F3*fh[idx_fh_F_ord5(iF+5,jF,kF,ex)]);
/* 8th centered: +3*f(i-4)-32*f(i-3)+168*f(i-2)-672*f(i-1)+672*f(i+1)-168*f(i+2)+32*f(i+3)-3*f(i+4) */
else if (i0 <= ex1-5 && (i0-3)>=iminF)
f_rhs[p] += sfx * d840dx * (
+F3*fh[idx_fh_F_ord5(iF-4,jF,kF,ex)]-F32*fh[idx_fh_F_ord5(iF-3,jF,kF,ex)]
+F168*fh[idx_fh_F_ord5(iF-2,jF,kF,ex)]-F672*fh[idx_fh_F_ord5(iF-1,jF,kF,ex)]
+F672*fh[idx_fh_F_ord5(iF+1,jF,kF,ex)]-F168*fh[idx_fh_F_ord5(iF+2,jF,kF,ex)]
+F32*fh[idx_fh_F_ord5(iF+3,jF,kF,ex)]-F3*fh[idx_fh_F_ord5(iF+4,jF,kF,ex)]);
else if (i0 <= ex1-4 && (i0-2)>=iminF) // 6th centered
f_rhs[p] += sfx * d60dx * (
-fh[idx_fh_F_ord5(iF-3,jF,kF,ex)]+F9*fh[idx_fh_F_ord5(iF-2,jF,kF,ex)]
-F45*fh[idx_fh_F_ord5(iF-1,jF,kF,ex)]+F45*fh[idx_fh_F_ord5(iF+1,jF,kF,ex)]
-F9*fh[idx_fh_F_ord5(iF+2,jF,kF,ex)]+fh[idx_fh_F_ord5(iF+3,jF,kF,ex)]);
else if (i0 <= ex1-3 && (i0-1)>=iminF) // 4th centered
f_rhs[p] += sfx * d12dx * (
fh[idx_fh_F_ord5(iF-2,jF,kF,ex)]-EIT*fh[idx_fh_F_ord5(iF-1,jF,kF,ex)]
+EIT*fh[idx_fh_F_ord5(iF+1,jF,kF,ex)]-fh[idx_fh_F_ord5(iF+2,jF,kF,ex)]);
else if (i0 <= ex1-2 && i0>=iminF) // 2nd centered
f_rhs[p] += sfx * d2dx * (
-fh[idx_fh_F_ord5(iF-1,jF,kF,ex)]+fh[idx_fh_F_ord5(iF+1,jF,kF,ex)]);
} else if (sfx < ZEO) {
if ((i0-5)>=iminF && i0<=ex1-2) // i-5>=imin && i+3<=imax
f_rhs[p] -= sfx * d840dx * (
-F5*fh[idx_fh_F_ord5(iF+3,jF,kF,ex)]+F60*fh[idx_fh_F_ord5(iF+2,jF,kF,ex)]
-F420*fh[idx_fh_F_ord5(iF+1,jF,kF,ex)]-F378*fh[idx_fh_F_ord5(iF,jF,kF,ex)]
+F1050*fh[idx_fh_F_ord5(iF-1,jF,kF,ex)]-F420*fh[idx_fh_F_ord5(iF-2,jF,kF,ex)]
+F140*fh[idx_fh_F_ord5(iF-3,jF,kF,ex)]-F30*fh[idx_fh_F_ord5(iF-4,jF,kF,ex)]
+F3*fh[idx_fh_F_ord5(iF-5,jF,kF,ex)]);
else if ((i0-4)>=iminF && i0<=ex1-2) // 8th centered
f_rhs[p] -= sfx * d840dx * (
+F3*fh[idx_fh_F_ord5(iF-4,jF,kF,ex)]-F32*fh[idx_fh_F_ord5(iF-3,jF,kF,ex)]
+F168*fh[idx_fh_F_ord5(iF-2,jF,kF,ex)]-F672*fh[idx_fh_F_ord5(iF-1,jF,kF,ex)]
+F672*fh[idx_fh_F_ord5(iF+1,jF,kF,ex)]-F168*fh[idx_fh_F_ord5(iF+2,jF,kF,ex)]
+F32*fh[idx_fh_F_ord5(iF+3,jF,kF,ex)]-F3*fh[idx_fh_F_ord5(iF+4,jF,kF,ex)]);
else if ((i0-3)>=iminF && i0<=ex1-2) // 6th centered
f_rhs[p] -= sfx * d60dx * (
-fh[idx_fh_F_ord5(iF-3,jF,kF,ex)]+F9*fh[idx_fh_F_ord5(iF-2,jF,kF,ex)]
-F45*fh[idx_fh_F_ord5(iF-1,jF,kF,ex)]+F45*fh[idx_fh_F_ord5(iF+1,jF,kF,ex)]
-F9*fh[idx_fh_F_ord5(iF+2,jF,kF,ex)]+fh[idx_fh_F_ord5(iF+3,jF,kF,ex)]);
else if ((i0-2)>=iminF && i0<=ex1-2) // 4th centered
f_rhs[p] -= sfx * d12dx * (
fh[idx_fh_F_ord5(iF-2,jF,kF,ex)]-EIT*fh[idx_fh_F_ord5(iF-1,jF,kF,ex)]
+EIT*fh[idx_fh_F_ord5(iF+1,jF,kF,ex)]-fh[idx_fh_F_ord5(iF+2,jF,kF,ex)]);
else if ((i0-1)>=iminF && i0<=ex1-2) // 2nd centered
f_rhs[p] -= sfx * d2dx * (
-fh[idx_fh_F_ord5(iF-1,jF,kF,ex)]+fh[idx_fh_F_ord5(iF+1,jF,kF,ex)]);
}
const double sfy = Sfy[p];
if (sfy > ZEO) {
if (j0<=ex2-6 && (j0-2)>=jminF)
f_rhs[p] += sfy * d840dy*(-F5*fh[idx_fh_F_ord5(iF,jF-3,kF,ex)]+F60*fh[idx_fh_F_ord5(iF,jF-2,kF,ex)]-F420*fh[idx_fh_F_ord5(iF,jF-1,kF,ex)]-F378*fh[idx_fh_F_ord5(iF,jF,kF,ex)]+F1050*fh[idx_fh_F_ord5(iF,jF+1,kF,ex)]-F420*fh[idx_fh_F_ord5(iF,jF+2,kF,ex)]+F140*fh[idx_fh_F_ord5(iF,jF+3,kF,ex)]-F30*fh[idx_fh_F_ord5(iF,jF+4,kF,ex)]+F3*fh[idx_fh_F_ord5(iF,jF+5,kF,ex)]);
else if (j0<=ex2-5 && (j0-3)>=jminF)
f_rhs[p] += sfy * d840dy*(+F3*fh[idx_fh_F_ord5(iF,jF-4,kF,ex)]-F32*fh[idx_fh_F_ord5(iF,jF-3,kF,ex)]+F168*fh[idx_fh_F_ord5(iF,jF-2,kF,ex)]-F672*fh[idx_fh_F_ord5(iF,jF-1,kF,ex)]+F672*fh[idx_fh_F_ord5(iF,jF+1,kF,ex)]-F168*fh[idx_fh_F_ord5(iF,jF+2,kF,ex)]+F32*fh[idx_fh_F_ord5(iF,jF+3,kF,ex)]-F3*fh[idx_fh_F_ord5(iF,jF+4,kF,ex)]);
else if (j0<=ex2-4 && (j0-2)>=jminF)
f_rhs[p] += sfy * d60dy*(-fh[idx_fh_F_ord5(iF,jF-3,kF,ex)]+F9*fh[idx_fh_F_ord5(iF,jF-2,kF,ex)]-F45*fh[idx_fh_F_ord5(iF,jF-1,kF,ex)]+F45*fh[idx_fh_F_ord5(iF,jF+1,kF,ex)]-F9*fh[idx_fh_F_ord5(iF,jF+2,kF,ex)]+fh[idx_fh_F_ord5(iF,jF+3,kF,ex)]);
else if (j0<=ex2-3 && (j0-1)>=jminF)
f_rhs[p] += sfy * d12dy*(fh[idx_fh_F_ord5(iF,jF-2,kF,ex)]-EIT*fh[idx_fh_F_ord5(iF,jF-1,kF,ex)]+EIT*fh[idx_fh_F_ord5(iF,jF+1,kF,ex)]-fh[idx_fh_F_ord5(iF,jF+2,kF,ex)]);
else if (j0<=ex2-2 && j0>=jminF)
f_rhs[p] += sfy * d2dy*(-fh[idx_fh_F_ord5(iF,jF-1,kF,ex)]+fh[idx_fh_F_ord5(iF,jF+1,kF,ex)]);
} else if (sfy < ZEO) {
if ((j0-5)>=jminF && j0<=ex2-2)
f_rhs[p] -= sfy * d840dy*(-F5*fh[idx_fh_F_ord5(iF,jF+3,kF,ex)]+F60*fh[idx_fh_F_ord5(iF,jF+2,kF,ex)]-F420*fh[idx_fh_F_ord5(iF,jF+1,kF,ex)]-F378*fh[idx_fh_F_ord5(iF,jF,kF,ex)]+F1050*fh[idx_fh_F_ord5(iF,jF-1,kF,ex)]-F420*fh[idx_fh_F_ord5(iF,jF-2,kF,ex)]+F140*fh[idx_fh_F_ord5(iF,jF-3,kF,ex)]-F30*fh[idx_fh_F_ord5(iF,jF-4,kF,ex)]+F3*fh[idx_fh_F_ord5(iF,jF-5,kF,ex)]);
else if ((j0-4)>=jminF && j0<=ex2-2)
f_rhs[p] -= sfy * d840dy*(+F3*fh[idx_fh_F_ord5(iF,jF-4,kF,ex)]-F32*fh[idx_fh_F_ord5(iF,jF-3,kF,ex)]+F168*fh[idx_fh_F_ord5(iF,jF-2,kF,ex)]-F672*fh[idx_fh_F_ord5(iF,jF-1,kF,ex)]+F672*fh[idx_fh_F_ord5(iF,jF+1,kF,ex)]-F168*fh[idx_fh_F_ord5(iF,jF+2,kF,ex)]+F32*fh[idx_fh_F_ord5(iF,jF+3,kF,ex)]-F3*fh[idx_fh_F_ord5(iF,jF+4,kF,ex)]);
else if ((j0-3)>=jminF && j0<=ex2-2)
f_rhs[p] -= sfy * d60dy*(-fh[idx_fh_F_ord5(iF,jF-3,kF,ex)]+F9*fh[idx_fh_F_ord5(iF,jF-2,kF,ex)]-F45*fh[idx_fh_F_ord5(iF,jF-1,kF,ex)]+F45*fh[idx_fh_F_ord5(iF,jF+1,kF,ex)]-F9*fh[idx_fh_F_ord5(iF,jF+2,kF,ex)]+fh[idx_fh_F_ord5(iF,jF+3,kF,ex)]);
else if ((j0-2)>=jminF && j0<=ex2-2)
f_rhs[p] -= sfy * d12dy*(fh[idx_fh_F_ord5(iF,jF-2,kF,ex)]-EIT*fh[idx_fh_F_ord5(iF,jF-1,kF,ex)]+EIT*fh[idx_fh_F_ord5(iF,jF+1,kF,ex)]-fh[idx_fh_F_ord5(iF,jF+2,kF,ex)]);
else if ((j0-1)>=jminF && j0<=ex2-2)
f_rhs[p] -= sfy * d2dy*(-fh[idx_fh_F_ord5(iF,jF-1,kF,ex)]+fh[idx_fh_F_ord5(iF,jF+1,kF,ex)]);
}
const double sfz = Sfz[p];
if (sfz > ZEO) {
if (k0<=ex3-6 && (k0-2)>=kminF)
f_rhs[p] += sfz * d840dz*(-F5*fh[idx_fh_F_ord5(iF,jF,kF-3,ex)]+F60*fh[idx_fh_F_ord5(iF,jF,kF-2,ex)]-F420*fh[idx_fh_F_ord5(iF,jF,kF-1,ex)]-F378*fh[idx_fh_F_ord5(iF,jF,kF,ex)]+F1050*fh[idx_fh_F_ord5(iF,jF,kF+1,ex)]-F420*fh[idx_fh_F_ord5(iF,jF,kF+2,ex)]+F140*fh[idx_fh_F_ord5(iF,jF,kF+3,ex)]-F30*fh[idx_fh_F_ord5(iF,jF,kF+4,ex)]+F3*fh[idx_fh_F_ord5(iF,jF,kF+5,ex)]);
else if (k0<=ex3-5 && (k0-3)>=kminF)
f_rhs[p] += sfz * d840dz*(+F3*fh[idx_fh_F_ord5(iF,jF,kF-4,ex)]-F32*fh[idx_fh_F_ord5(iF,jF,kF-3,ex)]+F168*fh[idx_fh_F_ord5(iF,jF,kF-2,ex)]-F672*fh[idx_fh_F_ord5(iF,jF,kF-1,ex)]+F672*fh[idx_fh_F_ord5(iF,jF,kF+1,ex)]-F168*fh[idx_fh_F_ord5(iF,jF,kF+2,ex)]+F32*fh[idx_fh_F_ord5(iF,jF,kF+3,ex)]-F3*fh[idx_fh_F_ord5(iF,jF,kF+4,ex)]);
else if (k0<=ex3-4 && (k0-2)>=kminF)
f_rhs[p] += sfz * d60dz*(-fh[idx_fh_F_ord5(iF,jF,kF-3,ex)]+F9*fh[idx_fh_F_ord5(iF,jF,kF-2,ex)]-F45*fh[idx_fh_F_ord5(iF,jF,kF-1,ex)]+F45*fh[idx_fh_F_ord5(iF,jF,kF+1,ex)]-F9*fh[idx_fh_F_ord5(iF,jF,kF+2,ex)]+fh[idx_fh_F_ord5(iF,jF,kF+3,ex)]);
else if (k0<=ex3-3 && (k0-1)>=kminF)
f_rhs[p] += sfz * d12dz*(fh[idx_fh_F_ord5(iF,jF,kF-2,ex)]-EIT*fh[idx_fh_F_ord5(iF,jF,kF-1,ex)]+EIT*fh[idx_fh_F_ord5(iF,jF,kF+1,ex)]-fh[idx_fh_F_ord5(iF,jF,kF+2,ex)]);
else if (k0<=ex3-2 && k0>=kminF)
f_rhs[p] += sfz * d2dz*(-fh[idx_fh_F_ord5(iF,jF,kF-1,ex)]+fh[idx_fh_F_ord5(iF,jF,kF+1,ex)]);
} else if (sfz < ZEO) {
if ((k0-5)>=kminF && k0<=ex3-2)
f_rhs[p] -= sfz * d840dz*(-F5*fh[idx_fh_F_ord5(iF,jF,kF+3,ex)]+F60*fh[idx_fh_F_ord5(iF,jF,kF+2,ex)]-F420*fh[idx_fh_F_ord5(iF,jF,kF+1,ex)]-F378*fh[idx_fh_F_ord5(iF,jF,kF,ex)]+F1050*fh[idx_fh_F_ord5(iF,jF,kF-1,ex)]-F420*fh[idx_fh_F_ord5(iF,jF,kF-2,ex)]+F140*fh[idx_fh_F_ord5(iF,jF,kF-3,ex)]-F30*fh[idx_fh_F_ord5(iF,jF,kF-4,ex)]+F3*fh[idx_fh_F_ord5(iF,jF,kF-5,ex)]);
else if ((k0-4)>=kminF && k0<=ex3-2)
f_rhs[p] -= sfz * d840dz*(+F3*fh[idx_fh_F_ord5(iF,jF,kF-4,ex)]-F32*fh[idx_fh_F_ord5(iF,jF,kF-3,ex)]+F168*fh[idx_fh_F_ord5(iF,jF,kF-2,ex)]-F672*fh[idx_fh_F_ord5(iF,jF,kF-1,ex)]+F672*fh[idx_fh_F_ord5(iF,jF,kF+1,ex)]-F168*fh[idx_fh_F_ord5(iF,jF,kF+2,ex)]+F32*fh[idx_fh_F_ord5(iF,jF,kF+3,ex)]-F3*fh[idx_fh_F_ord5(iF,jF,kF+4,ex)]);
else if ((k0-3)>=kminF && k0<=ex3-2)
f_rhs[p] -= sfz * d60dz*(-fh[idx_fh_F_ord5(iF,jF,kF-3,ex)]+F9*fh[idx_fh_F_ord5(iF,jF,kF-2,ex)]-F45*fh[idx_fh_F_ord5(iF,jF,kF-1,ex)]+F45*fh[idx_fh_F_ord5(iF,jF,kF+1,ex)]-F9*fh[idx_fh_F_ord5(iF,jF,kF+2,ex)]+fh[idx_fh_F_ord5(iF,jF,kF+3,ex)]);
else if ((k0-2)>=kminF && k0<=ex3-2)
f_rhs[p] -= sfz * d12dz*(fh[idx_fh_F_ord5(iF,jF,kF-2,ex)]-EIT*fh[idx_fh_F_ord5(iF,jF,kF-1,ex)]+EIT*fh[idx_fh_F_ord5(iF,jF,kF+1,ex)]-fh[idx_fh_F_ord5(iF,jF,kF+2,ex)]);
else if ((k0-1)>=kminF && k0<=ex3-2)
f_rhs[p] -= sfz * d2dz*(-fh[idx_fh_F_ord5(iF,jF,kF-1,ex)]+fh[idx_fh_F_ord5(iF,jF,kF+1,ex)]);
}
}
}
}
free(fh);
return;
}
#else
#error "lopsided_c.C: unsupported ghost_width (must be 2, 3, 4, or 5)"
#endif
}

View File

@@ -1,386 +0,0 @@
#include "macrodef.h"
#include "tool.h"
/*
 * C version of lopsided_kodis — combined upwind advection + KO dissipation.
 * Uses one shared symmetry_bd buffer (ord = ghost_width for both components).
 *
 * FD order selection via ghost_width:
 * 2 → 2nd-order advection + r=2 KO (cof=16, sign=-)
 * 3 → 4th-order advection + r=3 KO (cof=64, sign=+)
 * 4 → 6th-order advection + r=4 KO (cof=256, sign=-)
 * 5 → 8th-order advection + r=5 KO (cof=1024, sign=+)
 *
 * Parameters:
 *   ex        grid extents; ex[0], ex[1], ex[2] are read as ex1, ex2, ex3.
 *   X, Y, Z   coordinate arrays. Only entries [0] and [1] are read: their
 *             difference gives the spacing, and [0] is compared against the
 *             spacing to decide whether a symmetric lower bound applies.
 *   f         input field, handed to symmetry_bd together with SoA.
 *   f_rhs     output; contributions are accumulated in place (+= / -=).
 *   Sfx/y/z   per-point factors; their sign selects the stencil direction and
 *             their value multiplies the derivative (presumably the advecting
 *             vector components — confirm against the caller).
 *   Symmetry  compared against NO_SYMM (0) and EQ_SYMM (1).
 *   SoA       forwarded unchanged to symmetry_bd.
 *   eps       dissipation strength; the KO pass is skipped unless eps > 0.
 *
 * If the scratch buffer cannot be allocated the function returns without
 * touching f_rhs and without reporting the failure.
 *
 * Index convention used throughout: i0/j0/k0 are 0-based positions used with
 * idx_ex; iF/jF/kF = i0+1/j0+1/k0+1 are the positions handed to the
 * idx_fh_F* macros (macros defined elsewhere).
 */
void lopsided_kodis(const int ex[3],
const double *X, const double *Y, const double *Z,
const double *f, double *f_rhs,
const double *Sfx, const double *Sfy, const double *Sfz,
int Symmetry, const double SoA[3], double eps)
{
/* Named numeric constants used as stencil coefficients and divisors. */
const double ZEO = 0.0, ONE = 1.0;
const double TWO = 2.0, F6 = 6.0, EIT = 8.0;
const double F3 = 3.0, F4 = 4.0, F5 = 5.0, F10 = 10.0, F12 = 12.0, F18 = 18.0;
const double F9 = 9.0, F45 = 45.0, F60 = 60.0;
const double F2 = 2.0, F15 = 15.0, F24 = 24.0, F30 = 30.0, F35 = 35.0;
const double F50 = 50.0, F77 = 77.0, F80 = 80.0, F100 = 100.0, F150 = 150.0;
const double F32 = 32.0, F168 = 168.0, F672 = 672.0, F840 = 840.0;
const double F140=140.0, F378=378.0, F420=420.0, F1050=1050.0;
const int NO_SYMM = 0, EQ_SYMM = 1;
const int ex1 = ex[0], ex2 = ex[1], ex3 = ex[2];
/* Grid spacing is taken from the first two coordinates of each axis. */
const double dX = X[1] - X[0];
const double dY = Y[1] - Y[0];
const double dZ = Z[1] - Z[0];
const int imaxF = ex1, jmaxF = ex2, kmaxF = ex3;
#if (ghost_width == 2)
{
const int ord = 2;
/* Lower stencil bounds; lowered to -1 on an axis when the symmetry test passes. */
int iminF = 1, jminF = 1, kminF = 1;
if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -1;
if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -1;
if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -1;
/* Scratch buffer is ord cells larger than the grid along every axis. */
const size_t nx = (size_t)ex1 + ord;
const size_t ny = (size_t)ex2 + ord;
const size_t nz = (size_t)ex3 + ord;
double *fh = (double*)malloc(nx*ny*nz*sizeof(double));
/* Allocation failure: silent return, f_rhs left unchanged. */
if (!fh) return;
/* symmetry_bd is expected to populate fh from f; fh is only read below. */
symmetry_bd(ord, ex, f, fh, SoA);
const double d2dx = ONE/TWO/dX, d2dy = ONE/TWO/dY, d2dz = ONE/TWO/dZ;
/* ---- advection (2nd-order) ---- */
/* Loops stop at ex-2, so the last point along each axis is never updated here. */
for (int k0 = 0; k0 <= ex3-2; ++k0) {
const int kF = k0+1;
for (int j0 = 0; j0 <= ex2-2; ++j0) {
const int jF = j0+1;
for (int i0 = 0; i0 <= ex1-2; ++i0) {
const int iF = i0+1;
const size_t p = idx_ex(i0,j0,k0,ex);
/* x: three-point one-sided stencil when it fits, else a two-point difference. */
const double sfx = Sfx[p];
if (sfx > ZEO) {
if (i0<=ex1-3) f_rhs[p] += sfx*d2dx*(-F3*fh[idx_fh_F_ord2(iF,jF,kF,ex)]+F4*fh[idx_fh_F_ord2(iF+1,jF,kF,ex)]-fh[idx_fh_F_ord2(iF+2,jF,kF,ex)]);
else if (i0<=ex1-2) f_rhs[p] += sfx*d2dx*(-fh[idx_fh_F_ord2(iF,jF,kF,ex)]+fh[idx_fh_F_ord2(iF+1,jF,kF,ex)]);
} else if (sfx < ZEO) {
if ((i0-1)>=iminF) f_rhs[p] -= sfx*d2dx*(-F3*fh[idx_fh_F_ord2(iF,jF,kF,ex)]+F4*fh[idx_fh_F_ord2(iF-1,jF,kF,ex)]-fh[idx_fh_F_ord2(iF-2,jF,kF,ex)]);
else if (i0>=iminF) f_rhs[p] -= sfx*d2dx*(-fh[idx_fh_F_ord2(iF,jF,kF,ex)]+fh[idx_fh_F_ord2(iF-1,jF,kF,ex)]);
}
/* y: same scheme along j. */
const double sfy = Sfy[p];
if (sfy > ZEO) {
if (j0<=ex2-3) f_rhs[p] += sfy*d2dy*(-F3*fh[idx_fh_F_ord2(iF,jF,kF,ex)]+F4*fh[idx_fh_F_ord2(iF,jF+1,kF,ex)]-fh[idx_fh_F_ord2(iF,jF+2,kF,ex)]);
else if (j0<=ex2-2) f_rhs[p] += sfy*d2dy*(-fh[idx_fh_F_ord2(iF,jF,kF,ex)]+fh[idx_fh_F_ord2(iF,jF+1,kF,ex)]);
} else if (sfy < ZEO) {
if ((j0-1)>=jminF) f_rhs[p] -= sfy*d2dy*(-F3*fh[idx_fh_F_ord2(iF,jF,kF,ex)]+F4*fh[idx_fh_F_ord2(iF,jF-1,kF,ex)]-fh[idx_fh_F_ord2(iF,jF-2,kF,ex)]);
else if (j0>=jminF) f_rhs[p] -= sfy*d2dy*(-fh[idx_fh_F_ord2(iF,jF,kF,ex)]+fh[idx_fh_F_ord2(iF,jF-1,kF,ex)]);
}
/* z: same scheme along k. */
const double sfz = Sfz[p];
if (sfz > ZEO) {
if (k0<=ex3-3) f_rhs[p] += sfz*d2dz*(-F3*fh[idx_fh_F_ord2(iF,jF,kF,ex)]+F4*fh[idx_fh_F_ord2(iF,jF,kF+1,ex)]-fh[idx_fh_F_ord2(iF,jF,kF+2,ex)]);
else if (k0<=ex3-2) f_rhs[p] += sfz*d2dz*(-fh[idx_fh_F_ord2(iF,jF,kF,ex)]+fh[idx_fh_F_ord2(iF,jF,kF+1,ex)]);
} else if (sfz < ZEO) {
if ((k0-1)>=kminF) f_rhs[p] -= sfz*d2dz*(-F3*fh[idx_fh_F_ord2(iF,jF,kF,ex)]+F4*fh[idx_fh_F_ord2(iF,jF,kF-1,ex)]-fh[idx_fh_F_ord2(iF,jF,kF-2,ex)]);
else if (k0>=kminF) f_rhs[p] -= sfz*d2dz*(-fh[idx_fh_F_ord2(iF,jF,kF,ex)]+fh[idx_fh_F_ord2(iF,jF,kF-1,ex)]);
}
}
}
}
/* ---- KO dissipation (r=2, cof=16, sign=-) ---- */
if (eps > ZEO) {
const double cof = 16.0;
const double F4k = 4.0, F6k = 6.0;
/* Restrict to points whose full +/-2 stencil lies between the min and max bounds. */
const int i0_lo = (iminF+1>0)?iminF+1:0, j0_lo=(jminF+1>0)?jminF+1:0, k0_lo=(kminF+1>0)?kminF+1:0;
const int i0_hi=imaxF-3, j0_hi=jmaxF-3, k0_hi=kmaxF-3;
if (!(i0_lo>i0_hi||j0_lo>j0_hi||k0_lo>k0_hi)) {
for (int k0=k0_lo;k0<=k0_hi;++k0) { const int kF=k0+1;
for (int j0=j0_lo;j0<=j0_hi;++j0) { const int jF=j0+1;
for (int i0=i0_lo;i0<=i0_hi;++i0) { const int iF=i0+1;
const size_t p=idx_ex(i0,j0,k0,ex);
/* Per-axis five-point combination (1,-4,6,-4,1) divided by the spacing. */
const double Dx=((fh[idx_fh_F_ord2(iF-2,jF,kF,ex)]+fh[idx_fh_F_ord2(iF+2,jF,kF,ex)])-F4k*(fh[idx_fh_F_ord2(iF-1,jF,kF,ex)]+fh[idx_fh_F_ord2(iF+1,jF,kF,ex)])+F6k*fh[idx_fh_F_ord2(iF,jF,kF,ex)])/dX;
const double Dy=((fh[idx_fh_F_ord2(iF,jF-2,kF,ex)]+fh[idx_fh_F_ord2(iF,jF+2,kF,ex)])-F4k*(fh[idx_fh_F_ord2(iF,jF-1,kF,ex)]+fh[idx_fh_F_ord2(iF,jF+1,kF,ex)])+F6k*fh[idx_fh_F_ord2(iF,jF,kF,ex)])/dY;
const double Dz=((fh[idx_fh_F_ord2(iF,jF,kF-2,ex)]+fh[idx_fh_F_ord2(iF,jF,kF+2,ex)])-F4k*(fh[idx_fh_F_ord2(iF,jF,kF-1,ex)]+fh[idx_fh_F_ord2(iF,jF,kF+1,ex)])+F6k*fh[idx_fh_F_ord2(iF,jF,kF,ex)])/dZ;
f_rhs[p] -= (eps/cof)*(Dx+Dy+Dz);
}}}
}
}
free(fh);
return;
}
#elif (ghost_width == 3)
/* ---- 4th-order advection + r=3 KO (original code) ----------------- */
{
const int ord = 3;
/* Lower stencil bounds; lowered to -2 on an axis when the symmetry test passes. */
int iminF = 1, jminF = 1, kminF = 1;
if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -2;
if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -2;
if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -2;
/* Scratch buffer is ord cells larger than the grid along every axis. */
const size_t nx = (size_t)ex1 + ord;
const size_t ny = (size_t)ex2 + ord;
const size_t nz = (size_t)ex3 + ord;
double *fh = (double*)malloc(nx*ny*nz*sizeof(double));
/* Allocation failure: silent return, f_rhs left unchanged. */
if (!fh) return;
/* symmetry_bd is expected to populate fh from f; fh is only read below. */
symmetry_bd(ord, ex, f, fh, SoA);
const double d12dx = ONE/F12/dX, d12dy = ONE/F12/dY, d12dz = ONE/F12/dZ;
/* ---- advection ---- */
/* Per axis the cascade is: lopsided stencil, then centered +/-2 stencil,
 * then the lopsided stencil of the opposite direction (note the flipped
 * accumulation sign on that last fallback). */
for (int k0 = 0; k0 <= ex3-2; ++k0) {
const int kF = k0+1;
for (int j0 = 0; j0 <= ex2-2; ++j0) {
const int jF = j0+1;
for (int i0 = 0; i0 <= ex1-2; ++i0) {
const int iF = i0+1;
const size_t p = idx_ex(i0,j0,k0,ex);
const double sfx = Sfx[p];
if (sfx > ZEO) {
if (i0 <= ex1-4)
f_rhs[p] += sfx*d12dx*(-F3*fh[idx_fh_F(iF-1,jF,kF,ex)]-F10*fh[idx_fh_F(iF,jF,kF,ex)]+F18*fh[idx_fh_F(iF+1,jF,kF,ex)]-F6*fh[idx_fh_F(iF+2,jF,kF,ex)]+fh[idx_fh_F(iF+3,jF,kF,ex)]);
else if (i0 <= ex1-3)
f_rhs[p] += sfx*d12dx*(fh[idx_fh_F(iF-2,jF,kF,ex)]-EIT*fh[idx_fh_F(iF-1,jF,kF,ex)]+EIT*fh[idx_fh_F(iF+1,jF,kF,ex)]-fh[idx_fh_F(iF+2,jF,kF,ex)]);
else if (i0 <= ex1-2)
f_rhs[p] -= sfx*d12dx*(-F3*fh[idx_fh_F(iF+1,jF,kF,ex)]-F10*fh[idx_fh_F(iF,jF,kF,ex)]+F18*fh[idx_fh_F(iF-1,jF,kF,ex)]-F6*fh[idx_fh_F(iF-2,jF,kF,ex)]+fh[idx_fh_F(iF-3,jF,kF,ex)]);
} else if (sfx < ZEO) {
if ((i0-2) >= iminF)
f_rhs[p] -= sfx*d12dx*(-F3*fh[idx_fh_F(iF+1,jF,kF,ex)]-F10*fh[idx_fh_F(iF,jF,kF,ex)]+F18*fh[idx_fh_F(iF-1,jF,kF,ex)]-F6*fh[idx_fh_F(iF-2,jF,kF,ex)]+fh[idx_fh_F(iF-3,jF,kF,ex)]);
else if ((i0-1) >= iminF)
f_rhs[p] += sfx*d12dx*(fh[idx_fh_F(iF-2,jF,kF,ex)]-EIT*fh[idx_fh_F(iF-1,jF,kF,ex)]+EIT*fh[idx_fh_F(iF+1,jF,kF,ex)]-fh[idx_fh_F(iF+2,jF,kF,ex)]);
else if (i0 >= iminF)
f_rhs[p] += sfx*d12dx*(-F3*fh[idx_fh_F(iF-1,jF,kF,ex)]-F10*fh[idx_fh_F(iF,jF,kF,ex)]+F18*fh[idx_fh_F(iF+1,jF,kF,ex)]-F6*fh[idx_fh_F(iF+2,jF,kF,ex)]+fh[idx_fh_F(iF+3,jF,kF,ex)]);
}
/* y: same cascade along j. */
const double sfy = Sfy[p];
if (sfy > ZEO) {
if (j0<=ex2-4) f_rhs[p] += sfy*d12dy*(-F3*fh[idx_fh_F(iF,jF-1,kF,ex)]-F10*fh[idx_fh_F(iF,jF,kF,ex)]+F18*fh[idx_fh_F(iF,jF+1,kF,ex)]-F6*fh[idx_fh_F(iF,jF+2,kF,ex)]+fh[idx_fh_F(iF,jF+3,kF,ex)]);
else if (j0<=ex2-3) f_rhs[p] += sfy*d12dy*(fh[idx_fh_F(iF,jF-2,kF,ex)]-EIT*fh[idx_fh_F(iF,jF-1,kF,ex)]+EIT*fh[idx_fh_F(iF,jF+1,kF,ex)]-fh[idx_fh_F(iF,jF+2,kF,ex)]);
else if (j0<=ex2-2) f_rhs[p] -= sfy*d12dy*(-F3*fh[idx_fh_F(iF,jF+1,kF,ex)]-F10*fh[idx_fh_F(iF,jF,kF,ex)]+F18*fh[idx_fh_F(iF,jF-1,kF,ex)]-F6*fh[idx_fh_F(iF,jF-2,kF,ex)]+fh[idx_fh_F(iF,jF-3,kF,ex)]);
} else if (sfy < ZEO) {
if ((j0-2)>=jminF) f_rhs[p] -= sfy*d12dy*(-F3*fh[idx_fh_F(iF,jF+1,kF,ex)]-F10*fh[idx_fh_F(iF,jF,kF,ex)]+F18*fh[idx_fh_F(iF,jF-1,kF,ex)]-F6*fh[idx_fh_F(iF,jF-2,kF,ex)]+fh[idx_fh_F(iF,jF-3,kF,ex)]);
else if ((j0-1)>=jminF) f_rhs[p] += sfy*d12dy*(fh[idx_fh_F(iF,jF-2,kF,ex)]-EIT*fh[idx_fh_F(iF,jF-1,kF,ex)]+EIT*fh[idx_fh_F(iF,jF+1,kF,ex)]-fh[idx_fh_F(iF,jF+2,kF,ex)]);
else if (j0>=jminF) f_rhs[p] += sfy*d12dy*(-F3*fh[idx_fh_F(iF,jF-1,kF,ex)]-F10*fh[idx_fh_F(iF,jF,kF,ex)]+F18*fh[idx_fh_F(iF,jF+1,kF,ex)]-F6*fh[idx_fh_F(iF,jF+2,kF,ex)]+fh[idx_fh_F(iF,jF+3,kF,ex)]);
}
/* z: same cascade along k. */
const double sfz = Sfz[p];
if (sfz > ZEO) {
if (k0<=ex3-4) f_rhs[p] += sfz*d12dz*(-F3*fh[idx_fh_F(iF,jF,kF-1,ex)]-F10*fh[idx_fh_F(iF,jF,kF,ex)]+F18*fh[idx_fh_F(iF,jF,kF+1,ex)]-F6*fh[idx_fh_F(iF,jF,kF+2,ex)]+fh[idx_fh_F(iF,jF,kF+3,ex)]);
else if (k0<=ex3-3) f_rhs[p] += sfz*d12dz*(fh[idx_fh_F(iF,jF,kF-2,ex)]-EIT*fh[idx_fh_F(iF,jF,kF-1,ex)]+EIT*fh[idx_fh_F(iF,jF,kF+1,ex)]-fh[idx_fh_F(iF,jF,kF+2,ex)]);
else if (k0<=ex3-2) f_rhs[p] -= sfz*d12dz*(-F3*fh[idx_fh_F(iF,jF,kF+1,ex)]-F10*fh[idx_fh_F(iF,jF,kF,ex)]+F18*fh[idx_fh_F(iF,jF,kF-1,ex)]-F6*fh[idx_fh_F(iF,jF,kF-2,ex)]+fh[idx_fh_F(iF,jF,kF-3,ex)]);
} else if (sfz < ZEO) {
if ((k0-2)>=kminF) f_rhs[p] -= sfz*d12dz*(-F3*fh[idx_fh_F(iF,jF,kF+1,ex)]-F10*fh[idx_fh_F(iF,jF,kF,ex)]+F18*fh[idx_fh_F(iF,jF,kF-1,ex)]-F6*fh[idx_fh_F(iF,jF,kF-2,ex)]+fh[idx_fh_F(iF,jF,kF-3,ex)]);
else if ((k0-1)>=kminF) f_rhs[p] += sfz*d12dz*(fh[idx_fh_F(iF,jF,kF-2,ex)]-EIT*fh[idx_fh_F(iF,jF,kF-1,ex)]+EIT*fh[idx_fh_F(iF,jF,kF+1,ex)]-fh[idx_fh_F(iF,jF,kF+2,ex)]);
else if (k0>=kminF) f_rhs[p] += sfz*d12dz*(-F3*fh[idx_fh_F(iF,jF,kF-1,ex)]-F10*fh[idx_fh_F(iF,jF,kF,ex)]+F18*fh[idx_fh_F(iF,jF,kF+1,ex)]-F6*fh[idx_fh_F(iF,jF,kF+2,ex)]+fh[idx_fh_F(iF,jF,kF+3,ex)]);
}
}
}
}
/* ---- KO dissipation (r=3, cof=64, sign=+) ---- */
if (eps > ZEO) {
const double cof = 64.0;
const double SIX = 6.0, FIT = 15.0, TWT = 20.0;
/* Restrict to points whose full +/-3 stencil lies between the min and max bounds. */
const int i0_lo=(iminF+2>0)?iminF+2:0, j0_lo=(jminF+2>0)?jminF+2:0, k0_lo=(kminF+2>0)?kminF+2:0;
const int i0_hi=imaxF-4, j0_hi=jmaxF-4, k0_hi=kmaxF-4;
if (!(i0_lo>i0_hi||j0_lo>j0_hi||k0_lo>k0_hi)) {
for (int k0=k0_lo;k0<=k0_hi;++k0) { const int kF=k0+1;
for (int j0=j0_lo;j0<=j0_hi;++j0) { const int jF=j0+1;
for (int i0=i0_lo;i0<=i0_hi;++i0) { const int iF=i0+1;
const size_t p=idx_ex(i0,j0,k0,ex);
/* Per-axis seven-point combination (1,-6,15,-20,15,-6,1) divided by the spacing. */
const double Dx=((fh[idx_fh_F(iF-3,jF,kF,ex)]+fh[idx_fh_F(iF+3,jF,kF,ex)])-SIX*(fh[idx_fh_F(iF-2,jF,kF,ex)]+fh[idx_fh_F(iF+2,jF,kF,ex)])+FIT*(fh[idx_fh_F(iF-1,jF,kF,ex)]+fh[idx_fh_F(iF+1,jF,kF,ex)])-TWT*fh[idx_fh_F(iF,jF,kF,ex)])/dX;
const double Dy=((fh[idx_fh_F(iF,jF-3,kF,ex)]+fh[idx_fh_F(iF,jF+3,kF,ex)])-SIX*(fh[idx_fh_F(iF,jF-2,kF,ex)]+fh[idx_fh_F(iF,jF+2,kF,ex)])+FIT*(fh[idx_fh_F(iF,jF-1,kF,ex)]+fh[idx_fh_F(iF,jF+1,kF,ex)])-TWT*fh[idx_fh_F(iF,jF,kF,ex)])/dY;
const double Dz=((fh[idx_fh_F(iF,jF,kF-3,ex)]+fh[idx_fh_F(iF,jF,kF+3,ex)])-SIX*(fh[idx_fh_F(iF,jF,kF-2,ex)]+fh[idx_fh_F(iF,jF,kF+2,ex)])+FIT*(fh[idx_fh_F(iF,jF,kF-1,ex)]+fh[idx_fh_F(iF,jF,kF+1,ex)])-TWT*fh[idx_fh_F(iF,jF,kF,ex)])/dZ;
f_rhs[p] += (eps/cof)*(Dx+Dy+Dz);
}}}
}
}
free(fh);
return;
}
#elif (ghost_width == 4)
{
const int ord = 4;
/* Lower stencil bounds; lowered to -3 on an axis when the symmetry test passes. */
int iminF = 1, jminF = 1, kminF = 1;
if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -3;
if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -3;
if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -3;
/* Scratch buffer is ord cells larger than the grid along every axis. */
const size_t nx = (size_t)ex1 + ord;
const size_t ny = (size_t)ex2 + ord;
const size_t nz = (size_t)ex3 + ord;
double *fh = (double*)malloc(nx*ny*nz*sizeof(double));
/* Allocation failure: silent return, f_rhs left unchanged. */
if (!fh) return;
/* symmetry_bd is expected to populate fh from f; fh is only read below. */
symmetry_bd(ord, ex, f, fh, SoA);
const double d60dx=ONE/F60/dX, d60dy=ONE/F60/dY, d60dz=ONE/F60/dZ;
const double d12dx=ONE/F12/dX, d12dy=ONE/F12/dY, d12dz=ONE/F12/dZ;
const double d2dx=ONE/TWO/dX, d2dy=ONE/TWO/dY, d2dz=ONE/TWO/dZ;
/* ---- advection (6th-order lopsided) ---- */
/* Per axis the cascade tries, in order: two lopsided 7-point stencils, then
 * centered +/-3, +/-2 and +/-1 stencils; a point matching none is skipped. */
for (int k0=0;k0<=ex3-2;++k0) { const int kF=k0+1;
for (int j0=0;j0<=ex2-2;++j0) { const int jF=j0+1;
for (int i0=0;i0<=ex1-2;++i0) { const int iF=i0+1;
const size_t p=idx_ex(i0,j0,k0,ex);
/* x */
/* NOTE(review): in the sfx<ZEO branches the test i0<=ex1-2 merely repeats
 * the loop bound, while the first stencil reads up to iF+2 (= ex1+1 when
 * i0 = ex1-2). The sfx>ZEO branches bound their forward reach explicitly.
 * Whether that index is valid depends on idx_fh_F_ord4 and the fh layout,
 * which are not visible here — confirm against the reference
 * implementation. The same pattern is repeated for y and z. */
const double sfx=Sfx[p];
if (sfx>ZEO) {
if (i0<=ex1-5&&(i0-1)>=iminF) f_rhs[p]+=sfx*d60dx*(+F2*fh[idx_fh_F_ord4(iF-2,jF,kF,ex)]-F24*fh[idx_fh_F_ord4(iF-1,jF,kF,ex)]-F35*fh[idx_fh_F_ord4(iF,jF,kF,ex)]+F80*fh[idx_fh_F_ord4(iF+1,jF,kF,ex)]-F30*fh[idx_fh_F_ord4(iF+2,jF,kF,ex)]+EIT*fh[idx_fh_F_ord4(iF+3,jF,kF,ex)]-fh[idx_fh_F_ord4(iF+4,jF,kF,ex)]);
else if (i0<=ex1-6&&i0>=iminF) f_rhs[p]+=sfx*d60dx*(-F10*fh[idx_fh_F_ord4(iF-1,jF,kF,ex)]-F77*fh[idx_fh_F_ord4(iF,jF,kF,ex)]+F150*fh[idx_fh_F_ord4(iF+1,jF,kF,ex)]-F100*fh[idx_fh_F_ord4(iF+2,jF,kF,ex)]+F50*fh[idx_fh_F_ord4(iF+3,jF,kF,ex)]-F15*fh[idx_fh_F_ord4(iF+4,jF,kF,ex)]+F2*fh[idx_fh_F_ord4(iF+5,jF,kF,ex)]);
else if (i0<=ex1-4&&(i0-2)>=iminF) f_rhs[p]+=sfx*d60dx*(-fh[idx_fh_F_ord4(iF-3,jF,kF,ex)]+F9*fh[idx_fh_F_ord4(iF-2,jF,kF,ex)]-F45*fh[idx_fh_F_ord4(iF-1,jF,kF,ex)]+F45*fh[idx_fh_F_ord4(iF+1,jF,kF,ex)]-F9*fh[idx_fh_F_ord4(iF+2,jF,kF,ex)]+fh[idx_fh_F_ord4(iF+3,jF,kF,ex)]);
else if (i0<=ex1-3&&(i0-1)>=iminF) f_rhs[p]+=sfx*d12dx*(fh[idx_fh_F_ord4(iF-2,jF,kF,ex)]-EIT*fh[idx_fh_F_ord4(iF-1,jF,kF,ex)]+EIT*fh[idx_fh_F_ord4(iF+1,jF,kF,ex)]-fh[idx_fh_F_ord4(iF+2,jF,kF,ex)]);
else if (i0<=ex1-2&&i0>=iminF) f_rhs[p]+=sfx*d2dx*(-fh[idx_fh_F_ord4(iF-1,jF,kF,ex)]+fh[idx_fh_F_ord4(iF+1,jF,kF,ex)]);
} else if (sfx<ZEO) {
if ((i0-4)>=iminF&&i0<=ex1-2) f_rhs[p]-=sfx*d60dx*(+F2*fh[idx_fh_F_ord4(iF+2,jF,kF,ex)]-F24*fh[idx_fh_F_ord4(iF+1,jF,kF,ex)]-F35*fh[idx_fh_F_ord4(iF,jF,kF,ex)]+F80*fh[idx_fh_F_ord4(iF-1,jF,kF,ex)]-F30*fh[idx_fh_F_ord4(iF-2,jF,kF,ex)]+EIT*fh[idx_fh_F_ord4(iF-3,jF,kF,ex)]-fh[idx_fh_F_ord4(iF-4,jF,kF,ex)]);
else if ((i0-5)>=iminF&&i0<=ex1-2) f_rhs[p]-=sfx*d60dx*(-F10*fh[idx_fh_F_ord4(iF+1,jF,kF,ex)]-F77*fh[idx_fh_F_ord4(iF,jF,kF,ex)]+F150*fh[idx_fh_F_ord4(iF-1,jF,kF,ex)]-F100*fh[idx_fh_F_ord4(iF-2,jF,kF,ex)]+F50*fh[idx_fh_F_ord4(iF-3,jF,kF,ex)]-F15*fh[idx_fh_F_ord4(iF-4,jF,kF,ex)]+F2*fh[idx_fh_F_ord4(iF-5,jF,kF,ex)]);
else if ((i0-3)>=iminF&&i0<=ex1-2) f_rhs[p]-=sfx*d60dx*(-fh[idx_fh_F_ord4(iF-3,jF,kF,ex)]+F9*fh[idx_fh_F_ord4(iF-2,jF,kF,ex)]-F45*fh[idx_fh_F_ord4(iF-1,jF,kF,ex)]+F45*fh[idx_fh_F_ord4(iF+1,jF,kF,ex)]-F9*fh[idx_fh_F_ord4(iF+2,jF,kF,ex)]+fh[idx_fh_F_ord4(iF+3,jF,kF,ex)]);
else if ((i0-2)>=iminF&&i0<=ex1-2) f_rhs[p]-=sfx*d12dx*(fh[idx_fh_F_ord4(iF-2,jF,kF,ex)]-EIT*fh[idx_fh_F_ord4(iF-1,jF,kF,ex)]+EIT*fh[idx_fh_F_ord4(iF+1,jF,kF,ex)]-fh[idx_fh_F_ord4(iF+2,jF,kF,ex)]);
else if ((i0-1)>=iminF&&i0<=ex1-2) f_rhs[p]-=sfx*d2dx*(-fh[idx_fh_F_ord4(iF-1,jF,kF,ex)]+fh[idx_fh_F_ord4(iF+1,jF,kF,ex)]);
}
/* y */
const double sfy=Sfy[p];
if (sfy>ZEO) {
if (j0<=ex2-5&&(j0-1)>=jminF) f_rhs[p]+=sfy*d60dy*(F2*fh[idx_fh_F_ord4(iF,jF-2,kF,ex)]-F24*fh[idx_fh_F_ord4(iF,jF-1,kF,ex)]-F35*fh[idx_fh_F_ord4(iF,jF,kF,ex)]+F80*fh[idx_fh_F_ord4(iF,jF+1,kF,ex)]-F30*fh[idx_fh_F_ord4(iF,jF+2,kF,ex)]+EIT*fh[idx_fh_F_ord4(iF,jF+3,kF,ex)]-fh[idx_fh_F_ord4(iF,jF+4,kF,ex)]);
else if (j0<=ex2-6&&j0>=jminF) f_rhs[p]+=sfy*d60dy*(-F10*fh[idx_fh_F_ord4(iF,jF-1,kF,ex)]-F77*fh[idx_fh_F_ord4(iF,jF,kF,ex)]+F150*fh[idx_fh_F_ord4(iF,jF+1,kF,ex)]-F100*fh[idx_fh_F_ord4(iF,jF+2,kF,ex)]+F50*fh[idx_fh_F_ord4(iF,jF+3,kF,ex)]-F15*fh[idx_fh_F_ord4(iF,jF+4,kF,ex)]+F2*fh[idx_fh_F_ord4(iF,jF+5,kF,ex)]);
else if (j0<=ex2-4&&(j0-2)>=jminF) f_rhs[p]+=sfy*d60dy*(-fh[idx_fh_F_ord4(iF,jF-3,kF,ex)]+F9*fh[idx_fh_F_ord4(iF,jF-2,kF,ex)]-F45*fh[idx_fh_F_ord4(iF,jF-1,kF,ex)]+F45*fh[idx_fh_F_ord4(iF,jF+1,kF,ex)]-F9*fh[idx_fh_F_ord4(iF,jF+2,kF,ex)]+fh[idx_fh_F_ord4(iF,jF+3,kF,ex)]);
else if (j0<=ex2-3&&(j0-1)>=jminF) f_rhs[p]+=sfy*d12dy*(fh[idx_fh_F_ord4(iF,jF-2,kF,ex)]-EIT*fh[idx_fh_F_ord4(iF,jF-1,kF,ex)]+EIT*fh[idx_fh_F_ord4(iF,jF+1,kF,ex)]-fh[idx_fh_F_ord4(iF,jF+2,kF,ex)]);
else if (j0<=ex2-2&&j0>=jminF) f_rhs[p]+=sfy*d2dy*(-fh[idx_fh_F_ord4(iF,jF-1,kF,ex)]+fh[idx_fh_F_ord4(iF,jF+1,kF,ex)]);
} else if (sfy<ZEO) {
if ((j0-4)>=jminF&&j0<=ex2-2) f_rhs[p]-=sfy*d60dy*(F2*fh[idx_fh_F_ord4(iF,jF+2,kF,ex)]-F24*fh[idx_fh_F_ord4(iF,jF+1,kF,ex)]-F35*fh[idx_fh_F_ord4(iF,jF,kF,ex)]+F80*fh[idx_fh_F_ord4(iF,jF-1,kF,ex)]-F30*fh[idx_fh_F_ord4(iF,jF-2,kF,ex)]+EIT*fh[idx_fh_F_ord4(iF,jF-3,kF,ex)]-fh[idx_fh_F_ord4(iF,jF-4,kF,ex)]);
else if ((j0-5)>=jminF&&j0<=ex2-2) f_rhs[p]-=sfy*d60dy*(-F10*fh[idx_fh_F_ord4(iF,jF+1,kF,ex)]-F77*fh[idx_fh_F_ord4(iF,jF,kF,ex)]+F150*fh[idx_fh_F_ord4(iF,jF-1,kF,ex)]-F100*fh[idx_fh_F_ord4(iF,jF-2,kF,ex)]+F50*fh[idx_fh_F_ord4(iF,jF-3,kF,ex)]-F15*fh[idx_fh_F_ord4(iF,jF-4,kF,ex)]+F2*fh[idx_fh_F_ord4(iF,jF-5,kF,ex)]);
else if ((j0-3)>=jminF&&j0<=ex2-2) f_rhs[p]-=sfy*d60dy*(-fh[idx_fh_F_ord4(iF,jF-3,kF,ex)]+F9*fh[idx_fh_F_ord4(iF,jF-2,kF,ex)]-F45*fh[idx_fh_F_ord4(iF,jF-1,kF,ex)]+F45*fh[idx_fh_F_ord4(iF,jF+1,kF,ex)]-F9*fh[idx_fh_F_ord4(iF,jF+2,kF,ex)]+fh[idx_fh_F_ord4(iF,jF+3,kF,ex)]);
else if ((j0-2)>=jminF&&j0<=ex2-2) f_rhs[p]-=sfy*d12dy*(fh[idx_fh_F_ord4(iF,jF-2,kF,ex)]-EIT*fh[idx_fh_F_ord4(iF,jF-1,kF,ex)]+EIT*fh[idx_fh_F_ord4(iF,jF+1,kF,ex)]-fh[idx_fh_F_ord4(iF,jF+2,kF,ex)]);
else if ((j0-1)>=jminF&&j0<=ex2-2) f_rhs[p]-=sfy*d2dy*(-fh[idx_fh_F_ord4(iF,jF-1,kF,ex)]+fh[idx_fh_F_ord4(iF,jF+1,kF,ex)]);
}
/* z */
const double sfz=Sfz[p];
if (sfz>ZEO) {
if (k0<=ex3-5&&(k0-1)>=kminF) f_rhs[p]+=sfz*d60dz*(F2*fh[idx_fh_F_ord4(iF,jF,kF-2,ex)]-F24*fh[idx_fh_F_ord4(iF,jF,kF-1,ex)]-F35*fh[idx_fh_F_ord4(iF,jF,kF,ex)]+F80*fh[idx_fh_F_ord4(iF,jF,kF+1,ex)]-F30*fh[idx_fh_F_ord4(iF,jF,kF+2,ex)]+EIT*fh[idx_fh_F_ord4(iF,jF,kF+3,ex)]-fh[idx_fh_F_ord4(iF,jF,kF+4,ex)]);
else if (k0<=ex3-6&&k0>=kminF) f_rhs[p]+=sfz*d60dz*(-F10*fh[idx_fh_F_ord4(iF,jF,kF-1,ex)]-F77*fh[idx_fh_F_ord4(iF,jF,kF,ex)]+F150*fh[idx_fh_F_ord4(iF,jF,kF+1,ex)]-F100*fh[idx_fh_F_ord4(iF,jF,kF+2,ex)]+F50*fh[idx_fh_F_ord4(iF,jF,kF+3,ex)]-F15*fh[idx_fh_F_ord4(iF,jF,kF+4,ex)]+F2*fh[idx_fh_F_ord4(iF,jF,kF+5,ex)]);
else if (k0<=ex3-4&&(k0-2)>=kminF) f_rhs[p]+=sfz*d60dz*(-fh[idx_fh_F_ord4(iF,jF,kF-3,ex)]+F9*fh[idx_fh_F_ord4(iF,jF,kF-2,ex)]-F45*fh[idx_fh_F_ord4(iF,jF,kF-1,ex)]+F45*fh[idx_fh_F_ord4(iF,jF,kF+1,ex)]-F9*fh[idx_fh_F_ord4(iF,jF,kF+2,ex)]+fh[idx_fh_F_ord4(iF,jF,kF+3,ex)]);
else if (k0<=ex3-3&&(k0-1)>=kminF) f_rhs[p]+=sfz*d12dz*(fh[idx_fh_F_ord4(iF,jF,kF-2,ex)]-EIT*fh[idx_fh_F_ord4(iF,jF,kF-1,ex)]+EIT*fh[idx_fh_F_ord4(iF,jF,kF+1,ex)]-fh[idx_fh_F_ord4(iF,jF,kF+2,ex)]);
else if (k0<=ex3-2&&k0>=kminF) f_rhs[p]+=sfz*d2dz*(-fh[idx_fh_F_ord4(iF,jF,kF-1,ex)]+fh[idx_fh_F_ord4(iF,jF,kF+1,ex)]);
} else if (sfz<ZEO) {
if ((k0-4)>=kminF&&k0<=ex3-2) f_rhs[p]-=sfz*d60dz*(F2*fh[idx_fh_F_ord4(iF,jF,kF+2,ex)]-F24*fh[idx_fh_F_ord4(iF,jF,kF+1,ex)]-F35*fh[idx_fh_F_ord4(iF,jF,kF,ex)]+F80*fh[idx_fh_F_ord4(iF,jF,kF-1,ex)]-F30*fh[idx_fh_F_ord4(iF,jF,kF-2,ex)]+EIT*fh[idx_fh_F_ord4(iF,jF,kF-3,ex)]-fh[idx_fh_F_ord4(iF,jF,kF-4,ex)]);
else if ((k0-5)>=kminF&&k0<=ex3-2) f_rhs[p]-=sfz*d60dz*(-F10*fh[idx_fh_F_ord4(iF,jF,kF+1,ex)]-F77*fh[idx_fh_F_ord4(iF,jF,kF,ex)]+F150*fh[idx_fh_F_ord4(iF,jF,kF-1,ex)]-F100*fh[idx_fh_F_ord4(iF,jF,kF-2,ex)]+F50*fh[idx_fh_F_ord4(iF,jF,kF-3,ex)]-F15*fh[idx_fh_F_ord4(iF,jF,kF-4,ex)]+F2*fh[idx_fh_F_ord4(iF,jF,kF-5,ex)]);
else if ((k0-3)>=kminF&&k0<=ex3-2) f_rhs[p]-=sfz*d60dz*(-fh[idx_fh_F_ord4(iF,jF,kF-3,ex)]+F9*fh[idx_fh_F_ord4(iF,jF,kF-2,ex)]-F45*fh[idx_fh_F_ord4(iF,jF,kF-1,ex)]+F45*fh[idx_fh_F_ord4(iF,jF,kF+1,ex)]-F9*fh[idx_fh_F_ord4(iF,jF,kF+2,ex)]+fh[idx_fh_F_ord4(iF,jF,kF+3,ex)]);
else if ((k0-2)>=kminF&&k0<=ex3-2) f_rhs[p]-=sfz*d12dz*(fh[idx_fh_F_ord4(iF,jF,kF-2,ex)]-EIT*fh[idx_fh_F_ord4(iF,jF,kF-1,ex)]+EIT*fh[idx_fh_F_ord4(iF,jF,kF+1,ex)]-fh[idx_fh_F_ord4(iF,jF,kF+2,ex)]);
else if ((k0-1)>=kminF&&k0<=ex3-2) f_rhs[p]-=sfz*d2dz*(-fh[idx_fh_F_ord4(iF,jF,kF-1,ex)]+fh[idx_fh_F_ord4(iF,jF,kF+1,ex)]);
}
}}}
/* ---- KO dissipation (r=4, cof=256, sign=-) ---- */
if (eps > ZEO) {
const double cof = 256.0;
const double F8k = 8.0, F28 = 28.0, F56 = 56.0, F70 = 70.0;
/* Restrict to points whose full +/-4 stencil lies between the min and max bounds. */
const int i0_lo=(iminF+3>0)?iminF+3:0, j0_lo=(jminF+3>0)?jminF+3:0, k0_lo=(kminF+3>0)?kminF+3:0;
const int i0_hi=imaxF-5, j0_hi=jmaxF-5, k0_hi=kmaxF-5;
if (!(i0_lo>i0_hi||j0_lo>j0_hi||k0_lo>k0_hi)) {
for (int k0=k0_lo;k0<=k0_hi;++k0) { const int kF=k0+1;
for (int j0=j0_lo;j0<=j0_hi;++j0) { const int jF=j0+1;
for (int i0=i0_lo;i0<=i0_hi;++i0) { const int iF=i0+1;
const size_t p=idx_ex(i0,j0,k0,ex);
/* Per-axis nine-point combination (1,-8,28,-56,70,...) divided by the spacing. */
const double Dx=((fh[idx_fh_F_ord4(iF-4,jF,kF,ex)]+fh[idx_fh_F_ord4(iF+4,jF,kF,ex)])-F8k*(fh[idx_fh_F_ord4(iF-3,jF,kF,ex)]+fh[idx_fh_F_ord4(iF+3,jF,kF,ex)])+F28*(fh[idx_fh_F_ord4(iF-2,jF,kF,ex)]+fh[idx_fh_F_ord4(iF+2,jF,kF,ex)])-F56*(fh[idx_fh_F_ord4(iF-1,jF,kF,ex)]+fh[idx_fh_F_ord4(iF+1,jF,kF,ex)])+F70*fh[idx_fh_F_ord4(iF,jF,kF,ex)])/dX;
const double Dy=((fh[idx_fh_F_ord4(iF,jF-4,kF,ex)]+fh[idx_fh_F_ord4(iF,jF+4,kF,ex)])-F8k*(fh[idx_fh_F_ord4(iF,jF-3,kF,ex)]+fh[idx_fh_F_ord4(iF,jF+3,kF,ex)])+F28*(fh[idx_fh_F_ord4(iF,jF-2,kF,ex)]+fh[idx_fh_F_ord4(iF,jF+2,kF,ex)])-F56*(fh[idx_fh_F_ord4(iF,jF-1,kF,ex)]+fh[idx_fh_F_ord4(iF,jF+1,kF,ex)])+F70*fh[idx_fh_F_ord4(iF,jF,kF,ex)])/dY;
const double Dz=((fh[idx_fh_F_ord4(iF,jF,kF-4,ex)]+fh[idx_fh_F_ord4(iF,jF,kF+4,ex)])-F8k*(fh[idx_fh_F_ord4(iF,jF,kF-3,ex)]+fh[idx_fh_F_ord4(iF,jF,kF+3,ex)])+F28*(fh[idx_fh_F_ord4(iF,jF,kF-2,ex)]+fh[idx_fh_F_ord4(iF,jF,kF+2,ex)])-F56*(fh[idx_fh_F_ord4(iF,jF,kF-1,ex)]+fh[idx_fh_F_ord4(iF,jF,kF+1,ex)])+F70*fh[idx_fh_F_ord4(iF,jF,kF,ex)])/dZ;
f_rhs[p] -= (eps/cof)*(Dx+Dy+Dz);
}}}
}
}
free(fh);
return;
}
#elif (ghost_width == 5)
{
const int ord = 5;
/* Lower stencil bounds; lowered to -4 on an axis when the symmetry test passes. */
int iminF = 1, jminF = 1, kminF = 1;
if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -4;
if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -4;
if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -4;
/* Scratch buffer is ord cells larger than the grid along every axis. */
const size_t nx = (size_t)ex1 + ord;
const size_t ny = (size_t)ex2 + ord;
const size_t nz = (size_t)ex3 + ord;
double *fh = (double*)malloc(nx*ny*nz*sizeof(double));
/* Allocation failure: silent return, f_rhs left unchanged. */
if (!fh) return;
/* symmetry_bd is expected to populate fh from f; fh is only read below. */
symmetry_bd(ord, ex, f, fh, SoA);
const double d840dx=ONE/F840/dX, d840dy=ONE/F840/dY, d840dz=ONE/F840/dZ;
const double d60dx=ONE/F60/dX, d60dy=ONE/F60/dY, d60dz=ONE/F60/dZ;
const double d12dx=ONE/F12/dX, d12dy=ONE/F12/dY, d12dz=ONE/F12/dZ;
const double d2dx=ONE/TWO/dX, d2dy=ONE/TWO/dY, d2dz=ONE/TWO/dZ;
/* ---- advection (8th-order lopsided) ---- */
/* Per axis the cascade tries, in order: a lopsided 9-point stencil, then
 * centered +/-4, +/-3, +/-2 and +/-1 stencils; a point matching none is skipped. */
for (int k0=0;k0<=ex3-2;++k0) { const int kF=k0+1;
for (int j0=0;j0<=ex2-2;++j0) { const int jF=j0+1;
for (int i0=0;i0<=ex1-2;++i0) { const int iF=i0+1;
const size_t p=idx_ex(i0,j0,k0,ex);
/* NOTE(review): in the sfx<ZEO branches the test i0<=ex1-2 merely repeats
 * the loop bound, while the first stencil reads up to iF+3 (= ex1+2 when
 * i0 = ex1-2). The sfx>ZEO branches bound their forward reach explicitly.
 * Whether those indices are valid depends on idx_fh_F_ord5 and the fh
 * layout, which are not visible here — confirm against the reference
 * implementation. The same pattern is repeated for y and z. */
const double sfx=Sfx[p];
if (sfx>ZEO) {
if (i0<=ex1-6&&(i0-2)>=iminF) f_rhs[p]+=sfx*d840dx*(-F5*fh[idx_fh_F_ord5(iF-3,jF,kF,ex)]+F60*fh[idx_fh_F_ord5(iF-2,jF,kF,ex)]-F420*fh[idx_fh_F_ord5(iF-1,jF,kF,ex)]-F378*fh[idx_fh_F_ord5(iF,jF,kF,ex)]+F1050*fh[idx_fh_F_ord5(iF+1,jF,kF,ex)]-F420*fh[idx_fh_F_ord5(iF+2,jF,kF,ex)]+F140*fh[idx_fh_F_ord5(iF+3,jF,kF,ex)]-F30*fh[idx_fh_F_ord5(iF+4,jF,kF,ex)]+F3*fh[idx_fh_F_ord5(iF+5,jF,kF,ex)]);
else if (i0<=ex1-5&&(i0-3)>=iminF) f_rhs[p]+=sfx*d840dx*(+F3*fh[idx_fh_F_ord5(iF-4,jF,kF,ex)]-F32*fh[idx_fh_F_ord5(iF-3,jF,kF,ex)]+F168*fh[idx_fh_F_ord5(iF-2,jF,kF,ex)]-F672*fh[idx_fh_F_ord5(iF-1,jF,kF,ex)]+F672*fh[idx_fh_F_ord5(iF+1,jF,kF,ex)]-F168*fh[idx_fh_F_ord5(iF+2,jF,kF,ex)]+F32*fh[idx_fh_F_ord5(iF+3,jF,kF,ex)]-F3*fh[idx_fh_F_ord5(iF+4,jF,kF,ex)]);
else if (i0<=ex1-4&&(i0-2)>=iminF) f_rhs[p]+=sfx*d60dx*(-fh[idx_fh_F_ord5(iF-3,jF,kF,ex)]+F9*fh[idx_fh_F_ord5(iF-2,jF,kF,ex)]-F45*fh[idx_fh_F_ord5(iF-1,jF,kF,ex)]+F45*fh[idx_fh_F_ord5(iF+1,jF,kF,ex)]-F9*fh[idx_fh_F_ord5(iF+2,jF,kF,ex)]+fh[idx_fh_F_ord5(iF+3,jF,kF,ex)]);
else if (i0<=ex1-3&&(i0-1)>=iminF) f_rhs[p]+=sfx*d12dx*(fh[idx_fh_F_ord5(iF-2,jF,kF,ex)]-EIT*fh[idx_fh_F_ord5(iF-1,jF,kF,ex)]+EIT*fh[idx_fh_F_ord5(iF+1,jF,kF,ex)]-fh[idx_fh_F_ord5(iF+2,jF,kF,ex)]);
else if (i0<=ex1-2&&i0>=iminF) f_rhs[p]+=sfx*d2dx*(-fh[idx_fh_F_ord5(iF-1,jF,kF,ex)]+fh[idx_fh_F_ord5(iF+1,jF,kF,ex)]);
} else if (sfx<ZEO) {
if ((i0-5)>=iminF&&i0<=ex1-2) f_rhs[p]-=sfx*d840dx*(-F5*fh[idx_fh_F_ord5(iF+3,jF,kF,ex)]+F60*fh[idx_fh_F_ord5(iF+2,jF,kF,ex)]-F420*fh[idx_fh_F_ord5(iF+1,jF,kF,ex)]-F378*fh[idx_fh_F_ord5(iF,jF,kF,ex)]+F1050*fh[idx_fh_F_ord5(iF-1,jF,kF,ex)]-F420*fh[idx_fh_F_ord5(iF-2,jF,kF,ex)]+F140*fh[idx_fh_F_ord5(iF-3,jF,kF,ex)]-F30*fh[idx_fh_F_ord5(iF-4,jF,kF,ex)]+F3*fh[idx_fh_F_ord5(iF-5,jF,kF,ex)]);
else if ((i0-4)>=iminF&&i0<=ex1-2) f_rhs[p]-=sfx*d840dx*(+F3*fh[idx_fh_F_ord5(iF-4,jF,kF,ex)]-F32*fh[idx_fh_F_ord5(iF-3,jF,kF,ex)]+F168*fh[idx_fh_F_ord5(iF-2,jF,kF,ex)]-F672*fh[idx_fh_F_ord5(iF-1,jF,kF,ex)]+F672*fh[idx_fh_F_ord5(iF+1,jF,kF,ex)]-F168*fh[idx_fh_F_ord5(iF+2,jF,kF,ex)]+F32*fh[idx_fh_F_ord5(iF+3,jF,kF,ex)]-F3*fh[idx_fh_F_ord5(iF+4,jF,kF,ex)]);
else if ((i0-3)>=iminF&&i0<=ex1-2) f_rhs[p]-=sfx*d60dx*(-fh[idx_fh_F_ord5(iF-3,jF,kF,ex)]+F9*fh[idx_fh_F_ord5(iF-2,jF,kF,ex)]-F45*fh[idx_fh_F_ord5(iF-1,jF,kF,ex)]+F45*fh[idx_fh_F_ord5(iF+1,jF,kF,ex)]-F9*fh[idx_fh_F_ord5(iF+2,jF,kF,ex)]+fh[idx_fh_F_ord5(iF+3,jF,kF,ex)]);
else if ((i0-2)>=iminF&&i0<=ex1-2) f_rhs[p]-=sfx*d12dx*(fh[idx_fh_F_ord5(iF-2,jF,kF,ex)]-EIT*fh[idx_fh_F_ord5(iF-1,jF,kF,ex)]+EIT*fh[idx_fh_F_ord5(iF+1,jF,kF,ex)]-fh[idx_fh_F_ord5(iF+2,jF,kF,ex)]);
else if ((i0-1)>=iminF&&i0<=ex1-2) f_rhs[p]-=sfx*d2dx*(-fh[idx_fh_F_ord5(iF-1,jF,kF,ex)]+fh[idx_fh_F_ord5(iF+1,jF,kF,ex)]);
}
/* y: same cascade along j. */
const double sfy=Sfy[p];
if (sfy>ZEO) {
if (j0<=ex2-6&&(j0-2)>=jminF) f_rhs[p]+=sfy*d840dy*(-F5*fh[idx_fh_F_ord5(iF,jF-3,kF,ex)]+F60*fh[idx_fh_F_ord5(iF,jF-2,kF,ex)]-F420*fh[idx_fh_F_ord5(iF,jF-1,kF,ex)]-F378*fh[idx_fh_F_ord5(iF,jF,kF,ex)]+F1050*fh[idx_fh_F_ord5(iF,jF+1,kF,ex)]-F420*fh[idx_fh_F_ord5(iF,jF+2,kF,ex)]+F140*fh[idx_fh_F_ord5(iF,jF+3,kF,ex)]-F30*fh[idx_fh_F_ord5(iF,jF+4,kF,ex)]+F3*fh[idx_fh_F_ord5(iF,jF+5,kF,ex)]);
else if (j0<=ex2-5&&(j0-3)>=jminF) f_rhs[p]+=sfy*d840dy*(+F3*fh[idx_fh_F_ord5(iF,jF-4,kF,ex)]-F32*fh[idx_fh_F_ord5(iF,jF-3,kF,ex)]+F168*fh[idx_fh_F_ord5(iF,jF-2,kF,ex)]-F672*fh[idx_fh_F_ord5(iF,jF-1,kF,ex)]+F672*fh[idx_fh_F_ord5(iF,jF+1,kF,ex)]-F168*fh[idx_fh_F_ord5(iF,jF+2,kF,ex)]+F32*fh[idx_fh_F_ord5(iF,jF+3,kF,ex)]-F3*fh[idx_fh_F_ord5(iF,jF+4,kF,ex)]);
else if (j0<=ex2-4&&(j0-2)>=jminF) f_rhs[p]+=sfy*d60dy*(-fh[idx_fh_F_ord5(iF,jF-3,kF,ex)]+F9*fh[idx_fh_F_ord5(iF,jF-2,kF,ex)]-F45*fh[idx_fh_F_ord5(iF,jF-1,kF,ex)]+F45*fh[idx_fh_F_ord5(iF,jF+1,kF,ex)]-F9*fh[idx_fh_F_ord5(iF,jF+2,kF,ex)]+fh[idx_fh_F_ord5(iF,jF+3,kF,ex)]);
else if (j0<=ex2-3&&(j0-1)>=jminF) f_rhs[p]+=sfy*d12dy*(fh[idx_fh_F_ord5(iF,jF-2,kF,ex)]-EIT*fh[idx_fh_F_ord5(iF,jF-1,kF,ex)]+EIT*fh[idx_fh_F_ord5(iF,jF+1,kF,ex)]-fh[idx_fh_F_ord5(iF,jF+2,kF,ex)]);
else if (j0<=ex2-2&&j0>=jminF) f_rhs[p]+=sfy*d2dy*(-fh[idx_fh_F_ord5(iF,jF-1,kF,ex)]+fh[idx_fh_F_ord5(iF,jF+1,kF,ex)]);
} else if (sfy<ZEO) {
if ((j0-5)>=jminF&&j0<=ex2-2) f_rhs[p]-=sfy*d840dy*(-F5*fh[idx_fh_F_ord5(iF,jF+3,kF,ex)]+F60*fh[idx_fh_F_ord5(iF,jF+2,kF,ex)]-F420*fh[idx_fh_F_ord5(iF,jF+1,kF,ex)]-F378*fh[idx_fh_F_ord5(iF,jF,kF,ex)]+F1050*fh[idx_fh_F_ord5(iF,jF-1,kF,ex)]-F420*fh[idx_fh_F_ord5(iF,jF-2,kF,ex)]+F140*fh[idx_fh_F_ord5(iF,jF-3,kF,ex)]-F30*fh[idx_fh_F_ord5(iF,jF-4,kF,ex)]+F3*fh[idx_fh_F_ord5(iF,jF-5,kF,ex)]);
else if ((j0-4)>=jminF&&j0<=ex2-2) f_rhs[p]-=sfy*d840dy*(+F3*fh[idx_fh_F_ord5(iF,jF-4,kF,ex)]-F32*fh[idx_fh_F_ord5(iF,jF-3,kF,ex)]+F168*fh[idx_fh_F_ord5(iF,jF-2,kF,ex)]-F672*fh[idx_fh_F_ord5(iF,jF-1,kF,ex)]+F672*fh[idx_fh_F_ord5(iF,jF+1,kF,ex)]-F168*fh[idx_fh_F_ord5(iF,jF+2,kF,ex)]+F32*fh[idx_fh_F_ord5(iF,jF+3,kF,ex)]-F3*fh[idx_fh_F_ord5(iF,jF+4,kF,ex)]);
else if ((j0-3)>=jminF&&j0<=ex2-2) f_rhs[p]-=sfy*d60dy*(-fh[idx_fh_F_ord5(iF,jF-3,kF,ex)]+F9*fh[idx_fh_F_ord5(iF,jF-2,kF,ex)]-F45*fh[idx_fh_F_ord5(iF,jF-1,kF,ex)]+F45*fh[idx_fh_F_ord5(iF,jF+1,kF,ex)]-F9*fh[idx_fh_F_ord5(iF,jF+2,kF,ex)]+fh[idx_fh_F_ord5(iF,jF+3,kF,ex)]);
else if ((j0-2)>=jminF&&j0<=ex2-2) f_rhs[p]-=sfy*d12dy*(fh[idx_fh_F_ord5(iF,jF-2,kF,ex)]-EIT*fh[idx_fh_F_ord5(iF,jF-1,kF,ex)]+EIT*fh[idx_fh_F_ord5(iF,jF+1,kF,ex)]-fh[idx_fh_F_ord5(iF,jF+2,kF,ex)]);
else if ((j0-1)>=jminF&&j0<=ex2-2) f_rhs[p]-=sfy*d2dy*(-fh[idx_fh_F_ord5(iF,jF-1,kF,ex)]+fh[idx_fh_F_ord5(iF,jF+1,kF,ex)]);
}
/* z: same cascade along k. */
const double sfz=Sfz[p];
if (sfz>ZEO) {
if (k0<=ex3-6&&(k0-2)>=kminF) f_rhs[p]+=sfz*d840dz*(-F5*fh[idx_fh_F_ord5(iF,jF,kF-3,ex)]+F60*fh[idx_fh_F_ord5(iF,jF,kF-2,ex)]-F420*fh[idx_fh_F_ord5(iF,jF,kF-1,ex)]-F378*fh[idx_fh_F_ord5(iF,jF,kF,ex)]+F1050*fh[idx_fh_F_ord5(iF,jF,kF+1,ex)]-F420*fh[idx_fh_F_ord5(iF,jF,kF+2,ex)]+F140*fh[idx_fh_F_ord5(iF,jF,kF+3,ex)]-F30*fh[idx_fh_F_ord5(iF,jF,kF+4,ex)]+F3*fh[idx_fh_F_ord5(iF,jF,kF+5,ex)]);
else if (k0<=ex3-5&&(k0-3)>=kminF) f_rhs[p]+=sfz*d840dz*(+F3*fh[idx_fh_F_ord5(iF,jF,kF-4,ex)]-F32*fh[idx_fh_F_ord5(iF,jF,kF-3,ex)]+F168*fh[idx_fh_F_ord5(iF,jF,kF-2,ex)]-F672*fh[idx_fh_F_ord5(iF,jF,kF-1,ex)]+F672*fh[idx_fh_F_ord5(iF,jF,kF+1,ex)]-F168*fh[idx_fh_F_ord5(iF,jF,kF+2,ex)]+F32*fh[idx_fh_F_ord5(iF,jF,kF+3,ex)]-F3*fh[idx_fh_F_ord5(iF,jF,kF+4,ex)]);
else if (k0<=ex3-4&&(k0-2)>=kminF) f_rhs[p]+=sfz*d60dz*(-fh[idx_fh_F_ord5(iF,jF,kF-3,ex)]+F9*fh[idx_fh_F_ord5(iF,jF,kF-2,ex)]-F45*fh[idx_fh_F_ord5(iF,jF,kF-1,ex)]+F45*fh[idx_fh_F_ord5(iF,jF,kF+1,ex)]-F9*fh[idx_fh_F_ord5(iF,jF,kF+2,ex)]+fh[idx_fh_F_ord5(iF,jF,kF+3,ex)]);
else if (k0<=ex3-3&&(k0-1)>=kminF) f_rhs[p]+=sfz*d12dz*(fh[idx_fh_F_ord5(iF,jF,kF-2,ex)]-EIT*fh[idx_fh_F_ord5(iF,jF,kF-1,ex)]+EIT*fh[idx_fh_F_ord5(iF,jF,kF+1,ex)]-fh[idx_fh_F_ord5(iF,jF,kF+2,ex)]);
else if (k0<=ex3-2&&k0>=kminF) f_rhs[p]+=sfz*d2dz*(-fh[idx_fh_F_ord5(iF,jF,kF-1,ex)]+fh[idx_fh_F_ord5(iF,jF,kF+1,ex)]);
} else if (sfz<ZEO) {
if ((k0-5)>=kminF&&k0<=ex3-2) f_rhs[p]-=sfz*d840dz*(-F5*fh[idx_fh_F_ord5(iF,jF,kF+3,ex)]+F60*fh[idx_fh_F_ord5(iF,jF,kF+2,ex)]-F420*fh[idx_fh_F_ord5(iF,jF,kF+1,ex)]-F378*fh[idx_fh_F_ord5(iF,jF,kF,ex)]+F1050*fh[idx_fh_F_ord5(iF,jF,kF-1,ex)]-F420*fh[idx_fh_F_ord5(iF,jF,kF-2,ex)]+F140*fh[idx_fh_F_ord5(iF,jF,kF-3,ex)]-F30*fh[idx_fh_F_ord5(iF,jF,kF-4,ex)]+F3*fh[idx_fh_F_ord5(iF,jF,kF-5,ex)]);
else if ((k0-4)>=kminF&&k0<=ex3-2) f_rhs[p]-=sfz*d840dz*(+F3*fh[idx_fh_F_ord5(iF,jF,kF-4,ex)]-F32*fh[idx_fh_F_ord5(iF,jF,kF-3,ex)]+F168*fh[idx_fh_F_ord5(iF,jF,kF-2,ex)]-F672*fh[idx_fh_F_ord5(iF,jF,kF-1,ex)]+F672*fh[idx_fh_F_ord5(iF,jF,kF+1,ex)]-F168*fh[idx_fh_F_ord5(iF,jF,kF+2,ex)]+F32*fh[idx_fh_F_ord5(iF,jF,kF+3,ex)]-F3*fh[idx_fh_F_ord5(iF,jF,kF+4,ex)]);
else if ((k0-3)>=kminF&&k0<=ex3-2) f_rhs[p]-=sfz*d60dz*(-fh[idx_fh_F_ord5(iF,jF,kF-3,ex)]+F9*fh[idx_fh_F_ord5(iF,jF,kF-2,ex)]-F45*fh[idx_fh_F_ord5(iF,jF,kF-1,ex)]+F45*fh[idx_fh_F_ord5(iF,jF,kF+1,ex)]-F9*fh[idx_fh_F_ord5(iF,jF,kF+2,ex)]+fh[idx_fh_F_ord5(iF,jF,kF+3,ex)]);
else if ((k0-2)>=kminF&&k0<=ex3-2) f_rhs[p]-=sfz*d12dz*(fh[idx_fh_F_ord5(iF,jF,kF-2,ex)]-EIT*fh[idx_fh_F_ord5(iF,jF,kF-1,ex)]+EIT*fh[idx_fh_F_ord5(iF,jF,kF+1,ex)]-fh[idx_fh_F_ord5(iF,jF,kF+2,ex)]);
else if ((k0-1)>=kminF&&k0<=ex3-2) f_rhs[p]-=sfz*d2dz*(-fh[idx_fh_F_ord5(iF,jF,kF-1,ex)]+fh[idx_fh_F_ord5(iF,jF,kF+1,ex)]);
}
}}}
/* ---- KO dissipation (r=5, cof=1024, sign=+) ---- */
if (eps > ZEO) {
const double cof = 1024.0;
const double F10k=10.0, F45k=45.0, F120=120.0, F210=210.0, F252=252.0;
/* Restrict to points whose full +/-5 stencil lies between the min and max bounds. */
const int i0_lo=(iminF+4>0)?iminF+4:0, j0_lo=(jminF+4>0)?jminF+4:0, k0_lo=(kminF+4>0)?kminF+4:0;
const int i0_hi=imaxF-6, j0_hi=jmaxF-6, k0_hi=kmaxF-6;
if (!(i0_lo>i0_hi||j0_lo>j0_hi||k0_lo>k0_hi)) {
for (int k0=k0_lo;k0<=k0_hi;++k0) { const int kF=k0+1;
for (int j0=j0_lo;j0<=j0_hi;++j0) { const int jF=j0+1;
for (int i0=i0_lo;i0<=i0_hi;++i0) { const int iF=i0+1;
const size_t p=idx_ex(i0,j0,k0,ex);
/* Per-axis eleven-point combination (1,-10,45,-120,210,-252,...) divided by the spacing. */
const double Dx=((fh[idx_fh_F_ord5(iF-5,jF,kF,ex)]+fh[idx_fh_F_ord5(iF+5,jF,kF,ex)])-F10k*(fh[idx_fh_F_ord5(iF-4,jF,kF,ex)]+fh[idx_fh_F_ord5(iF+4,jF,kF,ex)])+F45k*(fh[idx_fh_F_ord5(iF-3,jF,kF,ex)]+fh[idx_fh_F_ord5(iF+3,jF,kF,ex)])-F120*(fh[idx_fh_F_ord5(iF-2,jF,kF,ex)]+fh[idx_fh_F_ord5(iF+2,jF,kF,ex)])+F210*(fh[idx_fh_F_ord5(iF-1,jF,kF,ex)]+fh[idx_fh_F_ord5(iF+1,jF,kF,ex)])-F252*fh[idx_fh_F_ord5(iF,jF,kF,ex)])/dX;
const double Dy=((fh[idx_fh_F_ord5(iF,jF-5,kF,ex)]+fh[idx_fh_F_ord5(iF,jF+5,kF,ex)])-F10k*(fh[idx_fh_F_ord5(iF,jF-4,kF,ex)]+fh[idx_fh_F_ord5(iF,jF+4,kF,ex)])+F45k*(fh[idx_fh_F_ord5(iF,jF-3,kF,ex)]+fh[idx_fh_F_ord5(iF,jF+3,kF,ex)])-F120*(fh[idx_fh_F_ord5(iF,jF-2,kF,ex)]+fh[idx_fh_F_ord5(iF,jF+2,kF,ex)])+F210*(fh[idx_fh_F_ord5(iF,jF-1,kF,ex)]+fh[idx_fh_F_ord5(iF,jF+1,kF,ex)])-F252*fh[idx_fh_F_ord5(iF,jF,kF,ex)])/dY;
const double Dz=((fh[idx_fh_F_ord5(iF,jF,kF-5,ex)]+fh[idx_fh_F_ord5(iF,jF,kF+5,ex)])-F10k*(fh[idx_fh_F_ord5(iF,jF,kF-4,ex)]+fh[idx_fh_F_ord5(iF,jF,kF+4,ex)])+F45k*(fh[idx_fh_F_ord5(iF,jF,kF-3,ex)]+fh[idx_fh_F_ord5(iF,jF,kF+3,ex)])-F120*(fh[idx_fh_F_ord5(iF,jF,kF-2,ex)]+fh[idx_fh_F_ord5(iF,jF,kF+2,ex)])+F210*(fh[idx_fh_F_ord5(iF,jF,kF-1,ex)]+fh[idx_fh_F_ord5(iF,jF,kF+1,ex)])-F252*fh[idx_fh_F_ord5(iF,jF,kF,ex)])/dZ;
f_rhs[p] += (eps/cof)*(Dx+Dy+Dz);
}}}
}
}
free(fh);
return;
}
#else
#error "lopsided_kodis_c.C: unsupported ghost_width (must be 2, 3, 4, or 5)"
#endif
}

View File

@@ -1,77 +1,83 @@
#define tetradtype 2
#define Cell
#define ghost_width 3
#define GAUGE 0
#define CPBC_ghost_width (ghost_width)
#define ABV 0
#define EScalar_CC 2
#if 0 #if 0
note here
define tetradtype v:r; u: phi; w: theta
v:r; u: phi; w: theta tetradtype 0
tetradtype 0 v^a = (x,y,z)
v^a = (x,y,z) orthonormal order: v,u,w
orthonormal order: v,u,w m = (phi - i theta)/sqrt(2) following Frans, Eq.(8) of PRD 75, 124018(2007)
m = (phi - i theta)/sqrt(2) following Frans, Eq.(8) of PRD 75, 124018(2007) tetradtype 1
tetradtype 1 orthonormal order: w,u,v
orthonormal order: w,u,v m = (theta + i phi)/sqrt(2) following Sperhake, Eq.(3.2) of PRD 85, 124062(2012)
m = (theta + i phi)/sqrt(2) following Sperhake, Eq.(3.2) of PRD 85, 124062(2012) tetradtype 2
tetradtype 2 v_a = (x,y,z)
v_a = (x,y,z) orthonormal order: v,u,w
orthonormal order: v,u,w m = (phi - i theta)/sqrt(2) following Frans, Eq.(8) of PRD 75, 124018(2007)
m = (phi - i theta)/sqrt(2) following Frans, Eq.(8) of PRD 75, 124018(2007)
define Cell or Vertex
Cell center or Vertex center
define ghost_width
2nd order: 2
4th order: 3
6th order: 4
8th order: 5
define WithShell
use shell or not
define CPBC
use constraint preserving boundary condition or not
only affect Z4c
CPBC only supports WithShell
define GAUGE
0: B^i gauge
1: David puncture gauge
2: MB B^i gauge
3: RIT B^i gauge
4: MB beta gauge (beta gauge here does not mean Eq.(3) of PRD 84, 124006)
5: RIT beta gauge (beta gauge here does not mean Eq.(3) of PRD 84, 124006)
6: MGB1 B^i gauge
7: MGB2 B^i gauge
define CPBC_ghost_width (ghost_width)
buffer points for CPBC boundary
define ABV
0: using BSSN variable for constraint violation and psi4 calculation
1: using ADM variable for constraint violation and psi4 calculation
define EScalar_CC
Type of Potential and Scalar Distribution in F(R) Scalar-Tensor Theory
1: Case C of 1112.3928, V=0
2: shell with phi(r) = phi0 * a2^2/(1+a2^2), f(R) = R+a2*R^2 induced V
3: ground state of Schrodinger-Newton system, f(R) = R+a2*R^2 induced V
4: a2 = +oo and phi(r) = phi0 * 0.5 * ( tanh((r+r0)/sigma) - tanh((r-r0)/sigma) )
5: shell with phi(r) = phi0 * Exp(-(r-r0)**2/sigma), V = 0
#endif #endif
#define tetradtype 2
#if 0
note here
Cell center or Vertex center
#endif
#define Cell
#if 0
note here
2nd order: 2
4th order: 3
6th order: 4
8th order: 5
#endif
#define ghost_width 3
#if 0
note here
use shell or not
#endif
#define WithShell
#if 0
note here
use constraint preserving boundary condition or not
only affect Z4c
#endif
#define CPBC
#if 0
note here
Gauge condition type
0: B^i gauge
1: David's puncture gauge
2: MB B^i gauge
3: RIT B^i gauge
4: MB beta gauge (beta gauge here does not mean Eq.(3) of PRD 84, 124006)
5: RIT beta gauge (beta gauge here does not mean Eq.(3) of PRD 84, 124006)
6: MGB1 B^i gauge
7: MGB2 B^i gauge
#endif
#define GAUGE 2
#if 0
buffer points for CPBC boundary
#endif
#define CPBC_ghost_width (ghost_width)
#if 0
using BSSN variable for constraint violation and psi4 calculation: 0
using ADM variable for constraint violation and psi4 calculation: 1
#endif
#define ABV 0
#if 0
Type of Potential and Scalar Distribution in F(R) Scalar-Tensor Theory
1: Case C of 1112.3928, V=0
2: shell with a2^2*phi0/(1+a2^2), f(R) = R+a2*R^2 induced V
3: ground state of Schrodinger-Newton system, f(R) = R+a2*R^2 induced V
4: a2 = oo and phi(r) = phi0 * 0.5 * ( tanh((r+r0)/sigma) - tanh((r-r0)/sigma) )
5: shell with phi(r) = phi0*Exp(-(r-r0)**2/sigma), V = 0
#endif
#define EScalar_CC 2

View File

@@ -6,152 +6,95 @@
// application parameters // application parameters
/// ****
// sommerfeld boundary type
// 0: bam, 1: shibata
#define SommerType 0 #define SommerType 0
/// ****
// for Using Gauss-Legendre quadrature in theta direction
#define GaussInt #define GaussInt
#define ABEtype 0 /// ****
// 0: BSSN vacuum
// 1: coupled to scalar field
// 2: Z4c vacuum
// 3: coupled to Maxwell field
//
#define ABEtype 2
/// ****
// using Apparent Horizon Finder
//#define With_AHF //#define With_AHF
/// ****
// Psi4 calculation method
// 0: EB method
// 1: 4-D method
//
#define Psi4type 0 #define Psi4type 0
/// ****
// for Using point psi4 or not
//#define Point_Psi4 //#define Point_Psi4
/// ****
// RestrictProlong in Step (0) or after Step (1)
#define RPS 1 #define RPS 1
/// ****
// Enforce algebra constraint
// for every RK4 sub step: 0
// only when iter_count == 3: 1
// after routine Step: 2
#define AGM 0 #define AGM 0
/// ****
// Restrict Prolong using BAM style 1 or old style 0
#define RPB 0 #define RPB 0
/// ****
// 1: move Analysis out of the 4 sub steps and treat PBH with Euler method
#define MAPBH 1 #define MAPBH 1
/// ****
// parallel structure, 0: level by level, 1: considering all levels, 2: as 1 but reverse the CPU order, 3: Frank's scheme
#define PSTR 0 #define PSTR 0
/// ****
// regrid for every level or for all levels at a time
// 0: for every level; 1: for all
#define REGLEV 0 #define REGLEV 0
#define BSSN_FINE_TIMING 0 /// ****
// use gpu or not
#define BSSN_FINE_TIMING_EVERY 1
#define BSSN_FINE_TIMING_TOPN 8
#define BSSN_KERNEL_FINE_TIMING 0
#define BSSN_ENABLE_STDIN_ABORT_POLL 0
//#define USE_GPU //#define USE_GPU
/// ****
// use checkpoint for every process
//#define CHECKDETAIL //#define CHECKDETAIL
/// ****
// use FakeCheckPrepare to write CheckPoint
//#define FAKECHECK //#define FAKECHECK
//
// define SommerType
// sommerfeld boundary type
// 0: bam
// 1: shibata
//
// define GaussInt
// for Using Gauss-Legendre quadrature in theta direction
//
// define ABEtype
// 0: BSSN vacuum
// 1: coupled to scalar field
// 2: Z4c vacuum
// 3: coupled to Maxwell field
//
// define With_AHF
// using Apparent Horizon Finder
//
// define Psi4type
// Psi4 calculation method
// 0: EB method
// 1: 4-D method
//
// define Point_Psi4
// for Using point psi4 or not
//
// define RPS
// RestrictProlong in Step (0) or after Step (1)
//
// define AGM
// Enforce algebra constraint
// for every RK4 sub step: 0
// only when iter_count == 3: 1
// after routine Step: 2
//
// define RPB
// Restrict Prolong using BAM style 1 or old style 0
//
// define MAPBH
// 1: move Analysis out of the 4 sub steps and treat PBH with Euler method
//
// define PSTR
// parallel structure
// 0: level by level
// 1: considering all levels
// 2: as 1 but reverse the CPU order
// 3: Frank's scheme
//
// define REGLEV
// regrid for every level or for all levels at a time
// 0: for every level;
// 1: for all
//
// define BSSN_FINE_TIMING
// enable fine-grained per-timestep timing monitor
//
// define BSSN_FINE_TIMING_EVERY
// report timing every N coarse timesteps
//
// define BSSN_FINE_TIMING_TOPN
// number of hottest timing buckets shown in stdout
//
// define BSSN_KERNEL_FINE_TIMING
// enable split timing inside compute_rhs_bssn
//
// define BSSN_ENABLE_STDIN_ABORT_POLL
// poll stdin and broadcast abort flag every coarse step
//
// define USE_GPU
// use gpu or not
//
// define CHECKDETAIL
// use checkpoint for every process
//
// define FAKECHECK
// use FakeCheckPrepare to write CheckPoint
//
////================================================================ ////================================================================
// some basic parameters for numerical calculation // some basic parameters for numerical calculation
////================================================================
#define dim 3 #define dim 3
//#define Cell or Vertex in "macrodef.fh" //#define Cell or Vertex in "macrodef.fh"
// ******
// buffer point number for mesh refinement interface
#define buffer_width 6 #define buffer_width 6
// ******
// buffer point number shell-box interface, on shell
#define SC_width buffer_width #define SC_width buffer_width
// buffer point number shell-box interface, on box
#define CS_width (2*buffer_width) #define CS_width (2*buffer_width)
//
// define Cell or Vertex in "macrodef.fh"
//
// define buffer_width
// buffer point number for mesh refinement interface
//
// define SC_width buffer_width
// buffer point number shell-box interface, on shell
//
// define CS_width
// buffer point number shell-box interface, on box
//
#if(buffer_width < ghost_width) #if(buffer_width < ghost_width)
# error we always assume buffer_width>ghost_width #error we always assume buffer_width>ghost_width
#endif #endif
#define PACK 1 #define PACK 1

View File

@@ -2,92 +2,6 @@
include makefile.inc include makefile.inc
-include AMSS_NCKU_build.mk
ABE_TYPE ?= $(shell awk '/^[[:space:]]*\#define[[:space:]]+ABEtype/ {print $$3; exit}' macrodef.h 2>/dev/null)
ifeq ($(USE_TRANSFER_CACHE),auto)
ifeq ($(ABE_TYPE),0)
EFFECTIVE_USE_TRANSFER_CACHE = 1
else
EFFECTIVE_USE_TRANSFER_CACHE = 0
endif
else
EFFECTIVE_USE_TRANSFER_CACHE = $(USE_TRANSFER_CACHE)
endif
ifeq ($(USE_CXX_ESCALAR_KERNEL),1)
ifeq ($(ABE_TYPE),1)
EFFECTIVE_USE_CXX_ESCALAR_KERNEL = 1
else
EFFECTIVE_USE_CXX_ESCALAR_KERNEL = 0
endif
else
EFFECTIVE_USE_CXX_ESCALAR_KERNEL = 0
endif
ifeq ($(EFFECTIVE_USE_CXX_ESCALAR_KERNEL),1)
ifeq ($(USE_CXX_KERNELS),0)
$(error USE_CXX_ESCALAR_KERNEL=1 requires USE_CXX_KERNELS=1 because bssn_escalar_rhs_c.C reuses the C BSSN kernel)
endif
endif
ifeq ($(USE_CXX_EM_KERNEL),1)
ifeq ($(ABE_TYPE),3)
EFFECTIVE_USE_CXX_EM_KERNEL = 1
else
EFFECTIVE_USE_CXX_EM_KERNEL = 0
endif
else
EFFECTIVE_USE_CXX_EM_KERNEL = 0
endif
ifeq ($(EFFECTIVE_USE_CXX_EM_KERNEL),1)
ifeq ($(USE_CXX_KERNELS),0)
$(error USE_CXX_EM_KERNEL=1 requires USE_CXX_KERNELS=1 because bssn_em_rhs_c.C reuses the C BSSN kernel)
endif
endif
EM_KERNEL_FLAG = -DBSSN_USE_EM_C_KERNEL=$(EFFECTIVE_USE_CXX_EM_KERNEL)
## polint(ordn=6) kernel selector:
## 1 (default): barycentric fast path
## 0 : fallback to Neville path
POLINT6_USE_BARY ?= 1
POLINT6_FLAG = -DPOLINT6_USE_BARYCENTRIC=$(POLINT6_USE_BARY)
FMISC_SAFE_FLAG = -DUSE_FMISC_SAFE_MODE=$(USE_FMISC_SAFE_MODE)
TRANSFER_CACHE_FLAG = -DBSSN_USE_TRANSFER_CACHE=$(EFFECTIVE_USE_TRANSFER_CACHE)
ESCALAR_KERNEL_FLAG = -DBSSN_USE_ESCALAR_C_KERNEL=$(EFFECTIVE_USE_CXX_ESCALAR_KERNEL)
## ABE build flags selected by PGO_MODE (set in makefile.inc, default: opt)
## make -> opt (PGO-guided, maximum performance)
## make PGO_MODE=instrument -> instrument (Phase 1: collect fresh profile data)
PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/default.profdata
ifeq ($(PGO_MODE),instrument)
## Phase 1: instrumentation — omit -fp-model fast=2 for numerical stability (note: -ipo is still passed below)
CXXAPPFLAGS = -O3 -march=x86-64-v4 -fma -fprofile-instr-generate -ipo \
-Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS) \
$(FMISC_SAFE_FLAG) \
$(TRANSFER_CACHE_FLAG) $(ESCALAR_KERNEL_FLAG) $(EM_KERNEL_FLAG)
f90appflags = -O3 -march=x86-64-v4 -fma -fprofile-instr-generate -ipo \
-align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG) \
$(FMISC_SAFE_FLAG)
else
## opt (default): maximum-performance build.
## PGO (-fprofile-instr-use=$(PROFDATA)) is turned off: testing showed it made the code slower.
## INTERP_LB_FLAGS is turned off too: testing showed it also made the code slower.
CXXAPPFLAGS = -O3 -march=x86-64-v4 -fp-model fast=2 -fma -ipo \
-Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS) \
$(FMISC_SAFE_FLAG) \
$(TRANSFER_CACHE_FLAG) $(ESCALAR_KERNEL_FLAG) $(EM_KERNEL_FLAG)
f90appflags = -O3 -march=x86-64-v4 -fp-model fast=2 -fma -ipo \
-align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG) \
$(FMISC_SAFE_FLAG)
endif
.SUFFIXES: .o .f90 .C .for .cu .SUFFIXES: .o .f90 .C .for .cu
.f90.o: .f90.o:
@@ -96,121 +10,25 @@ endif
.C.o: .C.o:
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@ ${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
# ShellPatch.C uses OpenMP for setupintintstuff search loops
ShellPatch.o: ShellPatch.C
${CXX} $(CXXAPPFLAGS) $(OMP_FLAG) -c $< $(filein) -o $@
.for.o: .for.o:
$(f77) -c $< -o $@ $(f77) -c $< -o $@
.cu.o: .cu.o:
$(Cu) $(CUDA_APP_FLAGS) -c $< -o $@ $(CUDA_LIB_PATH) $(Cu) $(CUDA_APP_FLAGS) -c $< -o $@ $(CUDA_LIB_PATH)
# C rewrite of BSSN RHS kernel and helpers
bssn_rhs_c.o: bssn_rhs_c.C
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
fderivs_c.o: fderivs_c.C
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
fdderivs_c.o: fdderivs_c.C
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
kodiss_c.o: kodiss_c.C
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
lopsided_c.o: lopsided_c.C
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
lopsided_kodis_c.o: lopsided_kodis_c.C
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
# C rewrite of shell-patch derivative kernels
fderivs_sh_c.o: fderivs_sh_c.C
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
fdderivs_sh_c.o: fdderivs_sh_c.C
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
fderivs_shc_c.o: fderivs_shc_c.C
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
fdderivs_shc_c.o: fdderivs_shc_c.C
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
kodiss_sh_c.o: kodiss_sh_c.C
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
bssn_em_rhs_c.o: bssn_em_rhs_c.C
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
z4c_rhs_c.o: z4c_rhs_c.C
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
#interp_lb_profile.o: interp_lb_profile.C interp_lb_profile.h
# ${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
## TwoPunctureABE uses fixed optimal flags with its own PGO profile, independent of CXXAPPFLAGS
TP_PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/TwoPunctureABE.profdata
TP_OPTFLAGS = -O3 -march=x86-64-v4 -fp-model fast=2 -fma -ipo \
-fprofile-instr-use=$(TP_PROFDATA) \
-Dfortran3 -Dnewc -I${MKLROOT}/include
TwoPunctures.o: TwoPunctures.C TwoPunctures.o: TwoPunctures.C
${CXX} $(TP_OPTFLAGS) -qopenmp -c $< -o $@ ${CXX} $(CXXAPPFLAGS) -qopenmp -c $< -o $@
TwoPunctureABE.o: TwoPunctureABE.C TwoPunctureABE.o: TwoPunctureABE.C
${CXX} $(TP_OPTFLAGS) -qopenmp -c $< -o $@ ${CXX} $(CXXAPPFLAGS) -qopenmp -c $< -o $@
# Input files # Input files
## Kernel implementation switch (set USE_CXX_KERNELS=0 to fall back to Fortran)
ifeq ($(USE_CXX_KERNELS),0)
# Fortran mode: no C rewrite files; bssn_rhs.o is included via F90FILES below
CFILES =
else
# C++ mode (default): C rewrite of bssn/bssn-escalar rhs and helper kernels
CFILES = bssn_rhs_c.o fderivs_c.o fdderivs_c.o kodiss_c.o lopsided_c.o lopsided_kodis_c.o
ifeq ($(EFFECTIVE_USE_CXX_ESCALAR_KERNEL),1)
CFILES += bssn_escalar_rhs_c.o
endif
ifeq ($(EFFECTIVE_USE_CXX_EM_KERNEL),1)
CFILES += bssn_em_rhs_c.o
endif
endif
ifeq ($(USE_CXX_Z4C_KERNELS),1)
CFILES += z4c_rhs_c.o
Z4C_F90_OBJ =
else
Z4C_F90_OBJ = Z4c_rhs.o
endif
## RK4 kernel switch (independent from USE_CXX_KERNELS)
ifeq ($(USE_CXX_RK4),1)
CFILES += rungekutta4_rout_c.o
RK4_F90_OBJ =
else
RK4_F90_OBJ = rungekutta4_rout.o
endif
## Shell-patch derivative kernel switch (independent from USE_CXX_KERNELS)
## 1 : use C++ rewrite of shell derivative functions (experimental)
## 0 : use original Fortran diff_new_sh.o and kodiss_sh.o (default)
USE_CXX_SHELL_KERNELS ?= 0
ifeq ($(USE_CXX_SHELL_KERNELS),1)
CFILES += fderivs_sh_c.o fdderivs_sh_c.o fderivs_shc_c.o fdderivs_shc_c.o kodiss_sh_c.o
SH_F90_OBJ =
else
SH_F90_OBJ = diff_new_sh.o kodiss_sh.o point_diff_new_sh.o
endif
C++FILES = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\ C++FILES = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\
cgh.o bssn_class.o surface_integral.o ShellPatch.o\ cgh.o bssn_class.o surface_integral.o ShellPatch.o\
bssnEScalar_class.o perf.o Z4c_class.o NullShellPatch.o\ bssnEScalar_class.o perf.o Z4c_class.o NullShellPatch.o\
bssnEM_class.o cpbc_util.o z4c_rhs_point.o checkpoint.o\ bssnEM_class.o cpbc_util.o z4c_rhs_point.o checkpoint.o\
Parallel_bam.o scalar_class.o transpbh.o NullShellPatch2.o\ Parallel_bam.o scalar_class.o transpbh.o NullShellPatch2.o\
NullShellPatch2_Evo.o writefile_f.o interp_lb_profile.o NullShellPatch2_Evo.o writefile_f.o
C++FILES_GPU = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\ C++FILES_GPU = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\
cgh.o surface_integral.o ShellPatch.o\ cgh.o surface_integral.o ShellPatch.o\
@@ -220,27 +38,19 @@ C++FILES_GPU = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o
NullShellPatch2_Evo.o \ NullShellPatch2_Evo.o \
bssn_gpu_class.o bssn_step_gpu.o bssn_macro.o writefile_f.o bssn_gpu_class.o bssn_step_gpu.o bssn_macro.o writefile_f.o
F90FILES_BASE = enforce_algebra.o fmisc.o initial_puncture.o prolongrestrict.o\ F90FILES = enforce_algebra.o fmisc.o initial_puncture.o prolongrestrict.o\
prolongrestrict_cell.o prolongrestrict_vertex.o\ prolongrestrict_cell.o prolongrestrict_vertex.o\
$(RK4_F90_OBJ) diff_new.o kodiss.o\ rungekutta4_rout.o bssn_rhs.o diff_new.o kodiss.o kodiss_sh.o\
lopsidediff.o sommerfeld_rout.o getnp4.o $(SH_F90_OBJ)\ lopsidediff.o sommerfeld_rout.o getnp4.o diff_new_sh.o\
shellfunctions.o bssn_rhs_ss.o Set_Rho_ADM.o\ shellfunctions.o bssn_rhs_ss.o Set_Rho_ADM.o\
getnp4EScalar.o bssnEScalar_rhs.o bssn_constraint.o ricci_gamma.o\ getnp4EScalar.o bssnEScalar_rhs.o bssn_constraint.o ricci_gamma.o\
fadmquantites_bssn.o $(Z4C_F90_OBJ) Z4c_rhs_ss.o\ fadmquantites_bssn.o Z4c_rhs.o Z4c_rhs_ss.o point_diff_new_sh.o\
cpbc.o getnp4old.o NullEvol.o initial_null.o initial_maxwell.o\ cpbc.o getnp4old.o NullEvol.o initial_null.o initial_maxwell.o\
getnpem2.o empart.o NullNews.o fourdcurvature.o\ getnpem2.o empart.o NullNews.o fourdcurvature.o\
bssn2adm.o adm_constraint.o adm_ricci_gamma.o\ bssn2adm.o adm_constraint.o adm_ricci_gamma.o\
scalar_rhs.o initial_scalar.o NullEvol2.o initial_null2.o\ scalar_rhs.o initial_scalar.o NullEvol2.o initial_null2.o\
NullNews2.o tool_f.o NullNews2.o tool_f.o
ifeq ($(USE_CXX_KERNELS),0)
# Fortran mode: include original bssn_rhs.o
F90FILES = $(F90FILES_BASE) bssn_rhs.o
else
# C++ mode (default): bssn_rhs.o replaced by C++ kernel
F90FILES = $(F90FILES_BASE)
endif
F77FILES = zbesh.o F77FILES = zbesh.o
AHFDOBJS = expansion.o expansion_Jacobian.o patch.o coords.o patch_info.o patch_interp.o patch_system.o \ AHFDOBJS = expansion.o expansion_Jacobian.o patch.o coords.o patch_info.o patch_interp.o patch_system.o \
@@ -253,7 +63,7 @@ TwoPunctureFILES = TwoPunctureABE.o TwoPunctures.o
CUDAFILES = bssn_gpu.o bssn_gpu_rhs_ss.o CUDAFILES = bssn_gpu.o bssn_gpu_rhs_ss.o
# file dependences # file dependences
$(C++FILES) $(C++FILES_GPU) $(F90FILES) $(CFILES) $(AHFDOBJS) $(CUDAFILES): macrodef.fh $(C++FILES) $(C++FILESGPU) $(F90FILES) $(AHFDOBJS) $(CUDAFILES): macrodef.fh
$(C++FILES): Block.h enforce_algebra.h fmisc.h initial_puncture.h macrodef.h\ $(C++FILES): Block.h enforce_algebra.h fmisc.h initial_puncture.h macrodef.h\
misc.h monitor.h MyList.h Parallel.h MPatch.h prolongrestrict.h\ misc.h monitor.h MyList.h Parallel.h MPatch.h prolongrestrict.h\
@@ -276,7 +86,7 @@ $(C++FILES_GPU): Block.h enforce_algebra.h fmisc.h initial_puncture.h macrodef.h
$(AHFDOBJS): cctk.h cctk_Config.h cctk_Types.h cctk_Constants.h myglobal.h $(AHFDOBJS): cctk.h cctk_Config.h cctk_Types.h cctk_Constants.h myglobal.h
$(C++FILES) $(C++FILES_GPU) $(CFILES) $(AHFDOBJS) $(CUDAFILES): macrodef.h $(C++FILES) $(C++FILES_GPU) $(AHFDOBJS) $(CUDAFILES): macrodef.h
TwoPunctureFILES: TwoPunctures.h TwoPunctureFILES: TwoPunctures.h
@@ -285,14 +95,14 @@ $(CUDAFILES): bssn_gpu.h gpu_mem.h gpu_rhsSS_mem.h
misc.o : zbesh.o misc.o : zbesh.o
# projects # projects
ABE: $(C++FILES) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) ABE: $(C++FILES) $(F90FILES) $(F77FILES) $(AHFDOBJS)
$(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(LDLIBS) $(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(LDLIBS)
ABEGPU: $(C++FILES_GPU) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) ABEGPU: $(C++FILES_GPU) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES)
$(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES_GPU) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) $(LDLIBS) $(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES_GPU) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) $(LDLIBS)
TwoPunctureABE: $(TwoPunctureFILES) TwoPunctureABE: $(TwoPunctureFILES)
$(CLINKER) $(TP_OPTFLAGS) -qopenmp -o $@ $(TwoPunctureFILES) $(LDLIBS) $(CLINKER) $(CXXAPPFLAGS) -qopenmp -o $@ $(TwoPunctureFILES) $(LDLIBS)
clean: clean:
rm *.o ABE ABEGPU TwoPunctureABE make.log -f rm *.o ABE ABEGPU TwoPunctureABE make.log -f

View File

@@ -8,79 +8,18 @@ filein = -I/usr/include/ -I${MKLROOT}/include
## Using sequential MKL (OpenMP disabled for better single-threaded performance) ## Using sequential MKL (OpenMP disabled for better single-threaded performance)
## Added -lifcore for Intel Fortran runtime and -limf for Intel math library ## Added -lifcore for Intel Fortran runtime and -limf for Intel math library
LDLIBS = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore -limf -lpthread -lm -ldl -liomp5 LDLIBS = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore -limf -lpthread -lm -ldl
## Memory allocator switch
## 1 (default) : link Intel oneTBB allocator (libtbbmalloc)
## 0 : use system default allocator (ptmalloc)
USE_TBBMALLOC ?= 1
TBBMALLOC_SO ?= /home/intel/oneapi/2025.3/lib/libtbbmalloc.so
ifneq ($(wildcard $(TBBMALLOC_SO)),)
TBBMALLOC_LIBS = -Wl,--no-as-needed $(TBBMALLOC_SO) -Wl,--as-needed
else
TBBMALLOC_LIBS = -Wl,--no-as-needed -ltbbmalloc -Wl,--as-needed
endif
ifeq ($(USE_TBBMALLOC),1)
LDLIBS := $(TBBMALLOC_LIBS) $(LDLIBS)
endif
## PGO build mode switch (ABE only; TwoPunctureABE always uses opt flags)
## opt : (default) maximum performance with PGO profile-guided optimization
## instrument : PGO Phase 1 instrumentation to collect fresh profile data
PGO_MODE ?= opt
## Interp_Points load balance profiling mode
## off : (default) no load balance instrumentation
## profile : Pass 1 — instrument Interp_Points to collect timing profile
## optimize : Pass 2 — read profile and apply block rebalancing
INTERP_LB_MODE ?= off
ifeq ($(INTERP_LB_MODE),profile)
INTERP_LB_FLAGS = -DINTERP_LB_PROFILE
else ifeq ($(INTERP_LB_MODE),optimize)
INTERP_LB_FLAGS = -DINTERP_LB_OPTIMIZE
else
INTERP_LB_FLAGS =
endif
## Kernel implementation switch
## 1 : use C++ rewrite of bssn_rhs and helper kernels (faster)
## 0 (default): fall back to original Fortran kernels
USE_CXX_KERNELS ?= 0
## Z4C Cartesian RHS kernel switch
## 1 : use C++ rewrite of Z4c_rhs (main Cartesian path faster)
## 0 (default): use original Fortran Z4c_rhs.o
USE_CXX_Z4C_KERNELS ?= 0
## BSSN-EScalar RHS switch
## 1 : use BSSN-EScalar C wrapper on the normal patch path
## 0 : keep the original Fortran BSSN-EScalar RHS for precision-safe runs
## Note: this requires USE_CXX_KERNELS=1 because the wrapper reuses the C BSSN kernel.
USE_CXX_ESCALAR_KERNEL ?= 0
## BSSN-EM RHS switch
## 1 : use BSSN-EM C kernel (bssn_em_rhs_c.C) on the normal patch path
## 0 : keep the original Fortran empart.f90 RHS for the EM fields (default)
## Note: experimental, requires USE_CXX_KERNELS=1
USE_CXX_EM_KERNEL ?= 0
## Cached transfer switch
## auto (default): enable for BSSN vacuum, keep other paths on the safe uncached path
## 1 : force cached Sync/Restrict/OutBd transfer on evolution hot paths
## 0 : force the original uncached transfer path
USE_TRANSFER_CACHE ?= auto
## RK4 kernel implementation switch
## 1 : use C/C++ rewrite of rungekutta4_rout (for optimization experiments)
## 0 (default): use original Fortran rungekutta4_rout.o
USE_CXX_RK4 ?= 0
## fmisc conservative mode switch
## 1 : restore lower-optimization / legacy fmisc numerics
## 0 (default): keep the optimized fmisc paths
USE_FMISC_SAFE_MODE ?= 0
## Aggressive optimization flags + PGO Phase 2 (profile-guided optimization)
## -fprofile-instr-use: use collected profile data to guide optimization decisions
## (branch prediction, basic block layout, inlining, loop unrolling)
PROFDATA = ../../pgo_profile/default.profdata
CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
-fprofile-instr-use=$(PROFDATA) \
-Dfortran3 -Dnewc -I${MKLROOT}/include
f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \
-fprofile-instr-use=$(PROFDATA) \
-align array64byte -fpp -I${MKLROOT}/include
f90 = ifx f90 = ifx
f77 = ifx f77 = ifx
CXX = icpx CXX = icpx

View File

@@ -1934,35 +1934,18 @@
! when if=1 -> ic=0, this is different to vertex center grid ! when if=1 -> ic=0, this is different to vertex center grid
real*8, dimension(-2:extc(1),-2:extc(2),-2:extc(3)) :: funcc real*8, dimension(-2:extc(1),-2:extc(2),-2:extc(3)) :: funcc
integer,dimension(3) :: cxI integer,dimension(3) :: cxI
integer :: i,j,k,ii,jj,kk,px,py,pz integer :: i,j,k,ii,jj,kk
real*8, dimension(6,6) :: tmp2 real*8, dimension(6,6) :: tmp2
real*8, dimension(6) :: tmp1 real*8, dimension(6) :: tmp1
integer, dimension(extf(1)) :: cix
integer, dimension(extf(2)) :: ciy
integer, dimension(extf(3)) :: ciz
integer, dimension(extf(1)) :: pix
integer, dimension(extf(2)) :: piy
integer, dimension(extf(3)) :: piz
real*8, parameter :: C1=7.7d1/8.192d3,C2=-6.93d2/8.192d3,C3=3.465d3/4.096d3 real*8, parameter :: C1=7.7d1/8.192d3,C2=-6.93d2/8.192d3,C3=3.465d3/4.096d3
real*8, parameter :: C6=6.3d1/8.192d3,C5=-4.95d2/8.192d3,C4=1.155d3/4.096d3 real*8, parameter :: C6=6.3d1/8.192d3,C5=-4.95d2/8.192d3,C4=1.155d3/4.096d3
real*8, dimension(6,2), parameter :: WC = reshape((/&
C1,C2,C3,C4,C5,C6,&
C6,C5,C4,C3,C2,C1/), (/6,2/))
integer::imini,imaxi,jmini,jmaxi,kmini,kmaxi integer::imini,imaxi,jmini,jmaxi,kmini,kmaxi
integer::imino,imaxo,jmino,jmaxo,kmino,kmaxo integer::imino,imaxo,jmino,jmaxo,kmino,kmaxo
integer::maxcx,maxcy,maxcz
real*8,dimension(3) :: CD,FD real*8,dimension(3) :: CD,FD
real*8 :: tmp_yz(extc(1), 6) ! 存储整条 X 线上 6 个 Y 轴偏置的 Z 向插值结果
real*8 :: tmp_xyz_line(-2:extc(1)) ! 包含 X 向 6 点模板访问所需下界
real*8 :: v1, v2, v3, v4, v5, v6
integer :: ic, jc, kc, ix_offset,ix,iy,iz,jc_min,jc_max,ic_min,ic_max,kc_min,kc_max
integer :: i_lo, i_hi, j_lo, j_hi, k_lo, k_hi
logical :: need_full_symmetry
real*8 :: res_line
real*8 :: tmp_z_slab(-2:extc(1), -2:extc(2)) ! 包含 Y/X 向模板访问所需下界
if(wei.ne.3)then if(wei.ne.3)then
write(*,*)"prolongrestrict.f90::prolong3: this routine only surport 3 dimension" write(*,*)"prolongrestrict.f90::prolong3: this routine only surport 3 dimension"
write(*,*)"dim = ",wei write(*,*)"dim = ",wei
@@ -2037,140 +2020,145 @@
return return
endif endif
do i = imino,imaxo call symmetry_bd(3,extc,func,funcc,SoA)
ii = i + lbf(1) - 1
cix(i) = ii/2 - lbc(1) + 1
if(ii/2*2 == ii)then
pix(i) = 1
else
pix(i) = 2
endif
enddo
do j = jmino,jmaxo
jj = j + lbf(2) - 1
ciy(j) = jj/2 - lbc(2) + 1
if(jj/2*2 == jj)then
piy(j) = 1
else
piy(j) = 2
endif
enddo
do k = kmino,kmaxo
kk = k + lbf(3) - 1
ciz(k) = kk/2 - lbc(3) + 1
if(kk/2*2 == kk)then
piz(k) = 1
else
piz(k) = 2
endif
enddo
ic_min = minval(cix(imino:imaxo))
ic_max = maxval(cix(imino:imaxo))
jc_min = minval(ciy(jmino:jmaxo))
jc_max = maxval(ciy(jmino:jmaxo))
kc_min = minval(ciz(kmino:kmaxo))
kc_max = maxval(ciz(kmino:kmaxo))
maxcx = ic_max
maxcy = jc_max
maxcz = kc_max
if(maxcx+3 > extc(1) .or. maxcy+3 > extc(2) .or. maxcz+3 > extc(3))then
write(*,*)"error in prolong"
return
endif
i_lo = ic_min - 2
i_hi = ic_max + 3
j_lo = jc_min - 2
j_hi = jc_max + 3
k_lo = kc_min - 2
k_hi = kc_max + 3
need_full_symmetry = (i_lo < 1) .or. (j_lo < 1) .or. (k_lo < 1)
if(need_full_symmetry)then
call symmetry_bd(3,extc,func,funcc,SoA)
else
funcc(i_lo:i_hi,j_lo:j_hi,k_lo:k_hi) = func(i_lo:i_hi,j_lo:j_hi,k_lo:k_hi)
endif
! For each k (which fixes pz and kc), precompute the 2D slab of Z-direction interpolated values
do k = kmino, kmaxo
pz = piz(k); kc = ciz(k)
! --- Pass 1: Z 方向,只算一次 ---
do iy = jc_min-2, jc_max+3 ! 仅需的 iy 范围(对应 jc-2:jc+3
do ii = ic_min-2, ic_max+3 ! 仅需的 ii 范围(对应 cix-2:cix+3
tmp_z_slab(ii, iy) = sum(WC(:,pz) * funcc(ii, iy, kc-2:kc+3))
end do
end do
do j = jmino, jmaxo
py = piy(j); jc = ciy(j)
! --- Pass 2: Y 方向 ---
do ii = ic_min-2, ic_max+3
tmp_xyz_line(ii) = sum(WC(:,py) * tmp_z_slab(ii, jc-2:jc+3))
end do
! --- Pass 3: X 方向 ---
do i = imino, imaxo
funf(i,j,k) = sum(WC(:,pix(i)) * tmp_xyz_line(cix(i)-2:cix(i)+3))
end do
end do
end do
!~~~~~~> prolongation start... !~~~~~~> prolongation start...
do k = kmino,kmaxo
do j = jmino,jmaxo
do i = imino,imaxo
cxI(1) = i
cxI(2) = j
cxI(3) = k
! change to coarse level reference
!|---*--- ---*--- ---*--- ---*--- ---*--- ---*--- ---*--- ---*---|
!|=======x===============x===============x===============x=======|
cxI = (cxI+lbf-1)/2
! change to array index
cxI = cxI - lbc + 1
if(any(cxI+3 > extc)) write(*,*)"error in prolong"
ii=i+lbf(1)-1
jj=j+lbf(2)-1
kk=k+lbf(3)-1
#if 0 #if 0
do k = kmino, kmaxo if(ii/2*2==ii)then
pz = piz(k) if(jj/2*2==jj)then
kc = ciz(k) if(kk/2*2==kk)then
tmp2= C1*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+&
C2*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+&
C3*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3) )+&
C4*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+&
C5*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+&
C6*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3)
tmp1= C1*tmp2(:,1)+C2*tmp2(:,2)+C3*tmp2(:,3)+C4*tmp2(:,4)+C5*tmp2(:,5)+C6*tmp2(:,6)
funf(i,j,k)= C1*tmp1(1)+C2*tmp1(2)+C3*tmp1(3)+C4*tmp1(4)+C5*tmp1(5)+C6*tmp1(6)
else
tmp2= C6*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+&
C5*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+&
C4*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3) )+&
C3*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+&
C2*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+&
C1*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3)
tmp1= C1*tmp2(:,1)+C2*tmp2(:,2)+C3*tmp2(:,3)+C4*tmp2(:,4)+C5*tmp2(:,5)+C6*tmp2(:,6)
funf(i,j,k)= C1*tmp1(1)+C2*tmp1(2)+C3*tmp1(3)+C4*tmp1(4)+C5*tmp1(5)+C6*tmp1(6)
endif
else
if(kk/2*2==kk)then
tmp2= C1*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+&
C2*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+&
C3*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3) )+&
C4*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+&
C5*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+&
C6*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3)
tmp1= C6*tmp2(:,1)+C5*tmp2(:,2)+C4*tmp2(:,3)+C3*tmp2(:,4)+C2*tmp2(:,5)+C1*tmp2(:,6)
funf(i,j,k)= C1*tmp1(1)+C2*tmp1(2)+C3*tmp1(3)+C4*tmp1(4)+C5*tmp1(5)+C6*tmp1(6)
else
tmp2= C6*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+&
C5*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+&
C4*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3) )+&
C3*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+&
C2*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+&
C1*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3)
tmp1= C6*tmp2(:,1)+C5*tmp2(:,2)+C4*tmp2(:,3)+C3*tmp2(:,4)+C2*tmp2(:,5)+C1*tmp2(:,6)
funf(i,j,k)= C1*tmp1(1)+C2*tmp1(2)+C3*tmp1(3)+C4*tmp1(4)+C5*tmp1(5)+C6*tmp1(6)
endif
endif
else
if(jj/2*2==jj)then
if(kk/2*2==kk)then
tmp2= C1*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+&
C2*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+&
C3*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3) )+&
C4*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+&
C5*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+&
C6*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3)
tmp1= C1*tmp2(:,1)+C2*tmp2(:,2)+C3*tmp2(:,3)+C4*tmp2(:,4)+C5*tmp2(:,5)+C6*tmp2(:,6)
funf(i,j,k)= C6*tmp1(1)+C5*tmp1(2)+C4*tmp1(3)+C3*tmp1(4)+C2*tmp1(5)+C1*tmp1(6)
else
tmp2= C6*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+&
C5*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+&
C4*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3) )+&
C3*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+&
C2*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+&
C1*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3)
tmp1= C1*tmp2(:,1)+C2*tmp2(:,2)+C3*tmp2(:,3)+C4*tmp2(:,4)+C5*tmp2(:,5)+C6*tmp2(:,6)
funf(i,j,k)= C6*tmp1(1)+C5*tmp1(2)+C4*tmp1(3)+C3*tmp1(4)+C2*tmp1(5)+C1*tmp1(6)
endif
else
if(kk/2*2==kk)then
tmp2= C1*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+&
C2*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+&
C3*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3) )+&
C4*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+&
C5*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+&
C6*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3)
tmp1= C6*tmp2(:,1)+C5*tmp2(:,2)+C4*tmp2(:,3)+C3*tmp2(:,4)+C2*tmp2(:,5)+C1*tmp2(:,6)
funf(i,j,k)= C6*tmp1(1)+C5*tmp1(2)+C4*tmp1(3)+C3*tmp1(4)+C2*tmp1(5)+C1*tmp1(6)
else
tmp2= C6*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+&
C5*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+&
C4*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3) )+&
C3*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+&
C2*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+&
C1*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3)
tmp1= C6*tmp2(:,1)+C5*tmp2(:,2)+C4*tmp2(:,3)+C3*tmp2(:,4)+C2*tmp2(:,5)+C1*tmp2(:,6)
funf(i,j,k)= C6*tmp1(1)+C5*tmp1(2)+C4*tmp1(3)+C3*tmp1(4)+C2*tmp1(5)+C1*tmp1(6)
endif
endif
endif
#else
if(kk/2*2==kk)then
tmp2= C1*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+&
C2*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+&
C3*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3) )+&
C4*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+&
C5*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+&
C6*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3)
else
tmp2= C6*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+&
C5*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+&
C4*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3) )+&
C3*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+&
C2*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+&
C1*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3)
endif
do j = jmino, jmaxo if(jj/2*2==jj)then
py = piy(j) tmp1= C1*tmp2(:,1)+C2*tmp2(:,2)+C3*tmp2(:,3)+C4*tmp2(:,4)+C5*tmp2(:,5)+C6*tmp2(:,6)
jc = ciy(j) else
tmp1= C6*tmp2(:,1)+C5*tmp2(:,2)+C4*tmp2(:,3)+C3*tmp2(:,4)+C2*tmp2(:,5)+C1*tmp2(:,6)
endif
! --- 步骤 1 & 2 融合:分段处理 X 轴,提升 Cache 命中率 --- if(ii/2*2==ii)then
! 我们将 ii 循环逻辑重组,减少对 funcc 的跨行重复访问 funf(i,j,k)= C1*tmp1(1)+C2*tmp1(2)+C3*tmp1(3)+C4*tmp1(4)+C5*tmp1(5)+C6*tmp1(6)
do ii = 1, extc(1) else
! 1. 先做 Z 方向的 6 条线插值(针对当前的 ii 和当前的 6 个 iy funf(i,j,k)= C6*tmp1(1)+C5*tmp1(2)+C4*tmp1(3)+C3*tmp1(4)+C2*tmp1(5)+C1*tmp1(6)
! 我们直接在这里把 Y 方向的加权也做了,省去 tmp_yz 数组 endif
! 这样 funcc 的数据读进来后立即完成所有维度的贡献,不再写回内存
res_line = 0.0d0
do jj = 1, 6
iy = jc - 3 + jj
! 这一行代码是核心:一次性完成 Z 插值并加上 Y 的权重
! 编译器会把 WC(jj, py) 存在寄存器里
res_line = res_line + WC(jj, py) * ( &
WC(1, pz) * funcc(ii, iy, kc-2) + &
WC(2, pz) * funcc(ii, iy, kc-1) + &
WC(3, pz) * funcc(ii, iy, kc ) + &
WC(4, pz) * funcc(ii, iy, kc+1) + &
WC(5, pz) * funcc(ii, iy, kc+2) + &
WC(6, pz) * funcc(ii, iy, kc+3) )
end do
tmp_xyz_line(ii) = res_line
end do
! 3. 【降维X 向】最后在最内层只处理 X 方向的 6 点加权
! 此时每个点的计算量从原来的 200+ 次乘法降到了仅 6 次
do i = imino, imaxo
px = pix(i)
ic = cix(i)
! 直接从预计算好的 line 中读取连续的 6 个点
! ic-2 到 ic+3 对应原始 6 点算子
funf(i,j,k) = WC(1,px)*tmp_xyz_line(ic-2) + &
WC(2,px)*tmp_xyz_line(ic-1) + &
WC(3,px)*tmp_xyz_line(ic ) + &
WC(4,px)*tmp_xyz_line(ic+1) + &
WC(5,px)*tmp_xyz_line(ic+2) + &
WC(6,px)*tmp_xyz_line(ic+3)
end do
end do
end do
#endif #endif
enddo
enddo
enddo
return return
end subroutine prolong3 end subroutine prolong3
@@ -2370,13 +2358,6 @@ end do
real*8,dimension(3) :: CD,FD real*8,dimension(3) :: CD,FD
real*8 :: tmp_xz_plane(-1:extf(1), 6)
real*8 :: tmp_x_line(-1:extf(1))
integer :: fi, fj, fk, ii, jj, kk
integer :: fi_min, fi_max, ii_lo, ii_hi
integer :: fj_min, fj_max, fk_min, fk_max, jj_lo, jj_hi, kk_lo, kk_hi
logical :: need_full_symmetry
if(wei.ne.3)then if(wei.ne.3)then
write(*,*)"prolongrestrict.f90::restrict3: this routine only surport 3 dimension" write(*,*)"prolongrestrict.f90::restrict3: this routine only surport 3 dimension"
write(*,*)"dim = ",wei write(*,*)"dim = ",wei
@@ -2455,86 +2436,9 @@ end do
stop stop
endif endif
! 仅计算 X 向最终写回所需的窗口: call symmetry_bd(2,extf,funf,funff,SoA)
! func(i,j,k) 只访问 tmp_x_line(fi-2:fi+3)
fi_min = 2*(imino + lbc(1) - 1) - 1 - lbf(1) + 1
fi_max = 2*(imaxo + lbc(1) - 1) - 1 - lbf(1) + 1
fj_min = 2*(jmino + lbc(2) - 1) - 1 - lbf(2) + 1
fj_max = 2*(jmaxo + lbc(2) - 1) - 1 - lbf(2) + 1
fk_min = 2*(kmino + lbc(3) - 1) - 1 - lbf(3) + 1
fk_max = 2*(kmaxo + lbc(3) - 1) - 1 - lbf(3) + 1
ii_lo = fi_min - 2
ii_hi = fi_max + 3
jj_lo = fj_min - 2
jj_hi = fj_max + 3
kk_lo = fk_min - 2
kk_hi = fk_max + 3
if(ii_lo < -1 .or. ii_hi > extf(1) .or. &
jj_lo < -1 .or. jj_hi > extf(2) .or. &
kk_lo < -1 .or. kk_hi > extf(3))then
write(*,*)"restrict3: invalid stencil window"
write(*,*)"ii=",ii_lo,ii_hi," jj=",jj_lo,jj_hi," kk=",kk_lo,kk_hi
write(*,*)"extf=",extf
stop
endif
need_full_symmetry = (ii_lo < 1) .or. (jj_lo < 1) .or. (kk_lo < 1)
if(need_full_symmetry)then
call symmetry_bd(2,extf,funf,funff,SoA)
else
funff(ii_lo:ii_hi,jj_lo:jj_hi,kk_lo:kk_hi) = funf(ii_lo:ii_hi,jj_lo:jj_hi,kk_lo:kk_hi)
endif
!~~~~~~> restriction start... !~~~~~~> restriction start...
do k = kmino, kmaxo
fk = 2*(k + lbc(3) - 1) - 1 - lbf(3) + 1
do j = jmino, jmaxo
fj = 2*(j + lbc(2) - 1) - 1 - lbf(2) + 1
! 优化点 1: 显式展开 Z 方向计算,减少循环开销
! 确保 ii 循环是最内层且连续访问
!DIR$ VECTOR ALWAYS
do ii = ii_lo, ii_hi
! 预计算当前 j 对应的 6 行在 Z 方向的压缩结果
! 这里直接硬编码 jj 的偏移,彻底消除一层循环
tmp_xz_plane(ii, 1) = C1*(funff(ii,fj-2,fk-2)+funff(ii,fj-2,fk+3)) + &
C2*(funff(ii,fj-2,fk-1)+funff(ii,fj-2,fk+2)) + &
C3*(funff(ii,fj-2,fk )+funff(ii,fj-2,fk+1))
tmp_xz_plane(ii, 2) = C1*(funff(ii,fj-1,fk-2)+funff(ii,fj-1,fk+3)) + &
C2*(funff(ii,fj-1,fk-1)+funff(ii,fj-1,fk+2)) + &
C3*(funff(ii,fj-1,fk )+funff(ii,fj-1,fk+1))
tmp_xz_plane(ii, 3) = C1*(funff(ii,fj ,fk-2)+funff(ii,fj ,fk+3)) + &
C2*(funff(ii,fj ,fk-1)+funff(ii,fj ,fk+2)) + &
C3*(funff(ii,fj ,fk )+funff(ii,fj ,fk+1))
tmp_xz_plane(ii, 4) = C1*(funff(ii,fj+1,fk-2)+funff(ii,fj+1,fk+3)) + &
C2*(funff(ii,fj+1,fk-1)+funff(ii,fj+1,fk+2)) + &
C3*(funff(ii,fj+1,fk )+funff(ii,fj+1,fk+1))
tmp_xz_plane(ii, 5) = C1*(funff(ii,fj+2,fk-2)+funff(ii,fj+2,fk+3)) + &
C2*(funff(ii,fj+2,fk-1)+funff(ii,fj+2,fk+2)) + &
C3*(funff(ii,fj+2,fk )+funff(ii,fj+2,fk+1))
tmp_xz_plane(ii, 6) = C1*(funff(ii,fj+3,fk-2)+funff(ii,fj+3,fk+3)) + &
C2*(funff(ii,fj+3,fk-1)+funff(ii,fj+3,fk+2)) + &
C3*(funff(ii,fj+3,fk )+funff(ii,fj+3,fk+1))
end do
! 优化点 2: 同样向量化 Y 方向压缩
!DIR$ VECTOR ALWAYS
do ii = ii_lo, ii_hi
tmp_x_line(ii) = C1*(tmp_xz_plane(ii, 1) + tmp_xz_plane(ii, 6)) + &
C2*(tmp_xz_plane(ii, 2) + tmp_xz_plane(ii, 5)) + &
C3*(tmp_xz_plane(ii, 3) + tmp_xz_plane(ii, 4))
end do
! 优化点 3: 最终写入,利用已经缓存在 tmp_x_line 的数据
do i = imino, imaxo
fi = 2*(i + lbc(1) - 1) - 1 - lbf(1) + 1
func(i, j, k) = C1*(tmp_x_line(fi-2) + tmp_x_line(fi+3)) + &
C2*(tmp_x_line(fi-1) + tmp_x_line(fi+2)) + &
C3*(tmp_x_line(fi ) + tmp_x_line(fi+1))
end do
end do
end do
#if 0
do k = kmino,kmaxo do k = kmino,kmaxo
do j = jmino,jmaxo do j = jmino,jmaxo
do i = imino,imaxo do i = imino,imaxo
@@ -2558,7 +2462,7 @@ end do
enddo enddo
enddo enddo
enddo enddo
#endif
return return
end subroutine restrict3 end subroutine restrict3

View File

@@ -1,212 +0,0 @@
#include "rungekutta4_rout.h"
#include <cstdio>
#include <cstdlib>
#include <cstddef>
#include <complex>
#include <immintrin.h>
namespace {
/**
 * First RK4 sub-step: f1[i] = f0[i] + c * frhs[i] for every i in [0, n).
 *
 * @param n     number of elements in each array
 * @param f0    input array (read only)
 * @param frhs  input array scaled by c (read only)
 * @param f1    output array, overwritten
 * @param c     scalar coefficient applied to frhs
 *
 * The three pointers are declared __restrict, so callers must pass
 * non-overlapping arrays.
 */
inline void rk4_stage0(std::size_t n,
                       const double *__restrict f0,
                       const double *__restrict frhs,
                       double *__restrict f1,
                       double c) {
    std::size_t i = 0;
#if defined(__AVX512F__)
    // Eight doubles per iteration; unaligned loads/stores, so no alignment
    // requirement is placed on the arrays.
    const __m512d vc = _mm512_set1_pd(c);
    for (; i + 7 < n; i += 8) {
        const __m512d v0 = _mm512_loadu_pd(f0 + i);
        const __m512d vr = _mm512_loadu_pd(frhs + i);
        _mm512_storeu_pd(f1 + i, _mm512_fmadd_pd(vc, vr, v0));
    }
#elif defined(__AVX2__) && defined(__FMA__)
    // _mm256_fmadd_pd belongs to the FMA extension, which is enabled
    // separately from AVX2; require both so an AVX2-only build falls back to
    // the scalar loop instead of failing to compile.
    const __m256d vc = _mm256_set1_pd(c);
    for (; i + 3 < n; i += 4) {
        const __m256d v0 = _mm256_loadu_pd(f0 + i);
        const __m256d vr = _mm256_loadu_pd(frhs + i);
        _mm256_storeu_pd(f1 + i, _mm256_fmadd_pd(vc, vr, v0));
    }
#endif
    // Scalar loop: handles the remainder after the SIMD loop, or the whole
    // range when no SIMD path is compiled in.
    // NOTE: written as a separate multiply and add, whereas the SIMD paths
    // use a fused multiply-add.
#pragma ivdep
    for (; i < n; ++i) {
        f1[i] = f0[i] + c * frhs[i];
    }
}
/**
 * Accumulate a doubled sub-step slope into the running right-hand side:
 * frhs[i] = frhs[i] + 2 * f1[i] for every i in [0, n).
 *
 * @param n     number of elements in each array
 * @param f1    input array (read only)
 * @param frhs  accumulator, updated in place
 *
 * Both pointers are declared __restrict, so the arrays must not overlap.
 */
inline void rk4_rhs_accum(std::size_t n,
                          const double *__restrict f1,
                          double *__restrict frhs) {
    std::size_t i = 0;
#if defined(__AVX512F__)
    const __m512d v2 = _mm512_set1_pd(2.0);
    for (; i + 7 < n; i += 8) {
        const __m512d v1 = _mm512_loadu_pd(f1 + i);
        const __m512d vrhs = _mm512_loadu_pd(frhs + i);
        _mm512_storeu_pd(frhs + i, _mm512_fmadd_pd(v2, v1, vrhs));
    }
#elif defined(__AVX2__) && defined(__FMA__)
    // _mm256_fmadd_pd needs the FMA extension in addition to AVX2; without
    // it the scalar loop below does all the work.
    const __m256d v2 = _mm256_set1_pd(2.0);
    for (; i + 3 < n; i += 4) {
        const __m256d v1 = _mm256_loadu_pd(f1 + i);
        const __m256d vrhs = _mm256_loadu_pd(frhs + i);
        _mm256_storeu_pd(frhs + i, _mm256_fmadd_pd(v2, v1, vrhs));
    }
#endif
    // Scalar remainder (or the full range when no SIMD path is compiled in).
#pragma ivdep
    for (; i < n; ++i) {
        frhs[i] = frhs[i] + 2.0 * f1[i];
    }
}
/**
 * In-place sub-step update: f1[i] = f0[i] + c * f1[i] for every i in [0, n).
 *
 * @param n   number of elements in each array
 * @param f0  input array (read only)
 * @param f1  array read as the slope and overwritten with the result
 * @param c   scalar coefficient applied to the old f1 values
 *
 * Both pointers are declared __restrict, so the arrays must not overlap.
 */
inline void rk4_f1_from_f0_f1(std::size_t n,
                              const double *__restrict f0,
                              double *__restrict f1,
                              double c) {
    std::size_t i = 0;
#if defined(__AVX512F__)
    const __m512d vc = _mm512_set1_pd(c);
    for (; i + 7 < n; i += 8) {
        const __m512d v0 = _mm512_loadu_pd(f0 + i);
        const __m512d v1 = _mm512_loadu_pd(f1 + i);
        _mm512_storeu_pd(f1 + i, _mm512_fmadd_pd(vc, v1, v0));
    }
#elif defined(__AVX2__) && defined(__FMA__)
    // _mm256_fmadd_pd needs the FMA extension in addition to AVX2; without
    // it the scalar loop below does all the work.
    const __m256d vc = _mm256_set1_pd(c);
    for (; i + 3 < n; i += 4) {
        const __m256d v0 = _mm256_loadu_pd(f0 + i);
        const __m256d v1 = _mm256_loadu_pd(f1 + i);
        _mm256_storeu_pd(f1 + i, _mm256_fmadd_pd(vc, v1, v0));
    }
#endif
    // Scalar remainder (or the full range when no SIMD path is compiled in).
#pragma ivdep
    for (; i < n; ++i) {
        f1[i] = f0[i] + c * f1[i];
    }
}
/**
 * Final RK4 combination: f1[i] = f0[i] + c * (f1[i] + frhs[i]) for every
 * i in [0, n).
 *
 * @param n     number of elements in each array
 * @param f0    input array (read only)
 * @param f1    array read as the last slope and overwritten with the result
 * @param frhs  accumulated right-hand side (read only)
 * @param c     scalar coefficient applied to the summed slopes
 *
 * All pointers are declared __restrict, so the arrays must not overlap.
 */
inline void rk4_stage3(std::size_t n,
                       const double *__restrict f0,
                       double *__restrict f1,
                       const double *__restrict frhs,
                       double c) {
    std::size_t i = 0;
#if defined(__AVX512F__)
    const __m512d vc = _mm512_set1_pd(c);
    for (; i + 7 < n; i += 8) {
        const __m512d v0 = _mm512_loadu_pd(f0 + i);
        const __m512d v1 = _mm512_loadu_pd(f1 + i);
        const __m512d vr = _mm512_loadu_pd(frhs + i);
        _mm512_storeu_pd(f1 + i, _mm512_fmadd_pd(vc, _mm512_add_pd(v1, vr), v0));
    }
#elif defined(__AVX2__) && defined(__FMA__)
    // _mm256_fmadd_pd needs the FMA extension in addition to AVX2; without
    // it the scalar loop below does all the work.
    const __m256d vc = _mm256_set1_pd(c);
    for (; i + 3 < n; i += 4) {
        const __m256d v0 = _mm256_loadu_pd(f0 + i);
        const __m256d v1 = _mm256_loadu_pd(f1 + i);
        const __m256d vr = _mm256_loadu_pd(frhs + i);
        _mm256_storeu_pd(f1 + i, _mm256_fmadd_pd(vc, _mm256_add_pd(v1, vr), v0));
    }
#endif
    // Scalar remainder (or the full range when no SIMD path is compiled in).
#pragma ivdep
    for (; i < n; ++i) {
        f1[i] = f0[i] + c * (f1[i] + frhs[i]);
    }
}
} // namespace
extern "C" {
/**
 * One RK4 sub-step for a single real value.
 *
 * @param dT     time step
 * @param f0     value at the start of the step (read only)
 * @param f1     in: current sub-step slope/value; out: updated value
 * @param f_rhs  in/out: right-hand side; stages 1 and 2 add 2*f1 to it
 * @param RK4    sub-step selector, 0..3; any other value aborts the program
 */
void f_rungekutta4_scalar(double &dT, double &f0, double &f1, double &f_rhs, int &RK4) {
    constexpr double kHalf = 0.5;
    constexpr double kTwo = 2.0;
    constexpr double kSixth = 1.0 / 6.0;

    if (RK4 == 0) {
        f1 = f0 + kHalf * dT * f_rhs;
    } else if (RK4 == 1) {
        // Accumulate before f1 is overwritten: the old f1 feeds both lines.
        f_rhs += kTwo * f1;
        f1 = f0 + kHalf * dT * f1;
    } else if (RK4 == 2) {
        f_rhs += kTwo * f1;
        f1 = f0 + dT * f1;
    } else if (RK4 == 3) {
        f1 = f0 + kSixth * dT * (f1 + f_rhs);
    } else {
        std::fprintf(stderr, "rungekutta4_scalar_c: invalid RK4 stage %d\n", RK4);
        std::abort();
    }
}
/**
 * One RK4 sub-step for a single complex value; same update rules as the
 * real scalar version, with a real time step.
 *
 * @param dT     time step (real)
 * @param f0     value at the start of the step (read only)
 * @param f1     in: current sub-step slope/value; out: updated value
 * @param f_rhs  in/out: right-hand side; stages 1 and 2 add 2*f1 to it
 * @param RK4    sub-step selector, 0..3; any other value aborts the program
 */
void rungekutta4_cplxscalar_(double &dT,
                             std::complex<double> &f0,
                             std::complex<double> &f1,
                             std::complex<double> &f_rhs,
                             int &RK4) {
    constexpr double kHalf = 0.5;
    constexpr double kTwo = 2.0;
    constexpr double kSixth = 1.0 / 6.0;

    if (RK4 == 0) {
        f1 = f0 + kHalf * dT * f_rhs;
    } else if (RK4 == 1) {
        // Accumulate before f1 is overwritten: the old f1 feeds both lines.
        f_rhs = f_rhs + kTwo * f1;
        f1 = f0 + kHalf * dT * f1;
    } else if (RK4 == 2) {
        f_rhs = f_rhs + kTwo * f1;
        f1 = f0 + dT * f1;
    } else if (RK4 == 3) {
        f1 = f0 + kSixth * dT * (f1 + f_rhs);
    } else {
        std::fprintf(stderr, "rungekutta4_cplxscalar_c: invalid RK4 stage %d\n", RK4);
        std::abort();
    }
}
/**
 * Apply one RK4 sub-step to a whole 3-D array stored contiguously.
 *
 * @param ex     three extents; the element count is ex[0]*ex[1]*ex[2]
 * @param dT     time step
 * @param f0     values at the start of the step (read only)
 * @param f1     in: current sub-step slopes/values; out: updated values
 * @param f_rhs  in/out: right-hand side; stages 1 and 2 add 2*f1 to it
 * @param RK4    sub-step selector, 0..3; any other value aborts the program
 * @return always 0
 *
 * The helpers take __restrict pointers, so f0, f1 and f_rhs must refer to
 * non-overlapping storage.
 */
int f_rungekutta4_rout(int *ex, double &dT,
                       double *f0, double *f1, double *f_rhs,
                       int &RK4) {
    std::size_t count = static_cast<std::size_t>(ex[0]);
    count *= static_cast<std::size_t>(ex[1]);
    count *= static_cast<std::size_t>(ex[2]);

    const int stage = RK4;
    // The unsigned comparison rejects negative selectors as well as > 3.
    if (__builtin_expect(static_cast<unsigned>(stage) > 3u, 0)) {
        std::fprintf(stderr, "rungekutta4_rout_c: invalid RK4 stage %d\n", RK4);
        std::abort();
    }

    if (stage == 0) {
        rk4_stage0(count, f0, f_rhs, f1, 0.5 * dT);
    } else if (stage == 1) {
        // The accumulation must see the old f1, so it runs first.
        rk4_rhs_accum(count, f1, f_rhs);
        rk4_f1_from_f0_f1(count, f0, f1, 0.5 * dT);
    } else if (stage == 2) {
        rk4_rhs_accum(count, f1, f_rhs);
        rk4_f1_from_f0_f1(count, f0, f1, dT);
    } else {
        // stage == 3: the range check above excludes everything else.
        rk4_stage3(count, f0, f1, f_rhs, (1.0 / 6.0) * dT);
    }
    return 0;
}
} // extern "C"

View File

@@ -1,372 +0,0 @@
#ifndef SHARE_FUNC_H
#define SHARE_FUNC_H
#include <stdlib.h>
#include <stddef.h>
#include <math.h>
#include <stdio.h>
#include <string.h>
/* Main grid: 0-based (i0, j0, k0) -> linear offset, first index fastest. */
static inline size_t idx_ex(int i0, int j0, int k0, const int ex[3]) {
    const size_t row = (size_t)ex[0];          /* elements per j-step */
    const size_t plane = row * (size_t)ex[1];  /* elements per k-step */
    return (size_t)i0 + (size_t)j0 * row + (size_t)k0 * plane;
}
/*
 * Linear offset into a buffer laid out like the Fortran array
 * fh(-1:ex1, -1:ex2, -1:ex3)   (ord = 2, index shift = 1).
 * iF/jF/kF are Fortran indices and may be -1, 0, 1..ex.
 */
static inline size_t idx_fh_F_ord2(int iF, int jF, int kF, const int ex[3]) {
    const size_t row = (size_t)(ex[0] + 2);          /* ex1 + ord */
    const size_t plane = row * (size_t)(ex[1] + 2);  /* (ex1 + ord) * (ex2 + ord) */
    /* Adding 1 maps each Fortran index onto 0..ex+1. */
    return (size_t)(iF + 1) + (size_t)(jF + 1) * row + (size_t)(kF + 1) * plane;
}
/*
 * Linear offset into a buffer laid out like the Fortran array
 * fh(-2:ex1, -2:ex2, -2:ex3)   (ord = 3, index shift = 2).
 * iF/jF/kF are Fortran indices and may be negative.
 */
static inline size_t idx_fh_F(int iF, int jF, int kF, const int ex[3]) {
    const size_t row = (size_t)(ex[0] + 3);          /* ex1 + ord */
    const size_t plane = row * (size_t)(ex[1] + 3);  /* (ex1 + ord) * (ex2 + ord) */
    /* Adding 2 maps each Fortran index onto 0..ex+2. */
    return (size_t)(iF + 2) + (size_t)(jF + 2) * row + (size_t)(kF + 2) * plane;
}
/*
 * Linear offset into a buffer laid out like the Fortran array
 * fh(0:ex1, 0:ex2, 0:ex3)   (ord = 1, no index shift).
 * iF/jF/kF are Fortran indices in 0..ex.
 */
static inline size_t idx_fh_F_ord1(int iF, int jF, int kF, const int ex[3]) {
    const size_t row = (size_t)(ex[0] + 1);          /* ex1 + ord */
    const size_t plane = row * (size_t)(ex[1] + 1);  /* (ex1 + ord) * (ex2 + ord) */
    return (size_t)iF + (size_t)jF * row + (size_t)kF * plane;
}
/*
 * Linear offset into a buffer laid out like the Fortran array
 * fh(-3:ex1, -3:ex2, -3:ex3)   (ord = 4, index shift = 3).
 */
static inline size_t idx_fh_F_ord4(int iF, int jF, int kF, const int ex[3]) {
    const size_t row = (size_t)(ex[0] + 4);          /* ex1 + ord */
    const size_t plane = row * (size_t)(ex[1] + 4);  /* (ex1 + ord) * (ex2 + ord) */
    /* Adding 3 maps each Fortran index onto 0..ex+3. */
    return (size_t)(iF + 3) + (size_t)(jF + 3) * row + (size_t)(kF + 3) * plane;
}
/*
 * Linear offset into a buffer laid out like the Fortran array
 * fh(-4:ex1, -4:ex2, -4:ex3)   (ord = 5, index shift = 4).
 */
static inline size_t idx_fh_F_ord5(int iF, int jF, int kF, const int ex[3]) {
    const size_t row = (size_t)(ex[0] + 5);          /* ex1 + ord */
    const size_t plane = row * (size_t)(ex[1] + 5);  /* (ex1 + ord) * (ex2 + ord) */
    /* Adding 4 maps each Fortran index onto 0..ex+4. */
    return (size_t)(iF + 4) + (size_t)(jF + 4) * row + (size_t)(kF + 4) * plane;
}
/*
 * func:  (1..extc1, 1..extc2, 1..extc3), 1-based in Fortran
 * funcc: (-ord+1..extc1, -ord+1..extc2, -ord+1..extc3) in Fortran
 *
 * On the C side:
 *   func is addressed 0-based: i0 = 0..extc1-1, j0 = 0..extc2-1,
 *   k0 = 0..extc3-1.
 *   funcc is stored as a flat array with shifted indices:
 *     iF in [-ord+1..extc1] -> ii = iF + (ord-1) in [0..extc1+ord-1]
 *     row length nx = extc1 + ord
 *     likewise ny = extc2 + ord, nz = extc3 + ord
 */
static inline size_t idx_func0(int i0, int j0, int k0, const int extc[3]) {
    const size_t row = (size_t)extc[0];
    const size_t plane = row * (size_t)extc[1];
    return (size_t)i0 + (size_t)j0 * row + (size_t)k0 * plane;
}
/*
 * Linear offset into funcc for Fortran indices (iF, jF, kF), where each
 * index runs over [-(ord-1) .. extc] and the first index varies fastest.
 */
static inline size_t idx_funcc_F(int iF, int jF, int kF, int ord, const int extc[3]) {
    const int pad = ord - 1;                            /* lowest Fortran index is -pad */
    const size_t row = (size_t)(extc[0] + ord);         /* extc1 + ord entries per row */
    const size_t plane = row * (size_t)(extc[1] + ord);
    return (size_t)(iF + pad) + (size_t)(jF + pad) * row + (size_t)(kF + pad) * plane;
}
/*
 * Equivalent to the Fortran:
 *   funcc(1:extc1,1:extc2,1:extc3)=func
 *   do i=0,ord-1
 *     funcc(-i,1:extc2,1:extc3) = funcc(i+1,1:extc2,1:extc3)*SoA(1)
 *   enddo
 *   do i=0,ord-1
 *     funcc(:,-i,1:extc3) = funcc(:,i+1,1:extc3)*SoA(2)
 *   enddo
 *   do i=0,ord-1
 *     funcc(:,:,-i) = funcc(:,:,i+1)*SoA(3)
 *   enddo
 */

/*
 * Private helper: write dst[q*stride] from src[q*stride] for q in [0, count),
 * scaled by `sign`. The factors +1 and -1 are special-cased as a plain copy
 * and a negation; any other value is applied as a multiplication.
 */
static inline void symmetry_bd_mirror_run(double *dst, const double *src,
                                          size_t count, size_t stride, double sign)
{
    if (sign == 1.0) {
        for (size_t q = 0; q < count; ++q) dst[q * stride] = src[q * stride];
    } else if (sign == -1.0) {
        for (size_t q = 0; q < count; ++q) dst[q * stride] = -src[q * stride];
    } else {
        for (size_t q = 0; q < count; ++q) dst[q * stride] = src[q * stride] * sign;
    }
}

/*
 * Copy func into the interior of funcc and fill the `ord` ghost layers on
 * the low side of each axis by mirroring, scaled by SoA[0..2].
 *
 *   ord    number of ghost layers per axis
 *   shift  storage offset of Fortran index 0 (the visible caller passes ord-1)
 *   extc   interior extents {extc1, extc2, extc3}
 *   func   source, extc1*extc2*extc3 values, first index fastest
 *   funcc  destination, rows of extc1+ord and planes of
 *          (extc1+ord)*(extc2+ord) values
 *   SoA    per-axis factor applied to the mirrored values
 */
static inline void symmetry_bd_impl(int ord,
                                    int shift,
                                    const int extc[3],
                                    const double *__restrict func,
                                    double *__restrict funcc,
                                    const double SoA[3])
{
    const int n1 = extc[0], n2 = extc[1], n3 = extc[2];
    const int padded1 = n1 + ord;
    const int padded2 = n2 + ord;
    const size_t row = (size_t)padded1;
    const size_t plane = (size_t)padded1 * (size_t)padded2;
    const size_t first = (size_t)shift + 1u; /* storage position of Fortran index 1 */

    /* Non-positive trip counts behave like the empty int loops they replace. */
    const size_t run_y = n2 > 0 ? (size_t)n2 : 0u;
    const size_t run_x = padded1 > 0 ? (size_t)padded1 : 0u;

    /* 1) funcc(1:extc1,1:extc2,1:extc3) = func, one contiguous row at a time */
    for (int k = 0; k < n3; ++k) {
        for (int j = 0; j < n2; ++j) {
            const double *from = func + ((size_t)k * (size_t)n2 + (size_t)j) * (size_t)n1;
            double *to = funcc + (first + (size_t)k) * plane
                               + (first + (size_t)j) * row + first;
            memcpy(to, from, (size_t)n1 * sizeof(double));
        }
    }

    /* 2) funcc(-i,1:extc2,1:extc3) = funcc(i+1,1:extc2,1:extc3)*SoA(1) */
    const double sx = SoA[0];
    for (int g = 0; g < ord; ++g) {
        const size_t ghost = (size_t)(shift - g);
        const size_t mirror = (size_t)(shift + g + 1);
        for (int k = 0; k < n3; ++k) {
            /* Start of interior row j = 1 in interior plane k; stepping by
             * `row` walks the same column through j = 1..extc2. */
            double *line = funcc + (first + (size_t)k) * plane + first * row;
            symmetry_bd_mirror_run(line + ghost, line + mirror, run_y, row, sx);
        }
    }

    /* 3) funcc(:,-j,1:extc3) = funcc(:,j+1,1:extc3)*SoA(2)
     *    Full rows, so the x ghosts written in step 2 are mirrored as well. */
    const double sy = SoA[1];
    for (int g = 0; g < ord; ++g) {
        const size_t ghost = (size_t)(shift - g) * row;
        const size_t mirror = (size_t)(shift + g + 1) * row;
        for (int k = 0; k < n3; ++k) {
            double *slab = funcc + (first + (size_t)k) * plane;
            symmetry_bd_mirror_run(slab + ghost, slab + mirror, run_x, 1u, sy);
        }
    }

    /* 4) funcc(:,:,-k) = funcc(:,:,k+1)*SoA(3): whole planes at once */
    const double sz = SoA[2];
    for (int g = 0; g < ord; ++g) {
        symmetry_bd_mirror_run(funcc + (size_t)(shift - g) * plane,
                               funcc + (size_t)(shift + g + 1) * plane,
                               plane, 1u, sz);
    }
}
/*
 * Fill funcc from func with `ord` mirrored ghost layers on the low side of
 * each axis. Does nothing when ord <= 0.
 *
 * Orders 1..4 are dispatched with literal arguments; every other positive
 * order takes the generic call with the same (ord, ord-1) pairing.
 */
static inline void symmetry_bd(int ord,
                               const int extc[3],
                               const double *func,
                               double *funcc,
                               const double SoA[3])
{
    switch (ord) {
    case 1:
        symmetry_bd_impl(1, 0, extc, func, funcc, SoA);
        break;
    case 2:
        symmetry_bd_impl(2, 1, extc, func, funcc, SoA);
        break;
    case 3:
        symmetry_bd_impl(3, 2, extc, func, funcc, SoA);
        break;
    case 4:
        symmetry_bd_impl(4, 3, extc, func, funcc, SoA);
        break;
    default:
        if (ord > 0) {
            symmetry_bd_impl(ord, ord - 1, extc, func, funcc, SoA);
        }
        break;
    }
}
/*
 * symmetry_stbd — shell-patch (staggered boundary) ghost fill.
 *
 * Fortran: funcc(-ord+1:extc1+ord, -ord+1:extc2+ord, extc3)
 * Only 2 SoA values (x/y). No z symmetry fill.
 * Ghost on BOTH positive and negative sides of x and y.
 * Reflection uses i+2 (skips boundary) instead of i+1.
 * nx = extc1 + 2*ord, ny = extc2 + 2*ord
 *
 *   ord    number of ghost layers on each side of x and y
 *   extc   interior extents {extc1, extc2, extc3}
 *   func   source, extc1*extc2*extc3 values, first index fastest
 *   funcc  destination, nx*ny*extc3 values; every element is written
 *   SoA    factors applied to the mirrored values in x (SoA[0]) and y (SoA[1])
 */
static inline void symmetry_stbd(int ord,
                                 const int extc[3],
                                 const double *func,
                                 double *funcc,
                                 const double SoA[2])
{
    const int extc1 = extc[0], extc2 = extc[1], extc3 = extc[2];
    const int nx = extc1 + 2 * ord;
    const int ny = extc2 + 2 * ord;
    const int sh = ord - 1; /* storage offset of Fortran index 0 */
    const size_t snx = (size_t)nx;
    const size_t splane = snx * (size_t)ny;

    /* 1) Copy interior: funcc(1:extc1, 1:extc2, 1:extc3) = func */
    for (int k0 = 0; k0 < extc3; ++k0) {
        const double *src = func + (size_t)k0 * (size_t)extc2 * (size_t)extc1;
        const size_t kbase = (size_t)k0 * splane;
        for (int j0 = 0; j0 < extc2; ++j0) {
            double *dst = funcc + kbase + (size_t)(sh + j0 + 1) * snx + (size_t)(sh + 1);
            const double *s = src + (size_t)j0 * (size_t)extc1;
            for (int i0 = 0; i0 < extc1; ++i0) dst[i0] = s[i0];
        }
    }

    /* 2) x-direction ghost fill, interior rows only */
    const double s1 = SoA[0];
    for (int k0 = 0; k0 < extc3; ++k0) {
        const size_t kbase = (size_t)k0 * splane;
        for (int j0 = 0; j0 < extc2; ++j0) {
            const size_t off = kbase + (size_t)(sh + j0 + 1) * snx;
            for (int i = 0; i < ord; ++i) {
                /* left side: funcc(-i) = funcc(i+2) * s1 */
                funcc[off + (size_t)(sh - i)] = funcc[off + (size_t)(sh + i + 2)] * s1;
                /* right side: funcc(extc1+1+i) = funcc(extc1-1-i) * s1 */
                funcc[off + (size_t)(sh + extc1 + 1 + i)] = funcc[off + (size_t)(sh + extc1 - 1 - i)] * s1;
            }
        }
    }

    /* 3) y-direction ghost fill over full rows (x ghosts included).
     *    The contiguous index i is the innermost loop so each ghost row is
     *    written with unit stride. Within any single column the writes still
     *    happen in the order (jj ascending; bottom before top), and no write
     *    reads from a different column, so the result does not depend on the
     *    position of the i loop. */
    const double s2 = SoA[1];
    for (int k0 = 0; k0 < extc3; ++k0) {
        const size_t kbase = (size_t)k0 * splane;
        for (int jj = 0; jj < ord; ++jj) {
            /* bottom: funcc(:,-jj,:) = funcc(:,jj+2,:) * s2 */
            const size_t lo_dst = kbase + (size_t)(sh - jj) * snx;
            const size_t lo_src = kbase + (size_t)(sh + jj + 2) * snx;
            /* top: funcc(:,extc2+1+jj,:) = funcc(:,extc2-1-jj,:) * s2 */
            const size_t hi_dst = kbase + (size_t)(sh + extc2 + 1 + jj) * snx;
            const size_t hi_src = kbase + (size_t)(sh + extc2 - 1 - jj) * snx;
            for (int i = 0; i < nx; ++i) {
                funcc[lo_dst + (size_t)i] = funcc[lo_src + (size_t)i] * s2;
            }
            for (int i = 0; i < nx; ++i) {
                funcc[hi_dst + (size_t)i] = funcc[hi_src + (size_t)i] * s2;
            }
        }
    }
}
/*
 * Indexing for the shell fh buffer:
 *   Fortran fh(-ord+1:extc1+ord, -ord+1:extc2+ord, extc3)
 *   C 0-based: ii = iF + ord - 1
 *   nx = extc1 + 2*ord, ny = extc2 + 2*ord
 * The third index has no ghost layers and is simply converted from
 * 1-based to 0-based.
 */
static inline size_t idx_fh_stbd(int iF, int jF, int kF, int ord, const int extc[3]) {
    const int pad = ord - 1;
    const size_t row = (size_t)(extc[0] + 2 * ord);
    const size_t plane = row * (size_t)(extc[1] + 2 * ord);
    return (size_t)(iF + pad) + (size_t)(jF + pad) * row + (size_t)(kF - 1) * plane;
}
#endif

File diff suppressed because it is too large Load Diff

View File

@@ -36,11 +36,6 @@ private:
double *nx_g, *ny_g, *nz_g; // global list of unit normals double *nx_g, *ny_g, *nz_g; // global list of unit normals
int myrank, cpusize; int myrank, cpusize;
int wave_cache_spinw, wave_cache_maxl, wave_cache_modes;
double *wave_theta_pos, *wave_theta_neg;
double *wave_phi_cos, *wave_phi_sin;
void clear_wave_cache();
void build_wave_cache(int spinw, int maxl);
public: public:
surface_integral(int iSymmetry); surface_integral(int iSymmetry);
@@ -87,29 +82,13 @@ public:
var *Axx, var *Axy, var *Axz, var *Ayy, var *Ayz, var *Azz, var *Axx, var *Axy, var *Axz, var *Ayy, var *Ayz, var *Azz,
var *Gmx, var *Gmy, var *Gmz, var *Gmx, var *Gmy, var *Gmz,
var *Sfx_rhs, var *Sfy_rhs, var *Sfz_rhs, var *Sfx_rhs, var *Sfy_rhs, var *Sfz_rhs,
double *Rout, monitor *Monitor, bool refresh_mass_fields = true); double *Rout, monitor *Monitor);
void surf_MassPAng(double rex, int lev, ShellPatch *GH, var *chi, var *trK, void surf_MassPAng(double rex, int lev, ShellPatch *GH, var *chi, var *trK,
var *gxx, var *gxy, var *gxz, var *gyy, var *gyz, var *gzz, var *gxx, var *gxy, var *gxz, var *gyy, var *gyz, var *gzz,
var *Axx, var *Axy, var *Axz, var *Ayy, var *Ayz, var *Azz, var *Axx, var *Axy, var *Axz, var *Ayy, var *Ayz, var *Azz,
var *Gmx, var *Gmy, var *Gmz, var *Gmx, var *Gmy, var *Gmz,
var *Sfx_rhs, var *Sfy_rhs, var *Sfz_rhs, var *Sfx_rhs, var *Sfy_rhs, var *Sfz_rhs,
double *Rout, monitor *Monitor, bool refresh_mass_fields = true); double *Rout, monitor *Monitor);
void surf_WaveMassPAng(double rex, int lev, cgh *GH,
var *Rpsi4, var *Ipsi4, int spinw, int maxl, int NN, double *RP, double *IP,
var *chi, var *trK,
var *gxx, var *gxy, var *gxz, var *gyy, var *gyz, var *gzz,
var *Axx, var *Axy, var *Axz, var *Ayy, var *Ayz, var *Azz,
var *Gmx, var *Gmy, var *Gmz,
var *Sfx_rhs, var *Sfy_rhs, var *Sfz_rhs,
double *Rout, monitor *Monitor, bool refresh_mass_fields = true);
void surf_WaveMassPAng(double rex, int lev, ShellPatch *GH,
var *Rpsi4, var *Ipsi4, int spinw, int maxl, int NN, double *RP, double *IP,
var *chi, var *trK,
var *gxx, var *gxy, var *gxz, var *gyy, var *gyz, var *gzz,
var *Axx, var *Axy, var *Axz, var *Ayy, var *Ayz, var *Azz,
var *Gmx, var *Gmy, var *Gmz,
var *Sfx_rhs, var *Sfy_rhs, var *Sfz_rhs,
double *Rout, monitor *Monitor, bool refresh_mass_fields = true);
void surf_Wave(double rex, cgh *GH, ShellPatch *SH, void surf_Wave(double rex, cgh *GH, ShellPatch *SH,
var *chi, var *trK, var *chi, var *trK,
var *gxx, var *gxy, var *gxz, var *gyy, var *gyz, var *gzz, var *gxx, var *gxy, var *gxz, var *gyy, var *gyz, var *gzz,
@@ -136,7 +115,7 @@ public:
var *Axx, var *Axy, var *Axz, var *Ayy, var *Ayz, var *Azz, var *Axx, var *Axy, var *Axz, var *Ayy, var *Ayz, var *Azz,
var *Gmx, var *Gmy, var *Gmz, var *Gmx, var *Gmy, var *Gmz,
var *Sfx_rhs, var *Sfy_rhs, var *Sfz_rhs, // temparay memory for mass^i var *Sfx_rhs, var *Sfy_rhs, var *Sfz_rhs, // temparay memory for mass^i
double *Rout, monitor *Monitor, MPI_Comm Comm_here, bool refresh_mass_fields = true); double *Rout, monitor *Monitor, MPI_Comm Comm_here);
void surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *Ipsi4, void surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *Ipsi4,
int spinw, int maxl, int NN, double *RP, double *IP, int spinw, int maxl, int NN, double *RP, double *IP,
monitor *Monitor, MPI_Comm Comm_here); monitor *Monitor, MPI_Comm Comm_here);

View File

@@ -1,33 +0,0 @@
#include "share_func.h"
/*
 * Prototypes only; the definitions are not part of this header.
 * In every routine `ex` holds three grid extents and X, Y, Z are read-only
 * arrays. NOTE(review): X/Y/Z are presumably the coordinate arrays of the
 * three axes -- confirm against the definitions.
 */

/* Reads f and writes six output arrays fxx..fzz.
 * NOTE(review): by the names these look like the six second partial
 * derivatives of f -- confirm against the definition. */
void fdderivs(const int ex[3],
const double *f,
double *fxx, double *fxy, double *fxz,
double *fyy, double *fyz, double *fzz,
const double *X, const double *Y, const double *Z,
double SYM1, double SYM2, double SYM3,
int Symmetry, int onoff);
/* Reads f and writes three output arrays fx, fy, fz.
 * NOTE(review): presumably the first partial derivatives of f -- confirm. */
void fderivs(const int ex[3],
const double *f,
double *fx, double *fy, double *fz,
const double *X, const double *Y, const double *Z,
double SYM1, double SYM2, double SYM3,
int Symmetry, int onoff);
/* Reads f and updates f_rhs; takes per-axis factors SoA[3] and a scalar eps.
 * NOTE(review): presumably a dissipation term of strength eps -- confirm. */
void kodis(const int ex[3],
const double *X, const double *Y, const double *Z,
const double *f, double *f_rhs,
const double SoA[3],
int Symmetry, double eps);
/* Reads f and the three arrays Sfx, Sfy, Sfz and updates f_rhs.
 * NOTE(review): presumably an advection term along (Sfx, Sfy, Sfz) -- confirm. */
void lopsided(const int ex[3],
const double *X, const double *Y, const double *Z,
const double *f, double *f_rhs,
const double *Sfx, const double *Sfy, const double *Sfz,
int Symmetry, const double SoA[3]);
/* Same argument list as lopsided() plus the eps argument of kodis().
 * NOTE(review): presumably applies both updates in one call -- confirm. */
void lopsided_kodis(const int ex[3],
const double *X, const double *Y, const double *Z,
const double *f, double *f_rhs,
const double *Sfx, const double *Sfy, const double *Sfz,
int Symmetry, const double SoA[3], double eps);

View File

@@ -1,901 +0,0 @@
#include "macrodef.h"
#include "bssn_rhs.h"
#include "fmisc.h"
#include "ricci_gamma.h"
#include "share_func.h"
#include "tool.h"
#include <vector>
// Map the f_* names used in this file onto the external symbol spelling
// selected by the fortran1 / fortran2 / fortran3 macros:
//   fortran1 -> lower case, no decoration
//   fortran2 -> upper case
//   fortran3 -> lower case with a trailing underscore
// If none of the three is defined, the f_* names are left unchanged.
#ifdef fortran1
#define f_constraint_bssn constraint_bssn
#define f_z4c_rhs_point z4c_rhs_point
#endif
#ifdef fortran2
#define f_constraint_bssn CONSTRAINT_BSSN
#define f_z4c_rhs_point Z4C_RHS_POINT
#endif
#ifdef fortran3
#define f_constraint_bssn constraint_bssn_
#define f_z4c_rhs_point z4c_rhs_point_
#endif
// C-linkage prototype of the external constraint routine: one int* first,
// then double* arrays, and an int passed by reference last. The parameters
// are unnamed here, so their meaning has to be taken from the routine's
// definition (not visible in this file section).
extern "C" void f_constraint_bssn(int *, double *, double *, double *,
double *, double *,
double *, double *, double *, double *, double *, double *,
double *, double *, double *, double *, double *, double *,
double *, double *, double *,
double *, double *, double *, double *, double *, double *, double *, double *, double *, double *,
double *, double *, double *, double *, double *, double *,
double *, double *, double *, double *, double *, double *,
double *, double *, double *, double *, double *, double *,
double *, double *, double *, double *, double *, double *, double *, double *,
double *, double *, double *,
int &);
/*
 * Point-wise Z4c right-hand-side kernel (C linkage, symbol chosen by the
 * fortranN macros).  Every argument is a scalar passed by reference; the
 * parameters are in alphabetical order.  Index conventions below are the
 * ones used by the call in compute_rhs_z4c_cartesian():
 *   - for derivative inputs the leading index/indices name the derivative
 *     direction(s), the trailing ones the tensor component;
 *   - names starting with 'r' receive the right-hand sides.
 */
extern "C" void f_z4c_rhs_point(
    /* trace-free extrinsic curvature A_ij */
    double &A11, double &A12, double &A13, double &A22, double &A23, double &A33,
    /* lapse, shift time-derivative variable B^i, shift beta^i */
    double &alpha,
    double &B1, double &B2, double &B3,
    double &beta1, double &beta2, double &beta3,
    /* conformal factor and its floor */
    double &chi, double &chiDivFloor,
    /* first derivatives of the lapse (da_k) interleaved with d_k A_ij */
    double &da1,
    double &dA111, double &dA112, double &dA113, double &dA122, double &dA123, double &dA133,
    double &da2,
    double &dA211, double &dA212, double &dA213, double &dA222, double &dA223, double &dA233,
    double &da3,
    double &dA311, double &dA312, double &dA313, double &dA322, double &dA323, double &dA333,
    /* d_k beta^i (db_ki) interleaved with d_k B^i (dB_ki) */
    double &db11, double &dB11, double &db12, double &dB12, double &db13, double &dB13,
    double &db21, double &dB21, double &db22, double &dB22, double &db23, double &dB23,
    double &db31, double &dB31, double &db32, double &dB32, double &db33, double &dB33,
    /* first derivatives of chi */
    double &dchi1, double &dchi2, double &dchi3,
    /* second derivatives of the lapse */
    double &dda11, double &dda12, double &dda13, double &dda22, double &dda23, double &dda33,
    /* second derivatives of the shift: ddb_{kl i} */
    double &ddb111, double &ddb112, double &ddb113,
    double &ddb121, double &ddb122, double &ddb123,
    double &ddb131, double &ddb132, double &ddb133,
    double &ddb221, double &ddb222, double &ddb223,
    double &ddb231, double &ddb232, double &ddb233,
    double &ddb331, double &ddb332, double &ddb333,
    /* second derivatives of chi */
    double &ddchi11, double &ddchi12, double &ddchi13,
    double &ddchi22, double &ddchi23, double &ddchi33,
    /* second derivatives of the metric: deldelg_{kl ij} */
    double &deldelg1111, double &deldelg1112, double &deldelg1113,
    double &deldelg1122, double &deldelg1123, double &deldelg1133,
    double &deldelg1211, double &deldelg1212, double &deldelg1213,
    double &deldelg1222, double &deldelg1223, double &deldelg1233,
    double &deldelg1311, double &deldelg1312, double &deldelg1313,
    double &deldelg1322, double &deldelg1323, double &deldelg1333,
    double &deldelg2211, double &deldelg2212, double &deldelg2213,
    double &deldelg2222, double &deldelg2223, double &deldelg2233,
    double &deldelg2311, double &deldelg2312, double &deldelg2313,
    double &deldelg2322, double &deldelg2323, double &deldelg2333,
    double &deldelg3311, double &deldelg3312, double &deldelg3313,
    double &deldelg3322, double &deldelg3323, double &deldelg3333,
    /* d_k Gamma^i (delG_ki) interleaved with d_k g_ij (delg_kij) */
    double &delG11, double &delg111, double &delg112, double &delg113,
    double &delG12, double &delg122, double &delg123,
    double &delG13, double &delg133,
    double &delG21, double &delg211, double &delg212, double &delg213,
    double &delG22, double &delg222, double &delg223,
    double &delG23, double &delg233,
    double &delG31, double &delg311, double &delg312, double &delg313,
    double &delG32, double &delg322, double &delg323,
    double &delG33, double &delg333,
    /* first derivatives of Khat and Theta */
    double &dKhat1, double &dKhat2, double &dKhat3,
    double &dTheta1, double &dTheta2, double &dTheta3,
    /* connection functions Gamma^i interleaved with the metric g_ij */
    double &G1, double &g11, double &g12, double &g13,
    double &G2, double &g22, double &g23,
    double &G3, double &g33,
    /* damping parameters and Khat */
    double &kappa1, double &kappa2,
    double &Khat,
    /* right-hand sides */
    double &rA11, double &rA12, double &rA13, double &rA22, double &rA23, double &rA33,
    double &rchi,
    double &rG1, double &rg11, double &rg12, double &rg13,
    double &rG2, double &rg22, double &rg23,
    double &rG3, double &rg33,
    double &rKhat,
    double &rTheta,
    /* Theta itself comes last in the alphabetical ordering */
    double &Theta);
/*
 * Compute the contracted Christoffel symbols Gam^i = g^{jk} Gamma^i_{jk} of a
 * symmetric 3-metric at a single point.
 *
 *   gxx..gzz       the six independent metric components
 *   g<ij>x/y/z     first derivatives; the LAST letter is the derivative
 *                  direction (e.g. gxyx = d_x g_xy)
 *   Gamxa/ya/za    outputs: the x, y and z components of the contraction
 *
 * The metric determinant is not checked; a singular metric yields inf/nan.
 */
static inline void z4c_contract_gamma(
    const double gxx, const double gxy, const double gxz,
    const double gyy, const double gyz, const double gzz,
    const double gxxx, const double gxyx, const double gxzx,
    const double gyyx, const double gyzx, const double gzzx,
    const double gxxy, const double gxyy, const double gxzy,
    const double gyyy, const double gyzy, const double gzzy,
    const double gxxz, const double gxyz, const double gxzz,
    const double gyyz, const double gyzz, const double gzzz,
    double &Gamxa, double &Gamya, double &Gamza)
{
    /* Determinant of the 3x3 symmetric metric. */
    const double metric_det = gxx * gyy * gzz + gxy * gyz * gxz + gxz * gxy * gyz -
                              gxz * gyy * gxz - gxy * gxy * gzz - gxx * gyz * gyz;

    /* Inverse metric from the cofactors. */
    const double ixx = (gyy * gzz - gyz * gyz) / metric_det;
    const double ixy = -(gxy * gzz - gyz * gxz) / metric_det;
    const double ixz = (gxy * gyz - gyy * gxz) / metric_det;
    const double iyy = (gxx * gzz - gxz * gxz) / metric_det;
    const double iyz = -(gxx * gyz - gxy * gxz) / metric_det;
    const double izz = (gxx * gyy - gxy * gxy) / metric_det;

    /*
     * Index-lowered combinations (twice the Christoffel symbols of the first
     * kind) that are not a single derivative.  Naming: low_<l>_<jk>.
     */
    const double low_y_xx = 2.0 * gxyx - gxxy;
    const double low_z_xx = 2.0 * gxzx - gxxz;
    const double low_x_yy = 2.0 * gxyy - gyyx;
    const double low_z_yy = 2.0 * gyzy - gyyz;
    const double low_x_zz = 2.0 * gxzz - gzzx;
    const double low_y_zz = 2.0 * gyzz - gzzy;
    const double low_z_xy = gxzy + gyzx - gxyz;
    const double low_y_xz = gxyz + gyzx - gxzy;
    const double low_x_yz = gxyz + gxzy - gyzx;

    /* Gamma^i_{xx} */
    const double cx_xx = 0.5 * (ixx * gxxx + ixy * low_y_xx + ixz * low_z_xx);
    const double cy_xx = 0.5 * (ixy * gxxx + iyy * low_y_xx + iyz * low_z_xx);
    const double cz_xx = 0.5 * (ixz * gxxx + iyz * low_y_xx + izz * low_z_xx);
    /* Gamma^i_{yy} */
    const double cx_yy = 0.5 * (ixx * low_x_yy + ixy * gyyy + ixz * low_z_yy);
    const double cy_yy = 0.5 * (ixy * low_x_yy + iyy * gyyy + iyz * low_z_yy);
    const double cz_yy = 0.5 * (ixz * low_x_yy + iyz * gyyy + izz * low_z_yy);
    /* Gamma^i_{zz} */
    const double cx_zz = 0.5 * (ixx * low_x_zz + ixy * low_y_zz + ixz * gzzz);
    const double cy_zz = 0.5 * (ixy * low_x_zz + iyy * low_y_zz + iyz * gzzz);
    const double cz_zz = 0.5 * (ixz * low_x_zz + iyz * low_y_zz + izz * gzzz);
    /* Gamma^i_{xy} */
    const double cx_xy = 0.5 * (ixx * gxxy + ixy * gyyx + ixz * low_z_xy);
    const double cy_xy = 0.5 * (ixy * gxxy + iyy * gyyx + iyz * low_z_xy);
    const double cz_xy = 0.5 * (ixz * gxxy + iyz * gyyx + izz * low_z_xy);
    /* Gamma^i_{xz} */
    const double cx_xz = 0.5 * (ixx * gxxz + ixy * low_y_xz + ixz * gzzx);
    const double cy_xz = 0.5 * (ixy * gxxz + iyy * low_y_xz + iyz * gzzx);
    const double cz_xz = 0.5 * (ixz * gxxz + iyz * low_y_xz + izz * gzzx);
    /* Gamma^i_{yz} */
    const double cx_yz = 0.5 * (ixx * low_x_yz + ixy * gyyz + ixz * gzzy);
    const double cy_yz = 0.5 * (ixy * low_x_yz + iyy * gyyz + iyz * gzzy);
    const double cz_yz = 0.5 * (ixz * low_x_yz + iyz * gyyz + izz * gzzy);

    /* Contract with the inverse metric; off-diagonal pairs count twice. */
    Gamxa = ixx * cx_xx + iyy * cx_yy + izz * cx_zz +
            2.0 * (ixy * cx_xy + ixz * cx_xz + iyz * cx_yz);
    Gamya = ixx * cy_xx + iyy * cy_yy + izz * cy_zz +
            2.0 * (ixy * cy_xy + ixz * cy_xz + iyz * cy_yz);
    Gamza = ixx * cz_xx + iyy * cz_yy + izz * cz_zz +
            2.0 * (ixy * cz_xy + ixz * cz_xz + iyz * cz_yz);
}
#if (GAUGE == 6 || GAUGE == 7)
/*
 * External routine that fills the number of black holes, their positions
 * (three doubles per hole) and their masses.  Declared at namespace scope:
 * a linkage specification is not allowed inside a function body.
 */
#ifdef fortran1
extern "C" void getpbh(int &, double *, double *);
#elif defined(fortran2)
extern "C" void GETPBH(int &, double *, double *);
#else
extern "C" void getpbh_(int &, double *, double *);
#endif
#endif

#if (GAUGE == 2 || GAUGE == 3 || GAUGE == 4 || GAUGE == 5)
/*
 * Point-wise variable damping coefficient for the gamma-driver shift:
 *   1.31/2 * sqrt(g^{ij} d_i chi d_j chi / chi) / (1 - w)^2
 * with w = sqrt(chi) for GAUGE 2 and 4 and w = chi for GAUGE 3 and 5.
 * gxx..gzz is the full metric, chix..chiz the gradient of chi and chin1 the
 * full conformal factor (chi + 1 in the stored variables).
 */
static inline double z4c_variable_eta(
    const double gxx, const double gxy, const double gxz,
    const double gyy, const double gyz, const double gzz,
    const double chix, const double chiy, const double chiz,
    const double chin1)
{
    const double det = gxx * gyy * gzz
                     + gxy * gyz * gxz * 2.0
                     - gxz * gyy * gxz
                     - gxy * gxy * gzz
                     - gxx * gyz * gyz;
    const double idet = 1.0 / det;
    const double upxx = (gyy * gzz - gyz * gyz) * idet;
    const double upxy = -(gxy * gzz - gyz * gxz) * idet;
    const double upxz = (gxy * gyz - gyy * gxz) * idet;
    const double upyy = (gxx * gzz - gxz * gxz) * idet;
    const double upyz = -(gxx * gyz - gxy * gxz) * idet;
    const double upzz = (gxx * gyy - gxy * gxy) * idet;
    const double grdchi2 =
        upxx * chix * chix + upyy * chiy * chiy + upzz * chiz * chiz
        + 2.0 * (upxy * chix * chiy + upxz * chix * chiz + upyz * chiy * chiz);
#if (GAUGE == 2 || GAUGE == 4)
    const double gap = 1.0 - sqrt(chin1); /* chi-sqrt denominator */
#else
    const double gap = 1.0 - chin1;       /* chi-linear denominator */
#endif
    return 1.31 / 2.0 * sqrt(grdchi2 / chin1) / (gap * gap);
}
#endif

/*
 * Assemble the Z4c right-hand sides on one Cartesian grid patch.
 *
 * Steps, in order:
 *   1. build lapse+1, chi+1 and the full diagonal metric from the stored
 *      variables;
 *   2. take first/second derivatives with fderivs()/fdderivs();
 *   3. call the point-wise kernel f_z4c_rhs_point() with kappa1 = 0;
 *   4. set the lapse RHS (-2 * alpha * K) and the shift RHS selected by the
 *      GAUGE macro;
 *   5. apply lopsided() to every evolved field;
 *   6. add the kappa1 constraint-damping terms for Theta, K and Gamma^i;
 *   7. apply kodis() when eps > 0;
 *   8. when co == 0, evaluate f_ricci_gamma() (only if ABV == 0) and
 *      f_constraint_bssn() using chi_constraints.
 *
 * chi_state is the conformal factor used for the evolution terms,
 * chi_constraints the one handed to the constraint routines; the two public
 * entry points pass either the same array or a clamped copy for chi_state.
 * T and the matter stress arrays Sxx..Szz are accepted but not used here.
 *
 * Scratch storage lives on the heap: the previous variable-length stack
 * arrays (about 146 fields of nx*ny*nz doubles) are not ISO C++ and exceed a
 * default stack limit for realistic patch sizes.
 *
 * Returns 0 on success, 1 when GAUGE is 6 or 7 and the number of black holes
 * reported by getpbh is not 2.
 */
static int compute_rhs_z4c_cartesian(
    int *ex, double &T, double *X, double *Y, double *Z,
    double *chi_state, double *chi_constraints, double *trK,
    double *dxx, double *gxy, double *gxz, double *dyy, double *gyz, double *dzz,
    double *Axx, double *Axy, double *Axz, double *Ayy, double *Ayz, double *Azz,
    double *Gamx, double *Gamy, double *Gamz,
    double *Lap, double *betax, double *betay, double *betaz,
    double *dtSfx, double *dtSfy, double *dtSfz,
    double *TZ,
    double *chi_rhs, double *trK_rhs,
    double *gxx_rhs, double *gxy_rhs, double *gxz_rhs, double *gyy_rhs, double *gyz_rhs, double *gzz_rhs,
    double *Axx_rhs, double *Axy_rhs, double *Axz_rhs, double *Ayy_rhs, double *Ayz_rhs, double *Azz_rhs,
    double *Gamx_rhs, double *Gamy_rhs, double *Gamz_rhs,
    double *Lap_rhs, double *betax_rhs, double *betay_rhs, double *betaz_rhs,
    double *dtSfx_rhs, double *dtSfy_rhs, double *dtSfz_rhs,
    double *TZ_rhs,
    double *rho, double *Sx, double *Sy, double *Sz,
    double *Sxx, double *Sxy, double *Sxz, double *Syy, double *Syz, double *Szz,
    double *Gamxxx, double *Gamxxy, double *Gamxxz, double *Gamxyy, double *Gamxyz, double *Gamxzz,
    double *Gamyxx, double *Gamyxy, double *Gamyxz, double *Gamyyy, double *Gamyyz, double *Gamyzz,
    double *Gamzxx, double *Gamzxy, double *Gamzxz, double *Gamzyy, double *Gamzyz, double *Gamzzz,
    double *Rxx, double *Rxy, double *Rxz, double *Ryy, double *Ryz, double *Rzz,
    double *Hcon, double *Mxcon, double *Mycon, double *Mzcon, double *Gmxcon, double *Gmycon, double *Gmzcon,
    int &Symmetry, int &Lev, double &eps, int &co)
{
    (void)T;
    (void)Sxx; (void)Sxy; (void)Sxz; (void)Syy; (void)Syz; (void)Szz;

    const int all = ex[0] * ex[1] * ex[2];

    /*
     * Heap-backed scratch fields.  Each call to field() appends one
     * zero-initialised array of 'all' doubles; the returned pointer stays
     * valid when the outer vector grows because moving a std::vector keeps
     * its buffer.
     */
    std::vector<std::vector<double> > scratch;
    scratch.reserve(160);
    auto field = [&scratch, all]() -> double * {
        scratch.push_back(std::vector<double>(all));
        return scratch.back().data();
    };

    double *alpn1 = field(), *chin1 = field(), *gxx = field(), *gyy = field(), *gzz = field();
    double *chix = field(), *chiy = field(), *chiz = field();
    double *chixx = field(), *chixy = field(), *chixz = field(), *chiyy = field(), *chiyz = field(), *chizz = field();
    double *gxxx = field(), *gxyx = field(), *gxzx = field(), *gyyx = field(), *gyzx = field(), *gzzx = field();
    double *gxxy = field(), *gxyy = field(), *gxzy = field(), *gyyy = field(), *gyzy = field(), *gzzy = field();
    double *gxxz = field(), *gxyz = field(), *gxzz = field(), *gyyz = field(), *gyzz = field(), *gzzz = field();
    double *gxxxx = field(), *gxxxy = field(), *gxxxz = field(), *gxxyy = field(), *gxxyz = field(), *gxxzz = field();
    double *gxyxx = field(), *gxyxy = field(), *gxyxz = field(), *gxyyy = field(), *gxyyz = field(), *gxyzz = field();
    double *gxzxx = field(), *gxzxy = field(), *gxzxz = field(), *gxzyy = field(), *gxzyz = field(), *gxzzz = field();
    double *gyyxx = field(), *gyyxy = field(), *gyyxz = field(), *gyyyy = field(), *gyyyz = field(), *gyyzz = field();
    double *gyzxx = field(), *gyzxy = field(), *gyzxz = field(), *gyzyy = field(), *gyzyz = field(), *gyzzz = field();
    double *gzzxx = field(), *gzzxy = field(), *gzzxz = field(), *gzzyy = field(), *gzzyz = field(), *gzzzz = field();
    double *Lapx = field(), *Lapy = field(), *Lapz = field();
    double *Lapxx = field(), *Lapxy = field(), *Lapxz = field(), *Lapyy = field(), *Lapyz = field(), *Lapzz = field();
    double *betaxx = field(), *betaxy = field(), *betaxz = field();
    double *betayx = field(), *betayy = field(), *betayz = field();
    double *betazx = field(), *betazy = field(), *betazz = field();
    double *dBxx = field(), *dBxy = field(), *dBxz = field();
    double *dByx = field(), *dByy = field(), *dByz = field();
    double *dBzx = field(), *dBzy = field(), *dBzz = field();
    double *sfxxx = field(), *sfxxy = field(), *sfxxz = field(), *sfxyy = field(), *sfxyz = field(), *sfxzz = field();
    double *sfyxx = field(), *sfyxy = field(), *sfyxz = field(), *sfyyy = field(), *sfyyz = field(), *sfyzz = field();
    double *sfzxx = field(), *sfzxy = field(), *sfzxz = field(), *sfzyy = field(), *sfzyz = field(), *sfzzz = field();
    double *Gamxx = field(), *Gamxy = field(), *Gamxz = field();
    double *Gamyx = field(), *Gamyy = field(), *Gamyz = field();
    double *Gamzx = field(), *Gamzy = field(), *Gamzz = field();
    double *Kx = field(), *Ky = field(), *Kz = field();
    double *TZx = field(), *TZy = field(), *TZz = field();
    double *Axxx = field(), *Axxy = field(), *Axxz = field(), *Axyx = field(), *Axyy = field(), *Axyz = field();
    double *Axzx = field(), *Axzy = field(), *Axzz = field(), *Ayyx = field(), *Ayyy = field(), *Ayyz = field();
    double *Ayzx = field(), *Ayzy = field(), *Ayzz = field(), *Azzx = field(), *Azzy = field(), *Azzz = field();

    /* Reflection parities per axis (S = symmetric, A = antisymmetric). */
    const double SSS[3] = {1.0, 1.0, 1.0};
    const double AAS[3] = {-1.0, -1.0, 1.0};
    const double ASA[3] = {-1.0, 1.0, -1.0};
    const double SAA[3] = {1.0, -1.0, -1.0};
    const double ASS[3] = {-1.0, 1.0, 1.0};
    const double SAS[3] = {1.0, -1.0, 1.0};
    const double SSA[3] = {1.0, 1.0, -1.0};
    const double ONE = 1.0;
    const double TWO = 2.0;
    const double ZEO = 0.0;
    /* chiDivfloor and kappa2 are passed by non-const reference below. */
    double chiDivfloor = 1.0e-5;
    const double kappa1 = 2.0e-2;
    double kappa2 = 0.0;
    const double FF = 0.75;
    const double eta = 2.0;
    /* Not every GAUGE branch uses all of these constants. */
    (void)ZEO; (void)FF; (void)eta;

    /* Stored variables are offsets from 1 for lapse, chi and the diagonal metric. */
    for (int idx = 0; idx < all; ++idx)
    {
        alpn1[idx] = Lap[idx] + ONE;
        chin1[idx] = chi_state[idx] + ONE;
        gxx[idx] = dxx[idx] + ONE;
        gyy[idx] = dyy[idx] + ONE;
        gzz[idx] = dzz[idx] + ONE;
    }

    /* First and second derivatives of all evolved fields. */
    fderivs(ex, betax, betaxx, betaxy, betaxz, X, Y, Z, -1.0, 1.0, 1.0, Symmetry, Lev);
    fderivs(ex, betay, betayx, betayy, betayz, X, Y, Z, 1.0, -1.0, 1.0, Symmetry, Lev);
    fderivs(ex, betaz, betazx, betazy, betazz, X, Y, Z, 1.0, 1.0, -1.0, Symmetry, Lev);
    fderivs(ex, dtSfx, dBxx, dBxy, dBxz, X, Y, Z, -1.0, 1.0, 1.0, Symmetry, Lev);
    fderivs(ex, dtSfy, dByx, dByy, dByz, X, Y, Z, 1.0, -1.0, 1.0, Symmetry, Lev);
    fderivs(ex, dtSfz, dBzx, dBzy, dBzz, X, Y, Z, 1.0, 1.0, -1.0, Symmetry, Lev);
    fderivs(ex, chi_state, chix, chiy, chiz, X, Y, Z, 1.0, 1.0, 1.0, Symmetry, Lev);
    fderivs(ex, dxx, gxxx, gxxy, gxxz, X, Y, Z, 1.0, 1.0, 1.0, Symmetry, Lev);
    fderivs(ex, gxy, gxyx, gxyy, gxyz, X, Y, Z, -1.0, -1.0, 1.0, Symmetry, Lev);
    fderivs(ex, gxz, gxzx, gxzy, gxzz, X, Y, Z, -1.0, 1.0, -1.0, Symmetry, Lev);
    fderivs(ex, dyy, gyyx, gyyy, gyyz, X, Y, Z, 1.0, 1.0, 1.0, Symmetry, Lev);
    fderivs(ex, gyz, gyzx, gyzy, gyzz, X, Y, Z, 1.0, -1.0, -1.0, Symmetry, Lev);
    fderivs(ex, dzz, gzzx, gzzy, gzzz, X, Y, Z, 1.0, 1.0, 1.0, Symmetry, Lev);
    fdderivs(ex, dxx, gxxxx, gxxxy, gxxxz, gxxyy, gxxyz, gxxzz, X, Y, Z, 1.0, 1.0, 1.0, Symmetry, Lev);
    fdderivs(ex, dyy, gyyxx, gyyxy, gyyxz, gyyyy, gyyyz, gyyzz, X, Y, Z, 1.0, 1.0, 1.0, Symmetry, Lev);
    fdderivs(ex, dzz, gzzxx, gzzxy, gzzxz, gzzyy, gzzyz, gzzzz, X, Y, Z, 1.0, 1.0, 1.0, Symmetry, Lev);
    fdderivs(ex, gxy, gxyxx, gxyxy, gxyxz, gxyyy, gxyyz, gxyzz, X, Y, Z, -1.0, -1.0, 1.0, Symmetry, Lev);
    fdderivs(ex, gxz, gxzxx, gxzxy, gxzxz, gxzyy, gxzyz, gxzzz, X, Y, Z, -1.0, 1.0, -1.0, Symmetry, Lev);
    fdderivs(ex, gyz, gyzxx, gyzxy, gyzxz, gyzyy, gyzyz, gyzzz, X, Y, Z, 1.0, -1.0, -1.0, Symmetry, Lev);
    fderivs(ex, Gamx, Gamxx, Gamxy, Gamxz, X, Y, Z, -1.0, 1.0, 1.0, Symmetry, Lev);
    fderivs(ex, Gamy, Gamyx, Gamyy, Gamyz, X, Y, Z, 1.0, -1.0, 1.0, Symmetry, Lev);
    fderivs(ex, Gamz, Gamzx, Gamzy, Gamzz, X, Y, Z, 1.0, 1.0, -1.0, Symmetry, Lev);
    fderivs(ex, Lap, Lapx, Lapy, Lapz, X, Y, Z, 1.0, 1.0, 1.0, Symmetry, Lev);
    fderivs(ex, trK, Kx, Ky, Kz, X, Y, Z, 1.0, 1.0, 1.0, Symmetry, Lev);
    fderivs(ex, TZ, TZx, TZy, TZz, X, Y, Z, 1.0, 1.0, 1.0, Symmetry, Lev);
    fdderivs(ex, betax, sfxxx, sfxxy, sfxxz, sfxyy, sfxyz, sfxzz, X, Y, Z, -1.0, 1.0, 1.0, Symmetry, Lev);
    fdderivs(ex, betay, sfyxx, sfyxy, sfyxz, sfyyy, sfyyz, sfyzz, X, Y, Z, 1.0, -1.0, 1.0, Symmetry, Lev);
    fdderivs(ex, betaz, sfzxx, sfzxy, sfzxz, sfzyy, sfzyz, sfzzz, X, Y, Z, 1.0, 1.0, -1.0, Symmetry, Lev);
    fdderivs(ex, chi_state, chixx, chixy, chixz, chiyy, chiyz, chizz, X, Y, Z, 1.0, 1.0, 1.0, Symmetry, Lev);
    fdderivs(ex, Lap, Lapxx, Lapxy, Lapxz, Lapyy, Lapyz, Lapzz, X, Y, Z, 1.0, 1.0, 1.0, Symmetry, Lev);
    fderivs(ex, Axx, Axxx, Axxy, Axxz, X, Y, Z, 1.0, 1.0, 1.0, Symmetry, Lev);
    fderivs(ex, Axy, Axyx, Axyy, Axyz, X, Y, Z, -1.0, -1.0, 1.0, Symmetry, Lev);
    fderivs(ex, Axz, Axzx, Axzy, Axzz, X, Y, Z, -1.0, 1.0, -1.0, Symmetry, Lev);
    fderivs(ex, Ayy, Ayyx, Ayyy, Ayyz, X, Y, Z, 1.0, 1.0, 1.0, Symmetry, Lev);
    fderivs(ex, Ayz, Ayzx, Ayzy, Ayzz, X, Y, Z, 1.0, -1.0, -1.0, Symmetry, Lev);
    fderivs(ex, Azz, Azzx, Azzy, Azzz, X, Y, Z, 1.0, 1.0, 1.0, Symmetry, Lev);

    /*
     * Point-wise kernel.  kappa1 is passed as zero here; the kappa1 damping
     * terms are added explicitly after the lopsided() calls further down.
     */
    for (int idx = 0; idx < all; ++idx)
    {
        double point_kappa1 = 0.0;
        f_z4c_rhs_point(
            Axx[idx], Axy[idx], Axz[idx], Ayy[idx], Ayz[idx], Azz[idx],
            alpn1[idx], dtSfx[idx], dtSfy[idx], dtSfz[idx],
            betax[idx], betay[idx], betaz[idx],
            chin1[idx], chiDivfloor,
            Lapx[idx],
            Axxx[idx], Axyx[idx], Axzx[idx], Ayyx[idx], Ayzx[idx], Azzx[idx],
            Lapy[idx],
            Axxy[idx], Axyy[idx], Axzy[idx], Ayyy[idx], Ayzy[idx], Azzy[idx],
            Lapz[idx],
            Axxz[idx], Axyz[idx], Axzz[idx], Ayyz[idx], Ayzz[idx], Azzz[idx],
            betaxx[idx], dBxx[idx], betayx[idx], dByx[idx], betazx[idx], dBzx[idx],
            betaxy[idx], dBxy[idx], betayy[idx], dByy[idx], betazy[idx], dBzy[idx],
            betaxz[idx], dBxz[idx], betayz[idx], dByz[idx], betazz[idx], dBzz[idx],
            chix[idx], chiy[idx], chiz[idx],
            Lapxx[idx], Lapxy[idx], Lapxz[idx], Lapyy[idx], Lapyz[idx], Lapzz[idx],
            sfxxx[idx], sfyxx[idx], sfzxx[idx],
            sfxxy[idx], sfyxy[idx], sfzxy[idx],
            sfxxz[idx], sfyxz[idx], sfzxz[idx],
            sfxyy[idx], sfyyy[idx], sfzyy[idx],
            sfxyz[idx], sfyyz[idx], sfzyz[idx],
            sfxzz[idx], sfyzz[idx], sfzzz[idx],
            chixx[idx], chixy[idx], chixz[idx], chiyy[idx], chiyz[idx], chizz[idx],
            gxxxx[idx], gxyxx[idx], gxzxx[idx], gyyxx[idx], gyzxx[idx], gzzxx[idx],
            gxxxy[idx], gxyxy[idx], gxzxy[idx], gyyxy[idx], gyzxy[idx], gzzxy[idx],
            gxxxz[idx], gxyxz[idx], gxzxz[idx], gyyxz[idx], gyzxz[idx], gzzxz[idx],
            gxxyy[idx], gxyyy[idx], gxzyy[idx], gyyyy[idx], gyzyy[idx], gzzyy[idx],
            gxxyz[idx], gxyyz[idx], gxzyz[idx], gyyyz[idx], gyzyz[idx], gzzyz[idx],
            gxxzz[idx], gxyzz[idx], gxzzz[idx], gyyzz[idx], gyzzz[idx], gzzzz[idx],
            Gamxx[idx], gxxx[idx], gxyx[idx], gxzx[idx],
            Gamyx[idx], gyyx[idx], gyzx[idx],
            Gamzx[idx], gzzx[idx],
            Gamxy[idx], gxxy[idx], gxyy[idx], gxzy[idx],
            Gamyy[idx], gyyy[idx], gyzy[idx],
            Gamzy[idx], gzzy[idx],
            Gamxz[idx], gxxz[idx], gxyz[idx], gxzz[idx],
            Gamyz[idx], gyyz[idx], gyzz[idx],
            Gamzz[idx], gzzz[idx],
            Kx[idx], Ky[idx], Kz[idx],
            TZx[idx], TZy[idx], TZz[idx],
            Gamx[idx], gxx[idx], gxy[idx], gxz[idx],
            Gamy[idx], gyy[idx], gyz[idx],
            Gamz[idx], gzz[idx],
            point_kappa1, kappa2,
            trK[idx],
            Axx_rhs[idx], Axy_rhs[idx], Axz_rhs[idx], Ayy_rhs[idx], Ayz_rhs[idx], Azz_rhs[idx],
            chi_rhs[idx],
            Gamx_rhs[idx], gxx_rhs[idx], gxy_rhs[idx], gxz_rhs[idx],
            Gamy_rhs[idx], gyy_rhs[idx], gyz_rhs[idx],
            Gamz_rhs[idx], gzz_rhs[idx], trK_rhs[idx], TZ_rhs[idx], TZ[idx]);
    }

    /* Lapse right-hand side before advection: -2 * alpha * K. */
    for (int idx = 0; idx < all; ++idx)
        Lap_rhs[idx] = -TWO * alpn1[idx] * trK[idx];

#if (GAUGE == 0)
    /* Second-order gamma-driver with constant damping eta. */
    for (int idx = 0; idx < all; ++idx)
    {
        betax_rhs[idx] = FF * dtSfx[idx];
        betay_rhs[idx] = FF * dtSfy[idx];
        betaz_rhs[idx] = FF * dtSfz[idx];
        dtSfx_rhs[idx] = Gamx_rhs[idx] - eta * dtSfx[idx];
        dtSfy_rhs[idx] = Gamy_rhs[idx] - eta * dtSfy[idx];
        dtSfz_rhs[idx] = Gamz_rhs[idx] - eta * dtSfz[idx];
    }
#elif (GAUGE == 1)
    /* First-order gamma-driver with constant damping eta. */
    for (int idx = 0; idx < all; ++idx)
    {
        betax_rhs[idx] = Gamx[idx] - eta * betax[idx];
        betay_rhs[idx] = Gamy[idx] - eta * betay[idx];
        betaz_rhs[idx] = Gamz[idx] - eta * betaz[idx];
        dtSfx_rhs[idx] = ZEO;
        dtSfy_rhs[idx] = ZEO;
        dtSfz_rhs[idx] = ZEO;
    }
#elif (GAUGE == 2 || GAUGE == 3)
    /* Variable-eta gamma-driver, second order (2: chi-sqrt, 3: chi-linear denominator). */
    for (int idx = 0; idx < all; ++idx)
    {
        const double reta = z4c_variable_eta(
            gxx[idx], gxy[idx], gxz[idx], gyy[idx], gyz[idx], gzz[idx],
            chix[idx], chiy[idx], chiz[idx], chin1[idx]);
        betax_rhs[idx] = FF * dtSfx[idx];
        betay_rhs[idx] = FF * dtSfy[idx];
        betaz_rhs[idx] = FF * dtSfz[idx];
        dtSfx_rhs[idx] = Gamx_rhs[idx] - reta * dtSfx[idx];
        dtSfy_rhs[idx] = Gamy_rhs[idx] - reta * dtSfy[idx];
        dtSfz_rhs[idx] = Gamz_rhs[idx] - reta * dtSfz[idx];
    }
#elif (GAUGE == 4 || GAUGE == 5)
    /* Variable-eta gamma-driver, first order (4: chi-sqrt, 5: chi-linear denominator). */
    for (int idx = 0; idx < all; ++idx)
    {
        const double reta = z4c_variable_eta(
            gxx[idx], gxy[idx], gxz[idx], gyy[idx], gyz[idx], gzz[idx],
            chix[idx], chiy[idx], chiz[idx], chin1[idx]);
        betax_rhs[idx] = Gamx_rhs[idx] - reta * betax[idx];
        betay_rhs[idx] = Gamy_rhs[idx] - reta * betay[idx];
        betaz_rhs[idx] = Gamz_rhs[idx] - reta * betaz[idx];
        dtSfx_rhs[idx] = ZEO;
        dtSfy_rhs[idx] = ZEO;
        dtSfz_rhs[idx] = ZEO;
    }
#elif (GAUGE == 6 || GAUGE == 7)
    {
        /* Jason's position-dependent damping: rational (6) or exponential (7) */
        const int nx = ex[0];
        const int ny = ex[1];
        const int nz = ex[2];
        int BHN = 0;
        double Porg[9] = {0.0};
        double Mass[3] = {0.0};
#ifdef fortran1
        getpbh(BHN, Porg, Mass);
#elif defined(fortran2)
        GETPBH(BHN, Porg, Mass);
#else
        getpbh_(BHN, Porg, Mass);
#endif
        if (BHN != 2)
        {
            fprintf(stderr, "z4c_rhs_c: GAUGE %d requires BHN=2, got BHN=%d\n", (int)GAUGE, BHN);
            return 1;
        }

        const double M = Mass[0] + Mass[1];
        const double A = 2.0 / M;
        const double w1 = 12.0, w2 = 12.0;
        const double C1 = 1.0 / Mass[0] - A;
        const double C2 = 1.0 / Mass[1] - A;
        /* Squared coordinate separation of the two holes; distances below are in units of it. */
        const double BH_sep2 = (Porg[3] - Porg[0]) * (Porg[3] - Porg[0])
                             + (Porg[4] - Porg[1]) * (Porg[4] - Porg[1])
                             + (Porg[5] - Porg[2]) * (Porg[5] - Porg[2]);
        const double inv_BH_sep2 = 1.0 / BH_sep2;
        for (int k0 = 0; k0 < nz; ++k0)
        {
            for (int j0 = 0; j0 < ny; ++j0)
            {
                for (int i0 = 0; i0 < nx; ++i0)
                {
                    const size_t idx = idx_ex(i0, j0, k0, ex);
                    const double xp = X[i0], yp = Y[j0], zp = Z[k0];
                    const double r1 = ((Porg[0]-xp)*(Porg[0]-xp) + (Porg[1]-yp)*(Porg[1]-yp) + (Porg[2]-zp)*(Porg[2]-zp)) * inv_BH_sep2;
                    const double r2 = ((Porg[3]-xp)*(Porg[3]-xp) + (Porg[4]-yp)*(Porg[4]-yp) + (Porg[5]-zp)*(Porg[5]-zp)) * inv_BH_sep2;
#if (GAUGE == 6)
                    const double reta_val = A + C1 / (1.0 + w1 * r1) + C2 / (1.0 + w2 * r2);
#else
                    const double reta_val = A + C1 * exp(-w1 * r1) + C2 * exp(-w2 * r2);
#endif
                    betax_rhs[idx] = FF * dtSfx[idx];
                    betay_rhs[idx] = FF * dtSfy[idx];
                    betaz_rhs[idx] = FF * dtSfz[idx];
                    dtSfx_rhs[idx] = Gamx_rhs[idx] - reta_val * dtSfx[idx];
                    dtSfy_rhs[idx] = Gamy_rhs[idx] - reta_val * dtSfy[idx];
                    dtSfz_rhs[idx] = Gamz_rhs[idx] - reta_val * dtSfz[idx];
                }
            }
        }
    }
#else
#error "z4c_rhs_c.C: unsupported GAUGE value"
#endif

    /* lopsided() pass over every evolved field, using the shift as the vector field. */
    lopsided(ex, X, Y, Z, gxx, gxx_rhs, betax, betay, betaz, Symmetry, SSS);
    lopsided(ex, X, Y, Z, gxy, gxy_rhs, betax, betay, betaz, Symmetry, AAS);
    lopsided(ex, X, Y, Z, gxz, gxz_rhs, betax, betay, betaz, Symmetry, ASA);
    lopsided(ex, X, Y, Z, gyy, gyy_rhs, betax, betay, betaz, Symmetry, SSS);
    lopsided(ex, X, Y, Z, gyz, gyz_rhs, betax, betay, betaz, Symmetry, SAA);
    lopsided(ex, X, Y, Z, gzz, gzz_rhs, betax, betay, betaz, Symmetry, SSS);
    lopsided(ex, X, Y, Z, Axx, Axx_rhs, betax, betay, betaz, Symmetry, SSS);
    lopsided(ex, X, Y, Z, Axy, Axy_rhs, betax, betay, betaz, Symmetry, AAS);
    lopsided(ex, X, Y, Z, Axz, Axz_rhs, betax, betay, betaz, Symmetry, ASA);
    lopsided(ex, X, Y, Z, Ayy, Ayy_rhs, betax, betay, betaz, Symmetry, SSS);
    lopsided(ex, X, Y, Z, Ayz, Ayz_rhs, betax, betay, betaz, Symmetry, SAA);
    lopsided(ex, X, Y, Z, Azz, Azz_rhs, betax, betay, betaz, Symmetry, SSS);
    lopsided(ex, X, Y, Z, chi_state, chi_rhs, betax, betay, betaz, Symmetry, SSS);
    lopsided(ex, X, Y, Z, trK, trK_rhs, betax, betay, betaz, Symmetry, SSS);
    lopsided(ex, X, Y, Z, Gamx, Gamx_rhs, betax, betay, betaz, Symmetry, ASS);
    lopsided(ex, X, Y, Z, Gamy, Gamy_rhs, betax, betay, betaz, Symmetry, SAS);
    lopsided(ex, X, Y, Z, Gamz, Gamz_rhs, betax, betay, betaz, Symmetry, SSA);
    lopsided(ex, X, Y, Z, Lap, Lap_rhs, betax, betay, betaz, Symmetry, SSS);
    lopsided(ex, X, Y, Z, betax, betax_rhs, betax, betay, betaz, Symmetry, ASS);
    lopsided(ex, X, Y, Z, betay, betay_rhs, betax, betay, betaz, Symmetry, SAS);
    lopsided(ex, X, Y, Z, betaz, betaz_rhs, betax, betay, betaz, Symmetry, SSA);
#if (GAUGE == 0 || GAUGE == 2 || GAUGE == 3 || GAUGE == 6 || GAUGE == 7)
    /* dtSf is only evolved by the second-order gamma-driver variants. */
    lopsided(ex, X, Y, Z, dtSfx, dtSfx_rhs, betax, betay, betaz, Symmetry, ASS);
    lopsided(ex, X, Y, Z, dtSfy, dtSfy_rhs, betax, betay, betaz, Symmetry, SAS);
    lopsided(ex, X, Y, Z, dtSfz, dtSfz_rhs, betax, betay, betaz, Symmetry, SSA);
#endif
    lopsided(ex, X, Y, Z, TZ, TZ_rhs, betax, betay, betaz, Symmetry, SSS);

    /*
     * kappa1 constraint damping.  Gamma^i is driven towards the contraction
     * computed from the metric (Gamxa, Gamya, Gamza).
     */
    for (int idx = 0; idx < all; ++idx)
    {
        double Gamxa = 0.0, Gamya = 0.0, Gamza = 0.0;
        z4c_contract_gamma(
            gxx[idx], gxy[idx], gxz[idx], gyy[idx], gyz[idx], gzz[idx],
            gxxx[idx], gxyx[idx], gxzx[idx], gyyx[idx], gyzx[idx], gzzx[idx],
            gxxy[idx], gxyy[idx], gxzy[idx], gyyy[idx], gyzy[idx], gzzy[idx],
            gxxz[idx], gxyz[idx], gxzz[idx], gyyz[idx], gyzz[idx], gzzz[idx],
            Gamxa, Gamya, Gamza);
        TZ_rhs[idx] -= alpn1[idx] * (TWO + kappa2) * kappa1 * TZ[idx];
        trK_rhs[idx] += alpn1[idx] * kappa1 * (ONE - kappa2) * TZ[idx];
        Gamx_rhs[idx] -= TWO * alpn1[idx] * kappa1 * (Gamx[idx] - Gamxa);
        Gamy_rhs[idx] -= TWO * alpn1[idx] * kappa1 * (Gamy[idx] - Gamya);
        Gamz_rhs[idx] -= TWO * alpn1[idx] * kappa1 * (Gamz[idx] - Gamza);
    }

    /* kodis() pass, skipped entirely when eps is not positive. */
    if (eps > 0.0)
    {
        kodis(ex, X, Y, Z, chi_state, chi_rhs, SSS, Symmetry, eps);
        kodis(ex, X, Y, Z, trK, trK_rhs, SSS, Symmetry, eps);
        kodis(ex, X, Y, Z, gxx, gxx_rhs, SSS, Symmetry, eps);
        kodis(ex, X, Y, Z, gxy, gxy_rhs, AAS, Symmetry, eps);
        kodis(ex, X, Y, Z, gxz, gxz_rhs, ASA, Symmetry, eps);
        kodis(ex, X, Y, Z, gyy, gyy_rhs, SSS, Symmetry, eps);
        kodis(ex, X, Y, Z, gyz, gyz_rhs, SAA, Symmetry, eps);
        kodis(ex, X, Y, Z, gzz, gzz_rhs, SSS, Symmetry, eps);
        kodis(ex, X, Y, Z, Axx, Axx_rhs, SSS, Symmetry, eps);
        kodis(ex, X, Y, Z, Axy, Axy_rhs, AAS, Symmetry, eps);
        kodis(ex, X, Y, Z, Axz, Axz_rhs, ASA, Symmetry, eps);
        kodis(ex, X, Y, Z, Ayy, Ayy_rhs, SSS, Symmetry, eps);
        kodis(ex, X, Y, Z, Ayz, Ayz_rhs, SAA, Symmetry, eps);
        kodis(ex, X, Y, Z, Azz, Azz_rhs, SSS, Symmetry, eps);
        kodis(ex, X, Y, Z, Gamx, Gamx_rhs, ASS, Symmetry, eps);
        kodis(ex, X, Y, Z, Gamy, Gamy_rhs, SAS, Symmetry, eps);
        kodis(ex, X, Y, Z, Gamz, Gamz_rhs, SSA, Symmetry, eps);
        kodis(ex, X, Y, Z, Lap, Lap_rhs, SSS, Symmetry, eps);
        kodis(ex, X, Y, Z, betax, betax_rhs, ASS, Symmetry, eps);
        kodis(ex, X, Y, Z, betay, betay_rhs, SAS, Symmetry, eps);
        kodis(ex, X, Y, Z, betaz, betaz_rhs, SSA, Symmetry, eps);
#if (GAUGE == 0 || GAUGE == 2 || GAUGE == 3 || GAUGE == 6 || GAUGE == 7)
        kodis(ex, X, Y, Z, dtSfx, dtSfx_rhs, ASS, Symmetry, eps);
        kodis(ex, X, Y, Z, dtSfy, dtSfy_rhs, SAS, Symmetry, eps);
        kodis(ex, X, Y, Z, dtSfz, dtSfz_rhs, SSA, Symmetry, eps);
#endif
        kodis(ex, X, Y, Z, TZ, TZ_rhs, SSS, Symmetry, eps);
    }

    /* Constraint evaluation is requested with co == 0. */
    if (co == 0)
    {
#if (ABV == 0)
        f_ricci_gamma(ex, X, Y, Z,
                      chi_constraints,
                      dxx, gxy, gxz, dyy, gyz, dzz,
                      Gamx, Gamy, Gamz,
                      Gamxxx, Gamxxy, Gamxxz, Gamxyy, Gamxyz, Gamxzz,
                      Gamyxx, Gamyxy, Gamyxz, Gamyyy, Gamyyz, Gamyzz,
                      Gamzxx, Gamzxy, Gamzxz, Gamzyy, Gamzyz, Gamzzz,
                      Rxx, Rxy, Rxz, Ryy, Ryz, Rzz,
                      Symmetry);
#endif
        f_constraint_bssn(ex, X, Y, Z,
                          chi_constraints, trK,
                          dxx, gxy, gxz, dyy, gyz, dzz,
                          Axx, Axy, Axz, Ayy, Ayz, Azz,
                          Gamx, Gamy, Gamz,
                          Lap, betax, betay, betaz, rho, Sx, Sy, Sz,
                          Gamxxx, Gamxxy, Gamxxz, Gamxyy, Gamxyz, Gamxzz,
                          Gamyxx, Gamyxy, Gamyxz, Gamyyy, Gamyyz, Gamyzz,
                          Gamzxx, Gamzxy, Gamzxz, Gamzyy, Gamzyz, Gamzzz,
                          Rxx, Rxy, Rxz, Ryy, Ryz, Rzz,
                          Hcon, Mxcon, Mycon, Mzcon, Gmxcon, Gmycon, Gmzcon,
                          Symmetry);
    }
    return 0;
}
/*
 * C-linkage entry point for the Z4c right-hand side without clamping of the
 * conformal factor: the same chi array is used both for the evolution terms
 * and for the constraint evaluation.  All other arguments are forwarded
 * unchanged to compute_rhs_z4c_cartesian(), whose return code is returned.
 */
extern "C" int f_compute_rhs_Z4c(int *ex, double &T,
                                 double *X, double *Y, double *Z,
                                 double *chi, double *trK,
                                 double *dxx, double *gxy, double *gxz, double *dyy, double *gyz, double *dzz,
                                 double *Axx, double *Axy, double *Axz, double *Ayy, double *Ayz, double *Azz,
                                 double *Gamx, double *Gamy, double *Gamz,
                                 double *Lap, double *betax, double *betay, double *betaz,
                                 double *dtSfx, double *dtSfy, double *dtSfz,
                                 double *TZ,
                                 double *chi_rhs, double *trK_rhs,
                                 double *gxx_rhs, double *gxy_rhs, double *gxz_rhs, double *gyy_rhs, double *gyz_rhs, double *gzz_rhs,
                                 double *Axx_rhs, double *Axy_rhs, double *Axz_rhs, double *Ayy_rhs, double *Ayz_rhs, double *Azz_rhs,
                                 double *Gamx_rhs, double *Gamy_rhs, double *Gamz_rhs,
                                 double *Lap_rhs, double *betax_rhs, double *betay_rhs, double *betaz_rhs,
                                 double *dtSfx_rhs, double *dtSfy_rhs, double *dtSfz_rhs,
                                 double *TZ_rhs,
                                 double *rho, double *Sx, double *Sy, double *Sz,
                                 double *Sxx, double *Sxy, double *Sxz, double *Syy, double *Syz, double *Szz,
                                 double *Gamxxx, double *Gamxxy, double *Gamxxz, double *Gamxyy, double *Gamxyz, double *Gamxzz,
                                 double *Gamyxx, double *Gamyxy, double *Gamyxz, double *Gamyyy, double *Gamyyz, double *Gamyzz,
                                 double *Gamzxx, double *Gamzxy, double *Gamzxz, double *Gamzyy, double *Gamzyz, double *Gamzzz,
                                 double *Rxx, double *Rxy, double *Rxz, double *Ryy, double *Ryz, double *Rzz,
                                 double *Hcon, double *Mxcon, double *Mycon, double *Mzcon, double *Gmxcon, double *Gmycon, double *Gmzcon,
                                 int &Symmetry, int &Lev, double &eps, int &co)
{
    /* chi plays both roles: evolution state and constraint input. */
    double *const chi_for_evolution = chi;
    double *const chi_for_constraints = chi;

    const int status = compute_rhs_z4c_cartesian(
        ex, T, X, Y, Z,
        chi_for_evolution, chi_for_constraints, trK,
        dxx, gxy, gxz, dyy, gyz, dzz,
        Axx, Axy, Axz, Ayy, Ayz, Azz,
        Gamx, Gamy, Gamz,
        Lap, betax, betay, betaz,
        dtSfx, dtSfy, dtSfz,
        TZ,
        chi_rhs, trK_rhs,
        gxx_rhs, gxy_rhs, gxz_rhs, gyy_rhs, gyz_rhs, gzz_rhs,
        Axx_rhs, Axy_rhs, Axz_rhs, Ayy_rhs, Ayz_rhs, Azz_rhs,
        Gamx_rhs, Gamy_rhs, Gamz_rhs,
        Lap_rhs, betax_rhs, betay_rhs, betaz_rhs,
        dtSfx_rhs, dtSfy_rhs, dtSfz_rhs,
        TZ_rhs,
        rho, Sx, Sy, Sz,
        Sxx, Sxy, Sxz, Syy, Syz, Szz,
        Gamxxx, Gamxxy, Gamxxz, Gamxyy, Gamxyz, Gamxzz,
        Gamyxx, Gamyxy, Gamyxz, Gamyyy, Gamyyz, Gamyzz,
        Gamzxx, Gamzxy, Gamzxz, Gamzyy, Gamzyz, Gamzzz,
        Rxx, Rxy, Rxz, Ryy, Ryz, Rzz,
        Hcon, Mxcon, Mycon, Mzcon, Gmxcon, Gmycon, Gmzcon,
        Symmetry, Lev, eps, co);
    return status;
}
/*
 * C-linkage entry point: Z4c right-hand side evaluated with a lower-bounded
 * copy of chi, followed (optionally) by the constraint diagnostics.
 *
 * Steps, in order:
 *   1. Copy the ex[0]*ex[1]*ex[2] values of `chi` into a scratch buffer and
 *      pass that buffer to f_lowerboundset together with `chitiny`
 *      (presumably clamps the copy from below by chitiny -- confirm in
 *      f_lowerboundset). The caller's `chi` array is not modified here.
 *   2. Call compute_rhs_z4c_cartesian with the scratch buffer as its first
 *      chi argument and the caller's untouched `chi` as the second one.
 *   3. If that call returned non-zero, or if `co` is non-zero, return its
 *      status immediately.
 *   4. Otherwise run f_ricci_gamma (only when the ABV macro equals 0) and
 *      f_constraint_bssn, both on the caller's original `chi`.
 *
 * Returns the status of compute_rhs_z4c_cartesian unchanged.
 */
extern "C" int f_compute_rhs_Z4cnot(int *ex, double &T,
                                    double *X, double *Y, double *Z,
                                    double *chi, double *trK,
                                    double *dxx, double *gxy, double *gxz, double *dyy, double *gyz, double *dzz,
                                    double *Axx, double *Axy, double *Axz, double *Ayy, double *Ayz, double *Azz,
                                    double *Gamx, double *Gamy, double *Gamz,
                                    double *Lap, double *betax, double *betay, double *betaz,
                                    double *dtSfx, double *dtSfy, double *dtSfz,
                                    double *TZ,
                                    double *chi_rhs, double *trK_rhs,
                                    double *gxx_rhs, double *gxy_rhs, double *gxz_rhs, double *gyy_rhs, double *gyz_rhs, double *gzz_rhs,
                                    double *Axx_rhs, double *Axy_rhs, double *Axz_rhs, double *Ayy_rhs, double *Ayz_rhs, double *Azz_rhs,
                                    double *Gamx_rhs, double *Gamy_rhs, double *Gamz_rhs,
                                    double *Lap_rhs, double *betax_rhs, double *betay_rhs, double *betaz_rhs,
                                    double *dtSfx_rhs, double *dtSfy_rhs, double *dtSfz_rhs,
                                    double *TZ_rhs,
                                    double *rho, double *Sx, double *Sy, double *Sz,
                                    double *Sxx, double *Sxy, double *Sxz, double *Syy, double *Syz, double *Szz,
                                    double *Gamxxx, double *Gamxxy, double *Gamxxz, double *Gamxyy, double *Gamxyz, double *Gamxzz,
                                    double *Gamyxx, double *Gamyxy, double *Gamyxz, double *Gamyyy, double *Gamyyz, double *Gamyzz,
                                    double *Gamzxx, double *Gamzxy, double *Gamzxz, double *Gamzyy, double *Gamzyz, double *Gamzzz,
                                    double *Rxx, double *Rxy, double *Rxz, double *Ryy, double *Ryz, double *Rzz,
                                    double *Hcon, double *Mxcon, double *Mycon, double *Mzcon, double *Gmxcon, double *Gmycon, double *Gmzcon,
                                    int &Symmetry, int &Lev, double &eps, int &co, double &chitiny)
{
    // Number of grid values in one field: product of the three extents.
    const int npoints = ex[0] * ex[1] * ex[2];

    // Private copy of chi; only this copy is handed to f_lowerboundset.
    std::vector<double> chi_floor(chi, chi + npoints);
    f_lowerboundset(ex, chi_floor.data(), chitiny);

    // First chi argument: the bounded copy. Second chi argument: the raw field.
    const int status = compute_rhs_z4c_cartesian(
        ex, T, X, Y, Z,
        chi_floor.data(), chi, trK,
        dxx, gxy, gxz, dyy, gyz, dzz,
        Axx, Axy, Axz, Ayy, Ayz, Azz,
        Gamx, Gamy, Gamz,
        Lap, betax, betay, betaz,
        dtSfx, dtSfy, dtSfz,
        TZ,
        chi_rhs, trK_rhs,
        gxx_rhs, gxy_rhs, gxz_rhs, gyy_rhs, gyz_rhs, gzz_rhs,
        Axx_rhs, Axy_rhs, Axz_rhs, Ayy_rhs, Ayz_rhs, Azz_rhs,
        Gamx_rhs, Gamy_rhs, Gamz_rhs,
        Lap_rhs, betax_rhs, betay_rhs, betaz_rhs,
        dtSfx_rhs, dtSfy_rhs, dtSfz_rhs,
        TZ_rhs,
        rho, Sx, Sy, Sz,
        Sxx, Sxy, Sxz, Syy, Syz, Szz,
        Gamxxx, Gamxxy, Gamxxz, Gamxyy, Gamxyz, Gamxzz,
        Gamyxx, Gamyxy, Gamyxz, Gamyyy, Gamyyz, Gamyzz,
        Gamzxx, Gamzxy, Gamzxz, Gamzyy, Gamzyz, Gamzzz,
        Rxx, Rxy, Rxz, Ryy, Ryz, Rzz,
        Hcon, Mxcon, Mycon, Mzcon, Gmxcon, Gmycon, Gmzcon,
        Symmetry, Lev, eps, co);

    // The diagnostics below run only after a successful RHS call with co == 0.
    const bool rhs_failed = (status != 0);
    const bool diagnostics_skipped = (co != 0);
    if (rhs_failed || diagnostics_skipped)
    {
        return status;
    }

#if (ABV == 0)
    f_ricci_gamma(ex, X, Y, Z,
                  chi,
                  dxx, gxy, gxz, dyy, gyz, dzz,
                  Gamx, Gamy, Gamz,
                  Gamxxx, Gamxxy, Gamxxz, Gamxyy, Gamxyz, Gamxzz,
                  Gamyxx, Gamyxy, Gamyxz, Gamyyy, Gamyyz, Gamyzz,
                  Gamzxx, Gamzxy, Gamzxz, Gamzyy, Gamzyz, Gamzzz,
                  Rxx, Rxy, Rxz, Ryy, Ryz, Rzz,
                  Symmetry);
#endif

    f_constraint_bssn(ex, X, Y, Z,
                      chi, trK,
                      dxx, gxy, gxz, dyy, gyz, dzz,
                      Axx, Axy, Axz, Ayy, Ayz, Azz,
                      Gamx, Gamy, Gamz,
                      Lap, betax, betay, betaz, rho, Sx, Sy, Sz,
                      Gamxxx, Gamxxy, Gamxxz, Gamxyy, Gamxyz, Gamxzz,
                      Gamyxx, Gamyxy, Gamyxz, Gamyyy, Gamyyz, Gamyzz,
                      Gamzxx, Gamzxy, Gamzxz, Gamzyy, Gamzyz, Gamzzz,
                      Rxx, Rxy, Rxz, Ryy, Ryz, Rzz,
                      Hcon, Mxcon, Mycon, Mzcon, Gmxcon, Gmycon, Gmzcon,
                      Symmetry);

    return status;
}

View File

@@ -1,211 +0,0 @@
# BSSN Build Config Migration
This note records the build-configuration fix needed when replacing
`AMSS_NCKU_Input.py` or `generate_macrodef.py` with a newer upstream version.
## Problem
`AMSS_NCKU_source/macrodef.h` is not the authoritative file used by normal
runs. `AMSS_NCKU_Program.py` first generates macro files under
`input_data.File_directory`, copies `AMSS_NCKU_source` to
`<File_directory>/AMSS_NCKU_source_copy`, then copies the generated macro files
into that copied source tree and compiles there.
Therefore, makefile logic must not depend only on the stale
`AMSS_NCKU_source/macrodef.h`. The actual equation path must be passed to the
copied build tree from the same generation step that creates `macrodef.h`.
The performance regression was caused by compiling/linking the
`BSSN-EScalar` C wrapper into BSSN vacuum builds. For BSSN vacuum (`ABEtype=0`),
the build must use:
```make
BSSN_USE_TRANSFER_CACHE=1
BSSN_USE_ESCALAR_C_KERNEL=0
```
and must not link `bssn_escalar_rhs_c.o`.
## Required Migration Steps
### 1. Add an ABE type helper in `generate_macrodef.py`
Add a helper that maps `input_data.Equation_Class` to the numeric `ABEtype`.
Use the same mapping as `macrodef.h`:
```python
def get_abe_type():
    if ( input_data.Equation_Class == "BSSN" ):
        return 0
    elif ( input_data.Equation_Class == "BSSN-EScalar" ):
        return 1
    elif ( input_data.Equation_Class == "BSSN-EM" ):
        return 3
    elif ( input_data.Equation_Class == "Z4C" ):
        return 2
    else:
        raise ValueError("Equation_Class setting error!!!")
```
Update `generate_macrodef_h()` to print `#define ABEtype {get_abe_type()}`
instead of duplicating the if/elif mapping.
### 2. Generate a makefile fragment
In `generate_macrodef.py`, add:
```python
def generate_build_config():
    file1 = open(os.path.join(input_data.File_directory, "AMSS_NCKU_build.mk"), "w")
    print("# Generated by generate_macrodef.py; do not edit manually.", file=file1)
    print(f"ABE_TYPE := {get_abe_type()}", file=file1)
    file1.close()
```
This file is the build-time authority for the equation path.
### 3. Call and copy the generated build config
In `AMSS_NCKU_Program.py`, after generating `macrodef.h` and `macrodef.fh`, call:
```python
generate_macrodef.generate_build_config()
print(" AMSS-NCKU build config AMSS_NCKU_build.mk has been generated. ")
```
When copying generated files into `AMSS_NCKU_source_copy`, also copy:
```python
build_config_path = os.path.join(File_directory, "AMSS_NCKU_build.mk")
shutil.copy2(build_config_path, AMSS_NCKU_source_copy)
```
### 4. Make the source makefile consume the generated config
At the top of `AMSS_NCKU_source/makefile`, after `include makefile.inc`, add:
```make
-include AMSS_NCKU_build.mk
ABE_TYPE ?= $(shell awk '/^[[:space:]]*\#define[[:space:]]+ABEtype/ {print $$3; exit}' macrodef.h 2>/dev/null)
```
The generated `AMSS_NCKU_build.mk` is used during normal Python-driven builds.
The fallback keeps manual source-tree builds usable.
### 5. Gate path-specific build options by `ABE_TYPE`
Use effective build switches:
```make
ifeq ($(USE_TRANSFER_CACHE),auto)
ifeq ($(ABE_TYPE),0)
EFFECTIVE_USE_TRANSFER_CACHE = 1
else
EFFECTIVE_USE_TRANSFER_CACHE = 0
endif
else
EFFECTIVE_USE_TRANSFER_CACHE = $(USE_TRANSFER_CACHE)
endif
ifeq ($(USE_CXX_ESCALAR_KERNEL),1)
ifeq ($(ABE_TYPE),1)
EFFECTIVE_USE_CXX_ESCALAR_KERNEL = 1
else
EFFECTIVE_USE_CXX_ESCALAR_KERNEL = 0
endif
else
EFFECTIVE_USE_CXX_ESCALAR_KERNEL = 0
endif
TRANSFER_CACHE_FLAG = -DBSSN_USE_TRANSFER_CACHE=$(EFFECTIVE_USE_TRANSFER_CACHE)
ESCALAR_KERNEL_FLAG = -DBSSN_USE_ESCALAR_C_KERNEL=$(EFFECTIVE_USE_CXX_ESCALAR_KERNEL)
```
Only add `bssn_escalar_rhs_c.o` when the effective EScalar C kernel switch is
enabled:
```make
ifeq ($(EFFECTIVE_USE_CXX_ESCALAR_KERNEL),1)
CFILES += bssn_escalar_rhs_c.o
endif
```
### 6. Use a safe transfer-cache default
In `AMSS_NCKU_source/makefile.inc`, keep:
```make
USE_TRANSFER_CACHE ?= auto
```
With the effective switch logic above, this enables cached transfer for BSSN
vacuum while keeping non-BSSN paths on the uncached path by default.
## Verification Checklist
Run these checks after migrating:
```bash
python3 -c "import generate_macrodef; generate_macrodef.generate_build_config()"
cat GW150914/AMSS_NCKU_build.mk
```
For BSSN, the generated file should contain:
```make
ABE_TYPE := 0
```
Dry-run the copied or source makefile:
```bash
make -n -B INTERP_LB_MODE=off ABE | grep -E 'BSSN_USE_TRANSFER_CACHE|BSSN_USE_ESCALAR_C_KERNEL|bssn_escalar_rhs_c'
```
Expected BSSN result:
```text
-DBSSN_USE_TRANSFER_CACHE=1 -DBSSN_USE_ESCALAR_C_KERNEL=0
```
and no `bssn_escalar_rhs_c.o` in the final link command.
Run the full workflow:
```bash
python3 AMSS_NCKU_Program.py
```
For the 10-step BSSN test, compare coordinate output:
```bash
python3 - <<'PY'
from pathlib import Path
old = Path('../GW150914-06457/AMSS_NCKU_output/bssn_BH.dat')
new = Path('GW150914/AMSS_NCKU_output/bssn_BH.dat')
def rows(path):
    out = []
    for line in path.read_text().splitlines():
        if not line.strip() or line.lstrip().startswith('#'):
            continue
        out.append([float(x) for x in line.split()])
    return out
ro, rn = rows(old), rows(new)
n = min(len(ro), len(rn))
max_abs = 0.0
for i in range(n):
    for a, b in zip(ro[i], rn[i]):
        max_abs = max(max_abs, abs(a - b))
print(f"old_rows={len(ro)} new_rows={len(rn)} compared_rows={n}")
print(f"max_abs_diff={max_abs:.17g}")
PY
```
For the validated migration, the first 10 rows matched exactly:
```text
max_abs_diff=0
```

View File

@@ -1,72 +0,0 @@
#!/usr/bin/env python3
"""Convert interp_lb_profile.bin to a C header for compile-time embedding.

Usage: <this script> <profile.bin> <output.h>

Binary layout as consumed here (native byte order, ``struct`` codes):
    I  magic        (read, not checked)
    I  version      (read, not checked)
    i  nprocs
    i  num_heavy
    d  threshold    (read, not used)
    d  times[nprocs]
    i  heavy[num_heavy]
"""
import struct
import sys


def _read_exact(stream, size, what):
    """Read exactly *size* bytes or raise ValueError naming the missing field."""
    data = stream.read(size)
    if len(data) != size:
        raise ValueError(
            f"truncated profile: expected {size} bytes for {what}, got {len(data)}")
    return data


def read_profile(path):
    """Parse the binary profile at *path*.

    Returns:
        (nprocs, num_heavy, times, heavy) where ``times`` is a list of
        ``nprocs`` floats and ``heavy`` a list of ``num_heavy`` block ids.

    Raises:
        ValueError: if the file is truncated, a count is negative, or a heavy
            block id lies outside ``[0, nprocs)``.
    """
    with open(path, 'rb') as f:
        # magic/version are consumed to advance the stream; the expected
        # values are not known here, so they are not validated.
        _magic, _version, nprocs, num_heavy = struct.unpack(
            'IIii', _read_exact(f, 16, "header"))
        if nprocs < 0 or num_heavy < 0:
            raise ValueError(
                f"corrupt profile: nprocs={nprocs}, num_heavy={num_heavy}")
        _threshold = struct.unpack('d', _read_exact(f, 8, "threshold"))[0]
        times = list(struct.unpack(
            f'{nprocs}d', _read_exact(f, nprocs * 8, "times")))
        heavy = list(struct.unpack(
            f'{num_heavy}i', _read_exact(f, num_heavy * 4, "heavy blocks")))
    for hr in heavy:
        # An out-of-range id would otherwise index times[] with a wrapped or
        # invalid position further down.
        if not 0 <= hr < nprocs:
            raise ValueError(
                f"corrupt profile: heavy block {hr} outside [0, {nprocs})")
    return nprocs, num_heavy, times, heavy


def compute_splits(times, heavy, nprocs):
    """Choose, for every heavy block, the two ranks that share it.

    The heavy rank keeps one half and its lighter existing neighbour gets the
    other half (ties go to the lower-numbered neighbour). Each entry is
    ``(block_id, r_left, r_right)``.

    A neighbour that does not exist (rank -1 or rank ``nprocs``) is never
    selected; when there is no neighbour at all (``nprocs == 1``) the entry is
    ``(hr, hr, hr)``, i.e. the block is not split.
    """
    splits = []
    for hr in heavy:
        has_prev = hr > 0
        has_next = hr < nprocs - 1
        if has_prev and (not has_next or times[hr - 1] <= times[hr + 1]):
            splits.append((hr, hr - 1, hr))
        elif has_next:
            splits.append((hr, hr, hr + 1))
        else:
            splits.append((hr, hr, hr))
    return splits


def compute_remaps(splits, nprocs):
    """Re-home the block of each neighbour whose rank was taken by a split.

    The displaced block moves one rank further away from the heavy block when
    that rank exists and is not itself a heavy block; otherwise it moves to
    the rank on its other side, if that exists. Returns ``{block: new_rank}``.
    """
    # Built once: membership is tested for every split.
    heavy_blocks = {block_id for block_id, _, _ in splits}
    remaps = {}
    for hr, r_l, r_r in splits:
        if r_l == r_r:
            # Unsplit block (no neighbour): nothing was displaced.
            continue
        if r_l != hr:
            # Left neighbour's slot was taken.
            displaced = r_l
            if displaced > 0 and displaced - 1 not in heavy_blocks:
                remaps[displaced] = displaced - 1
            elif displaced < nprocs - 1:
                remaps[displaced] = displaced + 1
        else:
            # Right neighbour's slot was taken.
            displaced = r_r
            if displaced < nprocs - 1 and displaced + 1 not in heavy_blocks:
                remaps[displaced] = displaced + 1
            elif displaced > 0:
                remaps[displaced] = displaced - 1
    return remaps


def render_header(nprocs, num_heavy, heavy, splits, remaps):
    """Return the full text of the generated C header.

    NOTE(review): with ``num_heavy == 0`` the emitted arrays are declared with
    size 0 and an empty initializer, which relies on a compiler extension;
    the remap table already avoids this with a ``{-1, -1}`` placeholder.
    The output is left unchanged here so existing consumers are unaffected.
    """
    parts = [
        "/* Auto-generated from interp_lb_profile.bin — do not edit */\n",
        "#ifndef INTERP_LB_PROFILE_DATA_H\n",
        "#define INTERP_LB_PROFILE_DATA_H\n\n",
        f"#define INTERP_LB_NPROCS {nprocs}\n",
        f"#define INTERP_LB_NUM_HEAVY {num_heavy}\n\n",
        f"static const int interp_lb_heavy_blocks[{num_heavy}] = {{",
        ", ".join(str(h) for h in heavy),
        "};\n\n",
        "/* Split table: {block_id, r_left, r_right} */\n",
        f"static const int interp_lb_splits[{num_heavy}][3] = {{\n",
    ]
    parts.extend(f" {{{bid}, {rl}, {rr}}},\n" for bid, rl, rr in splits)
    parts.extend([
        "};\n\n",
        "/* Rank remap for displaced neighbor blocks */\n",
        f"static const int interp_lb_num_remaps = {len(remaps)};\n",
        f"static const int interp_lb_remaps[][2] = {{\n",
    ])
    parts.extend(f" {{{src}, {dst}}},\n" for src, dst in sorted(remaps.items()))
    if not remaps:
        # Placeholder row so the unsized array is never empty.
        parts.append(" {-1, -1},\n")
    parts.extend([
        "};\n\n",
        "#endif /* INTERP_LB_PROFILE_DATA_H */\n",
    ])
    return "".join(parts)


def main(argv=None):
    """Command-line entry point; returns the process exit status."""
    argv = sys.argv if argv is None else argv
    if len(argv) < 3:
        print(f"Usage: {argv[0]} <profile.bin> <output.h>")
        return 1

    nprocs, num_heavy, times, heavy = read_profile(argv[1])
    splits = compute_splits(times, heavy, nprocs)
    remaps = compute_remaps(splits, nprocs)

    # Explicit encoding: the header comment contains a non-ASCII dash, so the
    # result must not depend on the locale's default encoding.
    with open(argv[2], 'w', encoding='utf-8') as out:
        out.write(render_header(nprocs, num_heavy, heavy, splits, remaps))

    print(f"Generated {argv[2]}:")
    print(f" {num_heavy} heavy blocks to split: {heavy}")
    for bid, rl, rr in splits:
        print(f" block {bid}: split -> rank {rl} (left), rank {rr} (right)")
    for src, dst in sorted(remaps.items()):
        print(f" block {src}: remap -> rank {dst}")
    return 0


if __name__ == "__main__":
    sys.exit(main())

View File

@@ -12,37 +12,6 @@ import os
import AMSS_NCKU_Input as input_data ## import program input file import AMSS_NCKU_Input as input_data ## import program input file
##################################################################
def get_abe_type():
    """Return the numeric ABEtype code for ``input_data.Equation_Class``.

    Codes: "BSSN" -> 0, "BSSN-EScalar" -> 1, "Z4C" -> 2, "BSSN-EM" -> 3.

    Raises:
        ValueError: if ``Equation_Class`` is not one of the four names above.
    """
    equation_class = input_data.Equation_Class
    # Scanned with ``==`` in the same order as the comparisons it replaces.
    for class_name, abe_code in (
        ("BSSN", 0),
        ("BSSN-EScalar", 1),
        ("BSSN-EM", 3),
        ("Z4C", 2),
    ):
        if equation_class == class_name:
            return abe_code
    raise ValueError("Equation_Class setting error!!!")
##################################################################
## Generate the makefile fragment used by the copied source tree.
## The source-tree macrodef.h is not authoritative because macro files
## are regenerated under File_directory for each run.
def generate_build_config():
    """Write ``AMSS_NCKU_build.mk`` into ``input_data.File_directory``.

    The fragment holds a header comment and one ``ABE_TYPE := <n>`` line,
    where ``<n>`` comes from ``get_abe_type()``.

    Raises:
        ValueError: propagated from ``get_abe_type()`` for an unknown
            ``Equation_Class``; in that case no file is created.
        OSError: if the file cannot be opened or written.
    """
    # Resolve the value before touching the filesystem so an invalid
    # Equation_Class cannot leave a half-written fragment behind.
    abe_type = get_abe_type()
    build_config_path = os.path.join(input_data.File_directory, "AMSS_NCKU_build.mk")
    # The context manager closes the handle even if a write fails.
    with open( build_config_path, "w" ) as file1:
        print( "# Generated by generate_macrodef.py; do not edit manually.", file=file1 )
        print( f"ABE_TYPE := {abe_type}", file=file1 )
################################################################## ##################################################################
## Generate the macro file macrodef.h according to user settings ## Generate the macro file macrodef.h according to user settings
@@ -89,10 +58,19 @@ def generate_macrodef_h():
# 2: Z4c vacuum # 2: Z4c vacuum
# 3: coupled to Maxwell field # 3: coupled to Maxwell field
try: if ( input_data.Equation_Class == "BSSN" ):
print( f"#define ABEtype {get_abe_type()}", file=file1 ) print( "#define ABEtype 0", file=file1 )
print( file=file1 ) print( file=file1 )
except ValueError: elif ( input_data.Equation_Class == "BSSN-EScalar" ):
print( "#define ABEtype 1", file=file1 )
print( file=file1 )
elif ( input_data.Equation_Class == "BSSN-EM" ):
print( "#define ABEtype 3", file=file1 )
print( file=file1 )
elif ( input_data.Equation_Class == "Z4C" ):
print( "#define ABEtype 2", file=file1 )
print( file=file1 )
else:
print( "Equation_Class setting error!!!" ) print( "Equation_Class setting error!!!" )
print() print()
print( "# Equation type #define ABEtype setting error!!!", file=file1 ) print( "# Equation type #define ABEtype setting error!!!", file=file1 )
@@ -166,62 +144,6 @@ def generate_macrodef_h():
print( "#define REGLEV 0", file=file1 ) print( "#define REGLEV 0", file=file1 )
print( file=file1 ) print( file=file1 )
# Define fine-grained timing/debug macros.
# All of them default to OFF so production builds do not pay profiling overhead.
fine_timing = getattr(input_data, "Fine_Timing",
getattr(input_data, "Finegrained_Timing", "no"))
kernel_fine_timing = getattr(input_data, "Kernel_Fine_Timing",
getattr(input_data, "BSSN_Kernel_Fine_Timing", "no"))
stdin_abort_poll = getattr(input_data, "Enable_Stdin_Abort_Poll",
getattr(input_data, "Stdin_Abort_Poll", "no"))
timing_report_every = max(1, int(getattr(
input_data, "Timing_Every_Steps",
getattr(input_data, "Timing_Report_Every", 1))))
timing_top_hotspots = max(1, int(getattr(
input_data, "Timing_Top_Hotspots", 8)))
if ( fine_timing == "yes" ):
print( "#define BSSN_FINE_TIMING 1", file=file1 )
print( file=file1 )
elif ( fine_timing == "no" ):
print( "#define BSSN_FINE_TIMING 0", file=file1 )
print( file=file1 )
else:
print( "Fine_Timing setting error!!!" )
print()
print( "# Fine_Timing setting error!!!", file=file1 )
print( file=file1 )
print( f"#define BSSN_FINE_TIMING_EVERY {timing_report_every}", file=file1 )
print( file=file1 )
print( f"#define BSSN_FINE_TIMING_TOPN {timing_top_hotspots}", file=file1 )
print( file=file1 )
if ( kernel_fine_timing == "yes" ):
print( "#define BSSN_KERNEL_FINE_TIMING 1", file=file1 )
print( file=file1 )
elif ( kernel_fine_timing == "no" ):
print( "#define BSSN_KERNEL_FINE_TIMING 0", file=file1 )
print( file=file1 )
else:
print( "Kernel_Fine_Timing setting error!!!" )
print()
print( "# Kernel_Fine_Timing setting error!!!", file=file1 )
print( file=file1 )
if ( stdin_abort_poll == "yes" ):
print( "#define BSSN_ENABLE_STDIN_ABORT_POLL 1", file=file1 )
print( file=file1 )
elif ( stdin_abort_poll == "no" ):
print( "#define BSSN_ENABLE_STDIN_ABORT_POLL 0", file=file1 )
print( file=file1 )
else:
print( "Enable_Stdin_Abort_Poll setting error!!!" )
print()
print( "# Enable_Stdin_Abort_Poll setting error!!!", file=file1 )
print( file=file1 )
# Define macro USE_GPU # Define macro USE_GPU
# use GPU or not # use GPU or not
@@ -302,21 +224,6 @@ def generate_macrodef_h():
print( "// 0: for every level;", file=file1 ) print( "// 0: for every level;", file=file1 )
print( "// 1: for all", file=file1 ) print( "// 1: for all", file=file1 )
print( "//", file=file1 ) print( "//", file=file1 )
print( "// define BSSN_FINE_TIMING", file=file1 )
print( "// enable fine-grained per-timestep timing monitor", file=file1 )
print( "//", file=file1 )
print( "// define BSSN_FINE_TIMING_EVERY", file=file1 )
print( "// report timing every N coarse timesteps", file=file1 )
print( "//", file=file1 )
print( "// define BSSN_FINE_TIMING_TOPN", file=file1 )
print( "// number of hottest timing buckets shown in stdout", file=file1 )
print( "//", file=file1 )
print( "// define BSSN_KERNEL_FINE_TIMING", file=file1 )
print( "// enable split timing inside compute_rhs_bssn", file=file1 )
print( "//", file=file1 )
print( "// define BSSN_ENABLE_STDIN_ABORT_POLL", file=file1 )
print( "// poll stdin and broadcast abort flag every coarse step", file=file1 )
print( "//", file=file1 )
print( "// define USE_GPU", file=file1 ) print( "// define USE_GPU", file=file1 )
print( "// use gpu or not", file=file1 ) print( "// use gpu or not", file=file1 )
print( "//", file=file1 ) print( "//", file=file1 )

View File

@@ -11,47 +11,17 @@
import AMSS_NCKU_Input as input_data import AMSS_NCKU_Input as input_data
import subprocess import subprocess
import time import time
## CPU core binding configuration using taskset
## taskset ensures all child processes inherit the CPU affinity mask
## This forces make and all compiler processes to use only nohz_full cores (4-55, 60-111)
## Format: taskset -c 4-55,60-111 ensures processes only run on these cores
#NUMACTL_CPU_BIND = "taskset -c 0-111"
NUMACTL_CPU_BIND = "taskset -c 16-47,64-95"
## Build parallelism configuration
def get_last_n_cores_per_socket(n=32): ## Use nohz_full cores (4-55, 60-111) for compilation: 52 + 52 = 104 cores
""" ## Set make -j to utilize available cores for faster builds
Read CPU topology via lscpu and return a taskset -c string BUILD_JOBS = 96
selecting the last `n` cores of each NUMA node (socket).
Example: 2 sockets x 56 cores each, n=32 -> node0: 24-55, node1: 80-111
-> "taskset -c 24-55,80-111"
"""
result = subprocess.run(["lscpu", "--parse=NODE,CPU"], capture_output=True, text=True)
# Build a dict: node_id -> sorted list of CPU ids
node_cpus = {}
for line in result.stdout.splitlines():
if line.startswith("#") or not line.strip():
continue
parts = line.split(",")
if len(parts) < 2:
continue
node_id, cpu_id = int(parts[0]), int(parts[1])
node_cpus.setdefault(node_id, []).append(cpu_id)
segments = []
for node_id in sorted(node_cpus):
cpus = sorted(node_cpus[node_id])
selected = cpus[-n:] # last n cores of this socket
segments.append(f"{selected[0]}-{selected[-1]}")
cpu_str = ",".join(segments)
total = len(segments) * n
print(f" CPU binding: taskset -c {cpu_str} ({total} cores, last {n} per socket)")
#return f"taskset -c {cpu_str}"
return f""
## CPU core binding: dynamically select the last 32 cores of each socket (64 cores total)
NUMACTL_CPU_BIND = get_last_n_cores_per_socket(n=32)
## Build parallelism: match the number of bound cores
BUILD_JOBS = 64
################################################################## ##################################################################
@@ -70,7 +40,7 @@ def makefile_ABE():
## Build command with CPU binding to nohz_full cores ## Build command with CPU binding to nohz_full cores
if (input_data.GPU_Calculation == "no"): if (input_data.GPU_Calculation == "no"):
makefile_command = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} INTERP_LB_MODE=off ABE" makefile_command = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} ABE"
elif (input_data.GPU_Calculation == "yes"): elif (input_data.GPU_Calculation == "yes"):
makefile_command = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} ABEGPU" makefile_command = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} ABEGPU"
else: else:

View File

@@ -0,0 +1,97 @@
# AMSS-NCKU PGO Profile Analysis Report
## 1. Profiling Environment
| Item | Value |
|------|-------|
| Compiler | Intel oneAPI DPC++/C++ 2025.3.0 (icpx/ifx) |
| Instrumentation Flag | `-fprofile-instr-generate` |
| Optimization Level (instrumented) | `-O2 -xHost -fma` |
| MPI Processes | 1 (single process to avoid MPI+instrumentation deadlock) |
| Profile File | `default_9725750769337483397_0.profraw` (327 KB) |
| Merged Profile | `default.profdata` (394 KB) |
| llvm-profdata | `/home/intel/oneapi/compiler/2025.3/bin/compiler/llvm-profdata` |
## 2. Reduced Simulation Parameters (for profiling run)
| Parameter | Production Value | Profiling Value |
|-----------|-----------------|-----------------|
| MPI_processes | 64 | 1 |
| grid_level | 9 | 4 |
| static_grid_level | 5 | 3 |
| static_grid_number | 96 | 24 |
| moving_grid_number | 48 | 16 |
| largest_box_xyz_max | 320^3 | 160^3 |
| Final_Evolution_Time | 1000.0 | 10.0 |
| Evolution_Step_Number | 10,000,000 | 1,000 |
| Detector_Number | 12 | 2 |
## 3. Profile Summary
| Metric | Value |
|--------|-------|
| Total instrumented functions | 1,392 |
| Functions with non-zero counts | 117 (8.4%) |
| Functions with zero counts | 1,275 (91.6%) |
| Maximum function entry count | 386,459,248 |
| Maximum internal block count | 370,477,680 |
| Total block count | 4,198,023,118 |
## 4. Top 20 Hotspot Functions
| Rank | Total Count | Max Block Count | Function | Category |
|------|------------|-----------------|----------|----------|
| 1 | 1,241,601,732 | 370,477,680 | `polint_` | Interpolation |
| 2 | 755,994,435 | 230,156,640 | `prolong3_` | Grid prolongation |
| 3 | 667,964,095 | 3,697,792 | `compute_rhs_bssn_` | BSSN RHS evolution |
| 4 | 539,736,051 | 386,459,248 | `symmetry_bd_` | Symmetry boundary |
| 5 | 277,310,808 | 53,170,728 | `lopsided_` | Lopsided FD stencil |
| 6 | 155,534,488 | 94,535,040 | `decide3d_` | 3D grid decision |
| 7 | 119,267,712 | 19,266,048 | `rungekutta4_rout_` | RK4 time integrator |
| 8 | 91,574,616 | 48,824,160 | `kodis_` | Kreiss-Oliger dissipation |
| 9 | 67,555,389 | 43,243,680 | `fderivs_` | Finite differences |
| 10 | 55,296,000 | 42,246,144 | `misc::fact(int)` | Factorial utility |
| 11 | 43,191,071 | 27,663,328 | `fdderivs_` | 2nd-order FD derivatives |
| 12 | 36,233,965 | 22,429,440 | `restrict3_` | Grid restriction |
| 13 | 24,698,512 | 17,231,520 | `polin3_` | Polynomial interpolation |
| 14 | 22,962,942 | 20,968,768 | `copy_` | Data copy |
| 15 | 20,135,696 | 17,259,168 | `Ansorg::barycentric(...)` | Spectral interpolation |
| 16 | 14,650,224 | 7,224,768 | `Ansorg::barycentric_omega(...)` | Spectral weights |
| 17 | 13,242,296 | 2,871,920 | `global_interp_` | Global interpolation |
| 18 | 12,672,000 | 7,734,528 | `sommerfeld_rout_` | Sommerfeld boundary |
| 19 | 6,872,832 | 1,880,064 | `sommerfeld_routbam_` | Sommerfeld boundary (BAM) |
| 20 | 5,709,900 | 2,809,632 | `l2normhelper_` | L2 norm computation |
## 5. Hotspot Category Breakdown
Top 20 functions account for ~98% of total execution counts:
| Category | Functions | Combined Count | Share |
|----------|-----------|---------------|-------|
| Interpolation / Prolongation / Restriction | polint_, prolong3_, restrict3_, polin3_, global_interp_, Ansorg::* | ~2,093M | ~50% |
| BSSN RHS + FD stencils | compute_rhs_bssn_, lopsided_, fderivs_, fdderivs_ | ~1,056M | ~25% |
| Boundary conditions | symmetry_bd_, sommerfeld_rout_, sommerfeld_routbam_ | ~559M | ~13% |
| Time integration | rungekutta4_rout_ | ~119M | ~3% |
| Dissipation | kodis_ | ~92M | ~2% |
| Utilities | misc::fact, decide3d_, copy_, l2normhelper_ | ~256M | ~6% |
## 6. Conclusions
1. **Profile data is valid**: 1,392 functions instrumented, 117 exercised with ~4.2 billion total counts.
2. **Hotspot concentration is high**: The top 5 functions alone account for ~83% of all counts (about 3.48 of 4.20 billion, per the Section 4 table), which is ideal for PGO — the compiler has strong branch/layout optimization targets.
3. **Fortran numerical kernels dominate**: `polint_`, `prolong3_`, `compute_rhs_bssn_`, `symmetry_bd_`, `lopsided_` are all Fortran routines in the inner evolution loop. PGO will optimize their branch prediction and basic block layout.
4. **91.6% of functions have zero counts**: These are code paths for unused features (GPU, BSSN-EScalar, BSSN-EM, Z4C, etc.). PGO will deprioritize them, improving instruction cache utilization.
5. **Profile is representative**: Despite the reduced grid size, the code path coverage matches production — the same kernels (RHS, prolongation, restriction, boundary) are exercised. PGO branch probabilities from this profile will transfer well to full-scale runs.
## 7. PGO Phase 2 Usage
To apply the profile, use the following flags in `makefile.inc`:
```makefile
CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
-fprofile-instr-use=/home/amss/AMSS-NCKU/pgo_profile/default.profdata \
-Dfortran3 -Dnewc -I${MKLROOT}/include
f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \
-fprofile-instr-use=/home/amss/AMSS-NCKU/pgo_profile/default.profdata \
-align array64byte -fpp -I${MKLROOT}/include
```

Binary file not shown.

Binary file not shown.

Binary file not shown.