Compare commits
3 Commits
chb-rebase
...
chb-new
| Author | SHA1 | Date | |
|---|---|---|---|
|
cc06e30404
|
|||
|
25c79dc7cd
|
|||
|
a725d34dd3
|
@@ -270,12 +270,6 @@ if not os.path.exists( ABE_file ):
|
|||||||
## Copy the executable ABE (or ABEGPU) into the run directory
|
## Copy the executable ABE (or ABEGPU) into the run directory
|
||||||
shutil.copy2(ABE_file, output_directory)
|
shutil.copy2(ABE_file, output_directory)
|
||||||
|
|
||||||
## Copy interp load balance profile if present (for optimize pass)
|
|
||||||
interp_lb_profile = os.path.join(AMSS_NCKU_source_copy, "interp_lb_profile.bin")
|
|
||||||
if os.path.exists(interp_lb_profile):
|
|
||||||
shutil.copy2(interp_lb_profile, output_directory)
|
|
||||||
print( " Copied interp_lb_profile.bin to run directory " )
|
|
||||||
|
|
||||||
###########################
|
###########################
|
||||||
|
|
||||||
## If the initial-data method is TwoPuncture, copy the TwoPunctureABE executable to the run directory
|
## If the initial-data method is TwoPuncture, copy the TwoPunctureABE executable to the run directory
|
||||||
|
|||||||
@@ -1,13 +1,9 @@
|
|||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
"""
|
"""
|
||||||
AMSS-NCKU GW150914 Simulation Regression Test Script (Comprehensive Version)
|
AMSS-NCKU GW150914 Simulation Regression Test Script
|
||||||
|
|
||||||
Verification Requirements:
|
Verification Requirements:
|
||||||
1. RMS errors < 1% for:
|
1. XY-plane trajectory RMS error < 1% (Optimized vs. baseline, max of BH1 and BH2)
|
||||||
- 3D Vector Total RMS
|
|
||||||
- X Component RMS
|
|
||||||
- Y Component RMS
|
|
||||||
- Z Component RMS
|
|
||||||
2. ADM constraint violation < 2 (Grid Level 0)
|
2. ADM constraint violation < 2 (Grid Level 0)
|
||||||
|
|
||||||
RMS Calculation Method:
|
RMS Calculation Method:
|
||||||
@@ -61,62 +57,79 @@ def load_constraint_data(filepath):
|
|||||||
data.append([float(x) for x in parts[:8]])
|
data.append([float(x) for x in parts[:8]])
|
||||||
return np.array(data)
|
return np.array(data)
|
||||||
|
|
||||||
def calculate_all_rms_errors(bh_data_ref, bh_data_target):
|
|
||||||
|
def calculate_rms_error(bh_data_ref, bh_data_target):
|
||||||
"""
|
"""
|
||||||
Calculate 3D Vector RMS and component-wise RMS (X, Y, Z) independently.
|
Calculate trajectory-based RMS error on the XY plane between baseline and optimized simulations.
|
||||||
Uses r = sqrt(x^2 + y^2) as the denominator for all error normalizations.
|
|
||||||
Returns the maximum error between BH1 and BH2 for each category.
|
This function computes the RMS error independently for BH1 and BH2 trajectories,
|
||||||
|
then returns the maximum of the two as the final RMS error metric.
|
||||||
|
|
||||||
|
For each black hole, the RMS is calculated as:
|
||||||
|
RMS = sqrt( (1/M) * sum( (Δr_i / r_i^max)^2 ) ) × 100%
|
||||||
|
|
||||||
|
where:
|
||||||
|
Δr_i = sqrt((x_ref,i - x_new,i)^2 + (y_ref,i - y_new,i)^2)
|
||||||
|
r_i^max = max(sqrt(x_ref,i^2 + y_ref,i^2), sqrt(x_new,i^2 + y_new,i^2))
|
||||||
|
|
||||||
|
Args:
|
||||||
|
bh_data_ref: Reference (baseline) trajectory data
|
||||||
|
bh_data_target: Target (optimized) trajectory data
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
rms_value: Final RMS error as a percentage (max of BH1 and BH2)
|
||||||
|
error: Error message if any
|
||||||
"""
|
"""
|
||||||
|
# Align data: truncate to the length of the shorter dataset
|
||||||
M = min(len(bh_data_ref['time']), len(bh_data_target['time']))
|
M = min(len(bh_data_ref['time']), len(bh_data_target['time']))
|
||||||
|
|
||||||
if M < 10:
|
if M < 10:
|
||||||
return None, "Insufficient data points for comparison"
|
return None, "Insufficient data points for comparison"
|
||||||
|
|
||||||
results = {}
|
# Extract XY coordinates for both black holes
|
||||||
|
x1_ref = bh_data_ref['x1'][:M]
|
||||||
|
y1_ref = bh_data_ref['y1'][:M]
|
||||||
|
x2_ref = bh_data_ref['x2'][:M]
|
||||||
|
y2_ref = bh_data_ref['y2'][:M]
|
||||||
|
|
||||||
for bh in ['1', '2']:
|
x1_new = bh_data_target['x1'][:M]
|
||||||
x_r, y_r, z_r = bh_data_ref[f'x{bh}'][:M], bh_data_ref[f'y{bh}'][:M], bh_data_ref[f'z{bh}'][:M]
|
y1_new = bh_data_target['y1'][:M]
|
||||||
x_n, y_n, z_n = bh_data_target[f'x{bh}'][:M], bh_data_target[f'y{bh}'][:M], bh_data_target[f'z{bh}'][:M]
|
x2_new = bh_data_target['x2'][:M]
|
||||||
|
y2_new = bh_data_target['y2'][:M]
|
||||||
|
|
||||||
# 核心修改:根据组委会的邮件指示,分母统一使用 r = sqrt(x^2 + y^2)
|
# Calculate RMS for BH1
|
||||||
r_ref = np.sqrt(x_r**2 + y_r**2)
|
delta_r1 = np.sqrt((x1_ref - x1_new)**2 + (y1_ref - y1_new)**2)
|
||||||
r_new = np.sqrt(x_n**2 + y_n**2)
|
r1_ref = np.sqrt(x1_ref**2 + y1_ref**2)
|
||||||
denom_max = np.maximum(r_ref, r_new)
|
r1_new = np.sqrt(x1_new**2 + y1_new**2)
|
||||||
|
r1_max = np.maximum(r1_ref, r1_new)
|
||||||
|
|
||||||
valid = denom_max > 1e-15
|
# Calculate RMS for BH2
|
||||||
if np.sum(valid) < 10:
|
delta_r2 = np.sqrt((x2_ref - x2_new)**2 + (y2_ref - y2_new)**2)
|
||||||
results[f'BH{bh}'] = { '3D_Vector': 0.0, 'X_Component': 0.0, 'Y_Component': 0.0, 'Z_Component': 0.0 }
|
r2_ref = np.sqrt(x2_ref**2 + y2_ref**2)
|
||||||
continue
|
r2_new = np.sqrt(x2_new**2 + y2_new**2)
|
||||||
|
r2_max = np.maximum(r2_ref, r2_new)
|
||||||
|
|
||||||
def calc_rms(delta):
|
# Avoid division by zero for BH1
|
||||||
# 将对应分量的偏差除以统一的轨道半径分母 denom_max
|
valid_mask1 = r1_max > 1e-15
|
||||||
return np.sqrt(np.mean((delta[valid] / denom_max[valid])**2)) * 100
|
if np.sum(valid_mask1) < 10:
|
||||||
|
return None, "Insufficient valid data points for BH1"
|
||||||
|
|
||||||
# 1. Total 3D Vector RMS
|
terms1 = (delta_r1[valid_mask1] / r1_max[valid_mask1])**2
|
||||||
delta_vec = np.sqrt((x_r - x_n)**2 + (y_r - y_n)**2 + (z_r - z_n)**2)
|
rms_bh1 = np.sqrt(np.mean(terms1)) * 100
|
||||||
rms_3d = calc_rms(delta_vec)
|
|
||||||
|
|
||||||
# 2. Component-wise RMS (分离计算各轴,但共用半径分母)
|
# Avoid division by zero for BH2
|
||||||
rms_x = calc_rms(np.abs(x_r - x_n))
|
valid_mask2 = r2_max > 1e-15
|
||||||
rms_y = calc_rms(np.abs(y_r - y_n))
|
if np.sum(valid_mask2) < 10:
|
||||||
rms_z = calc_rms(np.abs(z_r - z_n))
|
return None, "Insufficient valid data points for BH2"
|
||||||
|
|
||||||
results[f'BH{bh}'] = {
|
terms2 = (delta_r2[valid_mask2] / r2_max[valid_mask2])**2
|
||||||
'3D_Vector': rms_3d,
|
rms_bh2 = np.sqrt(np.mean(terms2)) * 100
|
||||||
'X_Component': rms_x,
|
|
||||||
'Y_Component': rms_y,
|
|
||||||
'Z_Component': rms_z
|
|
||||||
}
|
|
||||||
|
|
||||||
# 获取 BH1 和 BH2 中的最大误差
|
# Final RMS is the maximum of BH1 and BH2
|
||||||
max_rms = {
|
rms_final = max(rms_bh1, rms_bh2)
|
||||||
'3D_Vector': max(results['BH1']['3D_Vector'], results['BH2']['3D_Vector']),
|
|
||||||
'X_Component': max(results['BH1']['X_Component'], results['BH2']['X_Component']),
|
return rms_final, None
|
||||||
'Y_Component': max(results['BH1']['Y_Component'], results['BH2']['Y_Component']),
|
|
||||||
'Z_Component': max(results['BH1']['Z_Component'], results['BH2']['Z_Component'])
|
|
||||||
}
|
|
||||||
|
|
||||||
return max_rms, None
|
|
||||||
|
|
||||||
def analyze_constraint_violation(constraint_data, n_levels=9):
|
def analyze_constraint_violation(constraint_data, n_levels=9):
|
||||||
"""
|
"""
|
||||||
@@ -142,32 +155,34 @@ def analyze_constraint_violation(constraint_data, n_levels=9):
|
|||||||
|
|
||||||
|
|
||||||
def print_header():
|
def print_header():
|
||||||
|
"""Print report header"""
|
||||||
print("\n" + Color.BLUE + Color.BOLD + "=" * 65 + Color.RESET)
|
print("\n" + Color.BLUE + Color.BOLD + "=" * 65 + Color.RESET)
|
||||||
print(Color.BOLD + " AMSS-NCKU GW150914 Comprehensive Regression Test" + Color.RESET)
|
print(Color.BOLD + " AMSS-NCKU GW150914 Simulation Regression Test Report" + Color.RESET)
|
||||||
print(Color.BLUE + Color.BOLD + "=" * 65 + Color.RESET)
|
print(Color.BLUE + Color.BOLD + "=" * 65 + Color.RESET)
|
||||||
|
|
||||||
def print_rms_results(rms_dict, error, threshold=1.0):
|
|
||||||
print(f"\n{Color.BOLD}1. RMS Error Analysis (Maximums of BH1 & BH2){Color.RESET}")
|
def print_rms_results(rms_rel, error, threshold=1.0):
|
||||||
print("-" * 65)
|
"""Print RMS error results"""
|
||||||
|
print(f"\n{Color.BOLD}1. RMS Error Analysis (Baseline vs Optimized){Color.RESET}")
|
||||||
|
print("-" * 45)
|
||||||
|
|
||||||
if error:
|
if error:
|
||||||
print(f" {Color.RED}Error: {error}{Color.RESET}")
|
print(f" {Color.RED}Error: {error}{Color.RESET}")
|
||||||
return False
|
return False
|
||||||
|
|
||||||
all_passed = True
|
passed = rms_rel < threshold
|
||||||
print(f" Requirement: < {threshold}%\n")
|
|
||||||
|
|
||||||
for key, val in rms_dict.items():
|
print(f" RMS relative error: {rms_rel:.4f}%")
|
||||||
passed = val < threshold
|
print(f" Requirement: < {threshold}%")
|
||||||
all_passed = all_passed and passed
|
print(f" Status: {get_status_text(passed)}")
|
||||||
status = get_status_text(passed)
|
|
||||||
print(f" {key:15}: {val:8.4f}% | Status: {status}")
|
return passed
|
||||||
|
|
||||||
return all_passed
|
|
||||||
|
|
||||||
def print_constraint_results(results, threshold=2.0):
|
def print_constraint_results(results, threshold=2.0):
|
||||||
|
"""Print constraint violation results"""
|
||||||
print(f"\n{Color.BOLD}2. ADM Constraint Violation Analysis (Grid Level 0){Color.RESET}")
|
print(f"\n{Color.BOLD}2. ADM Constraint Violation Analysis (Grid Level 0){Color.RESET}")
|
||||||
print("-" * 65)
|
print("-" * 45)
|
||||||
|
|
||||||
names = ['Ham', 'Px', 'Py', 'Pz', 'Gx', 'Gy', 'Gz']
|
names = ['Ham', 'Px', 'Py', 'Pz', 'Gx', 'Gy', 'Gz']
|
||||||
for i, name in enumerate(names):
|
for i, name in enumerate(names):
|
||||||
@@ -185,6 +200,7 @@ def print_constraint_results(results, threshold=2.0):
|
|||||||
|
|
||||||
|
|
||||||
def print_summary(rms_passed, constraint_passed):
|
def print_summary(rms_passed, constraint_passed):
|
||||||
|
"""Print summary"""
|
||||||
print("\n" + Color.BLUE + Color.BOLD + "=" * 65 + Color.RESET)
|
print("\n" + Color.BLUE + Color.BOLD + "=" * 65 + Color.RESET)
|
||||||
print(Color.BOLD + "Verification Summary" + Color.RESET)
|
print(Color.BOLD + "Verification Summary" + Color.RESET)
|
||||||
print(Color.BLUE + Color.BOLD + "=" * 65 + Color.RESET)
|
print(Color.BLUE + Color.BOLD + "=" * 65 + Color.RESET)
|
||||||
@@ -194,7 +210,7 @@ def print_summary(rms_passed, constraint_passed):
|
|||||||
res_rms = get_status_text(rms_passed)
|
res_rms = get_status_text(rms_passed)
|
||||||
res_con = get_status_text(constraint_passed)
|
res_con = get_status_text(constraint_passed)
|
||||||
|
|
||||||
print(f" [1] Comprehensive RMS check: {res_rms}")
|
print(f" [1] RMS trajectory check: {res_rms}")
|
||||||
print(f" [2] ADM constraint check: {res_con}")
|
print(f" [2] ADM constraint check: {res_con}")
|
||||||
|
|
||||||
final_status = f"{Color.GREEN}{Color.BOLD}ALL CHECKS PASSED{Color.RESET}" if all_passed else f"{Color.RED}{Color.BOLD}SOME CHECKS FAILED{Color.RESET}"
|
final_status = f"{Color.GREEN}{Color.BOLD}ALL CHECKS PASSED{Color.RESET}" if all_passed else f"{Color.RED}{Color.BOLD}SOME CHECKS FAILED{Color.RESET}"
|
||||||
@@ -203,48 +219,61 @@ def print_summary(rms_passed, constraint_passed):
|
|||||||
|
|
||||||
return all_passed
|
return all_passed
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
|
# Determine target (optimized) output directory
|
||||||
if len(sys.argv) > 1:
|
if len(sys.argv) > 1:
|
||||||
target_dir = sys.argv[1]
|
target_dir = sys.argv[1]
|
||||||
else:
|
else:
|
||||||
script_dir = os.path.dirname(os.path.abspath(__file__))
|
script_dir = os.path.dirname(os.path.abspath(__file__))
|
||||||
target_dir = os.path.join(script_dir, "GW150914/AMSS_NCKU_output")
|
target_dir = os.path.join(script_dir, "GW150914/AMSS_NCKU_output")
|
||||||
|
|
||||||
|
# Determine reference (baseline) directory
|
||||||
script_dir = os.path.dirname(os.path.abspath(__file__))
|
script_dir = os.path.dirname(os.path.abspath(__file__))
|
||||||
reference_dir = os.path.join(script_dir, "GW150914-origin/AMSS_NCKU_output")
|
reference_dir = os.path.join(script_dir, "GW150914-origin/AMSS_NCKU_output")
|
||||||
|
|
||||||
|
# Data file paths
|
||||||
bh_file_ref = os.path.join(reference_dir, "bssn_BH.dat")
|
bh_file_ref = os.path.join(reference_dir, "bssn_BH.dat")
|
||||||
bh_file_target = os.path.join(target_dir, "bssn_BH.dat")
|
bh_file_target = os.path.join(target_dir, "bssn_BH.dat")
|
||||||
constraint_file = os.path.join(target_dir, "bssn_constraint.dat")
|
constraint_file = os.path.join(target_dir, "bssn_constraint.dat")
|
||||||
|
|
||||||
|
# Check if files exist
|
||||||
if not os.path.exists(bh_file_ref):
|
if not os.path.exists(bh_file_ref):
|
||||||
print(f"{Color.RED}{Color.BOLD}Error:{Color.RESET} Baseline trajectory file not found: {bh_file_ref}")
|
print(f"{Color.RED}{Color.BOLD}Error:{Color.RESET} Baseline trajectory file not found: {bh_file_ref}")
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
if not os.path.exists(bh_file_target):
|
if not os.path.exists(bh_file_target):
|
||||||
print(f"{Color.RED}{Color.BOLD}Error:{Color.RESET} Target trajectory file not found: {bh_file_target}")
|
print(f"{Color.RED}{Color.BOLD}Error:{Color.RESET} Target trajectory file not found: {bh_file_target}")
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
if not os.path.exists(constraint_file):
|
if not os.path.exists(constraint_file):
|
||||||
print(f"{Color.RED}{Color.BOLD}Error:{Color.RESET} Constraint data file not found: {constraint_file}")
|
print(f"{Color.RED}{Color.BOLD}Error:{Color.RESET} Constraint data file not found: {constraint_file}")
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
|
# Print header
|
||||||
print_header()
|
print_header()
|
||||||
print(f"\n{Color.BOLD}Reference (Baseline):{Color.RESET} {Color.BLUE}{reference_dir}{Color.RESET}")
|
print(f"\n{Color.BOLD}Reference (Baseline):{Color.RESET} {Color.BLUE}{reference_dir}{Color.RESET}")
|
||||||
print(f"{Color.BOLD}Target (Optimized): {Color.RESET} {Color.BLUE}{target_dir}{Color.RESET}")
|
print(f"{Color.BOLD}Target (Optimized): {Color.RESET} {Color.BLUE}{target_dir}{Color.RESET}")
|
||||||
|
|
||||||
|
# Load data
|
||||||
bh_data_ref = load_bh_trajectory(bh_file_ref)
|
bh_data_ref = load_bh_trajectory(bh_file_ref)
|
||||||
bh_data_target = load_bh_trajectory(bh_file_target)
|
bh_data_target = load_bh_trajectory(bh_file_target)
|
||||||
constraint_data = load_constraint_data(constraint_file)
|
constraint_data = load_constraint_data(constraint_file)
|
||||||
|
|
||||||
# Output modified RMS results
|
# Calculate RMS error
|
||||||
rms_dict, error = calculate_all_rms_errors(bh_data_ref, bh_data_target)
|
rms_rel, error = calculate_rms_error(bh_data_ref, bh_data_target)
|
||||||
rms_passed = print_rms_results(rms_dict, error)
|
rms_passed = print_rms_results(rms_rel, error)
|
||||||
|
|
||||||
# Output constraint results
|
# Analyze constraint violation
|
||||||
constraint_results = analyze_constraint_violation(constraint_data)
|
constraint_results = analyze_constraint_violation(constraint_data)
|
||||||
constraint_passed = print_constraint_results(constraint_results)
|
constraint_passed = print_constraint_results(constraint_results)
|
||||||
|
|
||||||
|
# Print summary
|
||||||
all_passed = print_summary(rms_passed, constraint_passed)
|
all_passed = print_summary(rms_passed, constraint_passed)
|
||||||
|
|
||||||
|
# Return exit code
|
||||||
sys.exit(0 if all_passed else 1)
|
sys.exit(0 if all_passed else 1)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
main()
|
main()
|
||||||
|
|||||||
@@ -7,178 +7,12 @@
|
|||||||
#include <string>
|
#include <string>
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
#include <new>
|
#include <new>
|
||||||
#include <vector>
|
|
||||||
using namespace std;
|
using namespace std;
|
||||||
|
|
||||||
#include "misc.h"
|
#include "misc.h"
|
||||||
#include "MPatch.h"
|
#include "MPatch.h"
|
||||||
#include "Parallel.h"
|
#include "Parallel.h"
|
||||||
#include "fmisc.h"
|
#include "fmisc.h"
|
||||||
#ifdef INTERP_LB_PROFILE
|
|
||||||
#include "interp_lb_profile.h"
|
|
||||||
#endif
|
|
||||||
|
|
||||||
namespace
|
|
||||||
{
|
|
||||||
struct InterpBlockView
|
|
||||||
{
|
|
||||||
Block *bp;
|
|
||||||
double llb[dim];
|
|
||||||
double uub[dim];
|
|
||||||
};
|
|
||||||
|
|
||||||
struct BlockBinIndex
|
|
||||||
{
|
|
||||||
int bins[dim];
|
|
||||||
double lo[dim];
|
|
||||||
double inv[dim];
|
|
||||||
vector<InterpBlockView> views;
|
|
||||||
vector<vector<int>> bin_to_blocks;
|
|
||||||
bool valid;
|
|
||||||
|
|
||||||
BlockBinIndex() : valid(false)
|
|
||||||
{
|
|
||||||
for (int i = 0; i < dim; i++)
|
|
||||||
{
|
|
||||||
bins[i] = 1;
|
|
||||||
lo[i] = 0.0;
|
|
||||||
inv[i] = 0.0;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
inline int clamp_int(int v, int lo, int hi)
|
|
||||||
{
|
|
||||||
return (v < lo) ? lo : ((v > hi) ? hi : v);
|
|
||||||
}
|
|
||||||
|
|
||||||
inline int coord_to_bin(double x, double lo, double inv, int nb)
|
|
||||||
{
|
|
||||||
if (nb <= 1 || inv <= 0.0)
|
|
||||||
return 0;
|
|
||||||
int b = int(floor((x - lo) * inv));
|
|
||||||
return clamp_int(b, 0, nb - 1);
|
|
||||||
}
|
|
||||||
|
|
||||||
inline int bin_loc(const BlockBinIndex &index, int b0, int b1, int b2)
|
|
||||||
{
|
|
||||||
return b0 + index.bins[0] * (b1 + index.bins[1] * b2);
|
|
||||||
}
|
|
||||||
|
|
||||||
inline bool point_in_block_view(const InterpBlockView &view, const double *pox, const double *DH)
|
|
||||||
{
|
|
||||||
for (int i = 0; i < dim; i++)
|
|
||||||
{
|
|
||||||
if (pox[i] - view.llb[i] < -DH[i] / 2 || pox[i] - view.uub[i] > DH[i] / 2)
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
void build_block_bin_index(Patch *patch, const double *DH, BlockBinIndex &index)
|
|
||||||
{
|
|
||||||
index = BlockBinIndex();
|
|
||||||
|
|
||||||
MyList<Block> *Bp = patch->blb;
|
|
||||||
while (Bp)
|
|
||||||
{
|
|
||||||
Block *BP = Bp->data;
|
|
||||||
InterpBlockView view;
|
|
||||||
view.bp = BP;
|
|
||||||
for (int i = 0; i < dim; i++)
|
|
||||||
{
|
|
||||||
#ifdef Vertex
|
|
||||||
#ifdef Cell
|
|
||||||
#error Both Cell and Vertex are defined
|
|
||||||
#endif
|
|
||||||
view.llb[i] = (feq(BP->bbox[i], patch->bbox[i], DH[i] / 2)) ? BP->bbox[i] + patch->lli[i] * DH[i] : BP->bbox[i] + (ghost_width - 0.5) * DH[i];
|
|
||||||
view.uub[i] = (feq(BP->bbox[dim + i], patch->bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - patch->uui[i] * DH[i] : BP->bbox[dim + i] - (ghost_width - 0.5) * DH[i];
|
|
||||||
#else
|
|
||||||
#ifdef Cell
|
|
||||||
view.llb[i] = (feq(BP->bbox[i], patch->bbox[i], DH[i] / 2)) ? BP->bbox[i] + patch->lli[i] * DH[i] : BP->bbox[i] + ghost_width * DH[i];
|
|
||||||
view.uub[i] = (feq(BP->bbox[dim + i], patch->bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - patch->uui[i] * DH[i] : BP->bbox[dim + i] - ghost_width * DH[i];
|
|
||||||
#else
|
|
||||||
#error Not define Vertex nor Cell
|
|
||||||
#endif
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
index.views.push_back(view);
|
|
||||||
if (Bp == patch->ble)
|
|
||||||
break;
|
|
||||||
Bp = Bp->next;
|
|
||||||
}
|
|
||||||
|
|
||||||
const int nblocks = int(index.views.size());
|
|
||||||
if (nblocks <= 0)
|
|
||||||
return;
|
|
||||||
|
|
||||||
int bins_1d = int(ceil(pow(double(nblocks), 1.0 / 3.0)));
|
|
||||||
bins_1d = clamp_int(bins_1d, 1, 32);
|
|
||||||
for (int i = 0; i < dim; i++)
|
|
||||||
{
|
|
||||||
index.bins[i] = bins_1d;
|
|
||||||
index.lo[i] = patch->bbox[i] + patch->lli[i] * DH[i];
|
|
||||||
const double hi = patch->bbox[dim + i] - patch->uui[i] * DH[i];
|
|
||||||
if (hi > index.lo[i] && bins_1d > 1)
|
|
||||||
index.inv[i] = bins_1d / (hi - index.lo[i]);
|
|
||||||
else
|
|
||||||
index.inv[i] = 0.0;
|
|
||||||
}
|
|
||||||
|
|
||||||
index.bin_to_blocks.resize(index.bins[0] * index.bins[1] * index.bins[2]);
|
|
||||||
|
|
||||||
for (int bi = 0; bi < nblocks; bi++)
|
|
||||||
{
|
|
||||||
const InterpBlockView &view = index.views[bi];
|
|
||||||
int bmin[dim], bmax[dim];
|
|
||||||
for (int d = 0; d < dim; d++)
|
|
||||||
{
|
|
||||||
const double low = view.llb[d] - DH[d] / 2;
|
|
||||||
const double up = view.uub[d] + DH[d] / 2;
|
|
||||||
bmin[d] = coord_to_bin(low, index.lo[d], index.inv[d], index.bins[d]);
|
|
||||||
bmax[d] = coord_to_bin(up, index.lo[d], index.inv[d], index.bins[d]);
|
|
||||||
if (bmax[d] < bmin[d])
|
|
||||||
{
|
|
||||||
int t = bmin[d];
|
|
||||||
bmin[d] = bmax[d];
|
|
||||||
bmax[d] = t;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
for (int bz = bmin[2]; bz <= bmax[2]; bz++)
|
|
||||||
for (int by = bmin[1]; by <= bmax[1]; by++)
|
|
||||||
for (int bx = bmin[0]; bx <= bmax[0]; bx++)
|
|
||||||
index.bin_to_blocks[bin_loc(index, bx, by, bz)].push_back(bi);
|
|
||||||
}
|
|
||||||
|
|
||||||
index.valid = true;
|
|
||||||
}
|
|
||||||
|
|
||||||
int find_block_index_for_point(const BlockBinIndex &index, const double *pox, const double *DH)
|
|
||||||
{
|
|
||||||
if (!index.valid)
|
|
||||||
return -1;
|
|
||||||
|
|
||||||
const int bx = coord_to_bin(pox[0], index.lo[0], index.inv[0], index.bins[0]);
|
|
||||||
const int by = coord_to_bin(pox[1], index.lo[1], index.inv[1], index.bins[1]);
|
|
||||||
const int bz = coord_to_bin(pox[2], index.lo[2], index.inv[2], index.bins[2]);
|
|
||||||
const vector<int> &cand = index.bin_to_blocks[bin_loc(index, bx, by, bz)];
|
|
||||||
|
|
||||||
for (size_t ci = 0; ci < cand.size(); ci++)
|
|
||||||
{
|
|
||||||
const int bi = cand[ci];
|
|
||||||
if (point_in_block_view(index.views[bi], pox, DH))
|
|
||||||
return bi;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Fallback to full scan for numerical edge cases around bin boundaries.
|
|
||||||
for (size_t bi = 0; bi < index.views.size(); bi++)
|
|
||||||
if (point_in_block_view(index.views[bi], pox, DH))
|
|
||||||
return int(bi);
|
|
||||||
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
} // namespace
|
|
||||||
|
|
||||||
Patch::Patch(int DIM, int *shapei, double *bboxi, int levi, bool buflog, int Symmetry) : lev(levi)
|
Patch::Patch(int DIM, int *shapei, double *bboxi, int levi, bool buflog, int Symmetry) : lev(levi)
|
||||||
{
|
{
|
||||||
@@ -530,11 +364,9 @@ void Patch::Interp_Points(MyList<var> *VarList,
|
|||||||
for (int j = 0; j < NN; j++)
|
for (int j = 0; j < NN; j++)
|
||||||
owner_rank[j] = -1;
|
owner_rank[j] = -1;
|
||||||
|
|
||||||
double DH[dim];
|
double DH[dim], llb[dim], uub[dim];
|
||||||
for (int i = 0; i < dim; i++)
|
for (int i = 0; i < dim; i++)
|
||||||
DH[i] = getdX(i);
|
DH[i] = getdX(i);
|
||||||
BlockBinIndex block_index;
|
|
||||||
build_block_bin_index(this, DH, block_index);
|
|
||||||
|
|
||||||
for (int j = 0; j < NN; j++) // run along points
|
for (int j = 0; j < NN; j++) // run along points
|
||||||
{
|
{
|
||||||
@@ -557,24 +389,57 @@ void Patch::Interp_Points(MyList<var> *VarList,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const int block_i = find_block_index_for_point(block_index, pox, DH);
|
MyList<Block> *Bp = blb;
|
||||||
if (block_i >= 0)
|
bool notfind = true;
|
||||||
|
while (notfind && Bp) // run along Blocks
|
||||||
{
|
{
|
||||||
Block *BP = block_index.views[block_i].bp;
|
Block *BP = Bp->data;
|
||||||
owner_rank[j] = BP->rank;
|
|
||||||
if (myrank == BP->rank)
|
bool flag = true;
|
||||||
|
for (int i = 0; i < dim; i++)
|
||||||
{
|
{
|
||||||
//---> interpolation
|
#ifdef Vertex
|
||||||
varl = VarList;
|
#ifdef Cell
|
||||||
int k = 0;
|
#error Both Cell and Vertex are defined
|
||||||
while (varl) // run along variables
|
#endif
|
||||||
|
llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + (ghost_width - 0.5) * DH[i];
|
||||||
|
uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - (ghost_width - 0.5) * DH[i];
|
||||||
|
#else
|
||||||
|
#ifdef Cell
|
||||||
|
llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + ghost_width * DH[i];
|
||||||
|
uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - ghost_width * DH[i];
|
||||||
|
#else
|
||||||
|
#error Not define Vertex nor Cell
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
if (XX[i][j] - llb[i] < -DH[i] / 2 || XX[i][j] - uub[i] > DH[i] / 2)
|
||||||
{
|
{
|
||||||
f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
|
flag = false;
|
||||||
pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
|
break;
|
||||||
varl = varl->next;
|
|
||||||
k++;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (flag)
|
||||||
|
{
|
||||||
|
notfind = false;
|
||||||
|
owner_rank[j] = BP->rank;
|
||||||
|
if (myrank == BP->rank)
|
||||||
|
{
|
||||||
|
//---> interpolation
|
||||||
|
varl = VarList;
|
||||||
|
int k = 0;
|
||||||
|
while (varl) // run along variables
|
||||||
|
{
|
||||||
|
f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
|
||||||
|
pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
|
||||||
|
varl = varl->next;
|
||||||
|
k++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (Bp == ble)
|
||||||
|
break;
|
||||||
|
Bp = Bp->next;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -642,9 +507,6 @@ void Patch::Interp_Points(MyList<var> *VarList,
|
|||||||
// Targeted point-to-point overload: each owner sends each point only to
|
// Targeted point-to-point overload: each owner sends each point only to
|
||||||
// the one rank that needs it for integration (consumer), reducing
|
// the one rank that needs it for integration (consumer), reducing
|
||||||
// communication volume by ~nprocs times compared to the Bcast version.
|
// communication volume by ~nprocs times compared to the Bcast version.
|
||||||
#ifdef INTERP_LB_PROFILE
|
|
||||||
double t_interp_start = MPI_Wtime();
|
|
||||||
#endif
|
|
||||||
int myrank, nprocs;
|
int myrank, nprocs;
|
||||||
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
|
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
|
||||||
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
|
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
|
||||||
@@ -667,11 +529,9 @@ void Patch::Interp_Points(MyList<var> *VarList,
|
|||||||
for (int j = 0; j < NN; j++)
|
for (int j = 0; j < NN; j++)
|
||||||
owner_rank[j] = -1;
|
owner_rank[j] = -1;
|
||||||
|
|
||||||
double DH[dim];
|
double DH[dim], llb[dim], uub[dim];
|
||||||
for (int i = 0; i < dim; i++)
|
for (int i = 0; i < dim; i++)
|
||||||
DH[i] = getdX(i);
|
DH[i] = getdX(i);
|
||||||
BlockBinIndex block_index;
|
|
||||||
build_block_bin_index(this, DH, block_index);
|
|
||||||
|
|
||||||
// --- Interpolation phase (identical to original) ---
|
// --- Interpolation phase (identical to original) ---
|
||||||
for (int j = 0; j < NN; j++)
|
for (int j = 0; j < NN; j++)
|
||||||
@@ -695,31 +555,59 @@ void Patch::Interp_Points(MyList<var> *VarList,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const int block_i = find_block_index_for_point(block_index, pox, DH);
|
MyList<Block> *Bp = blb;
|
||||||
if (block_i >= 0)
|
bool notfind = true;
|
||||||
|
while (notfind && Bp)
|
||||||
{
|
{
|
||||||
Block *BP = block_index.views[block_i].bp;
|
Block *BP = Bp->data;
|
||||||
owner_rank[j] = BP->rank;
|
|
||||||
if (myrank == BP->rank)
|
bool flag = true;
|
||||||
|
for (int i = 0; i < dim; i++)
|
||||||
{
|
{
|
||||||
varl = VarList;
|
#ifdef Vertex
|
||||||
int k = 0;
|
#ifdef Cell
|
||||||
while (varl)
|
#error Both Cell and Vertex are defined
|
||||||
|
#endif
|
||||||
|
llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + (ghost_width - 0.5) * DH[i];
|
||||||
|
uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - (ghost_width - 0.5) * DH[i];
|
||||||
|
#else
|
||||||
|
#ifdef Cell
|
||||||
|
llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + ghost_width * DH[i];
|
||||||
|
uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - ghost_width * DH[i];
|
||||||
|
#else
|
||||||
|
#error Not define Vertex nor Cell
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
if (XX[i][j] - llb[i] < -DH[i] / 2 || XX[i][j] - uub[i] > DH[i] / 2)
|
||||||
{
|
{
|
||||||
f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
|
flag = false;
|
||||||
pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
|
break;
|
||||||
varl = varl->next;
|
|
||||||
k++;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (flag)
|
||||||
|
{
|
||||||
|
notfind = false;
|
||||||
|
owner_rank[j] = BP->rank;
|
||||||
|
if (myrank == BP->rank)
|
||||||
|
{
|
||||||
|
varl = VarList;
|
||||||
|
int k = 0;
|
||||||
|
while (varl)
|
||||||
|
{
|
||||||
|
f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
|
||||||
|
pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
|
||||||
|
varl = varl->next;
|
||||||
|
k++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (Bp == ble)
|
||||||
|
break;
|
||||||
|
Bp = Bp->next;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef INTERP_LB_PROFILE
|
|
||||||
double t_interp_end = MPI_Wtime();
|
|
||||||
double t_interp_local = t_interp_end - t_interp_start;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// --- Error check for unfound points ---
|
// --- Error check for unfound points ---
|
||||||
for (int j = 0; j < NN; j++)
|
for (int j = 0; j < NN; j++)
|
||||||
{
|
{
|
||||||
@@ -876,31 +764,6 @@ void Patch::Interp_Points(MyList<var> *VarList,
|
|||||||
delete[] recv_count;
|
delete[] recv_count;
|
||||||
delete[] consumer_rank;
|
delete[] consumer_rank;
|
||||||
delete[] owner_rank;
|
delete[] owner_rank;
|
||||||
|
|
||||||
#ifdef INTERP_LB_PROFILE
|
|
||||||
{
|
|
||||||
static bool profile_written = false;
|
|
||||||
if (!profile_written) {
|
|
||||||
double *all_times = nullptr;
|
|
||||||
if (myrank == 0) all_times = new double[nprocs];
|
|
||||||
MPI_Gather(&t_interp_local, 1, MPI_DOUBLE,
|
|
||||||
all_times, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
|
|
||||||
if (myrank == 0) {
|
|
||||||
int heavy[64];
|
|
||||||
int nh = InterpLBProfile::identify_heavy_ranks(
|
|
||||||
all_times, nprocs, 2.5, heavy, 64);
|
|
||||||
InterpLBProfile::write_profile(
|
|
||||||
"interp_lb_profile.bin", nprocs,
|
|
||||||
all_times, heavy, nh, 2.5);
|
|
||||||
printf("[InterpLB] Profile written: %d heavy ranks\n", nh);
|
|
||||||
for (int i = 0; i < nh; i++)
|
|
||||||
printf(" Heavy rank %d: %.6f s\n", heavy[i], all_times[heavy[i]]);
|
|
||||||
delete[] all_times;
|
|
||||||
}
|
|
||||||
profile_written = true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
void Patch::Interp_Points(MyList<var> *VarList,
|
void Patch::Interp_Points(MyList<var> *VarList,
|
||||||
int NN, double **XX,
|
int NN, double **XX,
|
||||||
@@ -934,11 +797,9 @@ void Patch::Interp_Points(MyList<var> *VarList,
|
|||||||
MPI_Comm_group(MPI_COMM_WORLD, &world_group);
|
MPI_Comm_group(MPI_COMM_WORLD, &world_group);
|
||||||
MPI_Comm_group(Comm_here, &local_group);
|
MPI_Comm_group(Comm_here, &local_group);
|
||||||
|
|
||||||
double DH[dim];
|
double DH[dim], llb[dim], uub[dim];
|
||||||
for (int i = 0; i < dim; i++)
|
for (int i = 0; i < dim; i++)
|
||||||
DH[i] = getdX(i);
|
DH[i] = getdX(i);
|
||||||
BlockBinIndex block_index;
|
|
||||||
build_block_bin_index(this, DH, block_index);
|
|
||||||
|
|
||||||
for (int j = 0; j < NN; j++) // run along points
|
for (int j = 0; j < NN; j++) // run along points
|
||||||
{
|
{
|
||||||
@@ -961,24 +822,57 @@ void Patch::Interp_Points(MyList<var> *VarList,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const int block_i = find_block_index_for_point(block_index, pox, DH);
|
MyList<Block> *Bp = blb;
|
||||||
if (block_i >= 0)
|
bool notfind = true;
|
||||||
|
while (notfind && Bp) // run along Blocks
|
||||||
{
|
{
|
||||||
Block *BP = block_index.views[block_i].bp;
|
Block *BP = Bp->data;
|
||||||
owner_rank[j] = BP->rank;
|
|
||||||
if (myrank == BP->rank)
|
bool flag = true;
|
||||||
|
for (int i = 0; i < dim; i++)
|
||||||
{
|
{
|
||||||
//---> interpolation
|
#ifdef Vertex
|
||||||
varl = VarList;
|
#ifdef Cell
|
||||||
int k = 0;
|
#error Both Cell and Vertex are defined
|
||||||
while (varl) // run along variables
|
#endif
|
||||||
|
llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + (ghost_width - 0.5) * DH[i];
|
||||||
|
uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - (ghost_width - 0.5) * DH[i];
|
||||||
|
#else
|
||||||
|
#ifdef Cell
|
||||||
|
llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + ghost_width * DH[i];
|
||||||
|
uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - ghost_width * DH[i];
|
||||||
|
#else
|
||||||
|
#error Not define Vertex nor Cell
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
if (XX[i][j] - llb[i] < -DH[i] / 2 || XX[i][j] - uub[i] > DH[i] / 2)
|
||||||
{
|
{
|
||||||
f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
|
flag = false;
|
||||||
pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
|
break;
|
||||||
varl = varl->next;
|
|
||||||
k++;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (flag)
|
||||||
|
{
|
||||||
|
notfind = false;
|
||||||
|
owner_rank[j] = BP->rank;
|
||||||
|
if (myrank == BP->rank)
|
||||||
|
{
|
||||||
|
//---> interpolation
|
||||||
|
varl = VarList;
|
||||||
|
int k = 0;
|
||||||
|
while (varl) // run along variables
|
||||||
|
{
|
||||||
|
f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
|
||||||
|
pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
|
||||||
|
varl = varl->next;
|
||||||
|
k++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (Bp == ble)
|
||||||
|
break;
|
||||||
|
Bp = Bp->next;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
@@ -32,16 +32,6 @@ namespace Parallel
|
|||||||
int partition2(int *nxy, int split_size, int *min_width, int cpusize, int *shape); // special for 2 diemnsions
|
int partition2(int *nxy, int split_size, int *min_width, int cpusize, int *shape); // special for 2 diemnsions
|
||||||
int partition3(int *nxyz, int split_size, int *min_width, int cpusize, int *shape);
|
int partition3(int *nxyz, int split_size, int *min_width, int cpusize, int *shape);
|
||||||
MyList<Block> *distribute(MyList<Patch> *PatchLIST, int cpusize, int ingfsi, int fngfs, bool periodic, int nodes = 0); // produce corresponding Blocks
|
MyList<Block> *distribute(MyList<Patch> *PatchLIST, int cpusize, int ingfsi, int fngfs, bool periodic, int nodes = 0); // produce corresponding Blocks
|
||||||
MyList<Block> *distribute_optimize(MyList<Patch> *PatchLIST, int cpusize, int ingfsi, int fngfs, bool periodic, int nodes = 0);
|
|
||||||
Block* splitHotspotBlock(MyList<Block>* &BlL, int _dim,
|
|
||||||
int ib0_orig, int ib3_orig,
|
|
||||||
int jb1_orig, int jb4_orig,
|
|
||||||
int kb2_orig, int kb5_orig,
|
|
||||||
Patch* PP, int r_left, int r_right,
|
|
||||||
int ingfsi, int fngfsi, bool periodic,
|
|
||||||
Block* &split_first_block, Block* &split_last_block);
|
|
||||||
Block* createMappedBlock(MyList<Block>* &BlL, int _dim, int* shape, double* bbox,
|
|
||||||
int block_id, int ingfsi, int fngfsi, int lev);
|
|
||||||
void KillBlocks(MyList<Patch> *PatchLIST);
|
void KillBlocks(MyList<Patch> *PatchLIST);
|
||||||
|
|
||||||
void setfunction(MyList<Block> *BlL, var *vn, double func(double x, double y, double z));
|
void setfunction(MyList<Block> *BlL, var *vn, double func(double x, double y, double z));
|
||||||
|
|||||||
@@ -321,22 +321,7 @@ void Z4c_class::Step(int lev, int YN)
|
|||||||
}
|
}
|
||||||
Pp = Pp->next;
|
Pp = Pp->next;
|
||||||
}
|
}
|
||||||
// check error information
|
// NOTE: error check deferred to after Shell Patch computation to reduce MPI_Allreduce calls
|
||||||
{
|
|
||||||
int erh = ERROR;
|
|
||||||
MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
|
|
||||||
}
|
|
||||||
if (ERROR)
|
|
||||||
{
|
|
||||||
Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev);
|
|
||||||
if (myrank == 0)
|
|
||||||
{
|
|
||||||
if (ErrorMonitor->outfile)
|
|
||||||
ErrorMonitor->outfile << "find NaN in state variables at t = " << PhysTime
|
|
||||||
<< ", lev = " << lev << endl;
|
|
||||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#ifdef WithShell
|
#ifdef WithShell
|
||||||
// evolve Shell Patches
|
// evolve Shell Patches
|
||||||
@@ -354,9 +339,9 @@ void Z4c_class::Step(int lev, int YN)
|
|||||||
{
|
{
|
||||||
#if (AGM == 0)
|
#if (AGM == 0)
|
||||||
f_enforce_ga(cg->shape,
|
f_enforce_ga(cg->shape,
|
||||||
cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn],
|
cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn],
|
||||||
cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
|
cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
|
||||||
cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn],
|
cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn],
|
||||||
cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn]);
|
cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn]);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
@@ -468,24 +453,16 @@ void Z4c_class::Step(int lev, int YN)
|
|||||||
sPp = sPp->next;
|
sPp = sPp->next;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// check error information
|
// Non-blocking error reduction overlapped with Sync to hide Allreduce latency
|
||||||
|
MPI_Request err_req_pre;
|
||||||
{
|
{
|
||||||
int erh = ERROR;
|
int erh = ERROR;
|
||||||
MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
|
MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req_pre);
|
||||||
}
|
|
||||||
if (ERROR)
|
|
||||||
{
|
|
||||||
SH->Dump_Data(StateList, 0, PhysTime, dT_lev);
|
|
||||||
if (myrank == 0)
|
|
||||||
{
|
|
||||||
if (ErrorMonitor->outfile)
|
|
||||||
ErrorMonitor->outfile << "find NaN in state variables on Shell Patches at t = " << PhysTime << endl;
|
|
||||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
Parallel::Sync(GH->PatL[lev], SynchList_pre, Symmetry);
|
Parallel::AsyncSyncState async_pre;
|
||||||
|
Parallel::Sync_start(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev], async_pre);
|
||||||
|
|
||||||
#ifdef WithShell
|
#ifdef WithShell
|
||||||
if (lev == 0)
|
if (lev == 0)
|
||||||
@@ -498,12 +475,30 @@ void Z4c_class::Step(int lev, int YN)
|
|||||||
{
|
{
|
||||||
prev_clock = curr_clock;
|
prev_clock = curr_clock;
|
||||||
curr_clock = clock();
|
curr_clock = clock();
|
||||||
cout << " Shell stuff synchronization used "
|
cout << " Shell stuff synchronization used "
|
||||||
<< (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC)
|
<< (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC)
|
||||||
<< " seconds! " << endl;
|
<< " seconds! " << endl;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry);
|
||||||
|
|
||||||
|
#ifdef WithShell
|
||||||
|
// Complete non-blocking error reduction and check
|
||||||
|
MPI_Wait(&err_req_pre, MPI_STATUS_IGNORE);
|
||||||
|
if (ERROR)
|
||||||
|
{
|
||||||
|
Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev);
|
||||||
|
SH->Dump_Data(StateList, 0, PhysTime, dT_lev);
|
||||||
|
if (myrank == 0)
|
||||||
|
{
|
||||||
|
if (ErrorMonitor->outfile)
|
||||||
|
ErrorMonitor->outfile << "find NaN in state variables at t = " << PhysTime
|
||||||
|
<< ", lev = " << lev << endl;
|
||||||
|
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
// for black hole position
|
// for black hole position
|
||||||
if (BH_num > 0 && lev == GH->levels - 1)
|
if (BH_num > 0 && lev == GH->levels - 1)
|
||||||
@@ -693,23 +688,7 @@ void Z4c_class::Step(int lev, int YN)
|
|||||||
Pp = Pp->next;
|
Pp = Pp->next;
|
||||||
}
|
}
|
||||||
|
|
||||||
// check error information
|
// NOTE: error check deferred to after Shell Patch computation to reduce MPI_Allreduce calls
|
||||||
{
|
|
||||||
int erh = ERROR;
|
|
||||||
MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
|
|
||||||
}
|
|
||||||
if (ERROR)
|
|
||||||
{
|
|
||||||
Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev);
|
|
||||||
if (myrank == 0)
|
|
||||||
{
|
|
||||||
if (ErrorMonitor->outfile)
|
|
||||||
ErrorMonitor->outfile << "find NaN in RK4 substep#" << iter_count
|
|
||||||
<< " variables at t = " << PhysTime
|
|
||||||
<< ", lev = " << lev << endl;
|
|
||||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#ifdef WithShell
|
#ifdef WithShell
|
||||||
// evolve Shell Patches
|
// evolve Shell Patches
|
||||||
@@ -850,25 +829,16 @@ void Z4c_class::Step(int lev, int YN)
|
|||||||
sPp = sPp->next;
|
sPp = sPp->next;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// check error information
|
// Non-blocking error reduction overlapped with Sync to hide Allreduce latency
|
||||||
|
MPI_Request err_req_cor;
|
||||||
{
|
{
|
||||||
int erh = ERROR;
|
int erh = ERROR;
|
||||||
MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
|
MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req_cor);
|
||||||
}
|
|
||||||
if (ERROR)
|
|
||||||
{
|
|
||||||
SH->Dump_Data(SynchList_pre, 0, PhysTime, dT_lev);
|
|
||||||
if (myrank == 0)
|
|
||||||
{
|
|
||||||
if (ErrorMonitor->outfile)
|
|
||||||
ErrorMonitor->outfile << "find NaN on Shell Patches in RK4 substep#" << iter_count
|
|
||||||
<< " variables at t = " << PhysTime << endl;
|
|
||||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
|
Parallel::AsyncSyncState async_cor;
|
||||||
|
Parallel::Sync_start(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev], async_cor);
|
||||||
|
|
||||||
#ifdef WithShell
|
#ifdef WithShell
|
||||||
if (lev == 0)
|
if (lev == 0)
|
||||||
@@ -881,11 +851,30 @@ void Z4c_class::Step(int lev, int YN)
|
|||||||
{
|
{
|
||||||
prev_clock = curr_clock;
|
prev_clock = curr_clock;
|
||||||
curr_clock = clock();
|
curr_clock = clock();
|
||||||
cout << " Shell stuff synchronization used "
|
cout << " Shell stuff synchronization used "
|
||||||
<< (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC)
|
<< (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC)
|
||||||
<< " seconds! " << endl;
|
<< " seconds! " << endl;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
Parallel::Sync_finish(sync_cache_cor[lev], async_cor, SynchList_cor, Symmetry);
|
||||||
|
|
||||||
|
#ifdef WithShell
|
||||||
|
// Complete non-blocking error reduction and check
|
||||||
|
MPI_Wait(&err_req_cor, MPI_STATUS_IGNORE);
|
||||||
|
if (ERROR)
|
||||||
|
{
|
||||||
|
Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev);
|
||||||
|
SH->Dump_Data(SynchList_pre, 0, PhysTime, dT_lev);
|
||||||
|
if (myrank == 0)
|
||||||
|
{
|
||||||
|
if (ErrorMonitor->outfile)
|
||||||
|
ErrorMonitor->outfile << "find NaN in RK4 substep#" << iter_count
|
||||||
|
<< " variables at t = " << PhysTime
|
||||||
|
<< ", lev = " << lev << endl;
|
||||||
|
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||||
|
}
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
// for black hole position
|
// for black hole position
|
||||||
if (BH_num > 0 && lev == GH->levels - 1)
|
if (BH_num > 0 && lev == GH->levels - 1)
|
||||||
@@ -1252,22 +1241,7 @@ void Z4c_class::Step(int lev, int YN)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
// check error information
|
// NOTE: error check deferred to after Shell Patch computation to reduce MPI_Allreduce calls
|
||||||
{
|
|
||||||
int erh = ERROR;
|
|
||||||
MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
|
|
||||||
}
|
|
||||||
if (ERROR)
|
|
||||||
{
|
|
||||||
Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev);
|
|
||||||
if (myrank == 0)
|
|
||||||
{
|
|
||||||
if (ErrorMonitor->outfile)
|
|
||||||
ErrorMonitor->outfile << "find NaN in state variables at t = " << PhysTime
|
|
||||||
<< ", lev = " << lev << endl;
|
|
||||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// evolve Shell Patches
|
// evolve Shell Patches
|
||||||
if (lev == 0)
|
if (lev == 0)
|
||||||
@@ -1542,23 +1516,15 @@ void Z4c_class::Step(int lev, int YN)
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
// check error information
|
// Non-blocking error reduction overlapped with Sync to hide Allreduce latency
|
||||||
|
MPI_Request err_req_pre;
|
||||||
{
|
{
|
||||||
int erh = ERROR;
|
int erh = ERROR;
|
||||||
MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
|
MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req_pre);
|
||||||
}
|
|
||||||
if (ERROR)
|
|
||||||
{
|
|
||||||
SH->Dump_Data(StateList, 0, PhysTime, dT_lev);
|
|
||||||
if (myrank == 0)
|
|
||||||
{
|
|
||||||
if (ErrorMonitor->outfile)
|
|
||||||
ErrorMonitor->outfile << "find NaN in state variables on Shell Patches at t = " << PhysTime << endl;
|
|
||||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
Parallel::Sync(GH->PatL[lev], SynchList_pre, Symmetry);
|
Parallel::AsyncSyncState async_pre;
|
||||||
|
Parallel::Sync_start(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev], async_pre);
|
||||||
|
|
||||||
if (lev == 0)
|
if (lev == 0)
|
||||||
{
|
{
|
||||||
@@ -1570,8 +1536,8 @@ void Z4c_class::Step(int lev, int YN)
|
|||||||
{
|
{
|
||||||
prev_clock = curr_clock;
|
prev_clock = curr_clock;
|
||||||
curr_clock = clock();
|
curr_clock = clock();
|
||||||
cout << " Shell stuff synchronization used "
|
cout << " Shell stuff synchronization used "
|
||||||
<< (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC)
|
<< (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC)
|
||||||
<< " seconds! " << endl;
|
<< " seconds! " << endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1620,6 +1586,22 @@ void Z4c_class::Step(int lev, int YN)
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry);
|
||||||
|
|
||||||
|
// Complete non-blocking error reduction and check
|
||||||
|
MPI_Wait(&err_req_pre, MPI_STATUS_IGNORE);
|
||||||
|
if (ERROR)
|
||||||
|
{
|
||||||
|
Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev);
|
||||||
|
SH->Dump_Data(StateList, 0, PhysTime, dT_lev);
|
||||||
|
if (myrank == 0)
|
||||||
|
{
|
||||||
|
if (ErrorMonitor->outfile)
|
||||||
|
ErrorMonitor->outfile << "find NaN in state variables at t = " << PhysTime
|
||||||
|
<< ", lev = " << lev << endl;
|
||||||
|
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// for black hole position
|
// for black hole position
|
||||||
if (BH_num > 0 && lev == GH->levels - 1)
|
if (BH_num > 0 && lev == GH->levels - 1)
|
||||||
@@ -1841,23 +1823,7 @@ void Z4c_class::Step(int lev, int YN)
|
|||||||
Pp = Pp->next;
|
Pp = Pp->next;
|
||||||
}
|
}
|
||||||
|
|
||||||
// check error information
|
// NOTE: error check deferred to after Shell Patch computation to reduce MPI_Allreduce calls
|
||||||
{
|
|
||||||
int erh = ERROR;
|
|
||||||
MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
|
|
||||||
}
|
|
||||||
if (ERROR)
|
|
||||||
{
|
|
||||||
Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev);
|
|
||||||
if (myrank == 0)
|
|
||||||
{
|
|
||||||
if (ErrorMonitor->outfile)
|
|
||||||
ErrorMonitor->outfile << "find NaN in RK4 substep#" << iter_count
|
|
||||||
<< " variables at t = " << PhysTime
|
|
||||||
<< ", lev = " << lev << endl;
|
|
||||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// evolve Shell Patches
|
// evolve Shell Patches
|
||||||
if (lev == 0)
|
if (lev == 0)
|
||||||
@@ -2103,24 +2069,15 @@ void Z4c_class::Step(int lev, int YN)
|
|||||||
sPp = sPp->next;
|
sPp = sPp->next;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// check error information
|
// Non-blocking error reduction overlapped with Sync to hide Allreduce latency
|
||||||
|
MPI_Request err_req_cor;
|
||||||
{
|
{
|
||||||
int erh = ERROR;
|
int erh = ERROR;
|
||||||
MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
|
MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req_cor);
|
||||||
}
|
|
||||||
if (ERROR)
|
|
||||||
{
|
|
||||||
SH->Dump_Data(SynchList_pre, 0, PhysTime, dT_lev);
|
|
||||||
if (myrank == 0)
|
|
||||||
{
|
|
||||||
if (ErrorMonitor->outfile)
|
|
||||||
ErrorMonitor->outfile << "find NaN on Shell Patches in RK4 substep#" << iter_count
|
|
||||||
<< " variables at t = " << PhysTime << endl;
|
|
||||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
|
Parallel::AsyncSyncState async_cor;
|
||||||
|
Parallel::Sync_start(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev], async_cor);
|
||||||
|
|
||||||
if (lev == 0)
|
if (lev == 0)
|
||||||
{
|
{
|
||||||
@@ -2132,8 +2089,8 @@ void Z4c_class::Step(int lev, int YN)
|
|||||||
{
|
{
|
||||||
prev_clock = curr_clock;
|
prev_clock = curr_clock;
|
||||||
curr_clock = clock();
|
curr_clock = clock();
|
||||||
cout << " Shell stuff synchronization used "
|
cout << " Shell stuff synchronization used "
|
||||||
<< (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC)
|
<< (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC)
|
||||||
<< " seconds! " << endl;
|
<< " seconds! " << endl;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -2170,6 +2127,23 @@ void Z4c_class::Step(int lev, int YN)
|
|||||||
}
|
}
|
||||||
// end smooth
|
// end smooth
|
||||||
#endif
|
#endif
|
||||||
|
Parallel::Sync_finish(sync_cache_cor[lev], async_cor, SynchList_cor, Symmetry);
|
||||||
|
|
||||||
|
// Complete non-blocking error reduction and check
|
||||||
|
MPI_Wait(&err_req_cor, MPI_STATUS_IGNORE);
|
||||||
|
if (ERROR)
|
||||||
|
{
|
||||||
|
Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev);
|
||||||
|
SH->Dump_Data(SynchList_pre, 0, PhysTime, dT_lev);
|
||||||
|
if (myrank == 0)
|
||||||
|
{
|
||||||
|
if (ErrorMonitor->outfile)
|
||||||
|
ErrorMonitor->outfile << "find NaN in RK4 substep#" << iter_count
|
||||||
|
<< " variables at t = " << PhysTime
|
||||||
|
<< ", lev = " << lev << endl;
|
||||||
|
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// for black hole position
|
// for black hole position
|
||||||
if (BH_num > 0 && lev == GH->levels - 1)
|
if (BH_num > 0 && lev == GH->levels - 1)
|
||||||
|
|||||||
@@ -2426,9 +2426,9 @@ void bssn_class::RecursiveStep(int lev)
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if (REGLEV == 0)
|
#if (REGLEV == 0)
|
||||||
if (GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0,
|
GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0,
|
||||||
SynchList_cor, OldStateList, StateList, SynchList_pre,
|
SynchList_cor, OldStateList, StateList, SynchList_pre,
|
||||||
fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor))
|
fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor);
|
||||||
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
|
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
@@ -2605,9 +2605,9 @@ void bssn_class::ParallelStep()
|
|||||||
delete[] tporg;
|
delete[] tporg;
|
||||||
delete[] tporgo;
|
delete[] tporgo;
|
||||||
#if (REGLEV == 0)
|
#if (REGLEV == 0)
|
||||||
if (GH->Regrid_Onelevel(GH->mylev, Symmetry, BH_num, Porgbr, Porg0,
|
GH->Regrid_Onelevel(GH->mylev, Symmetry, BH_num, Porgbr, Porg0,
|
||||||
SynchList_cor, OldStateList, StateList, SynchList_pre,
|
SynchList_cor, OldStateList, StateList, SynchList_pre,
|
||||||
fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor))
|
fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor);
|
||||||
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
|
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
@@ -2772,9 +2772,9 @@ void bssn_class::ParallelStep()
|
|||||||
if (lev + 1 >= GH->movls)
|
if (lev + 1 >= GH->movls)
|
||||||
{
|
{
|
||||||
// GH->Regrid_Onelevel_aux(lev,Symmetry,BH_num,Porgbr,Porg0,
|
// GH->Regrid_Onelevel_aux(lev,Symmetry,BH_num,Porgbr,Porg0,
|
||||||
if (GH->Regrid_Onelevel(lev + 1, Symmetry, BH_num, Porgbr, Porg0,
|
GH->Regrid_Onelevel(lev + 1, Symmetry, BH_num, Porgbr, Porg0,
|
||||||
SynchList_cor, OldStateList, StateList, SynchList_pre,
|
SynchList_cor, OldStateList, StateList, SynchList_pre,
|
||||||
fgt(PhysTime - dT_levp1, StartTime, dT_levp1 / 2), ErrorMonitor))
|
fgt(PhysTime - dT_levp1, StartTime, dT_levp1 / 2), ErrorMonitor);
|
||||||
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
|
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
|
||||||
|
|
||||||
// a_stream.clear();
|
// a_stream.clear();
|
||||||
@@ -2787,9 +2787,9 @@ void bssn_class::ParallelStep()
|
|||||||
// for this level
|
// for this level
|
||||||
if (YN == 1)
|
if (YN == 1)
|
||||||
{
|
{
|
||||||
if (GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0,
|
GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0,
|
||||||
SynchList_cor, OldStateList, StateList, SynchList_pre,
|
SynchList_cor, OldStateList, StateList, SynchList_pre,
|
||||||
fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor))
|
fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor);
|
||||||
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
|
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
|
||||||
|
|
||||||
// a_stream.clear();
|
// a_stream.clear();
|
||||||
@@ -2806,9 +2806,9 @@ void bssn_class::ParallelStep()
|
|||||||
if (YN == 1)
|
if (YN == 1)
|
||||||
{
|
{
|
||||||
// GH->Regrid_Onelevel_aux(lev-2,Symmetry,BH_num,Porgbr,Porg0,
|
// GH->Regrid_Onelevel_aux(lev-2,Symmetry,BH_num,Porgbr,Porg0,
|
||||||
if (GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0,
|
GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0,
|
||||||
SynchList_cor, OldStateList, StateList, SynchList_pre,
|
SynchList_cor, OldStateList, StateList, SynchList_pre,
|
||||||
fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor))
|
fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor);
|
||||||
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
|
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
|
||||||
|
|
||||||
// a_stream.clear();
|
// a_stream.clear();
|
||||||
@@ -2822,9 +2822,9 @@ void bssn_class::ParallelStep()
|
|||||||
if (i % 4 == 3)
|
if (i % 4 == 3)
|
||||||
{
|
{
|
||||||
// GH->Regrid_Onelevel_aux(lev-2,Symmetry,BH_num,Porgbr,Porg0,
|
// GH->Regrid_Onelevel_aux(lev-2,Symmetry,BH_num,Porgbr,Porg0,
|
||||||
if (GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0,
|
GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0,
|
||||||
SynchList_cor, OldStateList, StateList, SynchList_pre,
|
SynchList_cor, OldStateList, StateList, SynchList_pre,
|
||||||
fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor))
|
fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor);
|
||||||
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
|
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
|
||||||
|
|
||||||
// a_stream.clear();
|
// a_stream.clear();
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -1,36 +0,0 @@
|
|||||||
#ifndef BSSN_RHS_CUDA_H
|
|
||||||
#define BSSN_RHS_CUDA_H
|
|
||||||
|
|
||||||
#ifdef __cplusplus
|
|
||||||
extern "C" {
|
|
||||||
#endif
|
|
||||||
|
|
||||||
int f_compute_rhs_bssn(int *ex, double &T,
|
|
||||||
double *X, double *Y, double *Z,
|
|
||||||
double *chi, double *trK,
|
|
||||||
double *dxx, double *gxy, double *gxz, double *dyy, double *gyz, double *dzz,
|
|
||||||
double *Axx, double *Axy, double *Axz, double *Ayy, double *Ayz, double *Azz,
|
|
||||||
double *Gamx, double *Gamy, double *Gamz,
|
|
||||||
double *Lap, double *betax, double *betay, double *betaz,
|
|
||||||
double *dtSfx, double *dtSfy, double *dtSfz,
|
|
||||||
double *chi_rhs, double *trK_rhs,
|
|
||||||
double *gxx_rhs, double *gxy_rhs, double *gxz_rhs, double *gyy_rhs, double *gyz_rhs, double *gzz_rhs,
|
|
||||||
double *Axx_rhs, double *Axy_rhs, double *Axz_rhs, double *Ayy_rhs, double *Ayz_rhs, double *Azz_rhs,
|
|
||||||
double *Gamx_rhs, double *Gamy_rhs, double *Gamz_rhs,
|
|
||||||
double *Lap_rhs, double *betax_rhs, double *betay_rhs, double *betaz_rhs,
|
|
||||||
double *dtSfx_rhs, double *dtSfy_rhs, double *dtSfz_rhs,
|
|
||||||
double *rho, double *Sx, double *Sy, double *Sz,
|
|
||||||
double *Sxx, double *Sxy, double *Sxz, double *Syy, double *Syz, double *Szz,
|
|
||||||
double *Gamxxx, double *Gamxxy, double *Gamxxz, double *Gamxyy, double *Gamxyz, double *Gamxzz,
|
|
||||||
double *Gamyxx, double *Gamyxy, double *Gamyxz, double *Gamyyy, double *Gamyyz, double *Gamyzz,
|
|
||||||
double *Gamzxx, double *Gamzxy, double *Gamzxz, double *Gamzyy, double *Gamzyz, double *Gamzzz,
|
|
||||||
double *Rxx, double *Rxy, double *Rxz, double *Ryy, double *Ryz, double *Rzz,
|
|
||||||
double *ham_Res, double *movx_Res, double *movy_Res, double *movz_Res,
|
|
||||||
double *Gmx_Res, double *Gmy_Res, double *Gmz_Res,
|
|
||||||
int &Symmetry, int &Lev, double &eps, int &co);
|
|
||||||
|
|
||||||
#ifdef __cplusplus
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#endif
|
|
||||||
@@ -130,11 +130,7 @@ void cgh::compose_cgh(int nprocs)
|
|||||||
for (int lev = 0; lev < levels; lev++)
|
for (int lev = 0; lev < levels; lev++)
|
||||||
{
|
{
|
||||||
checkPatchList(PatL[lev], false);
|
checkPatchList(PatL[lev], false);
|
||||||
#ifdef INTERP_LB_OPTIMIZE
|
|
||||||
Parallel::distribute_optimize(PatL[lev], nprocs, ingfs, fngfs, false);
|
|
||||||
#else
|
|
||||||
Parallel::distribute(PatL[lev], nprocs, ingfs, fngfs, false);
|
Parallel::distribute(PatL[lev], nprocs, ingfs, fngfs, false);
|
||||||
#endif
|
|
||||||
#if (RPB == 1)
|
#if (RPB == 1)
|
||||||
// we need distributed box of PatL[lev] and PatL[lev-1]
|
// we need distributed box of PatL[lev] and PatL[lev-1]
|
||||||
if (lev > 0)
|
if (lev > 0)
|
||||||
@@ -1305,13 +1301,13 @@ bool cgh::Interp_One_Point(MyList<var> *VarList,
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
bool cgh::Regrid_Onelevel(int lev, int Symmetry, int BH_num, double **Porgbr, double **Porg0,
|
void cgh::Regrid_Onelevel(int lev, int Symmetry, int BH_num, double **Porgbr, double **Porg0,
|
||||||
MyList<var> *OldList, MyList<var> *StateList,
|
MyList<var> *OldList, MyList<var> *StateList,
|
||||||
MyList<var> *FutureList, MyList<var> *tmList, bool BB,
|
MyList<var> *FutureList, MyList<var> *tmList, bool BB,
|
||||||
monitor *ErrorMonitor)
|
monitor *ErrorMonitor)
|
||||||
{
|
{
|
||||||
if (lev < movls)
|
if (lev < movls)
|
||||||
return false;
|
return;
|
||||||
|
|
||||||
#if (0)
|
#if (0)
|
||||||
// #if (PSTR == 1 || PSTR == 2)
|
// #if (PSTR == 1 || PSTR == 2)
|
||||||
@@ -1400,7 +1396,7 @@ bool cgh::Regrid_Onelevel(int lev, int Symmetry, int BH_num, double **Porgbr, do
|
|||||||
for (bhi = 0; bhi < BH_num; bhi++)
|
for (bhi = 0; bhi < BH_num; bhi++)
|
||||||
delete[] tmpPorg[bhi];
|
delete[] tmpPorg[bhi];
|
||||||
delete[] tmpPorg;
|
delete[] tmpPorg;
|
||||||
return false;
|
return;
|
||||||
}
|
}
|
||||||
// x direction
|
// x direction
|
||||||
rr = (Porg0[bhi][0] - handle[lev][grd][0]) / dX;
|
rr = (Porg0[bhi][0] - handle[lev][grd][0]) / dX;
|
||||||
@@ -1504,7 +1500,6 @@ bool cgh::Regrid_Onelevel(int lev, int Symmetry, int BH_num, double **Porgbr, do
|
|||||||
for (int bhi = 0; bhi < BH_num; bhi++)
|
for (int bhi = 0; bhi < BH_num; bhi++)
|
||||||
delete[] tmpPorg[bhi];
|
delete[] tmpPorg[bhi];
|
||||||
delete[] tmpPorg;
|
delete[] tmpPorg;
|
||||||
return tot_flag;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -74,7 +74,7 @@ public:
|
|||||||
MyList<var> *OldList, MyList<var> *StateList,
|
MyList<var> *OldList, MyList<var> *StateList,
|
||||||
MyList<var> *FutureList, MyList<var> *tmList,
|
MyList<var> *FutureList, MyList<var> *tmList,
|
||||||
int Symmetry, bool BB);
|
int Symmetry, bool BB);
|
||||||
bool Regrid_Onelevel(int lev, int Symmetry, int BH_num, double **Porgbr, double **Porg0,
|
void Regrid_Onelevel(int lev, int Symmetry, int BH_num, double **Porgbr, double **Porg0,
|
||||||
MyList<var> *OldList, MyList<var> *StateList,
|
MyList<var> *OldList, MyList<var> *StateList,
|
||||||
MyList<var> *FutureList, MyList<var> *tmList, bool BB,
|
MyList<var> *FutureList, MyList<var> *tmList, bool BB,
|
||||||
monitor *ErrorMonitor);
|
monitor *ErrorMonitor);
|
||||||
|
|||||||
@@ -69,12 +69,10 @@
|
|||||||
fy = ZEO
|
fy = ZEO
|
||||||
fz = ZEO
|
fz = ZEO
|
||||||
|
|
||||||
!DIR$ SIMD VECTORLENGTHFOR(KNOWN_INTEGER=8)
|
|
||||||
!DIR$ UNROLL PARTIAL(4)
|
|
||||||
do k=1,ex(3)-1
|
do k=1,ex(3)-1
|
||||||
do j=1,ex(2)-1
|
do j=1,ex(2)-1
|
||||||
do i=1,ex(1)-1
|
do i=1,ex(1)-1
|
||||||
! x direction
|
! x direction
|
||||||
if(i+1 <= imax .and. i-1 >= imin)then
|
if(i+1 <= imax .and. i-1 >= imin)then
|
||||||
!
|
!
|
||||||
! - f(i-1) + f(i+1)
|
! - f(i-1) + f(i+1)
|
||||||
@@ -373,8 +371,6 @@
|
|||||||
fxz = ZEO
|
fxz = ZEO
|
||||||
fyz = ZEO
|
fyz = ZEO
|
||||||
|
|
||||||
!DIR$ SIMD VECTORLENGTHFOR(KNOWN_INTEGER=8)
|
|
||||||
!DIR$ UNROLL PARTIAL(4)
|
|
||||||
do k=1,ex(3)-1
|
do k=1,ex(3)-1
|
||||||
do j=1,ex(2)-1
|
do j=1,ex(2)-1
|
||||||
do i=1,ex(1)-1
|
do i=1,ex(1)-1
|
||||||
|
|||||||
@@ -1,332 +0,0 @@
|
|||||||
#include "tool.h"
|
|
||||||
void fdderivs(const int ex[3],
|
|
||||||
const double *f,
|
|
||||||
double *fxx, double *fxy, double *fxz,
|
|
||||||
double *fyy, double *fyz, double *fzz,
|
|
||||||
const double *X, const double *Y, const double *Z,
|
|
||||||
double SYM1, double SYM2, double SYM3,
|
|
||||||
int Symmetry, int onoff)
|
|
||||||
{
|
|
||||||
(void)onoff;
|
|
||||||
|
|
||||||
const int NO_SYMM = 0, EQ_SYMM = 1;
|
|
||||||
const double ZEO = 0.0, ONE = 1.0, TWO = 2.0;
|
|
||||||
const double F1o4 = 2.5e-1; // 1/4
|
|
||||||
const double F8 = 8.0;
|
|
||||||
const double F16 = 16.0;
|
|
||||||
const double F30 = 30.0;
|
|
||||||
const double F1o12 = ONE / 12.0;
|
|
||||||
const double F1o144 = ONE / 144.0;
|
|
||||||
|
|
||||||
const int ex1 = ex[0], ex2 = ex[1], ex3 = ex[2];
|
|
||||||
|
|
||||||
const double dX = X[1] - X[0];
|
|
||||||
const double dY = Y[1] - Y[0];
|
|
||||||
const double dZ = Z[1] - Z[0];
|
|
||||||
|
|
||||||
const int imaxF = ex1;
|
|
||||||
const int jmaxF = ex2;
|
|
||||||
const int kmaxF = ex3;
|
|
||||||
|
|
||||||
int iminF = 1, jminF = 1, kminF = 1;
|
|
||||||
if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -1;
|
|
||||||
if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -1;
|
|
||||||
if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -1;
|
|
||||||
|
|
||||||
const double SoA[3] = { SYM1, SYM2, SYM3 };
|
|
||||||
|
|
||||||
/* fh: (ex1+2)*(ex2+2)*(ex3+2) because ord=2 */
|
|
||||||
const size_t nx = (size_t)ex1 + 2;
|
|
||||||
const size_t ny = (size_t)ex2 + 2;
|
|
||||||
const size_t nz = (size_t)ex3 + 2;
|
|
||||||
const size_t fh_size = nx * ny * nz;
|
|
||||||
|
|
||||||
static double *fh = NULL;
|
|
||||||
static size_t cap = 0;
|
|
||||||
|
|
||||||
if (fh_size > cap) {
|
|
||||||
free(fh);
|
|
||||||
fh = (double*)aligned_alloc(64, fh_size * sizeof(double));
|
|
||||||
cap = fh_size;
|
|
||||||
}
|
|
||||||
// double *fh = (double*)malloc(fh_size * sizeof(double));
|
|
||||||
if (!fh) return;
|
|
||||||
|
|
||||||
symmetry_bd(2, ex, f, fh, SoA);
|
|
||||||
|
|
||||||
/* 系数:按 Fortran 原式 */
|
|
||||||
const double Sdxdx = ONE / (dX * dX);
|
|
||||||
const double Sdydy = ONE / (dY * dY);
|
|
||||||
const double Sdzdz = ONE / (dZ * dZ);
|
|
||||||
|
|
||||||
const double Fdxdx = F1o12 / (dX * dX);
|
|
||||||
const double Fdydy = F1o12 / (dY * dY);
|
|
||||||
const double Fdzdz = F1o12 / (dZ * dZ);
|
|
||||||
|
|
||||||
const double Sdxdy = F1o4 / (dX * dY);
|
|
||||||
const double Sdxdz = F1o4 / (dX * dZ);
|
|
||||||
const double Sdydz = F1o4 / (dY * dZ);
|
|
||||||
|
|
||||||
const double Fdxdy = F1o144 / (dX * dY);
|
|
||||||
const double Fdxdz = F1o144 / (dX * dZ);
|
|
||||||
const double Fdydz = F1o144 / (dY * dZ);
|
|
||||||
|
|
||||||
/* 只清零不被主循环覆盖的边界面 */
|
|
||||||
{
|
|
||||||
/* 高边界:k0=ex3-1 */
|
|
||||||
for (int j0 = 0; j0 < ex2; ++j0)
|
|
||||||
for (int i0 = 0; i0 < ex1; ++i0) {
|
|
||||||
const size_t p = idx_ex(i0, j0, ex3 - 1, ex);
|
|
||||||
fxx[p]=ZEO; fyy[p]=ZEO; fzz[p]=ZEO;
|
|
||||||
fxy[p]=ZEO; fxz[p]=ZEO; fyz[p]=ZEO;
|
|
||||||
}
|
|
||||||
/* 高边界:j0=ex2-1 */
|
|
||||||
for (int k0 = 0; k0 < ex3 - 1; ++k0)
|
|
||||||
for (int i0 = 0; i0 < ex1; ++i0) {
|
|
||||||
const size_t p = idx_ex(i0, ex2 - 1, k0, ex);
|
|
||||||
fxx[p]=ZEO; fyy[p]=ZEO; fzz[p]=ZEO;
|
|
||||||
fxy[p]=ZEO; fxz[p]=ZEO; fyz[p]=ZEO;
|
|
||||||
}
|
|
||||||
/* 高边界:i0=ex1-1 */
|
|
||||||
for (int k0 = 0; k0 < ex3 - 1; ++k0)
|
|
||||||
for (int j0 = 0; j0 < ex2 - 1; ++j0) {
|
|
||||||
const size_t p = idx_ex(ex1 - 1, j0, k0, ex);
|
|
||||||
fxx[p]=ZEO; fyy[p]=ZEO; fzz[p]=ZEO;
|
|
||||||
fxy[p]=ZEO; fxz[p]=ZEO; fyz[p]=ZEO;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* 低边界:当二阶模板也不可用时,对应 i0/j0/k0=0 面 */
|
|
||||||
if (kminF == 1) {
|
|
||||||
for (int j0 = 0; j0 < ex2; ++j0)
|
|
||||||
for (int i0 = 0; i0 < ex1; ++i0) {
|
|
||||||
const size_t p = idx_ex(i0, j0, 0, ex);
|
|
||||||
fxx[p]=ZEO; fyy[p]=ZEO; fzz[p]=ZEO;
|
|
||||||
fxy[p]=ZEO; fxz[p]=ZEO; fyz[p]=ZEO;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (jminF == 1) {
|
|
||||||
for (int k0 = 0; k0 < ex3; ++k0)
|
|
||||||
for (int i0 = 0; i0 < ex1; ++i0) {
|
|
||||||
const size_t p = idx_ex(i0, 0, k0, ex);
|
|
||||||
fxx[p]=ZEO; fyy[p]=ZEO; fzz[p]=ZEO;
|
|
||||||
fxy[p]=ZEO; fxz[p]=ZEO; fyz[p]=ZEO;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (iminF == 1) {
|
|
||||||
for (int k0 = 0; k0 < ex3; ++k0)
|
|
||||||
for (int j0 = 0; j0 < ex2; ++j0) {
|
|
||||||
const size_t p = idx_ex(0, j0, k0, ex);
|
|
||||||
fxx[p]=ZEO; fyy[p]=ZEO; fzz[p]=ZEO;
|
|
||||||
fxy[p]=ZEO; fxz[p]=ZEO; fyz[p]=ZEO;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* 两段式:
|
|
||||||
* 1) 二阶可用区域先计算二阶模板
|
|
||||||
* 2) 高阶可用区域再覆盖四阶模板
|
|
||||||
*/
|
|
||||||
const int i2_lo = (iminF > 0) ? iminF : 0;
|
|
||||||
const int j2_lo = (jminF > 0) ? jminF : 0;
|
|
||||||
const int k2_lo = (kminF > 0) ? kminF : 0;
|
|
||||||
const int i2_hi = ex1 - 2;
|
|
||||||
const int j2_hi = ex2 - 2;
|
|
||||||
const int k2_hi = ex3 - 2;
|
|
||||||
|
|
||||||
const int i4_lo = (iminF + 1 > 0) ? (iminF + 1) : 0;
|
|
||||||
const int j4_lo = (jminF + 1 > 0) ? (jminF + 1) : 0;
|
|
||||||
const int k4_lo = (kminF + 1 > 0) ? (kminF + 1) : 0;
|
|
||||||
const int i4_hi = ex1 - 3;
|
|
||||||
const int j4_hi = ex2 - 3;
|
|
||||||
const int k4_hi = ex3 - 3;
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Strategy A:
|
|
||||||
* Avoid redundant work in overlap of 2nd/4th-order regions.
|
|
||||||
* Only compute 2nd-order on shell points that are NOT overwritten by
|
|
||||||
* the 4th-order pass.
|
|
||||||
*/
|
|
||||||
const int has4 = (i4_lo <= i4_hi && j4_lo <= j4_hi && k4_lo <= k4_hi);
|
|
||||||
|
|
||||||
if (i2_lo <= i2_hi && j2_lo <= j2_hi && k2_lo <= k2_hi) {
|
|
||||||
for (int k0 = k2_lo; k0 <= k2_hi; ++k0) {
|
|
||||||
const int kF = k0 + 1;
|
|
||||||
for (int j0 = j2_lo; j0 <= j2_hi; ++j0) {
|
|
||||||
const int jF = j0 + 1;
|
|
||||||
for (int i0 = i2_lo; i0 <= i2_hi; ++i0) {
|
|
||||||
if (has4 &&
|
|
||||||
i0 >= i4_lo && i0 <= i4_hi &&
|
|
||||||
j0 >= j4_lo && j0 <= j4_hi &&
|
|
||||||
k0 >= k4_lo && k0 <= k4_hi) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
const int iF = i0 + 1;
|
|
||||||
const size_t p = idx_ex(i0, j0, k0, ex);
|
|
||||||
|
|
||||||
fxx[p] = Sdxdx * (
|
|
||||||
fh[idx_fh_F_ord2(iF - 1, jF, kF, ex)] -
|
|
||||||
TWO * fh[idx_fh_F_ord2(iF, jF, kF, ex)] +
|
|
||||||
fh[idx_fh_F_ord2(iF + 1, jF, kF, ex)]
|
|
||||||
);
|
|
||||||
|
|
||||||
fyy[p] = Sdydy * (
|
|
||||||
fh[idx_fh_F_ord2(iF, jF - 1, kF, ex)] -
|
|
||||||
TWO * fh[idx_fh_F_ord2(iF, jF, kF, ex)] +
|
|
||||||
fh[idx_fh_F_ord2(iF, jF + 1, kF, ex)]
|
|
||||||
);
|
|
||||||
|
|
||||||
fzz[p] = Sdzdz * (
|
|
||||||
fh[idx_fh_F_ord2(iF, jF, kF - 1, ex)] -
|
|
||||||
TWO * fh[idx_fh_F_ord2(iF, jF, kF, ex)] +
|
|
||||||
fh[idx_fh_F_ord2(iF, jF, kF + 1, ex)]
|
|
||||||
);
|
|
||||||
|
|
||||||
fxy[p] = Sdxdy * (
|
|
||||||
fh[idx_fh_F_ord2(iF - 1, jF - 1, kF, ex)] -
|
|
||||||
fh[idx_fh_F_ord2(iF + 1, jF - 1, kF, ex)] -
|
|
||||||
fh[idx_fh_F_ord2(iF - 1, jF + 1, kF, ex)] +
|
|
||||||
fh[idx_fh_F_ord2(iF + 1, jF + 1, kF, ex)]
|
|
||||||
);
|
|
||||||
|
|
||||||
fxz[p] = Sdxdz * (
|
|
||||||
fh[idx_fh_F_ord2(iF - 1, jF, kF - 1, ex)] -
|
|
||||||
fh[idx_fh_F_ord2(iF + 1, jF, kF - 1, ex)] -
|
|
||||||
fh[idx_fh_F_ord2(iF - 1, jF, kF + 1, ex)] +
|
|
||||||
fh[idx_fh_F_ord2(iF + 1, jF, kF + 1, ex)]
|
|
||||||
);
|
|
||||||
|
|
||||||
fyz[p] = Sdydz * (
|
|
||||||
fh[idx_fh_F_ord2(iF, jF - 1, kF - 1, ex)] -
|
|
||||||
fh[idx_fh_F_ord2(iF, jF + 1, kF - 1, ex)] -
|
|
||||||
fh[idx_fh_F_ord2(iF, jF - 1, kF + 1, ex)] +
|
|
||||||
fh[idx_fh_F_ord2(iF, jF + 1, kF + 1, ex)]
|
|
||||||
);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (has4) {
|
|
||||||
for (int k0 = k4_lo; k0 <= k4_hi; ++k0) {
|
|
||||||
const int kF = k0 + 1;
|
|
||||||
for (int j0 = j4_lo; j0 <= j4_hi; ++j0) {
|
|
||||||
const int jF = j0 + 1;
|
|
||||||
for (int i0 = i4_lo; i0 <= i4_hi; ++i0) {
|
|
||||||
const int iF = i0 + 1;
|
|
||||||
const size_t p = idx_ex(i0, j0, k0, ex);
|
|
||||||
|
|
||||||
fxx[p] = Fdxdx * (
|
|
||||||
-fh[idx_fh_F_ord2(iF - 2, jF, kF, ex)] +
|
|
||||||
F16 * fh[idx_fh_F_ord2(iF - 1, jF, kF, ex)] -
|
|
||||||
F30 * fh[idx_fh_F_ord2(iF, jF, kF, ex)] -
|
|
||||||
fh[idx_fh_F_ord2(iF + 2, jF, kF, ex)] +
|
|
||||||
F16 * fh[idx_fh_F_ord2(iF + 1, jF, kF, ex)]
|
|
||||||
);
|
|
||||||
|
|
||||||
fyy[p] = Fdydy * (
|
|
||||||
-fh[idx_fh_F_ord2(iF, jF - 2, kF, ex)] +
|
|
||||||
F16 * fh[idx_fh_F_ord2(iF, jF - 1, kF, ex)] -
|
|
||||||
F30 * fh[idx_fh_F_ord2(iF, jF, kF, ex)] -
|
|
||||||
fh[idx_fh_F_ord2(iF, jF + 2, kF, ex)] +
|
|
||||||
F16 * fh[idx_fh_F_ord2(iF, jF + 1, kF, ex)]
|
|
||||||
);
|
|
||||||
|
|
||||||
fzz[p] = Fdzdz * (
|
|
||||||
-fh[idx_fh_F_ord2(iF, jF, kF - 2, ex)] +
|
|
||||||
F16 * fh[idx_fh_F_ord2(iF, jF, kF - 1, ex)] -
|
|
||||||
F30 * fh[idx_fh_F_ord2(iF, jF, kF, ex)] -
|
|
||||||
fh[idx_fh_F_ord2(iF, jF, kF + 2, ex)] +
|
|
||||||
F16 * fh[idx_fh_F_ord2(iF, jF, kF + 1, ex)]
|
|
||||||
);
|
|
||||||
|
|
||||||
{
|
|
||||||
const double t_jm2 =
|
|
||||||
( fh[idx_fh_F_ord2(iF - 2, jF - 2, kF, ex)]
|
|
||||||
-F8*fh[idx_fh_F_ord2(iF - 1, jF - 2, kF, ex)]
|
|
||||||
+F8*fh[idx_fh_F_ord2(iF + 1, jF - 2, kF, ex)]
|
|
||||||
- fh[idx_fh_F_ord2(iF + 2, jF - 2, kF, ex)] );
|
|
||||||
|
|
||||||
const double t_jm1 =
|
|
||||||
( fh[idx_fh_F_ord2(iF - 2, jF - 1, kF, ex)]
|
|
||||||
-F8*fh[idx_fh_F_ord2(iF - 1, jF - 1, kF, ex)]
|
|
||||||
+F8*fh[idx_fh_F_ord2(iF + 1, jF - 1, kF, ex)]
|
|
||||||
- fh[idx_fh_F_ord2(iF + 2, jF - 1, kF, ex)] );
|
|
||||||
|
|
||||||
const double t_jp1 =
|
|
||||||
( fh[idx_fh_F_ord2(iF - 2, jF + 1, kF, ex)]
|
|
||||||
-F8*fh[idx_fh_F_ord2(iF - 1, jF + 1, kF, ex)]
|
|
||||||
+F8*fh[idx_fh_F_ord2(iF + 1, jF + 1, kF, ex)]
|
|
||||||
- fh[idx_fh_F_ord2(iF + 2, jF + 1, kF, ex)] );
|
|
||||||
|
|
||||||
const double t_jp2 =
|
|
||||||
( fh[idx_fh_F_ord2(iF - 2, jF + 2, kF, ex)]
|
|
||||||
-F8*fh[idx_fh_F_ord2(iF - 1, jF + 2, kF, ex)]
|
|
||||||
+F8*fh[idx_fh_F_ord2(iF + 1, jF + 2, kF, ex)]
|
|
||||||
- fh[idx_fh_F_ord2(iF + 2, jF + 2, kF, ex)] );
|
|
||||||
|
|
||||||
fxy[p] = Fdxdy * ( t_jm2 - F8 * t_jm1 + F8 * t_jp1 - t_jp2 );
|
|
||||||
}
|
|
||||||
|
|
||||||
{
|
|
||||||
const double t_km2 =
|
|
||||||
( fh[idx_fh_F_ord2(iF - 2, jF, kF - 2, ex)]
|
|
||||||
-F8*fh[idx_fh_F_ord2(iF - 1, jF, kF - 2, ex)]
|
|
||||||
+F8*fh[idx_fh_F_ord2(iF + 1, jF, kF - 2, ex)]
|
|
||||||
- fh[idx_fh_F_ord2(iF + 2, jF, kF - 2, ex)] );
|
|
||||||
|
|
||||||
const double t_km1 =
|
|
||||||
( fh[idx_fh_F_ord2(iF - 2, jF, kF - 1, ex)]
|
|
||||||
-F8*fh[idx_fh_F_ord2(iF - 1, jF, kF - 1, ex)]
|
|
||||||
+F8*fh[idx_fh_F_ord2(iF + 1, jF, kF - 1, ex)]
|
|
||||||
- fh[idx_fh_F_ord2(iF + 2, jF, kF - 1, ex)] );
|
|
||||||
|
|
||||||
const double t_kp1 =
|
|
||||||
( fh[idx_fh_F_ord2(iF - 2, jF, kF + 1, ex)]
|
|
||||||
-F8*fh[idx_fh_F_ord2(iF - 1, jF, kF + 1, ex)]
|
|
||||||
+F8*fh[idx_fh_F_ord2(iF + 1, jF, kF + 1, ex)]
|
|
||||||
- fh[idx_fh_F_ord2(iF + 2, jF, kF + 1, ex)] );
|
|
||||||
|
|
||||||
const double t_kp2 =
|
|
||||||
( fh[idx_fh_F_ord2(iF - 2, jF, kF + 2, ex)]
|
|
||||||
-F8*fh[idx_fh_F_ord2(iF - 1, jF, kF + 2, ex)]
|
|
||||||
+F8*fh[idx_fh_F_ord2(iF + 1, jF, kF + 2, ex)]
|
|
||||||
- fh[idx_fh_F_ord2(iF + 2, jF, kF + 2, ex)] );
|
|
||||||
|
|
||||||
fxz[p] = Fdxdz * ( t_km2 - F8 * t_km1 + F8 * t_kp1 - t_kp2 );
|
|
||||||
}
|
|
||||||
|
|
||||||
{
|
|
||||||
const double t_km2 =
|
|
||||||
( fh[idx_fh_F_ord2(iF, jF - 2, kF - 2, ex)]
|
|
||||||
-F8*fh[idx_fh_F_ord2(iF, jF - 1, kF - 2, ex)]
|
|
||||||
+F8*fh[idx_fh_F_ord2(iF, jF + 1, kF - 2, ex)]
|
|
||||||
- fh[idx_fh_F_ord2(iF, jF + 2, kF - 2, ex)] );
|
|
||||||
|
|
||||||
const double t_km1 =
|
|
||||||
( fh[idx_fh_F_ord2(iF, jF - 2, kF - 1, ex)]
|
|
||||||
-F8*fh[idx_fh_F_ord2(iF, jF - 1, kF - 1, ex)]
|
|
||||||
+F8*fh[idx_fh_F_ord2(iF, jF + 1, kF - 1, ex)]
|
|
||||||
- fh[idx_fh_F_ord2(iF, jF + 2, kF - 1, ex)] );
|
|
||||||
|
|
||||||
const double t_kp1 =
|
|
||||||
( fh[idx_fh_F_ord2(iF, jF - 2, kF + 1, ex)]
|
|
||||||
-F8*fh[idx_fh_F_ord2(iF, jF - 1, kF + 1, ex)]
|
|
||||||
+F8*fh[idx_fh_F_ord2(iF, jF + 1, kF + 1, ex)]
|
|
||||||
- fh[idx_fh_F_ord2(iF, jF + 2, kF + 1, ex)] );
|
|
||||||
|
|
||||||
const double t_kp2 =
|
|
||||||
( fh[idx_fh_F_ord2(iF, jF - 2, kF + 2, ex)]
|
|
||||||
-F8*fh[idx_fh_F_ord2(iF, jF - 1, kF + 2, ex)]
|
|
||||||
+F8*fh[idx_fh_F_ord2(iF, jF + 1, kF + 2, ex)]
|
|
||||||
- fh[idx_fh_F_ord2(iF, jF + 2, kF + 2, ex)] );
|
|
||||||
|
|
||||||
fyz[p] = Fdydz * ( t_km2 - F8 * t_km1 + F8 * t_kp1 - t_kp2 );
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// free(fh);
|
|
||||||
}
|
|
||||||
@@ -1,150 +0,0 @@
|
|||||||
#include "tool.h"
|
|
||||||
|
|
||||||
/*
|
|
||||||
* C 版 fderivs
|
|
||||||
*
|
|
||||||
* Fortran:
|
|
||||||
* subroutine fderivs(ex,f,fx,fy,fz,X,Y,Z,SYM1,SYM2,SYM3,symmetry,onoff)
|
|
||||||
*
|
|
||||||
* 约定:
|
|
||||||
* f, fx, fy, fz: ex1*ex2*ex3,按 idx_ex 布局
|
|
||||||
* X: ex1, Y: ex2, Z: ex3
|
|
||||||
*/
|
|
||||||
void fderivs(const int ex[3],
|
|
||||||
const double *f,
|
|
||||||
double *fx, double *fy, double *fz,
|
|
||||||
const double *X, const double *Y, const double *Z,
|
|
||||||
double SYM1, double SYM2, double SYM3,
|
|
||||||
int Symmetry, int onoff)
|
|
||||||
{
|
|
||||||
(void)onoff; // Fortran 里没用到
|
|
||||||
|
|
||||||
const double ZEO = 0.0, ONE = 1.0;
|
|
||||||
const double TWO = 2.0, EIT = 8.0;
|
|
||||||
const double F12 = 12.0;
|
|
||||||
|
|
||||||
const int NO_SYMM = 0, EQ_SYMM = 1; // OCTANT=2 在本子程序里不直接用
|
|
||||||
|
|
||||||
const int ex1 = ex[0], ex2 = ex[1], ex3 = ex[2];
|
|
||||||
|
|
||||||
// dX = X(2)-X(1) -> C: X[1]-X[0]
|
|
||||||
const double dX = X[1] - X[0];
|
|
||||||
const double dY = Y[1] - Y[0];
|
|
||||||
const double dZ = Z[1] - Z[0];
|
|
||||||
|
|
||||||
// Fortran 1-based bounds
|
|
||||||
const int imaxF = ex1;
|
|
||||||
const int jmaxF = ex2;
|
|
||||||
const int kmaxF = ex3;
|
|
||||||
|
|
||||||
int iminF = 1, jminF = 1, kminF = 1;
|
|
||||||
if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -1;
|
|
||||||
if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -1;
|
|
||||||
if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -1;
|
|
||||||
|
|
||||||
// SoA(1:3) = SYM1,SYM2,SYM3
|
|
||||||
const double SoA[3] = { SYM1, SYM2, SYM3 };
|
|
||||||
|
|
||||||
// fh: (ex1+2)*(ex2+2)*(ex3+2) because ord=2
|
|
||||||
const size_t nx = (size_t)ex1 + 2;
|
|
||||||
const size_t ny = (size_t)ex2 + 2;
|
|
||||||
const size_t nz = (size_t)ex3 + 2;
|
|
||||||
const size_t fh_size = nx * ny * nz;
|
|
||||||
static double *fh = NULL;
|
|
||||||
static size_t cap = 0;
|
|
||||||
|
|
||||||
if (fh_size > cap) {
|
|
||||||
free(fh);
|
|
||||||
fh = (double*)aligned_alloc(64, fh_size * sizeof(double));
|
|
||||||
cap = fh_size;
|
|
||||||
}
|
|
||||||
// double *fh = (double*)malloc(fh_size * sizeof(double));
|
|
||||||
if (!fh) return;
|
|
||||||
|
|
||||||
// call symmetry_bd(2,ex,f,fh,SoA)
|
|
||||||
symmetry_bd(2, ex, f, fh, SoA);
|
|
||||||
|
|
||||||
const double d12dx = ONE / F12 / dX;
|
|
||||||
const double d12dy = ONE / F12 / dY;
|
|
||||||
const double d12dz = ONE / F12 / dZ;
|
|
||||||
|
|
||||||
const double d2dx = ONE / TWO / dX;
|
|
||||||
const double d2dy = ONE / TWO / dY;
|
|
||||||
const double d2dz = ONE / TWO / dZ;
|
|
||||||
|
|
||||||
// fx = fy = fz = 0
|
|
||||||
const size_t all = (size_t)ex1 * (size_t)ex2 * (size_t)ex3;
|
|
||||||
for (size_t p = 0; p < all; ++p) {
|
|
||||||
fx[p] = ZEO;
|
|
||||||
fy[p] = ZEO;
|
|
||||||
fz[p] = ZEO;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Fortran loops:
|
|
||||||
* do k=1,ex3-1
|
|
||||||
* do j=1,ex2-1
|
|
||||||
* do i=1,ex1-1
|
|
||||||
*
|
|
||||||
* C: k0=0..ex3-2, j0=0..ex2-2, i0=0..ex1-2
|
|
||||||
*/
|
|
||||||
for (int k0 = 0; k0 <= ex3 - 2; ++k0) {
|
|
||||||
const int kF = k0 + 1;
|
|
||||||
for (int j0 = 0; j0 <= ex2 - 2; ++j0) {
|
|
||||||
const int jF = j0 + 1;
|
|
||||||
for (int i0 = 0; i0 <= ex1 - 2; ++i0) {
|
|
||||||
const int iF = i0 + 1;
|
|
||||||
const size_t p = idx_ex(i0, j0, k0, ex);
|
|
||||||
|
|
||||||
// if(i+2 <= imax .and. i-2 >= imin ... ) (全是 Fortran 索引)
|
|
||||||
if ((iF + 2) <= imaxF && (iF - 2) >= iminF &&
|
|
||||||
(jF + 2) <= jmaxF && (jF - 2) >= jminF &&
|
|
||||||
(kF + 2) <= kmaxF && (kF - 2) >= kminF)
|
|
||||||
{
|
|
||||||
fx[p] = d12dx * (
|
|
||||||
fh[idx_fh_F_ord2(iF - 2, jF, kF, ex)] -
|
|
||||||
EIT * fh[idx_fh_F_ord2(iF - 1, jF, kF, ex)] +
|
|
||||||
EIT * fh[idx_fh_F_ord2(iF + 1, jF, kF, ex)] -
|
|
||||||
fh[idx_fh_F_ord2(iF + 2, jF, kF, ex)]
|
|
||||||
);
|
|
||||||
|
|
||||||
fy[p] = d12dy * (
|
|
||||||
fh[idx_fh_F_ord2(iF, jF - 2, kF, ex)] -
|
|
||||||
EIT * fh[idx_fh_F_ord2(iF, jF - 1, kF, ex)] +
|
|
||||||
EIT * fh[idx_fh_F_ord2(iF, jF + 1, kF, ex)] -
|
|
||||||
fh[idx_fh_F_ord2(iF, jF + 2, kF, ex)]
|
|
||||||
);
|
|
||||||
|
|
||||||
fz[p] = d12dz * (
|
|
||||||
fh[idx_fh_F_ord2(iF, jF, kF - 2, ex)] -
|
|
||||||
EIT * fh[idx_fh_F_ord2(iF, jF, kF - 1, ex)] +
|
|
||||||
EIT * fh[idx_fh_F_ord2(iF, jF, kF + 1, ex)] -
|
|
||||||
fh[idx_fh_F_ord2(iF, jF, kF + 2, ex)]
|
|
||||||
);
|
|
||||||
}
|
|
||||||
// elseif(i+1 <= imax .and. i-1 >= imin ...)
|
|
||||||
else if ((iF + 1) <= imaxF && (iF - 1) >= iminF &&
|
|
||||||
(jF + 1) <= jmaxF && (jF - 1) >= jminF &&
|
|
||||||
(kF + 1) <= kmaxF && (kF - 1) >= kminF)
|
|
||||||
{
|
|
||||||
fx[p] = d2dx * (
|
|
||||||
-fh[idx_fh_F_ord2(iF - 1, jF, kF, ex)] +
|
|
||||||
fh[idx_fh_F_ord2(iF + 1, jF, kF, ex)]
|
|
||||||
);
|
|
||||||
|
|
||||||
fy[p] = d2dy * (
|
|
||||||
-fh[idx_fh_F_ord2(iF, jF - 1, kF, ex)] +
|
|
||||||
fh[idx_fh_F_ord2(iF, jF + 1, kF, ex)]
|
|
||||||
);
|
|
||||||
|
|
||||||
fz[p] = d2dz * (
|
|
||||||
-fh[idx_fh_F_ord2(iF, jF, kF - 1, ex)] +
|
|
||||||
fh[idx_fh_F_ord2(iF, jF, kF + 1, ex)]
|
|
||||||
);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// free(fh);
|
|
||||||
}
|
|
||||||
@@ -883,17 +883,13 @@ subroutine symmetry_bd(ord,extc,func,funcc,SoA)
|
|||||||
|
|
||||||
integer::i
|
integer::i
|
||||||
|
|
||||||
!DIR$ SIMD VECTORLENGTHFOR(KNOWN_INTEGER=8)
|
|
||||||
funcc(1:extc(1),1:extc(2),1:extc(3)) = func
|
funcc(1:extc(1),1:extc(2),1:extc(3)) = func
|
||||||
!DIR$ SIMD VECTORLENGTHFOR(KNOWN_INTEGER=8)
|
|
||||||
do i=0,ord-1
|
do i=0,ord-1
|
||||||
funcc(-i,1:extc(2),1:extc(3)) = funcc(i+1,1:extc(2),1:extc(3))*SoA(1)
|
funcc(-i,1:extc(2),1:extc(3)) = funcc(i+1,1:extc(2),1:extc(3))*SoA(1)
|
||||||
enddo
|
enddo
|
||||||
!DIR$ SIMD VECTORLENGTHFOR(KNOWN_INTEGER=8)
|
|
||||||
do i=0,ord-1
|
do i=0,ord-1
|
||||||
funcc(:,-i,1:extc(3)) = funcc(:,i+1,1:extc(3))*SoA(2)
|
funcc(:,-i,1:extc(3)) = funcc(:,i+1,1:extc(3))*SoA(2)
|
||||||
enddo
|
enddo
|
||||||
!DIR$ SIMD VECTORLENGTHFOR(KNOWN_INTEGER=8)
|
|
||||||
do i=0,ord-1
|
do i=0,ord-1
|
||||||
funcc(:,:,-i) = funcc(:,:,i+1)*SoA(3)
|
funcc(:,:,-i) = funcc(:,:,i+1)*SoA(3)
|
||||||
enddo
|
enddo
|
||||||
@@ -1111,177 +1107,26 @@ end subroutine d2dump
|
|||||||
!~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
!~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
!~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
!~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
!~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
!~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
! common code for cell and vertex
|
! common code for cell and vertex
|
||||||
!------------------------------------------------------------------------------
|
!------------------------------------------------------------------------------
|
||||||
! Lagrangian polynomial interpolation
|
! Lagrangian polynomial interpolation
|
||||||
!------------------------------------------------------------------------------
|
!------------------------------------------------------------------------------
|
||||||
#ifndef POLINT6_USE_BARYCENTRIC
|
|
||||||
#define POLINT6_USE_BARYCENTRIC 1
|
subroutine polint(xa, ya, x, y, dy, ordn)
|
||||||
#endif
|
implicit none
|
||||||
|
|
||||||
!DIR$ ATTRIBUTES FORCEINLINE :: polint6_neville
|
integer, intent(in) :: ordn
|
||||||
subroutine polint6_neville(xa, ya, x, y, dy)
|
|
||||||
implicit none
|
|
||||||
|
|
||||||
real*8, dimension(6), intent(in) :: xa, ya
|
|
||||||
real*8, intent(in) :: x
|
|
||||||
real*8, intent(out) :: y, dy
|
|
||||||
|
|
||||||
integer :: i, m, ns, n_m
|
|
||||||
real*8, dimension(6) :: c, d, ho
|
|
||||||
real*8 :: dif, dift, hp, h, den_val
|
|
||||||
|
|
||||||
c = ya
|
|
||||||
d = ya
|
|
||||||
ho = xa - x
|
|
||||||
|
|
||||||
ns = 1
|
|
||||||
dif = abs(x - xa(1))
|
|
||||||
|
|
||||||
do i = 2, 6
|
|
||||||
dift = abs(x - xa(i))
|
|
||||||
if (dift < dif) then
|
|
||||||
ns = i
|
|
||||||
dif = dift
|
|
||||||
end if
|
|
||||||
end do
|
|
||||||
|
|
||||||
y = ya(ns)
|
|
||||||
ns = ns - 1
|
|
||||||
|
|
||||||
do m = 1, 5
|
|
||||||
n_m = 6 - m
|
|
||||||
do i = 1, n_m
|
|
||||||
hp = ho(i)
|
|
||||||
h = ho(i+m)
|
|
||||||
den_val = hp - h
|
|
||||||
|
|
||||||
if (den_val == 0.0d0) then
|
|
||||||
write(*,*) 'failure in polint for point',x
|
|
||||||
write(*,*) 'with input points: ',xa
|
|
||||||
stop
|
|
||||||
end if
|
|
||||||
|
|
||||||
den_val = (c(i+1) - d(i)) / den_val
|
|
||||||
|
|
||||||
d(i) = h * den_val
|
|
||||||
c(i) = hp * den_val
|
|
||||||
end do
|
|
||||||
|
|
||||||
if (2 * ns < n_m) then
|
|
||||||
dy = c(ns + 1)
|
|
||||||
else
|
|
||||||
dy = d(ns)
|
|
||||||
ns = ns - 1
|
|
||||||
end if
|
|
||||||
y = y + dy
|
|
||||||
end do
|
|
||||||
|
|
||||||
return
|
|
||||||
end subroutine polint6_neville
|
|
||||||
|
|
||||||
!DIR$ ATTRIBUTES FORCEINLINE :: polint6_barycentric
|
|
||||||
subroutine polint6_barycentric(xa, ya, x, y, dy)
|
|
||||||
implicit none
|
|
||||||
|
|
||||||
real*8, dimension(6), intent(in) :: xa, ya
|
|
||||||
real*8, intent(in) :: x
|
|
||||||
real*8, intent(out) :: y, dy
|
|
||||||
|
|
||||||
integer :: i, j
|
|
||||||
logical :: is_uniform
|
|
||||||
real*8, dimension(6) :: lambda
|
|
||||||
real*8 :: dx, den_i, term, num, den, step, tol
|
|
||||||
real*8, parameter :: c_uniform(6) = (/ -1.d0, 5.d0, -10.d0, 10.d0, -5.d0, 1.d0 /)
|
|
||||||
|
|
||||||
do i = 1, 6
|
|
||||||
if (x == xa(i)) then
|
|
||||||
y = ya(i)
|
|
||||||
dy = 0.d0
|
|
||||||
return
|
|
||||||
end if
|
|
||||||
end do
|
|
||||||
|
|
||||||
step = xa(2) - xa(1)
|
|
||||||
is_uniform = (step /= 0.d0)
|
|
||||||
if (is_uniform) then
|
|
||||||
tol = 64.d0 * epsilon(1.d0) * max(1.d0, abs(step))
|
|
||||||
do i = 3, 6
|
|
||||||
if (abs((xa(i) - xa(i-1)) - step) > tol) then
|
|
||||||
is_uniform = .false.
|
|
||||||
exit
|
|
||||||
end if
|
|
||||||
end do
|
|
||||||
end if
|
|
||||||
|
|
||||||
if (is_uniform) then
|
|
||||||
num = 0.d0
|
|
||||||
den = 0.d0
|
|
||||||
do i = 1, 6
|
|
||||||
term = c_uniform(i) / (x - xa(i))
|
|
||||||
num = num + term * ya(i)
|
|
||||||
den = den + term
|
|
||||||
end do
|
|
||||||
y = num / den
|
|
||||||
dy = 0.d0
|
|
||||||
return
|
|
||||||
end if
|
|
||||||
|
|
||||||
do i = 1, 6
|
|
||||||
den_i = 1.d0
|
|
||||||
do j = 1, 6
|
|
||||||
if (j /= i) then
|
|
||||||
dx = xa(i) - xa(j)
|
|
||||||
if (dx == 0.0d0) then
|
|
||||||
write(*,*) 'failure in polint for point',x
|
|
||||||
write(*,*) 'with input points: ',xa
|
|
||||||
stop
|
|
||||||
end if
|
|
||||||
den_i = den_i * dx
|
|
||||||
end if
|
|
||||||
end do
|
|
||||||
lambda(i) = 1.d0 / den_i
|
|
||||||
end do
|
|
||||||
|
|
||||||
num = 0.d0
|
|
||||||
den = 0.d0
|
|
||||||
do i = 1, 6
|
|
||||||
term = lambda(i) / (x - xa(i))
|
|
||||||
num = num + term * ya(i)
|
|
||||||
den = den + term
|
|
||||||
end do
|
|
||||||
|
|
||||||
y = num / den
|
|
||||||
dy = 0.d0
|
|
||||||
|
|
||||||
return
|
|
||||||
end subroutine polint6_barycentric
|
|
||||||
|
|
||||||
!DIR$ ATTRIBUTES FORCEINLINE :: polint
|
|
||||||
subroutine polint(xa, ya, x, y, dy, ordn)
|
|
||||||
implicit none
|
|
||||||
|
|
||||||
integer, intent(in) :: ordn
|
|
||||||
real*8, dimension(ordn), intent(in) :: xa, ya
|
real*8, dimension(ordn), intent(in) :: xa, ya
|
||||||
real*8, intent(in) :: x
|
real*8, intent(in) :: x
|
||||||
real*8, intent(out) :: y, dy
|
real*8, intent(out) :: y, dy
|
||||||
|
|
||||||
integer :: i, m, ns, n_m
|
integer :: i, m, ns, n_m
|
||||||
real*8, dimension(ordn) :: c, d, ho
|
real*8, dimension(ordn) :: c, d, ho
|
||||||
real*8 :: dif, dift, hp, h, den_val
|
real*8 :: dif, dift, hp, h, den_val
|
||||||
|
|
||||||
if (ordn == 6) then
|
c = ya
|
||||||
#if POLINT6_USE_BARYCENTRIC
|
d = ya
|
||||||
call polint6_barycentric(xa, ya, x, y, dy)
|
ho = xa - x
|
||||||
#else
|
|
||||||
call polint6_neville(xa, ya, x, y, dy)
|
|
||||||
#endif
|
|
||||||
return
|
|
||||||
end if
|
|
||||||
|
|
||||||
c = ya
|
|
||||||
d = ya
|
|
||||||
ho = xa - x
|
|
||||||
|
|
||||||
ns = 1
|
ns = 1
|
||||||
dif = abs(x - xa(1))
|
dif = abs(x - xa(1))
|
||||||
@@ -1325,77 +1170,13 @@ end subroutine d2dump
|
|||||||
y = y + dy
|
y = y + dy
|
||||||
end do
|
end do
|
||||||
|
|
||||||
return
|
return
|
||||||
end subroutine polint
|
end subroutine polint
|
||||||
|
!------------------------------------------------------------------------------
|
||||||
subroutine polint0(xa, ya, y, ordn)
|
!
|
||||||
! Lagrange interpolation at x=0, O(n) direct formula
|
! interpolation in 2 dimensions, follow yx order
|
||||||
implicit none
|
!
|
||||||
integer, intent(in) :: ordn
|
!------------------------------------------------------------------------------
|
||||||
real*8, dimension(ordn), intent(in) :: xa, ya
|
|
||||||
real*8, intent(out) :: y
|
|
||||||
|
|
||||||
integer :: j, k
|
|
||||||
real*8 :: wj
|
|
||||||
|
|
||||||
y = 0.d0
|
|
||||||
do j = 1, ordn
|
|
||||||
wj = 1.d0
|
|
||||||
do k = 1, ordn
|
|
||||||
if (k .ne. j) then
|
|
||||||
wj = wj * xa(k) / (xa(k) - xa(j))
|
|
||||||
endif
|
|
||||||
enddo
|
|
||||||
y = y + wj * ya(j)
|
|
||||||
enddo
|
|
||||||
|
|
||||||
return
|
|
||||||
end subroutine polint0
|
|
||||||
!------------------------------------------------------------------------------
|
|
||||||
!
|
|
||||||
! interpolation in 2 dimensions, follow yx order
|
|
||||||
!
|
|
||||||
!------------------------------------------------------------------------------
|
|
||||||
!------------------------------------------------------------------------------
|
|
||||||
! Compute Lagrange interpolation basis weights for one target point.
|
|
||||||
!------------------------------------------------------------------------------
|
|
||||||
!DIR$ ATTRIBUTES FORCEINLINE :: polint_lagrange_weights
|
|
||||||
subroutine polint_lagrange_weights(xa, x, w, ordn)
|
|
||||||
implicit none
|
|
||||||
|
|
||||||
integer, intent(in) :: ordn
|
|
||||||
real*8, dimension(1:ordn), intent(in) :: xa
|
|
||||||
real*8, intent(in) :: x
|
|
||||||
real*8, dimension(1:ordn), intent(out) :: w
|
|
||||||
|
|
||||||
integer :: i, j
|
|
||||||
real*8 :: num, den, dx
|
|
||||||
|
|
||||||
do i = 1, ordn
|
|
||||||
num = 1.d0
|
|
||||||
den = 1.d0
|
|
||||||
do j = 1, ordn
|
|
||||||
if (j /= i) then
|
|
||||||
dx = xa(i) - xa(j)
|
|
||||||
if (dx == 0.0d0) then
|
|
||||||
write(*,*) 'failure in polint for point',x
|
|
||||||
write(*,*) 'with input points: ',xa
|
|
||||||
stop
|
|
||||||
end if
|
|
||||||
num = num * (x - xa(j))
|
|
||||||
den = den * dx
|
|
||||||
end if
|
|
||||||
end do
|
|
||||||
w(i) = num / den
|
|
||||||
end do
|
|
||||||
|
|
||||||
return
|
|
||||||
end subroutine polint_lagrange_weights
|
|
||||||
!------------------------------------------------------------------------------
|
|
||||||
!
|
|
||||||
! interpolation in 2 dimensions, follow yx order
|
|
||||||
!
|
|
||||||
!------------------------------------------------------------------------------
|
|
||||||
subroutine polin2(x1a,x2a,ya,x1,x2,y,dy,ordn)
|
subroutine polin2(x1a,x2a,ya,x1,x2,y,dy,ordn)
|
||||||
implicit none
|
implicit none
|
||||||
|
|
||||||
@@ -1443,11 +1224,11 @@ end subroutine d2dump
|
|||||||
real*8, intent(in) :: x1,x2,x3
|
real*8, intent(in) :: x1,x2,x3
|
||||||
real*8, intent(out) :: y,dy
|
real*8, intent(out) :: y,dy
|
||||||
|
|
||||||
#ifdef POLINT_LEGACY_ORDER
|
#ifdef POLINT_LEGACY_ORDER
|
||||||
integer :: i,j,m,n
|
integer :: i,j,m,n
|
||||||
real*8, dimension(ordn,ordn) :: yatmp
|
real*8, dimension(ordn,ordn) :: yatmp
|
||||||
real*8, dimension(ordn) :: ymtmp
|
real*8, dimension(ordn) :: ymtmp
|
||||||
real*8, dimension(ordn) :: yntmp
|
real*8, dimension(ordn) :: yntmp
|
||||||
real*8, dimension(ordn) :: yqtmp
|
real*8, dimension(ordn) :: yqtmp
|
||||||
|
|
||||||
m=size(x1a)
|
m=size(x1a)
|
||||||
@@ -1457,36 +1238,29 @@ end subroutine d2dump
|
|||||||
yqtmp=ya(i,j,:)
|
yqtmp=ya(i,j,:)
|
||||||
call polint(x3a,yqtmp,x3,yatmp(i,j),dy,ordn)
|
call polint(x3a,yqtmp,x3,yatmp(i,j),dy,ordn)
|
||||||
end do
|
end do
|
||||||
yntmp=yatmp(i,:)
|
yntmp=yatmp(i,:)
|
||||||
call polint(x2a,yntmp,x2,ymtmp(i),dy,ordn)
|
call polint(x2a,yntmp,x2,ymtmp(i),dy,ordn)
|
||||||
end do
|
end do
|
||||||
call polint(x1a,ymtmp,x1,y,dy,ordn)
|
call polint(x1a,ymtmp,x1,y,dy,ordn)
|
||||||
#else
|
#else
|
||||||
integer :: i, j, k
|
integer :: j, k
|
||||||
real*8, dimension(ordn) :: w1, w2
|
real*8, dimension(ordn,ordn) :: yatmp
|
||||||
real*8, dimension(ordn) :: ymtmp
|
real*8, dimension(ordn) :: ymtmp
|
||||||
real*8 :: yx_sum, x_sum
|
real*8 :: dy_temp
|
||||||
|
|
||||||
call polint_lagrange_weights(x1a, x1, w1, ordn)
|
do k=1,ordn
|
||||||
call polint_lagrange_weights(x2a, x2, w2, ordn)
|
do j=1,ordn
|
||||||
|
call polint(x1a, ya(:,j,k), x1, yatmp(j,k), dy_temp, ordn)
|
||||||
do k = 1, ordn
|
end do
|
||||||
yx_sum = 0.d0
|
end do
|
||||||
do j = 1, ordn
|
do k=1,ordn
|
||||||
x_sum = 0.d0
|
call polint(x2a, yatmp(:,k), x2, ymtmp(k), dy_temp, ordn)
|
||||||
do i = 1, ordn
|
end do
|
||||||
x_sum = x_sum + w1(i) * ya(i,j,k)
|
call polint(x3a, ymtmp, x3, y, dy, ordn)
|
||||||
end do
|
#endif
|
||||||
yx_sum = yx_sum + w2(j) * x_sum
|
|
||||||
end do
|
return
|
||||||
ymtmp(k) = yx_sum
|
end subroutine polin3
|
||||||
end do
|
|
||||||
|
|
||||||
call polint(x3a, ymtmp, x3, y, dy, ordn)
|
|
||||||
#endif
|
|
||||||
|
|
||||||
return
|
|
||||||
end subroutine polin3
|
|
||||||
!--------------------------------------------------------------------------------------
|
!--------------------------------------------------------------------------------------
|
||||||
! calculate L2norm
|
! calculate L2norm
|
||||||
subroutine l2normhelper(ex, X, Y, Z,xmin,ymin,zmin,xmax,ymax,zmax,&
|
subroutine l2normhelper(ex, X, Y, Z,xmin,ymin,zmin,xmax,ymax,zmax,&
|
||||||
@@ -1829,14 +1603,11 @@ deallocate(f_flat)
|
|||||||
! ^
|
! ^
|
||||||
! f=3/8*f_1 + 3/4*f_2 - 1/8*f_3
|
! f=3/8*f_1 + 3/4*f_2 - 1/8*f_3
|
||||||
|
|
||||||
real*8,parameter::C1=3.d0/8.d0,C2=3.d0/4.d0,C3=-1.d0/8.d0
|
real*8,parameter::C1=3.d0/8.d0,C2=3.d0/4.d0,C3=-1.d0/8.d0
|
||||||
integer :: i,j,k
|
|
||||||
|
fout = C1*f1+C2*f2+C3*f3
|
||||||
do concurrent (k=1:ext(3), j=1:ext(2), i=1:ext(1))
|
|
||||||
fout(i,j,k) = C1*f1(i,j,k)+C2*f2(i,j,k)+C3*f3(i,j,k)
|
return
|
||||||
end do
|
|
||||||
|
|
||||||
return
|
|
||||||
|
|
||||||
end subroutine average2
|
end subroutine average2
|
||||||
!-----------------------------------------------------------------------------
|
!-----------------------------------------------------------------------------
|
||||||
|
|||||||
@@ -1,107 +0,0 @@
|
|||||||
#include "interp_lb_profile.h"
|
|
||||||
#include <cstdio>
|
|
||||||
#include <cstring>
|
|
||||||
#include <algorithm>
|
|
||||||
|
|
||||||
namespace InterpLBProfile {
|
|
||||||
|
|
||||||
bool write_profile(const char *filepath, int nprocs,
|
|
||||||
const double *rank_times,
|
|
||||||
const int *heavy_ranks, int num_heavy,
|
|
||||||
double threshold_ratio)
|
|
||||||
{
|
|
||||||
FILE *fp = fopen(filepath, "wb");
|
|
||||||
if (!fp) return false;
|
|
||||||
|
|
||||||
ProfileHeader hdr;
|
|
||||||
hdr.magic = MAGIC;
|
|
||||||
hdr.version = VERSION;
|
|
||||||
hdr.nprocs = nprocs;
|
|
||||||
hdr.num_heavy = num_heavy;
|
|
||||||
hdr.threshold_ratio = threshold_ratio;
|
|
||||||
|
|
||||||
fwrite(&hdr, sizeof(hdr), 1, fp);
|
|
||||||
fwrite(rank_times, sizeof(double), nprocs, fp);
|
|
||||||
fwrite(heavy_ranks, sizeof(int), num_heavy, fp);
|
|
||||||
fclose(fp);
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool read_profile(const char *filepath, int current_nprocs,
|
|
||||||
int *heavy_ranks, int &num_heavy,
|
|
||||||
double *rank_times, MPI_Comm comm)
|
|
||||||
{
|
|
||||||
int myrank;
|
|
||||||
MPI_Comm_rank(comm, &myrank);
|
|
||||||
|
|
||||||
int valid = 0;
|
|
||||||
ProfileHeader hdr;
|
|
||||||
memset(&hdr, 0, sizeof(hdr));
|
|
||||||
|
|
||||||
if (myrank == 0) {
|
|
||||||
FILE *fp = fopen(filepath, "rb");
|
|
||||||
if (fp) {
|
|
||||||
if (fread(&hdr, sizeof(hdr), 1, fp) == 1 &&
|
|
||||||
hdr.magic == MAGIC && hdr.version == VERSION &&
|
|
||||||
hdr.nprocs == current_nprocs)
|
|
||||||
{
|
|
||||||
if (fread(rank_times, sizeof(double), current_nprocs, fp)
|
|
||||||
== (size_t)current_nprocs &&
|
|
||||||
fread(heavy_ranks, sizeof(int), hdr.num_heavy, fp)
|
|
||||||
== (size_t)hdr.num_heavy)
|
|
||||||
{
|
|
||||||
num_heavy = hdr.num_heavy;
|
|
||||||
valid = 1;
|
|
||||||
}
|
|
||||||
} else if (fp) {
|
|
||||||
printf("[InterpLB] Profile rejected: magic=0x%X version=%u "
|
|
||||||
"nprocs=%d (current=%d)\n",
|
|
||||||
hdr.magic, hdr.version, hdr.nprocs, current_nprocs);
|
|
||||||
}
|
|
||||||
fclose(fp);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
MPI_Bcast(&valid, 1, MPI_INT, 0, comm);
|
|
||||||
if (!valid) return false;
|
|
||||||
|
|
||||||
MPI_Bcast(&num_heavy, 1, MPI_INT, 0, comm);
|
|
||||||
MPI_Bcast(heavy_ranks, num_heavy, MPI_INT, 0, comm);
|
|
||||||
MPI_Bcast(rank_times, current_nprocs, MPI_DOUBLE, 0, comm);
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
int identify_heavy_ranks(const double *rank_times, int nprocs,
|
|
||||||
double threshold_ratio,
|
|
||||||
int *heavy_ranks, int max_heavy)
|
|
||||||
{
|
|
||||||
double sum = 0;
|
|
||||||
for (int i = 0; i < nprocs; i++) sum += rank_times[i];
|
|
||||||
double mean = sum / nprocs;
|
|
||||||
double threshold = threshold_ratio * mean;
|
|
||||||
|
|
||||||
// Collect candidates
|
|
||||||
struct RankTime { int rank; double time; };
|
|
||||||
RankTime *candidates = new RankTime[nprocs];
|
|
||||||
int ncand = 0;
|
|
||||||
|
|
||||||
for (int i = 0; i < nprocs; i++) {
|
|
||||||
if (rank_times[i] > threshold)
|
|
||||||
candidates[ncand++] = {i, rank_times[i]};
|
|
||||||
}
|
|
||||||
|
|
||||||
// Sort descending by time
|
|
||||||
std::sort(candidates, candidates + ncand,
|
|
||||||
[](const RankTime &a, const RankTime &b) {
|
|
||||||
return a.time > b.time;
|
|
||||||
});
|
|
||||||
|
|
||||||
int count = (ncand < max_heavy) ? ncand : max_heavy;
|
|
||||||
for (int i = 0; i < count; i++)
|
|
||||||
heavy_ranks[i] = candidates[i].rank;
|
|
||||||
|
|
||||||
delete[] candidates;
|
|
||||||
return count;
|
|
||||||
}
|
|
||||||
|
|
||||||
} // namespace InterpLBProfile
|
|
||||||
Binary file not shown.
@@ -1,38 +0,0 @@
|
|||||||
#ifndef INTERP_LB_PROFILE_H
|
|
||||||
#define INTERP_LB_PROFILE_H
|
|
||||||
|
|
||||||
#include <mpi.h>
|
|
||||||
|
|
||||||
namespace InterpLBProfile {
|
|
||||||
|
|
||||||
static const unsigned int MAGIC = 0x494C4250; // "ILBP"
|
|
||||||
static const unsigned int VERSION = 1;
|
|
||||||
|
|
||||||
struct ProfileHeader {
|
|
||||||
unsigned int magic;
|
|
||||||
unsigned int version;
|
|
||||||
int nprocs;
|
|
||||||
int num_heavy;
|
|
||||||
double threshold_ratio;
|
|
||||||
};
|
|
||||||
|
|
||||||
// Write profile file (rank 0 only)
|
|
||||||
bool write_profile(const char *filepath, int nprocs,
|
|
||||||
const double *rank_times,
|
|
||||||
const int *heavy_ranks, int num_heavy,
|
|
||||||
double threshold_ratio);
|
|
||||||
|
|
||||||
// Read profile file (rank 0 reads, then broadcasts to all)
|
|
||||||
// Returns true if file found and valid for current nprocs
|
|
||||||
bool read_profile(const char *filepath, int current_nprocs,
|
|
||||||
int *heavy_ranks, int &num_heavy,
|
|
||||||
double *rank_times, MPI_Comm comm);
|
|
||||||
|
|
||||||
// Identify heavy ranks: those with time > threshold_ratio * mean
|
|
||||||
int identify_heavy_ranks(const double *rank_times, int nprocs,
|
|
||||||
double threshold_ratio,
|
|
||||||
int *heavy_ranks, int max_heavy);
|
|
||||||
|
|
||||||
} // namespace InterpLBProfile
|
|
||||||
|
|
||||||
#endif /* INTERP_LB_PROFILE_H */
|
|
||||||
@@ -1,27 +0,0 @@
|
|||||||
/* Auto-generated from interp_lb_profile.bin — do not edit */
|
|
||||||
#ifndef INTERP_LB_PROFILE_DATA_H
|
|
||||||
#define INTERP_LB_PROFILE_DATA_H
|
|
||||||
|
|
||||||
#define INTERP_LB_NPROCS 64
|
|
||||||
#define INTERP_LB_NUM_HEAVY 4
|
|
||||||
|
|
||||||
static const int interp_lb_heavy_blocks[4] = {27, 35, 28, 36};
|
|
||||||
|
|
||||||
/* Split table: {block_id, r_left, r_right} */
|
|
||||||
static const int interp_lb_splits[4][3] = {
|
|
||||||
{27, 26, 27},
|
|
||||||
{35, 34, 35},
|
|
||||||
{28, 28, 29},
|
|
||||||
{36, 36, 37},
|
|
||||||
};
|
|
||||||
|
|
||||||
/* Rank remap for displaced neighbor blocks */
|
|
||||||
static const int interp_lb_num_remaps = 4;
|
|
||||||
static const int interp_lb_remaps[][2] = {
|
|
||||||
{26, 25},
|
|
||||||
{29, 30},
|
|
||||||
{34, 33},
|
|
||||||
{37, 38},
|
|
||||||
};
|
|
||||||
|
|
||||||
#endif /* INTERP_LB_PROFILE_DATA_H */
|
|
||||||
@@ -65,8 +65,6 @@ real*8,intent(in) :: eps
|
|||||||
! dx^4
|
! dx^4
|
||||||
|
|
||||||
! note the sign (-1)^r-1, now r=2
|
! note the sign (-1)^r-1, now r=2
|
||||||
!DIR$ SIMD VECTORLENGTHFOR(KNOWN_INTEGER=8)
|
|
||||||
!DIR$ UNROLL PARTIAL(4)
|
|
||||||
do k=1,ex(3)
|
do k=1,ex(3)
|
||||||
do j=1,ex(2)
|
do j=1,ex(2)
|
||||||
do i=1,ex(1)
|
do i=1,ex(1)
|
||||||
|
|||||||
@@ -1,109 +0,0 @@
|
|||||||
#include "tool.h"
|
|
||||||
|
|
||||||
/*
|
|
||||||
* C 版 kodis
|
|
||||||
*
|
|
||||||
* Fortran signature:
|
|
||||||
* subroutine kodis(ex,X,Y,Z,f,f_rhs,SoA,Symmetry,eps)
|
|
||||||
*
|
|
||||||
* 约定:
|
|
||||||
* X: ex1, Y: ex2, Z: ex3
|
|
||||||
* f, f_rhs: ex1*ex2*ex3 按 idx_ex 布局
|
|
||||||
* SoA[3]
|
|
||||||
* eps: double
|
|
||||||
*/
|
|
||||||
void kodis(const int ex[3],
|
|
||||||
const double *X, const double *Y, const double *Z,
|
|
||||||
const double *f, double *f_rhs,
|
|
||||||
const double SoA[3],
|
|
||||||
int Symmetry, double eps)
|
|
||||||
{
|
|
||||||
const double ONE = 1.0, SIX = 6.0, FIT = 15.0, TWT = 20.0;
|
|
||||||
const double cof = 64.0; // 2^6
|
|
||||||
const int NO_SYMM = 0, OCTANT = 2;
|
|
||||||
|
|
||||||
const int ex1 = ex[0], ex2 = ex[1], ex3 = ex[2];
|
|
||||||
|
|
||||||
// Fortran: dX = X(2)-X(1) -> C: X[1]-X[0]
|
|
||||||
const double dX = X[1] - X[0];
|
|
||||||
const double dY = Y[1] - Y[0];
|
|
||||||
const double dZ = Z[1] - Z[0];
|
|
||||||
(void)ONE; // ONE 在原 Fortran 里只是参数,这里不一定用得上
|
|
||||||
|
|
||||||
// Fortran: imax=ex(1) 等是 1-based 上界
|
|
||||||
const int imaxF = ex1;
|
|
||||||
const int jmaxF = ex2;
|
|
||||||
const int kmaxF = ex3;
|
|
||||||
|
|
||||||
// Fortran: imin=jmin=kmin=1,某些对称情况变 -2
|
|
||||||
int iminF = 1, jminF = 1, kminF = 1;
|
|
||||||
|
|
||||||
if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -2;
|
|
||||||
if (Symmetry == OCTANT && fabs(X[0]) < dX) iminF = -2;
|
|
||||||
if (Symmetry == OCTANT && fabs(Y[0]) < dY) jminF = -2;
|
|
||||||
|
|
||||||
// 分配 fh:大小 (ex1+3)*(ex2+3)*(ex3+3),对应 ord=3
|
|
||||||
const size_t nx = (size_t)ex1 + 3;
|
|
||||||
const size_t ny = (size_t)ex2 + 3;
|
|
||||||
const size_t nz = (size_t)ex3 + 3;
|
|
||||||
const size_t fh_size = nx * ny * nz;
|
|
||||||
|
|
||||||
double *fh = (double*)malloc(fh_size * sizeof(double));
|
|
||||||
if (!fh) return;
|
|
||||||
|
|
||||||
// Fortran: call symmetry_bd(3,ex,f,fh,SoA)
|
|
||||||
symmetry_bd(3, ex, f, fh, SoA);
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Fortran loops:
|
|
||||||
* do k=1,ex3
|
|
||||||
* do j=1,ex2
|
|
||||||
* do i=1,ex1
|
|
||||||
*
|
|
||||||
* C: k0=0..ex3-1, j0=0..ex2-1, i0=0..ex1-1
|
|
||||||
* 并定义 Fortran index: iF=i0+1, ...
|
|
||||||
*/
|
|
||||||
for (int k0 = 0; k0 < ex3; ++k0) {
|
|
||||||
const int kF = k0 + 1;
|
|
||||||
for (int j0 = 0; j0 < ex2; ++j0) {
|
|
||||||
const int jF = j0 + 1;
|
|
||||||
for (int i0 = 0; i0 < ex1; ++i0) {
|
|
||||||
const int iF = i0 + 1;
|
|
||||||
|
|
||||||
// Fortran if 条件:
|
|
||||||
// i-3 >= imin .and. i+3 <= imax 等(都是 Fortran 索引)
|
|
||||||
if ((iF - 3) >= iminF && (iF + 3) <= imaxF &&
|
|
||||||
(jF - 3) >= jminF && (jF + 3) <= jmaxF &&
|
|
||||||
(kF - 3) >= kminF && (kF + 3) <= kmaxF)
|
|
||||||
{
|
|
||||||
const size_t p = idx_ex(i0, j0, k0, ex);
|
|
||||||
|
|
||||||
// 三个方向各一份同型的 7 点组合(实际上是对称的 6th-order dissipation/filter 核)
|
|
||||||
const double Dx_term =
|
|
||||||
( (fh[idx_fh_F(iF - 3, jF, kF, ex)] + fh[idx_fh_F(iF + 3, jF, kF, ex)]) -
|
|
||||||
SIX * (fh[idx_fh_F(iF - 2, jF, kF, ex)] + fh[idx_fh_F(iF + 2, jF, kF, ex)]) +
|
|
||||||
FIT * (fh[idx_fh_F(iF - 1, jF, kF, ex)] + fh[idx_fh_F(iF + 1, jF, kF, ex)]) -
|
|
||||||
TWT * fh[idx_fh_F(iF , jF, kF, ex)] ) / dX;
|
|
||||||
|
|
||||||
const double Dy_term =
|
|
||||||
( (fh[idx_fh_F(iF, jF - 3, kF, ex)] + fh[idx_fh_F(iF, jF + 3, kF, ex)]) -
|
|
||||||
SIX * (fh[idx_fh_F(iF, jF - 2, kF, ex)] + fh[idx_fh_F(iF, jF + 2, kF, ex)]) +
|
|
||||||
FIT * (fh[idx_fh_F(iF, jF - 1, kF, ex)] + fh[idx_fh_F(iF, jF + 1, kF, ex)]) -
|
|
||||||
TWT * fh[idx_fh_F(iF, jF , kF, ex)] ) / dY;
|
|
||||||
|
|
||||||
const double Dz_term =
|
|
||||||
( (fh[idx_fh_F(iF, jF, kF - 3, ex)] + fh[idx_fh_F(iF, jF, kF + 3, ex)]) -
|
|
||||||
SIX * (fh[idx_fh_F(iF, jF, kF - 2, ex)] + fh[idx_fh_F(iF, jF, kF + 2, ex)]) +
|
|
||||||
FIT * (fh[idx_fh_F(iF, jF, kF - 1, ex)] + fh[idx_fh_F(iF, jF, kF + 1, ex)]) -
|
|
||||||
TWT * fh[idx_fh_F(iF, jF, kF , ex)] ) / dZ;
|
|
||||||
|
|
||||||
// Fortran:
|
|
||||||
// f_rhs(i,j,k) = f_rhs(i,j,k) + eps/cof*(Dx_term + Dy_term + Dz_term)
|
|
||||||
f_rhs[p] += (eps / cof) * (Dx_term + Dy_term + Dz_term);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
free(fh);
|
|
||||||
}
|
|
||||||
@@ -1,255 +0,0 @@
|
|||||||
#include "tool.h"
|
|
||||||
/*
|
|
||||||
* 你需要提供 symmetry_bd 的 C 版本(或 Fortran 绑到 C 的接口)。
|
|
||||||
* Fortran: call symmetry_bd(3,ex,f,fh,SoA)
|
|
||||||
*
|
|
||||||
* 约定:
|
|
||||||
* nghost = 3
|
|
||||||
* ex[3] = {ex1,ex2,ex3}
|
|
||||||
* f = 原始网格 (ex1*ex2*ex3)
|
|
||||||
* fh = 扩展网格 ((ex1+3)*(ex2+3)*(ex3+3)),对应 Fortran 的 (-2:ex1, ...)
|
|
||||||
* SoA[3] = 输入参数
|
|
||||||
*/
|
|
||||||
void lopsided(const int ex[3],
|
|
||||||
const double *X, const double *Y, const double *Z,
|
|
||||||
const double *f, double *f_rhs,
|
|
||||||
const double *Sfx, const double *Sfy, const double *Sfz,
|
|
||||||
int Symmetry, const double SoA[3])
|
|
||||||
{
|
|
||||||
const double ZEO = 0.0, ONE = 1.0, F3 = 3.0;
|
|
||||||
const double TWO = 2.0, F6 = 6.0, F18 = 18.0;
|
|
||||||
const double F12 = 12.0, F10 = 10.0, EIT = 8.0;
|
|
||||||
|
|
||||||
const int NO_SYMM = 0, EQ_SYMM = 1, OCTANT = 2;
|
|
||||||
(void)OCTANT; // 这里和 Fortran 一样只是定义了不用也没关系
|
|
||||||
|
|
||||||
const int ex1 = ex[0], ex2 = ex[1], ex3 = ex[2];
|
|
||||||
|
|
||||||
// 对应 Fortran: dX = X(2)-X(1) (Fortran 1-based)
|
|
||||||
// C: X[1]-X[0]
|
|
||||||
const double dX = X[1] - X[0];
|
|
||||||
const double dY = Y[1] - Y[0];
|
|
||||||
const double dZ = Z[1] - Z[0];
|
|
||||||
|
|
||||||
const double d12dx = ONE / F12 / dX;
|
|
||||||
const double d12dy = ONE / F12 / dY;
|
|
||||||
const double d12dz = ONE / F12 / dZ;
|
|
||||||
|
|
||||||
// Fortran 里算了 d2dx/d2dy/d2dz 但本 subroutine 里没用到(保持一致也算出来)
|
|
||||||
const double d2dx = ONE / TWO / dX;
|
|
||||||
const double d2dy = ONE / TWO / dY;
|
|
||||||
const double d2dz = ONE / TWO / dZ;
|
|
||||||
(void)d2dx; (void)d2dy; (void)d2dz;
|
|
||||||
|
|
||||||
// Fortran:
|
|
||||||
// imax = ex(1); jmax = ex(2); kmax = ex(3)
|
|
||||||
const int imaxF = ex1;
|
|
||||||
const int jmaxF = ex2;
|
|
||||||
const int kmaxF = ex3;
|
|
||||||
|
|
||||||
// Fortran:
|
|
||||||
// imin=jmin=kmin=1; 若满足对称条件则设为 -2
|
|
||||||
int iminF = 1, jminF = 1, kminF = 1;
|
|
||||||
if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -2;
|
|
||||||
if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -2;
|
|
||||||
if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -2;
|
|
||||||
|
|
||||||
// 分配 fh:大小 (ex1+3)*(ex2+3)*(ex3+3)
|
|
||||||
const size_t nx = (size_t)ex1 + 3;
|
|
||||||
const size_t ny = (size_t)ex2 + 3;
|
|
||||||
const size_t nz = (size_t)ex3 + 3;
|
|
||||||
const size_t fh_size = nx * ny * nz;
|
|
||||||
|
|
||||||
double *fh = (double*)malloc(fh_size * sizeof(double));
|
|
||||||
if (!fh) return; // 内存不足:直接返回(你也可以改成 abort/报错)
|
|
||||||
|
|
||||||
// Fortran: call symmetry_bd(3,ex,f,fh,SoA)
|
|
||||||
symmetry_bd(3, ex, f, fh, SoA);
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Fortran 主循环:
|
|
||||||
* do k=1,ex(3)-1
|
|
||||||
* do j=1,ex(2)-1
|
|
||||||
* do i=1,ex(1)-1
|
|
||||||
*
|
|
||||||
* 转成 C 0-based:
|
|
||||||
* k0 = 0..ex3-2, j0 = 0..ex2-2, i0 = 0..ex1-2
|
|
||||||
*
|
|
||||||
* 并且 Fortran 里的 i/j/k 在 fh 访问时,仍然是 Fortran 索引值:
|
|
||||||
* iF=i0+1, jF=j0+1, kF=k0+1
|
|
||||||
*/
|
|
||||||
for (int k0 = 0; k0 <= ex3 - 2; ++k0) {
|
|
||||||
const int kF = k0 + 1;
|
|
||||||
for (int j0 = 0; j0 <= ex2 - 2; ++j0) {
|
|
||||||
const int jF = j0 + 1;
|
|
||||||
for (int i0 = 0; i0 <= ex1 - 2; ++i0) {
|
|
||||||
const int iF = i0 + 1;
|
|
||||||
|
|
||||||
const size_t p = idx_ex(i0, j0, k0, ex);
|
|
||||||
|
|
||||||
// ---------------- x direction ----------------
|
|
||||||
const double sfx = Sfx[p];
|
|
||||||
if (sfx > ZEO) {
|
|
||||||
// Fortran: if(i+3 <= imax)
|
|
||||||
// iF+3 <= ex1 <=> i0+4 <= ex1 <=> i0 <= ex1-4
|
|
||||||
if (i0 <= ex1 - 4) {
|
|
||||||
f_rhs[p] += sfx * d12dx *
|
|
||||||
(-F3 * fh[idx_fh_F(iF - 1, jF, kF, ex)]
|
|
||||||
-F10 * fh[idx_fh_F(iF , jF, kF, ex)]
|
|
||||||
+F18 * fh[idx_fh_F(iF + 1, jF, kF, ex)]
|
|
||||||
-F6 * fh[idx_fh_F(iF + 2, jF, kF, ex)]
|
|
||||||
+ fh[idx_fh_F(iF + 3, jF, kF, ex)]);
|
|
||||||
}
|
|
||||||
// elseif(i+2 <= imax) <=> i0 <= ex1-3
|
|
||||||
else if (i0 <= ex1 - 3) {
|
|
||||||
f_rhs[p] += sfx * d12dx *
|
|
||||||
( fh[idx_fh_F(iF - 2, jF, kF, ex)]
|
|
||||||
-EIT * fh[idx_fh_F(iF - 1, jF, kF, ex)]
|
|
||||||
+EIT * fh[idx_fh_F(iF + 1, jF, kF, ex)]
|
|
||||||
- fh[idx_fh_F(iF + 2, jF, kF, ex)]);
|
|
||||||
}
|
|
||||||
// elseif(i+1 <= imax) <=> i0 <= ex1-2(循环里总成立)
|
|
||||||
else if (i0 <= ex1 - 2) {
|
|
||||||
f_rhs[p] -= sfx * d12dx *
|
|
||||||
(-F3 * fh[idx_fh_F(iF + 1, jF, kF, ex)]
|
|
||||||
-F10 * fh[idx_fh_F(iF , jF, kF, ex)]
|
|
||||||
+F18 * fh[idx_fh_F(iF - 1, jF, kF, ex)]
|
|
||||||
-F6 * fh[idx_fh_F(iF - 2, jF, kF, ex)]
|
|
||||||
+ fh[idx_fh_F(iF - 3, jF, kF, ex)]);
|
|
||||||
}
|
|
||||||
} else if (sfx < ZEO) {
|
|
||||||
// Fortran: if(i-3 >= imin)
|
|
||||||
// (iF-3) >= iminF <=> (i0-2) >= iminF
|
|
||||||
if ((i0 - 2) >= iminF) {
|
|
||||||
f_rhs[p] -= sfx * d12dx *
|
|
||||||
(-F3 * fh[idx_fh_F(iF + 1, jF, kF, ex)]
|
|
||||||
-F10 * fh[idx_fh_F(iF , jF, kF, ex)]
|
|
||||||
+F18 * fh[idx_fh_F(iF - 1, jF, kF, ex)]
|
|
||||||
-F6 * fh[idx_fh_F(iF - 2, jF, kF, ex)]
|
|
||||||
+ fh[idx_fh_F(iF - 3, jF, kF, ex)]);
|
|
||||||
}
|
|
||||||
// elseif(i-2 >= imin) <=> (i0-1) >= iminF
|
|
||||||
else if ((i0 - 1) >= iminF) {
|
|
||||||
f_rhs[p] += sfx * d12dx *
|
|
||||||
( fh[idx_fh_F(iF - 2, jF, kF, ex)]
|
|
||||||
-EIT * fh[idx_fh_F(iF - 1, jF, kF, ex)]
|
|
||||||
+EIT * fh[idx_fh_F(iF + 1, jF, kF, ex)]
|
|
||||||
- fh[idx_fh_F(iF + 2, jF, kF, ex)]);
|
|
||||||
}
|
|
||||||
// elseif(i-1 >= imin) <=> i0 >= iminF
|
|
||||||
else if (i0 >= iminF) {
|
|
||||||
f_rhs[p] += sfx * d12dx *
|
|
||||||
(-F3 * fh[idx_fh_F(iF - 1, jF, kF, ex)]
|
|
||||||
-F10 * fh[idx_fh_F(iF , jF, kF, ex)]
|
|
||||||
+F18 * fh[idx_fh_F(iF + 1, jF, kF, ex)]
|
|
||||||
-F6 * fh[idx_fh_F(iF + 2, jF, kF, ex)]
|
|
||||||
+ fh[idx_fh_F(iF + 3, jF, kF, ex)]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// ---------------- y direction ----------------
|
|
||||||
const double sfy = Sfy[p];
|
|
||||||
if (sfy > ZEO) {
|
|
||||||
// jF+3 <= ex2 <=> j0+4 <= ex2 <=> j0 <= ex2-4
|
|
||||||
if (j0 <= ex2 - 4) {
|
|
||||||
f_rhs[p] += sfy * d12dy *
|
|
||||||
(-F3 * fh[idx_fh_F(iF, jF - 1, kF, ex)]
|
|
||||||
-F10 * fh[idx_fh_F(iF, jF , kF, ex)]
|
|
||||||
+F18 * fh[idx_fh_F(iF, jF + 1, kF, ex)]
|
|
||||||
-F6 * fh[idx_fh_F(iF, jF + 2, kF, ex)]
|
|
||||||
+ fh[idx_fh_F(iF, jF + 3, kF, ex)]);
|
|
||||||
} else if (j0 <= ex2 - 3) {
|
|
||||||
f_rhs[p] += sfy * d12dy *
|
|
||||||
( fh[idx_fh_F(iF, jF - 2, kF, ex)]
|
|
||||||
-EIT * fh[idx_fh_F(iF, jF - 1, kF, ex)]
|
|
||||||
+EIT * fh[idx_fh_F(iF, jF + 1, kF, ex)]
|
|
||||||
- fh[idx_fh_F(iF, jF + 2, kF, ex)]);
|
|
||||||
} else if (j0 <= ex2 - 2) {
|
|
||||||
f_rhs[p] -= sfy * d12dy *
|
|
||||||
(-F3 * fh[idx_fh_F(iF, jF + 1, kF, ex)]
|
|
||||||
-F10 * fh[idx_fh_F(iF, jF , kF, ex)]
|
|
||||||
+F18 * fh[idx_fh_F(iF, jF - 1, kF, ex)]
|
|
||||||
-F6 * fh[idx_fh_F(iF, jF - 2, kF, ex)]
|
|
||||||
+ fh[idx_fh_F(iF, jF - 3, kF, ex)]);
|
|
||||||
}
|
|
||||||
} else if (sfy < ZEO) {
|
|
||||||
if ((j0 - 2) >= jminF) {
|
|
||||||
f_rhs[p] -= sfy * d12dy *
|
|
||||||
(-F3 * fh[idx_fh_F(iF, jF + 1, kF, ex)]
|
|
||||||
-F10 * fh[idx_fh_F(iF, jF , kF, ex)]
|
|
||||||
+F18 * fh[idx_fh_F(iF, jF - 1, kF, ex)]
|
|
||||||
-F6 * fh[idx_fh_F(iF, jF - 2, kF, ex)]
|
|
||||||
+ fh[idx_fh_F(iF, jF - 3, kF, ex)]);
|
|
||||||
} else if ((j0 - 1) >= jminF) {
|
|
||||||
f_rhs[p] += sfy * d12dy *
|
|
||||||
( fh[idx_fh_F(iF, jF - 2, kF, ex)]
|
|
||||||
-EIT * fh[idx_fh_F(iF, jF - 1, kF, ex)]
|
|
||||||
+EIT * fh[idx_fh_F(iF, jF + 1, kF, ex)]
|
|
||||||
- fh[idx_fh_F(iF, jF + 2, kF, ex)]);
|
|
||||||
} else if (j0 >= jminF) {
|
|
||||||
f_rhs[p] += sfy * d12dy *
|
|
||||||
(-F3 * fh[idx_fh_F(iF, jF - 1, kF, ex)]
|
|
||||||
-F10 * fh[idx_fh_F(iF, jF , kF, ex)]
|
|
||||||
+F18 * fh[idx_fh_F(iF, jF + 1, kF, ex)]
|
|
||||||
-F6 * fh[idx_fh_F(iF, jF + 2, kF, ex)]
|
|
||||||
+ fh[idx_fh_F(iF, jF + 3, kF, ex)]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// ---------------- z direction ----------------
|
|
||||||
const double sfz = Sfz[p];
|
|
||||||
if (sfz > ZEO) {
|
|
||||||
if (k0 <= ex3 - 4) {
|
|
||||||
f_rhs[p] += sfz * d12dz *
|
|
||||||
(-F3 * fh[idx_fh_F(iF, jF, kF - 1, ex)]
|
|
||||||
-F10 * fh[idx_fh_F(iF, jF, kF , ex)]
|
|
||||||
+F18 * fh[idx_fh_F(iF, jF, kF + 1, ex)]
|
|
||||||
-F6 * fh[idx_fh_F(iF, jF, kF + 2, ex)]
|
|
||||||
+ fh[idx_fh_F(iF, jF, kF + 3, ex)]);
|
|
||||||
} else if (k0 <= ex3 - 3) {
|
|
||||||
f_rhs[p] += sfz * d12dz *
|
|
||||||
( fh[idx_fh_F(iF, jF, kF - 2, ex)]
|
|
||||||
-EIT * fh[idx_fh_F(iF, jF, kF - 1, ex)]
|
|
||||||
+EIT * fh[idx_fh_F(iF, jF, kF + 1, ex)]
|
|
||||||
- fh[idx_fh_F(iF, jF, kF + 2, ex)]);
|
|
||||||
} else if (k0 <= ex3 - 2) {
|
|
||||||
f_rhs[p] -= sfz * d12dz *
|
|
||||||
(-F3 * fh[idx_fh_F(iF, jF, kF + 1, ex)]
|
|
||||||
-F10 * fh[idx_fh_F(iF, jF, kF , ex)]
|
|
||||||
+F18 * fh[idx_fh_F(iF, jF, kF - 1, ex)]
|
|
||||||
-F6 * fh[idx_fh_F(iF, jF, kF - 2, ex)]
|
|
||||||
+ fh[idx_fh_F(iF, jF, kF - 3, ex)]);
|
|
||||||
}
|
|
||||||
} else if (sfz < ZEO) {
|
|
||||||
if ((k0 - 2) >= kminF) {
|
|
||||||
f_rhs[p] -= sfz * d12dz *
|
|
||||||
(-F3 * fh[idx_fh_F(iF, jF, kF + 1, ex)]
|
|
||||||
-F10 * fh[idx_fh_F(iF, jF, kF , ex)]
|
|
||||||
+F18 * fh[idx_fh_F(iF, jF, kF - 1, ex)]
|
|
||||||
-F6 * fh[idx_fh_F(iF, jF, kF - 2, ex)]
|
|
||||||
+ fh[idx_fh_F(iF, jF, kF - 3, ex)]);
|
|
||||||
} else if ((k0 - 1) >= kminF) {
|
|
||||||
f_rhs[p] += sfz * d12dz *
|
|
||||||
( fh[idx_fh_F(iF, jF, kF - 2, ex)]
|
|
||||||
-EIT * fh[idx_fh_F(iF, jF, kF - 1, ex)]
|
|
||||||
+EIT * fh[idx_fh_F(iF, jF, kF + 1, ex)]
|
|
||||||
- fh[idx_fh_F(iF, jF, kF + 2, ex)]);
|
|
||||||
} else if (k0 >= kminF) {
|
|
||||||
f_rhs[p] += sfz * d12dz *
|
|
||||||
(-F3 * fh[idx_fh_F(iF, jF, kF - 1, ex)]
|
|
||||||
-F10 * fh[idx_fh_F(iF, jF, kF , ex)]
|
|
||||||
+F18 * fh[idx_fh_F(iF, jF, kF + 1, ex)]
|
|
||||||
-F6 * fh[idx_fh_F(iF, jF, kF + 2, ex)]
|
|
||||||
+ fh[idx_fh_F(iF, jF, kF + 3, ex)]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
free(fh);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@@ -1,77 +1,83 @@
|
|||||||
|
|
||||||
#define tetradtype 2
|
|
||||||
|
#if 0
|
||||||
#define Cell
|
note here
|
||||||
|
v:r; u: phi; w: theta
|
||||||
#define ghost_width 3
|
tetradtype 0
|
||||||
|
v^a = (x,y,z)
|
||||||
|
orthonormal order: v,u,w
|
||||||
|
m = (phi - i theta)/sqrt(2) following Frans, Eq.(8) of PRD 75, 124018(2007)
|
||||||
#define GAUGE 0
|
tetradtype 1
|
||||||
|
orthonormal order: w,u,v
|
||||||
#define CPBC_ghost_width (ghost_width)
|
m = (theta + i phi)/sqrt(2) following Sperhake, Eq.(3.2) of PRD 85, 124062(2012)
|
||||||
|
tetradtype 2
|
||||||
#define ABV 0
|
v_a = (x,y,z)
|
||||||
|
orthonormal order: v,u,w
|
||||||
#define EScalar_CC 2
|
m = (phi - i theta)/sqrt(2) following Frans, Eq.(8) of PRD 75, 124018(2007)
|
||||||
|
#endif
|
||||||
#if 0
|
#define tetradtype 2
|
||||||
|
|
||||||
define tetradtype
|
#if 0
|
||||||
v:r; u: phi; w: theta
|
note here
|
||||||
tetradtype 0
|
Cell center or Vertex center
|
||||||
v^a = (x,y,z)
|
#endif
|
||||||
orthonormal order: v,u,w
|
#define Cell
|
||||||
m = (phi - i theta)/sqrt(2) following Frans, Eq.(8) of PRD 75, 124018(2007)
|
|
||||||
tetradtype 1
|
#if 0
|
||||||
orthonormal order: w,u,v
|
note here
|
||||||
m = (theta + i phi)/sqrt(2) following Sperhake, Eq.(3.2) of PRD 85, 124062(2012)
|
2nd order: 2
|
||||||
tetradtype 2
|
4th order: 3
|
||||||
v_a = (x,y,z)
|
6th order: 4
|
||||||
orthonormal order: v,u,w
|
8th order: 5
|
||||||
m = (phi - i theta)/sqrt(2) following Frans, Eq.(8) of PRD 75, 124018(2007)
|
#endif
|
||||||
|
#define ghost_width 3
|
||||||
define Cell or Vertex
|
|
||||||
Cell center or Vertex center
|
#if 0
|
||||||
|
note here
|
||||||
define ghost_width
|
use shell or not
|
||||||
2nd order: 2
|
#endif
|
||||||
4th order: 3
|
#define WithShell
|
||||||
6th order: 4
|
|
||||||
8th order: 5
|
#if 0
|
||||||
|
note here
|
||||||
define WithShell
|
use constraint preserving boundary condition or not
|
||||||
use shell or not
|
only affect Z4c
|
||||||
|
#endif
|
||||||
define CPBC
|
#define CPBC
|
||||||
use constraint preserving boundary condition or not
|
|
||||||
only affect Z4c
|
#if 0
|
||||||
CPBC only supports WithShell
|
note here
|
||||||
|
Gauge condition type
|
||||||
define GAUGE
|
0: B^i gauge
|
||||||
0: B^i gauge
|
1: David's puncture gauge
|
||||||
1: David puncture gauge
|
2: MB B^i gauge
|
||||||
2: MB B^i gauge
|
3: RIT B^i gauge
|
||||||
3: RIT B^i gauge
|
4: MB beta gauge (beta gauge not means Eq.(3) of PRD 84, 124006)
|
||||||
4: MB beta gauge (beta gauge not means Eq.(3) of PRD 84, 124006)
|
5: RIT beta gauge (beta gauge not means Eq.(3) of PRD 84, 124006)
|
||||||
5: RIT beta gauge (beta gauge not means Eq.(3) of PRD 84, 124006)
|
6: MGB1 B^i gauge
|
||||||
6: MGB1 B^i gauge
|
7: MGB2 B^i gauge
|
||||||
7: MGB2 B^i gauge
|
#endif
|
||||||
|
#define GAUGE 2
|
||||||
define CPBC_ghost_width (ghost_width)
|
|
||||||
buffer points for CPBC boundary
|
#if 0
|
||||||
|
buffer points for CPBC boundary
|
||||||
define ABV
|
#endif
|
||||||
0: using BSSN variable for constraint violation and psi4 calculation
|
#define CPBC_ghost_width (ghost_width)
|
||||||
1: using ADM variable for constraint violation and psi4 calculation
|
|
||||||
|
#if 0
|
||||||
define EScalar_CC
|
using BSSN variable for constraint violation and psi4 calculation: 0
|
||||||
Type of Potential and Scalar Distribution in F(R) Scalar-Tensor Theory
|
using ADM variable for constraint violation and psi4 calculation: 1
|
||||||
1: Case C of 1112.3928, V=0
|
#endif
|
||||||
2: shell with phi(r) = phi0 * a2^2/(1+a2^2), f(R) = R+a2*R^2 induced V
|
#define ABV 0
|
||||||
3: ground state of Schrodinger-Newton system, f(R) = R+a2*R^2 induced V
|
|
||||||
4: a2 = +oo and phi(r) = phi0 * 0.5 * ( tanh((r+r0)/sigma) - tanh((r-r0)/sigma) )
|
#if 0
|
||||||
5: shell with phi(r) = phi0 * Exp(-(r-r0)**2/sigma), V = 0
|
Type of Potential and Scalar Distribution in F(R) Scalar-Tensor Theory
|
||||||
|
1: Case C of 1112.3928, V=0
|
||||||
#endif
|
2: shell with a2^2*phi0/(1+a2^2), f(R) = R+a2*R^2 induced V
|
||||||
|
3: ground state of Schrodinger-Newton system, f(R) = R+a2*R^2 induced V
|
||||||
|
4: a2 = oo and phi(r) = phi0 * 0.5 * ( tanh((r+r0)/sigma) - tanh((r-r0)/sigma) )
|
||||||
|
5: shell with phi(r) = phi0*Exp(-(r-r0)**2/sigma), V = 0
|
||||||
|
#endif
|
||||||
|
#define EScalar_CC 2
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -1,145 +1,112 @@
|
|||||||
|
|
||||||
#ifndef MICRODEF_H
|
#ifndef MICRODEF_H
|
||||||
#define MICRODEF_H
|
#define MICRODEF_H
|
||||||
|
|
||||||
#include "macrodef.fh"
|
#include "macrodef.fh"
|
||||||
|
|
||||||
// application parameters
|
// application parameters
|
||||||
|
|
||||||
#define SommerType 0
|
/// ****
|
||||||
|
// sommerfeld boundary type
|
||||||
#define GaussInt
|
// 0: bam, 1: shibata
|
||||||
|
#define SommerType 0
|
||||||
#define ABEtype 0
|
|
||||||
|
/// ****
|
||||||
//#define With_AHF
|
// for Using Gauss-Legendre quadrature in theta direction
|
||||||
#define Psi4type 0
|
#define GaussInt
|
||||||
|
|
||||||
//#define Point_Psi4
|
/// ****
|
||||||
|
// 0: BSSN vacuum
|
||||||
#define RPS 1
|
// 1: coupled to scalar field
|
||||||
|
// 2: Z4c vacuum
|
||||||
#define AGM 0
|
// 3: coupled to Maxwell field
|
||||||
|
//
|
||||||
#define RPB 0
|
#define ABEtype 2
|
||||||
|
|
||||||
#define MAPBH 1
|
/// ****
|
||||||
|
// using Apparent Horizon Finder
|
||||||
#define PSTR 0
|
//#define With_AHF
|
||||||
|
|
||||||
#define REGLEV 0
|
/// ****
|
||||||
|
// Psi4 calculation method
|
||||||
//#define USE_GPU
|
// 0: EB method
|
||||||
|
// 1: 4-D method
|
||||||
//#define CHECKDETAIL
|
//
|
||||||
|
#define Psi4type 0
|
||||||
//#define FAKECHECK
|
|
||||||
|
/// ****
|
||||||
//
|
// for Using point psi4 or not
|
||||||
// define SommerType
|
//#define Point_Psi4
|
||||||
// sommerfeld boundary type
|
|
||||||
// 0: bam
|
/// ****
|
||||||
// 1: shibata
|
// RestrictProlong in Step (0) or after Step (1)
|
||||||
//
|
#define RPS 1
|
||||||
// define GaussInt
|
|
||||||
// for Using Gauss-Legendre quadrature in theta direction
|
/// ****
|
||||||
//
|
// Enforce algebra constraint
|
||||||
// define ABEtype
|
// for every RK4 sub step: 0
|
||||||
// 0: BSSN vacuum
|
// only when iter_count == 3: 1
|
||||||
// 1: coupled to scalar field
|
// after routine Step: 2
|
||||||
// 2: Z4c vacuum
|
#define AGM 0
|
||||||
// 3: coupled to Maxwell field
|
|
||||||
//
|
/// ****
|
||||||
// define With_AHF
|
// Restrict Prolong using BAM style 1 or old style 0
|
||||||
// using Apparent Horizon Finder
|
#define RPB 0
|
||||||
//
|
|
||||||
// define Psi4type
|
/// ****
|
||||||
// Psi4 calculation method
|
// 1: move Analysis out ot 4 sub steps and treat PBH with Euler method
|
||||||
// 0: EB method
|
#define MAPBH 1
|
||||||
// 1: 4-D method
|
|
||||||
//
|
/// ****
|
||||||
// define Point_Psi4
|
// parallel structure, 0: level by level, 1: considering all levels, 2: as 1 but reverse the CPU order, 3: Frank's scheme
|
||||||
// for Using point psi4 or not
|
#define PSTR 0
|
||||||
//
|
|
||||||
// define RPS
|
/// ****
|
||||||
// RestrictProlong in Step (0) or after Step (1)
|
// regrid for every level or for all levels at a time
|
||||||
//
|
// 0: for every level; 1: for all
|
||||||
// define AGM
|
#define REGLEV 0
|
||||||
// Enforce algebra constraint
|
|
||||||
// for every RK4 sub step: 0
|
/// ****
|
||||||
// only when iter_count == 3: 1
|
// use gpu or not
|
||||||
// after routine Step: 2
|
//#define USE_GPU
|
||||||
//
|
|
||||||
// define RPB
|
/// ****
|
||||||
// Restrict Prolong using BAM style 1 or old style 0
|
// use checkpoint for every process
|
||||||
//
|
//#define CHECKDETAIL
|
||||||
// define MAPBH
|
|
||||||
// 1: move Analysis out ot 4 sub steps and treat PBH with Euler method
|
/// ****
|
||||||
//
|
// use FakeCheckPrepare to write CheckPoint
|
||||||
// define PSTR
|
//#define FAKECHECK
|
||||||
// parallel structure
|
////================================================================
|
||||||
// 0: level by level
|
// some basic parameters for numerical calculation
|
||||||
// 1: considering all levels
|
#define dim 3
|
||||||
// 2: as 1 but reverse the CPU order
|
|
||||||
// 3: Frank's scheme
|
//#define Cell or Vertex in "microdef.fh"
|
||||||
//
|
|
||||||
// define REGLEV
|
// ******
|
||||||
// regrid for every level or for all levels at a time
|
// buffer point number for mesh refinement interface
|
||||||
// 0: for every level;
|
#define buffer_width 6
|
||||||
// 1: for all
|
|
||||||
//
|
// ******
|
||||||
// define USE_GPU
|
// buffer point number shell-box interface, on shell
|
||||||
// use gpu or not
|
#define SC_width buffer_width
|
||||||
//
|
// buffer point number shell-box interface, on box
|
||||||
// define CHECKDETAIL
|
#define CS_width (2*buffer_width)
|
||||||
// use checkpoint for every process
|
|
||||||
//
|
#if(buffer_width < ghost_width)
|
||||||
// define FAKECHECK
|
#error we always assume buffer_width>ghost_width
|
||||||
// use FakeCheckPrepare to write CheckPoint
|
#endif
|
||||||
//
|
|
||||||
|
#define PACK 1
|
||||||
////================================================================
|
#define UNPACK 2
|
||||||
// some basic parameters for numerical calculation
|
|
||||||
////================================================================
|
#define Mymax(a,b) (((a) > (b)) ? (a) : (b))
|
||||||
|
#define Mymin(a,b) (((a) < (b)) ? (a) : (b))
|
||||||
#define dim 3
|
|
||||||
|
#define feq(a,b,d) (fabs(a-b)<d)
|
||||||
//#define Cell or Vertex in "macrodef.fh"
|
#define flt(a,b,d) ((a-b)<d)
|
||||||
|
#define fgt(a,b,d) ((a-b)>d)
|
||||||
#define buffer_width 6
|
|
||||||
|
#define TINY 1e-10
|
||||||
#define SC_width buffer_width
|
|
||||||
|
#endif /* MICRODEF_H */
|
||||||
#define CS_width (2*buffer_width)
|
|
||||||
|
|
||||||
//
|
|
||||||
// define Cell or Vertex in "macrodef.fh"
|
|
||||||
//
|
|
||||||
// define buffer_width
|
|
||||||
// buffer point number for mesh refinement interface
|
|
||||||
//
|
|
||||||
// define SC_width buffer_width
|
|
||||||
// buffer point number shell-box interface, on shell
|
|
||||||
//
|
|
||||||
// define CS_width
|
|
||||||
// buffer point number shell-box interface, on box
|
|
||||||
//
|
|
||||||
|
|
||||||
#if(buffer_width < ghost_width)
|
|
||||||
# error we always assume buffer_width>ghost_width
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#define PACK 1
|
|
||||||
#define UNPACK 2
|
|
||||||
|
|
||||||
#define Mymax(a,b) (((a) > (b)) ? (a) : (b))
|
|
||||||
#define Mymin(a,b) (((a) < (b)) ? (a) : (b))
|
|
||||||
|
|
||||||
#define feq(a,b,d) (fabs(a-b)<d)
|
|
||||||
#define flt(a,b,d) ((a-b)<d)
|
|
||||||
#define fgt(a,b,d) ((a-b)>d)
|
|
||||||
|
|
||||||
#define TINY 1e-10
|
|
||||||
|
|
||||||
#endif /* MICRODEF_H */
|
|
||||||
|
|
||||||
|
|||||||
@@ -2,33 +2,6 @@
|
|||||||
|
|
||||||
include makefile.inc
|
include makefile.inc
|
||||||
|
|
||||||
## polint(ordn=6) kernel selector:
|
|
||||||
## 1 (default): barycentric fast path
|
|
||||||
## 0 : fallback to Neville path
|
|
||||||
POLINT6_USE_BARY ?= 1
|
|
||||||
POLINT6_FLAG = -DPOLINT6_USE_BARYCENTRIC=$(POLINT6_USE_BARY)
|
|
||||||
|
|
||||||
## ABE build flags selected by PGO_MODE (set in makefile.inc, default: opt)
|
|
||||||
## make -> opt (PGO-guided, maximum performance)
|
|
||||||
## make PGO_MODE=instrument -> instrument (Phase 1: collect fresh profile data)
|
|
||||||
PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/default.profdata
|
|
||||||
|
|
||||||
ifeq ($(PGO_MODE),instrument)
|
|
||||||
## Phase 1: instrumentation — omit -ipo/-fp-model fast=2 for faster build and numerical stability
|
|
||||||
CXXAPPFLAGS = -O3 -xHost -fma -fprofile-instr-generate -ipo \
|
|
||||||
-Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS)
|
|
||||||
f90appflags = -O3 -xHost -fma -fprofile-instr-generate -ipo \
|
|
||||||
-align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG)
|
|
||||||
else
|
|
||||||
## opt (default): maximum performance with PGO profile data
|
|
||||||
CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
|
|
||||||
-fprofile-instr-use=$(PROFDATA) \
|
|
||||||
-Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS)
|
|
||||||
f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \
|
|
||||||
-fprofile-instr-use=$(PROFDATA) \
|
|
||||||
-align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG)
|
|
||||||
endif
|
|
||||||
|
|
||||||
.SUFFIXES: .o .f90 .C .for .cu
|
.SUFFIXES: .o .f90 .C .for .cu
|
||||||
|
|
||||||
.f90.o:
|
.f90.o:
|
||||||
@@ -43,70 +16,19 @@ endif
|
|||||||
.cu.o:
|
.cu.o:
|
||||||
$(Cu) $(CUDA_APP_FLAGS) -c $< -o $@ $(CUDA_LIB_PATH)
|
$(Cu) $(CUDA_APP_FLAGS) -c $< -o $@ $(CUDA_LIB_PATH)
|
||||||
|
|
||||||
# CUDA rewrite of BSSN RHS (drop-in replacement for bssn_rhs_c + stencil helpers)
|
|
||||||
bssn_rhs_cuda.o: bssn_rhs_cuda.cu macrodef.h
|
|
||||||
$(Cu) $(CUDA_APP_FLAGS) -c $< -o $@ $(CUDA_LIB_PATH)
|
|
||||||
|
|
||||||
# C rewrite of BSSN RHS kernel and helpers
|
|
||||||
bssn_rhs_c.o: bssn_rhs_c.C
|
|
||||||
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
|
||||||
|
|
||||||
fderivs_c.o: fderivs_c.C
|
|
||||||
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
|
||||||
|
|
||||||
fdderivs_c.o: fdderivs_c.C
|
|
||||||
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
|
||||||
|
|
||||||
kodiss_c.o: kodiss_c.C
|
|
||||||
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
|
||||||
|
|
||||||
lopsided_c.o: lopsided_c.C
|
|
||||||
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
|
||||||
|
|
||||||
interp_lb_profile.o: interp_lb_profile.C interp_lb_profile.h
|
|
||||||
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
|
||||||
|
|
||||||
## TwoPunctureABE uses fixed optimal flags with its own PGO profile, independent of CXXAPPFLAGS
|
|
||||||
TP_PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/TwoPunctureABE.profdata
|
|
||||||
TP_OPTFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
|
|
||||||
-fprofile-instr-use=$(TP_PROFDATA) \
|
|
||||||
-Dfortran3 -Dnewc -I${MKLROOT}/include
|
|
||||||
|
|
||||||
TwoPunctures.o: TwoPunctures.C
|
TwoPunctures.o: TwoPunctures.C
|
||||||
${CXX} $(TP_OPTFLAGS) -qopenmp -c $< -o $@
|
${CXX} $(CXXAPPFLAGS) -qopenmp -c $< -o $@
|
||||||
|
|
||||||
TwoPunctureABE.o: TwoPunctureABE.C
|
TwoPunctureABE.o: TwoPunctureABE.C
|
||||||
${CXX} $(TP_OPTFLAGS) -qopenmp -c $< -o $@
|
${CXX} $(CXXAPPFLAGS) -qopenmp -c $< -o $@
|
||||||
|
|
||||||
# Input files
|
# Input files
|
||||||
|
|
||||||
## Kernel implementation switch (set USE_CXX_KERNELS=0 to fall back to Fortran)
|
|
||||||
ifeq ($(USE_CXX_KERNELS),0)
|
|
||||||
# Fortran mode: no C rewrite files; bssn_rhs.o is included via F90FILES below
|
|
||||||
CFILES =
|
|
||||||
else
|
|
||||||
# C++ mode (default): C rewrite of bssn_rhs and helper kernels
|
|
||||||
CFILES = bssn_rhs_c.o fderivs_c.o fdderivs_c.o kodiss_c.o lopsided_c.o
|
|
||||||
endif
|
|
||||||
|
|
||||||
# CUDA rewrite: bssn_rhs_cuda.o replaces all CFILES (stencils are built-in)
|
|
||||||
CFILES_CUDA = bssn_rhs_cuda.o
|
|
||||||
|
|
||||||
## RK4 kernel switch (independent from USE_CXX_KERNELS)
|
|
||||||
ifeq ($(USE_CXX_RK4),1)
|
|
||||||
CFILES += rungekutta4_rout_c.o
|
|
||||||
CFILES_CUDA += rungekutta4_rout_c.o
|
|
||||||
RK4_F90_OBJ =
|
|
||||||
else
|
|
||||||
RK4_F90_OBJ = rungekutta4_rout.o
|
|
||||||
endif
|
|
||||||
|
|
||||||
C++FILES = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\
|
C++FILES = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\
|
||||||
cgh.o bssn_class.o surface_integral.o ShellPatch.o\
|
cgh.o bssn_class.o surface_integral.o ShellPatch.o\
|
||||||
bssnEScalar_class.o perf.o Z4c_class.o NullShellPatch.o\
|
bssnEScalar_class.o perf.o Z4c_class.o NullShellPatch.o\
|
||||||
bssnEM_class.o cpbc_util.o z4c_rhs_point.o checkpoint.o\
|
bssnEM_class.o cpbc_util.o z4c_rhs_point.o checkpoint.o\
|
||||||
Parallel_bam.o scalar_class.o transpbh.o NullShellPatch2.o\
|
Parallel_bam.o scalar_class.o transpbh.o NullShellPatch2.o\
|
||||||
NullShellPatch2_Evo.o writefile_f.o interp_lb_profile.o
|
NullShellPatch2_Evo.o writefile_f.o
|
||||||
|
|
||||||
C++FILES_GPU = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\
|
C++FILES_GPU = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\
|
||||||
cgh.o surface_integral.o ShellPatch.o\
|
cgh.o surface_integral.o ShellPatch.o\
|
||||||
@@ -116,9 +38,9 @@ C++FILES_GPU = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o
|
|||||||
NullShellPatch2_Evo.o \
|
NullShellPatch2_Evo.o \
|
||||||
bssn_gpu_class.o bssn_step_gpu.o bssn_macro.o writefile_f.o
|
bssn_gpu_class.o bssn_step_gpu.o bssn_macro.o writefile_f.o
|
||||||
|
|
||||||
F90FILES_BASE = enforce_algebra.o fmisc.o initial_puncture.o prolongrestrict.o\
|
F90FILES = enforce_algebra.o fmisc.o initial_puncture.o prolongrestrict.o\
|
||||||
prolongrestrict_cell.o prolongrestrict_vertex.o\
|
prolongrestrict_cell.o prolongrestrict_vertex.o\
|
||||||
$(RK4_F90_OBJ) diff_new.o kodiss.o kodiss_sh.o\
|
rungekutta4_rout.o bssn_rhs.o diff_new.o kodiss.o kodiss_sh.o\
|
||||||
lopsidediff.o sommerfeld_rout.o getnp4.o diff_new_sh.o\
|
lopsidediff.o sommerfeld_rout.o getnp4.o diff_new_sh.o\
|
||||||
shellfunctions.o bssn_rhs_ss.o Set_Rho_ADM.o\
|
shellfunctions.o bssn_rhs_ss.o Set_Rho_ADM.o\
|
||||||
getnp4EScalar.o bssnEScalar_rhs.o bssn_constraint.o ricci_gamma.o\
|
getnp4EScalar.o bssnEScalar_rhs.o bssn_constraint.o ricci_gamma.o\
|
||||||
@@ -129,14 +51,6 @@ F90FILES_BASE = enforce_algebra.o fmisc.o initial_puncture.o prolongrestrict.o\
|
|||||||
scalar_rhs.o initial_scalar.o NullEvol2.o initial_null2.o\
|
scalar_rhs.o initial_scalar.o NullEvol2.o initial_null2.o\
|
||||||
NullNews2.o tool_f.o
|
NullNews2.o tool_f.o
|
||||||
|
|
||||||
ifeq ($(USE_CXX_KERNELS),0)
|
|
||||||
# Fortran mode: include original bssn_rhs.o
|
|
||||||
F90FILES = $(F90FILES_BASE) bssn_rhs.o
|
|
||||||
else
|
|
||||||
# C++ mode (default): bssn_rhs.o replaced by C++ kernel
|
|
||||||
F90FILES = $(F90FILES_BASE)
|
|
||||||
endif
|
|
||||||
|
|
||||||
F77FILES = zbesh.o
|
F77FILES = zbesh.o
|
||||||
|
|
||||||
AHFDOBJS = expansion.o expansion_Jacobian.o patch.o coords.o patch_info.o patch_interp.o patch_system.o \
|
AHFDOBJS = expansion.o expansion_Jacobian.o patch.o coords.o patch_info.o patch_interp.o patch_system.o \
|
||||||
@@ -149,7 +63,7 @@ TwoPunctureFILES = TwoPunctureABE.o TwoPunctures.o
|
|||||||
CUDAFILES = bssn_gpu.o bssn_gpu_rhs_ss.o
|
CUDAFILES = bssn_gpu.o bssn_gpu_rhs_ss.o
|
||||||
|
|
||||||
# file dependences
|
# file dependences
|
||||||
$(C++FILES) $(C++FILES_GPU) $(F90FILES) $(CFILES) $(AHFDOBJS) $(CUDAFILES): macrodef.fh
|
$(C++FILES) $(C++FILESGPU) $(F90FILES) $(AHFDOBJS) $(CUDAFILES): macrodef.fh
|
||||||
|
|
||||||
$(C++FILES): Block.h enforce_algebra.h fmisc.h initial_puncture.h macrodef.h\
|
$(C++FILES): Block.h enforce_algebra.h fmisc.h initial_puncture.h macrodef.h\
|
||||||
misc.h monitor.h MyList.h Parallel.h MPatch.h prolongrestrict.h\
|
misc.h monitor.h MyList.h Parallel.h MPatch.h prolongrestrict.h\
|
||||||
@@ -172,7 +86,7 @@ $(C++FILES_GPU): Block.h enforce_algebra.h fmisc.h initial_puncture.h macrodef.h
|
|||||||
|
|
||||||
$(AHFDOBJS): cctk.h cctk_Config.h cctk_Types.h cctk_Constants.h myglobal.h
|
$(AHFDOBJS): cctk.h cctk_Config.h cctk_Types.h cctk_Constants.h myglobal.h
|
||||||
|
|
||||||
$(C++FILES) $(C++FILES_GPU) $(CFILES) $(AHFDOBJS) $(CUDAFILES): macrodef.h
|
$(C++FILES) $(C++FILES_GPU) $(AHFDOBJS) $(CUDAFILES): macrodef.h
|
||||||
|
|
||||||
TwoPunctureFILES: TwoPunctures.h
|
TwoPunctureFILES: TwoPunctures.h
|
||||||
|
|
||||||
@@ -181,17 +95,14 @@ $(CUDAFILES): bssn_gpu.h gpu_mem.h gpu_rhsSS_mem.h
|
|||||||
misc.o : zbesh.o
|
misc.o : zbesh.o
|
||||||
|
|
||||||
# projects
|
# projects
|
||||||
ABE: $(C++FILES) $(CFILES_CUDA) $(F90FILES) $(F77FILES) $(AHFDOBJS)
|
ABE: $(C++FILES) $(F90FILES) $(F77FILES) $(AHFDOBJS)
|
||||||
$(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES) $(CFILES_CUDA) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(LDLIBS) -lcudart $(CUDA_LIB_PATH)
|
$(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(LDLIBS)
|
||||||
|
|
||||||
ABE_CUDA: $(C++FILES) $(CFILES_CUDA) $(F90FILES) $(F77FILES) $(AHFDOBJS)
|
ABEGPU: $(C++FILES_GPU) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES)
|
||||||
$(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES) $(CFILES_CUDA) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(LDLIBS) -lcudart $(CUDA_LIB_PATH)
|
$(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES_GPU) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) $(LDLIBS)
|
||||||
|
|
||||||
ABEGPU: $(C++FILES_GPU) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES)
|
|
||||||
$(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES_GPU) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) $(LDLIBS)
|
|
||||||
|
|
||||||
TwoPunctureABE: $(TwoPunctureFILES)
|
TwoPunctureABE: $(TwoPunctureFILES)
|
||||||
$(CLINKER) $(TP_OPTFLAGS) -qopenmp -o $@ $(TwoPunctureFILES) $(LDLIBS)
|
$(CLINKER) $(CXXAPPFLAGS) -qopenmp -o $@ $(TwoPunctureFILES) $(LDLIBS)
|
||||||
|
|
||||||
clean:
|
clean:
|
||||||
rm *.o ABE ABE_CUDA ABEGPU TwoPunctureABE make.log -f
|
rm *.o ABE ABEGPU TwoPunctureABE make.log -f
|
||||||
|
|||||||
@@ -8,51 +8,18 @@ filein = -I/usr/include/ -I${MKLROOT}/include
|
|||||||
|
|
||||||
## Using sequential MKL (OpenMP disabled for better single-threaded performance)
|
## Using sequential MKL (OpenMP disabled for better single-threaded performance)
|
||||||
## Added -lifcore for Intel Fortran runtime and -limf for Intel math library
|
## Added -lifcore for Intel Fortran runtime and -limf for Intel math library
|
||||||
LDLIBS = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore -limf -lpthread -lm -ldl -liomp5
|
LDLIBS = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore -limf -lpthread -lm -ldl
|
||||||
|
|
||||||
## Memory allocator switch
|
|
||||||
## 1 (default) : link Intel oneTBB allocator (libtbbmalloc)
|
|
||||||
## 0 : use system default allocator (ptmalloc)
|
|
||||||
USE_TBBMALLOC ?= 1
|
|
||||||
TBBMALLOC_SO ?= /home/intel/oneapi/2025.3/lib/libtbbmalloc.so
|
|
||||||
ifneq ($(wildcard $(TBBMALLOC_SO)),)
|
|
||||||
TBBMALLOC_LIBS = -Wl,--no-as-needed $(TBBMALLOC_SO) -Wl,--as-needed
|
|
||||||
else
|
|
||||||
TBBMALLOC_LIBS = -Wl,--no-as-needed -ltbbmalloc -Wl,--as-needed
|
|
||||||
endif
|
|
||||||
ifeq ($(USE_TBBMALLOC),1)
|
|
||||||
LDLIBS := $(TBBMALLOC_LIBS) $(LDLIBS)
|
|
||||||
endif
|
|
||||||
|
|
||||||
## PGO build mode switch (ABE only; TwoPunctureABE always uses opt flags)
|
|
||||||
## opt : (default) maximum performance with PGO profile-guided optimization
|
|
||||||
## instrument : PGO Phase 1 instrumentation to collect fresh profile data
|
|
||||||
PGO_MODE ?= opt
|
|
||||||
|
|
||||||
## Interp_Points load balance profiling mode
|
|
||||||
## off : (default) no load balance instrumentation
|
|
||||||
## profile : Pass 1 — instrument Interp_Points to collect timing profile
|
|
||||||
## optimize : Pass 2 — read profile and apply block rebalancing
|
|
||||||
INTERP_LB_MODE ?= off
|
|
||||||
|
|
||||||
ifeq ($(INTERP_LB_MODE),profile)
|
|
||||||
INTERP_LB_FLAGS = -DINTERP_LB_PROFILE
|
|
||||||
else ifeq ($(INTERP_LB_MODE),optimize)
|
|
||||||
INTERP_LB_FLAGS = -DINTERP_LB_OPTIMIZE
|
|
||||||
else
|
|
||||||
INTERP_LB_FLAGS =
|
|
||||||
endif
|
|
||||||
|
|
||||||
## Kernel implementation switch
|
|
||||||
## 1 (default) : use C++ rewrite of bssn_rhs and helper kernels (faster)
|
|
||||||
## 0 : fall back to original Fortran kernels
|
|
||||||
USE_CXX_KERNELS ?= 1
|
|
||||||
|
|
||||||
## RK4 kernel implementation switch
|
|
||||||
## 1 (default) : use C/C++ rewrite of rungekutta4_rout (for optimization experiments)
|
|
||||||
## 0 : use original Fortran rungekutta4_rout.o
|
|
||||||
USE_CXX_RK4 ?= 1
|
|
||||||
|
|
||||||
|
## Aggressive optimization flags + PGO Phase 2 (profile-guided optimization)
|
||||||
|
## -fprofile-instr-use: use collected profile data to guide optimization decisions
|
||||||
|
## (branch prediction, basic block layout, inlining, loop unrolling)
|
||||||
|
PROFDATA = ../../pgo_profile/default.profdata
|
||||||
|
CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
|
||||||
|
-fprofile-instr-use=$(PROFDATA) \
|
||||||
|
-Dfortran3 -Dnewc -I${MKLROOT}/include
|
||||||
|
f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \
|
||||||
|
-fprofile-instr-use=$(PROFDATA) \
|
||||||
|
-align array64byte -fpp -I${MKLROOT}/include
|
||||||
f90 = ifx
|
f90 = ifx
|
||||||
f77 = ifx
|
f77 = ifx
|
||||||
CXX = icpx
|
CXX = icpx
|
||||||
@@ -62,4 +29,4 @@ CLINKER = mpiicpx
|
|||||||
Cu = nvcc
|
Cu = nvcc
|
||||||
CUDA_LIB_PATH = -L/usr/lib/cuda/lib64 -I/usr/include -I/usr/lib/cuda/include
|
CUDA_LIB_PATH = -L/usr/lib/cuda/lib64 -I/usr/include -I/usr/lib/cuda/include
|
||||||
#CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -arch compute_13 -code compute_13,sm_13 -Dfortran3 -Dnewc
|
#CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -arch compute_13 -code compute_13,sm_13 -Dfortran3 -Dnewc
|
||||||
CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -Dfortran3 -Dnewc -arch=sm_80
|
CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -Dfortran3 -Dnewc
|
||||||
|
|||||||
@@ -217,6 +217,7 @@
|
|||||||
real*8,dimension(2*ghost_width) :: X,Y,Z
|
real*8,dimension(2*ghost_width) :: X,Y,Z
|
||||||
real*8, dimension(2*ghost_width,2*ghost_width) :: tmp2
|
real*8, dimension(2*ghost_width,2*ghost_width) :: tmp2
|
||||||
real*8, dimension(2*ghost_width) :: tmp1
|
real*8, dimension(2*ghost_width) :: tmp1
|
||||||
|
real*8 :: ddy
|
||||||
real*8,dimension(3) :: ccp
|
real*8,dimension(3) :: ccp
|
||||||
|
|
||||||
#if (ghost_width == 2)
|
#if (ghost_width == 2)
|
||||||
@@ -579,7 +580,7 @@
|
|||||||
tmp1(ghost_width-cxI(1)+cxB(1) :ghost_width-cxI(1)+cxT(1) ) = funf(cxB(1):cxT(1),j,k)
|
tmp1(ghost_width-cxI(1)+cxB(1) :ghost_width-cxI(1)+cxT(1) ) = funf(cxB(1):cxT(1),j,k)
|
||||||
endif
|
endif
|
||||||
|
|
||||||
call polint0(X,tmp1,funf(i,j,k),2*ghost_width)
|
call polint(X,tmp1,0.d0,funf(i,j,k),ddy,2*ghost_width)
|
||||||
|
|
||||||
! for y direction
|
! for y direction
|
||||||
elseif(sum(fg).eq.2.and.fg(2) .eq. 0.and. &
|
elseif(sum(fg).eq.2.and.fg(2) .eq. 0.and. &
|
||||||
@@ -689,7 +690,7 @@
|
|||||||
tmp1(ghost_width-cxI(2)+cxB(2) :ghost_width-cxI(2)+cxT(2) ) = funf(i,cxB(2):cxT(2),k)
|
tmp1(ghost_width-cxI(2)+cxB(2) :ghost_width-cxI(2)+cxT(2) ) = funf(i,cxB(2):cxT(2),k)
|
||||||
endif
|
endif
|
||||||
|
|
||||||
call polint0(Y,tmp1,funf(i,j,k),2*ghost_width)
|
call polint(Y,tmp1,0.d0,funf(i,j,k),ddy,2*ghost_width)
|
||||||
|
|
||||||
! for z direction
|
! for z direction
|
||||||
elseif(sum(fg).eq.2.and.fg(3) .eq. 0.and. &
|
elseif(sum(fg).eq.2.and.fg(3) .eq. 0.and. &
|
||||||
@@ -801,7 +802,7 @@
|
|||||||
tmp1(ghost_width-cxI(3)+cxB(3) :ghost_width-cxI(3)+cxT(3) ) = funf(i,j,cxB(3):cxT(3))
|
tmp1(ghost_width-cxI(3)+cxB(3) :ghost_width-cxI(3)+cxT(3) ) = funf(i,j,cxB(3):cxT(3))
|
||||||
endif
|
endif
|
||||||
|
|
||||||
call polint0(Z,tmp1,funf(i,j,k),2*ghost_width)
|
call polint(Z,tmp1,0.d0,funf(i,j,k),ddy,2*ghost_width)
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
@@ -1933,35 +1934,18 @@
|
|||||||
! when if=1 -> ic=0, this is different to vertex center grid
|
! when if=1 -> ic=0, this is different to vertex center grid
|
||||||
real*8, dimension(-2:extc(1),-2:extc(2),-2:extc(3)) :: funcc
|
real*8, dimension(-2:extc(1),-2:extc(2),-2:extc(3)) :: funcc
|
||||||
integer,dimension(3) :: cxI
|
integer,dimension(3) :: cxI
|
||||||
integer :: i,j,k,ii,jj,kk,px,py,pz
|
integer :: i,j,k,ii,jj,kk
|
||||||
real*8, dimension(6,6) :: tmp2
|
real*8, dimension(6,6) :: tmp2
|
||||||
real*8, dimension(6) :: tmp1
|
real*8, dimension(6) :: tmp1
|
||||||
integer, dimension(extf(1)) :: cix
|
|
||||||
integer, dimension(extf(2)) :: ciy
|
|
||||||
integer, dimension(extf(3)) :: ciz
|
|
||||||
integer, dimension(extf(1)) :: pix
|
|
||||||
integer, dimension(extf(2)) :: piy
|
|
||||||
integer, dimension(extf(3)) :: piz
|
|
||||||
|
|
||||||
real*8, parameter :: C1=7.7d1/8.192d3,C2=-6.93d2/8.192d3,C3=3.465d3/4.096d3
|
real*8, parameter :: C1=7.7d1/8.192d3,C2=-6.93d2/8.192d3,C3=3.465d3/4.096d3
|
||||||
real*8, parameter :: C6=6.3d1/8.192d3,C5=-4.95d2/8.192d3,C4=1.155d3/4.096d3
|
real*8, parameter :: C6=6.3d1/8.192d3,C5=-4.95d2/8.192d3,C4=1.155d3/4.096d3
|
||||||
real*8, dimension(6,2), parameter :: WC = reshape((/&
|
|
||||||
C1,C2,C3,C4,C5,C6,&
|
|
||||||
C6,C5,C4,C3,C2,C1/), (/6,2/))
|
|
||||||
|
|
||||||
integer::imini,imaxi,jmini,jmaxi,kmini,kmaxi
|
integer::imini,imaxi,jmini,jmaxi,kmini,kmaxi
|
||||||
integer::imino,imaxo,jmino,jmaxo,kmino,kmaxo
|
integer::imino,imaxo,jmino,jmaxo,kmino,kmaxo
|
||||||
integer::maxcx,maxcy,maxcz
|
|
||||||
|
|
||||||
real*8,dimension(3) :: CD,FD
|
real*8,dimension(3) :: CD,FD
|
||||||
real*8 :: tmp_yz(extc(1), 6) ! 存储整条 X 线上 6 个 Y 轴偏置的 Z 向插值结果
|
|
||||||
real*8 :: tmp_xyz_line(-2:extc(1)) ! 包含 X 向 6 点模板访问所需下界
|
|
||||||
real*8 :: v1, v2, v3, v4, v5, v6
|
|
||||||
integer :: ic, jc, kc, ix_offset,ix,iy,iz,jc_min,jc_max,ic_min,ic_max,kc_min,kc_max
|
|
||||||
integer :: i_lo, i_hi, j_lo, j_hi, k_lo, k_hi
|
|
||||||
logical :: need_full_symmetry
|
|
||||||
real*8 :: res_line
|
|
||||||
real*8 :: tmp_z_slab(-2:extc(1), -2:extc(2)) ! 包含 Y/X 向模板访问所需下界
|
|
||||||
if(wei.ne.3)then
|
if(wei.ne.3)then
|
||||||
write(*,*)"prolongrestrict.f90::prolong3: this routine only surport 3 dimension"
|
write(*,*)"prolongrestrict.f90::prolong3: this routine only surport 3 dimension"
|
||||||
write(*,*)"dim = ",wei
|
write(*,*)"dim = ",wei
|
||||||
@@ -2036,140 +2020,145 @@
|
|||||||
return
|
return
|
||||||
endif
|
endif
|
||||||
|
|
||||||
do i = imino,imaxo
|
call symmetry_bd(3,extc,func,funcc,SoA)
|
||||||
ii = i + lbf(1) - 1
|
|
||||||
cix(i) = ii/2 - lbc(1) + 1
|
|
||||||
if(ii/2*2 == ii)then
|
|
||||||
pix(i) = 1
|
|
||||||
else
|
|
||||||
pix(i) = 2
|
|
||||||
endif
|
|
||||||
enddo
|
|
||||||
do j = jmino,jmaxo
|
|
||||||
jj = j + lbf(2) - 1
|
|
||||||
ciy(j) = jj/2 - lbc(2) + 1
|
|
||||||
if(jj/2*2 == jj)then
|
|
||||||
piy(j) = 1
|
|
||||||
else
|
|
||||||
piy(j) = 2
|
|
||||||
endif
|
|
||||||
enddo
|
|
||||||
do k = kmino,kmaxo
|
|
||||||
kk = k + lbf(3) - 1
|
|
||||||
ciz(k) = kk/2 - lbc(3) + 1
|
|
||||||
if(kk/2*2 == kk)then
|
|
||||||
piz(k) = 1
|
|
||||||
else
|
|
||||||
piz(k) = 2
|
|
||||||
endif
|
|
||||||
enddo
|
|
||||||
|
|
||||||
ic_min = minval(cix(imino:imaxo))
|
|
||||||
ic_max = maxval(cix(imino:imaxo))
|
|
||||||
jc_min = minval(ciy(jmino:jmaxo))
|
|
||||||
jc_max = maxval(ciy(jmino:jmaxo))
|
|
||||||
kc_min = minval(ciz(kmino:kmaxo))
|
|
||||||
kc_max = maxval(ciz(kmino:kmaxo))
|
|
||||||
|
|
||||||
maxcx = ic_max
|
|
||||||
maxcy = jc_max
|
|
||||||
maxcz = kc_max
|
|
||||||
if(maxcx+3 > extc(1) .or. maxcy+3 > extc(2) .or. maxcz+3 > extc(3))then
|
|
||||||
write(*,*)"error in prolong"
|
|
||||||
return
|
|
||||||
endif
|
|
||||||
|
|
||||||
i_lo = ic_min - 2
|
|
||||||
i_hi = ic_max + 3
|
|
||||||
j_lo = jc_min - 2
|
|
||||||
j_hi = jc_max + 3
|
|
||||||
k_lo = kc_min - 2
|
|
||||||
k_hi = kc_max + 3
|
|
||||||
need_full_symmetry = (i_lo < 1) .or. (j_lo < 1) .or. (k_lo < 1)
|
|
||||||
if(need_full_symmetry)then
|
|
||||||
call symmetry_bd(3,extc,func,funcc,SoA)
|
|
||||||
else
|
|
||||||
funcc(i_lo:i_hi,j_lo:j_hi,k_lo:k_hi) = func(i_lo:i_hi,j_lo:j_hi,k_lo:k_hi)
|
|
||||||
endif
|
|
||||||
|
|
||||||
! 对每个 k(pz, kc 固定)预计算 Z 向插值的 2D 切片
|
|
||||||
|
|
||||||
do k = kmino, kmaxo
|
|
||||||
pz = piz(k); kc = ciz(k)
|
|
||||||
! --- Pass 1: Z 方向,只算一次 ---
|
|
||||||
do iy = jc_min-2, jc_max+3 ! 仅需的 iy 范围(对应 jc-2:jc+3)
|
|
||||||
do ii = ic_min-2, ic_max+3 ! 仅需的 ii 范围(对应 cix-2:cix+3)
|
|
||||||
tmp_z_slab(ii, iy) = sum(WC(:,pz) * funcc(ii, iy, kc-2:kc+3))
|
|
||||||
end do
|
|
||||||
end do
|
|
||||||
|
|
||||||
do j = jmino, jmaxo
|
|
||||||
py = piy(j); jc = ciy(j)
|
|
||||||
! --- Pass 2: Y 方向 ---
|
|
||||||
do ii = ic_min-2, ic_max+3
|
|
||||||
tmp_xyz_line(ii) = sum(WC(:,py) * tmp_z_slab(ii, jc-2:jc+3))
|
|
||||||
end do
|
|
||||||
! --- Pass 3: X 方向 ---
|
|
||||||
do i = imino, imaxo
|
|
||||||
funf(i,j,k) = sum(WC(:,pix(i)) * tmp_xyz_line(cix(i)-2:cix(i)+3))
|
|
||||||
end do
|
|
||||||
end do
|
|
||||||
end do
|
|
||||||
|
|
||||||
!~~~~~~> prolongation start...
|
!~~~~~~> prolongation start...
|
||||||
|
do k = kmino,kmaxo
|
||||||
|
do j = jmino,jmaxo
|
||||||
|
do i = imino,imaxo
|
||||||
|
cxI(1) = i
|
||||||
|
cxI(2) = j
|
||||||
|
cxI(3) = k
|
||||||
|
! change to coarse level reference
|
||||||
|
!|---*--- ---*--- ---*--- ---*--- ---*--- ---*--- ---*--- ---*---|
|
||||||
|
!|=======x===============x===============x===============x=======|
|
||||||
|
cxI = (cxI+lbf-1)/2
|
||||||
|
! change to array index
|
||||||
|
cxI = cxI - lbc + 1
|
||||||
|
|
||||||
|
if(any(cxI+3 > extc)) write(*,*)"error in prolong"
|
||||||
|
ii=i+lbf(1)-1
|
||||||
|
jj=j+lbf(2)-1
|
||||||
|
kk=k+lbf(3)-1
|
||||||
#if 0
|
#if 0
|
||||||
do k = kmino, kmaxo
|
if(ii/2*2==ii)then
|
||||||
pz = piz(k)
|
if(jj/2*2==jj)then
|
||||||
kc = ciz(k)
|
if(kk/2*2==kk)then
|
||||||
|
tmp2= C1*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+&
|
||||||
|
C2*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+&
|
||||||
|
C3*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3) )+&
|
||||||
|
C4*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+&
|
||||||
|
C5*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+&
|
||||||
|
C6*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3)
|
||||||
|
tmp1= C1*tmp2(:,1)+C2*tmp2(:,2)+C3*tmp2(:,3)+C4*tmp2(:,4)+C5*tmp2(:,5)+C6*tmp2(:,6)
|
||||||
|
funf(i,j,k)= C1*tmp1(1)+C2*tmp1(2)+C3*tmp1(3)+C4*tmp1(4)+C5*tmp1(5)+C6*tmp1(6)
|
||||||
|
else
|
||||||
|
tmp2= C6*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+&
|
||||||
|
C5*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+&
|
||||||
|
C4*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3) )+&
|
||||||
|
C3*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+&
|
||||||
|
C2*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+&
|
||||||
|
C1*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3)
|
||||||
|
tmp1= C1*tmp2(:,1)+C2*tmp2(:,2)+C3*tmp2(:,3)+C4*tmp2(:,4)+C5*tmp2(:,5)+C6*tmp2(:,6)
|
||||||
|
funf(i,j,k)= C1*tmp1(1)+C2*tmp1(2)+C3*tmp1(3)+C4*tmp1(4)+C5*tmp1(5)+C6*tmp1(6)
|
||||||
|
endif
|
||||||
|
else
|
||||||
|
if(kk/2*2==kk)then
|
||||||
|
tmp2= C1*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+&
|
||||||
|
C2*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+&
|
||||||
|
C3*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3) )+&
|
||||||
|
C4*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+&
|
||||||
|
C5*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+&
|
||||||
|
C6*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3)
|
||||||
|
tmp1= C6*tmp2(:,1)+C5*tmp2(:,2)+C4*tmp2(:,3)+C3*tmp2(:,4)+C2*tmp2(:,5)+C1*tmp2(:,6)
|
||||||
|
funf(i,j,k)= C1*tmp1(1)+C2*tmp1(2)+C3*tmp1(3)+C4*tmp1(4)+C5*tmp1(5)+C6*tmp1(6)
|
||||||
|
else
|
||||||
|
tmp2= C6*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+&
|
||||||
|
C5*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+&
|
||||||
|
C4*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3) )+&
|
||||||
|
C3*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+&
|
||||||
|
C2*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+&
|
||||||
|
C1*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3)
|
||||||
|
tmp1= C6*tmp2(:,1)+C5*tmp2(:,2)+C4*tmp2(:,3)+C3*tmp2(:,4)+C2*tmp2(:,5)+C1*tmp2(:,6)
|
||||||
|
funf(i,j,k)= C1*tmp1(1)+C2*tmp1(2)+C3*tmp1(3)+C4*tmp1(4)+C5*tmp1(5)+C6*tmp1(6)
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
else
|
||||||
|
if(jj/2*2==jj)then
|
||||||
|
if(kk/2*2==kk)then
|
||||||
|
tmp2= C1*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+&
|
||||||
|
C2*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+&
|
||||||
|
C3*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3) )+&
|
||||||
|
C4*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+&
|
||||||
|
C5*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+&
|
||||||
|
C6*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3)
|
||||||
|
tmp1= C1*tmp2(:,1)+C2*tmp2(:,2)+C3*tmp2(:,3)+C4*tmp2(:,4)+C5*tmp2(:,5)+C6*tmp2(:,6)
|
||||||
|
funf(i,j,k)= C6*tmp1(1)+C5*tmp1(2)+C4*tmp1(3)+C3*tmp1(4)+C2*tmp1(5)+C1*tmp1(6)
|
||||||
|
else
|
||||||
|
tmp2= C6*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+&
|
||||||
|
C5*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+&
|
||||||
|
C4*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3) )+&
|
||||||
|
C3*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+&
|
||||||
|
C2*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+&
|
||||||
|
C1*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3)
|
||||||
|
tmp1= C1*tmp2(:,1)+C2*tmp2(:,2)+C3*tmp2(:,3)+C4*tmp2(:,4)+C5*tmp2(:,5)+C6*tmp2(:,6)
|
||||||
|
funf(i,j,k)= C6*tmp1(1)+C5*tmp1(2)+C4*tmp1(3)+C3*tmp1(4)+C2*tmp1(5)+C1*tmp1(6)
|
||||||
|
endif
|
||||||
|
else
|
||||||
|
if(kk/2*2==kk)then
|
||||||
|
tmp2= C1*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+&
|
||||||
|
C2*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+&
|
||||||
|
C3*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3) )+&
|
||||||
|
C4*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+&
|
||||||
|
C5*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+&
|
||||||
|
C6*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3)
|
||||||
|
tmp1= C6*tmp2(:,1)+C5*tmp2(:,2)+C4*tmp2(:,3)+C3*tmp2(:,4)+C2*tmp2(:,5)+C1*tmp2(:,6)
|
||||||
|
funf(i,j,k)= C6*tmp1(1)+C5*tmp1(2)+C4*tmp1(3)+C3*tmp1(4)+C2*tmp1(5)+C1*tmp1(6)
|
||||||
|
else
|
||||||
|
tmp2= C6*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+&
|
||||||
|
C5*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+&
|
||||||
|
C4*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3) )+&
|
||||||
|
C3*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+&
|
||||||
|
C2*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+&
|
||||||
|
C1*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3)
|
||||||
|
tmp1= C6*tmp2(:,1)+C5*tmp2(:,2)+C4*tmp2(:,3)+C3*tmp2(:,4)+C2*tmp2(:,5)+C1*tmp2(:,6)
|
||||||
|
funf(i,j,k)= C6*tmp1(1)+C5*tmp1(2)+C4*tmp1(3)+C3*tmp1(4)+C2*tmp1(5)+C1*tmp1(6)
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
#else
|
||||||
|
if(kk/2*2==kk)then
|
||||||
|
tmp2= C1*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+&
|
||||||
|
C2*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+&
|
||||||
|
C3*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3) )+&
|
||||||
|
C4*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+&
|
||||||
|
C5*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+&
|
||||||
|
C6*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3)
|
||||||
|
else
|
||||||
|
tmp2= C6*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-2)+&
|
||||||
|
C5*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)-1)+&
|
||||||
|
C4*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3) )+&
|
||||||
|
C3*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+1)+&
|
||||||
|
C2*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+2)+&
|
||||||
|
C1*funcc(cxI(1)-2:cxI(1)+3,cxI(2)-2:cxI(2)+3,cxI(3)+3)
|
||||||
|
endif
|
||||||
|
|
||||||
do j = jmino, jmaxo
|
if(jj/2*2==jj)then
|
||||||
py = piy(j)
|
tmp1= C1*tmp2(:,1)+C2*tmp2(:,2)+C3*tmp2(:,3)+C4*tmp2(:,4)+C5*tmp2(:,5)+C6*tmp2(:,6)
|
||||||
jc = ciy(j)
|
else
|
||||||
|
tmp1= C6*tmp2(:,1)+C5*tmp2(:,2)+C4*tmp2(:,3)+C3*tmp2(:,4)+C2*tmp2(:,5)+C1*tmp2(:,6)
|
||||||
|
endif
|
||||||
|
|
||||||
! --- 步骤 1 & 2 融合:分段处理 X 轴,提升 Cache 命中率 ---
|
if(ii/2*2==ii)then
|
||||||
! 我们将 ii 循环逻辑重组,减少对 funcc 的跨行重复访问
|
funf(i,j,k)= C1*tmp1(1)+C2*tmp1(2)+C3*tmp1(3)+C4*tmp1(4)+C5*tmp1(5)+C6*tmp1(6)
|
||||||
do ii = 1, extc(1)
|
else
|
||||||
! 1. 先做 Z 方向的 6 条线插值(针对当前的 ii 和当前的 6 个 iy)
|
funf(i,j,k)= C6*tmp1(1)+C5*tmp1(2)+C4*tmp1(3)+C3*tmp1(4)+C2*tmp1(5)+C1*tmp1(6)
|
||||||
! 我们直接在这里把 Y 方向的加权也做了,省去 tmp_yz 数组
|
endif
|
||||||
! 这样 funcc 的数据读进来后立即完成所有维度的贡献,不再写回内存
|
|
||||||
|
|
||||||
res_line = 0.0d0
|
|
||||||
do jj = 1, 6
|
|
||||||
iy = jc - 3 + jj
|
|
||||||
! 这一行代码是核心:一次性完成 Z 插值并加上 Y 的权重
|
|
||||||
! 编译器会把 WC(jj, py) 存在寄存器里
|
|
||||||
res_line = res_line + WC(jj, py) * ( &
|
|
||||||
WC(1, pz) * funcc(ii, iy, kc-2) + &
|
|
||||||
WC(2, pz) * funcc(ii, iy, kc-1) + &
|
|
||||||
WC(3, pz) * funcc(ii, iy, kc ) + &
|
|
||||||
WC(4, pz) * funcc(ii, iy, kc+1) + &
|
|
||||||
WC(5, pz) * funcc(ii, iy, kc+2) + &
|
|
||||||
WC(6, pz) * funcc(ii, iy, kc+3) )
|
|
||||||
end do
|
|
||||||
tmp_xyz_line(ii) = res_line
|
|
||||||
end do
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
! 3. 【降维:X 向】最后在最内层只处理 X 方向的 6 点加权
|
|
||||||
! 此时每个点的计算量从原来的 200+ 次乘法降到了仅 6 次
|
|
||||||
do i = imino, imaxo
|
|
||||||
px = pix(i)
|
|
||||||
ic = cix(i)
|
|
||||||
|
|
||||||
! 直接从预计算好的 line 中读取连续的 6 个点
|
|
||||||
! ic-2 到 ic+3 对应原始 6 点算子
|
|
||||||
funf(i,j,k) = WC(1,px)*tmp_xyz_line(ic-2) + &
|
|
||||||
WC(2,px)*tmp_xyz_line(ic-1) + &
|
|
||||||
WC(3,px)*tmp_xyz_line(ic ) + &
|
|
||||||
WC(4,px)*tmp_xyz_line(ic+1) + &
|
|
||||||
WC(5,px)*tmp_xyz_line(ic+2) + &
|
|
||||||
WC(6,px)*tmp_xyz_line(ic+3)
|
|
||||||
end do
|
|
||||||
end do
|
|
||||||
end do
|
|
||||||
#endif
|
#endif
|
||||||
|
enddo
|
||||||
|
enddo
|
||||||
|
enddo
|
||||||
|
|
||||||
return
|
return
|
||||||
|
|
||||||
end subroutine prolong3
|
end subroutine prolong3
|
||||||
@@ -2368,14 +2357,7 @@ end do
|
|||||||
integer::imino,imaxo,jmino,jmaxo,kmino,kmaxo
|
integer::imino,imaxo,jmino,jmaxo,kmino,kmaxo
|
||||||
|
|
||||||
real*8,dimension(3) :: CD,FD
|
real*8,dimension(3) :: CD,FD
|
||||||
|
|
||||||
real*8 :: tmp_xz_plane(-1:extf(1), 6)
|
|
||||||
real*8 :: tmp_x_line(-1:extf(1))
|
|
||||||
integer :: fi, fj, fk, ii, jj, kk
|
|
||||||
integer :: fi_min, fi_max, ii_lo, ii_hi
|
|
||||||
integer :: fj_min, fj_max, fk_min, fk_max, jj_lo, jj_hi, kk_lo, kk_hi
|
|
||||||
logical :: need_full_symmetry
|
|
||||||
|
|
||||||
if(wei.ne.3)then
|
if(wei.ne.3)then
|
||||||
write(*,*)"prolongrestrict.f90::restrict3: this routine only surport 3 dimension"
|
write(*,*)"prolongrestrict.f90::restrict3: this routine only surport 3 dimension"
|
||||||
write(*,*)"dim = ",wei
|
write(*,*)"dim = ",wei
|
||||||
@@ -2454,86 +2436,9 @@ end do
|
|||||||
stop
|
stop
|
||||||
endif
|
endif
|
||||||
|
|
||||||
! 仅计算 X 向最终写回所需的窗口:
|
call symmetry_bd(2,extf,funf,funff,SoA)
|
||||||
! func(i,j,k) 只访问 tmp_x_line(fi-2:fi+3)
|
|
||||||
fi_min = 2*(imino + lbc(1) - 1) - 1 - lbf(1) + 1
|
|
||||||
fi_max = 2*(imaxo + lbc(1) - 1) - 1 - lbf(1) + 1
|
|
||||||
fj_min = 2*(jmino + lbc(2) - 1) - 1 - lbf(2) + 1
|
|
||||||
fj_max = 2*(jmaxo + lbc(2) - 1) - 1 - lbf(2) + 1
|
|
||||||
fk_min = 2*(kmino + lbc(3) - 1) - 1 - lbf(3) + 1
|
|
||||||
fk_max = 2*(kmaxo + lbc(3) - 1) - 1 - lbf(3) + 1
|
|
||||||
ii_lo = fi_min - 2
|
|
||||||
ii_hi = fi_max + 3
|
|
||||||
jj_lo = fj_min - 2
|
|
||||||
jj_hi = fj_max + 3
|
|
||||||
kk_lo = fk_min - 2
|
|
||||||
kk_hi = fk_max + 3
|
|
||||||
if(ii_lo < -1 .or. ii_hi > extf(1) .or. &
|
|
||||||
jj_lo < -1 .or. jj_hi > extf(2) .or. &
|
|
||||||
kk_lo < -1 .or. kk_hi > extf(3))then
|
|
||||||
write(*,*)"restrict3: invalid stencil window"
|
|
||||||
write(*,*)"ii=",ii_lo,ii_hi," jj=",jj_lo,jj_hi," kk=",kk_lo,kk_hi
|
|
||||||
write(*,*)"extf=",extf
|
|
||||||
stop
|
|
||||||
endif
|
|
||||||
need_full_symmetry = (ii_lo < 1) .or. (jj_lo < 1) .or. (kk_lo < 1)
|
|
||||||
if(need_full_symmetry)then
|
|
||||||
call symmetry_bd(2,extf,funf,funff,SoA)
|
|
||||||
else
|
|
||||||
funff(ii_lo:ii_hi,jj_lo:jj_hi,kk_lo:kk_hi) = funf(ii_lo:ii_hi,jj_lo:jj_hi,kk_lo:kk_hi)
|
|
||||||
endif
|
|
||||||
|
|
||||||
!~~~~~~> restriction start...
|
!~~~~~~> restriction start...
|
||||||
do k = kmino, kmaxo
|
|
||||||
fk = 2*(k + lbc(3) - 1) - 1 - lbf(3) + 1
|
|
||||||
|
|
||||||
do j = jmino, jmaxo
|
|
||||||
fj = 2*(j + lbc(2) - 1) - 1 - lbf(2) + 1
|
|
||||||
|
|
||||||
! 优化点 1: 显式展开 Z 方向计算,减少循环开销
|
|
||||||
! 确保 ii 循环是最内层且连续访问
|
|
||||||
!DIR$ VECTOR ALWAYS
|
|
||||||
do ii = ii_lo, ii_hi
|
|
||||||
! 预计算当前 j 对应的 6 行在 Z 方向的压缩结果
|
|
||||||
! 这里直接硬编码 jj 的偏移,彻底消除一层循环
|
|
||||||
tmp_xz_plane(ii, 1) = C1*(funff(ii,fj-2,fk-2)+funff(ii,fj-2,fk+3)) + &
|
|
||||||
C2*(funff(ii,fj-2,fk-1)+funff(ii,fj-2,fk+2)) + &
|
|
||||||
C3*(funff(ii,fj-2,fk )+funff(ii,fj-2,fk+1))
|
|
||||||
tmp_xz_plane(ii, 2) = C1*(funff(ii,fj-1,fk-2)+funff(ii,fj-1,fk+3)) + &
|
|
||||||
C2*(funff(ii,fj-1,fk-1)+funff(ii,fj-1,fk+2)) + &
|
|
||||||
C3*(funff(ii,fj-1,fk )+funff(ii,fj-1,fk+1))
|
|
||||||
tmp_xz_plane(ii, 3) = C1*(funff(ii,fj ,fk-2)+funff(ii,fj ,fk+3)) + &
|
|
||||||
C2*(funff(ii,fj ,fk-1)+funff(ii,fj ,fk+2)) + &
|
|
||||||
C3*(funff(ii,fj ,fk )+funff(ii,fj ,fk+1))
|
|
||||||
tmp_xz_plane(ii, 4) = C1*(funff(ii,fj+1,fk-2)+funff(ii,fj+1,fk+3)) + &
|
|
||||||
C2*(funff(ii,fj+1,fk-1)+funff(ii,fj+1,fk+2)) + &
|
|
||||||
C3*(funff(ii,fj+1,fk )+funff(ii,fj+1,fk+1))
|
|
||||||
tmp_xz_plane(ii, 5) = C1*(funff(ii,fj+2,fk-2)+funff(ii,fj+2,fk+3)) + &
|
|
||||||
C2*(funff(ii,fj+2,fk-1)+funff(ii,fj+2,fk+2)) + &
|
|
||||||
C3*(funff(ii,fj+2,fk )+funff(ii,fj+2,fk+1))
|
|
||||||
tmp_xz_plane(ii, 6) = C1*(funff(ii,fj+3,fk-2)+funff(ii,fj+3,fk+3)) + &
|
|
||||||
C2*(funff(ii,fj+3,fk-1)+funff(ii,fj+3,fk+2)) + &
|
|
||||||
C3*(funff(ii,fj+3,fk )+funff(ii,fj+3,fk+1))
|
|
||||||
end do
|
|
||||||
|
|
||||||
! 优化点 2: 同样向量化 Y 方向压缩
|
|
||||||
!DIR$ VECTOR ALWAYS
|
|
||||||
do ii = ii_lo, ii_hi
|
|
||||||
tmp_x_line(ii) = C1*(tmp_xz_plane(ii, 1) + tmp_xz_plane(ii, 6)) + &
|
|
||||||
C2*(tmp_xz_plane(ii, 2) + tmp_xz_plane(ii, 5)) + &
|
|
||||||
C3*(tmp_xz_plane(ii, 3) + tmp_xz_plane(ii, 4))
|
|
||||||
end do
|
|
||||||
|
|
||||||
! 优化点 3: 最终写入,利用已经缓存在 tmp_x_line 的数据
|
|
||||||
do i = imino, imaxo
|
|
||||||
fi = 2*(i + lbc(1) - 1) - 1 - lbf(1) + 1
|
|
||||||
func(i, j, k) = C1*(tmp_x_line(fi-2) + tmp_x_line(fi+3)) + &
|
|
||||||
C2*(tmp_x_line(fi-1) + tmp_x_line(fi+2)) + &
|
|
||||||
C3*(tmp_x_line(fi ) + tmp_x_line(fi+1))
|
|
||||||
end do
|
|
||||||
end do
|
|
||||||
end do
|
|
||||||
#if 0
|
|
||||||
do k = kmino,kmaxo
|
do k = kmino,kmaxo
|
||||||
do j = jmino,jmaxo
|
do j = jmino,jmaxo
|
||||||
do i = imino,imaxo
|
do i = imino,imaxo
|
||||||
@@ -2557,7 +2462,7 @@ end do
|
|||||||
enddo
|
enddo
|
||||||
enddo
|
enddo
|
||||||
enddo
|
enddo
|
||||||
#endif
|
|
||||||
return
|
return
|
||||||
|
|
||||||
end subroutine restrict3
|
end subroutine restrict3
|
||||||
|
|||||||
@@ -217,6 +217,7 @@
|
|||||||
real*8,dimension(2*ghost_width) :: X,Y,Z
|
real*8,dimension(2*ghost_width) :: X,Y,Z
|
||||||
real*8, dimension(2*ghost_width,2*ghost_width) :: tmp2
|
real*8, dimension(2*ghost_width,2*ghost_width) :: tmp2
|
||||||
real*8, dimension(2*ghost_width) :: tmp1
|
real*8, dimension(2*ghost_width) :: tmp1
|
||||||
|
real*8 :: ddy
|
||||||
|
|
||||||
#if (ghost_width == 2)
|
#if (ghost_width == 2)
|
||||||
real*8, parameter :: C1=-1.d0/16,C2=9.d0/16
|
real*8, parameter :: C1=-1.d0/16,C2=9.d0/16
|
||||||
@@ -469,7 +470,7 @@
|
|||||||
|
|
||||||
tmp1(cxB(1)+ghost_width-i+1:cxT(1)+ghost_width-i+1) = fh(cxB(1):cxT(1),j,k)
|
tmp1(cxB(1)+ghost_width-i+1:cxT(1)+ghost_width-i+1) = fh(cxB(1):cxT(1),j,k)
|
||||||
|
|
||||||
call polint0(X,tmp1,funf(i,j,k),2*ghost_width)
|
call polint(X,tmp1,0.d0,funf(i,j,k),ddy,2*ghost_width)
|
||||||
|
|
||||||
! for y direction
|
! for y direction
|
||||||
elseif (fg(2) .eq. 0)then
|
elseif (fg(2) .eq. 0)then
|
||||||
@@ -528,7 +529,7 @@
|
|||||||
|
|
||||||
tmp1(cxB(2)+ghost_width-j+1:cxT(2)+ghost_width-j+1) = fh(i,cxB(2):cxT(2),k)
|
tmp1(cxB(2)+ghost_width-j+1:cxT(2)+ghost_width-j+1) = fh(i,cxB(2):cxT(2),k)
|
||||||
|
|
||||||
call polint0(Y,tmp1,funf(i,j,k),2*ghost_width)
|
call polint(Y,tmp1,0.d0,funf(i,j,k),ddy,2*ghost_width)
|
||||||
|
|
||||||
! for z direction
|
! for z direction
|
||||||
else
|
else
|
||||||
@@ -587,7 +588,7 @@
|
|||||||
|
|
||||||
tmp1(cxB(3)+ghost_width-k+1:cxT(3)+ghost_width-k+1) = fh(i,j,cxB(3):cxT(3))
|
tmp1(cxB(3)+ghost_width-k+1:cxT(3)+ghost_width-k+1) = fh(i,j,cxB(3):cxT(3))
|
||||||
|
|
||||||
call polint0(Z,tmp1,funf(i,j,k),2*ghost_width)
|
call polint(Z,tmp1,0.d0,funf(i,j,k),ddy,2*ghost_width)
|
||||||
|
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
|||||||
@@ -1,155 +0,0 @@
|
|||||||
#include "rungekutta4_rout.h"
|
|
||||||
#include <cstdio>
|
|
||||||
#include <cstdlib>
|
|
||||||
#include <cstddef>
|
|
||||||
#include <immintrin.h>
|
|
||||||
|
|
||||||
namespace {
|
|
||||||
|
|
||||||
inline void rk4_stage0(std::size_t n,
|
|
||||||
const double *__restrict f0,
|
|
||||||
const double *__restrict frhs,
|
|
||||||
double *__restrict f1,
|
|
||||||
double c) {
|
|
||||||
std::size_t i = 0;
|
|
||||||
#if defined(__AVX512F__)
|
|
||||||
const __m512d vc = _mm512_set1_pd(c);
|
|
||||||
for (; i + 7 < n; i += 8) {
|
|
||||||
const __m512d v0 = _mm512_loadu_pd(f0 + i);
|
|
||||||
const __m512d vr = _mm512_loadu_pd(frhs + i);
|
|
||||||
_mm512_storeu_pd(f1 + i, _mm512_fmadd_pd(vc, vr, v0));
|
|
||||||
}
|
|
||||||
#elif defined(__AVX2__)
|
|
||||||
const __m256d vc = _mm256_set1_pd(c);
|
|
||||||
for (; i + 3 < n; i += 4) {
|
|
||||||
const __m256d v0 = _mm256_loadu_pd(f0 + i);
|
|
||||||
const __m256d vr = _mm256_loadu_pd(frhs + i);
|
|
||||||
_mm256_storeu_pd(f1 + i, _mm256_fmadd_pd(vc, vr, v0));
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
#pragma ivdep
|
|
||||||
for (; i < n; ++i) {
|
|
||||||
f1[i] = f0[i] + c * frhs[i];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
inline void rk4_rhs_accum(std::size_t n,
|
|
||||||
const double *__restrict f1,
|
|
||||||
double *__restrict frhs) {
|
|
||||||
std::size_t i = 0;
|
|
||||||
#if defined(__AVX512F__)
|
|
||||||
const __m512d v2 = _mm512_set1_pd(2.0);
|
|
||||||
for (; i + 7 < n; i += 8) {
|
|
||||||
const __m512d v1 = _mm512_loadu_pd(f1 + i);
|
|
||||||
const __m512d vrhs = _mm512_loadu_pd(frhs + i);
|
|
||||||
_mm512_storeu_pd(frhs + i, _mm512_fmadd_pd(v2, v1, vrhs));
|
|
||||||
}
|
|
||||||
#elif defined(__AVX2__)
|
|
||||||
const __m256d v2 = _mm256_set1_pd(2.0);
|
|
||||||
for (; i + 3 < n; i += 4) {
|
|
||||||
const __m256d v1 = _mm256_loadu_pd(f1 + i);
|
|
||||||
const __m256d vrhs = _mm256_loadu_pd(frhs + i);
|
|
||||||
_mm256_storeu_pd(frhs + i, _mm256_fmadd_pd(v2, v1, vrhs));
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
#pragma ivdep
|
|
||||||
for (; i < n; ++i) {
|
|
||||||
frhs[i] = frhs[i] + 2.0 * f1[i];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
inline void rk4_f1_from_f0_f1(std::size_t n,
|
|
||||||
const double *__restrict f0,
|
|
||||||
double *__restrict f1,
|
|
||||||
double c) {
|
|
||||||
std::size_t i = 0;
|
|
||||||
#if defined(__AVX512F__)
|
|
||||||
const __m512d vc = _mm512_set1_pd(c);
|
|
||||||
for (; i + 7 < n; i += 8) {
|
|
||||||
const __m512d v0 = _mm512_loadu_pd(f0 + i);
|
|
||||||
const __m512d v1 = _mm512_loadu_pd(f1 + i);
|
|
||||||
_mm512_storeu_pd(f1 + i, _mm512_fmadd_pd(vc, v1, v0));
|
|
||||||
}
|
|
||||||
#elif defined(__AVX2__)
|
|
||||||
const __m256d vc = _mm256_set1_pd(c);
|
|
||||||
for (; i + 3 < n; i += 4) {
|
|
||||||
const __m256d v0 = _mm256_loadu_pd(f0 + i);
|
|
||||||
const __m256d v1 = _mm256_loadu_pd(f1 + i);
|
|
||||||
_mm256_storeu_pd(f1 + i, _mm256_fmadd_pd(vc, v1, v0));
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
#pragma ivdep
|
|
||||||
for (; i < n; ++i) {
|
|
||||||
f1[i] = f0[i] + c * f1[i];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
inline void rk4_stage3(std::size_t n,
|
|
||||||
const double *__restrict f0,
|
|
||||||
double *__restrict f1,
|
|
||||||
const double *__restrict frhs,
|
|
||||||
double c) {
|
|
||||||
std::size_t i = 0;
|
|
||||||
#if defined(__AVX512F__)
|
|
||||||
const __m512d vc = _mm512_set1_pd(c);
|
|
||||||
for (; i + 7 < n; i += 8) {
|
|
||||||
const __m512d v0 = _mm512_loadu_pd(f0 + i);
|
|
||||||
const __m512d v1 = _mm512_loadu_pd(f1 + i);
|
|
||||||
const __m512d vr = _mm512_loadu_pd(frhs + i);
|
|
||||||
_mm512_storeu_pd(f1 + i, _mm512_fmadd_pd(vc, _mm512_add_pd(v1, vr), v0));
|
|
||||||
}
|
|
||||||
#elif defined(__AVX2__)
|
|
||||||
const __m256d vc = _mm256_set1_pd(c);
|
|
||||||
for (; i + 3 < n; i += 4) {
|
|
||||||
const __m256d v0 = _mm256_loadu_pd(f0 + i);
|
|
||||||
const __m256d v1 = _mm256_loadu_pd(f1 + i);
|
|
||||||
const __m256d vr = _mm256_loadu_pd(frhs + i);
|
|
||||||
_mm256_storeu_pd(f1 + i, _mm256_fmadd_pd(vc, _mm256_add_pd(v1, vr), v0));
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
#pragma ivdep
|
|
||||||
for (; i < n; ++i) {
|
|
||||||
f1[i] = f0[i] + c * (f1[i] + frhs[i]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
} // namespace
|
|
||||||
|
|
||||||
extern "C" {
|
|
||||||
|
|
||||||
int f_rungekutta4_rout(int *ex, double &dT,
|
|
||||||
double *f0, double *f1, double *f_rhs,
|
|
||||||
int &RK4) {
|
|
||||||
const std::size_t n = static_cast<std::size_t>(ex[0]) *
|
|
||||||
static_cast<std::size_t>(ex[1]) *
|
|
||||||
static_cast<std::size_t>(ex[2]);
|
|
||||||
const double *const __restrict f0r = f0;
|
|
||||||
double *const __restrict f1r = f1;
|
|
||||||
double *const __restrict frhs = f_rhs;
|
|
||||||
|
|
||||||
if (__builtin_expect(static_cast<unsigned>(RK4) > 3u, 0)) {
|
|
||||||
std::fprintf(stderr, "rungekutta4_rout_c: invalid RK4 stage %d\n", RK4);
|
|
||||||
std::abort();
|
|
||||||
}
|
|
||||||
|
|
||||||
switch (RK4) {
|
|
||||||
case 0:
|
|
||||||
rk4_stage0(n, f0r, frhs, f1r, 0.5 * dT);
|
|
||||||
break;
|
|
||||||
case 1:
|
|
||||||
rk4_rhs_accum(n, f1r, frhs);
|
|
||||||
rk4_f1_from_f0_f1(n, f0r, f1r, 0.5 * dT);
|
|
||||||
break;
|
|
||||||
case 2:
|
|
||||||
rk4_rhs_accum(n, f1r, frhs);
|
|
||||||
rk4_f1_from_f0_f1(n, f0r, f1r, dT);
|
|
||||||
break;
|
|
||||||
default:
|
|
||||||
rk4_stage3(n, f0r, f1r, frhs, (1.0 / 6.0) * dT);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
} // extern "C"
|
|
||||||
@@ -1,246 +0,0 @@
|
|||||||
#ifndef SHARE_FUNC_H
|
|
||||||
#define SHARE_FUNC_H
|
|
||||||
|
|
||||||
#include <stdlib.h>
|
|
||||||
#include <stddef.h>
|
|
||||||
#include <math.h>
|
|
||||||
#include <stdio.h>
|
|
||||||
#include <string.h>
|
|
||||||
/* 主网格:0-based -> 1D */
|
|
||||||
static inline size_t idx_ex(int i0, int j0, int k0, const int ex[3]) {
|
|
||||||
const int ex1 = ex[0], ex2 = ex[1];
|
|
||||||
return (size_t)i0 + (size_t)j0 * (size_t)ex1 + (size_t)k0 * (size_t)ex1 * (size_t)ex2;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* fh 对应 Fortran: fh(-1:ex1, -1:ex2, -1:ex3)
|
|
||||||
* ord=2 => shift=1
|
|
||||||
* iF/jF/kF 为 Fortran 索引(可为 -1,0,1..ex)
|
|
||||||
*/
|
|
||||||
static inline size_t idx_fh_F_ord2(int iF, int jF, int kF, const int ex[3]) {
|
|
||||||
const int shift = 1;
|
|
||||||
const int nx = ex[0] + 2; // ex1 + ord
|
|
||||||
const int ny = ex[1] + 2;
|
|
||||||
|
|
||||||
const int ii = iF + shift; // 0..ex1+1
|
|
||||||
const int jj = jF + shift; // 0..ex2+1
|
|
||||||
const int kk = kF + shift; // 0..ex3+1
|
|
||||||
|
|
||||||
return (size_t)ii + (size_t)jj * (size_t)nx + (size_t)kk * (size_t)nx * (size_t)ny;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* fh 对应 Fortran: fh(-2:ex1, -2:ex2, -2:ex3)
|
|
||||||
* ord=3 => shift=2
|
|
||||||
* iF/jF/kF 是 Fortran 索引(可为负)
|
|
||||||
*/
|
|
||||||
static inline size_t idx_fh_F(int iF, int jF, int kF, const int ex[3]) {
|
|
||||||
const int shift = 2; // ord=3 -> -2..ex
|
|
||||||
const int nx = ex[0] + 3; // ex1 + ord
|
|
||||||
const int ny = ex[1] + 3;
|
|
||||||
|
|
||||||
const int ii = iF + shift; // 0..ex1+2
|
|
||||||
const int jj = jF + shift; // 0..ex2+2
|
|
||||||
const int kk = kF + shift; // 0..ex3+2
|
|
||||||
|
|
||||||
return (size_t)ii + (size_t)jj * (size_t)nx + (size_t)kk * (size_t)nx * (size_t)ny;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* func: (1..extc1, 1..extc2, 1..extc3) 1-based in Fortran
|
|
||||||
* funcc: (-ord+1..extc1, -ord+1..extc2, -ord+1..extc3) in Fortran
|
|
||||||
*
|
|
||||||
* C 里我们把:
|
|
||||||
* func 视为 0-based: i0=0..extc1-1, j0=0..extc2-1, k0=0..extc3-1
|
|
||||||
* funcc 用“平移下标”存为一维数组:
|
|
||||||
* iF in [-ord+1..extc1] -> ii = iF + (ord-1) in [0..extc1+ord-1]
|
|
||||||
* 总长度 nx = extc1 + ord
|
|
||||||
* 同理 ny = extc2 + ord, nz = extc3 + ord
|
|
||||||
*/
|
|
||||||
|
|
||||||
static inline size_t idx_func0(int i0, int j0, int k0, const int extc[3]) {
|
|
||||||
const int nx = extc[0], ny = extc[1];
|
|
||||||
return (size_t)i0 + (size_t)j0 * (size_t)nx + (size_t)k0 * (size_t)nx * (size_t)ny;
|
|
||||||
}
|
|
||||||
|
|
||||||
static inline size_t idx_funcc_F(int iF, int jF, int kF, int ord, const int extc[3]) {
|
|
||||||
const int shift = ord - 1; // iF = -shift .. extc1
|
|
||||||
const int nx = extc[0] + ord; // [-shift..extc1] 共 extc1+ord 个
|
|
||||||
const int ny = extc[1] + ord;
|
|
||||||
|
|
||||||
const int ii = iF + shift; // 0..extc1+shift
|
|
||||||
const int jj = jF + shift; // 0..extc2+shift
|
|
||||||
const int kk = kF + shift; // 0..extc3+shift
|
|
||||||
|
|
||||||
return (size_t)ii + (size_t)jj * (size_t)nx + (size_t)kk * (size_t)nx * (size_t)ny;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* 等价于 Fortran:
|
|
||||||
* funcc(1:extc1,1:extc2,1:extc3)=func
|
|
||||||
* do i=0,ord-1
|
|
||||||
* funcc(-i,1:extc2,1:extc3) = funcc(i+1,1:extc2,1:extc3)*SoA(1)
|
|
||||||
* enddo
|
|
||||||
* do i=0,ord-1
|
|
||||||
* funcc(:,-i,1:extc3) = funcc(:,i+1,1:extc3)*SoA(2)
|
|
||||||
* enddo
|
|
||||||
* do i=0,ord-1
|
|
||||||
* funcc(:,:,-i) = funcc(:,:,i+1)*SoA(3)
|
|
||||||
* enddo
|
|
||||||
*/
|
|
||||||
static inline void symmetry_bd_impl(int ord,
|
|
||||||
int shift,
|
|
||||||
const int extc[3],
|
|
||||||
const double *__restrict func,
|
|
||||||
double *__restrict funcc,
|
|
||||||
const double SoA[3])
|
|
||||||
{
|
|
||||||
const int extc1 = extc[0], extc2 = extc[1], extc3 = extc[2];
|
|
||||||
const int nx = extc1 + ord;
|
|
||||||
const int ny = extc2 + ord;
|
|
||||||
|
|
||||||
const size_t snx = (size_t)nx;
|
|
||||||
const size_t splane = (size_t)nx * (size_t)ny;
|
|
||||||
const size_t interior_i = (size_t)shift + 1u; /* iF = 1 */
|
|
||||||
const size_t interior_j = ((size_t)shift + 1u) * snx; /* jF = 1 */
|
|
||||||
const size_t interior_k = ((size_t)shift + 1u) * splane; /* kF = 1 */
|
|
||||||
const size_t interior0 = interior_k + interior_j + interior_i;
|
|
||||||
|
|
||||||
/* 1) funcc(1:extc1,1:extc2,1:extc3) = func */
|
|
||||||
for (int k0 = 0; k0 < extc3; ++k0) {
|
|
||||||
const double *src_k = func + (size_t)k0 * (size_t)extc2 * (size_t)extc1;
|
|
||||||
const size_t dst_k0 = interior0 + (size_t)k0 * splane;
|
|
||||||
for (int j0 = 0; j0 < extc2; ++j0) {
|
|
||||||
const double *src = src_k + (size_t)j0 * (size_t)extc1;
|
|
||||||
double *dst = funcc + dst_k0 + (size_t)j0 * snx;
|
|
||||||
memcpy(dst, src, (size_t)extc1 * sizeof(double));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/* 2) funcc(-i,1:extc2,1:extc3) = funcc(i+1,1:extc2,1:extc3)*SoA(1) */
|
|
||||||
const double s1 = SoA[0];
|
|
||||||
if (s1 == 1.0) {
|
|
||||||
for (int ii = 0; ii < ord; ++ii) {
|
|
||||||
const size_t dst_i = (size_t)(shift - ii);
|
|
||||||
const size_t src_i = (size_t)(shift + ii + 1);
|
|
||||||
for (int k0 = 0; k0 < extc3; ++k0) {
|
|
||||||
const size_t kbase = interior_k + (size_t)k0 * splane + interior_j;
|
|
||||||
for (int j0 = 0; j0 < extc2; ++j0) {
|
|
||||||
const size_t off = kbase + (size_t)j0 * snx;
|
|
||||||
funcc[off + dst_i] = funcc[off + src_i];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else if (s1 == -1.0) {
|
|
||||||
for (int ii = 0; ii < ord; ++ii) {
|
|
||||||
const size_t dst_i = (size_t)(shift - ii);
|
|
||||||
const size_t src_i = (size_t)(shift + ii + 1);
|
|
||||||
for (int k0 = 0; k0 < extc3; ++k0) {
|
|
||||||
const size_t kbase = interior_k + (size_t)k0 * splane + interior_j;
|
|
||||||
for (int j0 = 0; j0 < extc2; ++j0) {
|
|
||||||
const size_t off = kbase + (size_t)j0 * snx;
|
|
||||||
funcc[off + dst_i] = -funcc[off + src_i];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
for (int ii = 0; ii < ord; ++ii) {
|
|
||||||
const size_t dst_i = (size_t)(shift - ii);
|
|
||||||
const size_t src_i = (size_t)(shift + ii + 1);
|
|
||||||
for (int k0 = 0; k0 < extc3; ++k0) {
|
|
||||||
const size_t kbase = interior_k + (size_t)k0 * splane + interior_j;
|
|
||||||
for (int j0 = 0; j0 < extc2; ++j0) {
|
|
||||||
const size_t off = kbase + (size_t)j0 * snx;
|
|
||||||
funcc[off + dst_i] = funcc[off + src_i] * s1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/* 3) funcc(:,-j,1:extc3) = funcc(:,j+1,1:extc3)*SoA(2) */
|
|
||||||
const double s2 = SoA[1];
|
|
||||||
if (s2 == 1.0) {
|
|
||||||
for (int jj = 0; jj < ord; ++jj) {
|
|
||||||
const size_t dst_j = (size_t)(shift - jj) * snx;
|
|
||||||
const size_t src_j = (size_t)(shift + jj + 1) * snx;
|
|
||||||
for (int k0 = 0; k0 < extc3; ++k0) {
|
|
||||||
const size_t kbase = interior_k + (size_t)k0 * splane;
|
|
||||||
double *dst = funcc + kbase + dst_j;
|
|
||||||
const double *src = funcc + kbase + src_j;
|
|
||||||
for (int i = 0; i < nx; ++i) dst[i] = src[i];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else if (s2 == -1.0) {
|
|
||||||
for (int jj = 0; jj < ord; ++jj) {
|
|
||||||
const size_t dst_j = (size_t)(shift - jj) * snx;
|
|
||||||
const size_t src_j = (size_t)(shift + jj + 1) * snx;
|
|
||||||
for (int k0 = 0; k0 < extc3; ++k0) {
|
|
||||||
const size_t kbase = interior_k + (size_t)k0 * splane;
|
|
||||||
double *dst = funcc + kbase + dst_j;
|
|
||||||
const double *src = funcc + kbase + src_j;
|
|
||||||
for (int i = 0; i < nx; ++i) dst[i] = -src[i];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
for (int jj = 0; jj < ord; ++jj) {
|
|
||||||
const size_t dst_j = (size_t)(shift - jj) * snx;
|
|
||||||
const size_t src_j = (size_t)(shift + jj + 1) * snx;
|
|
||||||
for (int k0 = 0; k0 < extc3; ++k0) {
|
|
||||||
const size_t kbase = interior_k + (size_t)k0 * splane;
|
|
||||||
double *dst = funcc + kbase + dst_j;
|
|
||||||
const double *src = funcc + kbase + src_j;
|
|
||||||
for (int i = 0; i < nx; ++i) dst[i] = src[i] * s2;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/* 4) funcc(:,:,-k) = funcc(:,:,k+1)*SoA(3) */
|
|
||||||
const double s3 = SoA[2];
|
|
||||||
if (s3 == 1.0) {
|
|
||||||
for (int kk = 0; kk < ord; ++kk) {
|
|
||||||
const size_t dst_k = (size_t)(shift - kk) * splane;
|
|
||||||
const size_t src_k = (size_t)(shift + kk + 1) * splane;
|
|
||||||
double *dst = funcc + dst_k;
|
|
||||||
const double *src = funcc + src_k;
|
|
||||||
for (size_t p = 0; p < splane; ++p) dst[p] = src[p];
|
|
||||||
}
|
|
||||||
} else if (s3 == -1.0) {
|
|
||||||
for (int kk = 0; kk < ord; ++kk) {
|
|
||||||
const size_t dst_k = (size_t)(shift - kk) * splane;
|
|
||||||
const size_t src_k = (size_t)(shift + kk + 1) * splane;
|
|
||||||
double *dst = funcc + dst_k;
|
|
||||||
const double *src = funcc + src_k;
|
|
||||||
for (size_t p = 0; p < splane; ++p) dst[p] = -src[p];
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
for (int kk = 0; kk < ord; ++kk) {
|
|
||||||
const size_t dst_k = (size_t)(shift - kk) * splane;
|
|
||||||
const size_t src_k = (size_t)(shift + kk + 1) * splane;
|
|
||||||
double *dst = funcc + dst_k;
|
|
||||||
const double *src = funcc + src_k;
|
|
||||||
for (size_t p = 0; p < splane; ++p) dst[p] = src[p] * s3;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
static inline void symmetry_bd(int ord,
|
|
||||||
const int extc[3],
|
|
||||||
const double *func,
|
|
||||||
double *funcc,
|
|
||||||
const double SoA[3])
|
|
||||||
{
|
|
||||||
if (ord <= 0) return;
|
|
||||||
|
|
||||||
/* Fast paths used by current C kernels: ord=2 (derivs), ord=3 (lopsided/KO). */
|
|
||||||
if (ord == 2) {
|
|
||||||
symmetry_bd_impl(2, 1, extc, func, funcc, SoA);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
if (ord == 3) {
|
|
||||||
symmetry_bd_impl(3, 2, extc, func, funcc, SoA);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
symmetry_bd_impl(ord, ord - 1, extc, func, funcc, SoA);
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
@@ -1,27 +0,0 @@
|
|||||||
#include "share_func.h"
|
|
||||||
void fdderivs(const int ex[3],
|
|
||||||
const double *f,
|
|
||||||
double *fxx, double *fxy, double *fxz,
|
|
||||||
double *fyy, double *fyz, double *fzz,
|
|
||||||
const double *X, const double *Y, const double *Z,
|
|
||||||
double SYM1, double SYM2, double SYM3,
|
|
||||||
int Symmetry, int onoff);
|
|
||||||
|
|
||||||
void fderivs(const int ex[3],
|
|
||||||
const double *f,
|
|
||||||
double *fx, double *fy, double *fz,
|
|
||||||
const double *X, const double *Y, const double *Z,
|
|
||||||
double SYM1, double SYM2, double SYM3,
|
|
||||||
int Symmetry, int onoff);
|
|
||||||
|
|
||||||
void kodis(const int ex[3],
|
|
||||||
const double *X, const double *Y, const double *Z,
|
|
||||||
const double *f, double *f_rhs,
|
|
||||||
const double SoA[3],
|
|
||||||
int Symmetry, double eps);
|
|
||||||
|
|
||||||
void lopsided(const int ex[3],
|
|
||||||
const double *X, const double *Y, const double *Z,
|
|
||||||
const double *f, double *f_rhs,
|
|
||||||
const double *Sfx, const double *Sfy, const double *Sfz,
|
|
||||||
int Symmetry, const double SoA[3]);
|
|
||||||
@@ -1,72 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
"""Convert interp_lb_profile.bin to a C header for compile-time embedding."""
|
|
||||||
import struct, sys
|
|
||||||
|
|
||||||
if len(sys.argv) < 3:
|
|
||||||
print(f"Usage: {sys.argv[0]} <profile.bin> <output.h>")
|
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
with open(sys.argv[1], 'rb') as f:
|
|
||||||
magic, version, nprocs, num_heavy = struct.unpack('IIii', f.read(16))
|
|
||||||
threshold = struct.unpack('d', f.read(8))[0]
|
|
||||||
times = list(struct.unpack(f'{nprocs}d', f.read(nprocs * 8)))
|
|
||||||
heavy = list(struct.unpack(f'{num_heavy}i', f.read(num_heavy * 4)))
|
|
||||||
|
|
||||||
# For each heavy rank, compute split: left half -> lighter neighbor, right half -> heavy rank
|
|
||||||
# (or vice versa depending on which neighbor is lighter)
|
|
||||||
splits = []
|
|
||||||
for hr in heavy:
|
|
||||||
prev_t = times[hr - 1] if hr > 0 else 1e30
|
|
||||||
next_t = times[hr + 1] if hr < nprocs - 1 else 1e30
|
|
||||||
if prev_t <= next_t:
|
|
||||||
splits.append((hr, hr - 1, hr)) # (block_id, r_left, r_right)
|
|
||||||
else:
|
|
||||||
splits.append((hr, hr, hr + 1))
|
|
||||||
|
|
||||||
# Also remap the displaced neighbor blocks
|
|
||||||
remaps = {}
|
|
||||||
for hr, r_l, r_r in splits:
|
|
||||||
if r_l != hr:
|
|
||||||
# We took r_l's slot, so remap block r_l to its other neighbor
|
|
||||||
displaced = r_l
|
|
||||||
if displaced > 0 and displaced - 1 not in [s[0] for s in splits]:
|
|
||||||
remaps[displaced] = displaced - 1
|
|
||||||
elif displaced < nprocs - 1:
|
|
||||||
remaps[displaced] = displaced + 1
|
|
||||||
else:
|
|
||||||
displaced = r_r
|
|
||||||
if displaced < nprocs - 1 and displaced + 1 not in [s[0] for s in splits]:
|
|
||||||
remaps[displaced] = displaced + 1
|
|
||||||
elif displaced > 0:
|
|
||||||
remaps[displaced] = displaced - 1
|
|
||||||
|
|
||||||
with open(sys.argv[2], 'w') as out:
|
|
||||||
out.write("/* Auto-generated from interp_lb_profile.bin — do not edit */\n")
|
|
||||||
out.write("#ifndef INTERP_LB_PROFILE_DATA_H\n")
|
|
||||||
out.write("#define INTERP_LB_PROFILE_DATA_H\n\n")
|
|
||||||
out.write(f"#define INTERP_LB_NPROCS {nprocs}\n")
|
|
||||||
out.write(f"#define INTERP_LB_NUM_HEAVY {num_heavy}\n\n")
|
|
||||||
out.write(f"static const int interp_lb_heavy_blocks[{num_heavy}] = {{")
|
|
||||||
out.write(", ".join(str(h) for h in heavy))
|
|
||||||
out.write("};\n\n")
|
|
||||||
out.write("/* Split table: {block_id, r_left, r_right} */\n")
|
|
||||||
out.write(f"static const int interp_lb_splits[{num_heavy}][3] = {{\n")
|
|
||||||
for bid, rl, rr in splits:
|
|
||||||
out.write(f" {{{bid}, {rl}, {rr}}},\n")
|
|
||||||
out.write("};\n\n")
|
|
||||||
out.write("/* Rank remap for displaced neighbor blocks */\n")
|
|
||||||
out.write(f"static const int interp_lb_num_remaps = {len(remaps)};\n")
|
|
||||||
out.write(f"static const int interp_lb_remaps[][2] = {{\n")
|
|
||||||
for src, dst in sorted(remaps.items()):
|
|
||||||
out.write(f" {{{src}, {dst}}},\n")
|
|
||||||
if not remaps:
|
|
||||||
out.write(" {-1, -1},\n")
|
|
||||||
out.write("};\n\n")
|
|
||||||
out.write("#endif /* INTERP_LB_PROFILE_DATA_H */\n")
|
|
||||||
|
|
||||||
print(f"Generated {sys.argv[2]}:")
|
|
||||||
print(f" {num_heavy} heavy blocks to split: {heavy}")
|
|
||||||
for bid, rl, rr in splits:
|
|
||||||
print(f" block {bid}: split -> rank {rl} (left), rank {rr} (right)")
|
|
||||||
for src, dst in sorted(remaps.items()):
|
|
||||||
print(f" block {src}: remap -> rank {dst}")
|
|
||||||
@@ -11,47 +11,17 @@
|
|||||||
import AMSS_NCKU_Input as input_data
|
import AMSS_NCKU_Input as input_data
|
||||||
import subprocess
|
import subprocess
|
||||||
import time
|
import time
|
||||||
|
## CPU core binding configuration using taskset
|
||||||
|
## taskset ensures all child processes inherit the CPU affinity mask
|
||||||
|
## This forces make and all compiler processes to use only nohz_full cores (4-55, 60-111)
|
||||||
|
## Format: taskset -c 4-55,60-111 ensures processes only run on these cores
|
||||||
|
#NUMACTL_CPU_BIND = "taskset -c 0-111"
|
||||||
|
NUMACTL_CPU_BIND = "taskset -c 16-47,64-95"
|
||||||
|
|
||||||
|
## Build parallelism configuration
|
||||||
def get_last_n_cores_per_socket(n=32):
|
## Use nohz_full cores (4-55, 60-111) for compilation: 52 + 52 = 104 cores
|
||||||
"""
|
## Set make -j to utilize available cores for faster builds
|
||||||
Read CPU topology via lscpu and return a taskset -c string
|
BUILD_JOBS = 96
|
||||||
selecting the last `n` cores of each NUMA node (socket).
|
|
||||||
|
|
||||||
Example: 2 sockets x 56 cores each, n=32 -> node0: 24-55, node1: 80-111
|
|
||||||
-> "taskset -c 24-55,80-111"
|
|
||||||
"""
|
|
||||||
result = subprocess.run(["lscpu", "--parse=NODE,CPU"], capture_output=True, text=True)
|
|
||||||
|
|
||||||
# Build a dict: node_id -> sorted list of CPU ids
|
|
||||||
node_cpus = {}
|
|
||||||
for line in result.stdout.splitlines():
|
|
||||||
if line.startswith("#") or not line.strip():
|
|
||||||
continue
|
|
||||||
parts = line.split(",")
|
|
||||||
if len(parts) < 2:
|
|
||||||
continue
|
|
||||||
node_id, cpu_id = int(parts[0]), int(parts[1])
|
|
||||||
node_cpus.setdefault(node_id, []).append(cpu_id)
|
|
||||||
|
|
||||||
segments = []
|
|
||||||
for node_id in sorted(node_cpus):
|
|
||||||
cpus = sorted(node_cpus[node_id])
|
|
||||||
selected = cpus[-n:] # last n cores of this socket
|
|
||||||
segments.append(f"{selected[0]}-{selected[-1]}")
|
|
||||||
|
|
||||||
cpu_str = ",".join(segments)
|
|
||||||
total = len(segments) * n
|
|
||||||
print(f" CPU binding: taskset -c {cpu_str} ({total} cores, last {n} per socket)")
|
|
||||||
#return f"taskset -c {cpu_str}"
|
|
||||||
return f""
|
|
||||||
|
|
||||||
|
|
||||||
## CPU core binding: dynamically select the last 32 cores of each socket (64 cores total)
|
|
||||||
NUMACTL_CPU_BIND = get_last_n_cores_per_socket(n=32)
|
|
||||||
|
|
||||||
## Build parallelism: match the number of bound cores
|
|
||||||
BUILD_JOBS = 64
|
|
||||||
|
|
||||||
|
|
||||||
##################################################################
|
##################################################################
|
||||||
@@ -70,7 +40,7 @@ def makefile_ABE():
|
|||||||
|
|
||||||
## Build command with CPU binding to nohz_full cores
|
## Build command with CPU binding to nohz_full cores
|
||||||
if (input_data.GPU_Calculation == "no"):
|
if (input_data.GPU_Calculation == "no"):
|
||||||
makefile_command = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} INTERP_LB_MODE=optimize ABE"
|
makefile_command = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} ABE"
|
||||||
elif (input_data.GPU_Calculation == "yes"):
|
elif (input_data.GPU_Calculation == "yes"):
|
||||||
makefile_command = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} ABEGPU"
|
makefile_command = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} ABEGPU"
|
||||||
else:
|
else:
|
||||||
|
|||||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Reference in New Issue
Block a user