Compare commits
4 Commits
cjy-oneapi
...
yx_new_spl
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
588fb675a0 | ||
| aabe74c098 | |||
|
|
f147f79ffa | ||
|
|
8abac8dd88 |
@@ -66,7 +66,8 @@ if os.path.exists(File_directory):
|
|||||||
## Prompt whether to overwrite the existing directory
|
## Prompt whether to overwrite the existing directory
|
||||||
while True:
|
while True:
|
||||||
try:
|
try:
|
||||||
inputvalue = input()
|
## inputvalue = input()
|
||||||
|
inputvalue = "continue"
|
||||||
## If the user agrees to overwrite, proceed and remove the existing directory
|
## If the user agrees to overwrite, proceed and remove the existing directory
|
||||||
if ( inputvalue == "continue" ):
|
if ( inputvalue == "continue" ):
|
||||||
print( " Continue the calculation !!! " )
|
print( " Continue the calculation !!! " )
|
||||||
@@ -270,12 +271,6 @@ if not os.path.exists( ABE_file ):
|
|||||||
## Copy the executable ABE (or ABEGPU) into the run directory
|
## Copy the executable ABE (or ABEGPU) into the run directory
|
||||||
shutil.copy2(ABE_file, output_directory)
|
shutil.copy2(ABE_file, output_directory)
|
||||||
|
|
||||||
## Copy interp load balance profile if present (for optimize pass)
|
|
||||||
interp_lb_profile = os.path.join(AMSS_NCKU_source_copy, "interp_lb_profile.bin")
|
|
||||||
if os.path.exists(interp_lb_profile):
|
|
||||||
shutil.copy2(interp_lb_profile, output_directory)
|
|
||||||
print( " Copied interp_lb_profile.bin to run directory " )
|
|
||||||
|
|
||||||
###########################
|
###########################
|
||||||
|
|
||||||
## If the initial-data method is TwoPuncture, copy the TwoPunctureABE executable to the run directory
|
## If the initial-data method is TwoPuncture, copy the TwoPunctureABE executable to the run directory
|
||||||
|
|||||||
@@ -13,9 +13,6 @@ using namespace std;
|
|||||||
#include "MPatch.h"
|
#include "MPatch.h"
|
||||||
#include "Parallel.h"
|
#include "Parallel.h"
|
||||||
#include "fmisc.h"
|
#include "fmisc.h"
|
||||||
#ifdef INTERP_LB_PROFILE
|
|
||||||
#include "interp_lb_profile.h"
|
|
||||||
#endif
|
|
||||||
|
|
||||||
Patch::Patch(int DIM, int *shapei, double *bboxi, int levi, bool buflog, int Symmetry) : lev(levi)
|
Patch::Patch(int DIM, int *shapei, double *bboxi, int levi, bool buflog, int Symmetry) : lev(levi)
|
||||||
{
|
{
|
||||||
@@ -445,7 +442,6 @@ void Patch::Interp_Points(MyList<var> *VarList,
|
|||||||
Bp = Bp->next;
|
Bp = Bp->next;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Replace MPI_Allreduce with per-owner MPI_Bcast:
|
// Replace MPI_Allreduce with per-owner MPI_Bcast:
|
||||||
// Group consecutive points by owner rank and broadcast each group.
|
// Group consecutive points by owner rank and broadcast each group.
|
||||||
// Since each point's data is non-zero only on the owner rank,
|
// Since each point's data is non-zero only on the owner rank,
|
||||||
@@ -510,9 +506,9 @@ void Patch::Interp_Points(MyList<var> *VarList,
|
|||||||
// Targeted point-to-point overload: each owner sends each point only to
|
// Targeted point-to-point overload: each owner sends each point only to
|
||||||
// the one rank that needs it for integration (consumer), reducing
|
// the one rank that needs it for integration (consumer), reducing
|
||||||
// communication volume by ~nprocs times compared to the Bcast version.
|
// communication volume by ~nprocs times compared to the Bcast version.
|
||||||
#ifdef INTERP_LB_PROFILE
|
/*
|
||||||
double t_interp_start = MPI_Wtime();
|
double t_calc_end, t_calc_total = 0;
|
||||||
#endif
|
double t_calc_start = MPI_Wtime();*/
|
||||||
int myrank, nprocs;
|
int myrank, nprocs;
|
||||||
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
|
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
|
||||||
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
|
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
|
||||||
@@ -613,17 +609,15 @@ void Patch::Interp_Points(MyList<var> *VarList,
|
|||||||
Bp = Bp->next;
|
Bp = Bp->next;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
/*
|
||||||
#ifdef INTERP_LB_PROFILE
|
t_calc_end = MPI_Wtime();
|
||||||
double t_interp_end = MPI_Wtime();
|
t_calc_total = t_calc_end - t_calc_start;*/
|
||||||
double t_interp_local = t_interp_end - t_interp_start;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// --- Error check for unfound points ---
|
// --- Error check for unfound points ---
|
||||||
for (int j = 0; j < NN; j++)
|
for (int j = 0; j < NN; j++)
|
||||||
{
|
{
|
||||||
if (owner_rank[j] < 0 && myrank == 0)
|
if (owner_rank[j] < 0 && myrank == 0)
|
||||||
{
|
{
|
||||||
|
cout<<owner_rank[j-1]<<endl;
|
||||||
cout << "ERROR: Patch::Interp_Points fails to find point (";
|
cout << "ERROR: Patch::Interp_Points fails to find point (";
|
||||||
for (int d = 0; d < dim; d++)
|
for (int d = 0; d < dim; d++)
|
||||||
{
|
{
|
||||||
@@ -775,31 +769,63 @@ void Patch::Interp_Points(MyList<var> *VarList,
|
|||||||
delete[] recv_count;
|
delete[] recv_count;
|
||||||
delete[] consumer_rank;
|
delete[] consumer_rank;
|
||||||
delete[] owner_rank;
|
delete[] owner_rank;
|
||||||
|
/*
|
||||||
|
// 4. 汇总并输出真正干活最慢的 Top 4
|
||||||
|
struct RankStats {
|
||||||
|
int rank;
|
||||||
|
double calc_time; // 净计算时间
|
||||||
|
};
|
||||||
|
|
||||||
#ifdef INTERP_LB_PROFILE
|
// 创建当前进程的统计数据
|
||||||
{
|
RankStats local_stat;
|
||||||
static bool profile_written = false;
|
local_stat.rank = myrank;
|
||||||
if (!profile_written) {
|
local_stat.calc_time = t_calc_total;
|
||||||
double *all_times = nullptr;
|
|
||||||
if (myrank == 0) all_times = new double[nprocs];
|
// 为所有进程的统计数据分配内存
|
||||||
MPI_Gather(&t_interp_local, 1, MPI_DOUBLE,
|
RankStats *all_stats = nullptr;
|
||||||
all_times, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
|
|
||||||
if (myrank == 0) {
|
if (myrank == 0) {
|
||||||
int heavy[64];
|
all_stats = new RankStats[nprocs];
|
||||||
int nh = InterpLBProfile::identify_heavy_ranks(
|
|
||||||
all_times, nprocs, 2.5, heavy, 64);
|
|
||||||
InterpLBProfile::write_profile(
|
|
||||||
"interp_lb_profile.bin", nprocs,
|
|
||||||
all_times, heavy, nh, 2.5);
|
|
||||||
printf("[InterpLB] Profile written: %d heavy ranks\n", nh);
|
|
||||||
for (int i = 0; i < nh; i++)
|
|
||||||
printf(" Heavy rank %d: %.6f s\n", heavy[i], all_times[heavy[i]]);
|
|
||||||
delete[] all_times;
|
|
||||||
}
|
}
|
||||||
profile_written = true;
|
|
||||||
|
// 使用MPI_Gather收集所有进程的数据到rank 0
|
||||||
|
MPI_Gather(&local_stat, sizeof(RankStats), MPI_BYTE,
|
||||||
|
all_stats, sizeof(RankStats), MPI_BYTE,
|
||||||
|
0, MPI_COMM_WORLD);
|
||||||
|
|
||||||
|
// 准备输出前4个rank的信息(所有rank都参与,确保广播后一致)
|
||||||
|
int top10_ranks[10] = { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 };
|
||||||
|
double top10_times[10] = { 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 };
|
||||||
|
int num_top10 = 0;
|
||||||
|
|
||||||
|
if (myrank == 0) {
|
||||||
|
// 按 calc_time(净计算时间)排序
|
||||||
|
std::sort(all_stats, all_stats + nprocs, [](const RankStats& a, const RankStats& b) {
|
||||||
|
return a.calc_time > b.calc_time;
|
||||||
|
});
|
||||||
|
|
||||||
|
// 取前4个
|
||||||
|
num_top10 = (nprocs < 10) ? nprocs : 10;
|
||||||
|
for (int i = 0; i < num_top10; i++) {
|
||||||
|
top10_ranks[i] = all_stats[i].rank;
|
||||||
|
top10_times[i] = all_stats[i].calc_time;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
printf("\n--- Top %d Ranks by ACTIVE COMPUTATION (CPU Time) ---\n", num_top10);
|
||||||
|
for (int i = 0; i < num_top10; i++) {
|
||||||
|
printf("Rank [%4d]: Calc %.6f s\n", top10_ranks[i], top10_times[i]);
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
|
// 清理分配的内存
|
||||||
|
delete[] all_stats;
|
||||||
|
}
|
||||||
|
|
||||||
|
// 广播前4个rank的信息给所有进程
|
||||||
|
MPI_Bcast(&num_top10, 1, MPI_INT, 0, MPI_COMM_WORLD);
|
||||||
|
if (num_top10 > 0) {
|
||||||
|
MPI_Bcast(top10_ranks, 10, MPI_INT, 0, MPI_COMM_WORLD);
|
||||||
|
MPI_Bcast(top10_times, 10, MPI_DOUBLE, 0, MPI_COMM_WORLD);
|
||||||
|
}
|
||||||
|
*/
|
||||||
}
|
}
|
||||||
void Patch::Interp_Points(MyList<var> *VarList,
|
void Patch::Interp_Points(MyList<var> *VarList,
|
||||||
int NN, double **XX,
|
int NN, double **XX,
|
||||||
|
|||||||
@@ -24,6 +24,7 @@ using namespace std;
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
#include <mpi.h>
|
#include <mpi.h>
|
||||||
|
#include <memory.h>
|
||||||
#include "MyList.h"
|
#include "MyList.h"
|
||||||
#include "Block.h"
|
#include "Block.h"
|
||||||
#include "Parallel.h"
|
#include "Parallel.h"
|
||||||
|
|||||||
@@ -4,6 +4,7 @@
|
|||||||
#include "prolongrestrict.h"
|
#include "prolongrestrict.h"
|
||||||
#include "misc.h"
|
#include "misc.h"
|
||||||
#include "parameters.h"
|
#include "parameters.h"
|
||||||
|
#include <set>
|
||||||
|
|
||||||
int Parallel::partition1(int &nx, int split_size, int min_width, int cpusize, int shape) // special for 1 diemnsion
|
int Parallel::partition1(int &nx, int split_size, int min_width, int cpusize, int shape) // special for 1 diemnsion
|
||||||
{
|
{
|
||||||
@@ -115,7 +116,7 @@ int Parallel::partition3(int *nxyz, int split_size, int *min_width, int cpusize,
|
|||||||
return nx * ny * nz;
|
return nx * ny * nz;
|
||||||
#undef SEARCH_SIZE
|
#undef SEARCH_SIZE
|
||||||
}
|
}
|
||||||
#elif 1 // Zhihui's idea one on 2013-09-25
|
#elif 0 // Zhihui's idea one on 2013-09-25
|
||||||
{
|
{
|
||||||
int nx, ny, nz;
|
int nx, ny, nz;
|
||||||
int hmin_width;
|
int hmin_width;
|
||||||
@@ -150,7 +151,7 @@ int Parallel::partition3(int *nxyz, int split_size, int *min_width, int cpusize,
|
|||||||
|
|
||||||
return nx * ny * nz;
|
return nx * ny * nz;
|
||||||
}
|
}
|
||||||
#elif 1 // Zhihui's idea two on 2013-09-25
|
#elif 0 // Zhihui's idea two on 2013-09-25
|
||||||
{
|
{
|
||||||
int nx, ny, nz;
|
int nx, ny, nz;
|
||||||
const int hmin_width = 8; // for example we use 8
|
const int hmin_width = 8; // for example we use 8
|
||||||
@@ -462,7 +463,7 @@ MyList<Block> *Parallel::distribute(MyList<Patch> *PatchLIST, int cpusize, int i
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
ng = ng0 = new Block(dim, shape_here, bbox_here, n_rank++, ingfsi, fngfsi, PP->lev);
|
ng = ng0 = new Block(dim, shape_here, bbox_here, n_rank++, ingfsi, fngfsi, PP->lev); // delete through KillBlocks
|
||||||
// ng->checkBlock();
|
// ng->checkBlock();
|
||||||
if (BlL)
|
if (BlL)
|
||||||
BlL->insert(ng);
|
BlL->insert(ng);
|
||||||
@@ -500,11 +501,7 @@ MyList<Block> *Parallel::distribute(MyList<Patch> *PatchLIST, int cpusize, int i
|
|||||||
|
|
||||||
return BlL;
|
return BlL;
|
||||||
}
|
}
|
||||||
|
MyList<Block> *Parallel::distribute_hard(MyList<Patch> *PatchLIST, int cpusize, int ingfsi, int fngfsi,
|
||||||
#ifdef INTERP_LB_OPTIMIZE
|
|
||||||
#include "interp_lb_profile_data.h"
|
|
||||||
|
|
||||||
MyList<Block> *Parallel::distribute_optimize(MyList<Patch> *PatchLIST, int cpusize, int ingfsi, int fngfsi,
|
|
||||||
bool periodic, int nodes)
|
bool periodic, int nodes)
|
||||||
{
|
{
|
||||||
#ifdef USE_GPU_DIVIDE
|
#ifdef USE_GPU_DIVIDE
|
||||||
@@ -519,6 +516,7 @@ MyList<Block> *Parallel::distribute_optimize(MyList<Patch> *PatchLIST, int cpusi
|
|||||||
{
|
{
|
||||||
int myrank;
|
int myrank;
|
||||||
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
|
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
|
||||||
|
// read parameter from file
|
||||||
const int LEN = 256;
|
const int LEN = 256;
|
||||||
char pline[LEN];
|
char pline[LEN];
|
||||||
string str, sgrp, skey, sval;
|
string str, sgrp, skey, sval;
|
||||||
@@ -527,21 +525,44 @@ MyList<Block> *Parallel::distribute_optimize(MyList<Patch> *PatchLIST, int cpusi
|
|||||||
{
|
{
|
||||||
map<string, string>::iterator iter = parameters::str_par.find("inputpar");
|
map<string, string>::iterator iter = parameters::str_par.find("inputpar");
|
||||||
if (iter != parameters::str_par.end())
|
if (iter != parameters::str_par.end())
|
||||||
|
{
|
||||||
strcpy(pname, (iter->second).c_str());
|
strcpy(pname, (iter->second).c_str());
|
||||||
else { cout << "Error inputpar" << endl; exit(0); }
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
cout << "Error inputpar" << endl;
|
||||||
|
exit(0);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
ifstream inf(pname, ifstream::in);
|
ifstream inf(pname, ifstream::in);
|
||||||
if (!inf.good() && myrank == 0)
|
if (!inf.good() && myrank == 0)
|
||||||
{ cout << "Can not open parameter file " << pname << endl; MPI_Abort(MPI_COMM_WORLD, 1); }
|
{
|
||||||
|
cout << "Can not open parameter file " << pname << endl;
|
||||||
|
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||||
|
}
|
||||||
|
|
||||||
for (int i = 1; inf.good(); i++)
|
for (int i = 1; inf.good(); i++)
|
||||||
{
|
{
|
||||||
inf.getline(pline, LEN); str = pline;
|
inf.getline(pline, LEN);
|
||||||
|
str = pline;
|
||||||
|
|
||||||
int status = misc::parse_parts(str, sgrp, skey, sval, sind);
|
int status = misc::parse_parts(str, sgrp, skey, sval, sind);
|
||||||
if (status == -1) { cout << "error reading parameter file " << pname << " in line " << i << endl; MPI_Abort(MPI_COMM_WORLD, 1); }
|
if (status == -1)
|
||||||
else if (status == 0) continue;
|
{
|
||||||
if (sgrp == "ABE") { if (skey == "cpu part") cpu_part = atof(sval.c_str()); }
|
cout << "error reading parameter file " << pname << " in line " << i << endl;
|
||||||
|
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||||
|
}
|
||||||
|
else if (status == 0)
|
||||||
|
continue;
|
||||||
|
|
||||||
|
if (sgrp == "ABE")
|
||||||
|
{
|
||||||
|
if (skey == "cpu part")
|
||||||
|
cpu_part = atof(sval.c_str());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
inf.close();
|
inf.close();
|
||||||
|
|
||||||
parameters::dou_par.insert(map<string, double>::value_type("cpu part", cpu_part));
|
parameters::dou_par.insert(map<string, double>::value_type("cpu part", cpu_part));
|
||||||
}
|
}
|
||||||
iter = parameters::dou_par.find("gpu part");
|
iter = parameters::dou_par.find("gpu part");
|
||||||
@@ -553,6 +574,7 @@ MyList<Block> *Parallel::distribute_optimize(MyList<Patch> *PatchLIST, int cpusi
|
|||||||
{
|
{
|
||||||
int myrank;
|
int myrank;
|
||||||
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
|
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
|
||||||
|
// read parameter from file
|
||||||
const int LEN = 256;
|
const int LEN = 256;
|
||||||
char pline[LEN];
|
char pline[LEN];
|
||||||
string str, sgrp, skey, sval;
|
string str, sgrp, skey, sval;
|
||||||
@@ -561,26 +583,52 @@ MyList<Block> *Parallel::distribute_optimize(MyList<Patch> *PatchLIST, int cpusi
|
|||||||
{
|
{
|
||||||
map<string, string>::iterator iter = parameters::str_par.find("inputpar");
|
map<string, string>::iterator iter = parameters::str_par.find("inputpar");
|
||||||
if (iter != parameters::str_par.end())
|
if (iter != parameters::str_par.end())
|
||||||
|
{
|
||||||
strcpy(pname, (iter->second).c_str());
|
strcpy(pname, (iter->second).c_str());
|
||||||
else { cout << "Error inputpar" << endl; exit(0); }
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
cout << "Error inputpar" << endl;
|
||||||
|
exit(0);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
ifstream inf(pname, ifstream::in);
|
ifstream inf(pname, ifstream::in);
|
||||||
if (!inf.good() && myrank == 0)
|
if (!inf.good() && myrank == 0)
|
||||||
{ cout << "Can not open parameter file " << pname << endl; MPI_Abort(MPI_COMM_WORLD, 1); }
|
{
|
||||||
|
cout << "Can not open parameter file " << pname << endl;
|
||||||
|
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||||
|
}
|
||||||
|
|
||||||
for (int i = 1; inf.good(); i++)
|
for (int i = 1; inf.good(); i++)
|
||||||
{
|
{
|
||||||
inf.getline(pline, LEN); str = pline;
|
inf.getline(pline, LEN);
|
||||||
|
str = pline;
|
||||||
|
|
||||||
int status = misc::parse_parts(str, sgrp, skey, sval, sind);
|
int status = misc::parse_parts(str, sgrp, skey, sval, sind);
|
||||||
if (status == -1) { cout << "error reading parameter file " << pname << " in line " << i << endl; MPI_Abort(MPI_COMM_WORLD, 1); }
|
if (status == -1)
|
||||||
else if (status == 0) continue;
|
{
|
||||||
if (sgrp == "ABE") { if (skey == "gpu part") gpu_part = atof(sval.c_str()); }
|
cout << "error reading parameter file " << pname << " in line " << i << endl;
|
||||||
|
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||||
|
}
|
||||||
|
else if (status == 0)
|
||||||
|
continue;
|
||||||
|
|
||||||
|
if (sgrp == "ABE")
|
||||||
|
{
|
||||||
|
if (skey == "gpu part")
|
||||||
|
gpu_part = atof(sval.c_str());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
inf.close();
|
inf.close();
|
||||||
|
|
||||||
parameters::dou_par.insert(map<string, double>::value_type("gpu part", gpu_part));
|
parameters::dou_par.insert(map<string, double>::value_type("gpu part", gpu_part));
|
||||||
}
|
}
|
||||||
if (nodes == 0) nodes = cpusize / 2;
|
|
||||||
|
if (nodes == 0)
|
||||||
|
nodes = cpusize / 2;
|
||||||
#else
|
#else
|
||||||
if (nodes == 0) nodes = cpusize;
|
if (nodes == 0)
|
||||||
|
nodes = cpusize;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
if (dim != 3)
|
if (dim != 3)
|
||||||
@@ -591,6 +639,7 @@ MyList<Block> *Parallel::distribute_optimize(MyList<Patch> *PatchLIST, int cpusi
|
|||||||
|
|
||||||
MyList<Block> *BlL = 0;
|
MyList<Block> *BlL = 0;
|
||||||
int split_size, min_size, block_size = 0;
|
int split_size, min_size, block_size = 0;
|
||||||
|
|
||||||
int min_width = 2 * Mymax(ghost_width, buffer_width);
|
int min_width = 2 * Mymax(ghost_width, buffer_width);
|
||||||
int nxyz[dim], mmin_width[dim], min_shape[dim];
|
int nxyz[dim], mmin_width[dim], min_shape[dim];
|
||||||
|
|
||||||
@@ -611,6 +660,7 @@ MyList<Block> *Parallel::distribute_optimize(MyList<Patch> *PatchLIST, int cpusi
|
|||||||
|
|
||||||
for (int i = 0; i < dim; i++)
|
for (int i = 0; i < dim; i++)
|
||||||
mmin_width[i] = Mymin(min_width, min_shape[i]);
|
mmin_width[i] = Mymin(min_width, min_shape[i]);
|
||||||
|
|
||||||
min_size = mmin_width[0];
|
min_size = mmin_width[0];
|
||||||
for (int i = 1; i < dim; i++)
|
for (int i = 1; i < dim; i++)
|
||||||
min_size = min_size * mmin_width[i];
|
min_size = min_size * mmin_width[i];
|
||||||
@@ -619,6 +669,7 @@ MyList<Block> *Parallel::distribute_optimize(MyList<Patch> *PatchLIST, int cpusi
|
|||||||
while (PLi)
|
while (PLi)
|
||||||
{
|
{
|
||||||
Patch *PP = PLi->data;
|
Patch *PP = PLi->data;
|
||||||
|
// PP->checkPatch(true);
|
||||||
int bs = PP->shape[0];
|
int bs = PP->shape[0];
|
||||||
for (int i = 1; i < dim; i++)
|
for (int i = 1; i < dim; i++)
|
||||||
bs = bs * PP->shape[i];
|
bs = bs * PP->shape[i];
|
||||||
@@ -642,25 +693,16 @@ MyList<Block> *Parallel::distribute_optimize(MyList<Patch> *PatchLIST, int cpusi
|
|||||||
for (int j = 0; j < nxyz[1]; j++)
|
for (int j = 0; j < nxyz[1]; j++)
|
||||||
for (int k = 0; k < nxyz[2]; k++)
|
for (int k = 0; k < nxyz[2]; k++)
|
||||||
{
|
{
|
||||||
|
// --- 1. 定义局部变量 ---
|
||||||
int ibbox_here[6], shape_here[3];
|
int ibbox_here[6], shape_here[3];
|
||||||
double bbox_here[6], dd;
|
double bbox_here[6], dd;
|
||||||
Block *current_ng_start = nullptr;
|
Block *current_ng_start = nullptr; // 本次循环产生的第一个(或唯一一个)块
|
||||||
|
|
||||||
bool is_heavy = false;
|
// --- 2. 核心逻辑分支 ---
|
||||||
int r_l = -1, r_r = -1;
|
if (current_block_id == 27 || current_block_id == 28 ||
|
||||||
if (cpusize == INTERP_LB_NPROCS) {
|
current_block_id == 35 || current_block_id == 36)
|
||||||
for (int si = 0; si < INTERP_LB_NUM_HEAVY; si++) {
|
|
||||||
if (current_block_id == interp_lb_splits[si][0]) {
|
|
||||||
is_heavy = true;
|
|
||||||
r_l = interp_lb_splits[si][1];
|
|
||||||
r_r = interp_lb_splits[si][2];
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (is_heavy)
|
|
||||||
{
|
{
|
||||||
|
// A. 计算原始索引 (不带 Ghost)
|
||||||
int ib0 = (PP->shape[0] * i) / nxyz[0];
|
int ib0 = (PP->shape[0] * i) / nxyz[0];
|
||||||
int ib3 = (PP->shape[0] * (i + 1)) / nxyz[0] - 1;
|
int ib3 = (PP->shape[0] * (i + 1)) / nxyz[0] - 1;
|
||||||
int jb1 = (PP->shape[1] * j) / nxyz[1];
|
int jb1 = (PP->shape[1] * j) / nxyz[1];
|
||||||
@@ -668,17 +710,41 @@ MyList<Block> *Parallel::distribute_optimize(MyList<Patch> *PatchLIST, int cpusi
|
|||||||
int kb2 = (PP->shape[2] * k) / nxyz[2];
|
int kb2 = (PP->shape[2] * k) / nxyz[2];
|
||||||
int kb5 = (PP->shape[2] * (k + 1)) / nxyz[2] - 1;
|
int kb5 = (PP->shape[2] * (k + 1)) / nxyz[2] - 1;
|
||||||
|
|
||||||
|
int r_1, r_2, r_3, r_4;
|
||||||
Block * split_first_block = nullptr;
|
Block * split_first_block = nullptr;
|
||||||
Block * split_last_block = nullptr;
|
Block * split_last_block = nullptr;
|
||||||
|
if(current_block_id == 27)
|
||||||
|
{
|
||||||
|
r_1 = 24; r_2 = 25; r_3 = 26; r_4 = 27;
|
||||||
splitHotspotBlock(BlL, dim, ib0, ib3, jb1, jb4, kb2, kb5,
|
splitHotspotBlock(BlL, dim, ib0, ib3, jb1, jb4, kb2, kb5,
|
||||||
PP, r_l, r_r, ingfsi, fngfsi, periodic,
|
PP, r_1,r_2,r_3, r_4, ingfsi, fngfsi, periodic,split_first_block,split_last_block);
|
||||||
split_first_block, split_last_block);
|
}
|
||||||
|
|
||||||
|
else if(current_block_id == 28)
|
||||||
|
{
|
||||||
|
r_1 = 28; r_2 = 29;
|
||||||
|
splitHotspotBlock(BlL, dim, ib0, ib3, jb1, jb4, kb2, kb5,
|
||||||
|
PP, r_1, r_2, ingfsi, fngfsi, periodic,split_first_block,split_last_block);
|
||||||
|
}
|
||||||
|
else if(current_block_id == 35)
|
||||||
|
{
|
||||||
|
r_1 = 34; r_2 = 35;
|
||||||
|
splitHotspotBlock(BlL, dim, ib0, ib3, jb1, jb4, kb2, kb5,
|
||||||
|
PP, r_1, r_2, ingfsi, fngfsi, periodic,split_first_block,split_last_block);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
r_1 = 36; r_2 = 37; r_3 = 38; r_4 = 39;
|
||||||
|
splitHotspotBlock(BlL, dim, ib0, ib3, jb1, jb4, kb2, kb5,
|
||||||
|
PP, r_1,r_2,r_3, r_4, ingfsi, fngfsi, periodic,split_first_block,split_last_block);
|
||||||
|
}
|
||||||
|
|
||||||
current_ng_start = split_first_block;
|
current_ng_start = split_first_block;
|
||||||
ng = split_last_block;
|
ng = split_last_block;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
// B. 普通块逻辑 (含 Ghost 扩张)
|
||||||
ibbox_here[0] = (PP->shape[0] * i) / nxyz[0];
|
ibbox_here[0] = (PP->shape[0] * i) / nxyz[0];
|
||||||
ibbox_here[3] = (PP->shape[0] * (i + 1)) / nxyz[0] - 1;
|
ibbox_here[3] = (PP->shape[0] * (i + 1)) / nxyz[0] - 1;
|
||||||
ibbox_here[1] = (PP->shape[1] * j) / nxyz[1];
|
ibbox_here[1] = (PP->shape[1] * j) / nxyz[1];
|
||||||
@@ -702,27 +768,34 @@ MyList<Block> *Parallel::distribute_optimize(MyList<Patch> *PatchLIST, int cpusi
|
|||||||
|
|
||||||
for(int d=0; d<3; d++) shape_here[d] = ibbox_here[d+3] - ibbox_here[d] + 1;
|
for(int d=0; d<3; d++) shape_here[d] = ibbox_here[d+3] - ibbox_here[d] + 1;
|
||||||
|
|
||||||
|
// 物理坐标计算 (根据你的宏定义 Cell/Vertex)
|
||||||
#ifdef Vertex
|
#ifdef Vertex
|
||||||
#ifdef Cell
|
#ifdef Cell
|
||||||
#error Both Cell and Vertex are defined
|
#error Both Cell and Vertex are defined
|
||||||
#endif
|
#endif
|
||||||
|
// 0--4, 5--10
|
||||||
dd = (PP->bbox[3] - PP->bbox[0]) / (PP->shape[0] - 1);
|
dd = (PP->bbox[3] - PP->bbox[0]) / (PP->shape[0] - 1);
|
||||||
bbox_here[0] = PP->bbox[0] + ibbox_here[0] * dd;
|
bbox_here[0] = PP->bbox[0] + ibbox_here[0] * dd;
|
||||||
bbox_here[3] = PP->bbox[0] + ibbox_here[3] * dd;
|
bbox_here[3] = PP->bbox[0] + ibbox_here[3] * dd;
|
||||||
|
|
||||||
dd = (PP->bbox[4] - PP->bbox[1]) / (PP->shape[1] - 1);
|
dd = (PP->bbox[4] - PP->bbox[1]) / (PP->shape[1] - 1);
|
||||||
bbox_here[1] = PP->bbox[1] + ibbox_here[1] * dd;
|
bbox_here[1] = PP->bbox[1] + ibbox_here[1] * dd;
|
||||||
bbox_here[4] = PP->bbox[1] + ibbox_here[4] * dd;
|
bbox_here[4] = PP->bbox[1] + ibbox_here[4] * dd;
|
||||||
|
|
||||||
dd = (PP->bbox[5] - PP->bbox[2]) / (PP->shape[2] - 1);
|
dd = (PP->bbox[5] - PP->bbox[2]) / (PP->shape[2] - 1);
|
||||||
bbox_here[2] = PP->bbox[2] + ibbox_here[2] * dd;
|
bbox_here[2] = PP->bbox[2] + ibbox_here[2] * dd;
|
||||||
bbox_here[5] = PP->bbox[2] + ibbox_here[5] * dd;
|
bbox_here[5] = PP->bbox[2] + ibbox_here[5] * dd;
|
||||||
#else
|
#else
|
||||||
#ifdef Cell
|
#ifdef Cell
|
||||||
|
// 0--5, 5--10
|
||||||
dd = (PP->bbox[3] - PP->bbox[0]) / PP->shape[0];
|
dd = (PP->bbox[3] - PP->bbox[0]) / PP->shape[0];
|
||||||
bbox_here[0] = PP->bbox[0] + (ibbox_here[0]) * dd;
|
bbox_here[0] = PP->bbox[0] + (ibbox_here[0]) * dd;
|
||||||
bbox_here[3] = PP->bbox[0] + (ibbox_here[3] + 1) * dd;
|
bbox_here[3] = PP->bbox[0] + (ibbox_here[3] + 1) * dd;
|
||||||
|
|
||||||
dd = (PP->bbox[4] - PP->bbox[1]) / PP->shape[1];
|
dd = (PP->bbox[4] - PP->bbox[1]) / PP->shape[1];
|
||||||
bbox_here[1] = PP->bbox[1] + (ibbox_here[1]) * dd;
|
bbox_here[1] = PP->bbox[1] + (ibbox_here[1]) * dd;
|
||||||
bbox_here[4] = PP->bbox[1] + (ibbox_here[4] + 1) * dd;
|
bbox_here[4] = PP->bbox[1] + (ibbox_here[4] + 1) * dd;
|
||||||
|
|
||||||
dd = (PP->bbox[5] - PP->bbox[2]) / PP->shape[2];
|
dd = (PP->bbox[5] - PP->bbox[2]) / PP->shape[2];
|
||||||
bbox_here[2] = PP->bbox[2] + (ibbox_here[2]) * dd;
|
bbox_here[2] = PP->bbox[2] + (ibbox_here[2]) * dd;
|
||||||
bbox_here[5] = PP->bbox[2] + (ibbox_here[5] + 1) * dd;
|
bbox_here[5] = PP->bbox[2] + (ibbox_here[5] + 1) * dd;
|
||||||
@@ -730,29 +803,32 @@ MyList<Block> *Parallel::distribute_optimize(MyList<Patch> *PatchLIST, int cpusi
|
|||||||
#error Not define Vertex nor Cell
|
#error Not define Vertex nor Cell
|
||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
ng = createMappedBlock(BlL, dim, shape_here, bbox_here,
|
ng = createMappedBlock(BlL, dim, shape_here, bbox_here, current_block_id, ingfsi, fngfsi, PP->lev);
|
||||||
current_block_id, ingfsi, fngfsi, PP->lev);
|
|
||||||
current_ng_start = ng;
|
current_ng_start = ng;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// --- 3. 统一处理 Patch 起始 Block 指针 ---
|
||||||
if (first_block_in_patch) {
|
if (first_block_in_patch) {
|
||||||
ng0 = current_ng_start;
|
ng0 = current_ng_start;
|
||||||
|
|
||||||
|
// 立即设置 PP->blb,避免后续循环覆盖 ng0
|
||||||
MyList<Block> *Bp_start = BlL;
|
MyList<Block> *Bp_start = BlL;
|
||||||
while (Bp_start && Bp_start->data != ng0) Bp_start = Bp_start->next;
|
while (Bp_start && Bp_start->data != ng0) Bp_start = Bp_start->next;
|
||||||
PP->blb = Bp_start;
|
PP->blb = Bp_start;
|
||||||
|
|
||||||
first_block_in_patch = false;
|
first_block_in_patch = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
current_block_id++;
|
current_block_id++;
|
||||||
}
|
}
|
||||||
|
|
||||||
{
|
// --- 4. 设置 Patch 结束 Block 指针 ---
|
||||||
MyList<Block> *Bp_end = BlL;
|
MyList<Block> *Bp_end = BlL;
|
||||||
while (Bp_end && Bp_end->data != ng) Bp_end = Bp_end->next;
|
while (Bp_end && Bp_end->data != ng) Bp_end = Bp_end->next;
|
||||||
PP->ble = Bp_end;
|
PP->ble = Bp_end;
|
||||||
}
|
|
||||||
|
|
||||||
PLi = PLi->next;
|
PLi = PLi->next;
|
||||||
|
first_block_in_patch = true;
|
||||||
}
|
}
|
||||||
if (reacpu < nodes * 2 / 3)
|
if (reacpu < nodes * 2 / 3)
|
||||||
{
|
{
|
||||||
@@ -765,24 +841,50 @@ MyList<Block> *Parallel::distribute_optimize(MyList<Patch> *PatchLIST, int cpusi
|
|||||||
return BlL;
|
return BlL;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief 将当前 Block 几何4等分并存入列表
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
Block* Parallel::splitHotspotBlock(MyList<Block>* &BlL, int _dim,
|
Block* Parallel::splitHotspotBlock(MyList<Block>* &BlL, int _dim,
|
||||||
int ib0_orig, int ib3_orig,
|
int ib0_orig, int ib3_orig,
|
||||||
int jb1_orig, int jb4_orig,
|
int jb1_orig, int jb4_orig,
|
||||||
int kb2_orig, int kb5_orig,
|
int kb2_orig, int kb5_orig,
|
||||||
Patch* PP, int r_left, int r_right,
|
Patch* PP, int r_1, int r_2, int r_3, int r_4,
|
||||||
int ingfsi, int fngfsi, bool periodic,
|
int ingfsi, int fngfsi, bool periodic,
|
||||||
Block* &split_first_block, Block* &split_last_block)
|
Block* &split_first_block, Block* &split_last_block)
|
||||||
{
|
{
|
||||||
int mid = (ib0_orig + ib3_orig) / 2;
|
// 1. 计算四分索引区间
|
||||||
|
// 计算 X 方向的总网格数
|
||||||
|
int total_len = ib3_orig - ib0_orig + 1;
|
||||||
|
|
||||||
int indices_L[6] = {ib0_orig, jb1_orig, kb2_orig, mid, jb4_orig, kb5_orig};
|
// 计算三个切分点,确保覆盖 [ib0_orig, ib3_orig]
|
||||||
int indices_R[6] = {mid + 1, jb1_orig, kb2_orig, ib3_orig, jb4_orig, kb5_orig};
|
// 段 1: [ib0_orig, m1]
|
||||||
|
// 段 2: [m1 + 1, m2]
|
||||||
|
// 段 3: [m2 + 1, m3]
|
||||||
|
// 段 4: [m3 + 1, ib3_orig]
|
||||||
|
int m1 = ib0_orig + total_len / 4 - 1;
|
||||||
|
int m2 = ib0_orig + total_len / 2 - 1;
|
||||||
|
int m3 = ib0_orig + (3 * total_len) / 4 - 1;
|
||||||
|
|
||||||
|
int indices[4][6] = {
|
||||||
|
{ib0_orig, jb1_orig, kb2_orig, m1, jb4_orig, kb5_orig}, // 子块 1
|
||||||
|
{m1 + 1, jb1_orig, kb2_orig, m2, jb4_orig, kb5_orig}, // 子块 2
|
||||||
|
{m2 + 1, jb1_orig, kb2_orig, m3, jb4_orig, kb5_orig}, // 子块 3
|
||||||
|
{m3 + 1, jb1_orig, kb2_orig, ib3_orig, jb4_orig, kb5_orig} // 子块 4
|
||||||
|
};
|
||||||
|
|
||||||
|
int target_ranks[4] = {r_1, r_2, r_3, r_4};
|
||||||
|
|
||||||
|
// 2. 内部处理逻辑 (保持原有的 Ghost 扩张和物理坐标转换)
|
||||||
auto createSubBlock = [&](int* ib_raw, int target_rank) {
|
auto createSubBlock = [&](int* ib_raw, int target_rank) {
|
||||||
int ib_final[6];
|
int ib_final[6];
|
||||||
int sh_here[3];
|
int sh_here[3];
|
||||||
double bb_here[6], dd;
|
double bb_here[6], dd;
|
||||||
|
|
||||||
|
// --- 逻辑 A: Ghost 扩张 ---
|
||||||
if (periodic) {
|
if (periodic) {
|
||||||
ib_final[0] = ib_raw[0] - ghost_width;
|
ib_final[0] = ib_raw[0] - ghost_width;
|
||||||
ib_final[3] = ib_raw[3] + ghost_width;
|
ib_final[3] = ib_raw[3] + ghost_width;
|
||||||
@@ -803,68 +905,40 @@ Block* Parallel::splitHotspotBlock(MyList<Block>* &BlL, int _dim,
|
|||||||
sh_here[1] = ib_final[4] - ib_final[1] + 1;
|
sh_here[1] = ib_final[4] - ib_final[1] + 1;
|
||||||
sh_here[2] = ib_final[5] - ib_final[2] + 1;
|
sh_here[2] = ib_final[5] - ib_final[2] + 1;
|
||||||
|
|
||||||
#ifdef Vertex
|
// --- 逻辑 B: 物理坐标计算 ---
|
||||||
dd = (PP->bbox[3] - PP->bbox[0]) / (PP->shape[0] - 1);
|
|
||||||
bb_here[0] = PP->bbox[0] + ib_final[0] * dd;
|
|
||||||
bb_here[3] = PP->bbox[0] + ib_final[3] * dd;
|
|
||||||
dd = (PP->bbox[4] - PP->bbox[1]) / (PP->shape[1] - 1);
|
|
||||||
bb_here[1] = PP->bbox[1] + ib_final[1] * dd;
|
|
||||||
bb_here[4] = PP->bbox[1] + ib_final[4] * dd;
|
|
||||||
dd = (PP->bbox[5] - PP->bbox[2]) / (PP->shape[2] - 1);
|
|
||||||
bb_here[2] = PP->bbox[2] + ib_final[2] * dd;
|
|
||||||
bb_here[5] = PP->bbox[2] + ib_final[5] * dd;
|
|
||||||
#else
|
|
||||||
#ifdef Cell
|
|
||||||
dd = (PP->bbox[3] - PP->bbox[0]) / PP->shape[0];
|
dd = (PP->bbox[3] - PP->bbox[0]) / PP->shape[0];
|
||||||
bb_here[0] = PP->bbox[0] + ib_final[0] * dd;
|
bb_here[0] = PP->bbox[0] + ib_final[0] * dd;
|
||||||
bb_here[3] = PP->bbox[0] + (ib_final[3] + 1) * dd;
|
bb_here[3] = PP->bbox[0] + (ib_final[3] + 1) * dd;
|
||||||
|
|
||||||
dd = (PP->bbox[4] - PP->bbox[1]) / PP->shape[1];
|
dd = (PP->bbox[4] - PP->bbox[1]) / PP->shape[1];
|
||||||
bb_here[1] = PP->bbox[1] + ib_final[1] * dd;
|
bb_here[1] = PP->bbox[1] + ib_final[1] * dd;
|
||||||
bb_here[4] = PP->bbox[1] + (ib_final[4] + 1) * dd;
|
bb_here[4] = PP->bbox[1] + (ib_final[4] + 1) * dd;
|
||||||
|
|
||||||
dd = (PP->bbox[5] - PP->bbox[2]) / PP->shape[2];
|
dd = (PP->bbox[5] - PP->bbox[2]) / PP->shape[2];
|
||||||
bb_here[2] = PP->bbox[2] + ib_final[2] * dd;
|
bb_here[2] = PP->bbox[2] + ib_final[2] * dd;
|
||||||
bb_here[5] = PP->bbox[2] + (ib_final[5] + 1) * dd;
|
bb_here[5] = PP->bbox[2] + (ib_final[5] + 1) * dd;
|
||||||
#endif
|
|
||||||
#endif
|
|
||||||
|
|
||||||
Block* Bg = new Block(dim, sh_here, bb_here, target_rank, ingfsi, fngfsi, PP->lev);
|
Block* Bg = new Block(_dim, sh_here, bb_here, target_rank, ingfsi, fngfsi, PP->lev);
|
||||||
|
|
||||||
if (BlL) BlL->insert(Bg);
|
if (BlL) BlL->insert(Bg);
|
||||||
else BlL = new MyList<Block>(Bg);
|
else BlL = new MyList<Block>(Bg);
|
||||||
|
|
||||||
return Bg;
|
return Bg;
|
||||||
};
|
};
|
||||||
|
|
||||||
split_first_block = createSubBlock(indices_L, r_left);
|
// 3. 执行创建并返回首尾指针
|
||||||
split_last_block = createSubBlock(indices_R, r_right);
|
split_first_block = createSubBlock(indices[0], target_ranks[0]);
|
||||||
return split_last_block;
|
createSubBlock(indices[1], target_ranks[1]); // 中间块仅插入 List
|
||||||
|
createSubBlock(indices[2], target_ranks[2]); // 中间块仅插入 List
|
||||||
|
split_last_block = createSubBlock(indices[3], target_ranks[3]);
|
||||||
}
|
}
|
||||||
|
|
||||||
Block* Parallel::createMappedBlock(MyList<Block>* &BlL, int _dim, int* shape, double* bbox,
|
/**
|
||||||
int block_id, int ingfsi, int fngfsi, int lev)
|
* @brief 将当前 Block 几何二等分并存入列表
|
||||||
{
|
* @param axis 拆分轴:0-x, 1-y, 2-z (建议选最长轴)
|
||||||
int target_rank = block_id;
|
*/
|
||||||
if (INTERP_LB_NPROCS > 0) {
|
|
||||||
for (int ri = 0; ri < interp_lb_num_remaps; ri++) {
|
|
||||||
if (block_id == interp_lb_remaps[ri][0]) {
|
|
||||||
target_rank = interp_lb_remaps[ri][1];
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
Block* ng = new Block(dim, shape, bbox, target_rank, ingfsi, fngfsi, lev);
|
|
||||||
if (BlL) BlL->insert(ng);
|
|
||||||
else BlL = new MyList<Block>(ng);
|
|
||||||
|
|
||||||
return ng;
|
|
||||||
}
|
|
||||||
#else
|
|
||||||
// When INTERP_LB_OPTIMIZE is not defined, distribute_optimize falls back to distribute
|
|
||||||
MyList<Block> *Parallel::distribute_optimize(MyList<Patch> *PatchLIST, int cpusize, int ingfsi, int fngfsi,
|
|
||||||
bool periodic, int nodes)
|
|
||||||
{
|
|
||||||
return distribute(PatchLIST, cpusize, ingfsi, fngfsi, periodic, nodes);
|
|
||||||
}
|
|
||||||
Block* Parallel::splitHotspotBlock(MyList<Block>* &BlL, int _dim,
|
Block* Parallel::splitHotspotBlock(MyList<Block>* &BlL, int _dim,
|
||||||
int ib0_orig, int ib3_orig,
|
int ib0_orig, int ib3_orig,
|
||||||
int jb1_orig, int jb4_orig,
|
int jb1_orig, int jb4_orig,
|
||||||
@@ -872,11 +946,101 @@ Block* Parallel::splitHotspotBlock(MyList<Block>* &BlL, int _dim,
|
|||||||
Patch* PP, int r_left, int r_right,
|
Patch* PP, int r_left, int r_right,
|
||||||
int ingfsi, int fngfsi, bool periodic,
|
int ingfsi, int fngfsi, bool periodic,
|
||||||
Block* &split_first_block, Block* &split_last_block)
|
Block* &split_first_block, Block* &split_last_block)
|
||||||
{ return nullptr; }
|
{
|
||||||
|
// 1. 索引二分 (基于无 ghost 的原始索引)
|
||||||
|
int mid = (ib0_orig + ib3_orig) / 2;
|
||||||
|
|
||||||
|
// 左块原始索引: [ib0, mid], 右块原始索引: [mid+1, ib3]
|
||||||
|
int indices_L[6] = {ib0_orig, jb1_orig, kb2_orig, mid, jb4_orig, kb5_orig};
|
||||||
|
int indices_R[6] = {mid + 1, jb1_orig, kb2_orig, ib3_orig, jb4_orig, kb5_orig};
|
||||||
|
|
||||||
|
// 2. 内部处理逻辑 (复刻原 distribute 逻辑)
|
||||||
|
auto createSubBlock = [&](int* ib_raw, int target_rank) {
|
||||||
|
int ib_final[6];
|
||||||
|
int sh_here[3];
|
||||||
|
double bb_here[6], dd;
|
||||||
|
|
||||||
|
// --- 逻辑 A: Ghost 扩张 ---
|
||||||
|
if (periodic) {
|
||||||
|
ib_final[0] = ib_raw[0] - ghost_width;
|
||||||
|
ib_final[3] = ib_raw[3] + ghost_width;
|
||||||
|
ib_final[1] = ib_raw[1] - ghost_width;
|
||||||
|
ib_final[4] = ib_raw[4] + ghost_width;
|
||||||
|
ib_final[2] = ib_raw[2] - ghost_width;
|
||||||
|
ib_final[5] = ib_raw[5] + ghost_width;
|
||||||
|
} else {
|
||||||
|
ib_final[0] = Mymax(0, ib_raw[0] - ghost_width);
|
||||||
|
ib_final[3] = Mymin(PP->shape[0] - 1, ib_raw[3] + ghost_width);
|
||||||
|
ib_final[1] = Mymax(0, ib_raw[1] - ghost_width);
|
||||||
|
ib_final[4] = Mymin(PP->shape[1] - 1, ib_raw[4] + ghost_width);
|
||||||
|
ib_final[2] = Mymax(0, ib_raw[2] - ghost_width);
|
||||||
|
ib_final[5] = Mymin(PP->shape[2] - 1, ib_raw[5] + ghost_width);
|
||||||
|
}
|
||||||
|
|
||||||
|
sh_here[0] = ib_final[3] - ib_final[0] + 1;
|
||||||
|
sh_here[1] = ib_final[4] - ib_final[1] + 1;
|
||||||
|
sh_here[2] = ib_final[5] - ib_final[2] + 1;
|
||||||
|
|
||||||
|
// --- 逻辑 B: 物理坐标计算 (严格匹配 Cell 模式) ---
|
||||||
|
// X 方向
|
||||||
|
dd = (PP->bbox[3] - PP->bbox[0]) / PP->shape[0];
|
||||||
|
bb_here[0] = PP->bbox[0] + ib_final[0] * dd;
|
||||||
|
bb_here[3] = PP->bbox[0] + (ib_final[3] + 1) * dd;
|
||||||
|
|
||||||
|
// Y 方向
|
||||||
|
dd = (PP->bbox[4] - PP->bbox[1]) / PP->shape[1];
|
||||||
|
bb_here[1] = PP->bbox[1] + ib_final[1] * dd;
|
||||||
|
bb_here[4] = PP->bbox[1] + (ib_final[4] + 1) * dd;
|
||||||
|
|
||||||
|
// Z 方向
|
||||||
|
dd = (PP->bbox[5] - PP->bbox[2]) / PP->shape[2];
|
||||||
|
bb_here[2] = PP->bbox[2] + ib_final[2] * dd;
|
||||||
|
bb_here[5] = PP->bbox[2] + (ib_final[5] + 1) * dd;
|
||||||
|
|
||||||
|
Block* Bg = new Block(dim, sh_here, bb_here, target_rank, ingfsi, fngfsi, PP->lev);
|
||||||
|
if (BlL) BlL->insert(Bg);
|
||||||
|
else BlL = new MyList<Block>(Bg);
|
||||||
|
|
||||||
|
|
||||||
|
return Bg;
|
||||||
|
};
|
||||||
|
|
||||||
|
// 执行创建
|
||||||
|
split_first_block = createSubBlock(indices_L, r_left);
|
||||||
|
split_last_block = createSubBlock(indices_R, r_right);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief 创建映射后的 Block
|
||||||
|
*/
|
||||||
Block* Parallel::createMappedBlock(MyList<Block>* &BlL, int _dim, int* shape, double* bbox,
|
Block* Parallel::createMappedBlock(MyList<Block>* &BlL, int _dim, int* shape, double* bbox,
|
||||||
int block_id, int ingfsi, int fngfsi, int lev)
|
int block_id, int ingfsi, int fngfsi, int lev)
|
||||||
{ return nullptr; }
|
{
|
||||||
#endif
|
// 映射表逻辑
|
||||||
|
int target_rank = block_id;
|
||||||
|
switch(block_id){
|
||||||
|
case 24: target_rank = 23; break;
|
||||||
|
case 25: target_rank = 23; break;
|
||||||
|
case 26: target_rank = 23; break;
|
||||||
|
case 29: target_rank = 30; break;
|
||||||
|
case 34: target_rank = 33; break;
|
||||||
|
case 37: target_rank = 40; break;
|
||||||
|
case 38: target_rank = 40; break;
|
||||||
|
case 39: target_rank = 40; break;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
Block* ng = new Block(dim, shape, bbox, target_rank, ingfsi, fngfsi, lev);
|
||||||
|
|
||||||
|
if (BlL) BlL->insert(ng);
|
||||||
|
else BlL = new MyList<Block>(ng);
|
||||||
|
|
||||||
|
return ng;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#elif (PSTR == 1 || PSTR == 2 || PSTR == 3)
|
#elif (PSTR == 1 || PSTR == 2 || PSTR == 3)
|
||||||
MyList<Block> *Parallel::distribute(MyList<Patch> *PatchLIST, int cpusize, int ingfsi, int fngfsi,
|
MyList<Block> *Parallel::distribute(MyList<Patch> *PatchLIST, int cpusize, int ingfsi, int fngfsi,
|
||||||
@@ -6860,3 +7024,224 @@ void Parallel::checkpatchlist(MyList<Patch> *PatL, bool buflog)
|
|||||||
PL = PL->next;
|
PL = PL->next;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
// Check if load balancing is needed based on interpolation times
|
||||||
|
bool Parallel::check_load_balance_need(double *rank_times, int nprocs, int &num_heavy, int *heavy_ranks)
|
||||||
|
{
|
||||||
|
// Calculate average time
|
||||||
|
double avg_time = 0;
|
||||||
|
for (int r = 0; r < nprocs; r++)
|
||||||
|
{
|
||||||
|
avg_time += rank_times[r];
|
||||||
|
}
|
||||||
|
avg_time /= nprocs;
|
||||||
|
|
||||||
|
// Identify heavy ranks (time > 1.5x average)
|
||||||
|
std::vector<std::pair<int, double>> rank_times_vec;
|
||||||
|
for (int r = 0; r < nprocs; r++)
|
||||||
|
{
|
||||||
|
if (rank_times[r] > avg_time * 1.5)
|
||||||
|
{
|
||||||
|
rank_times_vec.push_back(std::make_pair(r, rank_times[r]));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Sort by time (descending)
|
||||||
|
std::sort(rank_times_vec.begin(), rank_times_vec.end(),
|
||||||
|
[](const std::pair<int, double>& a, const std::pair<int, double>& b) {
|
||||||
|
return a.second > b.second;
|
||||||
|
});
|
||||||
|
|
||||||
|
// Take top 4 heavy ranks
|
||||||
|
num_heavy = std::min(4, (int)rank_times_vec.size());
|
||||||
|
if (num_heavy > 0)
|
||||||
|
{
|
||||||
|
for (int i = 0; i < num_heavy; i++)
|
||||||
|
{
|
||||||
|
heavy_ranks[i] = rank_times_vec[i].first;
|
||||||
|
}
|
||||||
|
return true; // Load balancing is needed
|
||||||
|
}
|
||||||
|
|
||||||
|
return false; // No load balancing needed
|
||||||
|
}
|
||||||
|
|
||||||
|
// Split blocks belonging to heavy ranks to improve load balancing
|
||||||
|
// Strategy: Split heavy rank blocks in half, merge 8 light ranks to free 4 ranks
|
||||||
|
void Parallel::split_heavy_blocks(MyList<Patch> *PatL, int *heavy_ranks, int num_heavy,
|
||||||
|
int split_factor, int cpusize, int ingfsi, int fngfsi)
|
||||||
|
{
|
||||||
|
int myrank, nprocs;
|
||||||
|
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
|
||||||
|
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
|
||||||
|
|
||||||
|
if (myrank != 0) return; // Only rank 0 performs the analysis
|
||||||
|
|
||||||
|
cout << "\n=== Load Balancing Strategy ===" << endl;
|
||||||
|
cout << "Heavy ranks to split (in half): " << num_heavy << endl;
|
||||||
|
for (int i = 0; i < num_heavy; i++)
|
||||||
|
cout << " Heavy rank " << heavy_ranks[i] << endl;
|
||||||
|
|
||||||
|
// Step 1: Identify all blocks and their ranks
|
||||||
|
std::vector<int> all_ranks;
|
||||||
|
std::map<int, std::vector<Block*>> rank_to_blocks;
|
||||||
|
|
||||||
|
MyList<Patch> *PL = PatL;
|
||||||
|
while (PL)
|
||||||
|
{
|
||||||
|
Patch *PP = PL->data;
|
||||||
|
MyList<Block> *BP = PP->blb;
|
||||||
|
while (BP)
|
||||||
|
{
|
||||||
|
Block *block = BP->data;
|
||||||
|
all_ranks.push_back(block->rank);
|
||||||
|
rank_to_blocks[block->rank].push_back(block);
|
||||||
|
BP = BP->next;
|
||||||
|
}
|
||||||
|
PL = PL->next;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Step 2: Identify light ranks (not in heavy_ranks list)
|
||||||
|
std::set<int> heavy_set(heavy_ranks, heavy_ranks + num_heavy);
|
||||||
|
std::vector<int> light_ranks;
|
||||||
|
for (int r : all_ranks)
|
||||||
|
{
|
||||||
|
if (heavy_set.find(r) == heavy_set.end())
|
||||||
|
{
|
||||||
|
light_ranks.push_back(r);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Remove duplicates from light_ranks
|
||||||
|
std::sort(light_ranks.begin(), light_ranks.end());
|
||||||
|
light_ranks.erase(std::unique(light_ranks.begin(), light_ranks.end()), light_ranks.end());
|
||||||
|
|
||||||
|
cout << "Found " << light_ranks.size() << " light ranks (candidates for merging)" << endl;
|
||||||
|
|
||||||
|
// Step 3: Select 8 light ranks to merge (those with smallest workload)
|
||||||
|
// For now, we select the first 8 light ranks
|
||||||
|
int num_to_merge = 8;
|
||||||
|
if (light_ranks.size() < num_to_merge)
|
||||||
|
{
|
||||||
|
cout << "WARNING: Not enough light ranks to merge. Found " << light_ranks.size()
|
||||||
|
<< ", need " << num_to_merge << endl;
|
||||||
|
num_to_merge = light_ranks.size();
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<int> ranks_to_merge(light_ranks.begin(), light_ranks.begin() + num_to_merge);
|
||||||
|
|
||||||
|
cout << "Light ranks to merge (8 -> 4 merged ranks):" << endl;
|
||||||
|
for (int i = 0; i < num_to_merge; i++)
|
||||||
|
cout << " Rank " << ranks_to_merge[i] << endl;
|
||||||
|
|
||||||
|
// Step 4: Analyze blocks that need to be split
|
||||||
|
cout << "\n=== Analyzing blocks for splitting ===" << endl;
|
||||||
|
|
||||||
|
struct BlockSplitInfo {
|
||||||
|
Block *original_block;
|
||||||
|
int split_dim;
|
||||||
|
int split_point;
|
||||||
|
};
|
||||||
|
|
||||||
|
std::vector<BlockSplitInfo> blocks_to_split;
|
||||||
|
|
||||||
|
PL = PatL;
|
||||||
|
while (PL)
|
||||||
|
{
|
||||||
|
Patch *PP = PL->data;
|
||||||
|
MyList<Block> *BP = PP->blb;
|
||||||
|
while (BP)
|
||||||
|
{
|
||||||
|
Block *block = BP->data;
|
||||||
|
|
||||||
|
// Check if this block belongs to a heavy rank
|
||||||
|
for (int i = 0; i < num_heavy; i++)
|
||||||
|
{
|
||||||
|
if (block->rank == heavy_ranks[i])
|
||||||
|
{
|
||||||
|
// Find the largest dimension for splitting
|
||||||
|
int max_dim = 0;
|
||||||
|
int max_size = block->shape[0];
|
||||||
|
for (int d = 1; d < dim; d++)
|
||||||
|
{
|
||||||
|
if (block->shape[d] > max_size)
|
||||||
|
{
|
||||||
|
max_size = block->shape[d];
|
||||||
|
max_dim = d;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
int split_point = max_size / 2;
|
||||||
|
|
||||||
|
BlockSplitInfo info;
|
||||||
|
info.original_block = block;
|
||||||
|
info.split_dim = max_dim;
|
||||||
|
info.split_point = split_point;
|
||||||
|
blocks_to_split.push_back(info);
|
||||||
|
|
||||||
|
cout << "Block at rank " << block->rank << " will be split" << endl;
|
||||||
|
cout << " Shape: [" << block->shape[0] << ", " << block->shape[1] << ", " << block->shape[2] << "]" << endl;
|
||||||
|
cout << " Split along dimension " << max_dim << " at index " << split_point << endl;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
BP = BP->next;
|
||||||
|
}
|
||||||
|
PL = PL->next;
|
||||||
|
}
|
||||||
|
|
||||||
|
cout << "\nTotal blocks to split: " << blocks_to_split.size() << endl;
|
||||||
|
|
||||||
|
// Step 5: Calculate new rank assignments
|
||||||
|
// Strategy:
|
||||||
|
// - For each heavy rank, its blocks are split in half
|
||||||
|
// - First half keeps the original rank
|
||||||
|
// - Second half gets a new rank (from the freed light ranks)
|
||||||
|
// - 8 light ranks are merged into 4 ranks, freeing up 4 ranks
|
||||||
|
|
||||||
|
std::vector<int> freed_ranks;
|
||||||
|
for (size_t i = 0; i < ranks_to_merge.size(); i += 2)
|
||||||
|
{
|
||||||
|
// Merge pairs of light ranks: (ranks_to_merge[i], ranks_to_merge[i+1]) -> ranks_to_merge[i]
|
||||||
|
// This frees up ranks_to_merge[i+1]
|
||||||
|
if (i + 1 < ranks_to_merge.size())
|
||||||
|
{
|
||||||
|
freed_ranks.push_back(ranks_to_merge[i + 1]);
|
||||||
|
cout << "Merging ranks " << ranks_to_merge[i] << " and " << ranks_to_merge[i + 1]
|
||||||
|
<< " -> keeping rank " << ranks_to_merge[i] << ", freeing rank " << ranks_to_merge[i + 1] << endl;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
cout << "\nFreed ranks available for split blocks: ";
|
||||||
|
for (int r : freed_ranks)
|
||||||
|
cout << r << " ";
|
||||||
|
cout << endl;
|
||||||
|
|
||||||
|
// Step 6: Assign new ranks to split blocks
|
||||||
|
int freed_idx = 0;
|
||||||
|
for (size_t i = 0; i < blocks_to_split.size(); i++)
|
||||||
|
{
|
||||||
|
BlockSplitInfo &info = blocks_to_split[i];
|
||||||
|
Block *original = info.original_block;
|
||||||
|
|
||||||
|
if (freed_idx < freed_ranks.size())
|
||||||
|
{
|
||||||
|
cout << "\nSplitting block at rank " << original->rank << endl;
|
||||||
|
cout << " First half: keeps rank " << original->rank << endl;
|
||||||
|
cout << " Second half: gets new rank " << freed_ranks[freed_idx] << endl;
|
||||||
|
freed_idx++;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
cout << "WARNING: Not enough freed ranks for all split blocks!" << endl;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
cout << "\n=== Load Balancing Analysis Complete ===" << endl;
|
||||||
|
cout << "Next steps:" << endl;
|
||||||
|
cout << " 1. Recompose the grid with new rank assignments" << endl;
|
||||||
|
cout << " 2. Data migration will be handled by recompose_cgh" << endl;
|
||||||
|
cout << " 3. Ghost zone communication will be updated automatically" << endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -11,7 +11,7 @@
|
|||||||
#include <cmath>
|
#include <cmath>
|
||||||
#include <new>
|
#include <new>
|
||||||
using namespace std;
|
using namespace std;
|
||||||
|
#include <memory.h>
|
||||||
#include "Parallel_bam.h"
|
#include "Parallel_bam.h"
|
||||||
#include "var.h"
|
#include "var.h"
|
||||||
#include "MPatch.h"
|
#include "MPatch.h"
|
||||||
@@ -32,12 +32,21 @@ namespace Parallel
|
|||||||
int partition2(int *nxy, int split_size, int *min_width, int cpusize, int *shape); // special for 2 diemnsions
|
int partition2(int *nxy, int split_size, int *min_width, int cpusize, int *shape); // special for 2 diemnsions
|
||||||
int partition3(int *nxyz, int split_size, int *min_width, int cpusize, int *shape);
|
int partition3(int *nxyz, int split_size, int *min_width, int cpusize, int *shape);
|
||||||
MyList<Block> *distribute(MyList<Patch> *PatchLIST, int cpusize, int ingfsi, int fngfs, bool periodic, int nodes = 0); // produce corresponding Blocks
|
MyList<Block> *distribute(MyList<Patch> *PatchLIST, int cpusize, int ingfsi, int fngfs, bool periodic, int nodes = 0); // produce corresponding Blocks
|
||||||
MyList<Block> *distribute_optimize(MyList<Patch> *PatchLIST, int cpusize, int ingfsi, int fngfs, bool periodic, int nodes = 0);
|
|
||||||
|
MyList<Block> *distribute_hard(MyList<Patch> *PatchLIST, int cpusize, int ingfsi, int fngfs, bool periodic, int nodes = 0); // produce corresponding Blocks
|
||||||
Block* splitHotspotBlock(MyList<Block>* &BlL, int _dim,
|
Block* splitHotspotBlock(MyList<Block>* &BlL, int _dim,
|
||||||
int ib0_orig, int ib3_orig,
|
int ib0_orig, int ib3_orig,
|
||||||
int jb1_orig, int jb4_orig,
|
int jb1_orig, int jb4_orig,
|
||||||
int kb2_orig, int kb5_orig,
|
int kb2_orig, int kb5_orig,
|
||||||
Patch* PP, int r_left, int r_right,
|
Patch* PP, int r_1, int r_2,
|
||||||
|
int ingfsi, int fngfsi, bool periodic,
|
||||||
|
Block* &split_first_block, Block* &split_last_block);
|
||||||
|
|
||||||
|
Block* splitHotspotBlock(MyList<Block>* &BlL, int _dim,
|
||||||
|
int ib0_orig, int ib3_orig,
|
||||||
|
int jb1_orig, int jb4_orig,
|
||||||
|
int kb2_orig, int kb5_orig,
|
||||||
|
Patch* PP, int r_1, int r_2, int r_3, int r_4,
|
||||||
int ingfsi, int fngfsi, bool periodic,
|
int ingfsi, int fngfsi, bool periodic,
|
||||||
Block* &split_first_block, Block* &split_last_block);
|
Block* &split_first_block, Block* &split_last_block);
|
||||||
Block* createMappedBlock(MyList<Block>* &BlL, int _dim, int* shape, double* bbox,
|
Block* createMappedBlock(MyList<Block>* &BlL, int _dim, int* shape, double* bbox,
|
||||||
@@ -218,6 +227,18 @@ namespace Parallel
|
|||||||
#if (PSTR == 1 || PSTR == 2 || PSTR == 3)
|
#if (PSTR == 1 || PSTR == 2 || PSTR == 3)
|
||||||
MyList<Block> *distribute(MyList<Patch> *PatchLIST, int cpusize, int ingfsi, int fngfsi,
|
MyList<Block> *distribute(MyList<Patch> *PatchLIST, int cpusize, int ingfsi, int fngfsi,
|
||||||
bool periodic, int start_rank, int end_rank, int nodes = 0);
|
bool periodic, int start_rank, int end_rank, int nodes = 0);
|
||||||
|
|
||||||
|
// Redistribute blocks with time statistics for load balancing
|
||||||
|
MyList<Block> *distribute(MyList<Patch> *PatchLIST, MyList<Block> *OldBlockL,
|
||||||
|
int cpusize, int ingfsi, int fngfsi,
|
||||||
|
bool periodic, int start_rank, int end_rank, int nodes = 0);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
// Dynamic load balancing: split blocks for heavy ranks
|
||||||
|
void split_heavy_blocks(MyList<Patch> *PatL, int *heavy_ranks, int num_heavy,
|
||||||
|
int split_factor, int cpusize, int ingfsi, int fngfsi);
|
||||||
|
|
||||||
|
// Check if load balancing is needed based on interpolation times
|
||||||
|
bool check_load_balance_need(double *rank_times, int nprocs, int &num_heavy, int *heavy_ranks);
|
||||||
}
|
}
|
||||||
#endif /*PARALLEL_H */
|
#endif /*PARALLEL_H */
|
||||||
|
|||||||
@@ -2426,9 +2426,9 @@ void bssn_class::RecursiveStep(int lev)
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if (REGLEV == 0)
|
#if (REGLEV == 0)
|
||||||
if (GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0,
|
GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0,
|
||||||
SynchList_cor, OldStateList, StateList, SynchList_pre,
|
SynchList_cor, OldStateList, StateList, SynchList_pre,
|
||||||
fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor))
|
fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor);
|
||||||
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
|
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
@@ -2605,9 +2605,9 @@ void bssn_class::ParallelStep()
|
|||||||
delete[] tporg;
|
delete[] tporg;
|
||||||
delete[] tporgo;
|
delete[] tporgo;
|
||||||
#if (REGLEV == 0)
|
#if (REGLEV == 0)
|
||||||
if (GH->Regrid_Onelevel(GH->mylev, Symmetry, BH_num, Porgbr, Porg0,
|
GH->Regrid_Onelevel(GH->mylev, Symmetry, BH_num, Porgbr, Porg0,
|
||||||
SynchList_cor, OldStateList, StateList, SynchList_pre,
|
SynchList_cor, OldStateList, StateList, SynchList_pre,
|
||||||
fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor))
|
fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor);
|
||||||
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
|
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
@@ -2772,9 +2772,9 @@ void bssn_class::ParallelStep()
|
|||||||
if (lev + 1 >= GH->movls)
|
if (lev + 1 >= GH->movls)
|
||||||
{
|
{
|
||||||
// GH->Regrid_Onelevel_aux(lev,Symmetry,BH_num,Porgbr,Porg0,
|
// GH->Regrid_Onelevel_aux(lev,Symmetry,BH_num,Porgbr,Porg0,
|
||||||
if (GH->Regrid_Onelevel(lev + 1, Symmetry, BH_num, Porgbr, Porg0,
|
GH->Regrid_Onelevel(lev + 1, Symmetry, BH_num, Porgbr, Porg0,
|
||||||
SynchList_cor, OldStateList, StateList, SynchList_pre,
|
SynchList_cor, OldStateList, StateList, SynchList_pre,
|
||||||
fgt(PhysTime - dT_levp1, StartTime, dT_levp1 / 2), ErrorMonitor))
|
fgt(PhysTime - dT_levp1, StartTime, dT_levp1 / 2), ErrorMonitor);
|
||||||
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
|
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
|
||||||
|
|
||||||
// a_stream.clear();
|
// a_stream.clear();
|
||||||
@@ -2787,9 +2787,9 @@ void bssn_class::ParallelStep()
|
|||||||
// for this level
|
// for this level
|
||||||
if (YN == 1)
|
if (YN == 1)
|
||||||
{
|
{
|
||||||
if (GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0,
|
GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0,
|
||||||
SynchList_cor, OldStateList, StateList, SynchList_pre,
|
SynchList_cor, OldStateList, StateList, SynchList_pre,
|
||||||
fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor))
|
fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor);
|
||||||
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
|
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
|
||||||
|
|
||||||
// a_stream.clear();
|
// a_stream.clear();
|
||||||
@@ -2806,9 +2806,9 @@ void bssn_class::ParallelStep()
|
|||||||
if (YN == 1)
|
if (YN == 1)
|
||||||
{
|
{
|
||||||
// GH->Regrid_Onelevel_aux(lev-2,Symmetry,BH_num,Porgbr,Porg0,
|
// GH->Regrid_Onelevel_aux(lev-2,Symmetry,BH_num,Porgbr,Porg0,
|
||||||
if (GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0,
|
GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0,
|
||||||
SynchList_cor, OldStateList, StateList, SynchList_pre,
|
SynchList_cor, OldStateList, StateList, SynchList_pre,
|
||||||
fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor))
|
fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor);
|
||||||
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
|
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
|
||||||
|
|
||||||
// a_stream.clear();
|
// a_stream.clear();
|
||||||
@@ -2822,9 +2822,9 @@ void bssn_class::ParallelStep()
|
|||||||
if (i % 4 == 3)
|
if (i % 4 == 3)
|
||||||
{
|
{
|
||||||
// GH->Regrid_Onelevel_aux(lev-2,Symmetry,BH_num,Porgbr,Porg0,
|
// GH->Regrid_Onelevel_aux(lev-2,Symmetry,BH_num,Porgbr,Porg0,
|
||||||
if (GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0,
|
GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0,
|
||||||
SynchList_cor, OldStateList, StateList, SynchList_pre,
|
SynchList_cor, OldStateList, StateList, SynchList_pre,
|
||||||
fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor))
|
fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor);
|
||||||
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
|
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
|
||||||
|
|
||||||
// a_stream.clear();
|
// a_stream.clear();
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
@@ -43,6 +43,14 @@ cgh::cgh(int ingfsi, int fngfsi, int Symmetry, char *filename, int checkrun,
|
|||||||
end_rank = 0;
|
end_rank = 0;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
// Initialize load balancing variables
|
||||||
|
enable_load_balance = false;
|
||||||
|
load_balance_check_interval = 10; // Check every 10 time steps
|
||||||
|
current_time_step = 0;
|
||||||
|
rank_interp_times = nullptr;
|
||||||
|
heavy_ranks = nullptr;
|
||||||
|
num_heavy_ranks = 0;
|
||||||
|
|
||||||
if (!checkrun)
|
if (!checkrun)
|
||||||
{
|
{
|
||||||
read_bbox(Symmetry, filename);
|
read_bbox(Symmetry, filename);
|
||||||
@@ -113,6 +121,12 @@ cgh::~cgh()
|
|||||||
delete[] Porgls[lev];
|
delete[] Porgls[lev];
|
||||||
}
|
}
|
||||||
delete[] Porgls;
|
delete[] Porgls;
|
||||||
|
|
||||||
|
// Clean up load balancing memory
|
||||||
|
if (rank_interp_times)
|
||||||
|
delete[] rank_interp_times;
|
||||||
|
if (heavy_ranks)
|
||||||
|
delete[] heavy_ranks;
|
||||||
}
|
}
|
||||||
|
|
||||||
//================================================================================================
|
//================================================================================================
|
||||||
@@ -130,11 +144,7 @@ void cgh::compose_cgh(int nprocs)
|
|||||||
for (int lev = 0; lev < levels; lev++)
|
for (int lev = 0; lev < levels; lev++)
|
||||||
{
|
{
|
||||||
checkPatchList(PatL[lev], false);
|
checkPatchList(PatL[lev], false);
|
||||||
#ifdef INTERP_LB_OPTIMIZE
|
Parallel::distribute_hard(PatL[lev], nprocs, ingfs, fngfs, false);
|
||||||
Parallel::distribute_optimize(PatL[lev], nprocs, ingfs, fngfs, false);
|
|
||||||
#else
|
|
||||||
Parallel::distribute(PatL[lev], nprocs, ingfs, fngfs, false);
|
|
||||||
#endif
|
|
||||||
#if (RPB == 1)
|
#if (RPB == 1)
|
||||||
// we need distributed box of PatL[lev] and PatL[lev-1]
|
// we need distributed box of PatL[lev] and PatL[lev-1]
|
||||||
if (lev > 0)
|
if (lev > 0)
|
||||||
@@ -1305,13 +1315,13 @@ bool cgh::Interp_One_Point(MyList<var> *VarList,
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
bool cgh::Regrid_Onelevel(int lev, int Symmetry, int BH_num, double **Porgbr, double **Porg0,
|
void cgh::Regrid_Onelevel(int lev, int Symmetry, int BH_num, double **Porgbr, double **Porg0,
|
||||||
MyList<var> *OldList, MyList<var> *StateList,
|
MyList<var> *OldList, MyList<var> *StateList,
|
||||||
MyList<var> *FutureList, MyList<var> *tmList, bool BB,
|
MyList<var> *FutureList, MyList<var> *tmList, bool BB,
|
||||||
monitor *ErrorMonitor)
|
monitor *ErrorMonitor)
|
||||||
{
|
{
|
||||||
if (lev < movls)
|
if (lev < movls)
|
||||||
return false;
|
return;
|
||||||
|
|
||||||
#if (0)
|
#if (0)
|
||||||
// #if (PSTR == 1 || PSTR == 2)
|
// #if (PSTR == 1 || PSTR == 2)
|
||||||
@@ -1400,7 +1410,7 @@ bool cgh::Regrid_Onelevel(int lev, int Symmetry, int BH_num, double **Porgbr, do
|
|||||||
for (bhi = 0; bhi < BH_num; bhi++)
|
for (bhi = 0; bhi < BH_num; bhi++)
|
||||||
delete[] tmpPorg[bhi];
|
delete[] tmpPorg[bhi];
|
||||||
delete[] tmpPorg;
|
delete[] tmpPorg;
|
||||||
return false;
|
return;
|
||||||
}
|
}
|
||||||
// x direction
|
// x direction
|
||||||
rr = (Porg0[bhi][0] - handle[lev][grd][0]) / dX;
|
rr = (Porg0[bhi][0] - handle[lev][grd][0]) / dX;
|
||||||
@@ -1504,7 +1514,6 @@ bool cgh::Regrid_Onelevel(int lev, int Symmetry, int BH_num, double **Porgbr, do
|
|||||||
for (int bhi = 0; bhi < BH_num; bhi++)
|
for (int bhi = 0; bhi < BH_num; bhi++)
|
||||||
delete[] tmpPorg[bhi];
|
delete[] tmpPorg[bhi];
|
||||||
delete[] tmpPorg;
|
delete[] tmpPorg;
|
||||||
return tot_flag;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -1710,3 +1719,121 @@ void cgh::settrfls(const int lev)
|
|||||||
{
|
{
|
||||||
trfls = lev;
|
trfls = lev;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//================================================================================================
|
||||||
|
// Load Balancing Functions
|
||||||
|
//================================================================================================
|
||||||
|
|
||||||
|
// Initialize load balancing
|
||||||
|
void cgh::init_load_balance(int nprocs)
|
||||||
|
{
|
||||||
|
if (rank_interp_times)
|
||||||
|
delete[] rank_interp_times;
|
||||||
|
if (heavy_ranks)
|
||||||
|
delete[] heavy_ranks;
|
||||||
|
|
||||||
|
rank_interp_times = new double[nprocs];
|
||||||
|
heavy_ranks = new int[4]; // Maximum 4 heavy ranks
|
||||||
|
num_heavy_ranks = 0;
|
||||||
|
|
||||||
|
for (int i = 0; i < nprocs; i++)
|
||||||
|
rank_interp_times[i] = 0.0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Update interpolation time for a rank
|
||||||
|
void cgh::update_interp_time(int rank, double time)
|
||||||
|
{
|
||||||
|
if (rank_interp_times && rank >= 0)
|
||||||
|
{
|
||||||
|
rank_interp_times[rank] = time;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check and perform load balancing if needed
|
||||||
|
bool cgh::check_and_rebalance(int nprocs, int lev,
|
||||||
|
MyList<var> *OldList, MyList<var> *StateList,
|
||||||
|
MyList<var> *FutureList, MyList<var> *tmList,
|
||||||
|
int Symmetry, bool BB)
|
||||||
|
{
|
||||||
|
int myrank;
|
||||||
|
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
|
||||||
|
|
||||||
|
// Only check at specified intervals
|
||||||
|
current_time_step++;
|
||||||
|
if (current_time_step % load_balance_check_interval != 0)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
if (myrank == 0)
|
||||||
|
{
|
||||||
|
cout << "\n=== Checking load balance at time step " << current_time_step << " ===" << endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Collect all rank times on rank 0
|
||||||
|
double *all_times = nullptr;
|
||||||
|
if (myrank == 0)
|
||||||
|
{
|
||||||
|
all_times = new double[nprocs];
|
||||||
|
}
|
||||||
|
|
||||||
|
MPI_Gather(rank_interp_times, 1, MPI_DOUBLE, all_times, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
|
||||||
|
|
||||||
|
bool need_rebalance = false;
|
||||||
|
|
||||||
|
if (myrank == 0)
|
||||||
|
{
|
||||||
|
// Check if load balancing is needed
|
||||||
|
need_rebalance = Parallel::check_load_balance_need(all_times, nprocs, num_heavy_ranks, heavy_ranks);
|
||||||
|
|
||||||
|
if (need_rebalance)
|
||||||
|
{
|
||||||
|
cout << "=== Load imbalance detected! Need to rebalance ===" << endl;
|
||||||
|
cout << "Top " << num_heavy_ranks << " heavy ranks: ";
|
||||||
|
for (int i = 0; i < num_heavy_ranks; i++)
|
||||||
|
{
|
||||||
|
cout << heavy_ranks[i] << " (" << all_times[heavy_ranks[i]] << " s) ";
|
||||||
|
}
|
||||||
|
cout << endl;
|
||||||
|
|
||||||
|
// Analyze blocks that need to be split
|
||||||
|
Parallel::split_heavy_blocks(PatL[lev], heavy_ranks, num_heavy_ranks, 2, nprocs, ingfs, fngfs);
|
||||||
|
|
||||||
|
// Set lev_flag to trigger recompose_cgh
|
||||||
|
cout << "=== Triggering recompose_cgh for level " << lev << " ===" << endl;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
cout << "=== Load is balanced, no rebalancing needed ===" << endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
delete[] all_times;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Broadcast the decision to all ranks
|
||||||
|
MPI_Bcast(&need_rebalance, 1, MPI_C_BOOL, 0, MPI_COMM_WORLD);
|
||||||
|
|
||||||
|
if (need_rebalance)
|
||||||
|
{
|
||||||
|
// Broadcast heavy ranks information
|
||||||
|
MPI_Bcast(&num_heavy_ranks, 1, MPI_INT, 0, MPI_COMM_WORLD);
|
||||||
|
MPI_Bcast(heavy_ranks, num_heavy_ranks, MPI_INT, 0, MPI_COMM_WORLD);
|
||||||
|
|
||||||
|
// Perform recompose_cgh on the specified level
|
||||||
|
if (myrank == 0)
|
||||||
|
{
|
||||||
|
cout << "=== Performing recompose_cgh ===" << endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Call recompose_cgh_Onelevel for the specified level
|
||||||
|
bool *lev_flag = new bool[1];
|
||||||
|
lev_flag[0] = true;
|
||||||
|
recompose_cgh_Onelevel(nprocs, lev, OldList, StateList, FutureList, tmList, Symmetry, BB);
|
||||||
|
delete[] lev_flag;
|
||||||
|
|
||||||
|
// Reset time counter after rebalancing
|
||||||
|
current_time_step = 0;
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|||||||
@@ -74,7 +74,7 @@ public:
|
|||||||
MyList<var> *OldList, MyList<var> *StateList,
|
MyList<var> *OldList, MyList<var> *StateList,
|
||||||
MyList<var> *FutureList, MyList<var> *tmList,
|
MyList<var> *FutureList, MyList<var> *tmList,
|
||||||
int Symmetry, bool BB);
|
int Symmetry, bool BB);
|
||||||
bool Regrid_Onelevel(int lev, int Symmetry, int BH_num, double **Porgbr, double **Porg0,
|
void Regrid_Onelevel(int lev, int Symmetry, int BH_num, double **Porgbr, double **Porg0,
|
||||||
MyList<var> *OldList, MyList<var> *StateList,
|
MyList<var> *OldList, MyList<var> *StateList,
|
||||||
MyList<var> *FutureList, MyList<var> *tmList, bool BB,
|
MyList<var> *FutureList, MyList<var> *tmList, bool BB,
|
||||||
monitor *ErrorMonitor);
|
monitor *ErrorMonitor);
|
||||||
@@ -87,6 +87,21 @@ public:
|
|||||||
#if (PSTR == 1 || PSTR == 2 || PSTR == 3)
|
#if (PSTR == 1 || PSTR == 2 || PSTR == 3)
|
||||||
void construct_mylev(int nprocs);
|
void construct_mylev(int nprocs);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
// Load balancing support
|
||||||
|
bool enable_load_balance; // Enable load balancing
|
||||||
|
int load_balance_check_interval; // Check interval (in time steps)
|
||||||
|
int current_time_step; // Current time step counter
|
||||||
|
double *rank_interp_times; // Store interpolation times for each rank
|
||||||
|
int *heavy_ranks; // Store heavy rank numbers
|
||||||
|
int num_heavy_ranks; // Number of heavy ranks
|
||||||
|
|
||||||
|
void init_load_balance(int nprocs);
|
||||||
|
void update_interp_time(int rank, double time);
|
||||||
|
bool check_and_rebalance(int nprocs, int lev,
|
||||||
|
MyList<var> *OldList, MyList<var> *StateList,
|
||||||
|
MyList<var> *FutureList, MyList<var> *tmList,
|
||||||
|
int Symmetry, bool BB);
|
||||||
};
|
};
|
||||||
|
|
||||||
#endif /* CGH_H */
|
#endif /* CGH_H */
|
||||||
|
|||||||
@@ -1,268 +0,0 @@
|
|||||||
#include "tool.h"
|
|
||||||
void fdderivs(const int ex[3],
|
|
||||||
const double *f,
|
|
||||||
double *fxx, double *fxy, double *fxz,
|
|
||||||
double *fyy, double *fyz, double *fzz,
|
|
||||||
const double *X, const double *Y, const double *Z,
|
|
||||||
double SYM1, double SYM2, double SYM3,
|
|
||||||
int Symmetry, int onoff)
|
|
||||||
{
|
|
||||||
(void)onoff;
|
|
||||||
|
|
||||||
const int NO_SYMM = 0, EQ_SYMM = 1;
|
|
||||||
const double ZEO = 0.0, ONE = 1.0, TWO = 2.0;
|
|
||||||
const double F1o4 = 2.5e-1; // 1/4
|
|
||||||
const double F8 = 8.0;
|
|
||||||
const double F16 = 16.0;
|
|
||||||
const double F30 = 30.0;
|
|
||||||
const double F1o12 = ONE / 12.0;
|
|
||||||
const double F1o144 = ONE / 144.0;
|
|
||||||
|
|
||||||
const int ex1 = ex[0], ex2 = ex[1], ex3 = ex[2];
|
|
||||||
|
|
||||||
const double dX = X[1] - X[0];
|
|
||||||
const double dY = Y[1] - Y[0];
|
|
||||||
const double dZ = Z[1] - Z[0];
|
|
||||||
|
|
||||||
const int imaxF = ex1;
|
|
||||||
const int jmaxF = ex2;
|
|
||||||
const int kmaxF = ex3;
|
|
||||||
|
|
||||||
int iminF = 1, jminF = 1, kminF = 1;
|
|
||||||
if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -1;
|
|
||||||
if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -1;
|
|
||||||
if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -1;
|
|
||||||
|
|
||||||
const double SoA[3] = { SYM1, SYM2, SYM3 };
|
|
||||||
|
|
||||||
/* fh: (ex1+2)*(ex2+2)*(ex3+2) because ord=2 */
|
|
||||||
const size_t nx = (size_t)ex1 + 2;
|
|
||||||
const size_t ny = (size_t)ex2 + 2;
|
|
||||||
const size_t nz = (size_t)ex3 + 2;
|
|
||||||
const size_t fh_size = nx * ny * nz;
|
|
||||||
|
|
||||||
static double *fh = NULL;
|
|
||||||
static size_t cap = 0;
|
|
||||||
|
|
||||||
if (fh_size > cap) {
|
|
||||||
free(fh);
|
|
||||||
fh = (double*)aligned_alloc(64, fh_size * sizeof(double));
|
|
||||||
cap = fh_size;
|
|
||||||
}
|
|
||||||
// double *fh = (double*)malloc(fh_size * sizeof(double));
|
|
||||||
if (!fh) return;
|
|
||||||
|
|
||||||
symmetry_bd(2, ex, f, fh, SoA);
|
|
||||||
|
|
||||||
/* 系数:按 Fortran 原式 */
|
|
||||||
const double Sdxdx = ONE / (dX * dX);
|
|
||||||
const double Sdydy = ONE / (dY * dY);
|
|
||||||
const double Sdzdz = ONE / (dZ * dZ);
|
|
||||||
|
|
||||||
const double Fdxdx = F1o12 / (dX * dX);
|
|
||||||
const double Fdydy = F1o12 / (dY * dY);
|
|
||||||
const double Fdzdz = F1o12 / (dZ * dZ);
|
|
||||||
|
|
||||||
const double Sdxdy = F1o4 / (dX * dY);
|
|
||||||
const double Sdxdz = F1o4 / (dX * dZ);
|
|
||||||
const double Sdydz = F1o4 / (dY * dZ);
|
|
||||||
|
|
||||||
const double Fdxdy = F1o144 / (dX * dY);
|
|
||||||
const double Fdxdz = F1o144 / (dX * dZ);
|
|
||||||
const double Fdydz = F1o144 / (dY * dZ);
|
|
||||||
|
|
||||||
/* 输出清零:fxx,fyy,fzz,fxy,fxz,fyz = 0 */
|
|
||||||
const size_t all = (size_t)ex1 * (size_t)ex2 * (size_t)ex3;
|
|
||||||
for (size_t p = 0; p < all; ++p) {
|
|
||||||
fxx[p] = ZEO; fyy[p] = ZEO; fzz[p] = ZEO;
|
|
||||||
fxy[p] = ZEO; fxz[p] = ZEO; fyz[p] = ZEO;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Fortran:
|
|
||||||
* do k=1,ex3-1
|
|
||||||
* do j=1,ex2-1
|
|
||||||
* do i=1,ex1-1
|
|
||||||
*/
|
|
||||||
|
|
||||||
for (int k0 = 0; k0 <= ex3 - 2; ++k0) {
|
|
||||||
const int kF = k0 + 1;
|
|
||||||
for (int j0 = 0; j0 <= ex2 - 2; ++j0) {
|
|
||||||
const int jF = j0 + 1;
|
|
||||||
for (int i0 = 0; i0 <= ex1 - 2; ++i0) {
|
|
||||||
const int iF = i0 + 1;
|
|
||||||
const size_t p = idx_ex(i0, j0, k0, ex);
|
|
||||||
|
|
||||||
/* 高阶分支:i±2,j±2,k±2 都在范围内 */
|
|
||||||
if ((iF + 2) <= imaxF && (iF - 2) >= iminF &&
|
|
||||||
(jF + 2) <= jmaxF && (jF - 2) >= jminF &&
|
|
||||||
(kF + 2) <= kmaxF && (kF - 2) >= kminF)
|
|
||||||
{
|
|
||||||
fxx[p] = Fdxdx * (
|
|
||||||
-fh[idx_fh_F_ord2(iF - 2, jF, kF, ex)] +
|
|
||||||
F16 * fh[idx_fh_F_ord2(iF - 1, jF, kF, ex)] -
|
|
||||||
F30 * fh[idx_fh_F_ord2(iF, jF, kF, ex)] -
|
|
||||||
fh[idx_fh_F_ord2(iF + 2, jF, kF, ex)] +
|
|
||||||
F16 * fh[idx_fh_F_ord2(iF + 1, jF, kF, ex)]
|
|
||||||
);
|
|
||||||
|
|
||||||
fyy[p] = Fdydy * (
|
|
||||||
-fh[idx_fh_F_ord2(iF, jF - 2, kF, ex)] +
|
|
||||||
F16 * fh[idx_fh_F_ord2(iF, jF - 1, kF, ex)] -
|
|
||||||
F30 * fh[idx_fh_F_ord2(iF, jF, kF, ex)] -
|
|
||||||
fh[idx_fh_F_ord2(iF, jF + 2, kF, ex)] +
|
|
||||||
F16 * fh[idx_fh_F_ord2(iF, jF + 1, kF, ex)]
|
|
||||||
);
|
|
||||||
|
|
||||||
fzz[p] = Fdzdz * (
|
|
||||||
-fh[idx_fh_F_ord2(iF, jF, kF - 2, ex)] +
|
|
||||||
F16 * fh[idx_fh_F_ord2(iF, jF, kF - 1, ex)] -
|
|
||||||
F30 * fh[idx_fh_F_ord2(iF, jF, kF, ex)] -
|
|
||||||
fh[idx_fh_F_ord2(iF, jF, kF + 2, ex)] +
|
|
||||||
F16 * fh[idx_fh_F_ord2(iF, jF, kF + 1, ex)]
|
|
||||||
);
|
|
||||||
|
|
||||||
/* fxy 高阶:完全照搬 Fortran 的括号结构 */
|
|
||||||
{
|
|
||||||
const double t_jm2 =
|
|
||||||
( fh[idx_fh_F_ord2(iF - 2, jF - 2, kF, ex)]
|
|
||||||
-F8*fh[idx_fh_F_ord2(iF - 1, jF - 2, kF, ex)]
|
|
||||||
+F8*fh[idx_fh_F_ord2(iF + 1, jF - 2, kF, ex)]
|
|
||||||
- fh[idx_fh_F_ord2(iF + 2, jF - 2, kF, ex)] );
|
|
||||||
|
|
||||||
const double t_jm1 =
|
|
||||||
( fh[idx_fh_F_ord2(iF - 2, jF - 1, kF, ex)]
|
|
||||||
-F8*fh[idx_fh_F_ord2(iF - 1, jF - 1, kF, ex)]
|
|
||||||
+F8*fh[idx_fh_F_ord2(iF + 1, jF - 1, kF, ex)]
|
|
||||||
- fh[idx_fh_F_ord2(iF + 2, jF - 1, kF, ex)] );
|
|
||||||
|
|
||||||
const double t_jp1 =
|
|
||||||
( fh[idx_fh_F_ord2(iF - 2, jF + 1, kF, ex)]
|
|
||||||
-F8*fh[idx_fh_F_ord2(iF - 1, jF + 1, kF, ex)]
|
|
||||||
+F8*fh[idx_fh_F_ord2(iF + 1, jF + 1, kF, ex)]
|
|
||||||
- fh[idx_fh_F_ord2(iF + 2, jF + 1, kF, ex)] );
|
|
||||||
|
|
||||||
const double t_jp2 =
|
|
||||||
( fh[idx_fh_F_ord2(iF - 2, jF + 2, kF, ex)]
|
|
||||||
-F8*fh[idx_fh_F_ord2(iF - 1, jF + 2, kF, ex)]
|
|
||||||
+F8*fh[idx_fh_F_ord2(iF + 1, jF + 2, kF, ex)]
|
|
||||||
- fh[idx_fh_F_ord2(iF + 2, jF + 2, kF, ex)] );
|
|
||||||
|
|
||||||
fxy[p] = Fdxdy * ( t_jm2 - F8 * t_jm1 + F8 * t_jp1 - t_jp2 );
|
|
||||||
}
|
|
||||||
|
|
||||||
/* fxz 高阶 */
|
|
||||||
{
|
|
||||||
const double t_km2 =
|
|
||||||
( fh[idx_fh_F_ord2(iF - 2, jF, kF - 2, ex)]
|
|
||||||
-F8*fh[idx_fh_F_ord2(iF - 1, jF, kF - 2, ex)]
|
|
||||||
+F8*fh[idx_fh_F_ord2(iF + 1, jF, kF - 2, ex)]
|
|
||||||
- fh[idx_fh_F_ord2(iF + 2, jF, kF - 2, ex)] );
|
|
||||||
|
|
||||||
const double t_km1 =
|
|
||||||
( fh[idx_fh_F_ord2(iF - 2, jF, kF - 1, ex)]
|
|
||||||
-F8*fh[idx_fh_F_ord2(iF - 1, jF, kF - 1, ex)]
|
|
||||||
+F8*fh[idx_fh_F_ord2(iF + 1, jF, kF - 1, ex)]
|
|
||||||
- fh[idx_fh_F_ord2(iF + 2, jF, kF - 1, ex)] );
|
|
||||||
|
|
||||||
const double t_kp1 =
|
|
||||||
( fh[idx_fh_F_ord2(iF - 2, jF, kF + 1, ex)]
|
|
||||||
-F8*fh[idx_fh_F_ord2(iF - 1, jF, kF + 1, ex)]
|
|
||||||
+F8*fh[idx_fh_F_ord2(iF + 1, jF, kF + 1, ex)]
|
|
||||||
- fh[idx_fh_F_ord2(iF + 2, jF, kF + 1, ex)] );
|
|
||||||
|
|
||||||
const double t_kp2 =
|
|
||||||
( fh[idx_fh_F_ord2(iF - 2, jF, kF + 2, ex)]
|
|
||||||
-F8*fh[idx_fh_F_ord2(iF - 1, jF, kF + 2, ex)]
|
|
||||||
+F8*fh[idx_fh_F_ord2(iF + 1, jF, kF + 2, ex)]
|
|
||||||
- fh[idx_fh_F_ord2(iF + 2, jF, kF + 2, ex)] );
|
|
||||||
|
|
||||||
fxz[p] = Fdxdz * ( t_km2 - F8 * t_km1 + F8 * t_kp1 - t_kp2 );
|
|
||||||
}
|
|
||||||
|
|
||||||
/* fyz 高阶 */
|
|
||||||
{
|
|
||||||
const double t_km2 =
|
|
||||||
( fh[idx_fh_F_ord2(iF, jF - 2, kF - 2, ex)]
|
|
||||||
-F8*fh[idx_fh_F_ord2(iF, jF - 1, kF - 2, ex)]
|
|
||||||
+F8*fh[idx_fh_F_ord2(iF, jF + 1, kF - 2, ex)]
|
|
||||||
- fh[idx_fh_F_ord2(iF, jF + 2, kF - 2, ex)] );
|
|
||||||
|
|
||||||
const double t_km1 =
|
|
||||||
( fh[idx_fh_F_ord2(iF, jF - 2, kF - 1, ex)]
|
|
||||||
-F8*fh[idx_fh_F_ord2(iF, jF - 1, kF - 1, ex)]
|
|
||||||
+F8*fh[idx_fh_F_ord2(iF, jF + 1, kF - 1, ex)]
|
|
||||||
- fh[idx_fh_F_ord2(iF, jF + 2, kF - 1, ex)] );
|
|
||||||
|
|
||||||
const double t_kp1 =
|
|
||||||
( fh[idx_fh_F_ord2(iF, jF - 2, kF + 1, ex)]
|
|
||||||
-F8*fh[idx_fh_F_ord2(iF, jF - 1, kF + 1, ex)]
|
|
||||||
+F8*fh[idx_fh_F_ord2(iF, jF + 1, kF + 1, ex)]
|
|
||||||
- fh[idx_fh_F_ord2(iF, jF + 2, kF + 1, ex)] );
|
|
||||||
|
|
||||||
const double t_kp2 =
|
|
||||||
( fh[idx_fh_F_ord2(iF, jF - 2, kF + 2, ex)]
|
|
||||||
-F8*fh[idx_fh_F_ord2(iF, jF - 1, kF + 2, ex)]
|
|
||||||
+F8*fh[idx_fh_F_ord2(iF, jF + 1, kF + 2, ex)]
|
|
||||||
- fh[idx_fh_F_ord2(iF, jF + 2, kF + 2, ex)] );
|
|
||||||
|
|
||||||
fyz[p] = Fdydz * ( t_km2 - F8 * t_km1 + F8 * t_kp1 - t_kp2 );
|
|
||||||
}
|
|
||||||
}
|
|
||||||
/* 二阶分支:i±1,j±1,k±1 在范围内 */
|
|
||||||
else if ((iF + 1) <= imaxF && (iF - 1) >= iminF &&
|
|
||||||
(jF + 1) <= jmaxF && (jF - 1) >= jminF &&
|
|
||||||
(kF + 1) <= kmaxF && (kF - 1) >= kminF)
|
|
||||||
{
|
|
||||||
fxx[p] = Sdxdx * (
|
|
||||||
fh[idx_fh_F_ord2(iF - 1, jF, kF, ex)] -
|
|
||||||
TWO * fh[idx_fh_F_ord2(iF, jF, kF, ex)] +
|
|
||||||
fh[idx_fh_F_ord2(iF + 1, jF, kF, ex)]
|
|
||||||
);
|
|
||||||
|
|
||||||
fyy[p] = Sdydy * (
|
|
||||||
fh[idx_fh_F_ord2(iF, jF - 1, kF, ex)] -
|
|
||||||
TWO * fh[idx_fh_F_ord2(iF, jF, kF, ex)] +
|
|
||||||
fh[idx_fh_F_ord2(iF, jF + 1, kF, ex)]
|
|
||||||
);
|
|
||||||
|
|
||||||
fzz[p] = Sdzdz * (
|
|
||||||
fh[idx_fh_F_ord2(iF, jF, kF - 1, ex)] -
|
|
||||||
TWO * fh[idx_fh_F_ord2(iF, jF, kF, ex)] +
|
|
||||||
fh[idx_fh_F_ord2(iF, jF, kF + 1, ex)]
|
|
||||||
);
|
|
||||||
|
|
||||||
fxy[p] = Sdxdy * (
|
|
||||||
fh[idx_fh_F_ord2(iF - 1, jF - 1, kF, ex)] -
|
|
||||||
fh[idx_fh_F_ord2(iF + 1, jF - 1, kF, ex)] -
|
|
||||||
fh[idx_fh_F_ord2(iF - 1, jF + 1, kF, ex)] +
|
|
||||||
fh[idx_fh_F_ord2(iF + 1, jF + 1, kF, ex)]
|
|
||||||
);
|
|
||||||
|
|
||||||
fxz[p] = Sdxdz * (
|
|
||||||
fh[idx_fh_F_ord2(iF - 1, jF, kF - 1, ex)] -
|
|
||||||
fh[idx_fh_F_ord2(iF + 1, jF, kF - 1, ex)] -
|
|
||||||
fh[idx_fh_F_ord2(iF - 1, jF, kF + 1, ex)] +
|
|
||||||
fh[idx_fh_F_ord2(iF + 1, jF, kF + 1, ex)]
|
|
||||||
);
|
|
||||||
|
|
||||||
fyz[p] = Sdydz * (
|
|
||||||
fh[idx_fh_F_ord2(iF, jF - 1, kF - 1, ex)] -
|
|
||||||
fh[idx_fh_F_ord2(iF, jF + 1, kF - 1, ex)] -
|
|
||||||
fh[idx_fh_F_ord2(iF, jF - 1, kF + 1, ex)] +
|
|
||||||
fh[idx_fh_F_ord2(iF, jF + 1, kF + 1, ex)]
|
|
||||||
);
|
|
||||||
}else{
|
|
||||||
fxx[p] = 0.0;
|
|
||||||
fyy[p] = 0.0;
|
|
||||||
fzz[p] = 0.0;
|
|
||||||
fxy[p] = 0.0;
|
|
||||||
fxz[p] = 0.0;
|
|
||||||
fyz[p] = 0.0;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// free(fh);
|
|
||||||
}
|
|
||||||
@@ -1,150 +0,0 @@
|
|||||||
#include "tool.h"
|
|
||||||
|
|
||||||
/*
|
|
||||||
* C 版 fderivs
|
|
||||||
*
|
|
||||||
* Fortran:
|
|
||||||
* subroutine fderivs(ex,f,fx,fy,fz,X,Y,Z,SYM1,SYM2,SYM3,symmetry,onoff)
|
|
||||||
*
|
|
||||||
* 约定:
|
|
||||||
* f, fx, fy, fz: ex1*ex2*ex3,按 idx_ex 布局
|
|
||||||
* X: ex1, Y: ex2, Z: ex3
|
|
||||||
*/
|
|
||||||
void fderivs(const int ex[3],
|
|
||||||
const double *f,
|
|
||||||
double *fx, double *fy, double *fz,
|
|
||||||
const double *X, const double *Y, const double *Z,
|
|
||||||
double SYM1, double SYM2, double SYM3,
|
|
||||||
int Symmetry, int onoff)
|
|
||||||
{
|
|
||||||
(void)onoff; // Fortran 里没用到
|
|
||||||
|
|
||||||
const double ZEO = 0.0, ONE = 1.0;
|
|
||||||
const double TWO = 2.0, EIT = 8.0;
|
|
||||||
const double F12 = 12.0;
|
|
||||||
|
|
||||||
const int NO_SYMM = 0, EQ_SYMM = 1; // OCTANT=2 在本子程序里不直接用
|
|
||||||
|
|
||||||
const int ex1 = ex[0], ex2 = ex[1], ex3 = ex[2];
|
|
||||||
|
|
||||||
// dX = X(2)-X(1) -> C: X[1]-X[0]
|
|
||||||
const double dX = X[1] - X[0];
|
|
||||||
const double dY = Y[1] - Y[0];
|
|
||||||
const double dZ = Z[1] - Z[0];
|
|
||||||
|
|
||||||
// Fortran 1-based bounds
|
|
||||||
const int imaxF = ex1;
|
|
||||||
const int jmaxF = ex2;
|
|
||||||
const int kmaxF = ex3;
|
|
||||||
|
|
||||||
int iminF = 1, jminF = 1, kminF = 1;
|
|
||||||
if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -1;
|
|
||||||
if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -1;
|
|
||||||
if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -1;
|
|
||||||
|
|
||||||
// SoA(1:3) = SYM1,SYM2,SYM3
|
|
||||||
const double SoA[3] = { SYM1, SYM2, SYM3 };
|
|
||||||
|
|
||||||
// fh: (ex1+2)*(ex2+2)*(ex3+2) because ord=2
|
|
||||||
const size_t nx = (size_t)ex1 + 2;
|
|
||||||
const size_t ny = (size_t)ex2 + 2;
|
|
||||||
const size_t nz = (size_t)ex3 + 2;
|
|
||||||
const size_t fh_size = nx * ny * nz;
|
|
||||||
static double *fh = NULL;
|
|
||||||
static size_t cap = 0;
|
|
||||||
|
|
||||||
if (fh_size > cap) {
|
|
||||||
free(fh);
|
|
||||||
fh = (double*)aligned_alloc(64, fh_size * sizeof(double));
|
|
||||||
cap = fh_size;
|
|
||||||
}
|
|
||||||
// double *fh = (double*)malloc(fh_size * sizeof(double));
|
|
||||||
if (!fh) return;
|
|
||||||
|
|
||||||
// call symmetry_bd(2,ex,f,fh,SoA)
|
|
||||||
symmetry_bd(2, ex, f, fh, SoA);
|
|
||||||
|
|
||||||
const double d12dx = ONE / F12 / dX;
|
|
||||||
const double d12dy = ONE / F12 / dY;
|
|
||||||
const double d12dz = ONE / F12 / dZ;
|
|
||||||
|
|
||||||
const double d2dx = ONE / TWO / dX;
|
|
||||||
const double d2dy = ONE / TWO / dY;
|
|
||||||
const double d2dz = ONE / TWO / dZ;
|
|
||||||
|
|
||||||
// fx = fy = fz = 0
|
|
||||||
const size_t all = (size_t)ex1 * (size_t)ex2 * (size_t)ex3;
|
|
||||||
for (size_t p = 0; p < all; ++p) {
|
|
||||||
fx[p] = ZEO;
|
|
||||||
fy[p] = ZEO;
|
|
||||||
fz[p] = ZEO;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Fortran loops:
|
|
||||||
* do k=1,ex3-1
|
|
||||||
* do j=1,ex2-1
|
|
||||||
* do i=1,ex1-1
|
|
||||||
*
|
|
||||||
* C: k0=0..ex3-2, j0=0..ex2-2, i0=0..ex1-2
|
|
||||||
*/
|
|
||||||
for (int k0 = 0; k0 <= ex3 - 2; ++k0) {
|
|
||||||
const int kF = k0 + 1;
|
|
||||||
for (int j0 = 0; j0 <= ex2 - 2; ++j0) {
|
|
||||||
const int jF = j0 + 1;
|
|
||||||
for (int i0 = 0; i0 <= ex1 - 2; ++i0) {
|
|
||||||
const int iF = i0 + 1;
|
|
||||||
const size_t p = idx_ex(i0, j0, k0, ex);
|
|
||||||
|
|
||||||
// if(i+2 <= imax .and. i-2 >= imin ... ) (全是 Fortran 索引)
|
|
||||||
if ((iF + 2) <= imaxF && (iF - 2) >= iminF &&
|
|
||||||
(jF + 2) <= jmaxF && (jF - 2) >= jminF &&
|
|
||||||
(kF + 2) <= kmaxF && (kF - 2) >= kminF)
|
|
||||||
{
|
|
||||||
fx[p] = d12dx * (
|
|
||||||
fh[idx_fh_F_ord2(iF - 2, jF, kF, ex)] -
|
|
||||||
EIT * fh[idx_fh_F_ord2(iF - 1, jF, kF, ex)] +
|
|
||||||
EIT * fh[idx_fh_F_ord2(iF + 1, jF, kF, ex)] -
|
|
||||||
fh[idx_fh_F_ord2(iF + 2, jF, kF, ex)]
|
|
||||||
);
|
|
||||||
|
|
||||||
fy[p] = d12dy * (
|
|
||||||
fh[idx_fh_F_ord2(iF, jF - 2, kF, ex)] -
|
|
||||||
EIT * fh[idx_fh_F_ord2(iF, jF - 1, kF, ex)] +
|
|
||||||
EIT * fh[idx_fh_F_ord2(iF, jF + 1, kF, ex)] -
|
|
||||||
fh[idx_fh_F_ord2(iF, jF + 2, kF, ex)]
|
|
||||||
);
|
|
||||||
|
|
||||||
fz[p] = d12dz * (
|
|
||||||
fh[idx_fh_F_ord2(iF, jF, kF - 2, ex)] -
|
|
||||||
EIT * fh[idx_fh_F_ord2(iF, jF, kF - 1, ex)] +
|
|
||||||
EIT * fh[idx_fh_F_ord2(iF, jF, kF + 1, ex)] -
|
|
||||||
fh[idx_fh_F_ord2(iF, jF, kF + 2, ex)]
|
|
||||||
);
|
|
||||||
}
|
|
||||||
// elseif(i+1 <= imax .and. i-1 >= imin ...)
|
|
||||||
else if ((iF + 1) <= imaxF && (iF - 1) >= iminF &&
|
|
||||||
(jF + 1) <= jmaxF && (jF - 1) >= jminF &&
|
|
||||||
(kF + 1) <= kmaxF && (kF - 1) >= kminF)
|
|
||||||
{
|
|
||||||
fx[p] = d2dx * (
|
|
||||||
-fh[idx_fh_F_ord2(iF - 1, jF, kF, ex)] +
|
|
||||||
fh[idx_fh_F_ord2(iF + 1, jF, kF, ex)]
|
|
||||||
);
|
|
||||||
|
|
||||||
fy[p] = d2dy * (
|
|
||||||
-fh[idx_fh_F_ord2(iF, jF - 1, kF, ex)] +
|
|
||||||
fh[idx_fh_F_ord2(iF, jF + 1, kF, ex)]
|
|
||||||
);
|
|
||||||
|
|
||||||
fz[p] = d2dz * (
|
|
||||||
-fh[idx_fh_F_ord2(iF, jF, kF - 1, ex)] +
|
|
||||||
fh[idx_fh_F_ord2(iF, jF, kF + 1, ex)]
|
|
||||||
);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// free(fh);
|
|
||||||
}
|
|
||||||
@@ -1,107 +0,0 @@
|
|||||||
#include "interp_lb_profile.h"
|
|
||||||
#include <cstdio>
|
|
||||||
#include <cstring>
|
|
||||||
#include <algorithm>
|
|
||||||
|
|
||||||
namespace InterpLBProfile {
|
|
||||||
|
|
||||||
bool write_profile(const char *filepath, int nprocs,
|
|
||||||
const double *rank_times,
|
|
||||||
const int *heavy_ranks, int num_heavy,
|
|
||||||
double threshold_ratio)
|
|
||||||
{
|
|
||||||
FILE *fp = fopen(filepath, "wb");
|
|
||||||
if (!fp) return false;
|
|
||||||
|
|
||||||
ProfileHeader hdr;
|
|
||||||
hdr.magic = MAGIC;
|
|
||||||
hdr.version = VERSION;
|
|
||||||
hdr.nprocs = nprocs;
|
|
||||||
hdr.num_heavy = num_heavy;
|
|
||||||
hdr.threshold_ratio = threshold_ratio;
|
|
||||||
|
|
||||||
fwrite(&hdr, sizeof(hdr), 1, fp);
|
|
||||||
fwrite(rank_times, sizeof(double), nprocs, fp);
|
|
||||||
fwrite(heavy_ranks, sizeof(int), num_heavy, fp);
|
|
||||||
fclose(fp);
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool read_profile(const char *filepath, int current_nprocs,
|
|
||||||
int *heavy_ranks, int &num_heavy,
|
|
||||||
double *rank_times, MPI_Comm comm)
|
|
||||||
{
|
|
||||||
int myrank;
|
|
||||||
MPI_Comm_rank(comm, &myrank);
|
|
||||||
|
|
||||||
int valid = 0;
|
|
||||||
ProfileHeader hdr;
|
|
||||||
memset(&hdr, 0, sizeof(hdr));
|
|
||||||
|
|
||||||
if (myrank == 0) {
|
|
||||||
FILE *fp = fopen(filepath, "rb");
|
|
||||||
if (fp) {
|
|
||||||
if (fread(&hdr, sizeof(hdr), 1, fp) == 1 &&
|
|
||||||
hdr.magic == MAGIC && hdr.version == VERSION &&
|
|
||||||
hdr.nprocs == current_nprocs)
|
|
||||||
{
|
|
||||||
if (fread(rank_times, sizeof(double), current_nprocs, fp)
|
|
||||||
== (size_t)current_nprocs &&
|
|
||||||
fread(heavy_ranks, sizeof(int), hdr.num_heavy, fp)
|
|
||||||
== (size_t)hdr.num_heavy)
|
|
||||||
{
|
|
||||||
num_heavy = hdr.num_heavy;
|
|
||||||
valid = 1;
|
|
||||||
}
|
|
||||||
} else if (fp) {
|
|
||||||
printf("[InterpLB] Profile rejected: magic=0x%X version=%u "
|
|
||||||
"nprocs=%d (current=%d)\n",
|
|
||||||
hdr.magic, hdr.version, hdr.nprocs, current_nprocs);
|
|
||||||
}
|
|
||||||
fclose(fp);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
MPI_Bcast(&valid, 1, MPI_INT, 0, comm);
|
|
||||||
if (!valid) return false;
|
|
||||||
|
|
||||||
MPI_Bcast(&num_heavy, 1, MPI_INT, 0, comm);
|
|
||||||
MPI_Bcast(heavy_ranks, num_heavy, MPI_INT, 0, comm);
|
|
||||||
MPI_Bcast(rank_times, current_nprocs, MPI_DOUBLE, 0, comm);
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
int identify_heavy_ranks(const double *rank_times, int nprocs,
|
|
||||||
double threshold_ratio,
|
|
||||||
int *heavy_ranks, int max_heavy)
|
|
||||||
{
|
|
||||||
double sum = 0;
|
|
||||||
for (int i = 0; i < nprocs; i++) sum += rank_times[i];
|
|
||||||
double mean = sum / nprocs;
|
|
||||||
double threshold = threshold_ratio * mean;
|
|
||||||
|
|
||||||
// Collect candidates
|
|
||||||
struct RankTime { int rank; double time; };
|
|
||||||
RankTime *candidates = new RankTime[nprocs];
|
|
||||||
int ncand = 0;
|
|
||||||
|
|
||||||
for (int i = 0; i < nprocs; i++) {
|
|
||||||
if (rank_times[i] > threshold)
|
|
||||||
candidates[ncand++] = {i, rank_times[i]};
|
|
||||||
}
|
|
||||||
|
|
||||||
// Sort descending by time
|
|
||||||
std::sort(candidates, candidates + ncand,
|
|
||||||
[](const RankTime &a, const RankTime &b) {
|
|
||||||
return a.time > b.time;
|
|
||||||
});
|
|
||||||
|
|
||||||
int count = (ncand < max_heavy) ? ncand : max_heavy;
|
|
||||||
for (int i = 0; i < count; i++)
|
|
||||||
heavy_ranks[i] = candidates[i].rank;
|
|
||||||
|
|
||||||
delete[] candidates;
|
|
||||||
return count;
|
|
||||||
}
|
|
||||||
|
|
||||||
} // namespace InterpLBProfile
|
|
||||||
Binary file not shown.
@@ -1,38 +0,0 @@
|
|||||||
#ifndef INTERP_LB_PROFILE_H
|
|
||||||
#define INTERP_LB_PROFILE_H
|
|
||||||
|
|
||||||
#include <mpi.h>
|
|
||||||
|
|
||||||
namespace InterpLBProfile {
|
|
||||||
|
|
||||||
static const unsigned int MAGIC = 0x494C4250; // "ILBP"
|
|
||||||
static const unsigned int VERSION = 1;
|
|
||||||
|
|
||||||
struct ProfileHeader {
|
|
||||||
unsigned int magic;
|
|
||||||
unsigned int version;
|
|
||||||
int nprocs;
|
|
||||||
int num_heavy;
|
|
||||||
double threshold_ratio;
|
|
||||||
};
|
|
||||||
|
|
||||||
// Write profile file (rank 0 only)
|
|
||||||
bool write_profile(const char *filepath, int nprocs,
|
|
||||||
const double *rank_times,
|
|
||||||
const int *heavy_ranks, int num_heavy,
|
|
||||||
double threshold_ratio);
|
|
||||||
|
|
||||||
// Read profile file (rank 0 reads, then broadcasts to all)
|
|
||||||
// Returns true if file found and valid for current nprocs
|
|
||||||
bool read_profile(const char *filepath, int current_nprocs,
|
|
||||||
int *heavy_ranks, int &num_heavy,
|
|
||||||
double *rank_times, MPI_Comm comm);
|
|
||||||
|
|
||||||
// Identify heavy ranks: those with time > threshold_ratio * mean
|
|
||||||
int identify_heavy_ranks(const double *rank_times, int nprocs,
|
|
||||||
double threshold_ratio,
|
|
||||||
int *heavy_ranks, int max_heavy);
|
|
||||||
|
|
||||||
} // namespace InterpLBProfile
|
|
||||||
|
|
||||||
#endif /* INTERP_LB_PROFILE_H */
|
|
||||||
@@ -1,27 +0,0 @@
|
|||||||
/* Auto-generated from interp_lb_profile.bin — do not edit */
|
|
||||||
#ifndef INTERP_LB_PROFILE_DATA_H
|
|
||||||
#define INTERP_LB_PROFILE_DATA_H
|
|
||||||
|
|
||||||
#define INTERP_LB_NPROCS 64
|
|
||||||
#define INTERP_LB_NUM_HEAVY 4
|
|
||||||
|
|
||||||
static const int interp_lb_heavy_blocks[4] = {27, 35, 28, 36};
|
|
||||||
|
|
||||||
/* Split table: {block_id, r_left, r_right} */
|
|
||||||
static const int interp_lb_splits[4][3] = {
|
|
||||||
{27, 26, 27},
|
|
||||||
{35, 34, 35},
|
|
||||||
{28, 28, 29},
|
|
||||||
{36, 36, 37},
|
|
||||||
};
|
|
||||||
|
|
||||||
/* Rank remap for displaced neighbor blocks */
|
|
||||||
static const int interp_lb_num_remaps = 4;
|
|
||||||
static const int interp_lb_remaps[][2] = {
|
|
||||||
{26, 25},
|
|
||||||
{29, 30},
|
|
||||||
{34, 33},
|
|
||||||
{37, 38},
|
|
||||||
};
|
|
||||||
|
|
||||||
#endif /* INTERP_LB_PROFILE_DATA_H */
|
|
||||||
@@ -1,109 +0,0 @@
|
|||||||
#include "tool.h"
|
|
||||||
|
|
||||||
/*
|
|
||||||
* C 版 kodis
|
|
||||||
*
|
|
||||||
* Fortran signature:
|
|
||||||
* subroutine kodis(ex,X,Y,Z,f,f_rhs,SoA,Symmetry,eps)
|
|
||||||
*
|
|
||||||
* 约定:
|
|
||||||
* X: ex1, Y: ex2, Z: ex3
|
|
||||||
* f, f_rhs: ex1*ex2*ex3 按 idx_ex 布局
|
|
||||||
* SoA[3]
|
|
||||||
* eps: double
|
|
||||||
*/
|
|
||||||
void kodis(const int ex[3],
|
|
||||||
const double *X, const double *Y, const double *Z,
|
|
||||||
const double *f, double *f_rhs,
|
|
||||||
const double SoA[3],
|
|
||||||
int Symmetry, double eps)
|
|
||||||
{
|
|
||||||
const double ONE = 1.0, SIX = 6.0, FIT = 15.0, TWT = 20.0;
|
|
||||||
const double cof = 64.0; // 2^6
|
|
||||||
const int NO_SYMM = 0, OCTANT = 2;
|
|
||||||
|
|
||||||
const int ex1 = ex[0], ex2 = ex[1], ex3 = ex[2];
|
|
||||||
|
|
||||||
// Fortran: dX = X(2)-X(1) -> C: X[1]-X[0]
|
|
||||||
const double dX = X[1] - X[0];
|
|
||||||
const double dY = Y[1] - Y[0];
|
|
||||||
const double dZ = Z[1] - Z[0];
|
|
||||||
(void)ONE; // ONE 在原 Fortran 里只是参数,这里不一定用得上
|
|
||||||
|
|
||||||
// Fortran: imax=ex(1) 等是 1-based 上界
|
|
||||||
const int imaxF = ex1;
|
|
||||||
const int jmaxF = ex2;
|
|
||||||
const int kmaxF = ex3;
|
|
||||||
|
|
||||||
// Fortran: imin=jmin=kmin=1,某些对称情况变 -2
|
|
||||||
int iminF = 1, jminF = 1, kminF = 1;
|
|
||||||
|
|
||||||
if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -2;
|
|
||||||
if (Symmetry == OCTANT && fabs(X[0]) < dX) iminF = -2;
|
|
||||||
if (Symmetry == OCTANT && fabs(Y[0]) < dY) jminF = -2;
|
|
||||||
|
|
||||||
// 分配 fh:大小 (ex1+3)*(ex2+3)*(ex3+3),对应 ord=3
|
|
||||||
const size_t nx = (size_t)ex1 + 3;
|
|
||||||
const size_t ny = (size_t)ex2 + 3;
|
|
||||||
const size_t nz = (size_t)ex3 + 3;
|
|
||||||
const size_t fh_size = nx * ny * nz;
|
|
||||||
|
|
||||||
double *fh = (double*)malloc(fh_size * sizeof(double));
|
|
||||||
if (!fh) return;
|
|
||||||
|
|
||||||
// Fortran: call symmetry_bd(3,ex,f,fh,SoA)
|
|
||||||
symmetry_bd(3, ex, f, fh, SoA);
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Fortran loops:
|
|
||||||
* do k=1,ex3
|
|
||||||
* do j=1,ex2
|
|
||||||
* do i=1,ex1
|
|
||||||
*
|
|
||||||
* C: k0=0..ex3-1, j0=0..ex2-1, i0=0..ex1-1
|
|
||||||
* 并定义 Fortran index: iF=i0+1, ...
|
|
||||||
*/
|
|
||||||
for (int k0 = 0; k0 < ex3; ++k0) {
|
|
||||||
const int kF = k0 + 1;
|
|
||||||
for (int j0 = 0; j0 < ex2; ++j0) {
|
|
||||||
const int jF = j0 + 1;
|
|
||||||
for (int i0 = 0; i0 < ex1; ++i0) {
|
|
||||||
const int iF = i0 + 1;
|
|
||||||
|
|
||||||
// Fortran if 条件:
|
|
||||||
// i-3 >= imin .and. i+3 <= imax 等(都是 Fortran 索引)
|
|
||||||
if ((iF - 3) >= iminF && (iF + 3) <= imaxF &&
|
|
||||||
(jF - 3) >= jminF && (jF + 3) <= jmaxF &&
|
|
||||||
(kF - 3) >= kminF && (kF + 3) <= kmaxF)
|
|
||||||
{
|
|
||||||
const size_t p = idx_ex(i0, j0, k0, ex);
|
|
||||||
|
|
||||||
// 三个方向各一份同型的 7 点组合(实际上是对称的 6th-order dissipation/filter 核)
|
|
||||||
const double Dx_term =
|
|
||||||
( (fh[idx_fh_F(iF - 3, jF, kF, ex)] + fh[idx_fh_F(iF + 3, jF, kF, ex)]) -
|
|
||||||
SIX * (fh[idx_fh_F(iF - 2, jF, kF, ex)] + fh[idx_fh_F(iF + 2, jF, kF, ex)]) +
|
|
||||||
FIT * (fh[idx_fh_F(iF - 1, jF, kF, ex)] + fh[idx_fh_F(iF + 1, jF, kF, ex)]) -
|
|
||||||
TWT * fh[idx_fh_F(iF , jF, kF, ex)] ) / dX;
|
|
||||||
|
|
||||||
const double Dy_term =
|
|
||||||
( (fh[idx_fh_F(iF, jF - 3, kF, ex)] + fh[idx_fh_F(iF, jF + 3, kF, ex)]) -
|
|
||||||
SIX * (fh[idx_fh_F(iF, jF - 2, kF, ex)] + fh[idx_fh_F(iF, jF + 2, kF, ex)]) +
|
|
||||||
FIT * (fh[idx_fh_F(iF, jF - 1, kF, ex)] + fh[idx_fh_F(iF, jF + 1, kF, ex)]) -
|
|
||||||
TWT * fh[idx_fh_F(iF, jF , kF, ex)] ) / dY;
|
|
||||||
|
|
||||||
const double Dz_term =
|
|
||||||
( (fh[idx_fh_F(iF, jF, kF - 3, ex)] + fh[idx_fh_F(iF, jF, kF + 3, ex)]) -
|
|
||||||
SIX * (fh[idx_fh_F(iF, jF, kF - 2, ex)] + fh[idx_fh_F(iF, jF, kF + 2, ex)]) +
|
|
||||||
FIT * (fh[idx_fh_F(iF, jF, kF - 1, ex)] + fh[idx_fh_F(iF, jF, kF + 1, ex)]) -
|
|
||||||
TWT * fh[idx_fh_F(iF, jF, kF , ex)] ) / dZ;
|
|
||||||
|
|
||||||
// Fortran:
|
|
||||||
// f_rhs(i,j,k) = f_rhs(i,j,k) + eps/cof*(Dx_term + Dy_term + Dz_term)
|
|
||||||
f_rhs[p] += (eps / cof) * (Dx_term + Dy_term + Dz_term);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
free(fh);
|
|
||||||
}
|
|
||||||
@@ -1,255 +0,0 @@
|
|||||||
#include "tool.h"
|
|
||||||
/*
|
|
||||||
* 你需要提供 symmetry_bd 的 C 版本(或 Fortran 绑到 C 的接口)。
|
|
||||||
* Fortran: call symmetry_bd(3,ex,f,fh,SoA)
|
|
||||||
*
|
|
||||||
* 约定:
|
|
||||||
* nghost = 3
|
|
||||||
* ex[3] = {ex1,ex2,ex3}
|
|
||||||
* f = 原始网格 (ex1*ex2*ex3)
|
|
||||||
* fh = 扩展网格 ((ex1+3)*(ex2+3)*(ex3+3)),对应 Fortran 的 (-2:ex1, ...)
|
|
||||||
* SoA[3] = 输入参数
|
|
||||||
*/
|
|
||||||
void lopsided(const int ex[3],
|
|
||||||
const double *X, const double *Y, const double *Z,
|
|
||||||
const double *f, double *f_rhs,
|
|
||||||
const double *Sfx, const double *Sfy, const double *Sfz,
|
|
||||||
int Symmetry, const double SoA[3])
|
|
||||||
{
|
|
||||||
const double ZEO = 0.0, ONE = 1.0, F3 = 3.0;
|
|
||||||
const double TWO = 2.0, F6 = 6.0, F18 = 18.0;
|
|
||||||
const double F12 = 12.0, F10 = 10.0, EIT = 8.0;
|
|
||||||
|
|
||||||
const int NO_SYMM = 0, EQ_SYMM = 1, OCTANT = 2;
|
|
||||||
(void)OCTANT; // 这里和 Fortran 一样只是定义了不用也没关系
|
|
||||||
|
|
||||||
const int ex1 = ex[0], ex2 = ex[1], ex3 = ex[2];
|
|
||||||
|
|
||||||
// 对应 Fortran: dX = X(2)-X(1) (Fortran 1-based)
|
|
||||||
// C: X[1]-X[0]
|
|
||||||
const double dX = X[1] - X[0];
|
|
||||||
const double dY = Y[1] - Y[0];
|
|
||||||
const double dZ = Z[1] - Z[0];
|
|
||||||
|
|
||||||
const double d12dx = ONE / F12 / dX;
|
|
||||||
const double d12dy = ONE / F12 / dY;
|
|
||||||
const double d12dz = ONE / F12 / dZ;
|
|
||||||
|
|
||||||
// Fortran 里算了 d2dx/d2dy/d2dz 但本 subroutine 里没用到(保持一致也算出来)
|
|
||||||
const double d2dx = ONE / TWO / dX;
|
|
||||||
const double d2dy = ONE / TWO / dY;
|
|
||||||
const double d2dz = ONE / TWO / dZ;
|
|
||||||
(void)d2dx; (void)d2dy; (void)d2dz;
|
|
||||||
|
|
||||||
// Fortran:
|
|
||||||
// imax = ex(1); jmax = ex(2); kmax = ex(3)
|
|
||||||
const int imaxF = ex1;
|
|
||||||
const int jmaxF = ex2;
|
|
||||||
const int kmaxF = ex3;
|
|
||||||
|
|
||||||
// Fortran:
|
|
||||||
// imin=jmin=kmin=1; 若满足对称条件则设为 -2
|
|
||||||
int iminF = 1, jminF = 1, kminF = 1;
|
|
||||||
if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -2;
|
|
||||||
if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -2;
|
|
||||||
if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -2;
|
|
||||||
|
|
||||||
// 分配 fh:大小 (ex1+3)*(ex2+3)*(ex3+3)
|
|
||||||
const size_t nx = (size_t)ex1 + 3;
|
|
||||||
const size_t ny = (size_t)ex2 + 3;
|
|
||||||
const size_t nz = (size_t)ex3 + 3;
|
|
||||||
const size_t fh_size = nx * ny * nz;
|
|
||||||
|
|
||||||
double *fh = (double*)malloc(fh_size * sizeof(double));
|
|
||||||
if (!fh) return; // 内存不足:直接返回(你也可以改成 abort/报错)
|
|
||||||
|
|
||||||
// Fortran: call symmetry_bd(3,ex,f,fh,SoA)
|
|
||||||
symmetry_bd(3, ex, f, fh, SoA);
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Fortran 主循环:
|
|
||||||
* do k=1,ex(3)-1
|
|
||||||
* do j=1,ex(2)-1
|
|
||||||
* do i=1,ex(1)-1
|
|
||||||
*
|
|
||||||
* 转成 C 0-based:
|
|
||||||
* k0 = 0..ex3-2, j0 = 0..ex2-2, i0 = 0..ex1-2
|
|
||||||
*
|
|
||||||
* 并且 Fortran 里的 i/j/k 在 fh 访问时,仍然是 Fortran 索引值:
|
|
||||||
* iF=i0+1, jF=j0+1, kF=k0+1
|
|
||||||
*/
|
|
||||||
for (int k0 = 0; k0 <= ex3 - 2; ++k0) {
|
|
||||||
const int kF = k0 + 1;
|
|
||||||
for (int j0 = 0; j0 <= ex2 - 2; ++j0) {
|
|
||||||
const int jF = j0 + 1;
|
|
||||||
for (int i0 = 0; i0 <= ex1 - 2; ++i0) {
|
|
||||||
const int iF = i0 + 1;
|
|
||||||
|
|
||||||
const size_t p = idx_ex(i0, j0, k0, ex);
|
|
||||||
|
|
||||||
// ---------------- x direction ----------------
|
|
||||||
const double sfx = Sfx[p];
|
|
||||||
if (sfx > ZEO) {
|
|
||||||
// Fortran: if(i+3 <= imax)
|
|
||||||
// iF+3 <= ex1 <=> i0+4 <= ex1 <=> i0 <= ex1-4
|
|
||||||
if (i0 <= ex1 - 4) {
|
|
||||||
f_rhs[p] += sfx * d12dx *
|
|
||||||
(-F3 * fh[idx_fh_F(iF - 1, jF, kF, ex)]
|
|
||||||
-F10 * fh[idx_fh_F(iF , jF, kF, ex)]
|
|
||||||
+F18 * fh[idx_fh_F(iF + 1, jF, kF, ex)]
|
|
||||||
-F6 * fh[idx_fh_F(iF + 2, jF, kF, ex)]
|
|
||||||
+ fh[idx_fh_F(iF + 3, jF, kF, ex)]);
|
|
||||||
}
|
|
||||||
// elseif(i+2 <= imax) <=> i0 <= ex1-3
|
|
||||||
else if (i0 <= ex1 - 3) {
|
|
||||||
f_rhs[p] += sfx * d12dx *
|
|
||||||
( fh[idx_fh_F(iF - 2, jF, kF, ex)]
|
|
||||||
-EIT * fh[idx_fh_F(iF - 1, jF, kF, ex)]
|
|
||||||
+EIT * fh[idx_fh_F(iF + 1, jF, kF, ex)]
|
|
||||||
- fh[idx_fh_F(iF + 2, jF, kF, ex)]);
|
|
||||||
}
|
|
||||||
// elseif(i+1 <= imax) <=> i0 <= ex1-2(循环里总成立)
|
|
||||||
else if (i0 <= ex1 - 2) {
|
|
||||||
f_rhs[p] -= sfx * d12dx *
|
|
||||||
(-F3 * fh[idx_fh_F(iF + 1, jF, kF, ex)]
|
|
||||||
-F10 * fh[idx_fh_F(iF , jF, kF, ex)]
|
|
||||||
+F18 * fh[idx_fh_F(iF - 1, jF, kF, ex)]
|
|
||||||
-F6 * fh[idx_fh_F(iF - 2, jF, kF, ex)]
|
|
||||||
+ fh[idx_fh_F(iF - 3, jF, kF, ex)]);
|
|
||||||
}
|
|
||||||
} else if (sfx < ZEO) {
|
|
||||||
// Fortran: if(i-3 >= imin)
|
|
||||||
// (iF-3) >= iminF <=> (i0-2) >= iminF
|
|
||||||
if ((i0 - 2) >= iminF) {
|
|
||||||
f_rhs[p] -= sfx * d12dx *
|
|
||||||
(-F3 * fh[idx_fh_F(iF + 1, jF, kF, ex)]
|
|
||||||
-F10 * fh[idx_fh_F(iF , jF, kF, ex)]
|
|
||||||
+F18 * fh[idx_fh_F(iF - 1, jF, kF, ex)]
|
|
||||||
-F6 * fh[idx_fh_F(iF - 2, jF, kF, ex)]
|
|
||||||
+ fh[idx_fh_F(iF - 3, jF, kF, ex)]);
|
|
||||||
}
|
|
||||||
// elseif(i-2 >= imin) <=> (i0-1) >= iminF
|
|
||||||
else if ((i0 - 1) >= iminF) {
|
|
||||||
f_rhs[p] += sfx * d12dx *
|
|
||||||
( fh[idx_fh_F(iF - 2, jF, kF, ex)]
|
|
||||||
-EIT * fh[idx_fh_F(iF - 1, jF, kF, ex)]
|
|
||||||
+EIT * fh[idx_fh_F(iF + 1, jF, kF, ex)]
|
|
||||||
- fh[idx_fh_F(iF + 2, jF, kF, ex)]);
|
|
||||||
}
|
|
||||||
// elseif(i-1 >= imin) <=> i0 >= iminF
|
|
||||||
else if (i0 >= iminF) {
|
|
||||||
f_rhs[p] += sfx * d12dx *
|
|
||||||
(-F3 * fh[idx_fh_F(iF - 1, jF, kF, ex)]
|
|
||||||
-F10 * fh[idx_fh_F(iF , jF, kF, ex)]
|
|
||||||
+F18 * fh[idx_fh_F(iF + 1, jF, kF, ex)]
|
|
||||||
-F6 * fh[idx_fh_F(iF + 2, jF, kF, ex)]
|
|
||||||
+ fh[idx_fh_F(iF + 3, jF, kF, ex)]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// ---------------- y direction ----------------
|
|
||||||
const double sfy = Sfy[p];
|
|
||||||
if (sfy > ZEO) {
|
|
||||||
// jF+3 <= ex2 <=> j0+4 <= ex2 <=> j0 <= ex2-4
|
|
||||||
if (j0 <= ex2 - 4) {
|
|
||||||
f_rhs[p] += sfy * d12dy *
|
|
||||||
(-F3 * fh[idx_fh_F(iF, jF - 1, kF, ex)]
|
|
||||||
-F10 * fh[idx_fh_F(iF, jF , kF, ex)]
|
|
||||||
+F18 * fh[idx_fh_F(iF, jF + 1, kF, ex)]
|
|
||||||
-F6 * fh[idx_fh_F(iF, jF + 2, kF, ex)]
|
|
||||||
+ fh[idx_fh_F(iF, jF + 3, kF, ex)]);
|
|
||||||
} else if (j0 <= ex2 - 3) {
|
|
||||||
f_rhs[p] += sfy * d12dy *
|
|
||||||
( fh[idx_fh_F(iF, jF - 2, kF, ex)]
|
|
||||||
-EIT * fh[idx_fh_F(iF, jF - 1, kF, ex)]
|
|
||||||
+EIT * fh[idx_fh_F(iF, jF + 1, kF, ex)]
|
|
||||||
- fh[idx_fh_F(iF, jF + 2, kF, ex)]);
|
|
||||||
} else if (j0 <= ex2 - 2) {
|
|
||||||
f_rhs[p] -= sfy * d12dy *
|
|
||||||
(-F3 * fh[idx_fh_F(iF, jF + 1, kF, ex)]
|
|
||||||
-F10 * fh[idx_fh_F(iF, jF , kF, ex)]
|
|
||||||
+F18 * fh[idx_fh_F(iF, jF - 1, kF, ex)]
|
|
||||||
-F6 * fh[idx_fh_F(iF, jF - 2, kF, ex)]
|
|
||||||
+ fh[idx_fh_F(iF, jF - 3, kF, ex)]);
|
|
||||||
}
|
|
||||||
} else if (sfy < ZEO) {
|
|
||||||
if ((j0 - 2) >= jminF) {
|
|
||||||
f_rhs[p] -= sfy * d12dy *
|
|
||||||
(-F3 * fh[idx_fh_F(iF, jF + 1, kF, ex)]
|
|
||||||
-F10 * fh[idx_fh_F(iF, jF , kF, ex)]
|
|
||||||
+F18 * fh[idx_fh_F(iF, jF - 1, kF, ex)]
|
|
||||||
-F6 * fh[idx_fh_F(iF, jF - 2, kF, ex)]
|
|
||||||
+ fh[idx_fh_F(iF, jF - 3, kF, ex)]);
|
|
||||||
} else if ((j0 - 1) >= jminF) {
|
|
||||||
f_rhs[p] += sfy * d12dy *
|
|
||||||
( fh[idx_fh_F(iF, jF - 2, kF, ex)]
|
|
||||||
-EIT * fh[idx_fh_F(iF, jF - 1, kF, ex)]
|
|
||||||
+EIT * fh[idx_fh_F(iF, jF + 1, kF, ex)]
|
|
||||||
- fh[idx_fh_F(iF, jF + 2, kF, ex)]);
|
|
||||||
} else if (j0 >= jminF) {
|
|
||||||
f_rhs[p] += sfy * d12dy *
|
|
||||||
(-F3 * fh[idx_fh_F(iF, jF - 1, kF, ex)]
|
|
||||||
-F10 * fh[idx_fh_F(iF, jF , kF, ex)]
|
|
||||||
+F18 * fh[idx_fh_F(iF, jF + 1, kF, ex)]
|
|
||||||
-F6 * fh[idx_fh_F(iF, jF + 2, kF, ex)]
|
|
||||||
+ fh[idx_fh_F(iF, jF + 3, kF, ex)]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// ---------------- z direction ----------------
|
|
||||||
const double sfz = Sfz[p];
|
|
||||||
if (sfz > ZEO) {
|
|
||||||
if (k0 <= ex3 - 4) {
|
|
||||||
f_rhs[p] += sfz * d12dz *
|
|
||||||
(-F3 * fh[idx_fh_F(iF, jF, kF - 1, ex)]
|
|
||||||
-F10 * fh[idx_fh_F(iF, jF, kF , ex)]
|
|
||||||
+F18 * fh[idx_fh_F(iF, jF, kF + 1, ex)]
|
|
||||||
-F6 * fh[idx_fh_F(iF, jF, kF + 2, ex)]
|
|
||||||
+ fh[idx_fh_F(iF, jF, kF + 3, ex)]);
|
|
||||||
} else if (k0 <= ex3 - 3) {
|
|
||||||
f_rhs[p] += sfz * d12dz *
|
|
||||||
( fh[idx_fh_F(iF, jF, kF - 2, ex)]
|
|
||||||
-EIT * fh[idx_fh_F(iF, jF, kF - 1, ex)]
|
|
||||||
+EIT * fh[idx_fh_F(iF, jF, kF + 1, ex)]
|
|
||||||
- fh[idx_fh_F(iF, jF, kF + 2, ex)]);
|
|
||||||
} else if (k0 <= ex3 - 2) {
|
|
||||||
f_rhs[p] -= sfz * d12dz *
|
|
||||||
(-F3 * fh[idx_fh_F(iF, jF, kF + 1, ex)]
|
|
||||||
-F10 * fh[idx_fh_F(iF, jF, kF , ex)]
|
|
||||||
+F18 * fh[idx_fh_F(iF, jF, kF - 1, ex)]
|
|
||||||
-F6 * fh[idx_fh_F(iF, jF, kF - 2, ex)]
|
|
||||||
+ fh[idx_fh_F(iF, jF, kF - 3, ex)]);
|
|
||||||
}
|
|
||||||
} else if (sfz < ZEO) {
|
|
||||||
if ((k0 - 2) >= kminF) {
|
|
||||||
f_rhs[p] -= sfz * d12dz *
|
|
||||||
(-F3 * fh[idx_fh_F(iF, jF, kF + 1, ex)]
|
|
||||||
-F10 * fh[idx_fh_F(iF, jF, kF , ex)]
|
|
||||||
+F18 * fh[idx_fh_F(iF, jF, kF - 1, ex)]
|
|
||||||
-F6 * fh[idx_fh_F(iF, jF, kF - 2, ex)]
|
|
||||||
+ fh[idx_fh_F(iF, jF, kF - 3, ex)]);
|
|
||||||
} else if ((k0 - 1) >= kminF) {
|
|
||||||
f_rhs[p] += sfz * d12dz *
|
|
||||||
( fh[idx_fh_F(iF, jF, kF - 2, ex)]
|
|
||||||
-EIT * fh[idx_fh_F(iF, jF, kF - 1, ex)]
|
|
||||||
+EIT * fh[idx_fh_F(iF, jF, kF + 1, ex)]
|
|
||||||
- fh[idx_fh_F(iF, jF, kF + 2, ex)]);
|
|
||||||
} else if (k0 >= kminF) {
|
|
||||||
f_rhs[p] += sfz * d12dz *
|
|
||||||
(-F3 * fh[idx_fh_F(iF, jF, kF - 1, ex)]
|
|
||||||
-F10 * fh[idx_fh_F(iF, jF, kF , ex)]
|
|
||||||
+F18 * fh[idx_fh_F(iF, jF, kF + 1, ex)]
|
|
||||||
-F6 * fh[idx_fh_F(iF, jF, kF + 2, ex)]
|
|
||||||
+ fh[idx_fh_F(iF, jF, kF + 3, ex)]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
free(fh);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@@ -1,23 +1,7 @@
|
|||||||
|
|
||||||
#define tetradtype 2
|
|
||||||
|
|
||||||
#define Cell
|
|
||||||
|
|
||||||
#define ghost_width 3
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#define GAUGE 0
|
|
||||||
|
|
||||||
#define CPBC_ghost_width (ghost_width)
|
|
||||||
|
|
||||||
#define ABV 0
|
|
||||||
|
|
||||||
#define EScalar_CC 2
|
|
||||||
|
|
||||||
#if 0
|
#if 0
|
||||||
|
note here
|
||||||
define tetradtype
|
|
||||||
v:r; u: phi; w: theta
|
v:r; u: phi; w: theta
|
||||||
tetradtype 0
|
tetradtype 0
|
||||||
v^a = (x,y,z)
|
v^a = (x,y,z)
|
||||||
@@ -30,48 +14,70 @@ define tetradtype
|
|||||||
v_a = (x,y,z)
|
v_a = (x,y,z)
|
||||||
orthonormal order: v,u,w
|
orthonormal order: v,u,w
|
||||||
m = (phi - i theta)/sqrt(2) following Frans, Eq.(8) of PRD 75, 124018(2007)
|
m = (phi - i theta)/sqrt(2) following Frans, Eq.(8) of PRD 75, 124018(2007)
|
||||||
|
#endif
|
||||||
|
#define tetradtype 2
|
||||||
|
|
||||||
define Cell or Vertex
|
#if 0
|
||||||
|
note here
|
||||||
Cell center or Vertex center
|
Cell center or Vertex center
|
||||||
|
#endif
|
||||||
|
#define Cell
|
||||||
|
|
||||||
define ghost_width
|
#if 0
|
||||||
|
note here
|
||||||
2nd order: 2
|
2nd order: 2
|
||||||
4th order: 3
|
4th order: 3
|
||||||
6th order: 4
|
6th order: 4
|
||||||
8th order: 5
|
8th order: 5
|
||||||
|
#endif
|
||||||
|
#define ghost_width 3
|
||||||
|
|
||||||
define WithShell
|
#if 0
|
||||||
|
note here
|
||||||
use shell or not
|
use shell or not
|
||||||
|
#endif
|
||||||
|
#define WithShell
|
||||||
|
|
||||||
define CPBC
|
#if 0
|
||||||
|
note here
|
||||||
use constraint preserving boundary condition or not
|
use constraint preserving boundary condition or not
|
||||||
only affect Z4c
|
only affect Z4c
|
||||||
CPBC only supports WithShell
|
#endif
|
||||||
|
#define CPBC
|
||||||
|
|
||||||
define GAUGE
|
#if 0
|
||||||
|
note here
|
||||||
|
Gauge condition type
|
||||||
0: B^i gauge
|
0: B^i gauge
|
||||||
1: David puncture gauge
|
1: David's puncture gauge
|
||||||
2: MB B^i gauge
|
2: MB B^i gauge
|
||||||
3: RIT B^i gauge
|
3: RIT B^i gauge
|
||||||
4: MB beta gauge (beta gauge not means Eq.(3) of PRD 84, 124006)
|
4: MB beta gauge (beta gauge not means Eq.(3) of PRD 84, 124006)
|
||||||
5: RIT beta gauge (beta gauge not means Eq.(3) of PRD 84, 124006)
|
5: RIT beta gauge (beta gauge not means Eq.(3) of PRD 84, 124006)
|
||||||
6: MGB1 B^i gauge
|
6: MGB1 B^i gauge
|
||||||
7: MGB2 B^i gauge
|
7: MGB2 B^i gauge
|
||||||
|
#endif
|
||||||
|
#define GAUGE 2
|
||||||
|
|
||||||
define CPBC_ghost_width (ghost_width)
|
#if 0
|
||||||
buffer points for CPBC boundary
|
buffer points for CPBC boundary
|
||||||
|
#endif
|
||||||
|
#define CPBC_ghost_width (ghost_width)
|
||||||
|
|
||||||
define ABV
|
#if 0
|
||||||
0: using BSSN variable for constraint violation and psi4 calculation
|
using BSSN variable for constraint violation and psi4 calculation: 0
|
||||||
1: using ADM variable for constraint violation and psi4 calculation
|
using ADM variable for constraint violation and psi4 calculation: 1
|
||||||
|
#endif
|
||||||
|
#define ABV 0
|
||||||
|
|
||||||
define EScalar_CC
|
#if 0
|
||||||
Type of Potential and Scalar Distribution in F(R) Scalar-Tensor Theory
|
Type of Potential and Scalar Distribution in F(R) Scalar-Tensor Theory
|
||||||
1: Case C of 1112.3928, V=0
|
1: Case C of 1112.3928, V=0
|
||||||
2: shell with phi(r) = phi0 * a2^2/(1+a2^2), f(R) = R+a2*R^2 induced V
|
2: shell with a2^2*phi0/(1+a2^2), f(R) = R+a2*R^2 induced V
|
||||||
3: ground state of Schrodinger-Newton system, f(R) = R+a2*R^2 induced V
|
3: ground state of Schrodinger-Newton system, f(R) = R+a2*R^2 induced V
|
||||||
4: a2 = +oo and phi(r) = phi0 * 0.5 * ( tanh((r+r0)/sigma) - tanh((r-r0)/sigma) )
|
4: a2 = oo and phi(r) = phi0 * 0.5 * ( tanh((r+r0)/sigma) - tanh((r-r0)/sigma) )
|
||||||
5: shell with phi(r) = phi0*Exp(-(r-r0)**2/sigma), V = 0
|
5: shell with phi(r) = phi0*Exp(-(r-r0)**2/sigma), V = 0
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
#define EScalar_CC 2
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -6,124 +6,92 @@
|
|||||||
|
|
||||||
// application parameters
|
// application parameters
|
||||||
|
|
||||||
|
/// ****
|
||||||
|
// sommerfeld boundary type
|
||||||
|
// 0: bam, 1: shibata
|
||||||
#define SommerType 0
|
#define SommerType 0
|
||||||
|
|
||||||
|
/// ****
|
||||||
|
// for Using Gauss-Legendre quadrature in theta direction
|
||||||
#define GaussInt
|
#define GaussInt
|
||||||
|
|
||||||
#define ABEtype 0
|
/// ****
|
||||||
|
|
||||||
//#define With_AHF
|
|
||||||
#define Psi4type 0
|
|
||||||
|
|
||||||
//#define Point_Psi4
|
|
||||||
|
|
||||||
#define RPS 1
|
|
||||||
|
|
||||||
#define AGM 0
|
|
||||||
|
|
||||||
#define RPB 0
|
|
||||||
|
|
||||||
#define MAPBH 1
|
|
||||||
|
|
||||||
#define PSTR 0
|
|
||||||
|
|
||||||
#define REGLEV 0
|
|
||||||
|
|
||||||
//#define USE_GPU
|
|
||||||
|
|
||||||
//#define CHECKDETAIL
|
|
||||||
|
|
||||||
//#define FAKECHECK
|
|
||||||
|
|
||||||
//
|
|
||||||
// define SommerType
|
|
||||||
// sommerfeld boundary type
|
|
||||||
// 0: bam
|
|
||||||
// 1: shibata
|
|
||||||
//
|
|
||||||
// define GaussInt
|
|
||||||
// for Using Gauss-Legendre quadrature in theta direction
|
|
||||||
//
|
|
||||||
// define ABEtype
|
|
||||||
// 0: BSSN vacuum
|
// 0: BSSN vacuum
|
||||||
// 1: coupled to scalar field
|
// 1: coupled to scalar field
|
||||||
// 2: Z4c vacuum
|
// 2: Z4c vacuum
|
||||||
// 3: coupled to Maxwell field
|
// 3: coupled to Maxwell field
|
||||||
//
|
//
|
||||||
// define With_AHF
|
#define ABEtype 2
|
||||||
|
|
||||||
|
/// ****
|
||||||
// using Apparent Horizon Finder
|
// using Apparent Horizon Finder
|
||||||
//
|
//#define With_AHF
|
||||||
// define Psi4type
|
|
||||||
|
/// ****
|
||||||
// Psi4 calculation method
|
// Psi4 calculation method
|
||||||
// 0: EB method
|
// 0: EB method
|
||||||
// 1: 4-D method
|
// 1: 4-D method
|
||||||
//
|
//
|
||||||
// define Point_Psi4
|
#define Psi4type 0
|
||||||
|
|
||||||
|
/// ****
|
||||||
// for Using point psi4 or not
|
// for Using point psi4 or not
|
||||||
//
|
//#define Point_Psi4
|
||||||
// define RPS
|
|
||||||
|
/// ****
|
||||||
// RestrictProlong in Step (0) or after Step (1)
|
// RestrictProlong in Step (0) or after Step (1)
|
||||||
//
|
#define RPS 1
|
||||||
// define AGM
|
|
||||||
|
/// ****
|
||||||
// Enforce algebra constraint
|
// Enforce algebra constraint
|
||||||
// for every RK4 sub step: 0
|
// for every RK4 sub step: 0
|
||||||
// only when iter_count == 3: 1
|
// only when iter_count == 3: 1
|
||||||
// after routine Step: 2
|
// after routine Step: 2
|
||||||
//
|
#define AGM 0
|
||||||
// define RPB
|
|
||||||
// Restrict Prolong using BAM style 1 or old style 0
|
|
||||||
//
|
|
||||||
// define MAPBH
|
|
||||||
// 1: move Analysis out ot 4 sub steps and treat PBH with Euler method
|
|
||||||
//
|
|
||||||
// define PSTR
|
|
||||||
// parallel structure
|
|
||||||
// 0: level by level
|
|
||||||
// 1: considering all levels
|
|
||||||
// 2: as 1 but reverse the CPU order
|
|
||||||
// 3: Frank's scheme
|
|
||||||
//
|
|
||||||
// define REGLEV
|
|
||||||
// regrid for every level or for all levels at a time
|
|
||||||
// 0: for every level;
|
|
||||||
// 1: for all
|
|
||||||
//
|
|
||||||
// define USE_GPU
|
|
||||||
// use gpu or not
|
|
||||||
//
|
|
||||||
// define CHECKDETAIL
|
|
||||||
// use checkpoint for every process
|
|
||||||
//
|
|
||||||
// define FAKECHECK
|
|
||||||
// use FakeCheckPrepare to write CheckPoint
|
|
||||||
//
|
|
||||||
|
|
||||||
|
/// ****
|
||||||
|
// Restrict Prolong using BAM style 1 or old style 0
|
||||||
|
#define RPB 0
|
||||||
|
|
||||||
|
/// ****
|
||||||
|
// 1: move Analysis out ot 4 sub steps and treat PBH with Euler method
|
||||||
|
#define MAPBH 1
|
||||||
|
|
||||||
|
/// ****
|
||||||
|
// parallel structure, 0: level by level, 1: considering all levels, 2: as 1 but reverse the CPU order, 3: Frank's scheme
|
||||||
|
#define PSTR 0
|
||||||
|
|
||||||
|
/// ****
|
||||||
|
// regrid for every level or for all levels at a time
|
||||||
|
// 0: for every level; 1: for all
|
||||||
|
#define REGLEV 0
|
||||||
|
|
||||||
|
/// ****
|
||||||
|
// use gpu or not
|
||||||
|
//#define USE_GPU
|
||||||
|
|
||||||
|
/// ****
|
||||||
|
// use checkpoint for every process
|
||||||
|
//#define CHECKDETAIL
|
||||||
|
|
||||||
|
/// ****
|
||||||
|
// use FakeCheckPrepare to write CheckPoint
|
||||||
|
//#define FAKECHECK
|
||||||
////================================================================
|
////================================================================
|
||||||
// some basic parameters for numerical calculation
|
// some basic parameters for numerical calculation
|
||||||
////================================================================
|
|
||||||
|
|
||||||
#define dim 3
|
#define dim 3
|
||||||
|
|
||||||
//#define Cell or Vertex in "macrodef.fh"
|
//#define Cell or Vertex in "microdef.fh"
|
||||||
|
|
||||||
|
// ******
|
||||||
|
// buffer point number for mesh refinement interface
|
||||||
#define buffer_width 6
|
#define buffer_width 6
|
||||||
|
|
||||||
#define SC_width buffer_width
|
// ******
|
||||||
|
|
||||||
#define CS_width (2*buffer_width)
|
|
||||||
|
|
||||||
//
|
|
||||||
// define Cell or Vertex in "macrodef.fh"
|
|
||||||
//
|
|
||||||
// define buffer_width
|
|
||||||
// buffer point number for mesh refinement interface
|
|
||||||
//
|
|
||||||
// define SC_width buffer_width
|
|
||||||
// buffer point number shell-box interface, on shell
|
// buffer point number shell-box interface, on shell
|
||||||
//
|
#define SC_width buffer_width
|
||||||
// define CS_width
|
|
||||||
// buffer point number shell-box interface, on box
|
// buffer point number shell-box interface, on box
|
||||||
//
|
#define CS_width (2*buffer_width)
|
||||||
|
|
||||||
#if(buffer_width < ghost_width)
|
#if(buffer_width < ghost_width)
|
||||||
#error we always assume buffer_width>ghost_width
|
#error we always assume buffer_width>ghost_width
|
||||||
@@ -142,4 +110,3 @@
|
|||||||
#define TINY 1e-10
|
#define TINY 1e-10
|
||||||
|
|
||||||
#endif /* MICRODEF_H */
|
#endif /* MICRODEF_H */
|
||||||
|
|
||||||
|
|||||||
@@ -2,27 +2,6 @@
|
|||||||
|
|
||||||
include makefile.inc
|
include makefile.inc
|
||||||
|
|
||||||
## ABE build flags selected by PGO_MODE (set in makefile.inc, default: opt)
|
|
||||||
## make -> opt (PGO-guided, maximum performance)
|
|
||||||
## make PGO_MODE=instrument -> instrument (Phase 1: collect fresh profile data)
|
|
||||||
PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/default.profdata
|
|
||||||
|
|
||||||
ifeq ($(PGO_MODE),instrument)
|
|
||||||
## Phase 1: instrumentation — omit -ipo/-fp-model fast=2 for faster build and numerical stability
|
|
||||||
CXXAPPFLAGS = -O3 -xHost -fma -fprofile-instr-generate -ipo \
|
|
||||||
-Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS)
|
|
||||||
f90appflags = -O3 -xHost -fma -fprofile-instr-generate -ipo \
|
|
||||||
-align array64byte -fpp -I${MKLROOT}/include
|
|
||||||
else
|
|
||||||
## opt (default): maximum performance with PGO profile data
|
|
||||||
CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
|
|
||||||
-fprofile-instr-use=$(PROFDATA) \
|
|
||||||
-Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS)
|
|
||||||
f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \
|
|
||||||
-fprofile-instr-use=$(PROFDATA) \
|
|
||||||
-align array64byte -fpp -I${MKLROOT}/include
|
|
||||||
endif
|
|
||||||
|
|
||||||
.SUFFIXES: .o .f90 .C .for .cu
|
.SUFFIXES: .o .f90 .C .for .cu
|
||||||
|
|
||||||
.f90.o:
|
.f90.o:
|
||||||
@@ -37,54 +16,19 @@ endif
|
|||||||
.cu.o:
|
.cu.o:
|
||||||
$(Cu) $(CUDA_APP_FLAGS) -c $< -o $@ $(CUDA_LIB_PATH)
|
$(Cu) $(CUDA_APP_FLAGS) -c $< -o $@ $(CUDA_LIB_PATH)
|
||||||
|
|
||||||
# C rewrite of BSSN RHS kernel and helpers
|
|
||||||
bssn_rhs_c.o: bssn_rhs_c.C
|
|
||||||
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
|
||||||
|
|
||||||
fderivs_c.o: fderivs_c.C
|
|
||||||
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
|
||||||
|
|
||||||
fdderivs_c.o: fdderivs_c.C
|
|
||||||
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
|
||||||
|
|
||||||
kodiss_c.o: kodiss_c.C
|
|
||||||
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
|
||||||
|
|
||||||
lopsided_c.o: lopsided_c.C
|
|
||||||
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
|
||||||
|
|
||||||
interp_lb_profile.o: interp_lb_profile.C interp_lb_profile.h
|
|
||||||
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
|
||||||
|
|
||||||
## TwoPunctureABE uses fixed optimal flags with its own PGO profile, independent of CXXAPPFLAGS
|
|
||||||
TP_PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/TwoPunctureABE.profdata
|
|
||||||
TP_OPTFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
|
|
||||||
-fprofile-instr-use=$(TP_PROFDATA) \
|
|
||||||
-Dfortran3 -Dnewc -I${MKLROOT}/include
|
|
||||||
|
|
||||||
TwoPunctures.o: TwoPunctures.C
|
TwoPunctures.o: TwoPunctures.C
|
||||||
${CXX} $(TP_OPTFLAGS) -qopenmp -c $< -o $@
|
${CXX} $(CXXAPPFLAGS) -qopenmp -c $< -o $@
|
||||||
|
|
||||||
TwoPunctureABE.o: TwoPunctureABE.C
|
TwoPunctureABE.o: TwoPunctureABE.C
|
||||||
${CXX} $(TP_OPTFLAGS) -qopenmp -c $< -o $@
|
${CXX} $(CXXAPPFLAGS) -qopenmp -c $< -o $@
|
||||||
|
|
||||||
# Input files
|
# Input files
|
||||||
|
|
||||||
## Kernel implementation switch (set USE_CXX_KERNELS=0 to fall back to Fortran)
|
|
||||||
ifeq ($(USE_CXX_KERNELS),0)
|
|
||||||
# Fortran mode: no C rewrite files; bssn_rhs.o is included via F90FILES below
|
|
||||||
CFILES =
|
|
||||||
else
|
|
||||||
# C++ mode (default): C rewrite of bssn_rhs and helper kernels
|
|
||||||
CFILES = bssn_rhs_c.o fderivs_c.o fdderivs_c.o kodiss_c.o lopsided_c.o
|
|
||||||
endif
|
|
||||||
|
|
||||||
C++FILES = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\
|
C++FILES = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\
|
||||||
cgh.o bssn_class.o surface_integral.o ShellPatch.o\
|
cgh.o bssn_class.o surface_integral.o ShellPatch.o\
|
||||||
bssnEScalar_class.o perf.o Z4c_class.o NullShellPatch.o\
|
bssnEScalar_class.o perf.o Z4c_class.o NullShellPatch.o\
|
||||||
bssnEM_class.o cpbc_util.o z4c_rhs_point.o checkpoint.o\
|
bssnEM_class.o cpbc_util.o z4c_rhs_point.o checkpoint.o\
|
||||||
Parallel_bam.o scalar_class.o transpbh.o NullShellPatch2.o\
|
Parallel_bam.o scalar_class.o transpbh.o NullShellPatch2.o\
|
||||||
NullShellPatch2_Evo.o writefile_f.o interp_lb_profile.o
|
NullShellPatch2_Evo.o writefile_f.o
|
||||||
|
|
||||||
C++FILES_GPU = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\
|
C++FILES_GPU = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\
|
||||||
cgh.o surface_integral.o ShellPatch.o\
|
cgh.o surface_integral.o ShellPatch.o\
|
||||||
@@ -94,9 +38,9 @@ C++FILES_GPU = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o
|
|||||||
NullShellPatch2_Evo.o \
|
NullShellPatch2_Evo.o \
|
||||||
bssn_gpu_class.o bssn_step_gpu.o bssn_macro.o writefile_f.o
|
bssn_gpu_class.o bssn_step_gpu.o bssn_macro.o writefile_f.o
|
||||||
|
|
||||||
F90FILES_BASE = enforce_algebra.o fmisc.o initial_puncture.o prolongrestrict.o\
|
F90FILES = enforce_algebra.o fmisc.o initial_puncture.o prolongrestrict.o\
|
||||||
prolongrestrict_cell.o prolongrestrict_vertex.o\
|
prolongrestrict_cell.o prolongrestrict_vertex.o\
|
||||||
rungekutta4_rout.o diff_new.o kodiss.o kodiss_sh.o\
|
rungekutta4_rout.o bssn_rhs.o diff_new.o kodiss.o kodiss_sh.o\
|
||||||
lopsidediff.o sommerfeld_rout.o getnp4.o diff_new_sh.o\
|
lopsidediff.o sommerfeld_rout.o getnp4.o diff_new_sh.o\
|
||||||
shellfunctions.o bssn_rhs_ss.o Set_Rho_ADM.o\
|
shellfunctions.o bssn_rhs_ss.o Set_Rho_ADM.o\
|
||||||
getnp4EScalar.o bssnEScalar_rhs.o bssn_constraint.o ricci_gamma.o\
|
getnp4EScalar.o bssnEScalar_rhs.o bssn_constraint.o ricci_gamma.o\
|
||||||
@@ -107,14 +51,6 @@ F90FILES_BASE = enforce_algebra.o fmisc.o initial_puncture.o prolongrestrict.o\
|
|||||||
scalar_rhs.o initial_scalar.o NullEvol2.o initial_null2.o\
|
scalar_rhs.o initial_scalar.o NullEvol2.o initial_null2.o\
|
||||||
NullNews2.o tool_f.o
|
NullNews2.o tool_f.o
|
||||||
|
|
||||||
ifeq ($(USE_CXX_KERNELS),0)
|
|
||||||
# Fortran mode: include original bssn_rhs.o
|
|
||||||
F90FILES = $(F90FILES_BASE) bssn_rhs.o
|
|
||||||
else
|
|
||||||
# C++ mode (default): bssn_rhs.o replaced by C++ kernel
|
|
||||||
F90FILES = $(F90FILES_BASE)
|
|
||||||
endif
|
|
||||||
|
|
||||||
F77FILES = zbesh.o
|
F77FILES = zbesh.o
|
||||||
|
|
||||||
AHFDOBJS = expansion.o expansion_Jacobian.o patch.o coords.o patch_info.o patch_interp.o patch_system.o \
|
AHFDOBJS = expansion.o expansion_Jacobian.o patch.o coords.o patch_info.o patch_interp.o patch_system.o \
|
||||||
@@ -127,7 +63,7 @@ TwoPunctureFILES = TwoPunctureABE.o TwoPunctures.o
|
|||||||
CUDAFILES = bssn_gpu.o bssn_gpu_rhs_ss.o
|
CUDAFILES = bssn_gpu.o bssn_gpu_rhs_ss.o
|
||||||
|
|
||||||
# file dependences
|
# file dependences
|
||||||
$(C++FILES) $(C++FILES_GPU) $(F90FILES) $(CFILES) $(AHFDOBJS) $(CUDAFILES): macrodef.fh
|
$(C++FILES) $(C++FILESGPU) $(F90FILES) $(AHFDOBJS) $(CUDAFILES): macrodef.fh
|
||||||
|
|
||||||
$(C++FILES): Block.h enforce_algebra.h fmisc.h initial_puncture.h macrodef.h\
|
$(C++FILES): Block.h enforce_algebra.h fmisc.h initial_puncture.h macrodef.h\
|
||||||
misc.h monitor.h MyList.h Parallel.h MPatch.h prolongrestrict.h\
|
misc.h monitor.h MyList.h Parallel.h MPatch.h prolongrestrict.h\
|
||||||
@@ -150,7 +86,7 @@ $(C++FILES_GPU): Block.h enforce_algebra.h fmisc.h initial_puncture.h macrodef.h
|
|||||||
|
|
||||||
$(AHFDOBJS): cctk.h cctk_Config.h cctk_Types.h cctk_Constants.h myglobal.h
|
$(AHFDOBJS): cctk.h cctk_Config.h cctk_Types.h cctk_Constants.h myglobal.h
|
||||||
|
|
||||||
$(C++FILES) $(C++FILES_GPU) $(CFILES) $(AHFDOBJS) $(CUDAFILES): macrodef.h
|
$(C++FILES) $(C++FILES_GPU) $(AHFDOBJS) $(CUDAFILES): macrodef.h
|
||||||
|
|
||||||
TwoPunctureFILES: TwoPunctures.h
|
TwoPunctureFILES: TwoPunctures.h
|
||||||
|
|
||||||
@@ -159,14 +95,14 @@ $(CUDAFILES): bssn_gpu.h gpu_mem.h gpu_rhsSS_mem.h
|
|||||||
misc.o : zbesh.o
|
misc.o : zbesh.o
|
||||||
|
|
||||||
# projects
|
# projects
|
||||||
ABE: $(C++FILES) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS)
|
ABE: $(C++FILES) $(F90FILES) $(F77FILES) $(AHFDOBJS)
|
||||||
$(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(LDLIBS)
|
$(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(LDLIBS)
|
||||||
|
|
||||||
ABEGPU: $(C++FILES_GPU) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES)
|
ABEGPU: $(C++FILES_GPU) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES)
|
||||||
$(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES_GPU) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) $(LDLIBS)
|
$(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES_GPU) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) $(LDLIBS)
|
||||||
|
|
||||||
TwoPunctureABE: $(TwoPunctureFILES)
|
TwoPunctureABE: $(TwoPunctureFILES)
|
||||||
$(CLINKER) $(TP_OPTFLAGS) -qopenmp -o $@ $(TwoPunctureFILES) $(LDLIBS)
|
$(CLINKER) $(CXXAPPFLAGS) -qopenmp -o $@ $(TwoPunctureFILES) $(LDLIBS)
|
||||||
|
|
||||||
clean:
|
clean:
|
||||||
rm *.o ABE ABEGPU TwoPunctureABE make.log -f
|
rm *.o ABE ABEGPU TwoPunctureABE make.log -f
|
||||||
|
|||||||
@@ -8,31 +8,18 @@ filein = -I/usr/include/ -I${MKLROOT}/include
|
|||||||
|
|
||||||
## Using sequential MKL (OpenMP disabled for better single-threaded performance)
|
## Using sequential MKL (OpenMP disabled for better single-threaded performance)
|
||||||
## Added -lifcore for Intel Fortran runtime and -limf for Intel math library
|
## Added -lifcore for Intel Fortran runtime and -limf for Intel math library
|
||||||
LDLIBS = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore -limf -lpthread -lm -ldl -liomp5
|
LDLIBS = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore -limf -lpthread -lm -ldl
|
||||||
|
|
||||||
## PGO build mode switch (ABE only; TwoPunctureABE always uses opt flags)
|
## Aggressive optimization flags + PGO Phase 2 (profile-guided optimization)
|
||||||
## opt : (default) maximum performance with PGO profile-guided optimization
|
## -fprofile-instr-use: use collected profile data to guide optimization decisions
|
||||||
## instrument : PGO Phase 1 instrumentation to collect fresh profile data
|
## (branch prediction, basic block layout, inlining, loop unrolling)
|
||||||
PGO_MODE ?= opt
|
PROFDATA = ../../pgo_profile/default.profdata
|
||||||
|
CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
|
||||||
## Interp_Points load balance profiling mode
|
-fprofile-instr-use=$(PROFDATA) \
|
||||||
## off : (default) no load balance instrumentation
|
-Dfortran3 -Dnewc -I${MKLROOT}/include
|
||||||
## profile : Pass 1 — instrument Interp_Points to collect timing profile
|
f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \
|
||||||
## optimize : Pass 2 — read profile and apply block rebalancing
|
-fprofile-instr-use=$(PROFDATA) \
|
||||||
INTERP_LB_MODE ?= off
|
-align array64byte -fpp -I${MKLROOT}/include
|
||||||
|
|
||||||
ifeq ($(INTERP_LB_MODE),profile)
|
|
||||||
INTERP_LB_FLAGS = -DINTERP_LB_PROFILE
|
|
||||||
else ifeq ($(INTERP_LB_MODE),optimize)
|
|
||||||
INTERP_LB_FLAGS = -DINTERP_LB_OPTIMIZE
|
|
||||||
else
|
|
||||||
INTERP_LB_FLAGS =
|
|
||||||
endif
|
|
||||||
|
|
||||||
## Kernel implementation switch
|
|
||||||
## 1 (default) : use C++ rewrite of bssn_rhs and helper kernels (faster)
|
|
||||||
## 0 : fall back to original Fortran kernels
|
|
||||||
USE_CXX_KERNELS ?= 1
|
|
||||||
f90 = ifx
|
f90 = ifx
|
||||||
f77 = ifx
|
f77 = ifx
|
||||||
CXX = icpx
|
CXX = icpx
|
||||||
|
|||||||
@@ -1,146 +0,0 @@
|
|||||||
#ifndef SHARE_FUNC_H
|
|
||||||
#define SHARE_FUNC_H
|
|
||||||
|
|
||||||
#include <stdlib.h>
|
|
||||||
#include <stddef.h>
|
|
||||||
#include <math.h>
|
|
||||||
#include <stdio.h>
|
|
||||||
/* 主网格:0-based -> 1D */
|
|
||||||
static inline size_t idx_ex(int i0, int j0, int k0, const int ex[3]) {
|
|
||||||
const int ex1 = ex[0], ex2 = ex[1];
|
|
||||||
return (size_t)i0 + (size_t)j0 * (size_t)ex1 + (size_t)k0 * (size_t)ex1 * (size_t)ex2;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* fh 对应 Fortran: fh(-1:ex1, -1:ex2, -1:ex3)
|
|
||||||
* ord=2 => shift=1
|
|
||||||
* iF/jF/kF 为 Fortran 索引(可为 -1,0,1..ex)
|
|
||||||
*/
|
|
||||||
static inline size_t idx_fh_F_ord2(int iF, int jF, int kF, const int ex[3]) {
|
|
||||||
const int shift = 1;
|
|
||||||
const int nx = ex[0] + 2; // ex1 + ord
|
|
||||||
const int ny = ex[1] + 2;
|
|
||||||
|
|
||||||
const int ii = iF + shift; // 0..ex1+1
|
|
||||||
const int jj = jF + shift; // 0..ex2+1
|
|
||||||
const int kk = kF + shift; // 0..ex3+1
|
|
||||||
|
|
||||||
return (size_t)ii + (size_t)jj * (size_t)nx + (size_t)kk * (size_t)nx * (size_t)ny;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* fh 对应 Fortran: fh(-2:ex1, -2:ex2, -2:ex3)
|
|
||||||
* ord=3 => shift=2
|
|
||||||
* iF/jF/kF 是 Fortran 索引(可为负)
|
|
||||||
*/
|
|
||||||
static inline size_t idx_fh_F(int iF, int jF, int kF, const int ex[3]) {
|
|
||||||
const int shift = 2; // ord=3 -> -2..ex
|
|
||||||
const int nx = ex[0] + 3; // ex1 + ord
|
|
||||||
const int ny = ex[1] + 3;
|
|
||||||
|
|
||||||
const int ii = iF + shift; // 0..ex1+2
|
|
||||||
const int jj = jF + shift; // 0..ex2+2
|
|
||||||
const int kk = kF + shift; // 0..ex3+2
|
|
||||||
|
|
||||||
return (size_t)ii + (size_t)jj * (size_t)nx + (size_t)kk * (size_t)nx * (size_t)ny;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* func: (1..extc1, 1..extc2, 1..extc3) 1-based in Fortran
|
|
||||||
* funcc: (-ord+1..extc1, -ord+1..extc2, -ord+1..extc3) in Fortran
|
|
||||||
*
|
|
||||||
* C 里我们把:
|
|
||||||
* func 视为 0-based: i0=0..extc1-1, j0=0..extc2-1, k0=0..extc3-1
|
|
||||||
* funcc 用“平移下标”存为一维数组:
|
|
||||||
* iF in [-ord+1..extc1] -> ii = iF + (ord-1) in [0..extc1+ord-1]
|
|
||||||
* 总长度 nx = extc1 + ord
|
|
||||||
* 同理 ny = extc2 + ord, nz = extc3 + ord
|
|
||||||
*/
|
|
||||||
|
|
||||||
static inline size_t idx_func0(int i0, int j0, int k0, const int extc[3]) {
|
|
||||||
const int nx = extc[0], ny = extc[1];
|
|
||||||
return (size_t)i0 + (size_t)j0 * (size_t)nx + (size_t)k0 * (size_t)nx * (size_t)ny;
|
|
||||||
}
|
|
||||||
|
|
||||||
static inline size_t idx_funcc_F(int iF, int jF, int kF, int ord, const int extc[3]) {
|
|
||||||
const int shift = ord - 1; // iF = -shift .. extc1
|
|
||||||
const int nx = extc[0] + ord; // [-shift..extc1] 共 extc1+ord 个
|
|
||||||
const int ny = extc[1] + ord;
|
|
||||||
|
|
||||||
const int ii = iF + shift; // 0..extc1+shift
|
|
||||||
const int jj = jF + shift; // 0..extc2+shift
|
|
||||||
const int kk = kF + shift; // 0..extc3+shift
|
|
||||||
|
|
||||||
return (size_t)ii + (size_t)jj * (size_t)nx + (size_t)kk * (size_t)nx * (size_t)ny;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* 等价于 Fortran:
|
|
||||||
* funcc(1:extc1,1:extc2,1:extc3)=func
|
|
||||||
* do i=0,ord-1
|
|
||||||
* funcc(-i,1:extc2,1:extc3) = funcc(i+1,1:extc2,1:extc3)*SoA(1)
|
|
||||||
* enddo
|
|
||||||
* do i=0,ord-1
|
|
||||||
* funcc(:,-i,1:extc3) = funcc(:,i+1,1:extc3)*SoA(2)
|
|
||||||
* enddo
|
|
||||||
* do i=0,ord-1
|
|
||||||
* funcc(:,:,-i) = funcc(:,:,i+1)*SoA(3)
|
|
||||||
* enddo
|
|
||||||
*/
|
|
||||||
static inline void symmetry_bd(int ord,
|
|
||||||
const int extc[3],
|
|
||||||
const double *func,
|
|
||||||
double *funcc,
|
|
||||||
const double SoA[3])
|
|
||||||
{
|
|
||||||
const int extc1 = extc[0], extc2 = extc[1], extc3 = extc[2];
|
|
||||||
|
|
||||||
// 1) funcc(1:extc1,1:extc2,1:extc3) = func
|
|
||||||
// Fortran 的 (iF=1..extc1) 对应 C 的 func(i0=0..extc1-1)
|
|
||||||
for (int k0 = 0; k0 < extc3; ++k0) {
|
|
||||||
for (int j0 = 0; j0 < extc2; ++j0) {
|
|
||||||
for (int i0 = 0; i0 < extc1; ++i0) {
|
|
||||||
const int iF = i0 + 1, jF = j0 + 1, kF = k0 + 1;
|
|
||||||
funcc[idx_funcc_F(iF, jF, kF, ord, extc)] = func[idx_func0(i0, j0, k0, extc)];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// 2) do i=0..ord-1: funcc(-i, 1:extc2, 1:extc3) = funcc(i+1, ...)*SoA(1)
|
|
||||||
for (int ii = 0; ii <= ord - 1; ++ii) {
|
|
||||||
const int iF_dst = -ii; // 0, -1, -2, ...
|
|
||||||
const int iF_src = ii + 1; // 1, 2, 3, ...
|
|
||||||
for (int kF = 1; kF <= extc3; ++kF) {
|
|
||||||
for (int jF = 1; jF <= extc2; ++jF) {
|
|
||||||
funcc[idx_funcc_F(iF_dst, jF, kF, ord, extc)] =
|
|
||||||
funcc[idx_funcc_F(iF_src, jF, kF, ord, extc)] * SoA[0];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// 3) do i=0..ord-1: funcc(:,-i, 1:extc3) = funcc(:, i+1, 1:extc3)*SoA(2)
|
|
||||||
// 注意 Fortran 这里的 ":" 表示 iF 从 (-ord+1..extc1) 全覆盖
|
|
||||||
for (int jj = 0; jj <= ord - 1; ++jj) {
|
|
||||||
const int jF_dst = -jj;
|
|
||||||
const int jF_src = jj + 1;
|
|
||||||
for (int kF = 1; kF <= extc3; ++kF) {
|
|
||||||
for (int iF = -ord + 1; iF <= extc1; ++iF) {
|
|
||||||
funcc[idx_funcc_F(iF, jF_dst, kF, ord, extc)] =
|
|
||||||
funcc[idx_funcc_F(iF, jF_src, kF, ord, extc)] * SoA[1];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// 4) do i=0..ord-1: funcc(:,:,-i) = funcc(:,:, i+1)*SoA(3)
|
|
||||||
for (int kk = 0; kk <= ord - 1; ++kk) {
|
|
||||||
const int kF_dst = -kk;
|
|
||||||
const int kF_src = kk + 1;
|
|
||||||
for (int jF = -ord + 1; jF <= extc2; ++jF) {
|
|
||||||
for (int iF = -ord + 1; iF <= extc1; ++iF) {
|
|
||||||
funcc[idx_funcc_F(iF, jF, kF_dst, ord, extc)] =
|
|
||||||
funcc[idx_funcc_F(iF, jF, kF_src, ord, extc)] * SoA[2];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
@@ -11,6 +11,8 @@
|
|||||||
#include <strstream>
|
#include <strstream>
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
#include <map>
|
#include <map>
|
||||||
|
#include <vector>
|
||||||
|
#include <algorithm>
|
||||||
using namespace std;
|
using namespace std;
|
||||||
#else
|
#else
|
||||||
#include <iostream.h>
|
#include <iostream.h>
|
||||||
@@ -238,9 +240,6 @@ void surface_integral::surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *
|
|||||||
shellf = new double[n_tot * InList];
|
shellf = new double[n_tot * InList];
|
||||||
|
|
||||||
GH->PatL[lev]->data->Interp_Points(DG_List, n_tot, pox, shellf, Symmetry, Nmin, Nmax);
|
GH->PatL[lev]->data->Interp_Points(DG_List, n_tot, pox, shellf, Symmetry, Nmin, Nmax);
|
||||||
|
|
||||||
//|~~~~~> Integrate the dot product of Dphi with the surface normal.
|
|
||||||
|
|
||||||
double *RP_out, *IP_out;
|
double *RP_out, *IP_out;
|
||||||
RP_out = new double[NN];
|
RP_out = new double[NN];
|
||||||
IP_out = new double[NN];
|
IP_out = new double[NN];
|
||||||
|
|||||||
@@ -1,27 +0,0 @@
|
|||||||
#include "share_func.h"
|
|
||||||
void fdderivs(const int ex[3],
|
|
||||||
const double *f,
|
|
||||||
double *fxx, double *fxy, double *fxz,
|
|
||||||
double *fyy, double *fyz, double *fzz,
|
|
||||||
const double *X, const double *Y, const double *Z,
|
|
||||||
double SYM1, double SYM2, double SYM3,
|
|
||||||
int Symmetry, int onoff);
|
|
||||||
|
|
||||||
void fderivs(const int ex[3],
|
|
||||||
const double *f,
|
|
||||||
double *fx, double *fy, double *fz,
|
|
||||||
const double *X, const double *Y, const double *Z,
|
|
||||||
double SYM1, double SYM2, double SYM3,
|
|
||||||
int Symmetry, int onoff);
|
|
||||||
|
|
||||||
void kodis(const int ex[3],
|
|
||||||
const double *X, const double *Y, const double *Z,
|
|
||||||
const double *f, double *f_rhs,
|
|
||||||
const double SoA[3],
|
|
||||||
int Symmetry, double eps);
|
|
||||||
|
|
||||||
void lopsided(const int ex[3],
|
|
||||||
const double *X, const double *Y, const double *Z,
|
|
||||||
const double *f, double *f_rhs,
|
|
||||||
const double *Sfx, const double *Sfy, const double *Sfz,
|
|
||||||
int Symmetry, const double SoA[3]);
|
|
||||||
@@ -1,72 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
"""Convert interp_lb_profile.bin to a C header for compile-time embedding."""
|
|
||||||
import struct, sys
|
|
||||||
|
|
||||||
if len(sys.argv) < 3:
|
|
||||||
print(f"Usage: {sys.argv[0]} <profile.bin> <output.h>")
|
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
with open(sys.argv[1], 'rb') as f:
|
|
||||||
magic, version, nprocs, num_heavy = struct.unpack('IIii', f.read(16))
|
|
||||||
threshold = struct.unpack('d', f.read(8))[0]
|
|
||||||
times = list(struct.unpack(f'{nprocs}d', f.read(nprocs * 8)))
|
|
||||||
heavy = list(struct.unpack(f'{num_heavy}i', f.read(num_heavy * 4)))
|
|
||||||
|
|
||||||
# For each heavy rank, compute split: left half -> lighter neighbor, right half -> heavy rank
|
|
||||||
# (or vice versa depending on which neighbor is lighter)
|
|
||||||
splits = []
|
|
||||||
for hr in heavy:
|
|
||||||
prev_t = times[hr - 1] if hr > 0 else 1e30
|
|
||||||
next_t = times[hr + 1] if hr < nprocs - 1 else 1e30
|
|
||||||
if prev_t <= next_t:
|
|
||||||
splits.append((hr, hr - 1, hr)) # (block_id, r_left, r_right)
|
|
||||||
else:
|
|
||||||
splits.append((hr, hr, hr + 1))
|
|
||||||
|
|
||||||
# Also remap the displaced neighbor blocks
|
|
||||||
remaps = {}
|
|
||||||
for hr, r_l, r_r in splits:
|
|
||||||
if r_l != hr:
|
|
||||||
# We took r_l's slot, so remap block r_l to its other neighbor
|
|
||||||
displaced = r_l
|
|
||||||
if displaced > 0 and displaced - 1 not in [s[0] for s in splits]:
|
|
||||||
remaps[displaced] = displaced - 1
|
|
||||||
elif displaced < nprocs - 1:
|
|
||||||
remaps[displaced] = displaced + 1
|
|
||||||
else:
|
|
||||||
displaced = r_r
|
|
||||||
if displaced < nprocs - 1 and displaced + 1 not in [s[0] for s in splits]:
|
|
||||||
remaps[displaced] = displaced + 1
|
|
||||||
elif displaced > 0:
|
|
||||||
remaps[displaced] = displaced - 1
|
|
||||||
|
|
||||||
with open(sys.argv[2], 'w') as out:
|
|
||||||
out.write("/* Auto-generated from interp_lb_profile.bin — do not edit */\n")
|
|
||||||
out.write("#ifndef INTERP_LB_PROFILE_DATA_H\n")
|
|
||||||
out.write("#define INTERP_LB_PROFILE_DATA_H\n\n")
|
|
||||||
out.write(f"#define INTERP_LB_NPROCS {nprocs}\n")
|
|
||||||
out.write(f"#define INTERP_LB_NUM_HEAVY {num_heavy}\n\n")
|
|
||||||
out.write(f"static const int interp_lb_heavy_blocks[{num_heavy}] = {{")
|
|
||||||
out.write(", ".join(str(h) for h in heavy))
|
|
||||||
out.write("};\n\n")
|
|
||||||
out.write("/* Split table: {block_id, r_left, r_right} */\n")
|
|
||||||
out.write(f"static const int interp_lb_splits[{num_heavy}][3] = {{\n")
|
|
||||||
for bid, rl, rr in splits:
|
|
||||||
out.write(f" {{{bid}, {rl}, {rr}}},\n")
|
|
||||||
out.write("};\n\n")
|
|
||||||
out.write("/* Rank remap for displaced neighbor blocks */\n")
|
|
||||||
out.write(f"static const int interp_lb_num_remaps = {len(remaps)};\n")
|
|
||||||
out.write(f"static const int interp_lb_remaps[][2] = {{\n")
|
|
||||||
for src, dst in sorted(remaps.items()):
|
|
||||||
out.write(f" {{{src}, {dst}}},\n")
|
|
||||||
if not remaps:
|
|
||||||
out.write(" {-1, -1},\n")
|
|
||||||
out.write("};\n\n")
|
|
||||||
out.write("#endif /* INTERP_LB_PROFILE_DATA_H */\n")
|
|
||||||
|
|
||||||
print(f"Generated {sys.argv[2]}:")
|
|
||||||
print(f" {num_heavy} heavy blocks to split: {heavy}")
|
|
||||||
for bid, rl, rr in splits:
|
|
||||||
print(f" block {bid}: split -> rank {rl} (left), rank {rr} (right)")
|
|
||||||
for src, dst in sorted(remaps.items()):
|
|
||||||
print(f" block {src}: remap -> rank {dst}")
|
|
||||||
@@ -11,46 +11,17 @@
|
|||||||
import AMSS_NCKU_Input as input_data
|
import AMSS_NCKU_Input as input_data
|
||||||
import subprocess
|
import subprocess
|
||||||
import time
|
import time
|
||||||
|
## CPU core binding configuration using taskset
|
||||||
|
## taskset ensures all child processes inherit the CPU affinity mask
|
||||||
|
## This forces make and all compiler processes to use only nohz_full cores (4-55, 60-111)
|
||||||
|
## Format: taskset -c 4-55,60-111 ensures processes only run on these cores
|
||||||
|
#NUMACTL_CPU_BIND = "taskset -c 0-111"
|
||||||
|
NUMACTL_CPU_BIND = "taskset -c 16-47,64-95"
|
||||||
|
|
||||||
|
## Build parallelism configuration
|
||||||
def get_last_n_cores_per_socket(n=32):
|
## Use nohz_full cores (4-55, 60-111) for compilation: 52 + 52 = 104 cores
|
||||||
"""
|
## Set make -j to utilize available cores for faster builds
|
||||||
Read CPU topology via lscpu and return a taskset -c string
|
BUILD_JOBS = 96
|
||||||
selecting the last `n` cores of each NUMA node (socket).
|
|
||||||
|
|
||||||
Example: 2 sockets x 56 cores each, n=32 -> node0: 24-55, node1: 80-111
|
|
||||||
-> "taskset -c 24-55,80-111"
|
|
||||||
"""
|
|
||||||
result = subprocess.run(["lscpu", "--parse=NODE,CPU"], capture_output=True, text=True)
|
|
||||||
|
|
||||||
# Build a dict: node_id -> sorted list of CPU ids
|
|
||||||
node_cpus = {}
|
|
||||||
for line in result.stdout.splitlines():
|
|
||||||
if line.startswith("#") or not line.strip():
|
|
||||||
continue
|
|
||||||
parts = line.split(",")
|
|
||||||
if len(parts) < 2:
|
|
||||||
continue
|
|
||||||
node_id, cpu_id = int(parts[0]), int(parts[1])
|
|
||||||
node_cpus.setdefault(node_id, []).append(cpu_id)
|
|
||||||
|
|
||||||
segments = []
|
|
||||||
for node_id in sorted(node_cpus):
|
|
||||||
cpus = sorted(node_cpus[node_id])
|
|
||||||
selected = cpus[-n:] # last n cores of this socket
|
|
||||||
segments.append(f"{selected[0]}-{selected[-1]}")
|
|
||||||
|
|
||||||
cpu_str = ",".join(segments)
|
|
||||||
total = len(segments) * n
|
|
||||||
print(f" CPU binding: taskset -c {cpu_str} ({total} cores, last {n} per socket)")
|
|
||||||
return f"taskset -c {cpu_str}"
|
|
||||||
|
|
||||||
|
|
||||||
## CPU core binding: dynamically select the last 32 cores of each socket (64 cores total)
|
|
||||||
NUMACTL_CPU_BIND = get_last_n_cores_per_socket(n=32)
|
|
||||||
|
|
||||||
## Build parallelism: match the number of bound cores
|
|
||||||
BUILD_JOBS = 64
|
|
||||||
|
|
||||||
|
|
||||||
##################################################################
|
##################################################################
|
||||||
@@ -69,7 +40,7 @@ def makefile_ABE():
|
|||||||
|
|
||||||
## Build command with CPU binding to nohz_full cores
|
## Build command with CPU binding to nohz_full cores
|
||||||
if (input_data.GPU_Calculation == "no"):
|
if (input_data.GPU_Calculation == "no"):
|
||||||
makefile_command = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} INTERP_LB_MODE=optimize ABE"
|
makefile_command = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} ABE"
|
||||||
elif (input_data.GPU_Calculation == "yes"):
|
elif (input_data.GPU_Calculation == "yes"):
|
||||||
makefile_command = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} ABEGPU"
|
makefile_command = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} ABEGPU"
|
||||||
else:
|
else:
|
||||||
|
|||||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Reference in New Issue
Block a user