diff --git a/AMSS_NCKU_Program.py b/AMSS_NCKU_Program.py index 6a7952a..2d777cd 100755 --- a/AMSS_NCKU_Program.py +++ b/AMSS_NCKU_Program.py @@ -270,6 +270,12 @@ if not os.path.exists( ABE_file ): ## Copy the executable ABE (or ABEGPU) into the run directory shutil.copy2(ABE_file, output_directory) +## Copy interp load balance profile if present (for optimize pass) +interp_lb_profile = os.path.join(AMSS_NCKU_source_copy, "interp_lb_profile.bin") +if os.path.exists(interp_lb_profile): + shutil.copy2(interp_lb_profile, output_directory) + print( " Copied interp_lb_profile.bin to run directory " ) + ########################### ## If the initial-data method is TwoPuncture, copy the TwoPunctureABE executable to the run directory diff --git a/AMSS_NCKU_source/MPatch.C b/AMSS_NCKU_source/MPatch.C index 91ead8a..563bfcc 100644 --- a/AMSS_NCKU_source/MPatch.C +++ b/AMSS_NCKU_source/MPatch.C @@ -13,6 +13,9 @@ using namespace std; #include "MPatch.h" #include "Parallel.h" #include "fmisc.h" +#ifdef INTERP_LB_PROFILE +#include "interp_lb_profile.h" +#endif Patch::Patch(int DIM, int *shapei, double *bboxi, int levi, bool buflog, int Symmetry) : lev(levi) { @@ -507,6 +510,9 @@ void Patch::Interp_Points(MyList *VarList, // Targeted point-to-point overload: each owner sends each point only to // the one rank that needs it for integration (consumer), reducing // communication volume by ~nprocs times compared to the Bcast version. +#ifdef INTERP_LB_PROFILE + double t_interp_start = MPI_Wtime(); +#endif int myrank, nprocs; MPI_Comm_rank(MPI_COMM_WORLD, &myrank); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); @@ -608,6 +614,11 @@ void Patch::Interp_Points(MyList *VarList, } } +#ifdef INTERP_LB_PROFILE + double t_interp_end = MPI_Wtime(); + double t_interp_local = t_interp_end - t_interp_start; +#endif + // --- Error check for unfound points --- for (int j = 0; j < NN; j++) { @@ -764,6 +775,31 @@ void Patch::Interp_Points(MyList *VarList, delete[] recv_count; delete[] consumer_rank; delete[] owner_rank; + +#ifdef INTERP_LB_PROFILE + { + static bool profile_written = false; + if (!profile_written) { + double *all_times = nullptr; + if (myrank == 0) all_times = new double[nprocs]; + MPI_Gather(&t_interp_local, 1, MPI_DOUBLE, + all_times, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); + if (myrank == 0) { + int heavy[64]; + int nh = InterpLBProfile::identify_heavy_ranks( + all_times, nprocs, 2.5, heavy, 64); + InterpLBProfile::write_profile( + "interp_lb_profile.bin", nprocs, + all_times, heavy, nh, 2.5); + printf("[InterpLB] Profile written: %d heavy ranks\n", nh); + for (int i = 0; i < nh; i++) + printf(" Heavy rank %d: %.6f s\n", heavy[i], all_times[heavy[i]]); + delete[] all_times; + } + profile_written = true; + } + } +#endif } void Patch::Interp_Points(MyList *VarList, int NN, double **XX, diff --git a/AMSS_NCKU_source/Parallel.C b/AMSS_NCKU_source/Parallel.C index a9fb3cd..bd591c4 100644 --- a/AMSS_NCKU_source/Parallel.C +++ b/AMSS_NCKU_source/Parallel.C @@ -462,7 +462,7 @@ MyList *Parallel::distribute(MyList *PatchLIST, int cpusize, int i } } #else - ng = ng0 = new Block(dim, shape_here, bbox_here, n_rank++, ingfsi, fngfsi, PP->lev); // delete through KillBlocks + ng = ng0 = new Block(dim, shape_here, bbox_here, n_rank++, ingfsi, fngfsi, PP->lev); // ng->checkBlock(); if (BlL) BlL->insert(ng); @@ -500,6 +500,384 @@ MyList *Parallel::distribute(MyList *PatchLIST, int cpusize, int i return BlL; } + +#ifdef INTERP_LB_OPTIMIZE +#include "interp_lb_profile_data.h" + +MyList *Parallel::distribute_optimize(MyList *PatchLIST, int cpusize, int ingfsi, int fngfsi, + bool periodic, int nodes) +{ +#ifdef USE_GPU_DIVIDE + double cpu_part, gpu_part; + map::iterator iter; + iter = parameters::dou_par.find("cpu part"); + if (iter != parameters::dou_par.end()) + { + cpu_part = iter->second; + } + else + { + int myrank; + MPI_Comm_rank(MPI_COMM_WORLD, &myrank); + const int LEN = 256; + char pline[LEN]; + string str, sgrp, skey, sval; + int sind; + char pname[50]; + { + map::iterator iter = parameters::str_par.find("inputpar"); + if (iter != parameters::str_par.end()) + strcpy(pname, (iter->second).c_str()); + else { cout << "Error inputpar" << endl; exit(0); } + } + ifstream inf(pname, ifstream::in); + if (!inf.good() && myrank == 0) + { cout << "Can not open parameter file " << pname << endl; MPI_Abort(MPI_COMM_WORLD, 1); } + for (int i = 1; inf.good(); i++) + { + inf.getline(pline, LEN); str = pline; + int status = misc::parse_parts(str, sgrp, skey, sval, sind); + if (status == -1) { cout << "error reading parameter file " << pname << " in line " << i << endl; MPI_Abort(MPI_COMM_WORLD, 1); } + else if (status == 0) continue; + if (sgrp == "ABE") { if (skey == "cpu part") cpu_part = atof(sval.c_str()); } + } + inf.close(); + parameters::dou_par.insert(map::value_type("cpu part", cpu_part)); + } + iter = parameters::dou_par.find("gpu part"); + if (iter != parameters::dou_par.end()) + { + gpu_part = iter->second; + } + else + { + int myrank; + MPI_Comm_rank(MPI_COMM_WORLD, &myrank); + const int LEN = 256; + char pline[LEN]; + string str, sgrp, skey, sval; + int sind; + char pname[50]; + { + map::iterator iter = parameters::str_par.find("inputpar"); + if (iter != parameters::str_par.end()) + strcpy(pname, (iter->second).c_str()); + else { cout << "Error inputpar" << endl; exit(0); } + } + ifstream inf(pname, ifstream::in); + if (!inf.good() && myrank == 0) + { cout << "Can not open parameter file " << pname << endl; MPI_Abort(MPI_COMM_WORLD, 1); } + for (int i = 1; inf.good(); i++) + { + inf.getline(pline, LEN); str = pline; + int status = misc::parse_parts(str, sgrp, skey, sval, sind); + if (status == -1) { cout << "error reading parameter file " << pname << " in line " << i << endl; MPI_Abort(MPI_COMM_WORLD, 1); } + else if (status == 0) continue; + if (sgrp == "ABE") { if (skey == "gpu part") gpu_part = atof(sval.c_str()); } + } + inf.close(); + parameters::dou_par.insert(map::value_type("gpu part", gpu_part)); + } + if (nodes == 0) nodes = cpusize / 2; +#else + if (nodes == 0) nodes = cpusize; +#endif + + if (dim != 3) + { + cout << "distrivute: now we only support 3-dimension" << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + + MyList *BlL = 0; + int split_size, min_size, block_size = 0; + int min_width = 2 * Mymax(ghost_width, buffer_width); + int nxyz[dim], mmin_width[dim], min_shape[dim]; + + MyList *PLi = PatchLIST; + for (int i = 0; i < dim; i++) + min_shape[i] = PLi->data->shape[i]; + int lev = PLi->data->lev; + PLi = PLi->next; + while (PLi) + { + Patch *PP = PLi->data; + for (int i = 0; i < dim; i++) + min_shape[i] = Mymin(min_shape[i], PP->shape[i]); + if (lev != PLi->data->lev) + cout << "Parallel::distribute CAUSTION: meet Patches for different level: " << lev << " and " << PLi->data->lev << endl; + PLi = PLi->next; + } + + for (int i = 0; i < dim; i++) + mmin_width[i] = Mymin(min_width, min_shape[i]); + min_size = mmin_width[0]; + for (int i = 1; i < dim; i++) + min_size = min_size * mmin_width[i]; + + PLi = PatchLIST; + while (PLi) + { + Patch *PP = PLi->data; + int bs = PP->shape[0]; + for (int i = 1; i < dim; i++) + bs = bs * PP->shape[i]; + block_size = block_size + bs; + PLi = PLi->next; + } + split_size = Mymax(min_size, block_size / nodes); + split_size = Mymax(1, split_size); + + int n_rank = 0; + PLi = PatchLIST; + int reacpu = 0; + int current_block_id = 0; + while (PLi) { + Block *ng0, *ng; + bool first_block_in_patch = true; + Patch *PP = PLi->data; + reacpu += partition3(nxyz, split_size, mmin_width, nodes, PP->shape); + + for (int i = 0; i < nxyz[0]; i++) + for (int j = 0; j < nxyz[1]; j++) + for (int k = 0; k < nxyz[2]; k++) + { + int ibbox_here[6], shape_here[3]; + double bbox_here[6], dd; + Block *current_ng_start = nullptr; + + bool is_heavy = false; + int r_l = -1, r_r = -1; + if (cpusize == INTERP_LB_NPROCS) { + for (int si = 0; si < INTERP_LB_NUM_HEAVY; si++) { + if (current_block_id == interp_lb_splits[si][0]) { + is_heavy = true; + r_l = interp_lb_splits[si][1]; + r_r = interp_lb_splits[si][2]; + break; + } + } + } + + if (is_heavy) + { + int ib0 = (PP->shape[0] * i) / nxyz[0]; + int ib3 = (PP->shape[0] * (i + 1)) / nxyz[0] - 1; + int jb1 = (PP->shape[1] * j) / nxyz[1]; + int jb4 = (PP->shape[1] * (j + 1)) / nxyz[1] - 1; + int kb2 = (PP->shape[2] * k) / nxyz[2]; + int kb5 = (PP->shape[2] * (k + 1)) / nxyz[2] - 1; + + Block *split_first_block = nullptr; + Block *split_last_block = nullptr; + splitHotspotBlock(BlL, dim, ib0, ib3, jb1, jb4, kb2, kb5, + PP, r_l, r_r, ingfsi, fngfsi, periodic, + split_first_block, split_last_block); + + current_ng_start = split_first_block; + ng = split_last_block; + } + else + { + ibbox_here[0] = (PP->shape[0] * i) / nxyz[0]; + ibbox_here[3] = (PP->shape[0] * (i + 1)) / nxyz[0] - 1; + ibbox_here[1] = (PP->shape[1] * j) / nxyz[1]; + ibbox_here[4] = (PP->shape[1] * (j + 1)) / nxyz[1] - 1; + ibbox_here[2] = (PP->shape[2] * k) / nxyz[2]; + ibbox_here[5] = (PP->shape[2] * (k + 1)) / nxyz[2] - 1; + + if (periodic) { + for(int d=0; d<3; d++) { + ibbox_here[d] -= ghost_width; + ibbox_here[d+3] += ghost_width; + } + } else { + ibbox_here[0] = Mymax(0, ibbox_here[0] - ghost_width); + ibbox_here[3] = Mymin(PP->shape[0] - 1, ibbox_here[3] + ghost_width); + ibbox_here[1] = Mymax(0, ibbox_here[1] - ghost_width); + ibbox_here[4] = Mymin(PP->shape[1] - 1, ibbox_here[4] + ghost_width); + ibbox_here[2] = Mymax(0, ibbox_here[2] - ghost_width); + ibbox_here[5] = Mymin(PP->shape[2] - 1, ibbox_here[5] + ghost_width); + } + + for(int d=0; d<3; d++) shape_here[d] = ibbox_here[d+3] - ibbox_here[d] + 1; + +#ifdef Vertex +#ifdef Cell +#error Both Cell and Vertex are defined +#endif + dd = (PP->bbox[3] - PP->bbox[0]) / (PP->shape[0] - 1); + bbox_here[0] = PP->bbox[0] + ibbox_here[0] * dd; + bbox_here[3] = PP->bbox[0] + ibbox_here[3] * dd; + dd = (PP->bbox[4] - PP->bbox[1]) / (PP->shape[1] - 1); + bbox_here[1] = PP->bbox[1] + ibbox_here[1] * dd; + bbox_here[4] = PP->bbox[1] + ibbox_here[4] * dd; + dd = (PP->bbox[5] - PP->bbox[2]) / (PP->shape[2] - 1); + bbox_here[2] = PP->bbox[2] + ibbox_here[2] * dd; + bbox_here[5] = PP->bbox[2] + ibbox_here[5] * dd; +#else +#ifdef Cell + dd = (PP->bbox[3] - PP->bbox[0]) / PP->shape[0]; + bbox_here[0] = PP->bbox[0] + (ibbox_here[0]) * dd; + bbox_here[3] = PP->bbox[0] + (ibbox_here[3] + 1) * dd; + dd = (PP->bbox[4] - PP->bbox[1]) / PP->shape[1]; + bbox_here[1] = PP->bbox[1] + (ibbox_here[1]) * dd; + bbox_here[4] = PP->bbox[1] + (ibbox_here[4] + 1) * dd; + dd = (PP->bbox[5] - PP->bbox[2]) / PP->shape[2]; + bbox_here[2] = PP->bbox[2] + (ibbox_here[2]) * dd; + bbox_here[5] = PP->bbox[2] + (ibbox_here[5] + 1) * dd; +#else +#error Not define Vertex nor Cell +#endif +#endif + ng = createMappedBlock(BlL, dim, shape_here, bbox_here, + current_block_id, ingfsi, fngfsi, PP->lev); + current_ng_start = ng; + } + + if (first_block_in_patch) { + ng0 = current_ng_start; + MyList *Bp_start = BlL; + while (Bp_start && Bp_start->data != ng0) Bp_start = Bp_start->next; + PP->blb = Bp_start; + first_block_in_patch = false; + } + + current_block_id++; + } + + { + MyList *Bp_end = BlL; + while (Bp_end && Bp_end->data != ng) Bp_end = Bp_end->next; + PP->ble = Bp_end; + } + + PLi = PLi->next; + } + if (reacpu < nodes * 2 / 3) + { + int myrank; + MPI_Comm_rank(MPI_COMM_WORLD, &myrank); + if (myrank == 0) + cout << "Parallel::distribute CAUSTION: level#" << lev << " uses essencially " << reacpu << " processors vs " << nodes << " nodes run, your scientific computation scale is not as large as you estimate." << endl; + } + + return BlL; +} + +Block* Parallel::splitHotspotBlock(MyList* &BlL, int _dim, + int ib0_orig, int ib3_orig, + int jb1_orig, int jb4_orig, + int kb2_orig, int kb5_orig, + Patch* PP, int r_left, int r_right, + int ingfsi, int fngfsi, bool periodic, + Block* &split_first_block, Block* &split_last_block) +{ + int mid = (ib0_orig + ib3_orig) / 2; + + int indices_L[6] = {ib0_orig, jb1_orig, kb2_orig, mid, jb4_orig, kb5_orig}; + int indices_R[6] = {mid + 1, jb1_orig, kb2_orig, ib3_orig, jb4_orig, kb5_orig}; + + auto createSubBlock = [&](int* ib_raw, int target_rank) { + int ib_final[6]; + int sh_here[3]; + double bb_here[6], dd; + + if (periodic) { + ib_final[0] = ib_raw[0] - ghost_width; + ib_final[3] = ib_raw[3] + ghost_width; + ib_final[1] = ib_raw[1] - ghost_width; + ib_final[4] = ib_raw[4] + ghost_width; + ib_final[2] = ib_raw[2] - ghost_width; + ib_final[5] = ib_raw[5] + ghost_width; + } else { + ib_final[0] = Mymax(0, ib_raw[0] - ghost_width); + ib_final[3] = Mymin(PP->shape[0] - 1, ib_raw[3] + ghost_width); + ib_final[1] = Mymax(0, ib_raw[1] - ghost_width); + ib_final[4] = Mymin(PP->shape[1] - 1, ib_raw[4] + ghost_width); + ib_final[2] = Mymax(0, ib_raw[2] - ghost_width); + ib_final[5] = Mymin(PP->shape[2] - 1, ib_raw[5] + ghost_width); + } + + sh_here[0] = ib_final[3] - ib_final[0] + 1; + sh_here[1] = ib_final[4] - ib_final[1] + 1; + sh_here[2] = ib_final[5] - ib_final[2] + 1; + +#ifdef Vertex + dd = (PP->bbox[3] - PP->bbox[0]) / (PP->shape[0] - 1); + bb_here[0] = PP->bbox[0] + ib_final[0] * dd; + bb_here[3] = PP->bbox[0] + ib_final[3] * dd; + dd = (PP->bbox[4] - PP->bbox[1]) / (PP->shape[1] - 1); + bb_here[1] = PP->bbox[1] + ib_final[1] * dd; + bb_here[4] = PP->bbox[1] + ib_final[4] * dd; + dd = (PP->bbox[5] - PP->bbox[2]) / (PP->shape[2] - 1); + bb_here[2] = PP->bbox[2] + ib_final[2] * dd; + bb_here[5] = PP->bbox[2] + ib_final[5] * dd; +#else +#ifdef Cell + dd = (PP->bbox[3] - PP->bbox[0]) / PP->shape[0]; + bb_here[0] = PP->bbox[0] + ib_final[0] * dd; + bb_here[3] = PP->bbox[0] + (ib_final[3] + 1) * dd; + dd = (PP->bbox[4] - PP->bbox[1]) / PP->shape[1]; + bb_here[1] = PP->bbox[1] + ib_final[1] * dd; + bb_here[4] = PP->bbox[1] + (ib_final[4] + 1) * dd; + dd = (PP->bbox[5] - PP->bbox[2]) / PP->shape[2]; + bb_here[2] = PP->bbox[2] + ib_final[2] * dd; + bb_here[5] = PP->bbox[2] + (ib_final[5] + 1) * dd; +#endif +#endif + + Block* Bg = new Block(dim, sh_here, bb_here, target_rank, ingfsi, fngfsi, PP->lev); + if (BlL) BlL->insert(Bg); + else BlL = new MyList(Bg); + + return Bg; + }; + + split_first_block = createSubBlock(indices_L, r_left); + split_last_block = createSubBlock(indices_R, r_right); + return split_last_block; +} + +Block* Parallel::createMappedBlock(MyList* &BlL, int _dim, int* shape, double* bbox, + int block_id, int ingfsi, int fngfsi, int lev) +{ + int target_rank = block_id; + if (INTERP_LB_NPROCS > 0) { + for (int ri = 0; ri < interp_lb_num_remaps; ri++) { + if (block_id == interp_lb_remaps[ri][0]) { + target_rank = interp_lb_remaps[ri][1]; + break; + } + } + } + + Block* ng = new Block(dim, shape, bbox, target_rank, ingfsi, fngfsi, lev); + if (BlL) BlL->insert(ng); + else BlL = new MyList(ng); + + return ng; +} +#else +// When INTERP_LB_OPTIMIZE is not defined, distribute_optimize falls back to distribute +MyList *Parallel::distribute_optimize(MyList *PatchLIST, int cpusize, int ingfsi, int fngfsi, + bool periodic, int nodes) +{ + return distribute(PatchLIST, cpusize, ingfsi, fngfsi, periodic, nodes); +} +Block* Parallel::splitHotspotBlock(MyList* &BlL, int _dim, + int ib0_orig, int ib3_orig, + int jb1_orig, int jb4_orig, + int kb2_orig, int kb5_orig, + Patch* PP, int r_left, int r_right, + int ingfsi, int fngfsi, bool periodic, + Block* &split_first_block, Block* &split_last_block) +{ return nullptr; } +Block* Parallel::createMappedBlock(MyList* &BlL, int _dim, int* shape, double* bbox, + int block_id, int ingfsi, int fngfsi, int lev) +{ return nullptr; } +#endif + #elif (PSTR == 1 || PSTR == 2 || PSTR == 3) MyList *Parallel::distribute(MyList *PatchLIST, int cpusize, int ingfsi, int fngfsi, bool periodic, int start_rank, int end_rank, int nodes) diff --git a/AMSS_NCKU_source/Parallel.h b/AMSS_NCKU_source/Parallel.h index a6ef351..e17f365 100644 --- a/AMSS_NCKU_source/Parallel.h +++ b/AMSS_NCKU_source/Parallel.h @@ -32,6 +32,16 @@ namespace Parallel int partition2(int *nxy, int split_size, int *min_width, int cpusize, int *shape); // special for 2 diemnsions int partition3(int *nxyz, int split_size, int *min_width, int cpusize, int *shape); MyList *distribute(MyList *PatchLIST, int cpusize, int ingfsi, int fngfs, bool periodic, int nodes = 0); // produce corresponding Blocks + MyList *distribute_optimize(MyList *PatchLIST, int cpusize, int ingfsi, int fngfs, bool periodic, int nodes = 0); + Block* splitHotspotBlock(MyList* &BlL, int _dim, + int ib0_orig, int ib3_orig, + int jb1_orig, int jb4_orig, + int kb2_orig, int kb5_orig, + Patch* PP, int r_left, int r_right, + int ingfsi, int fngfsi, bool periodic, + Block* &split_first_block, Block* &split_last_block); + Block* createMappedBlock(MyList* &BlL, int _dim, int* shape, double* bbox, + int block_id, int ingfsi, int fngfsi, int lev); void KillBlocks(MyList *PatchLIST); void setfunction(MyList *BlL, var *vn, double func(double x, double y, double z)); diff --git a/AMSS_NCKU_source/bssn_class.C b/AMSS_NCKU_source/bssn_class.C index 18f1388..eb84f8e 100644 --- a/AMSS_NCKU_source/bssn_class.C +++ b/AMSS_NCKU_source/bssn_class.C @@ -2426,9 +2426,9 @@ void bssn_class::RecursiveStep(int lev) #endif #if (REGLEV == 0) - GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0, + if (GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0, SynchList_cor, OldStateList, StateList, SynchList_pre, - fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor); + fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor)) for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); } #endif } @@ -2605,9 +2605,9 @@ void bssn_class::ParallelStep() delete[] tporg; delete[] tporgo; #if (REGLEV == 0) - GH->Regrid_Onelevel(GH->mylev, Symmetry, BH_num, Porgbr, Porg0, + if (GH->Regrid_Onelevel(GH->mylev, Symmetry, BH_num, Porgbr, Porg0, SynchList_cor, OldStateList, StateList, SynchList_pre, - fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor); + fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor)) for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); } #endif } @@ -2772,9 +2772,9 @@ void bssn_class::ParallelStep() if (lev + 1 >= GH->movls) { // GH->Regrid_Onelevel_aux(lev,Symmetry,BH_num,Porgbr,Porg0, - GH->Regrid_Onelevel(lev + 1, Symmetry, BH_num, Porgbr, Porg0, + if (GH->Regrid_Onelevel(lev + 1, Symmetry, BH_num, Porgbr, Porg0, SynchList_cor, OldStateList, StateList, SynchList_pre, - fgt(PhysTime - dT_levp1, StartTime, dT_levp1 / 2), ErrorMonitor); + fgt(PhysTime - dT_levp1, StartTime, dT_levp1 / 2), ErrorMonitor)) for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); } // a_stream.clear(); @@ -2787,9 +2787,9 @@ void bssn_class::ParallelStep() // for this level if (YN == 1) { - GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0, + if (GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0, SynchList_cor, OldStateList, StateList, SynchList_pre, - fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor); + fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor)) for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); } // a_stream.clear(); @@ -2806,9 +2806,9 @@ void bssn_class::ParallelStep() if (YN == 1) { // GH->Regrid_Onelevel_aux(lev-2,Symmetry,BH_num,Porgbr,Porg0, - GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0, + if (GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0, SynchList_cor, OldStateList, StateList, SynchList_pre, - fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor); + fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor)) for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); } // a_stream.clear(); @@ -2822,9 +2822,9 @@ void bssn_class::ParallelStep() if (i % 4 == 3) { // GH->Regrid_Onelevel_aux(lev-2,Symmetry,BH_num,Porgbr,Porg0, - GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0, + if (GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0, SynchList_cor, OldStateList, StateList, SynchList_pre, - fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor); + fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor)) for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); } // a_stream.clear(); diff --git a/AMSS_NCKU_source/bssn_rhs_c.C b/AMSS_NCKU_source/bssn_rhs_c.C new file mode 100644 index 0000000..1086026 --- /dev/null +++ b/AMSS_NCKU_source/bssn_rhs_c.C @@ -0,0 +1,1265 @@ +#include "macrodef.h" +#include "bssn_rhs.h" +#include "share_func.h" +#include "tool.h" +// 0-based i,j,k +// #define IDX_F(i,j,k,nx,ny) ((i) + (j)*(nx) + (k)*(nx)*(ny)) +// ex(1)=nx, ex(2)=ny, ex(3)=nz + +// 用法:a[ IDX_F(i,j,k,nx,ny) ] + +// C function that calculates the right-hand side for BSSN equations +int f_compute_rhs_bssn(int *ex, double &T, + double *X, double *Y, double *Z, + double *chi, double *trK, + double *dxx, double *gxy, double *gxz, double *dyy, double *gyz, double *dzz, + double *Axx, double *Axy, double *Axz, double *Ayy, double *Ayz, double *Azz, + double *Gamx, double *Gamy, double *Gamz, + double *Lap, double *betax, double *betay, double *betaz, + double *dtSfx, double *dtSfy, double *dtSfz, + double *chi_rhs, double *trK_rhs, + double *gxx_rhs, double *gxy_rhs, double *gxz_rhs, double *gyy_rhs, double *gyz_rhs, double *gzz_rhs, + double *Axx_rhs, double *Axy_rhs, double *Axz_rhs, double *Ayy_rhs, double *Ayz_rhs, double *Azz_rhs, + double *Gamx_rhs, double *Gamy_rhs, double *Gamz_rhs, + double *Lap_rhs, double *betax_rhs, double *betay_rhs, double *betaz_rhs, + double *dtSfx_rhs, double *dtSfy_rhs, double *dtSfz_rhs, + double *rho, double *Sx, double *Sy, double *Sz, + double *Sxx, double *Sxy, double *Sxz, double *Syy, double *Syz, double *Szz, + double *Gamxxx, double *Gamxxy, double *Gamxxz, double *Gamxyy, double *Gamxyz, double *Gamxzz, + double *Gamyxx, double *Gamyxy, double *Gamyxz, double *Gamyyy, double *Gamyyz, double *Gamyzz, + double *Gamzxx, double *Gamzxy, double *Gamzxz, double *Gamzyy, double *Gamzyz, double *Gamzzz, + double *Rxx, double *Rxy, double *Rxz, double *Ryy, double *Ryz, double *Rzz, + double *ham_Res, double *movx_Res, double *movy_Res, double *movz_Res, + double *Gmx_Res, double *Gmy_Res, double *Gmz_Res, + int &Symmetry, int &Lev, double &eps, int &co + ) // return gont +{ + int nx = ex[0], ny = ex[1], nz=ex[2]; + int all = nx*ny*nz; + // printf("nx=%d ny=%d nz=%d all=%d\n", nx, ny, nz, all); + + // temp variable + double gxx[all],gyy[all],gzz[all]; + double chix[all],chiy[all],chiz[all]; + double gxxx[all],gxyx[all],gxzx[all],gyyx[all],gyzx[all],gzzx[all]; + double gxxy[all],gxyy[all],gxzy[all],gyyy[all],gyzy[all],gzzy[all]; + double gxxz[all],gxyz[all],gxzz[all],gyyz[all],gyzz[all],gzzz[all]; + double Lapx[all], Lapy[all], Lapz[all]; + double betaxx[all], betaxy[all], betaxz[all]; + double betayx[all], betayy[all], betayz[all]; + double betazx[all], betazy[all], betazz[all]; + double Gamxx[all],Gamxy[all],Gamxz[all]; + double Gamyx[all],Gamyy[all],Gamyz[all]; + double Gamzx[all],Gamzy[all],Gamzz[all]; + double Kx[all], Ky[all], Kz[all], div_beta[all], S[all]; + double f[all], fxx[all], fxy[all], fxz[all], fyy[all], fyz[all], fzz[all]; + double Gamxa[all], Gamya[all], Gamza[all], alpn1[all], chin1[all]; + double gupxx[all], gupxy[all], gupxz[all]; + double gupyy[all], gupyz[all], gupzz[all]; + double SSS[3] = { 1.0, 1.0, 1.0}; + double AAS[3] = {-1.0, -1.0, 1.0}; + double ASA[3] = {-1.0, 1.0, -1.0}; + double SAA[3] = { 1.0, -1.0, -1.0}; + double ASS[3] = {-1.0, 1.0, 1.0}; + double SAS[3] = { 1.0, -1.0, 1.0}; + double SSA[3] = { 1.0, 1.0, -1.0}; + double dX, dY, dZ, PI; + const double ZEO = 0.0, ONE = 1.0, TWO = 2.0, FOUR = 4.0; + const double EIGHT = 8.0, HALF = 0.5, THR = 3.0; + const double SYM = 1.0, ANTI = -1.0; + const double FF = 0.75, eta = 2.0; + const double F1o3 = 1.0/3.0, F2o3 = 2.0/3.0, F3o2 = 1.5, F1o6 = 1.0/6.0; + const double F16 = 16.0, F8 = 8.0; + #if (GAUGE == 2 || GAUGE == 3 || GAUGE == 4 || GAUGE == 5 || GAUGE == 6 || GAUGE == 7) + double reta[all]; + /* 使用时:reta[idx],其中 idx = i + nx*(j + ny*k) (Fortran列主序) */ + #endif + + #if (GAUGE == 6 || GAUGE == 7) + int BHN = 0; + double Porg[9] = {0.0}; + double Mass[3] = {0.0}; + + extern "C" { + #ifdef fortran1 + void getpbh(int &, double *, double *); + #elif defined(fortran2) + void GETPBH(int &, double *, double *); + #else + void getpbh_(int &, double *, double *); + #endif + } + + #ifdef fortran1 + getpbh(BHN, Porg, Mass); + #elif defined(fortran2) + GETPBH(BHN, Porg, Mass); + #else + getpbh_(BHN, Porg, Mass); + #endif + #endif + PI = acos(-1.0); + dX = X[1] - X[0]; + dY = Y[1] - Y[0]; + dZ = Z[1] - Z[0]; + + // 1ms // + for(int i=0;i0){ + kodis(ex,X,Y,Z,chi,chi_rhs,SSS,Symmetry,eps); + kodis(ex,X,Y,Z,trK,trK_rhs,SSS,Symmetry,eps); + kodis(ex,X,Y,Z,dxx,gxx_rhs,SSS,Symmetry,eps); + kodis(ex,X,Y,Z,gxy,gxy_rhs,AAS,Symmetry,eps); + kodis(ex,X,Y,Z,gxz,gxz_rhs,ASA,Symmetry,eps); + kodis(ex,X,Y,Z,dyy,gyy_rhs,SSS,Symmetry,eps); + kodis(ex,X,Y,Z,gyz,gyz_rhs,SAA,Symmetry,eps); + kodis(ex,X,Y,Z,dzz,gzz_rhs,SSS,Symmetry,eps); + kodis(ex,X,Y,Z,Axx,Axx_rhs,SSS,Symmetry,eps); + kodis(ex,X,Y,Z,dtSfz,dtSfz_rhs,SSA,Symmetry,eps); + kodis(ex,X,Y,Z,Axy,Axy_rhs,AAS,Symmetry,eps); + kodis(ex,X,Y,Z,dtSfy,dtSfy_rhs,SAS,Symmetry,eps); + kodis(ex,X,Y,Z,Axz,Axz_rhs,ASA,Symmetry,eps); + kodis(ex,X,Y,Z,dtSfx,dtSfx_rhs,ASS,Symmetry,eps); + kodis(ex,X,Y,Z,Ayy,Ayy_rhs,SSS,Symmetry,eps); + kodis(ex,X,Y,Z,betaz,betaz_rhs,SSA,Symmetry,eps); + kodis(ex,X,Y,Z,Ayz,Ayz_rhs,SAA,Symmetry,eps); + kodis(ex,X,Y,Z,betay,betay_rhs,SAS,Symmetry,eps); + kodis(ex,X,Y,Z,Azz,Azz_rhs,SSS,Symmetry,eps); + kodis(ex,X,Y,Z,betax,betax_rhs,ASS,Symmetry,eps); + kodis(ex,X,Y,Z,Gamx,Gamx_rhs,ASS,Symmetry,eps); + kodis(ex,X,Y,Z,Lap,Lap_rhs,SSS,Symmetry,eps); + kodis(ex,X,Y,Z,Gamy,Gamy_rhs,SAS,Symmetry,eps); + kodis(ex,X,Y,Z,Gamz,Gamz_rhs,SSA,Symmetry,eps); + } + // 2ms // + if(co==0){ + for (int i=0;i 0) @@ -1301,13 +1305,13 @@ bool cgh::Interp_One_Point(MyList *VarList, } -void cgh::Regrid_Onelevel(int lev, int Symmetry, int BH_num, double **Porgbr, double **Porg0, +bool cgh::Regrid_Onelevel(int lev, int Symmetry, int BH_num, double **Porgbr, double **Porg0, MyList *OldList, MyList *StateList, MyList *FutureList, MyList *tmList, bool BB, monitor *ErrorMonitor) { if (lev < movls) - return; + return false; #if (0) // #if (PSTR == 1 || PSTR == 2) @@ -1396,7 +1400,7 @@ void cgh::Regrid_Onelevel(int lev, int Symmetry, int BH_num, double **Porgbr, do for (bhi = 0; bhi < BH_num; bhi++) delete[] tmpPorg[bhi]; delete[] tmpPorg; - return; + return false; } // x direction rr = (Porg0[bhi][0] - handle[lev][grd][0]) / dX; @@ -1500,6 +1504,7 @@ void cgh::Regrid_Onelevel(int lev, int Symmetry, int BH_num, double **Porgbr, do for (int bhi = 0; bhi < BH_num; bhi++) delete[] tmpPorg[bhi]; delete[] tmpPorg; + return tot_flag; } diff --git a/AMSS_NCKU_source/cgh.h b/AMSS_NCKU_source/cgh.h index 79e7bf6..57e489a 100644 --- a/AMSS_NCKU_source/cgh.h +++ b/AMSS_NCKU_source/cgh.h @@ -74,7 +74,7 @@ public: MyList *OldList, MyList *StateList, MyList *FutureList, MyList *tmList, int Symmetry, bool BB); - void Regrid_Onelevel(int lev, int Symmetry, int BH_num, double **Porgbr, double **Porg0, + bool Regrid_Onelevel(int lev, int Symmetry, int BH_num, double **Porgbr, double **Porg0, MyList *OldList, MyList *StateList, MyList *FutureList, MyList *tmList, bool BB, monitor *ErrorMonitor); diff --git a/AMSS_NCKU_source/fdderivs_c.C b/AMSS_NCKU_source/fdderivs_c.C new file mode 100644 index 0000000..5cc52d4 --- /dev/null +++ b/AMSS_NCKU_source/fdderivs_c.C @@ -0,0 +1,268 @@ +#include "tool.h" +void fdderivs(const int ex[3], + const double *f, + double *fxx, double *fxy, double *fxz, + double *fyy, double *fyz, double *fzz, + const double *X, const double *Y, const double *Z, + double SYM1, double SYM2, double SYM3, + int Symmetry, int onoff) +{ + (void)onoff; + + const int NO_SYMM = 0, EQ_SYMM = 1; + const double ZEO = 0.0, ONE = 1.0, TWO = 2.0; + const double F1o4 = 2.5e-1; // 1/4 + const double F8 = 8.0; + const double F16 = 16.0; + const double F30 = 30.0; + const double F1o12 = ONE / 12.0; + const double F1o144 = ONE / 144.0; + + const int ex1 = ex[0], ex2 = ex[1], ex3 = ex[2]; + + const double dX = X[1] - X[0]; + const double dY = Y[1] - Y[0]; + const double dZ = Z[1] - Z[0]; + + const int imaxF = ex1; + const int jmaxF = ex2; + const int kmaxF = ex3; + + int iminF = 1, jminF = 1, kminF = 1; + if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -1; + if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -1; + if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -1; + + const double SoA[3] = { SYM1, SYM2, SYM3 }; + + /* fh: (ex1+2)*(ex2+2)*(ex3+2) because ord=2 */ + const size_t nx = (size_t)ex1 + 2; + const size_t ny = (size_t)ex2 + 2; + const size_t nz = (size_t)ex3 + 2; + const size_t fh_size = nx * ny * nz; + + static double *fh = NULL; + static size_t cap = 0; + + if (fh_size > cap) { + free(fh); + fh = (double*)aligned_alloc(64, fh_size * sizeof(double)); + cap = fh_size; + } + // double *fh = (double*)malloc(fh_size * sizeof(double)); + if (!fh) return; + + symmetry_bd(2, ex, f, fh, SoA); + + /* 系数:按 Fortran 原式 */ + const double Sdxdx = ONE / (dX * dX); + const double Sdydy = ONE / (dY * dY); + const double Sdzdz = ONE / (dZ * dZ); + + const double Fdxdx = F1o12 / (dX * dX); + const double Fdydy = F1o12 / (dY * dY); + const double Fdzdz = F1o12 / (dZ * dZ); + + const double Sdxdy = F1o4 / (dX * dY); + const double Sdxdz = F1o4 / (dX * dZ); + const double Sdydz = F1o4 / (dY * dZ); + + const double Fdxdy = F1o144 / (dX * dY); + const double Fdxdz = F1o144 / (dX * dZ); + const double Fdydz = F1o144 / (dY * dZ); + + /* 输出清零:fxx,fyy,fzz,fxy,fxz,fyz = 0 */ + const size_t all = (size_t)ex1 * (size_t)ex2 * (size_t)ex3; + for (size_t p = 0; p < all; ++p) { + fxx[p] = ZEO; fyy[p] = ZEO; fzz[p] = ZEO; + fxy[p] = ZEO; fxz[p] = ZEO; fyz[p] = ZEO; + } + + /* + * Fortran: + * do k=1,ex3-1 + * do j=1,ex2-1 + * do i=1,ex1-1 + */ + + for (int k0 = 0; k0 <= ex3 - 2; ++k0) { + const int kF = k0 + 1; + for (int j0 = 0; j0 <= ex2 - 2; ++j0) { + const int jF = j0 + 1; + for (int i0 = 0; i0 <= ex1 - 2; ++i0) { + const int iF = i0 + 1; + const size_t p = idx_ex(i0, j0, k0, ex); + + /* 高阶分支:i±2,j±2,k±2 都在范围内 */ + if ((iF + 2) <= imaxF && (iF - 2) >= iminF && + (jF + 2) <= jmaxF && (jF - 2) >= jminF && + (kF + 2) <= kmaxF && (kF - 2) >= kminF) + { + fxx[p] = Fdxdx * ( + -fh[idx_fh_F_ord2(iF - 2, jF, kF, ex)] + + F16 * fh[idx_fh_F_ord2(iF - 1, jF, kF, ex)] - + F30 * fh[idx_fh_F_ord2(iF, jF, kF, ex)] - + fh[idx_fh_F_ord2(iF + 2, jF, kF, ex)] + + F16 * fh[idx_fh_F_ord2(iF + 1, jF, kF, ex)] + ); + + fyy[p] = Fdydy * ( + -fh[idx_fh_F_ord2(iF, jF - 2, kF, ex)] + + F16 * fh[idx_fh_F_ord2(iF, jF - 1, kF, ex)] - + F30 * fh[idx_fh_F_ord2(iF, jF, kF, ex)] - + fh[idx_fh_F_ord2(iF, jF + 2, kF, ex)] + + F16 * fh[idx_fh_F_ord2(iF, jF + 1, kF, ex)] + ); + + fzz[p] = Fdzdz * ( + -fh[idx_fh_F_ord2(iF, jF, kF - 2, ex)] + + F16 * fh[idx_fh_F_ord2(iF, jF, kF - 1, ex)] - + F30 * fh[idx_fh_F_ord2(iF, jF, kF, ex)] - + fh[idx_fh_F_ord2(iF, jF, kF + 2, ex)] + + F16 * fh[idx_fh_F_ord2(iF, jF, kF + 1, ex)] + ); + + /* fxy 高阶:完全照搬 Fortran 的括号结构 */ + { + const double t_jm2 = + ( fh[idx_fh_F_ord2(iF - 2, jF - 2, kF, ex)] + -F8*fh[idx_fh_F_ord2(iF - 1, jF - 2, kF, ex)] + +F8*fh[idx_fh_F_ord2(iF + 1, jF - 2, kF, ex)] + - fh[idx_fh_F_ord2(iF + 2, jF - 2, kF, ex)] ); + + const double t_jm1 = + ( fh[idx_fh_F_ord2(iF - 2, jF - 1, kF, ex)] + -F8*fh[idx_fh_F_ord2(iF - 1, jF - 1, kF, ex)] + +F8*fh[idx_fh_F_ord2(iF + 1, jF - 1, kF, ex)] + - fh[idx_fh_F_ord2(iF + 2, jF - 1, kF, ex)] ); + + const double t_jp1 = + ( fh[idx_fh_F_ord2(iF - 2, jF + 1, kF, ex)] + -F8*fh[idx_fh_F_ord2(iF - 1, jF + 1, kF, ex)] + +F8*fh[idx_fh_F_ord2(iF + 1, jF + 1, kF, ex)] + - fh[idx_fh_F_ord2(iF + 2, jF + 1, kF, ex)] ); + + const double t_jp2 = + ( fh[idx_fh_F_ord2(iF - 2, jF + 2, kF, ex)] + -F8*fh[idx_fh_F_ord2(iF - 1, jF + 2, kF, ex)] + +F8*fh[idx_fh_F_ord2(iF + 1, jF + 2, kF, ex)] + - fh[idx_fh_F_ord2(iF + 2, jF + 2, kF, ex)] ); + + fxy[p] = Fdxdy * ( t_jm2 - F8 * t_jm1 + F8 * t_jp1 - t_jp2 ); + } + + /* fxz 高阶 */ + { + const double t_km2 = + ( fh[idx_fh_F_ord2(iF - 2, jF, kF - 2, ex)] + -F8*fh[idx_fh_F_ord2(iF - 1, jF, kF - 2, ex)] + +F8*fh[idx_fh_F_ord2(iF + 1, jF, kF - 2, ex)] + - fh[idx_fh_F_ord2(iF + 2, jF, kF - 2, ex)] ); + + const double t_km1 = + ( fh[idx_fh_F_ord2(iF - 2, jF, kF - 1, ex)] + -F8*fh[idx_fh_F_ord2(iF - 1, jF, kF - 1, ex)] + +F8*fh[idx_fh_F_ord2(iF + 1, jF, kF - 1, ex)] + - fh[idx_fh_F_ord2(iF + 2, jF, kF - 1, ex)] ); + + const double t_kp1 = + ( fh[idx_fh_F_ord2(iF - 2, jF, kF + 1, ex)] + -F8*fh[idx_fh_F_ord2(iF - 1, jF, kF + 1, ex)] + +F8*fh[idx_fh_F_ord2(iF + 1, jF, kF + 1, ex)] + - fh[idx_fh_F_ord2(iF + 2, jF, kF + 1, ex)] ); + + const double t_kp2 = + ( fh[idx_fh_F_ord2(iF - 2, jF, kF + 2, ex)] + -F8*fh[idx_fh_F_ord2(iF - 1, jF, kF + 2, ex)] + +F8*fh[idx_fh_F_ord2(iF + 1, jF, kF + 2, ex)] + - fh[idx_fh_F_ord2(iF + 2, jF, kF + 2, ex)] ); + + fxz[p] = Fdxdz * ( t_km2 - F8 * t_km1 + F8 * t_kp1 - t_kp2 ); + } + + /* fyz 高阶 */ + { + const double t_km2 = + ( fh[idx_fh_F_ord2(iF, jF - 2, kF - 2, ex)] + -F8*fh[idx_fh_F_ord2(iF, jF - 1, kF - 2, ex)] + +F8*fh[idx_fh_F_ord2(iF, jF + 1, kF - 2, ex)] + - fh[idx_fh_F_ord2(iF, jF + 2, kF - 2, ex)] ); + + const double t_km1 = + ( fh[idx_fh_F_ord2(iF, jF - 2, kF - 1, ex)] + -F8*fh[idx_fh_F_ord2(iF, jF - 1, kF - 1, ex)] + +F8*fh[idx_fh_F_ord2(iF, jF + 1, kF - 1, ex)] + - fh[idx_fh_F_ord2(iF, jF + 2, kF - 1, ex)] ); + + const double t_kp1 = + ( fh[idx_fh_F_ord2(iF, jF - 2, kF + 1, ex)] + -F8*fh[idx_fh_F_ord2(iF, jF - 1, kF + 1, ex)] + +F8*fh[idx_fh_F_ord2(iF, jF + 1, kF + 1, ex)] + - fh[idx_fh_F_ord2(iF, jF + 2, kF + 1, ex)] ); + + const double t_kp2 = + ( fh[idx_fh_F_ord2(iF, jF - 2, kF + 2, ex)] + -F8*fh[idx_fh_F_ord2(iF, jF - 1, kF + 2, ex)] + +F8*fh[idx_fh_F_ord2(iF, jF + 1, kF + 2, ex)] + - fh[idx_fh_F_ord2(iF, jF + 2, kF + 2, ex)] ); + + fyz[p] = Fdydz * ( t_km2 - F8 * t_km1 + F8 * t_kp1 - t_kp2 ); + } + } + /* 二阶分支:i±1,j±1,k±1 在范围内 */ + else if ((iF + 1) <= imaxF && (iF - 1) >= iminF && + (jF + 1) <= jmaxF && (jF - 1) >= jminF && + (kF + 1) <= kmaxF && (kF - 1) >= kminF) + { + fxx[p] = Sdxdx * ( + fh[idx_fh_F_ord2(iF - 1, jF, kF, ex)] - + TWO * fh[idx_fh_F_ord2(iF, jF, kF, ex)] + + fh[idx_fh_F_ord2(iF + 1, jF, kF, ex)] + ); + + fyy[p] = Sdydy * ( + fh[idx_fh_F_ord2(iF, jF - 1, kF, ex)] - + TWO * fh[idx_fh_F_ord2(iF, jF, kF, ex)] + + fh[idx_fh_F_ord2(iF, jF + 1, kF, ex)] + ); + + fzz[p] = Sdzdz * ( + fh[idx_fh_F_ord2(iF, jF, kF - 1, ex)] - + TWO * fh[idx_fh_F_ord2(iF, jF, kF, ex)] + + fh[idx_fh_F_ord2(iF, jF, kF + 1, ex)] + ); + + fxy[p] = Sdxdy * ( + fh[idx_fh_F_ord2(iF - 1, jF - 1, kF, ex)] - + fh[idx_fh_F_ord2(iF + 1, jF - 1, kF, ex)] - + fh[idx_fh_F_ord2(iF - 1, jF + 1, kF, ex)] + + fh[idx_fh_F_ord2(iF + 1, jF + 1, kF, ex)] + ); + + fxz[p] = Sdxdz * ( + fh[idx_fh_F_ord2(iF - 1, jF, kF - 1, ex)] - + fh[idx_fh_F_ord2(iF + 1, jF, kF - 1, ex)] - + fh[idx_fh_F_ord2(iF - 1, jF, kF + 1, ex)] + + fh[idx_fh_F_ord2(iF + 1, jF, kF + 1, ex)] + ); + + fyz[p] = Sdydz * ( + fh[idx_fh_F_ord2(iF, jF - 1, kF - 1, ex)] - + fh[idx_fh_F_ord2(iF, jF + 1, kF - 1, ex)] - + fh[idx_fh_F_ord2(iF, jF - 1, kF + 1, ex)] + + fh[idx_fh_F_ord2(iF, jF + 1, kF + 1, ex)] + ); + }else{ + fxx[p] = 0.0; + fyy[p] = 0.0; + fzz[p] = 0.0; + fxy[p] = 0.0; + fxz[p] = 0.0; + fyz[p] = 0.0; + } + } + } + } + + // free(fh); +} \ No newline at end of file diff --git a/AMSS_NCKU_source/fderivs_c.C b/AMSS_NCKU_source/fderivs_c.C new file mode 100644 index 0000000..5f6b94c --- /dev/null +++ b/AMSS_NCKU_source/fderivs_c.C @@ -0,0 +1,150 @@ +#include "tool.h" + +/* + * C 版 fderivs + * + * Fortran: + * subroutine fderivs(ex,f,fx,fy,fz,X,Y,Z,SYM1,SYM2,SYM3,symmetry,onoff) + * + * 约定: + * f, fx, fy, fz: ex1*ex2*ex3,按 idx_ex 布局 + * X: ex1, Y: ex2, Z: ex3 + */ +void fderivs(const int ex[3], + const double *f, + double *fx, double *fy, double *fz, + const double *X, const double *Y, const double *Z, + double SYM1, double SYM2, double SYM3, + int Symmetry, int onoff) +{ + (void)onoff; // Fortran 里没用到 + + const double ZEO = 0.0, ONE = 1.0; + const double TWO = 2.0, EIT = 8.0; + const double F12 = 12.0; + + const int NO_SYMM = 0, EQ_SYMM = 1; // OCTANT=2 在本子程序里不直接用 + + const int ex1 = ex[0], ex2 = ex[1], ex3 = ex[2]; + + // dX = X(2)-X(1) -> C: X[1]-X[0] + const double dX = X[1] - X[0]; + const double dY = Y[1] - Y[0]; + const double dZ = Z[1] - Z[0]; + + // Fortran 1-based bounds + const int imaxF = ex1; + const int jmaxF = ex2; + const int kmaxF = ex3; + + int iminF = 1, jminF = 1, kminF = 1; + if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -1; + if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -1; + if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -1; + + // SoA(1:3) = SYM1,SYM2,SYM3 + const double SoA[3] = { SYM1, SYM2, SYM3 }; + + // fh: (ex1+2)*(ex2+2)*(ex3+2) because ord=2 + const size_t nx = (size_t)ex1 + 2; + const size_t ny = (size_t)ex2 + 2; + const size_t nz = (size_t)ex3 + 2; + const size_t fh_size = nx * ny * nz; + static double *fh = NULL; + static size_t cap = 0; + + if (fh_size > cap) { + free(fh); + fh = (double*)aligned_alloc(64, fh_size * sizeof(double)); + cap = fh_size; + } + // double *fh = (double*)malloc(fh_size * sizeof(double)); + if (!fh) return; + + // call symmetry_bd(2,ex,f,fh,SoA) + symmetry_bd(2, ex, f, fh, SoA); + + const double d12dx = ONE / F12 / dX; + const double d12dy = ONE / F12 / dY; + const double d12dz = ONE / F12 / dZ; + + const double d2dx = ONE / TWO / dX; + const double d2dy = ONE / TWO / dY; + const double d2dz = ONE / TWO / dZ; + + // fx = fy = fz = 0 + const size_t all = (size_t)ex1 * (size_t)ex2 * (size_t)ex3; + for (size_t p = 0; p < all; ++p) { + fx[p] = ZEO; + fy[p] = ZEO; + fz[p] = ZEO; + } + + /* + * Fortran loops: + * do k=1,ex3-1 + * do j=1,ex2-1 + * do i=1,ex1-1 + * + * C: k0=0..ex3-2, j0=0..ex2-2, i0=0..ex1-2 + */ + for (int k0 = 0; k0 <= ex3 - 2; ++k0) { + const int kF = k0 + 1; + for (int j0 = 0; j0 <= ex2 - 2; ++j0) { + const int jF = j0 + 1; + for (int i0 = 0; i0 <= ex1 - 2; ++i0) { + const int iF = i0 + 1; + const size_t p = idx_ex(i0, j0, k0, ex); + + // if(i+2 <= imax .and. i-2 >= imin ... ) (全是 Fortran 索引) + if ((iF + 2) <= imaxF && (iF - 2) >= iminF && + (jF + 2) <= jmaxF && (jF - 2) >= jminF && + (kF + 2) <= kmaxF && (kF - 2) >= kminF) + { + fx[p] = d12dx * ( + fh[idx_fh_F_ord2(iF - 2, jF, kF, ex)] - + EIT * fh[idx_fh_F_ord2(iF - 1, jF, kF, ex)] + + EIT * fh[idx_fh_F_ord2(iF + 1, jF, kF, ex)] - + fh[idx_fh_F_ord2(iF + 2, jF, kF, ex)] + ); + + fy[p] = d12dy * ( + fh[idx_fh_F_ord2(iF, jF - 2, kF, ex)] - + EIT * fh[idx_fh_F_ord2(iF, jF - 1, kF, ex)] + + EIT * fh[idx_fh_F_ord2(iF, jF + 1, kF, ex)] - + fh[idx_fh_F_ord2(iF, jF + 2, kF, ex)] + ); + + fz[p] = d12dz * ( + fh[idx_fh_F_ord2(iF, jF, kF - 2, ex)] - + EIT * fh[idx_fh_F_ord2(iF, jF, kF - 1, ex)] + + EIT * fh[idx_fh_F_ord2(iF, jF, kF + 1, ex)] - + fh[idx_fh_F_ord2(iF, jF, kF + 2, ex)] + ); + } + // elseif(i+1 <= imax .and. i-1 >= imin ...) + else if ((iF + 1) <= imaxF && (iF - 1) >= iminF && + (jF + 1) <= jmaxF && (jF - 1) >= jminF && + (kF + 1) <= kmaxF && (kF - 1) >= kminF) + { + fx[p] = d2dx * ( + -fh[idx_fh_F_ord2(iF - 1, jF, kF, ex)] + + fh[idx_fh_F_ord2(iF + 1, jF, kF, ex)] + ); + + fy[p] = d2dy * ( + -fh[idx_fh_F_ord2(iF, jF - 1, kF, ex)] + + fh[idx_fh_F_ord2(iF, jF + 1, kF, ex)] + ); + + fz[p] = d2dz * ( + -fh[idx_fh_F_ord2(iF, jF, kF - 1, ex)] + + fh[idx_fh_F_ord2(iF, jF, kF + 1, ex)] + ); + } + } + } + } + + // free(fh); +} \ No newline at end of file diff --git a/AMSS_NCKU_source/interp_lb_profile.C b/AMSS_NCKU_source/interp_lb_profile.C new file mode 100644 index 0000000..52d4ff6 --- /dev/null +++ b/AMSS_NCKU_source/interp_lb_profile.C @@ -0,0 +1,107 @@ +#include "interp_lb_profile.h" +#include +#include +#include + +namespace InterpLBProfile { + +bool write_profile(const char *filepath, int nprocs, + const double *rank_times, + const int *heavy_ranks, int num_heavy, + double threshold_ratio) +{ + FILE *fp = fopen(filepath, "wb"); + if (!fp) return false; + + ProfileHeader hdr; + hdr.magic = MAGIC; + hdr.version = VERSION; + hdr.nprocs = nprocs; + hdr.num_heavy = num_heavy; + hdr.threshold_ratio = threshold_ratio; + + fwrite(&hdr, sizeof(hdr), 1, fp); + fwrite(rank_times, sizeof(double), nprocs, fp); + fwrite(heavy_ranks, sizeof(int), num_heavy, fp); + fclose(fp); + return true; +} + +bool read_profile(const char *filepath, int current_nprocs, + int *heavy_ranks, int &num_heavy, + double *rank_times, MPI_Comm comm) +{ + int myrank; + MPI_Comm_rank(comm, &myrank); + + int valid = 0; + ProfileHeader hdr; + memset(&hdr, 0, sizeof(hdr)); + + if (myrank == 0) { + FILE *fp = fopen(filepath, "rb"); + if (fp) { + if (fread(&hdr, sizeof(hdr), 1, fp) == 1 && + hdr.magic == MAGIC && hdr.version == VERSION && + hdr.nprocs == current_nprocs) + { + if (fread(rank_times, sizeof(double), current_nprocs, fp) + == (size_t)current_nprocs && + fread(heavy_ranks, sizeof(int), hdr.num_heavy, fp) + == (size_t)hdr.num_heavy) + { + num_heavy = hdr.num_heavy; + valid = 1; + } + } else if (fp) { + printf("[InterpLB] Profile rejected: magic=0x%X version=%u " + "nprocs=%d (current=%d)\n", + hdr.magic, hdr.version, hdr.nprocs, current_nprocs); + } + fclose(fp); + } + } + + MPI_Bcast(&valid, 1, MPI_INT, 0, comm); + if (!valid) return false; + + MPI_Bcast(&num_heavy, 1, MPI_INT, 0, comm); + MPI_Bcast(heavy_ranks, num_heavy, MPI_INT, 0, comm); + MPI_Bcast(rank_times, current_nprocs, MPI_DOUBLE, 0, comm); + return true; +} + +int identify_heavy_ranks(const double *rank_times, int nprocs, + double threshold_ratio, + int *heavy_ranks, int max_heavy) +{ + double sum = 0; + for (int i = 0; i < nprocs; i++) sum += rank_times[i]; + double mean = sum / nprocs; + double threshold = threshold_ratio * mean; + + // Collect candidates + struct RankTime { int rank; double time; }; + RankTime *candidates = new RankTime[nprocs]; + int ncand = 0; + + for (int i = 0; i < nprocs; i++) { + if (rank_times[i] > threshold) + candidates[ncand++] = {i, rank_times[i]}; + } + + // Sort descending by time + std::sort(candidates, candidates + ncand, + [](const RankTime &a, const RankTime &b) { + return a.time > b.time; + }); + + int count = (ncand < max_heavy) ? ncand : max_heavy; + for (int i = 0; i < count; i++) + heavy_ranks[i] = candidates[i].rank; + + delete[] candidates; + return count; +} + +} // namespace InterpLBProfile diff --git a/AMSS_NCKU_source/interp_lb_profile.bin b/AMSS_NCKU_source/interp_lb_profile.bin new file mode 100644 index 0000000..4356b28 Binary files /dev/null and b/AMSS_NCKU_source/interp_lb_profile.bin differ diff --git a/AMSS_NCKU_source/interp_lb_profile.h b/AMSS_NCKU_source/interp_lb_profile.h new file mode 100644 index 0000000..10b9dac --- /dev/null +++ b/AMSS_NCKU_source/interp_lb_profile.h @@ -0,0 +1,38 @@ +#ifndef INTERP_LB_PROFILE_H +#define INTERP_LB_PROFILE_H + +#include + +namespace InterpLBProfile { + +static const unsigned int MAGIC = 0x494C4250; // "ILBP" +static const unsigned int VERSION = 1; + +struct ProfileHeader { + unsigned int magic; + unsigned int version; + int nprocs; + int num_heavy; + double threshold_ratio; +}; + +// Write profile file (rank 0 only) +bool write_profile(const char *filepath, int nprocs, + const double *rank_times, + const int *heavy_ranks, int num_heavy, + double threshold_ratio); + +// Read profile file (rank 0 reads, then broadcasts to all) +// Returns true if file found and valid for current nprocs +bool read_profile(const char *filepath, int current_nprocs, + int *heavy_ranks, int &num_heavy, + double *rank_times, MPI_Comm comm); + +// Identify heavy ranks: those with time > threshold_ratio * mean +int identify_heavy_ranks(const double *rank_times, int nprocs, + double threshold_ratio, + int *heavy_ranks, int max_heavy); + +} // namespace InterpLBProfile + +#endif /* INTERP_LB_PROFILE_H */ diff --git a/AMSS_NCKU_source/interp_lb_profile_data.h b/AMSS_NCKU_source/interp_lb_profile_data.h new file mode 100644 index 0000000..159c01d --- /dev/null +++ b/AMSS_NCKU_source/interp_lb_profile_data.h @@ -0,0 +1,27 @@ +/* Auto-generated from interp_lb_profile.bin — do not edit */ +#ifndef INTERP_LB_PROFILE_DATA_H +#define INTERP_LB_PROFILE_DATA_H + +#define INTERP_LB_NPROCS 64 +#define INTERP_LB_NUM_HEAVY 4 + +static const int interp_lb_heavy_blocks[4] = {27, 35, 28, 36}; + +/* Split table: {block_id, r_left, r_right} */ +static const int interp_lb_splits[4][3] = { + {27, 26, 27}, + {35, 34, 35}, + {28, 28, 29}, + {36, 36, 37}, +}; + +/* Rank remap for displaced neighbor blocks */ +static const int interp_lb_num_remaps = 4; +static const int interp_lb_remaps[][2] = { + {26, 25}, + {29, 30}, + {34, 33}, + {37, 38}, +}; + +#endif /* INTERP_LB_PROFILE_DATA_H */ diff --git a/AMSS_NCKU_source/kodiss_c.C b/AMSS_NCKU_source/kodiss_c.C new file mode 100644 index 0000000..2a04abe --- /dev/null +++ b/AMSS_NCKU_source/kodiss_c.C @@ -0,0 +1,109 @@ +#include "tool.h" + +/* + * C 版 kodis + * + * Fortran signature: + * subroutine kodis(ex,X,Y,Z,f,f_rhs,SoA,Symmetry,eps) + * + * 约定: + * X: ex1, Y: ex2, Z: ex3 + * f, f_rhs: ex1*ex2*ex3 按 idx_ex 布局 + * SoA[3] + * eps: double + */ +void kodis(const int ex[3], + const double *X, const double *Y, const double *Z, + const double *f, double *f_rhs, + const double SoA[3], + int Symmetry, double eps) +{ + const double ONE = 1.0, SIX = 6.0, FIT = 15.0, TWT = 20.0; + const double cof = 64.0; // 2^6 + const int NO_SYMM = 0, OCTANT = 2; + + const int ex1 = ex[0], ex2 = ex[1], ex3 = ex[2]; + + // Fortran: dX = X(2)-X(1) -> C: X[1]-X[0] + const double dX = X[1] - X[0]; + const double dY = Y[1] - Y[0]; + const double dZ = Z[1] - Z[0]; + (void)ONE; // ONE 在原 Fortran 里只是参数,这里不一定用得上 + + // Fortran: imax=ex(1) 等是 1-based 上界 + const int imaxF = ex1; + const int jmaxF = ex2; + const int kmaxF = ex3; + + // Fortran: imin=jmin=kmin=1,某些对称情况变 -2 + int iminF = 1, jminF = 1, kminF = 1; + + if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -2; + if (Symmetry == OCTANT && fabs(X[0]) < dX) iminF = -2; + if (Symmetry == OCTANT && fabs(Y[0]) < dY) jminF = -2; + + // 分配 fh:大小 (ex1+3)*(ex2+3)*(ex3+3),对应 ord=3 + const size_t nx = (size_t)ex1 + 3; + const size_t ny = (size_t)ex2 + 3; + const size_t nz = (size_t)ex3 + 3; + const size_t fh_size = nx * ny * nz; + + double *fh = (double*)malloc(fh_size * sizeof(double)); + if (!fh) return; + + // Fortran: call symmetry_bd(3,ex,f,fh,SoA) + symmetry_bd(3, ex, f, fh, SoA); + + /* + * Fortran loops: + * do k=1,ex3 + * do j=1,ex2 + * do i=1,ex1 + * + * C: k0=0..ex3-1, j0=0..ex2-1, i0=0..ex1-1 + * 并定义 Fortran index: iF=i0+1, ... + */ + for (int k0 = 0; k0 < ex3; ++k0) { + const int kF = k0 + 1; + for (int j0 = 0; j0 < ex2; ++j0) { + const int jF = j0 + 1; + for (int i0 = 0; i0 < ex1; ++i0) { + const int iF = i0 + 1; + + // Fortran if 条件: + // i-3 >= imin .and. i+3 <= imax 等(都是 Fortran 索引) + if ((iF - 3) >= iminF && (iF + 3) <= imaxF && + (jF - 3) >= jminF && (jF + 3) <= jmaxF && + (kF - 3) >= kminF && (kF + 3) <= kmaxF) + { + const size_t p = idx_ex(i0, j0, k0, ex); + + // 三个方向各一份同型的 7 点组合(实际上是对称的 6th-order dissipation/filter 核) + const double Dx_term = + ( (fh[idx_fh_F(iF - 3, jF, kF, ex)] + fh[idx_fh_F(iF + 3, jF, kF, ex)]) - + SIX * (fh[idx_fh_F(iF - 2, jF, kF, ex)] + fh[idx_fh_F(iF + 2, jF, kF, ex)]) + + FIT * (fh[idx_fh_F(iF - 1, jF, kF, ex)] + fh[idx_fh_F(iF + 1, jF, kF, ex)]) - + TWT * fh[idx_fh_F(iF , jF, kF, ex)] ) / dX; + + const double Dy_term = + ( (fh[idx_fh_F(iF, jF - 3, kF, ex)] + fh[idx_fh_F(iF, jF + 3, kF, ex)]) - + SIX * (fh[idx_fh_F(iF, jF - 2, kF, ex)] + fh[idx_fh_F(iF, jF + 2, kF, ex)]) + + FIT * (fh[idx_fh_F(iF, jF - 1, kF, ex)] + fh[idx_fh_F(iF, jF + 1, kF, ex)]) - + TWT * fh[idx_fh_F(iF, jF , kF, ex)] ) / dY; + + const double Dz_term = + ( (fh[idx_fh_F(iF, jF, kF - 3, ex)] + fh[idx_fh_F(iF, jF, kF + 3, ex)]) - + SIX * (fh[idx_fh_F(iF, jF, kF - 2, ex)] + fh[idx_fh_F(iF, jF, kF + 2, ex)]) + + FIT * (fh[idx_fh_F(iF, jF, kF - 1, ex)] + fh[idx_fh_F(iF, jF, kF + 1, ex)]) - + TWT * fh[idx_fh_F(iF, jF, kF , ex)] ) / dZ; + + // Fortran: + // f_rhs(i,j,k) = f_rhs(i,j,k) + eps/cof*(Dx_term + Dy_term + Dz_term) + f_rhs[p] += (eps / cof) * (Dx_term + Dy_term + Dz_term); + } + } + } + } + + free(fh); +} \ No newline at end of file diff --git a/AMSS_NCKU_source/lopsided_c.C b/AMSS_NCKU_source/lopsided_c.C new file mode 100644 index 0000000..fe2c6e4 --- /dev/null +++ b/AMSS_NCKU_source/lopsided_c.C @@ -0,0 +1,255 @@ +#include "tool.h" +/* + * 你需要提供 symmetry_bd 的 C 版本(或 Fortran 绑到 C 的接口)。 + * Fortran: call symmetry_bd(3,ex,f,fh,SoA) + * + * 约定: + * nghost = 3 + * ex[3] = {ex1,ex2,ex3} + * f = 原始网格 (ex1*ex2*ex3) + * fh = 扩展网格 ((ex1+3)*(ex2+3)*(ex3+3)),对应 Fortran 的 (-2:ex1, ...) + * SoA[3] = 输入参数 + */ +void lopsided(const int ex[3], + const double *X, const double *Y, const double *Z, + const double *f, double *f_rhs, + const double *Sfx, const double *Sfy, const double *Sfz, + int Symmetry, const double SoA[3]) +{ + const double ZEO = 0.0, ONE = 1.0, F3 = 3.0; + const double TWO = 2.0, F6 = 6.0, F18 = 18.0; + const double F12 = 12.0, F10 = 10.0, EIT = 8.0; + + const int NO_SYMM = 0, EQ_SYMM = 1, OCTANT = 2; + (void)OCTANT; // 这里和 Fortran 一样只是定义了不用也没关系 + + const int ex1 = ex[0], ex2 = ex[1], ex3 = ex[2]; + + // 对应 Fortran: dX = X(2)-X(1) (Fortran 1-based) + // C: X[1]-X[0] + const double dX = X[1] - X[0]; + const double dY = Y[1] - Y[0]; + const double dZ = Z[1] - Z[0]; + + const double d12dx = ONE / F12 / dX; + const double d12dy = ONE / F12 / dY; + const double d12dz = ONE / F12 / dZ; + + // Fortran 里算了 d2dx/d2dy/d2dz 但本 subroutine 里没用到(保持一致也算出来) + const double d2dx = ONE / TWO / dX; + const double d2dy = ONE / TWO / dY; + const double d2dz = ONE / TWO / dZ; + (void)d2dx; (void)d2dy; (void)d2dz; + + // Fortran: + // imax = ex(1); jmax = ex(2); kmax = ex(3) + const int imaxF = ex1; + const int jmaxF = ex2; + const int kmaxF = ex3; + + // Fortran: + // imin=jmin=kmin=1; 若满足对称条件则设为 -2 + int iminF = 1, jminF = 1, kminF = 1; + if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -2; + if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -2; + if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -2; + + // 分配 fh:大小 (ex1+3)*(ex2+3)*(ex3+3) + const size_t nx = (size_t)ex1 + 3; + const size_t ny = (size_t)ex2 + 3; + const size_t nz = (size_t)ex3 + 3; + const size_t fh_size = nx * ny * nz; + + double *fh = (double*)malloc(fh_size * sizeof(double)); + if (!fh) return; // 内存不足:直接返回(你也可以改成 abort/报错) + + // Fortran: call symmetry_bd(3,ex,f,fh,SoA) + symmetry_bd(3, ex, f, fh, SoA); + + /* + * Fortran 主循环: + * do k=1,ex(3)-1 + * do j=1,ex(2)-1 + * do i=1,ex(1)-1 + * + * 转成 C 0-based: + * k0 = 0..ex3-2, j0 = 0..ex2-2, i0 = 0..ex1-2 + * + * 并且 Fortran 里的 i/j/k 在 fh 访问时,仍然是 Fortran 索引值: + * iF=i0+1, jF=j0+1, kF=k0+1 + */ + for (int k0 = 0; k0 <= ex3 - 2; ++k0) { + const int kF = k0 + 1; + for (int j0 = 0; j0 <= ex2 - 2; ++j0) { + const int jF = j0 + 1; + for (int i0 = 0; i0 <= ex1 - 2; ++i0) { + const int iF = i0 + 1; + + const size_t p = idx_ex(i0, j0, k0, ex); + + // ---------------- x direction ---------------- + const double sfx = Sfx[p]; + if (sfx > ZEO) { + // Fortran: if(i+3 <= imax) + // iF+3 <= ex1 <=> i0+4 <= ex1 <=> i0 <= ex1-4 + if (i0 <= ex1 - 4) { + f_rhs[p] += sfx * d12dx * + (-F3 * fh[idx_fh_F(iF - 1, jF, kF, ex)] + -F10 * fh[idx_fh_F(iF , jF, kF, ex)] + +F18 * fh[idx_fh_F(iF + 1, jF, kF, ex)] + -F6 * fh[idx_fh_F(iF + 2, jF, kF, ex)] + + fh[idx_fh_F(iF + 3, jF, kF, ex)]); + } + // elseif(i+2 <= imax) <=> i0 <= ex1-3 + else if (i0 <= ex1 - 3) { + f_rhs[p] += sfx * d12dx * + ( fh[idx_fh_F(iF - 2, jF, kF, ex)] + -EIT * fh[idx_fh_F(iF - 1, jF, kF, ex)] + +EIT * fh[idx_fh_F(iF + 1, jF, kF, ex)] + - fh[idx_fh_F(iF + 2, jF, kF, ex)]); + } + // elseif(i+1 <= imax) <=> i0 <= ex1-2(循环里总成立) + else if (i0 <= ex1 - 2) { + f_rhs[p] -= sfx * d12dx * + (-F3 * fh[idx_fh_F(iF + 1, jF, kF, ex)] + -F10 * fh[idx_fh_F(iF , jF, kF, ex)] + +F18 * fh[idx_fh_F(iF - 1, jF, kF, ex)] + -F6 * fh[idx_fh_F(iF - 2, jF, kF, ex)] + + fh[idx_fh_F(iF - 3, jF, kF, ex)]); + } + } else if (sfx < ZEO) { + // Fortran: if(i-3 >= imin) + // (iF-3) >= iminF <=> (i0-2) >= iminF + if ((i0 - 2) >= iminF) { + f_rhs[p] -= sfx * d12dx * + (-F3 * fh[idx_fh_F(iF + 1, jF, kF, ex)] + -F10 * fh[idx_fh_F(iF , jF, kF, ex)] + +F18 * fh[idx_fh_F(iF - 1, jF, kF, ex)] + -F6 * fh[idx_fh_F(iF - 2, jF, kF, ex)] + + fh[idx_fh_F(iF - 3, jF, kF, ex)]); + } + // elseif(i-2 >= imin) <=> (i0-1) >= iminF + else if ((i0 - 1) >= iminF) { + f_rhs[p] += sfx * d12dx * + ( fh[idx_fh_F(iF - 2, jF, kF, ex)] + -EIT * fh[idx_fh_F(iF - 1, jF, kF, ex)] + +EIT * fh[idx_fh_F(iF + 1, jF, kF, ex)] + - fh[idx_fh_F(iF + 2, jF, kF, ex)]); + } + // elseif(i-1 >= imin) <=> i0 >= iminF + else if (i0 >= iminF) { + f_rhs[p] += sfx * d12dx * + (-F3 * fh[idx_fh_F(iF - 1, jF, kF, ex)] + -F10 * fh[idx_fh_F(iF , jF, kF, ex)] + +F18 * fh[idx_fh_F(iF + 1, jF, kF, ex)] + -F6 * fh[idx_fh_F(iF + 2, jF, kF, ex)] + + fh[idx_fh_F(iF + 3, jF, kF, ex)]); + } + } + + // ---------------- y direction ---------------- + const double sfy = Sfy[p]; + if (sfy > ZEO) { + // jF+3 <= ex2 <=> j0+4 <= ex2 <=> j0 <= ex2-4 + if (j0 <= ex2 - 4) { + f_rhs[p] += sfy * d12dy * + (-F3 * fh[idx_fh_F(iF, jF - 1, kF, ex)] + -F10 * fh[idx_fh_F(iF, jF , kF, ex)] + +F18 * fh[idx_fh_F(iF, jF + 1, kF, ex)] + -F6 * fh[idx_fh_F(iF, jF + 2, kF, ex)] + + fh[idx_fh_F(iF, jF + 3, kF, ex)]); + } else if (j0 <= ex2 - 3) { + f_rhs[p] += sfy * d12dy * + ( fh[idx_fh_F(iF, jF - 2, kF, ex)] + -EIT * fh[idx_fh_F(iF, jF - 1, kF, ex)] + +EIT * fh[idx_fh_F(iF, jF + 1, kF, ex)] + - fh[idx_fh_F(iF, jF + 2, kF, ex)]); + } else if (j0 <= ex2 - 2) { + f_rhs[p] -= sfy * d12dy * + (-F3 * fh[idx_fh_F(iF, jF + 1, kF, ex)] + -F10 * fh[idx_fh_F(iF, jF , kF, ex)] + +F18 * fh[idx_fh_F(iF, jF - 1, kF, ex)] + -F6 * fh[idx_fh_F(iF, jF - 2, kF, ex)] + + fh[idx_fh_F(iF, jF - 3, kF, ex)]); + } + } else if (sfy < ZEO) { + if ((j0 - 2) >= jminF) { + f_rhs[p] -= sfy * d12dy * + (-F3 * fh[idx_fh_F(iF, jF + 1, kF, ex)] + -F10 * fh[idx_fh_F(iF, jF , kF, ex)] + +F18 * fh[idx_fh_F(iF, jF - 1, kF, ex)] + -F6 * fh[idx_fh_F(iF, jF - 2, kF, ex)] + + fh[idx_fh_F(iF, jF - 3, kF, ex)]); + } else if ((j0 - 1) >= jminF) { + f_rhs[p] += sfy * d12dy * + ( fh[idx_fh_F(iF, jF - 2, kF, ex)] + -EIT * fh[idx_fh_F(iF, jF - 1, kF, ex)] + +EIT * fh[idx_fh_F(iF, jF + 1, kF, ex)] + - fh[idx_fh_F(iF, jF + 2, kF, ex)]); + } else if (j0 >= jminF) { + f_rhs[p] += sfy * d12dy * + (-F3 * fh[idx_fh_F(iF, jF - 1, kF, ex)] + -F10 * fh[idx_fh_F(iF, jF , kF, ex)] + +F18 * fh[idx_fh_F(iF, jF + 1, kF, ex)] + -F6 * fh[idx_fh_F(iF, jF + 2, kF, ex)] + + fh[idx_fh_F(iF, jF + 3, kF, ex)]); + } + } + + // ---------------- z direction ---------------- + const double sfz = Sfz[p]; + if (sfz > ZEO) { + if (k0 <= ex3 - 4) { + f_rhs[p] += sfz * d12dz * + (-F3 * fh[idx_fh_F(iF, jF, kF - 1, ex)] + -F10 * fh[idx_fh_F(iF, jF, kF , ex)] + +F18 * fh[idx_fh_F(iF, jF, kF + 1, ex)] + -F6 * fh[idx_fh_F(iF, jF, kF + 2, ex)] + + fh[idx_fh_F(iF, jF, kF + 3, ex)]); + } else if (k0 <= ex3 - 3) { + f_rhs[p] += sfz * d12dz * + ( fh[idx_fh_F(iF, jF, kF - 2, ex)] + -EIT * fh[idx_fh_F(iF, jF, kF - 1, ex)] + +EIT * fh[idx_fh_F(iF, jF, kF + 1, ex)] + - fh[idx_fh_F(iF, jF, kF + 2, ex)]); + } else if (k0 <= ex3 - 2) { + f_rhs[p] -= sfz * d12dz * + (-F3 * fh[idx_fh_F(iF, jF, kF + 1, ex)] + -F10 * fh[idx_fh_F(iF, jF, kF , ex)] + +F18 * fh[idx_fh_F(iF, jF, kF - 1, ex)] + -F6 * fh[idx_fh_F(iF, jF, kF - 2, ex)] + + fh[idx_fh_F(iF, jF, kF - 3, ex)]); + } + } else if (sfz < ZEO) { + if ((k0 - 2) >= kminF) { + f_rhs[p] -= sfz * d12dz * + (-F3 * fh[idx_fh_F(iF, jF, kF + 1, ex)] + -F10 * fh[idx_fh_F(iF, jF, kF , ex)] + +F18 * fh[idx_fh_F(iF, jF, kF - 1, ex)] + -F6 * fh[idx_fh_F(iF, jF, kF - 2, ex)] + + fh[idx_fh_F(iF, jF, kF - 3, ex)]); + } else if ((k0 - 1) >= kminF) { + f_rhs[p] += sfz * d12dz * + ( fh[idx_fh_F(iF, jF, kF - 2, ex)] + -EIT * fh[idx_fh_F(iF, jF, kF - 1, ex)] + +EIT * fh[idx_fh_F(iF, jF, kF + 1, ex)] + - fh[idx_fh_F(iF, jF, kF + 2, ex)]); + } else if (k0 >= kminF) { + f_rhs[p] += sfz * d12dz * + (-F3 * fh[idx_fh_F(iF, jF, kF - 1, ex)] + -F10 * fh[idx_fh_F(iF, jF, kF , ex)] + +F18 * fh[idx_fh_F(iF, jF, kF + 1, ex)] + -F6 * fh[idx_fh_F(iF, jF, kF + 2, ex)] + + fh[idx_fh_F(iF, jF, kF + 3, ex)]); + } + } + } + } + } + free(fh); +} + + + + + diff --git a/AMSS_NCKU_source/macrodef.fh b/AMSS_NCKU_source/macrodef.fh index ead5fe0..84fbf50 100644 --- a/AMSS_NCKU_source/macrodef.fh +++ b/AMSS_NCKU_source/macrodef.fh @@ -1,83 +1,77 @@ - - -#if 0 -note here -v:r; u: phi; w: theta -tetradtype 0 -v^a = (x,y,z) -orthonormal order: v,u,w -m = (phi - i theta)/sqrt(2) following Frans, Eq.(8) of PRD 75, 124018(2007) -tetradtype 1 -orthonormal order: w,u,v -m = (theta + i phi)/sqrt(2) following Sperhake, Eq.(3.2) of PRD 85, 124062(2012) -tetradtype 2 -v_a = (x,y,z) -orthonormal order: v,u,w -m = (phi - i theta)/sqrt(2) following Frans, Eq.(8) of PRD 75, 124018(2007) -#endif -#define tetradtype 2 - -#if 0 -note here -Cell center or Vertex center -#endif -#define Cell - -#if 0 -note here -2nd order: 2 -4th order: 3 -6th order: 4 -8th order: 5 -#endif -#define ghost_width 3 - -#if 0 -note here -use shell or not -#endif -#define WithShell - -#if 0 -note here -use constraint preserving boundary condition or not -only affect Z4c -#endif -#define CPBC - -#if 0 -note here -Gauge condition type -0: B^i gauge -1: David's puncture gauge -2: MB B^i gauge -3: RIT B^i gauge -4: MB beta gauge (beta gauge not means Eq.(3) of PRD 84, 124006) -5: RIT beta gauge (beta gauge not means Eq.(3) of PRD 84, 124006) -6: MGB1 B^i gauge -7: MGB2 B^i gauge -#endif -#define GAUGE 2 - -#if 0 -buffer points for CPBC boundary -#endif -#define CPBC_ghost_width (ghost_width) - -#if 0 -using BSSN variable for constraint violation and psi4 calculation: 0 -using ADM variable for constraint violation and psi4 calculation: 1 -#endif -#define ABV 0 - -#if 0 -Type of Potential and Scalar Distribution in F(R) Scalar-Tensor Theory -1: Case C of 1112.3928, V=0 -2: shell with a2^2*phi0/(1+a2^2), f(R) = R+a2*R^2 induced V -3: ground state of Schrodinger-Newton system, f(R) = R+a2*R^2 induced V -4: a2 = oo and phi(r) = phi0 * 0.5 * ( tanh((r+r0)/sigma) - tanh((r-r0)/sigma) ) -5: shell with phi(r) = phi0*Exp(-(r-r0)**2/sigma), V = 0 -#endif -#define EScalar_CC 2 - - + +#define tetradtype 2 + +#define Cell + +#define ghost_width 3 + + + +#define GAUGE 0 + +#define CPBC_ghost_width (ghost_width) + +#define ABV 0 + +#define EScalar_CC 2 + +#if 0 + +define tetradtype + v:r; u: phi; w: theta + tetradtype 0 + v^a = (x,y,z) + orthonormal order: v,u,w + m = (phi - i theta)/sqrt(2) following Frans, Eq.(8) of PRD 75, 124018(2007) + tetradtype 1 + orthonormal order: w,u,v + m = (theta + i phi)/sqrt(2) following Sperhake, Eq.(3.2) of PRD 85, 124062(2012) + tetradtype 2 + v_a = (x,y,z) + orthonormal order: v,u,w + m = (phi - i theta)/sqrt(2) following Frans, Eq.(8) of PRD 75, 124018(2007) + +define Cell or Vertex + Cell center or Vertex center + +define ghost_width + 2nd order: 2 + 4th order: 3 + 6th order: 4 + 8th order: 5 + +define WithShell + use shell or not + +define CPBC + use constraint preserving boundary condition or not + only affect Z4c + CPBC only supports WithShell + +define GAUGE + 0: B^i gauge + 1: David puncture gauge + 2: MB B^i gauge + 3: RIT B^i gauge + 4: MB beta gauge (beta gauge not means Eq.(3) of PRD 84, 124006) + 5: RIT beta gauge (beta gauge not means Eq.(3) of PRD 84, 124006) + 6: MGB1 B^i gauge + 7: MGB2 B^i gauge + +define CPBC_ghost_width (ghost_width) + buffer points for CPBC boundary + +define ABV + 0: using BSSN variable for constraint violation and psi4 calculation + 1: using ADM variable for constraint violation and psi4 calculation + +define EScalar_CC +Type of Potential and Scalar Distribution in F(R) Scalar-Tensor Theory + 1: Case C of 1112.3928, V=0 + 2: shell with phi(r) = phi0 * a2^2/(1+a2^2), f(R) = R+a2*R^2 induced V + 3: ground state of Schrodinger-Newton system, f(R) = R+a2*R^2 induced V + 4: a2 = +oo and phi(r) = phi0 * 0.5 * ( tanh((r+r0)/sigma) - tanh((r-r0)/sigma) ) + 5: shell with phi(r) = phi0 * Exp(-(r-r0)**2/sigma), V = 0 + +#endif + diff --git a/AMSS_NCKU_source/macrodef.h b/AMSS_NCKU_source/macrodef.h index 164785a..631f33b 100644 --- a/AMSS_NCKU_source/macrodef.h +++ b/AMSS_NCKU_source/macrodef.h @@ -1,112 +1,145 @@ - -#ifndef MICRODEF_H -#define MICRODEF_H - -#include "macrodef.fh" - -// application parameters - -/// **** -// sommerfeld boundary type -// 0: bam, 1: shibata -#define SommerType 0 - -/// **** -// for Using Gauss-Legendre quadrature in theta direction -#define GaussInt - -/// **** -// 0: BSSN vacuum -// 1: coupled to scalar field -// 2: Z4c vacuum -// 3: coupled to Maxwell field -// -#define ABEtype 2 - -/// **** -// using Apparent Horizon Finder -//#define With_AHF - -/// **** -// Psi4 calculation method -// 0: EB method -// 1: 4-D method -// -#define Psi4type 0 - -/// **** -// for Using point psi4 or not -//#define Point_Psi4 - -/// **** -// RestrictProlong in Step (0) or after Step (1) -#define RPS 1 - -/// **** -// Enforce algebra constraint -// for every RK4 sub step: 0 -// only when iter_count == 3: 1 -// after routine Step: 2 -#define AGM 0 - -/// **** -// Restrict Prolong using BAM style 1 or old style 0 -#define RPB 0 - -/// **** -// 1: move Analysis out ot 4 sub steps and treat PBH with Euler method -#define MAPBH 1 - -/// **** -// parallel structure, 0: level by level, 1: considering all levels, 2: as 1 but reverse the CPU order, 3: Frank's scheme -#define PSTR 0 - -/// **** -// regrid for every level or for all levels at a time -// 0: for every level; 1: for all -#define REGLEV 0 - -/// **** -// use gpu or not -//#define USE_GPU - -/// **** -// use checkpoint for every process -//#define CHECKDETAIL - -/// **** -// use FakeCheckPrepare to write CheckPoint -//#define FAKECHECK -////================================================================ -// some basic parameters for numerical calculation -#define dim 3 - -//#define Cell or Vertex in "microdef.fh" - -// ****** -// buffer point number for mesh refinement interface -#define buffer_width 6 - -// ****** -// buffer point number shell-box interface, on shell -#define SC_width buffer_width -// buffer point number shell-box interface, on box -#define CS_width (2*buffer_width) - -#if(buffer_width < ghost_width) -#error we always assume buffer_width>ghost_width -#endif - -#define PACK 1 -#define UNPACK 2 - -#define Mymax(a,b) (((a) > (b)) ? (a) : (b)) -#define Mymin(a,b) (((a) < (b)) ? (a) : (b)) - -#define feq(a,b,d) (fabs(a-b)d) - -#define TINY 1e-10 - -#endif /* MICRODEF_H */ + +#ifndef MICRODEF_H +#define MICRODEF_H + +#include "macrodef.fh" + +// application parameters + +#define SommerType 0 + +#define GaussInt + +#define ABEtype 0 + +//#define With_AHF +#define Psi4type 0 + +//#define Point_Psi4 + +#define RPS 1 + +#define AGM 0 + +#define RPB 0 + +#define MAPBH 1 + +#define PSTR 0 + +#define REGLEV 0 + +//#define USE_GPU + +//#define CHECKDETAIL + +//#define FAKECHECK + +// +// define SommerType +// sommerfeld boundary type +// 0: bam +// 1: shibata +// +// define GaussInt +// for Using Gauss-Legendre quadrature in theta direction +// +// define ABEtype +// 0: BSSN vacuum +// 1: coupled to scalar field +// 2: Z4c vacuum +// 3: coupled to Maxwell field +// +// define With_AHF +// using Apparent Horizon Finder +// +// define Psi4type +// Psi4 calculation method +// 0: EB method +// 1: 4-D method +// +// define Point_Psi4 +// for Using point psi4 or not +// +// define RPS +// RestrictProlong in Step (0) or after Step (1) +// +// define AGM +// Enforce algebra constraint +// for every RK4 sub step: 0 +// only when iter_count == 3: 1 +// after routine Step: 2 +// +// define RPB +// Restrict Prolong using BAM style 1 or old style 0 +// +// define MAPBH +// 1: move Analysis out ot 4 sub steps and treat PBH with Euler method +// +// define PSTR +// parallel structure +// 0: level by level +// 1: considering all levels +// 2: as 1 but reverse the CPU order +// 3: Frank's scheme +// +// define REGLEV +// regrid for every level or for all levels at a time +// 0: for every level; +// 1: for all +// +// define USE_GPU +// use gpu or not +// +// define CHECKDETAIL +// use checkpoint for every process +// +// define FAKECHECK +// use FakeCheckPrepare to write CheckPoint +// + +////================================================================ +// some basic parameters for numerical calculation +////================================================================ + +#define dim 3 + +//#define Cell or Vertex in "macrodef.fh" + +#define buffer_width 6 + +#define SC_width buffer_width + +#define CS_width (2*buffer_width) + +// +// define Cell or Vertex in "macrodef.fh" +// +// define buffer_width +// buffer point number for mesh refinement interface +// +// define SC_width buffer_width +// buffer point number shell-box interface, on shell +// +// define CS_width +// buffer point number shell-box interface, on box +// + +#if(buffer_width < ghost_width) +# error we always assume buffer_width>ghost_width +#endif + +#define PACK 1 +#define UNPACK 2 + +#define Mymax(a,b) (((a) > (b)) ? (a) : (b)) +#define Mymin(a,b) (((a) < (b)) ? (a) : (b)) + +#define feq(a,b,d) (fabs(a-b)d) + +#define TINY 1e-10 + +#endif /* MICRODEF_H */ + diff --git a/AMSS_NCKU_source/makefile b/AMSS_NCKU_source/makefile index f2d4e3c..7849f37 100644 --- a/AMSS_NCKU_source/makefile +++ b/AMSS_NCKU_source/makefile @@ -2,6 +2,27 @@ include makefile.inc +## ABE build flags selected by PGO_MODE (set in makefile.inc, default: opt) +## make -> opt (PGO-guided, maximum performance) +## make PGO_MODE=instrument -> instrument (Phase 1: collect fresh profile data) +PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/default.profdata + +ifeq ($(PGO_MODE),instrument) +## Phase 1: instrumentation — omit -ipo/-fp-model fast=2 for faster build and numerical stability +CXXAPPFLAGS = -O3 -xHost -fma -fprofile-instr-generate -ipo \ + -Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS) +f90appflags = -O3 -xHost -fma -fprofile-instr-generate -ipo \ + -align array64byte -fpp -I${MKLROOT}/include +else +## opt (default): maximum performance with PGO profile data +CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \ + -fprofile-instr-use=$(PROFDATA) \ + -Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS) +f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \ + -fprofile-instr-use=$(PROFDATA) \ + -align array64byte -fpp -I${MKLROOT}/include +endif + .SUFFIXES: .o .f90 .C .for .cu .f90.o: @@ -16,19 +37,54 @@ include makefile.inc .cu.o: $(Cu) $(CUDA_APP_FLAGS) -c $< -o $@ $(CUDA_LIB_PATH) +# C rewrite of BSSN RHS kernel and helpers +bssn_rhs_c.o: bssn_rhs_c.C + ${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@ + +fderivs_c.o: fderivs_c.C + ${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@ + +fdderivs_c.o: fdderivs_c.C + ${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@ + +kodiss_c.o: kodiss_c.C + ${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@ + +lopsided_c.o: lopsided_c.C + ${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@ + +interp_lb_profile.o: interp_lb_profile.C interp_lb_profile.h + ${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@ + +## TwoPunctureABE uses fixed optimal flags with its own PGO profile, independent of CXXAPPFLAGS +TP_PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/TwoPunctureABE.profdata +TP_OPTFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \ + -fprofile-instr-use=$(TP_PROFDATA) \ + -Dfortran3 -Dnewc -I${MKLROOT}/include + TwoPunctures.o: TwoPunctures.C - ${CXX} $(CXXAPPFLAGS) -qopenmp -c $< -o $@ + ${CXX} $(TP_OPTFLAGS) -qopenmp -c $< -o $@ TwoPunctureABE.o: TwoPunctureABE.C - ${CXX} $(CXXAPPFLAGS) -qopenmp -c $< -o $@ + ${CXX} $(TP_OPTFLAGS) -qopenmp -c $< -o $@ # Input files + +## Kernel implementation switch (set USE_CXX_KERNELS=0 to fall back to Fortran) +ifeq ($(USE_CXX_KERNELS),0) +# Fortran mode: no C rewrite files; bssn_rhs.o is included via F90FILES below +CFILES = +else +# C++ mode (default): C rewrite of bssn_rhs and helper kernels +CFILES = bssn_rhs_c.o fderivs_c.o fdderivs_c.o kodiss_c.o lopsided_c.o +endif + C++FILES = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\ cgh.o bssn_class.o surface_integral.o ShellPatch.o\ bssnEScalar_class.o perf.o Z4c_class.o NullShellPatch.o\ bssnEM_class.o cpbc_util.o z4c_rhs_point.o checkpoint.o\ Parallel_bam.o scalar_class.o transpbh.o NullShellPatch2.o\ - NullShellPatch2_Evo.o writefile_f.o + NullShellPatch2_Evo.o writefile_f.o interp_lb_profile.o C++FILES_GPU = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\ cgh.o surface_integral.o ShellPatch.o\ @@ -38,9 +94,9 @@ C++FILES_GPU = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o NullShellPatch2_Evo.o \ bssn_gpu_class.o bssn_step_gpu.o bssn_macro.o writefile_f.o -F90FILES = enforce_algebra.o fmisc.o initial_puncture.o prolongrestrict.o\ +F90FILES_BASE = enforce_algebra.o fmisc.o initial_puncture.o prolongrestrict.o\ prolongrestrict_cell.o prolongrestrict_vertex.o\ - rungekutta4_rout.o bssn_rhs.o diff_new.o kodiss.o kodiss_sh.o\ + rungekutta4_rout.o diff_new.o kodiss.o kodiss_sh.o\ lopsidediff.o sommerfeld_rout.o getnp4.o diff_new_sh.o\ shellfunctions.o bssn_rhs_ss.o Set_Rho_ADM.o\ getnp4EScalar.o bssnEScalar_rhs.o bssn_constraint.o ricci_gamma.o\ @@ -51,6 +107,14 @@ F90FILES = enforce_algebra.o fmisc.o initial_puncture.o prolongrestrict.o\ scalar_rhs.o initial_scalar.o NullEvol2.o initial_null2.o\ NullNews2.o tool_f.o +ifeq ($(USE_CXX_KERNELS),0) +# Fortran mode: include original bssn_rhs.o +F90FILES = $(F90FILES_BASE) bssn_rhs.o +else +# C++ mode (default): bssn_rhs.o replaced by C++ kernel +F90FILES = $(F90FILES_BASE) +endif + F77FILES = zbesh.o AHFDOBJS = expansion.o expansion_Jacobian.o patch.o coords.o patch_info.o patch_interp.o patch_system.o \ @@ -63,7 +127,7 @@ TwoPunctureFILES = TwoPunctureABE.o TwoPunctures.o CUDAFILES = bssn_gpu.o bssn_gpu_rhs_ss.o # file dependences -$(C++FILES) $(C++FILESGPU) $(F90FILES) $(AHFDOBJS) $(CUDAFILES): macrodef.fh +$(C++FILES) $(C++FILES_GPU) $(F90FILES) $(CFILES) $(AHFDOBJS) $(CUDAFILES): macrodef.fh $(C++FILES): Block.h enforce_algebra.h fmisc.h initial_puncture.h macrodef.h\ misc.h monitor.h MyList.h Parallel.h MPatch.h prolongrestrict.h\ @@ -86,7 +150,7 @@ $(C++FILES_GPU): Block.h enforce_algebra.h fmisc.h initial_puncture.h macrodef.h $(AHFDOBJS): cctk.h cctk_Config.h cctk_Types.h cctk_Constants.h myglobal.h -$(C++FILES) $(C++FILES_GPU) $(AHFDOBJS) $(CUDAFILES): macrodef.h +$(C++FILES) $(C++FILES_GPU) $(CFILES) $(AHFDOBJS) $(CUDAFILES): macrodef.h TwoPunctureFILES: TwoPunctures.h @@ -95,14 +159,14 @@ $(CUDAFILES): bssn_gpu.h gpu_mem.h gpu_rhsSS_mem.h misc.o : zbesh.o # projects -ABE: $(C++FILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) - $(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(LDLIBS) +ABE: $(C++FILES) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) + $(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(LDLIBS) -ABEGPU: $(C++FILES_GPU) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) - $(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES_GPU) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) $(LDLIBS) +ABEGPU: $(C++FILES_GPU) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) + $(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES_GPU) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) $(LDLIBS) TwoPunctureABE: $(TwoPunctureFILES) - $(CLINKER) $(CXXAPPFLAGS) -qopenmp -o $@ $(TwoPunctureFILES) $(LDLIBS) + $(CLINKER) $(TP_OPTFLAGS) -qopenmp -o $@ $(TwoPunctureFILES) $(LDLIBS) clean: rm *.o ABE ABEGPU TwoPunctureABE make.log -f diff --git a/AMSS_NCKU_source/makefile.inc b/AMSS_NCKU_source/makefile.inc index d0899af..ad83ceb 100755 --- a/AMSS_NCKU_source/makefile.inc +++ b/AMSS_NCKU_source/makefile.inc @@ -8,18 +8,31 @@ filein = -I/usr/include/ -I${MKLROOT}/include ## Using sequential MKL (OpenMP disabled for better single-threaded performance) ## Added -lifcore for Intel Fortran runtime and -limf for Intel math library -LDLIBS = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore -limf -lpthread -lm -ldl +LDLIBS = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore -limf -lpthread -lm -ldl -liomp5 -## Aggressive optimization flags + PGO Phase 2 (profile-guided optimization) -## -fprofile-instr-use: use collected profile data to guide optimization decisions -## (branch prediction, basic block layout, inlining, loop unrolling) -PROFDATA = ../../pgo_profile/default.profdata -CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \ - -fprofile-instr-use=$(PROFDATA) \ - -Dfortran3 -Dnewc -I${MKLROOT}/include -f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \ - -fprofile-instr-use=$(PROFDATA) \ - -align array64byte -fpp -I${MKLROOT}/include +## PGO build mode switch (ABE only; TwoPunctureABE always uses opt flags) +## opt : (default) maximum performance with PGO profile-guided optimization +## instrument : PGO Phase 1 instrumentation to collect fresh profile data +PGO_MODE ?= opt + +## Interp_Points load balance profiling mode +## off : (default) no load balance instrumentation +## profile : Pass 1 — instrument Interp_Points to collect timing profile +## optimize : Pass 2 — read profile and apply block rebalancing +INTERP_LB_MODE ?= off + +ifeq ($(INTERP_LB_MODE),profile) +INTERP_LB_FLAGS = -DINTERP_LB_PROFILE +else ifeq ($(INTERP_LB_MODE),optimize) +INTERP_LB_FLAGS = -DINTERP_LB_OPTIMIZE +else +INTERP_LB_FLAGS = +endif + +## Kernel implementation switch +## 1 (default) : use C++ rewrite of bssn_rhs and helper kernels (faster) +## 0 : fall back to original Fortran kernels +USE_CXX_KERNELS ?= 1 f90 = ifx f77 = ifx CXX = icpx diff --git a/AMSS_NCKU_source/share_func.h b/AMSS_NCKU_source/share_func.h new file mode 100644 index 0000000..5051448 --- /dev/null +++ b/AMSS_NCKU_source/share_func.h @@ -0,0 +1,146 @@ +#ifndef SHARE_FUNC_H +#define SHARE_FUNC_H + +#include +#include +#include +#include +/* 主网格:0-based -> 1D */ +static inline size_t idx_ex(int i0, int j0, int k0, const int ex[3]) { + const int ex1 = ex[0], ex2 = ex[1]; + return (size_t)i0 + (size_t)j0 * (size_t)ex1 + (size_t)k0 * (size_t)ex1 * (size_t)ex2; +} + +/* + * fh 对应 Fortran: fh(-1:ex1, -1:ex2, -1:ex3) + * ord=2 => shift=1 + * iF/jF/kF 为 Fortran 索引(可为 -1,0,1..ex) + */ +static inline size_t idx_fh_F_ord2(int iF, int jF, int kF, const int ex[3]) { + const int shift = 1; + const int nx = ex[0] + 2; // ex1 + ord + const int ny = ex[1] + 2; + + const int ii = iF + shift; // 0..ex1+1 + const int jj = jF + shift; // 0..ex2+1 + const int kk = kF + shift; // 0..ex3+1 + + return (size_t)ii + (size_t)jj * (size_t)nx + (size_t)kk * (size_t)nx * (size_t)ny; +} + +/* + * fh 对应 Fortran: fh(-2:ex1, -2:ex2, -2:ex3) + * ord=3 => shift=2 + * iF/jF/kF 是 Fortran 索引(可为负) + */ +static inline size_t idx_fh_F(int iF, int jF, int kF, const int ex[3]) { + const int shift = 2; // ord=3 -> -2..ex + const int nx = ex[0] + 3; // ex1 + ord + const int ny = ex[1] + 3; + + const int ii = iF + shift; // 0..ex1+2 + const int jj = jF + shift; // 0..ex2+2 + const int kk = kF + shift; // 0..ex3+2 + + return (size_t)ii + (size_t)jj * (size_t)nx + (size_t)kk * (size_t)nx * (size_t)ny; +} + +/* + * func: (1..extc1, 1..extc2, 1..extc3) 1-based in Fortran + * funcc: (-ord+1..extc1, -ord+1..extc2, -ord+1..extc3) in Fortran + * + * C 里我们把: + * func 视为 0-based: i0=0..extc1-1, j0=0..extc2-1, k0=0..extc3-1 + * funcc 用“平移下标”存为一维数组: + * iF in [-ord+1..extc1] -> ii = iF + (ord-1) in [0..extc1+ord-1] + * 总长度 nx = extc1 + ord + * 同理 ny = extc2 + ord, nz = extc3 + ord + */ + +static inline size_t idx_func0(int i0, int j0, int k0, const int extc[3]) { + const int nx = extc[0], ny = extc[1]; + return (size_t)i0 + (size_t)j0 * (size_t)nx + (size_t)k0 * (size_t)nx * (size_t)ny; +} + +static inline size_t idx_funcc_F(int iF, int jF, int kF, int ord, const int extc[3]) { + const int shift = ord - 1; // iF = -shift .. extc1 + const int nx = extc[0] + ord; // [-shift..extc1] 共 extc1+ord 个 + const int ny = extc[1] + ord; + + const int ii = iF + shift; // 0..extc1+shift + const int jj = jF + shift; // 0..extc2+shift + const int kk = kF + shift; // 0..extc3+shift + + return (size_t)ii + (size_t)jj * (size_t)nx + (size_t)kk * (size_t)nx * (size_t)ny; +} + +/* + * 等价于 Fortran: + * funcc(1:extc1,1:extc2,1:extc3)=func + * do i=0,ord-1 + * funcc(-i,1:extc2,1:extc3) = funcc(i+1,1:extc2,1:extc3)*SoA(1) + * enddo + * do i=0,ord-1 + * funcc(:,-i,1:extc3) = funcc(:,i+1,1:extc3)*SoA(2) + * enddo + * do i=0,ord-1 + * funcc(:,:,-i) = funcc(:,:,i+1)*SoA(3) + * enddo + */ +static inline void symmetry_bd(int ord, + const int extc[3], + const double *func, + double *funcc, + const double SoA[3]) +{ + const int extc1 = extc[0], extc2 = extc[1], extc3 = extc[2]; + + // 1) funcc(1:extc1,1:extc2,1:extc3) = func + // Fortran 的 (iF=1..extc1) 对应 C 的 func(i0=0..extc1-1) + for (int k0 = 0; k0 < extc3; ++k0) { + for (int j0 = 0; j0 < extc2; ++j0) { + for (int i0 = 0; i0 < extc1; ++i0) { + const int iF = i0 + 1, jF = j0 + 1, kF = k0 + 1; + funcc[idx_funcc_F(iF, jF, kF, ord, extc)] = func[idx_func0(i0, j0, k0, extc)]; + } + } + } + + // 2) do i=0..ord-1: funcc(-i, 1:extc2, 1:extc3) = funcc(i+1, ...)*SoA(1) + for (int ii = 0; ii <= ord - 1; ++ii) { + const int iF_dst = -ii; // 0, -1, -2, ... + const int iF_src = ii + 1; // 1, 2, 3, ... + for (int kF = 1; kF <= extc3; ++kF) { + for (int jF = 1; jF <= extc2; ++jF) { + funcc[idx_funcc_F(iF_dst, jF, kF, ord, extc)] = + funcc[idx_funcc_F(iF_src, jF, kF, ord, extc)] * SoA[0]; + } + } + } + + // 3) do i=0..ord-1: funcc(:,-i, 1:extc3) = funcc(:, i+1, 1:extc3)*SoA(2) + // 注意 Fortran 这里的 ":" 表示 iF 从 (-ord+1..extc1) 全覆盖 + for (int jj = 0; jj <= ord - 1; ++jj) { + const int jF_dst = -jj; + const int jF_src = jj + 1; + for (int kF = 1; kF <= extc3; ++kF) { + for (int iF = -ord + 1; iF <= extc1; ++iF) { + funcc[idx_funcc_F(iF, jF_dst, kF, ord, extc)] = + funcc[idx_funcc_F(iF, jF_src, kF, ord, extc)] * SoA[1]; + } + } + } + + // 4) do i=0..ord-1: funcc(:,:,-i) = funcc(:,:, i+1)*SoA(3) + for (int kk = 0; kk <= ord - 1; ++kk) { + const int kF_dst = -kk; + const int kF_src = kk + 1; + for (int jF = -ord + 1; jF <= extc2; ++jF) { + for (int iF = -ord + 1; iF <= extc1; ++iF) { + funcc[idx_funcc_F(iF, jF, kF_dst, ord, extc)] = + funcc[idx_funcc_F(iF, jF, kF_src, ord, extc)] * SoA[2]; + } + } + } +} +#endif diff --git a/AMSS_NCKU_source/tool.h b/AMSS_NCKU_source/tool.h new file mode 100644 index 0000000..154b5ec --- /dev/null +++ b/AMSS_NCKU_source/tool.h @@ -0,0 +1,27 @@ +#include "share_func.h" +void fdderivs(const int ex[3], + const double *f, + double *fxx, double *fxy, double *fxz, + double *fyy, double *fyz, double *fzz, + const double *X, const double *Y, const double *Z, + double SYM1, double SYM2, double SYM3, + int Symmetry, int onoff); + +void fderivs(const int ex[3], + const double *f, + double *fx, double *fy, double *fz, + const double *X, const double *Y, const double *Z, + double SYM1, double SYM2, double SYM3, + int Symmetry, int onoff); + +void kodis(const int ex[3], + const double *X, const double *Y, const double *Z, + const double *f, double *f_rhs, + const double SoA[3], + int Symmetry, double eps); + +void lopsided(const int ex[3], + const double *X, const double *Y, const double *Z, + const double *f, double *f_rhs, + const double *Sfx, const double *Sfy, const double *Sfz, + int Symmetry, const double SoA[3]); \ No newline at end of file diff --git a/generate_interp_lb_header.py b/generate_interp_lb_header.py new file mode 100644 index 0000000..a1f1c59 --- /dev/null +++ b/generate_interp_lb_header.py @@ -0,0 +1,72 @@ +#!/usr/bin/env python3 +"""Convert interp_lb_profile.bin to a C header for compile-time embedding.""" +import struct, sys + +if len(sys.argv) < 3: + print(f"Usage: {sys.argv[0]} ") + sys.exit(1) + +with open(sys.argv[1], 'rb') as f: + magic, version, nprocs, num_heavy = struct.unpack('IIii', f.read(16)) + threshold = struct.unpack('d', f.read(8))[0] + times = list(struct.unpack(f'{nprocs}d', f.read(nprocs * 8))) + heavy = list(struct.unpack(f'{num_heavy}i', f.read(num_heavy * 4))) + +# For each heavy rank, compute split: left half -> lighter neighbor, right half -> heavy rank +# (or vice versa depending on which neighbor is lighter) +splits = [] +for hr in heavy: + prev_t = times[hr - 1] if hr > 0 else 1e30 + next_t = times[hr + 1] if hr < nprocs - 1 else 1e30 + if prev_t <= next_t: + splits.append((hr, hr - 1, hr)) # (block_id, r_left, r_right) + else: + splits.append((hr, hr, hr + 1)) + +# Also remap the displaced neighbor blocks +remaps = {} +for hr, r_l, r_r in splits: + if r_l != hr: + # We took r_l's slot, so remap block r_l to its other neighbor + displaced = r_l + if displaced > 0 and displaced - 1 not in [s[0] for s in splits]: + remaps[displaced] = displaced - 1 + elif displaced < nprocs - 1: + remaps[displaced] = displaced + 1 + else: + displaced = r_r + if displaced < nprocs - 1 and displaced + 1 not in [s[0] for s in splits]: + remaps[displaced] = displaced + 1 + elif displaced > 0: + remaps[displaced] = displaced - 1 + +with open(sys.argv[2], 'w') as out: + out.write("/* Auto-generated from interp_lb_profile.bin — do not edit */\n") + out.write("#ifndef INTERP_LB_PROFILE_DATA_H\n") + out.write("#define INTERP_LB_PROFILE_DATA_H\n\n") + out.write(f"#define INTERP_LB_NPROCS {nprocs}\n") + out.write(f"#define INTERP_LB_NUM_HEAVY {num_heavy}\n\n") + out.write(f"static const int interp_lb_heavy_blocks[{num_heavy}] = {{") + out.write(", ".join(str(h) for h in heavy)) + out.write("};\n\n") + out.write("/* Split table: {block_id, r_left, r_right} */\n") + out.write(f"static const int interp_lb_splits[{num_heavy}][3] = {{\n") + for bid, rl, rr in splits: + out.write(f" {{{bid}, {rl}, {rr}}},\n") + out.write("};\n\n") + out.write("/* Rank remap for displaced neighbor blocks */\n") + out.write(f"static const int interp_lb_num_remaps = {len(remaps)};\n") + out.write(f"static const int interp_lb_remaps[][2] = {{\n") + for src, dst in sorted(remaps.items()): + out.write(f" {{{src}, {dst}}},\n") + if not remaps: + out.write(" {-1, -1},\n") + out.write("};\n\n") + out.write("#endif /* INTERP_LB_PROFILE_DATA_H */\n") + +print(f"Generated {sys.argv[2]}:") +print(f" {num_heavy} heavy blocks to split: {heavy}") +for bid, rl, rr in splits: + print(f" block {bid}: split -> rank {rl} (left), rank {rr} (right)") +for src, dst in sorted(remaps.items()): + print(f" block {src}: remap -> rank {dst}") diff --git a/makefile_and_run.py b/makefile_and_run.py index 096ed58..1a0b937 100755 --- a/makefile_and_run.py +++ b/makefile_and_run.py @@ -11,17 +11,46 @@ import AMSS_NCKU_Input as input_data import subprocess import time -## CPU core binding configuration using taskset -## taskset ensures all child processes inherit the CPU affinity mask -## This forces make and all compiler processes to use only nohz_full cores (4-55, 60-111) -## Format: taskset -c 4-55,60-111 ensures processes only run on these cores -#NUMACTL_CPU_BIND = "taskset -c 0-111" -NUMACTL_CPU_BIND = "taskset -c 16-47,64-95" -## Build parallelism configuration -## Use nohz_full cores (4-55, 60-111) for compilation: 52 + 52 = 104 cores -## Set make -j to utilize available cores for faster builds -BUILD_JOBS = 96 + +def get_last_n_cores_per_socket(n=32): + """ + Read CPU topology via lscpu and return a taskset -c string + selecting the last `n` cores of each NUMA node (socket). + + Example: 2 sockets x 56 cores each, n=32 -> node0: 24-55, node1: 80-111 + -> "taskset -c 24-55,80-111" + """ + result = subprocess.run(["lscpu", "--parse=NODE,CPU"], capture_output=True, text=True) + + # Build a dict: node_id -> sorted list of CPU ids + node_cpus = {} + for line in result.stdout.splitlines(): + if line.startswith("#") or not line.strip(): + continue + parts = line.split(",") + if len(parts) < 2: + continue + node_id, cpu_id = int(parts[0]), int(parts[1]) + node_cpus.setdefault(node_id, []).append(cpu_id) + + segments = [] + for node_id in sorted(node_cpus): + cpus = sorted(node_cpus[node_id]) + selected = cpus[-n:] # last n cores of this socket + segments.append(f"{selected[0]}-{selected[-1]}") + + cpu_str = ",".join(segments) + total = len(segments) * n + print(f" CPU binding: taskset -c {cpu_str} ({total} cores, last {n} per socket)") + return f"taskset -c {cpu_str}" + + +## CPU core binding: dynamically select the last 32 cores of each socket (64 cores total) +NUMACTL_CPU_BIND = get_last_n_cores_per_socket(n=32) + +## Build parallelism: match the number of bound cores +BUILD_JOBS = 64 ################################################################## @@ -40,7 +69,7 @@ def makefile_ABE(): ## Build command with CPU binding to nohz_full cores if (input_data.GPU_Calculation == "no"): - makefile_command = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} ABE" + makefile_command = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} INTERP_LB_MODE=optimize ABE" elif (input_data.GPU_Calculation == "yes"): makefile_command = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} ABEGPU" else: diff --git a/pgo_profile/TwoPunctureABE.profdata b/pgo_profile/TwoPunctureABE.profdata new file mode 100644 index 0000000..ea95175 Binary files /dev/null and b/pgo_profile/TwoPunctureABE.profdata differ diff --git a/pgo_profile/default.profdata b/pgo_profile/default.profdata index c09d078..e875ff0 100644 Binary files a/pgo_profile/default.profdata and b/pgo_profile/default.profdata differ diff --git a/pgo_profile/default.profdata-f b/pgo_profile/default.profdata-f new file mode 100644 index 0000000..c09d078 Binary files /dev/null and b/pgo_profile/default.profdata-f differ diff --git a/pgo_profile/default.profdata.backup2 b/pgo_profile/default.profdata.backup2 new file mode 100644 index 0000000..c09d078 Binary files /dev/null and b/pgo_profile/default.profdata.backup2 differ diff --git a/pgo_profile/default.profdatabackup3 b/pgo_profile/default.profdatabackup3 new file mode 100644 index 0000000..156744d Binary files /dev/null and b/pgo_profile/default.profdatabackup3 differ diff --git a/pgo_profile/default_9725923726611433605_0.profraw b/pgo_profile/default_9725923726611433605_0.profraw new file mode 100644 index 0000000..e38d300 Binary files /dev/null and b/pgo_profile/default_9725923726611433605_0.profraw differ diff --git a/pgo_profile/default_9726420327935033477_0.profraw b/pgo_profile/default_9726420327935033477_0.profraw new file mode 100644 index 0000000..e46d05a Binary files /dev/null and b/pgo_profile/default_9726420327935033477_0.profraw differ