diff --git a/AMSS_NCKU_source/Parallel.C b/AMSS_NCKU_source/Parallel.C index 2a193a7..3e941ff 100644 --- a/AMSS_NCKU_source/Parallel.C +++ b/AMSS_NCKU_source/Parallel.C @@ -6,6 +6,7 @@ #include "parameters.h" #include #include +#include #include #ifndef USE_CUDA_Z4C @@ -219,6 +220,12 @@ int fortran_idint(double x) bool cuda_amr_restrict_device_enabled(); bool cuda_amr_prolong_device_enabled(); +bool cuda_amr_restrict_compare_enabled(); +bool cuda_amr_restrict_batch_enabled(); +bool cuda_device_segment_batch_enabled(); +bool cuda_download_resident_subset_to_host(Block *block, + MyList *vars, + int state_count); bool cuda_cell_gw3_restrict_params(const Parallel::gridseg *src, const Parallel::gridseg *dst, @@ -479,6 +486,61 @@ bool cuda_amr_prolong_device_enabled() return enabled != 0; } +bool cuda_amr_restrict_compare_enabled() +{ + static int enabled = -1; + if (enabled < 0) + { + const char *env = getenv("AMSS_CUDA_AMR_RESTRICT_COMPARE"); + enabled = (env && atoi(env) != 0) ? 1 : 0; + } + return enabled != 0; +} + +bool cuda_amr_restrict_batch_enabled() +{ + static int enabled = -1; + if (enabled < 0) + { + const char *env = getenv("AMSS_CUDA_AMR_RESTRICT_BATCH"); + enabled = (env && atoi(env) != 0) ? 1 : 0; + } + return enabled != 0; +} + +bool cuda_device_segment_batch_enabled() +{ + static int enabled = -1; + if (enabled < 0) + { + const char *env = getenv("AMSS_CUDA_DEVICE_SEGMENT_BATCH"); + enabled = (!env || atoi(env) != 0) ? 1 : 0; + } + return enabled != 0; +} + +double cuda_amr_restrict_compare_tol() +{ + static double tol = -1.0; + if (tol < 0.0) + { + const char *env = getenv("AMSS_CUDA_AMR_RESTRICT_COMPARE_TOL"); + tol = (env && atof(env) > 0.0) ? atof(env) : 1.0e-9; + } + return tol; +} + +int cuda_amr_restrict_compare_limit() +{ + static int limit = -1; + if (limit < 0) + { + const char *env = getenv("AMSS_CUDA_AMR_RESTRICT_COMPARE_LIMIT"); + limit = (env && atoi(env) > 0) ? atoi(env) : 8; + } + return limit; +} + bool cuda_mpi_diag_enabled() { static int enabled = -1; @@ -543,7 +605,8 @@ bool cuda_direct_pack_segment_to_device(double *buffer, const Parallel::gridseg *dst, int state_count, int type, - MyList *VarLists) + MyList *VarLists, + int Symmetry) { #if USE_CUDA_BSSN if (state_count <= 0 || state_count > BSSN_CUDA_STATE_COUNT) @@ -584,6 +647,103 @@ bool cuda_direct_pack_segment_to_device(double *buffer, src->Bg, state_count, buffer, src->Bg->shape, dst->shape[0], dst->shape[1], dst->shape[2], first_fine[0], first_fine[1], first_fine[2]) == 0; + if (ok && cuda_amr_restrict_compare_enabled()) + { + const int region_all = dst->shape[0] * dst->shape[1] * dst->shape[2]; + const int total = state_count * region_all; + double *cpu = new double[total]; + double *gpu = new double[total]; + if (!cuda_download_resident_subset_to_host(src->Bg, VarLists, state_count)) + { + delete[] cpu; + delete[] gpu; + return false; + } + int DIM = dim; + MyList *v = VarLists; + for (int s = 0; s < state_count && v; ++s, v = v->next) + { + f_restrict3(DIM, + const_cast(dst->llb), + const_cast(dst->uub), + const_cast(dst->shape), + cpu + (size_t)s * region_all, + src->Bg->bbox, + src->Bg->bbox + dim, + src->Bg->shape, + src->Bg->fgfs[v->data->sgfn], + const_cast(dst->llb), + const_cast(dst->uub), + v->data->SoA, + Symmetry); + } + cudaError_t cerr = cudaMemcpy(gpu, buffer, (size_t)total * sizeof(double), cudaMemcpyDeviceToHost); + if (cerr != cudaSuccess) + { + fprintf(stderr, "Parallel: restrict compare cudaMemcpy failed, err=%d\n", (int)cerr); + delete[] cpu; + delete[] gpu; + return false; + } + double max_abs = 0.0; + double max_rel = 0.0; + int max_idx = -1; + for (int i = 0; i < total; ++i) + { + const double diff = fabs(cpu[i] - gpu[i]); + const double den = fmax(fabs(cpu[i]), fabs(gpu[i])); + const double rel = den > 0.0 ? diff / den : diff; + if (diff > max_abs) + { + max_abs = diff; + max_rel = rel; + max_idx = i; + } + } + static int report_count = 0; + const double tol = cuda_amr_restrict_compare_tol(); + int rank = 0; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + if (max_abs > tol || max_rel > tol) + { + const int state = max_idx / region_all; + const int local = max_idx - state * region_all; + const int ii = local % dst->shape[0]; + const int jj = (local / dst->shape[0]) % dst->shape[1]; + const int kk = local / (dst->shape[0] * dst->shape[1]); + if (report_count < cuda_amr_restrict_compare_limit()) + { + fprintf(stderr, + "[AMSS-CUDA-RESTRICT-CMP][rank %d] mismatch state=%d point=(%d,%d,%d) " + "shape=(%d,%d,%d) first_fine=(%d,%d,%d) max_abs=%.17e max_rel=%.17e " + "cpu=%.17e gpu=%.17e src_lev=%d dst_lev=%d\n", + rank, state, ii, jj, kk, + dst->shape[0], dst->shape[1], dst->shape[2], + first_fine[0], first_fine[1], first_fine[2], + max_abs, max_rel, cpu[max_idx], gpu[max_idx], + src->Bg->lev, dst->Bg->lev); + fflush(stderr); + report_count++; + } + delete[] cpu; + delete[] gpu; + return false; + } + else if (report_count < cuda_amr_restrict_compare_limit()) + { + fprintf(stderr, + "[AMSS-CUDA-RESTRICT-CMP][rank %d] ok shape=(%d,%d,%d) " + "first_fine=(%d,%d,%d) max_abs=%.17e max_rel=%.17e\n", + rank, + dst->shape[0], dst->shape[1], dst->shape[2], + first_fine[0], first_fine[1], first_fine[2], + max_abs, max_rel); + fflush(stderr); + report_count++; + } + delete[] cpu; + delete[] gpu; + } } else if (type == 3) { @@ -762,6 +922,10 @@ int cuda_data_packer_device_batched(double *data, (void)Symmetry; if (!data || (dir != PACK && dir != UNPACK) || !src || !dst) return -1; + if (!cuda_device_segment_batch_enabled()) + return -1; + if (cuda_amr_restrict_compare_enabled()) + return -1; int myrank; MPI_Comm_rank(MPI_COMM_WORLD, &myrank); @@ -790,6 +954,8 @@ int cuda_data_packer_device_batched(double *data, type = 2; else type = 3; + if (dir == PACK && type == 2 && !cuda_amr_restrict_batch_enabled()) + return -1; Block *block = (dir == PACK) ? src->data->Bg : dst->data->Bg; if ((dir == PACK && !cuda_can_direct_pack(src->data, dst->data, type)) || @@ -4819,7 +4985,7 @@ int Parallel::data_packer(double *data, MyList *src, MyList

data, dst->data, type)) { if (s_cuda_aware_pack_active) { - handled_by_cuda = cuda_direct_pack_segment_to_device(data + size_out, src->data, dst->data, state_count, type, VarLists); + handled_by_cuda = cuda_direct_pack_segment_to_device(data + size_out, src->data, dst->data, state_count, type, VarLists, Symmetry); } else { handled_by_cuda = cuda_direct_pack_segment(data + size_out, src->data, dst->data, state_count, VarLists); }