From 5b00d49070a2aec2cd9779c930b8c13fe01681f8 Mon Sep 17 00:00:00 2001 From: CGH0S7 <776459475@qq.com> Date: Thu, 9 Apr 2026 16:44:08 +0800 Subject: [PATCH] Reduce staged GPU host-device copies --- AMSS_NCKU_source/bssn_cuda_ops.cu | 42 ++++- AMSS_NCKU_source/bssn_cuda_ops.h | 6 +- AMSS_NCKU_source/bssn_cuda_step.C | 49 ++++- AMSS_NCKU_source/bssn_gpu.cu | 299 ++++++++++++++++++++++++++---- AMSS_NCKU_source/bssn_gpu.h | 2 + 5 files changed, 349 insertions(+), 49 deletions(-) diff --git a/AMSS_NCKU_source/bssn_cuda_ops.cu b/AMSS_NCKU_source/bssn_cuda_ops.cu index 6ed08ee..b5fb35a 100644 --- a/AMSS_NCKU_source/bssn_cuda_ops.cu +++ b/AMSS_NCKU_source/bssn_cuda_ops.cu @@ -728,7 +728,8 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT, const double SoA[3], int symmetry, int lev, - int rk_stage) + int rk_stage, + bool download_to_host) { struct Rk4VarCache { @@ -790,7 +791,7 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT, ok = ok && (!refresh_state0 || copy_to_device_preferring_device(cache.state0, state0, bytes)) && - (!need_boundary_input || copy_to_device(cache.boundary, boundary_src, bytes)) && + (!need_boundary_input || copy_to_device_preferring_device(cache.boundary, boundary_src, bytes)) && (!refresh_rhs || copy_to_device_preferring_device(cache.rhs, rhs_accum, bytes)); if (ok && need_stage_input) @@ -885,16 +886,18 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT, if (ok) { bssn_gpu_register_device_buffer(stage_data, stage_ptr); - - cudaError_t err = cudaMemcpy(stage_data, stage_ptr, bytes, cudaMemcpyDeviceToHost); - if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) stage_data", err); - ok = err == cudaSuccess; + if (download_to_host) + { + cudaError_t err = cudaMemcpy(stage_data, stage_ptr, bytes, cudaMemcpyDeviceToHost); + if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) stage_data", err); + ok = err == cudaSuccess; + } } return ok ? 0 : 1; } -int bssn_cuda_lowerbound(int *ex, double *chi, double tinny) +int bssn_cuda_lowerbound(int *ex, double *chi, double tinny, bool download_to_host) { static thread_local CachedBuffer d_chi; @@ -926,13 +929,32 @@ int bssn_cuda_lowerbound(int *ex, double *chi, double tinny) if (ok) { bssn_gpu_register_device_buffer(chi, device_chi); - cudaError_t err = cudaMemcpy(chi, device_chi, bytes, cudaMemcpyDeviceToHost); - if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) chi", err); - ok = err == cudaSuccess; + if (download_to_host) + { + cudaError_t err = cudaMemcpy(chi, device_chi, bytes, cudaMemcpyDeviceToHost); + if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) chi", err); + ok = err == cudaSuccess; + } } return ok ? 0 : 1; } +int bssn_cuda_download_buffer(int *ex, double *host_ptr) +{ + const double *device_ptr = bssn_gpu_find_device_buffer(host_ptr); + if (!device_ptr) + return 1; + + const size_t bytes = static_cast(count_points(ex)) * sizeof(double); + cudaError_t err = cudaMemcpy(host_ptr, device_ptr, bytes, cudaMemcpyDeviceToHost); + if (err != cudaSuccess) + { + report_cuda_error("cudaMemcpy(D2H) buffered download", err); + return 1; + } + return 0; +} + int bssn_cuda_interp_points_batch(const int *ex, const double *X, const double *Y, const double *Z, const double *const *fields, diff --git a/AMSS_NCKU_source/bssn_cuda_ops.h b/AMSS_NCKU_source/bssn_cuda_ops.h index 4a11644..27237d9 100644 --- a/AMSS_NCKU_source/bssn_cuda_ops.h +++ b/AMSS_NCKU_source/bssn_cuda_ops.h @@ -19,9 +19,11 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT, const double SoA[3], int symmetry, int lev, - int rk_stage); + int rk_stage, + bool download_to_host = true); -int bssn_cuda_lowerbound(int *ex, double *chi, double tinny); +int bssn_cuda_lowerbound(int *ex, double *chi, double tinny, bool download_to_host = true); +int bssn_cuda_download_buffer(int *ex, double *host_ptr); int bssn_cuda_prolong3_pack(int wei, const double *llbc, const double *uubc, const int *extc, const double *func, diff --git a/AMSS_NCKU_source/bssn_cuda_step.C b/AMSS_NCKU_source/bssn_cuda_step.C index 6b25691..2b3d08e 100644 --- a/AMSS_NCKU_source/bssn_cuda_step.C +++ b/AMSS_NCKU_source/bssn_cuda_step.C @@ -83,7 +83,7 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN) cg->fgfs[varlr->data->sgfn], varl0->data->propspeed, varl0->data->SoA, - Symmetry, lev, rk_stage)) + Symmetry, lev, rk_stage, false)) { cerr << "GPU rk4/boundary failure: lev=" << lev << " rk_stage=" << rk_stage @@ -101,6 +101,43 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN) } }; + auto stage_download_var_list = + [&](Block *cg, MyList *var_list) { + while (var_list) + { + if (bssn_cuda_download_buffer(cg->shape, cg->fgfs[var_list->data->sgfn])) + { + cerr << "GPU stage download failure: lev=" << lev + << " var=" << var_list->data->name + << " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << "," + << cg->bbox[1] << ":" << cg->bbox[4] << "," + << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl; + ERROR = 1; + break; + } + var_list = var_list->next; + } + }; + + auto stage_upload_var_list = + [&](Block *cg, MyList *var_list) { + const int n = cg->shape[0] * cg->shape[1] * cg->shape[2]; + while (var_list) + { + if (bssn_gpu_stage_upload_buffer(cg->fgfs[var_list->data->sgfn], n)) + { + cerr << "GPU state upload failure: lev=" << lev + << " var=" << var_list->data->name + << " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << "," + << cg->bbox[1] << ":" << cg->bbox[4] << "," + << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl; + ERROR = 1; + break; + } + var_list = var_list->next; + } + }; + MyList *Pp = GH->PatL[lev]; while (Pp) { @@ -110,12 +147,13 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN) Block *cg = BP->data; if (myrank == cg->rank) { + stage_upload_var_list(cg, StateList); if (gpu_rhs(CALLED_BY_STEP, myrank, RHS_PARA_CALLED_FIRST_TIME)) ERROR = 1; run_stage_on_block(cg, Pp->data, StateList, StateList, SynchList_pre, RHSList, iter_count); - if (bssn_cuda_lowerbound(cg->shape, cg->fgfs[phi->sgfn], chitiny)) + if (bssn_cuda_lowerbound(cg->shape, cg->fgfs[phi->sgfn], chitiny, false)) { cerr << "GPU lowerbound failure: lev=" << lev << " rk_stage=" << iter_count @@ -125,6 +163,8 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN) << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl; ERROR = 1; } + if (!ERROR) + stage_download_var_list(cg, SynchList_pre); } if (BP == Pp->data->ble) break; @@ -194,12 +234,13 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN) Block *cg = BP->data; if (myrank == cg->rank) { + stage_upload_var_list(cg, SynchList_pre); if (gpu_rhs(CALLED_BY_STEP, myrank, RHS_PARA_CALLED_THEN)) ERROR = 1; run_stage_on_block(cg, Pp->data, StateList, SynchList_pre, SynchList_cor, RHSList, iter_count); - if (bssn_cuda_lowerbound(cg->shape, cg->fgfs[phi1->sgfn], chitiny)) + if (bssn_cuda_lowerbound(cg->shape, cg->fgfs[phi1->sgfn], chitiny, false)) { cerr << "GPU lowerbound failure: lev=" << lev << " rk_stage=" << iter_count @@ -209,6 +250,8 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN) << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl; ERROR = 1; } + if (!ERROR) + stage_download_var_list(cg, SynchList_cor); } if (BP == Pp->data->ble) diff --git a/AMSS_NCKU_source/bssn_gpu.cu b/AMSS_NCKU_source/bssn_gpu.cu index e522d48..bcdf57e 100644 --- a/AMSS_NCKU_source/bssn_gpu.cu +++ b/AMSS_NCKU_source/bssn_gpu.cu @@ -149,6 +149,16 @@ struct ExternalBufferRegistry int mapped_buffer_count = 0; }; +struct OwnedBufferRegistry +{ + static const int max_mapped_buffers = 256; + const double *host_buffers[max_mapped_buffers] = {nullptr}; + double *device_buffers[max_mapped_buffers] = {nullptr}; + size_t capacities[max_mapped_buffers] = {0}; + bool valid[max_mapped_buffers] = {false}; + int mapped_buffer_count = 0; +}; + GpuRhsCache &gpu_rhs_cache() { static GpuRhsCache cache; @@ -161,6 +171,12 @@ ExternalBufferRegistry &external_buffer_registry() return registry; } +OwnedBufferRegistry &owned_buffer_registry() +{ + static thread_local OwnedBufferRegistry registry; + return registry; +} + void reset_meta(Meta *meta) { memset(meta, 0, sizeof(Meta)); @@ -230,6 +246,114 @@ void map_external_buffer(ExternalBufferRegistry ®istry, const double *host_pt registry.mapped_buffer_count++; } +void invalidate_owned_buffer_map(OwnedBufferRegistry ®istry) +{ + for (int i = 0; i < registry.mapped_buffer_count; ++i) + registry.valid[i] = false; +} + +const double *find_owned_device_buffer(const OwnedBufferRegistry ®istry, const double *host_ptr) +{ + if (!host_ptr) + return nullptr; + + for (int i = 0; i < registry.mapped_buffer_count; ++i) + { + if (registry.valid[i] && registry.host_buffers[i] == host_ptr) + return registry.device_buffers[i]; + } + return nullptr; +} + +int find_owned_buffer_slot(OwnedBufferRegistry ®istry, const double *host_ptr) +{ + int reusable_slot = -1; + for (int i = 0; i < registry.mapped_buffer_count; ++i) + { + if (registry.host_buffers[i] == host_ptr) + return i; + if (!registry.valid[i] && reusable_slot < 0) + reusable_slot = i; + } + + if (reusable_slot >= 0) + { + registry.host_buffers[reusable_slot] = host_ptr; + return reusable_slot; + } + + if (registry.mapped_buffer_count >= OwnedBufferRegistry::max_mapped_buffers) + return -1; + + const int slot = registry.mapped_buffer_count++; + registry.host_buffers[slot] = host_ptr; + registry.device_buffers[slot] = nullptr; + registry.capacities[slot] = 0; + registry.valid[slot] = false; + return slot; +} + +bool ensure_owned_buffer_capacity(OwnedBufferRegistry ®istry, int slot, size_t bytes) +{ + if (slot < 0) + return false; + if (registry.device_buffers[slot] && registry.capacities[slot] >= bytes) + return true; + + if (registry.device_buffers[slot]) + { + cudaError_t free_err = cudaFree(registry.device_buffers[slot]); + if (free_err != cudaSuccess) + { + cerr << "cudaFree failed: " << cudaGetErrorString(free_err) << endl; + return false; + } + registry.device_buffers[slot] = nullptr; + registry.capacities[slot] = 0; + } + + cudaError_t err = cudaMalloc((void **)®istry.device_buffers[slot], bytes); + if (err != cudaSuccess) + { + cerr << "cudaMalloc failed: " << cudaGetErrorString(err) << endl; + return false; + } + + registry.capacities[slot] = bytes; + return true; +} + +bool prepare_owned_buffer(const double *host_ptr, size_t count, bool zero_fill) +{ + if (!host_ptr || count == 0) + return false; + + OwnedBufferRegistry ®istry = owned_buffer_registry(); + const int slot = find_owned_buffer_slot(registry, host_ptr); + if (slot < 0) + { + cerr << "owned CUDA buffer registry exhausted" << endl; + return false; + } + + const size_t bytes = count * sizeof(double); + if (!ensure_owned_buffer_capacity(registry, slot, bytes)) + return false; + + cudaError_t err = zero_fill + ? cudaMemset(registry.device_buffers[slot], 0, bytes) + : cudaMemcpy(registry.device_buffers[slot], host_ptr, bytes, cudaMemcpyHostToDevice); + if (err != cudaSuccess) + { + cerr << (zero_fill ? "cudaMemset" : "cudaMemcpy(H2D)") + << " failed: " << cudaGetErrorString(err) << endl; + return false; + } + + registry.valid[slot] = true; + return true; +} + bool ensure_device_buffer(double **ptr, size_t count) { if (*ptr) @@ -270,6 +394,17 @@ bool copy_buffers_to_device(const CopySpec *specs, size_t count) return true; } +bool copy_buffer_to_device(double *dst, const double *src, size_t count) +{ + cudaError_t err = cudaMemcpy(dst, src, count * sizeof(double), cudaMemcpyHostToDevice); + if (err != cudaSuccess) + { + cerr << "cudaMemcpy(H2D) failed: " << cudaGetErrorString(err) << endl; + return false; + } + return true; +} + const double *find_external_device_buffer(const ExternalBufferRegistry ®istry, const double *host_ptr) { if (!host_ptr) @@ -326,8 +461,25 @@ bool zero_buffers(const ZeroSpec *specs, size_t count) void cleanup_gpu_rhs_cache() { GpuRhsCache &cache = gpu_rhs_cache(); + OwnedBufferRegistry &owned = owned_buffer_registry(); if (!cache.allocated) + { + for (int i = 0; i < owned.mapped_buffer_count; ++i) + { + if (owned.device_buffers[i]) + { + cudaError_t free_err = cudaFree(owned.device_buffers[i]); + if (free_err != cudaSuccess) + cerr << "cudaFree failed: " << cudaGetErrorString(free_err) << endl; + } + owned.device_buffers[i] = nullptr; + owned.capacities[i] = 0; + owned.valid[i] = false; + owned.host_buffers[i] = nullptr; + } + owned.mapped_buffer_count = 0; return; + } if (cache.device >= 0) cudaSetDevice(cache.device); @@ -342,6 +494,21 @@ void cleanup_gpu_rhs_cache() cache.last_z = nullptr; reset_buffer_map(cache); reset_external_buffer_map(external_buffer_registry()); + + for (int i = 0; i < owned.mapped_buffer_count; ++i) + { + if (owned.device_buffers[i]) + { + cudaError_t free_err = cudaFree(owned.device_buffers[i]); + if (free_err != cudaSuccess) + cerr << "cudaFree failed: " << cudaGetErrorString(free_err) << endl; + } + owned.device_buffers[i] = nullptr; + owned.capacities[i] = 0; + owned.valid[i] = false; + owned.host_buffers[i] = nullptr; + } + owned.mapped_buffer_count = 0; } bool register_gpu_rhs_cleanup() @@ -634,6 +801,9 @@ const double *find_mapped_device_buffer(const GpuRhsCache &cache, const double * const double *bssn_gpu_find_device_buffer(const double *host_ptr) { const double *device_ptr = find_external_device_buffer(external_buffer_registry(), host_ptr); + if (device_ptr) + return device_ptr; + device_ptr = find_owned_device_buffer(owned_buffer_registry(), host_ptr); if (device_ptr) return device_ptr; return find_mapped_device_buffer(gpu_rhs_cache(), host_ptr); @@ -659,6 +829,7 @@ void bssn_gpu_clear_cached_device_buffers() { reset_external_buffer_map(external_buffer_registry()); reset_buffer_map(gpu_rhs_cache()); + invalidate_owned_buffer_map(owned_buffer_registry()); } void bssn_gpu_register_device_buffer(const double *host_ptr, const double *device_ptr) @@ -666,6 +837,16 @@ void bssn_gpu_register_device_buffer(const double *host_ptr, const double *devic map_external_buffer(external_buffer_registry(), host_ptr, device_ptr); } +int bssn_gpu_stage_upload_buffer(const double *host_ptr, int count) +{ + return prepare_owned_buffer(host_ptr, static_cast(count), false) ? 0 : 1; +} + +int bssn_gpu_stage_zero_buffer(const double *host_ptr, int count) +{ + return prepare_owned_buffer(host_ptr, static_cast(count), true) ? 0 : 1; +} + __global__ void test_const_address(double * testd){ int _t = blockIdx.x*blockDim.x+threadIdx.x; if(_t == 0) @@ -3009,7 +3190,7 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y, #endif #endif -//2 ----------------Copy Data to Device------------------ + //2 ----------------Copy Data to Device------------------ if (cache.last_x != X || cache.last_y != Y || cache.last_z != Z) { const CopySpec coord_copies[] = { @@ -3025,35 +3206,81 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y, } reset_buffer_map(cache); + Meta saved_meta = *meta; - const CopySpec state_copies[] = { - {Mh_ chi, chi, static_cast(matrix_size)}, - {Mh_ dxx, dxx, static_cast(matrix_size)}, - {Mh_ dyy, dyy, static_cast(matrix_size)}, - {Mh_ dzz, dzz, static_cast(matrix_size)}, - {Mh_ trK, trK, static_cast(matrix_size)}, - {Mh_ gxy, gxy, static_cast(matrix_size)}, - {Mh_ gxz, gxz, static_cast(matrix_size)}, - {Mh_ gyz, gyz, static_cast(matrix_size)}, - {Mh_ Axx, Axx, static_cast(matrix_size)}, - {Mh_ Axy, Axy, static_cast(matrix_size)}, - {Mh_ Axz, Axz, static_cast(matrix_size)}, - {Mh_ Ayz, Ayz, static_cast(matrix_size)}, - {Mh_ Ayy, Ayy, static_cast(matrix_size)}, - {Mh_ Azz, Azz, static_cast(matrix_size)}, - {Mh_ Gamx, Gamx, static_cast(matrix_size)}, - {Mh_ Gamy, Gamy, static_cast(matrix_size)}, - {Mh_ Gamz, Gamz, static_cast(matrix_size)}, - {Mh_ betax, betax, static_cast(matrix_size)}, - {Mh_ betay, betay, static_cast(matrix_size)}, - {Mh_ betaz, betaz, static_cast(matrix_size)}, - {Mh_ Lap, Lap, static_cast(matrix_size)}, - {Mh_ dtSfx, dtSfx, static_cast(matrix_size)}, - {Mh_ dtSfy, dtSfy, static_cast(matrix_size)}, - {Mh_ dtSfz, dtSfz, static_cast(matrix_size)}, + auto bind_or_copy_input = [&](double *&slot, const double *host_ptr, size_t count) -> bool + { + const double *mapped = bssn_gpu_find_device_buffer(host_ptr); + if (mapped) + { + slot = const_cast(mapped); + return true; + } + return copy_buffer_to_device(slot, host_ptr, count); }; - if (!copy_buffers_to_device_preferring_device(state_copies, sizeof(state_copies) / sizeof(state_copies[0]))) + + auto bind_or_keep_output = [&](double *&slot, const double *host_ptr) + { + const double *mapped = bssn_gpu_find_device_buffer(host_ptr); + if (mapped) + slot = const_cast(mapped); + }; + + if (!(bind_or_copy_input(meta->chi, chi, static_cast(matrix_size)) && + bind_or_copy_input(meta->dxx, dxx, static_cast(matrix_size)) && + bind_or_copy_input(meta->dyy, dyy, static_cast(matrix_size)) && + bind_or_copy_input(meta->dzz, dzz, static_cast(matrix_size)) && + bind_or_copy_input(meta->trK, trK, static_cast(matrix_size)) && + bind_or_copy_input(meta->gxy, gxy, static_cast(matrix_size)) && + bind_or_copy_input(meta->gxz, gxz, static_cast(matrix_size)) && + bind_or_copy_input(meta->gyz, gyz, static_cast(matrix_size)) && + bind_or_copy_input(meta->Axx, Axx, static_cast(matrix_size)) && + bind_or_copy_input(meta->Axy, Axy, static_cast(matrix_size)) && + bind_or_copy_input(meta->Axz, Axz, static_cast(matrix_size)) && + bind_or_copy_input(meta->Ayz, Ayz, static_cast(matrix_size)) && + bind_or_copy_input(meta->Ayy, Ayy, static_cast(matrix_size)) && + bind_or_copy_input(meta->Azz, Azz, static_cast(matrix_size)) && + bind_or_copy_input(meta->Gamx, Gamx, static_cast(matrix_size)) && + bind_or_copy_input(meta->Gamy, Gamy, static_cast(matrix_size)) && + bind_or_copy_input(meta->Gamz, Gamz, static_cast(matrix_size)) && + bind_or_copy_input(meta->betax, betax, static_cast(matrix_size)) && + bind_or_copy_input(meta->betay, betay, static_cast(matrix_size)) && + bind_or_copy_input(meta->betaz, betaz, static_cast(matrix_size)) && + bind_or_copy_input(meta->Lap, Lap, static_cast(matrix_size)) && + bind_or_copy_input(meta->dtSfx, dtSfx, static_cast(matrix_size)) && + bind_or_copy_input(meta->dtSfy, dtSfy, static_cast(matrix_size)) && + bind_or_copy_input(meta->dtSfz, dtSfz, static_cast(matrix_size)))) + { + *meta = saved_meta; return 1; + } + + bind_or_keep_output(meta->chi_rhs, chi_rhs); + bind_or_keep_output(meta->trK_rhs, trK_rhs); + bind_or_keep_output(meta->gxx_rhs, gxx_rhs); + bind_or_keep_output(meta->gxy_rhs, gxy_rhs); + bind_or_keep_output(meta->gxz_rhs, gxz_rhs); + bind_or_keep_output(meta->gyy_rhs, gyy_rhs); + bind_or_keep_output(meta->gyz_rhs, gyz_rhs); + bind_or_keep_output(meta->gzz_rhs, gzz_rhs); + bind_or_keep_output(meta->Axx_rhs, Axx_rhs); + bind_or_keep_output(meta->Axy_rhs, Axy_rhs); + bind_or_keep_output(meta->Axz_rhs, Axz_rhs); + bind_or_keep_output(meta->Ayy_rhs, Ayy_rhs); + bind_or_keep_output(meta->Ayz_rhs, Ayz_rhs); + bind_or_keep_output(meta->Azz_rhs, Azz_rhs); + bind_or_keep_output(meta->Gamx_rhs, Gamx_rhs); + bind_or_keep_output(meta->Gamy_rhs, Gamy_rhs); + bind_or_keep_output(meta->Gamz_rhs, Gamz_rhs); + bind_or_keep_output(meta->Lap_rhs, Lap_rhs); + bind_or_keep_output(meta->betax_rhs, betax_rhs); + bind_or_keep_output(meta->betay_rhs, betay_rhs); + bind_or_keep_output(meta->betaz_rhs, betaz_rhs); + bind_or_keep_output(meta->dtSfx_rhs, dtSfx_rhs); + bind_or_keep_output(meta->dtSfy_rhs, dtSfy_rhs); + bind_or_keep_output(meta->dtSfz_rhs, dtSfz_rhs); + + cudaMemcpyToSymbol(metac, meta, sizeof(Meta)); const ZeroSpec zero_specs[] = { {Mh_ rho, static_cast(matrix_size)}, @@ -3068,7 +3295,10 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y, {Mh_ Sz, static_cast(matrix_size)}, }; if (!zero_buffers(zero_specs, sizeof(zero_specs) / sizeof(zero_specs[0]))) + { + *meta = saved_meta; return 1; + } map_buffer(cache, chi, Mh_ chi); map_buffer(cache, trK, Mh_ trK); map_buffer(cache, dxx, Mh_ dxx); @@ -3454,12 +3684,13 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y, //----------------------------------------------------- //-------------------FOR GPU TEST---------------------- //----------------------------------------------------- -#ifdef TIMING - cudaThreadSynchronize(); - gettimeofday(&tv2, NULL); - cout<<"MPI rank is: "<