Reduce staged GPU host-device copies

This commit is contained in:
2026-04-09 16:44:08 +08:00
parent 42e851d19a
commit 5b00d49070
5 changed files with 349 additions and 49 deletions

View File

@@ -728,7 +728,8 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
const double SoA[3],
int symmetry,
int lev,
int rk_stage)
int rk_stage,
bool download_to_host)
{
struct Rk4VarCache
{
@@ -790,7 +791,7 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
ok = ok &&
(!refresh_state0 || copy_to_device_preferring_device(cache.state0, state0, bytes)) &&
(!need_boundary_input || copy_to_device(cache.boundary, boundary_src, bytes)) &&
(!need_boundary_input || copy_to_device_preferring_device(cache.boundary, boundary_src, bytes)) &&
(!refresh_rhs || copy_to_device_preferring_device(cache.rhs, rhs_accum, bytes));
if (ok && need_stage_input)
@@ -885,16 +886,18 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
if (ok)
{
bssn_gpu_register_device_buffer(stage_data, stage_ptr);
cudaError_t err = cudaMemcpy(stage_data, stage_ptr, bytes, cudaMemcpyDeviceToHost);
if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) stage_data", err);
ok = err == cudaSuccess;
if (download_to_host)
{
cudaError_t err = cudaMemcpy(stage_data, stage_ptr, bytes, cudaMemcpyDeviceToHost);
if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) stage_data", err);
ok = err == cudaSuccess;
}
}
return ok ? 0 : 1;
}
int bssn_cuda_lowerbound(int *ex, double *chi, double tinny)
int bssn_cuda_lowerbound(int *ex, double *chi, double tinny, bool download_to_host)
{
static thread_local CachedBuffer d_chi;
@@ -926,13 +929,32 @@ int bssn_cuda_lowerbound(int *ex, double *chi, double tinny)
if (ok)
{
bssn_gpu_register_device_buffer(chi, device_chi);
cudaError_t err = cudaMemcpy(chi, device_chi, bytes, cudaMemcpyDeviceToHost);
if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) chi", err);
ok = err == cudaSuccess;
if (download_to_host)
{
cudaError_t err = cudaMemcpy(chi, device_chi, bytes, cudaMemcpyDeviceToHost);
if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) chi", err);
ok = err == cudaSuccess;
}
}
return ok ? 0 : 1;
}
int bssn_cuda_download_buffer(int *ex, double *host_ptr)
{
const double *device_ptr = bssn_gpu_find_device_buffer(host_ptr);
if (!device_ptr)
return 1;
const size_t bytes = static_cast<size_t>(count_points(ex)) * sizeof(double);
cudaError_t err = cudaMemcpy(host_ptr, device_ptr, bytes, cudaMemcpyDeviceToHost);
if (err != cudaSuccess)
{
report_cuda_error("cudaMemcpy(D2H) buffered download", err);
return 1;
}
return 0;
}
int bssn_cuda_interp_points_batch(const int *ex,
const double *X, const double *Y, const double *Z,
const double *const *fields,

View File

@@ -19,9 +19,11 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
const double SoA[3],
int symmetry,
int lev,
int rk_stage);
int rk_stage,
bool download_to_host = true);
int bssn_cuda_lowerbound(int *ex, double *chi, double tinny);
int bssn_cuda_lowerbound(int *ex, double *chi, double tinny, bool download_to_host = true);
int bssn_cuda_download_buffer(int *ex, double *host_ptr);
int bssn_cuda_prolong3_pack(int wei,
const double *llbc, const double *uubc, const int *extc, const double *func,

View File

@@ -83,7 +83,7 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN)
cg->fgfs[varlr->data->sgfn],
varl0->data->propspeed,
varl0->data->SoA,
Symmetry, lev, rk_stage))
Symmetry, lev, rk_stage, false))
{
cerr << "GPU rk4/boundary failure: lev=" << lev
<< " rk_stage=" << rk_stage
@@ -101,6 +101,43 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN)
}
};
auto stage_download_var_list =
[&](Block *cg, MyList<var> *var_list) {
while (var_list)
{
if (bssn_cuda_download_buffer(cg->shape, cg->fgfs[var_list->data->sgfn]))
{
cerr << "GPU stage download failure: lev=" << lev
<< " var=" << var_list->data->name
<< " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << ","
<< cg->bbox[1] << ":" << cg->bbox[4] << ","
<< cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
ERROR = 1;
break;
}
var_list = var_list->next;
}
};
auto stage_upload_var_list =
[&](Block *cg, MyList<var> *var_list) {
const int n = cg->shape[0] * cg->shape[1] * cg->shape[2];
while (var_list)
{
if (bssn_gpu_stage_upload_buffer(cg->fgfs[var_list->data->sgfn], n))
{
cerr << "GPU state upload failure: lev=" << lev
<< " var=" << var_list->data->name
<< " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << ","
<< cg->bbox[1] << ":" << cg->bbox[4] << ","
<< cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
ERROR = 1;
break;
}
var_list = var_list->next;
}
};
MyList<Patch> *Pp = GH->PatL[lev];
while (Pp)
{
@@ -110,12 +147,13 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN)
Block *cg = BP->data;
if (myrank == cg->rank)
{
stage_upload_var_list(cg, StateList);
if (gpu_rhs(CALLED_BY_STEP, myrank, RHS_PARA_CALLED_FIRST_TIME))
ERROR = 1;
run_stage_on_block(cg, Pp->data, StateList, StateList, SynchList_pre, RHSList, iter_count);
if (bssn_cuda_lowerbound(cg->shape, cg->fgfs[phi->sgfn], chitiny))
if (bssn_cuda_lowerbound(cg->shape, cg->fgfs[phi->sgfn], chitiny, false))
{
cerr << "GPU lowerbound failure: lev=" << lev
<< " rk_stage=" << iter_count
@@ -125,6 +163,8 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN)
<< cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
ERROR = 1;
}
if (!ERROR)
stage_download_var_list(cg, SynchList_pre);
}
if (BP == Pp->data->ble)
break;
@@ -194,12 +234,13 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN)
Block *cg = BP->data;
if (myrank == cg->rank)
{
stage_upload_var_list(cg, SynchList_pre);
if (gpu_rhs(CALLED_BY_STEP, myrank, RHS_PARA_CALLED_THEN))
ERROR = 1;
run_stage_on_block(cg, Pp->data, StateList, SynchList_pre, SynchList_cor, RHSList, iter_count);
if (bssn_cuda_lowerbound(cg->shape, cg->fgfs[phi1->sgfn], chitiny))
if (bssn_cuda_lowerbound(cg->shape, cg->fgfs[phi1->sgfn], chitiny, false))
{
cerr << "GPU lowerbound failure: lev=" << lev
<< " rk_stage=" << iter_count
@@ -209,6 +250,8 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN)
<< cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
ERROR = 1;
}
if (!ERROR)
stage_download_var_list(cg, SynchList_cor);
}
if (BP == Pp->data->ble)

View File

@@ -149,6 +149,16 @@ struct ExternalBufferRegistry
int mapped_buffer_count = 0;
};
struct OwnedBufferRegistry
{
static const int max_mapped_buffers = 256;
const double *host_buffers[max_mapped_buffers] = {nullptr};
double *device_buffers[max_mapped_buffers] = {nullptr};
size_t capacities[max_mapped_buffers] = {0};
bool valid[max_mapped_buffers] = {false};
int mapped_buffer_count = 0;
};
GpuRhsCache &gpu_rhs_cache()
{
static GpuRhsCache cache;
@@ -161,6 +171,12 @@ ExternalBufferRegistry &external_buffer_registry()
return registry;
}
OwnedBufferRegistry &owned_buffer_registry()
{
static thread_local OwnedBufferRegistry registry;
return registry;
}
void reset_meta(Meta *meta)
{
memset(meta, 0, sizeof(Meta));
@@ -230,6 +246,114 @@ void map_external_buffer(ExternalBufferRegistry &registry, const double *host_pt
registry.mapped_buffer_count++;
}
void invalidate_owned_buffer_map(OwnedBufferRegistry &registry)
{
for (int i = 0; i < registry.mapped_buffer_count; ++i)
registry.valid[i] = false;
}
const double *find_owned_device_buffer(const OwnedBufferRegistry &registry, const double *host_ptr)
{
if (!host_ptr)
return nullptr;
for (int i = 0; i < registry.mapped_buffer_count; ++i)
{
if (registry.valid[i] && registry.host_buffers[i] == host_ptr)
return registry.device_buffers[i];
}
return nullptr;
}
int find_owned_buffer_slot(OwnedBufferRegistry &registry, const double *host_ptr)
{
int reusable_slot = -1;
for (int i = 0; i < registry.mapped_buffer_count; ++i)
{
if (registry.host_buffers[i] == host_ptr)
return i;
if (!registry.valid[i] && reusable_slot < 0)
reusable_slot = i;
}
if (reusable_slot >= 0)
{
registry.host_buffers[reusable_slot] = host_ptr;
return reusable_slot;
}
if (registry.mapped_buffer_count >= OwnedBufferRegistry::max_mapped_buffers)
return -1;
const int slot = registry.mapped_buffer_count++;
registry.host_buffers[slot] = host_ptr;
registry.device_buffers[slot] = nullptr;
registry.capacities[slot] = 0;
registry.valid[slot] = false;
return slot;
}
bool ensure_owned_buffer_capacity(OwnedBufferRegistry &registry, int slot, size_t bytes)
{
if (slot < 0)
return false;
if (registry.device_buffers[slot] && registry.capacities[slot] >= bytes)
return true;
if (registry.device_buffers[slot])
{
cudaError_t free_err = cudaFree(registry.device_buffers[slot]);
if (free_err != cudaSuccess)
{
cerr << "cudaFree failed: " << cudaGetErrorString(free_err) << endl;
return false;
}
registry.device_buffers[slot] = nullptr;
registry.capacities[slot] = 0;
}
cudaError_t err = cudaMalloc((void **)&registry.device_buffers[slot], bytes);
if (err != cudaSuccess)
{
cerr << "cudaMalloc failed: " << cudaGetErrorString(err) << endl;
return false;
}
registry.capacities[slot] = bytes;
return true;
}
bool prepare_owned_buffer(const double *host_ptr, size_t count, bool zero_fill)
{
if (!host_ptr || count == 0)
return false;
OwnedBufferRegistry &registry = owned_buffer_registry();
const int slot = find_owned_buffer_slot(registry, host_ptr);
if (slot < 0)
{
cerr << "owned CUDA buffer registry exhausted" << endl;
return false;
}
const size_t bytes = count * sizeof(double);
if (!ensure_owned_buffer_capacity(registry, slot, bytes))
return false;
cudaError_t err = zero_fill
? cudaMemset(registry.device_buffers[slot], 0, bytes)
: cudaMemcpy(registry.device_buffers[slot], host_ptr, bytes, cudaMemcpyHostToDevice);
if (err != cudaSuccess)
{
cerr << (zero_fill ? "cudaMemset" : "cudaMemcpy(H2D)")
<< " failed: " << cudaGetErrorString(err) << endl;
return false;
}
registry.valid[slot] = true;
return true;
}
bool ensure_device_buffer(double **ptr, size_t count)
{
if (*ptr)
@@ -270,6 +394,17 @@ bool copy_buffers_to_device(const CopySpec *specs, size_t count)
return true;
}
bool copy_buffer_to_device(double *dst, const double *src, size_t count)
{
cudaError_t err = cudaMemcpy(dst, src, count * sizeof(double), cudaMemcpyHostToDevice);
if (err != cudaSuccess)
{
cerr << "cudaMemcpy(H2D) failed: " << cudaGetErrorString(err) << endl;
return false;
}
return true;
}
const double *find_external_device_buffer(const ExternalBufferRegistry &registry, const double *host_ptr)
{
if (!host_ptr)
@@ -326,8 +461,25 @@ bool zero_buffers(const ZeroSpec *specs, size_t count)
void cleanup_gpu_rhs_cache()
{
GpuRhsCache &cache = gpu_rhs_cache();
OwnedBufferRegistry &owned = owned_buffer_registry();
if (!cache.allocated)
{
for (int i = 0; i < owned.mapped_buffer_count; ++i)
{
if (owned.device_buffers[i])
{
cudaError_t free_err = cudaFree(owned.device_buffers[i]);
if (free_err != cudaSuccess)
cerr << "cudaFree failed: " << cudaGetErrorString(free_err) << endl;
}
owned.device_buffers[i] = nullptr;
owned.capacities[i] = 0;
owned.valid[i] = false;
owned.host_buffers[i] = nullptr;
}
owned.mapped_buffer_count = 0;
return;
}
if (cache.device >= 0)
cudaSetDevice(cache.device);
@@ -342,6 +494,21 @@ void cleanup_gpu_rhs_cache()
cache.last_z = nullptr;
reset_buffer_map(cache);
reset_external_buffer_map(external_buffer_registry());
for (int i = 0; i < owned.mapped_buffer_count; ++i)
{
if (owned.device_buffers[i])
{
cudaError_t free_err = cudaFree(owned.device_buffers[i]);
if (free_err != cudaSuccess)
cerr << "cudaFree failed: " << cudaGetErrorString(free_err) << endl;
}
owned.device_buffers[i] = nullptr;
owned.capacities[i] = 0;
owned.valid[i] = false;
owned.host_buffers[i] = nullptr;
}
owned.mapped_buffer_count = 0;
}
bool register_gpu_rhs_cleanup()
@@ -634,6 +801,9 @@ const double *find_mapped_device_buffer(const GpuRhsCache &cache, const double *
const double *bssn_gpu_find_device_buffer(const double *host_ptr)
{
const double *device_ptr = find_external_device_buffer(external_buffer_registry(), host_ptr);
if (device_ptr)
return device_ptr;
device_ptr = find_owned_device_buffer(owned_buffer_registry(), host_ptr);
if (device_ptr)
return device_ptr;
return find_mapped_device_buffer(gpu_rhs_cache(), host_ptr);
@@ -659,6 +829,7 @@ void bssn_gpu_clear_cached_device_buffers()
{
reset_external_buffer_map(external_buffer_registry());
reset_buffer_map(gpu_rhs_cache());
invalidate_owned_buffer_map(owned_buffer_registry());
}
void bssn_gpu_register_device_buffer(const double *host_ptr, const double *device_ptr)
@@ -666,6 +837,16 @@ void bssn_gpu_register_device_buffer(const double *host_ptr, const double *devic
map_external_buffer(external_buffer_registry(), host_ptr, device_ptr);
}
int bssn_gpu_stage_upload_buffer(const double *host_ptr, int count)
{
return prepare_owned_buffer(host_ptr, static_cast<size_t>(count), false) ? 0 : 1;
}
int bssn_gpu_stage_zero_buffer(const double *host_ptr, int count)
{
return prepare_owned_buffer(host_ptr, static_cast<size_t>(count), true) ? 0 : 1;
}
__global__ void test_const_address(double * testd){
int _t = blockIdx.x*blockDim.x+threadIdx.x;
if(_t == 0)
@@ -3009,7 +3190,7 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
#endif
#endif
//2 ----------------Copy Data to Device------------------
//2 ----------------Copy Data to Device------------------
if (cache.last_x != X || cache.last_y != Y || cache.last_z != Z)
{
const CopySpec coord_copies[] = {
@@ -3025,35 +3206,81 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
}
reset_buffer_map(cache);
Meta saved_meta = *meta;
const CopySpec state_copies[] = {
{Mh_ chi, chi, static_cast<size_t>(matrix_size)},
{Mh_ dxx, dxx, static_cast<size_t>(matrix_size)},
{Mh_ dyy, dyy, static_cast<size_t>(matrix_size)},
{Mh_ dzz, dzz, static_cast<size_t>(matrix_size)},
{Mh_ trK, trK, static_cast<size_t>(matrix_size)},
{Mh_ gxy, gxy, static_cast<size_t>(matrix_size)},
{Mh_ gxz, gxz, static_cast<size_t>(matrix_size)},
{Mh_ gyz, gyz, static_cast<size_t>(matrix_size)},
{Mh_ Axx, Axx, static_cast<size_t>(matrix_size)},
{Mh_ Axy, Axy, static_cast<size_t>(matrix_size)},
{Mh_ Axz, Axz, static_cast<size_t>(matrix_size)},
{Mh_ Ayz, Ayz, static_cast<size_t>(matrix_size)},
{Mh_ Ayy, Ayy, static_cast<size_t>(matrix_size)},
{Mh_ Azz, Azz, static_cast<size_t>(matrix_size)},
{Mh_ Gamx, Gamx, static_cast<size_t>(matrix_size)},
{Mh_ Gamy, Gamy, static_cast<size_t>(matrix_size)},
{Mh_ Gamz, Gamz, static_cast<size_t>(matrix_size)},
{Mh_ betax, betax, static_cast<size_t>(matrix_size)},
{Mh_ betay, betay, static_cast<size_t>(matrix_size)},
{Mh_ betaz, betaz, static_cast<size_t>(matrix_size)},
{Mh_ Lap, Lap, static_cast<size_t>(matrix_size)},
{Mh_ dtSfx, dtSfx, static_cast<size_t>(matrix_size)},
{Mh_ dtSfy, dtSfy, static_cast<size_t>(matrix_size)},
{Mh_ dtSfz, dtSfz, static_cast<size_t>(matrix_size)},
auto bind_or_copy_input = [&](double *&slot, const double *host_ptr, size_t count) -> bool
{
const double *mapped = bssn_gpu_find_device_buffer(host_ptr);
if (mapped)
{
slot = const_cast<double *>(mapped);
return true;
}
return copy_buffer_to_device(slot, host_ptr, count);
};
if (!copy_buffers_to_device_preferring_device(state_copies, sizeof(state_copies) / sizeof(state_copies[0])))
auto bind_or_keep_output = [&](double *&slot, const double *host_ptr)
{
const double *mapped = bssn_gpu_find_device_buffer(host_ptr);
if (mapped)
slot = const_cast<double *>(mapped);
};
if (!(bind_or_copy_input(meta->chi, chi, static_cast<size_t>(matrix_size)) &&
bind_or_copy_input(meta->dxx, dxx, static_cast<size_t>(matrix_size)) &&
bind_or_copy_input(meta->dyy, dyy, static_cast<size_t>(matrix_size)) &&
bind_or_copy_input(meta->dzz, dzz, static_cast<size_t>(matrix_size)) &&
bind_or_copy_input(meta->trK, trK, static_cast<size_t>(matrix_size)) &&
bind_or_copy_input(meta->gxy, gxy, static_cast<size_t>(matrix_size)) &&
bind_or_copy_input(meta->gxz, gxz, static_cast<size_t>(matrix_size)) &&
bind_or_copy_input(meta->gyz, gyz, static_cast<size_t>(matrix_size)) &&
bind_or_copy_input(meta->Axx, Axx, static_cast<size_t>(matrix_size)) &&
bind_or_copy_input(meta->Axy, Axy, static_cast<size_t>(matrix_size)) &&
bind_or_copy_input(meta->Axz, Axz, static_cast<size_t>(matrix_size)) &&
bind_or_copy_input(meta->Ayz, Ayz, static_cast<size_t>(matrix_size)) &&
bind_or_copy_input(meta->Ayy, Ayy, static_cast<size_t>(matrix_size)) &&
bind_or_copy_input(meta->Azz, Azz, static_cast<size_t>(matrix_size)) &&
bind_or_copy_input(meta->Gamx, Gamx, static_cast<size_t>(matrix_size)) &&
bind_or_copy_input(meta->Gamy, Gamy, static_cast<size_t>(matrix_size)) &&
bind_or_copy_input(meta->Gamz, Gamz, static_cast<size_t>(matrix_size)) &&
bind_or_copy_input(meta->betax, betax, static_cast<size_t>(matrix_size)) &&
bind_or_copy_input(meta->betay, betay, static_cast<size_t>(matrix_size)) &&
bind_or_copy_input(meta->betaz, betaz, static_cast<size_t>(matrix_size)) &&
bind_or_copy_input(meta->Lap, Lap, static_cast<size_t>(matrix_size)) &&
bind_or_copy_input(meta->dtSfx, dtSfx, static_cast<size_t>(matrix_size)) &&
bind_or_copy_input(meta->dtSfy, dtSfy, static_cast<size_t>(matrix_size)) &&
bind_or_copy_input(meta->dtSfz, dtSfz, static_cast<size_t>(matrix_size))))
{
*meta = saved_meta;
return 1;
}
bind_or_keep_output(meta->chi_rhs, chi_rhs);
bind_or_keep_output(meta->trK_rhs, trK_rhs);
bind_or_keep_output(meta->gxx_rhs, gxx_rhs);
bind_or_keep_output(meta->gxy_rhs, gxy_rhs);
bind_or_keep_output(meta->gxz_rhs, gxz_rhs);
bind_or_keep_output(meta->gyy_rhs, gyy_rhs);
bind_or_keep_output(meta->gyz_rhs, gyz_rhs);
bind_or_keep_output(meta->gzz_rhs, gzz_rhs);
bind_or_keep_output(meta->Axx_rhs, Axx_rhs);
bind_or_keep_output(meta->Axy_rhs, Axy_rhs);
bind_or_keep_output(meta->Axz_rhs, Axz_rhs);
bind_or_keep_output(meta->Ayy_rhs, Ayy_rhs);
bind_or_keep_output(meta->Ayz_rhs, Ayz_rhs);
bind_or_keep_output(meta->Azz_rhs, Azz_rhs);
bind_or_keep_output(meta->Gamx_rhs, Gamx_rhs);
bind_or_keep_output(meta->Gamy_rhs, Gamy_rhs);
bind_or_keep_output(meta->Gamz_rhs, Gamz_rhs);
bind_or_keep_output(meta->Lap_rhs, Lap_rhs);
bind_or_keep_output(meta->betax_rhs, betax_rhs);
bind_or_keep_output(meta->betay_rhs, betay_rhs);
bind_or_keep_output(meta->betaz_rhs, betaz_rhs);
bind_or_keep_output(meta->dtSfx_rhs, dtSfx_rhs);
bind_or_keep_output(meta->dtSfy_rhs, dtSfy_rhs);
bind_or_keep_output(meta->dtSfz_rhs, dtSfz_rhs);
cudaMemcpyToSymbol(metac, meta, sizeof(Meta));
const ZeroSpec zero_specs[] = {
{Mh_ rho, static_cast<size_t>(matrix_size)},
@@ -3068,7 +3295,10 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
{Mh_ Sz, static_cast<size_t>(matrix_size)},
};
if (!zero_buffers(zero_specs, sizeof(zero_specs) / sizeof(zero_specs[0])))
{
*meta = saved_meta;
return 1;
}
map_buffer(cache, chi, Mh_ chi);
map_buffer(cache, trK, Mh_ trK);
map_buffer(cache, dxx, Mh_ dxx);
@@ -3454,12 +3684,13 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
//-----------------------------------------------------
//-------------------FOR GPU TEST----------------------
//-----------------------------------------------------
#ifdef TIMING
cudaThreadSynchronize();
gettimeofday(&tv2, NULL);
cout<<"MPI rank is: "<<mpi_rank<<" GPU TIME is"<<TimeBetween(tv1, tv2)<<" (s)."<<endl;
#endif
#ifdef TIMING
cudaThreadSynchronize();
gettimeofday(&tv2, NULL);
cout<<"MPI rank is: "<<mpi_rank<<" GPU TIME is"<<TimeBetween(tv1, tv2)<<" (s)."<<endl;
#endif
*meta = saved_meta;
return 0;//TODO return
}

View File

@@ -69,6 +69,8 @@ int bssn_gpu_bind_process_device(int mpi_rank);
void bssn_gpu_clear_cached_device_buffers();
const double *bssn_gpu_find_device_buffer(const double *host_ptr);
void bssn_gpu_register_device_buffer(const double *host_ptr, const double *device_ptr);
int bssn_gpu_stage_upload_buffer(const double *host_ptr, int count);
int bssn_gpu_stage_zero_buffer(const double *host_ptr, int count);
/** Init GPU side data in GPUMeta. */
// void init_fluid_meta_gpu(GPUMeta *gpu_meta);