Reduce staged GPU host-device copies
This commit is contained in:
@@ -728,7 +728,8 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
|
||||
const double SoA[3],
|
||||
int symmetry,
|
||||
int lev,
|
||||
int rk_stage)
|
||||
int rk_stage,
|
||||
bool download_to_host)
|
||||
{
|
||||
struct Rk4VarCache
|
||||
{
|
||||
@@ -790,7 +791,7 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
|
||||
|
||||
ok = ok &&
|
||||
(!refresh_state0 || copy_to_device_preferring_device(cache.state0, state0, bytes)) &&
|
||||
(!need_boundary_input || copy_to_device(cache.boundary, boundary_src, bytes)) &&
|
||||
(!need_boundary_input || copy_to_device_preferring_device(cache.boundary, boundary_src, bytes)) &&
|
||||
(!refresh_rhs || copy_to_device_preferring_device(cache.rhs, rhs_accum, bytes));
|
||||
|
||||
if (ok && need_stage_input)
|
||||
@@ -885,16 +886,18 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
|
||||
if (ok)
|
||||
{
|
||||
bssn_gpu_register_device_buffer(stage_data, stage_ptr);
|
||||
|
||||
cudaError_t err = cudaMemcpy(stage_data, stage_ptr, bytes, cudaMemcpyDeviceToHost);
|
||||
if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) stage_data", err);
|
||||
ok = err == cudaSuccess;
|
||||
if (download_to_host)
|
||||
{
|
||||
cudaError_t err = cudaMemcpy(stage_data, stage_ptr, bytes, cudaMemcpyDeviceToHost);
|
||||
if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) stage_data", err);
|
||||
ok = err == cudaSuccess;
|
||||
}
|
||||
}
|
||||
|
||||
return ok ? 0 : 1;
|
||||
}
|
||||
|
||||
int bssn_cuda_lowerbound(int *ex, double *chi, double tinny)
|
||||
int bssn_cuda_lowerbound(int *ex, double *chi, double tinny, bool download_to_host)
|
||||
{
|
||||
static thread_local CachedBuffer d_chi;
|
||||
|
||||
@@ -926,13 +929,32 @@ int bssn_cuda_lowerbound(int *ex, double *chi, double tinny)
|
||||
if (ok)
|
||||
{
|
||||
bssn_gpu_register_device_buffer(chi, device_chi);
|
||||
cudaError_t err = cudaMemcpy(chi, device_chi, bytes, cudaMemcpyDeviceToHost);
|
||||
if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) chi", err);
|
||||
ok = err == cudaSuccess;
|
||||
if (download_to_host)
|
||||
{
|
||||
cudaError_t err = cudaMemcpy(chi, device_chi, bytes, cudaMemcpyDeviceToHost);
|
||||
if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) chi", err);
|
||||
ok = err == cudaSuccess;
|
||||
}
|
||||
}
|
||||
return ok ? 0 : 1;
|
||||
}
|
||||
|
||||
int bssn_cuda_download_buffer(int *ex, double *host_ptr)
|
||||
{
|
||||
const double *device_ptr = bssn_gpu_find_device_buffer(host_ptr);
|
||||
if (!device_ptr)
|
||||
return 1;
|
||||
|
||||
const size_t bytes = static_cast<size_t>(count_points(ex)) * sizeof(double);
|
||||
cudaError_t err = cudaMemcpy(host_ptr, device_ptr, bytes, cudaMemcpyDeviceToHost);
|
||||
if (err != cudaSuccess)
|
||||
{
|
||||
report_cuda_error("cudaMemcpy(D2H) buffered download", err);
|
||||
return 1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
int bssn_cuda_interp_points_batch(const int *ex,
|
||||
const double *X, const double *Y, const double *Z,
|
||||
const double *const *fields,
|
||||
|
||||
@@ -19,9 +19,11 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
|
||||
const double SoA[3],
|
||||
int symmetry,
|
||||
int lev,
|
||||
int rk_stage);
|
||||
int rk_stage,
|
||||
bool download_to_host = true);
|
||||
|
||||
int bssn_cuda_lowerbound(int *ex, double *chi, double tinny);
|
||||
int bssn_cuda_lowerbound(int *ex, double *chi, double tinny, bool download_to_host = true);
|
||||
int bssn_cuda_download_buffer(int *ex, double *host_ptr);
|
||||
|
||||
int bssn_cuda_prolong3_pack(int wei,
|
||||
const double *llbc, const double *uubc, const int *extc, const double *func,
|
||||
|
||||
@@ -83,7 +83,7 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN)
|
||||
cg->fgfs[varlr->data->sgfn],
|
||||
varl0->data->propspeed,
|
||||
varl0->data->SoA,
|
||||
Symmetry, lev, rk_stage))
|
||||
Symmetry, lev, rk_stage, false))
|
||||
{
|
||||
cerr << "GPU rk4/boundary failure: lev=" << lev
|
||||
<< " rk_stage=" << rk_stage
|
||||
@@ -101,6 +101,43 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN)
|
||||
}
|
||||
};
|
||||
|
||||
auto stage_download_var_list =
|
||||
[&](Block *cg, MyList<var> *var_list) {
|
||||
while (var_list)
|
||||
{
|
||||
if (bssn_cuda_download_buffer(cg->shape, cg->fgfs[var_list->data->sgfn]))
|
||||
{
|
||||
cerr << "GPU stage download failure: lev=" << lev
|
||||
<< " var=" << var_list->data->name
|
||||
<< " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << ","
|
||||
<< cg->bbox[1] << ":" << cg->bbox[4] << ","
|
||||
<< cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
|
||||
ERROR = 1;
|
||||
break;
|
||||
}
|
||||
var_list = var_list->next;
|
||||
}
|
||||
};
|
||||
|
||||
auto stage_upload_var_list =
|
||||
[&](Block *cg, MyList<var> *var_list) {
|
||||
const int n = cg->shape[0] * cg->shape[1] * cg->shape[2];
|
||||
while (var_list)
|
||||
{
|
||||
if (bssn_gpu_stage_upload_buffer(cg->fgfs[var_list->data->sgfn], n))
|
||||
{
|
||||
cerr << "GPU state upload failure: lev=" << lev
|
||||
<< " var=" << var_list->data->name
|
||||
<< " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << ","
|
||||
<< cg->bbox[1] << ":" << cg->bbox[4] << ","
|
||||
<< cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
|
||||
ERROR = 1;
|
||||
break;
|
||||
}
|
||||
var_list = var_list->next;
|
||||
}
|
||||
};
|
||||
|
||||
MyList<Patch> *Pp = GH->PatL[lev];
|
||||
while (Pp)
|
||||
{
|
||||
@@ -110,12 +147,13 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN)
|
||||
Block *cg = BP->data;
|
||||
if (myrank == cg->rank)
|
||||
{
|
||||
stage_upload_var_list(cg, StateList);
|
||||
if (gpu_rhs(CALLED_BY_STEP, myrank, RHS_PARA_CALLED_FIRST_TIME))
|
||||
ERROR = 1;
|
||||
|
||||
run_stage_on_block(cg, Pp->data, StateList, StateList, SynchList_pre, RHSList, iter_count);
|
||||
|
||||
if (bssn_cuda_lowerbound(cg->shape, cg->fgfs[phi->sgfn], chitiny))
|
||||
if (bssn_cuda_lowerbound(cg->shape, cg->fgfs[phi->sgfn], chitiny, false))
|
||||
{
|
||||
cerr << "GPU lowerbound failure: lev=" << lev
|
||||
<< " rk_stage=" << iter_count
|
||||
@@ -125,6 +163,8 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN)
|
||||
<< cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
|
||||
ERROR = 1;
|
||||
}
|
||||
if (!ERROR)
|
||||
stage_download_var_list(cg, SynchList_pre);
|
||||
}
|
||||
if (BP == Pp->data->ble)
|
||||
break;
|
||||
@@ -194,12 +234,13 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN)
|
||||
Block *cg = BP->data;
|
||||
if (myrank == cg->rank)
|
||||
{
|
||||
stage_upload_var_list(cg, SynchList_pre);
|
||||
if (gpu_rhs(CALLED_BY_STEP, myrank, RHS_PARA_CALLED_THEN))
|
||||
ERROR = 1;
|
||||
|
||||
run_stage_on_block(cg, Pp->data, StateList, SynchList_pre, SynchList_cor, RHSList, iter_count);
|
||||
|
||||
if (bssn_cuda_lowerbound(cg->shape, cg->fgfs[phi1->sgfn], chitiny))
|
||||
if (bssn_cuda_lowerbound(cg->shape, cg->fgfs[phi1->sgfn], chitiny, false))
|
||||
{
|
||||
cerr << "GPU lowerbound failure: lev=" << lev
|
||||
<< " rk_stage=" << iter_count
|
||||
@@ -209,6 +250,8 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN)
|
||||
<< cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
|
||||
ERROR = 1;
|
||||
}
|
||||
if (!ERROR)
|
||||
stage_download_var_list(cg, SynchList_cor);
|
||||
}
|
||||
|
||||
if (BP == Pp->data->ble)
|
||||
|
||||
@@ -149,6 +149,16 @@ struct ExternalBufferRegistry
|
||||
int mapped_buffer_count = 0;
|
||||
};
|
||||
|
||||
struct OwnedBufferRegistry
|
||||
{
|
||||
static const int max_mapped_buffers = 256;
|
||||
const double *host_buffers[max_mapped_buffers] = {nullptr};
|
||||
double *device_buffers[max_mapped_buffers] = {nullptr};
|
||||
size_t capacities[max_mapped_buffers] = {0};
|
||||
bool valid[max_mapped_buffers] = {false};
|
||||
int mapped_buffer_count = 0;
|
||||
};
|
||||
|
||||
GpuRhsCache &gpu_rhs_cache()
|
||||
{
|
||||
static GpuRhsCache cache;
|
||||
@@ -161,6 +171,12 @@ ExternalBufferRegistry &external_buffer_registry()
|
||||
return registry;
|
||||
}
|
||||
|
||||
OwnedBufferRegistry &owned_buffer_registry()
|
||||
{
|
||||
static thread_local OwnedBufferRegistry registry;
|
||||
return registry;
|
||||
}
|
||||
|
||||
void reset_meta(Meta *meta)
|
||||
{
|
||||
memset(meta, 0, sizeof(Meta));
|
||||
@@ -230,6 +246,114 @@ void map_external_buffer(ExternalBufferRegistry ®istry, const double *host_pt
|
||||
registry.mapped_buffer_count++;
|
||||
}
|
||||
|
||||
void invalidate_owned_buffer_map(OwnedBufferRegistry ®istry)
|
||||
{
|
||||
for (int i = 0; i < registry.mapped_buffer_count; ++i)
|
||||
registry.valid[i] = false;
|
||||
}
|
||||
|
||||
const double *find_owned_device_buffer(const OwnedBufferRegistry ®istry, const double *host_ptr)
|
||||
{
|
||||
if (!host_ptr)
|
||||
return nullptr;
|
||||
|
||||
for (int i = 0; i < registry.mapped_buffer_count; ++i)
|
||||
{
|
||||
if (registry.valid[i] && registry.host_buffers[i] == host_ptr)
|
||||
return registry.device_buffers[i];
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
int find_owned_buffer_slot(OwnedBufferRegistry ®istry, const double *host_ptr)
|
||||
{
|
||||
int reusable_slot = -1;
|
||||
for (int i = 0; i < registry.mapped_buffer_count; ++i)
|
||||
{
|
||||
if (registry.host_buffers[i] == host_ptr)
|
||||
return i;
|
||||
if (!registry.valid[i] && reusable_slot < 0)
|
||||
reusable_slot = i;
|
||||
}
|
||||
|
||||
if (reusable_slot >= 0)
|
||||
{
|
||||
registry.host_buffers[reusable_slot] = host_ptr;
|
||||
return reusable_slot;
|
||||
}
|
||||
|
||||
if (registry.mapped_buffer_count >= OwnedBufferRegistry::max_mapped_buffers)
|
||||
return -1;
|
||||
|
||||
const int slot = registry.mapped_buffer_count++;
|
||||
registry.host_buffers[slot] = host_ptr;
|
||||
registry.device_buffers[slot] = nullptr;
|
||||
registry.capacities[slot] = 0;
|
||||
registry.valid[slot] = false;
|
||||
return slot;
|
||||
}
|
||||
|
||||
bool ensure_owned_buffer_capacity(OwnedBufferRegistry ®istry, int slot, size_t bytes)
|
||||
{
|
||||
if (slot < 0)
|
||||
return false;
|
||||
if (registry.device_buffers[slot] && registry.capacities[slot] >= bytes)
|
||||
return true;
|
||||
|
||||
if (registry.device_buffers[slot])
|
||||
{
|
||||
cudaError_t free_err = cudaFree(registry.device_buffers[slot]);
|
||||
if (free_err != cudaSuccess)
|
||||
{
|
||||
cerr << "cudaFree failed: " << cudaGetErrorString(free_err) << endl;
|
||||
return false;
|
||||
}
|
||||
registry.device_buffers[slot] = nullptr;
|
||||
registry.capacities[slot] = 0;
|
||||
}
|
||||
|
||||
cudaError_t err = cudaMalloc((void **)®istry.device_buffers[slot], bytes);
|
||||
if (err != cudaSuccess)
|
||||
{
|
||||
cerr << "cudaMalloc failed: " << cudaGetErrorString(err) << endl;
|
||||
return false;
|
||||
}
|
||||
|
||||
registry.capacities[slot] = bytes;
|
||||
return true;
|
||||
}
|
||||
|
||||
bool prepare_owned_buffer(const double *host_ptr, size_t count, bool zero_fill)
|
||||
{
|
||||
if (!host_ptr || count == 0)
|
||||
return false;
|
||||
|
||||
OwnedBufferRegistry ®istry = owned_buffer_registry();
|
||||
const int slot = find_owned_buffer_slot(registry, host_ptr);
|
||||
if (slot < 0)
|
||||
{
|
||||
cerr << "owned CUDA buffer registry exhausted" << endl;
|
||||
return false;
|
||||
}
|
||||
|
||||
const size_t bytes = count * sizeof(double);
|
||||
if (!ensure_owned_buffer_capacity(registry, slot, bytes))
|
||||
return false;
|
||||
|
||||
cudaError_t err = zero_fill
|
||||
? cudaMemset(registry.device_buffers[slot], 0, bytes)
|
||||
: cudaMemcpy(registry.device_buffers[slot], host_ptr, bytes, cudaMemcpyHostToDevice);
|
||||
if (err != cudaSuccess)
|
||||
{
|
||||
cerr << (zero_fill ? "cudaMemset" : "cudaMemcpy(H2D)")
|
||||
<< " failed: " << cudaGetErrorString(err) << endl;
|
||||
return false;
|
||||
}
|
||||
|
||||
registry.valid[slot] = true;
|
||||
return true;
|
||||
}
|
||||
|
||||
bool ensure_device_buffer(double **ptr, size_t count)
|
||||
{
|
||||
if (*ptr)
|
||||
@@ -270,6 +394,17 @@ bool copy_buffers_to_device(const CopySpec *specs, size_t count)
|
||||
return true;
|
||||
}
|
||||
|
||||
bool copy_buffer_to_device(double *dst, const double *src, size_t count)
|
||||
{
|
||||
cudaError_t err = cudaMemcpy(dst, src, count * sizeof(double), cudaMemcpyHostToDevice);
|
||||
if (err != cudaSuccess)
|
||||
{
|
||||
cerr << "cudaMemcpy(H2D) failed: " << cudaGetErrorString(err) << endl;
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
const double *find_external_device_buffer(const ExternalBufferRegistry ®istry, const double *host_ptr)
|
||||
{
|
||||
if (!host_ptr)
|
||||
@@ -326,8 +461,25 @@ bool zero_buffers(const ZeroSpec *specs, size_t count)
|
||||
void cleanup_gpu_rhs_cache()
|
||||
{
|
||||
GpuRhsCache &cache = gpu_rhs_cache();
|
||||
OwnedBufferRegistry &owned = owned_buffer_registry();
|
||||
if (!cache.allocated)
|
||||
{
|
||||
for (int i = 0; i < owned.mapped_buffer_count; ++i)
|
||||
{
|
||||
if (owned.device_buffers[i])
|
||||
{
|
||||
cudaError_t free_err = cudaFree(owned.device_buffers[i]);
|
||||
if (free_err != cudaSuccess)
|
||||
cerr << "cudaFree failed: " << cudaGetErrorString(free_err) << endl;
|
||||
}
|
||||
owned.device_buffers[i] = nullptr;
|
||||
owned.capacities[i] = 0;
|
||||
owned.valid[i] = false;
|
||||
owned.host_buffers[i] = nullptr;
|
||||
}
|
||||
owned.mapped_buffer_count = 0;
|
||||
return;
|
||||
}
|
||||
|
||||
if (cache.device >= 0)
|
||||
cudaSetDevice(cache.device);
|
||||
@@ -342,6 +494,21 @@ void cleanup_gpu_rhs_cache()
|
||||
cache.last_z = nullptr;
|
||||
reset_buffer_map(cache);
|
||||
reset_external_buffer_map(external_buffer_registry());
|
||||
|
||||
for (int i = 0; i < owned.mapped_buffer_count; ++i)
|
||||
{
|
||||
if (owned.device_buffers[i])
|
||||
{
|
||||
cudaError_t free_err = cudaFree(owned.device_buffers[i]);
|
||||
if (free_err != cudaSuccess)
|
||||
cerr << "cudaFree failed: " << cudaGetErrorString(free_err) << endl;
|
||||
}
|
||||
owned.device_buffers[i] = nullptr;
|
||||
owned.capacities[i] = 0;
|
||||
owned.valid[i] = false;
|
||||
owned.host_buffers[i] = nullptr;
|
||||
}
|
||||
owned.mapped_buffer_count = 0;
|
||||
}
|
||||
|
||||
bool register_gpu_rhs_cleanup()
|
||||
@@ -634,6 +801,9 @@ const double *find_mapped_device_buffer(const GpuRhsCache &cache, const double *
|
||||
const double *bssn_gpu_find_device_buffer(const double *host_ptr)
|
||||
{
|
||||
const double *device_ptr = find_external_device_buffer(external_buffer_registry(), host_ptr);
|
||||
if (device_ptr)
|
||||
return device_ptr;
|
||||
device_ptr = find_owned_device_buffer(owned_buffer_registry(), host_ptr);
|
||||
if (device_ptr)
|
||||
return device_ptr;
|
||||
return find_mapped_device_buffer(gpu_rhs_cache(), host_ptr);
|
||||
@@ -659,6 +829,7 @@ void bssn_gpu_clear_cached_device_buffers()
|
||||
{
|
||||
reset_external_buffer_map(external_buffer_registry());
|
||||
reset_buffer_map(gpu_rhs_cache());
|
||||
invalidate_owned_buffer_map(owned_buffer_registry());
|
||||
}
|
||||
|
||||
void bssn_gpu_register_device_buffer(const double *host_ptr, const double *device_ptr)
|
||||
@@ -666,6 +837,16 @@ void bssn_gpu_register_device_buffer(const double *host_ptr, const double *devic
|
||||
map_external_buffer(external_buffer_registry(), host_ptr, device_ptr);
|
||||
}
|
||||
|
||||
int bssn_gpu_stage_upload_buffer(const double *host_ptr, int count)
|
||||
{
|
||||
return prepare_owned_buffer(host_ptr, static_cast<size_t>(count), false) ? 0 : 1;
|
||||
}
|
||||
|
||||
int bssn_gpu_stage_zero_buffer(const double *host_ptr, int count)
|
||||
{
|
||||
return prepare_owned_buffer(host_ptr, static_cast<size_t>(count), true) ? 0 : 1;
|
||||
}
|
||||
|
||||
__global__ void test_const_address(double * testd){
|
||||
int _t = blockIdx.x*blockDim.x+threadIdx.x;
|
||||
if(_t == 0)
|
||||
@@ -3009,7 +3190,7 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
|
||||
#endif
|
||||
|
||||
#endif
|
||||
//2 ----------------Copy Data to Device------------------
|
||||
//2 ----------------Copy Data to Device------------------
|
||||
if (cache.last_x != X || cache.last_y != Y || cache.last_z != Z)
|
||||
{
|
||||
const CopySpec coord_copies[] = {
|
||||
@@ -3025,35 +3206,81 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
|
||||
}
|
||||
|
||||
reset_buffer_map(cache);
|
||||
Meta saved_meta = *meta;
|
||||
|
||||
const CopySpec state_copies[] = {
|
||||
{Mh_ chi, chi, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ dxx, dxx, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ dyy, dyy, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ dzz, dzz, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ trK, trK, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ gxy, gxy, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ gxz, gxz, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ gyz, gyz, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ Axx, Axx, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ Axy, Axy, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ Axz, Axz, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ Ayz, Ayz, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ Ayy, Ayy, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ Azz, Azz, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ Gamx, Gamx, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ Gamy, Gamy, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ Gamz, Gamz, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ betax, betax, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ betay, betay, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ betaz, betaz, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ Lap, Lap, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ dtSfx, dtSfx, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ dtSfy, dtSfy, static_cast<size_t>(matrix_size)},
|
||||
{Mh_ dtSfz, dtSfz, static_cast<size_t>(matrix_size)},
|
||||
auto bind_or_copy_input = [&](double *&slot, const double *host_ptr, size_t count) -> bool
|
||||
{
|
||||
const double *mapped = bssn_gpu_find_device_buffer(host_ptr);
|
||||
if (mapped)
|
||||
{
|
||||
slot = const_cast<double *>(mapped);
|
||||
return true;
|
||||
}
|
||||
return copy_buffer_to_device(slot, host_ptr, count);
|
||||
};
|
||||
if (!copy_buffers_to_device_preferring_device(state_copies, sizeof(state_copies) / sizeof(state_copies[0])))
|
||||
|
||||
auto bind_or_keep_output = [&](double *&slot, const double *host_ptr)
|
||||
{
|
||||
const double *mapped = bssn_gpu_find_device_buffer(host_ptr);
|
||||
if (mapped)
|
||||
slot = const_cast<double *>(mapped);
|
||||
};
|
||||
|
||||
if (!(bind_or_copy_input(meta->chi, chi, static_cast<size_t>(matrix_size)) &&
|
||||
bind_or_copy_input(meta->dxx, dxx, static_cast<size_t>(matrix_size)) &&
|
||||
bind_or_copy_input(meta->dyy, dyy, static_cast<size_t>(matrix_size)) &&
|
||||
bind_or_copy_input(meta->dzz, dzz, static_cast<size_t>(matrix_size)) &&
|
||||
bind_or_copy_input(meta->trK, trK, static_cast<size_t>(matrix_size)) &&
|
||||
bind_or_copy_input(meta->gxy, gxy, static_cast<size_t>(matrix_size)) &&
|
||||
bind_or_copy_input(meta->gxz, gxz, static_cast<size_t>(matrix_size)) &&
|
||||
bind_or_copy_input(meta->gyz, gyz, static_cast<size_t>(matrix_size)) &&
|
||||
bind_or_copy_input(meta->Axx, Axx, static_cast<size_t>(matrix_size)) &&
|
||||
bind_or_copy_input(meta->Axy, Axy, static_cast<size_t>(matrix_size)) &&
|
||||
bind_or_copy_input(meta->Axz, Axz, static_cast<size_t>(matrix_size)) &&
|
||||
bind_or_copy_input(meta->Ayz, Ayz, static_cast<size_t>(matrix_size)) &&
|
||||
bind_or_copy_input(meta->Ayy, Ayy, static_cast<size_t>(matrix_size)) &&
|
||||
bind_or_copy_input(meta->Azz, Azz, static_cast<size_t>(matrix_size)) &&
|
||||
bind_or_copy_input(meta->Gamx, Gamx, static_cast<size_t>(matrix_size)) &&
|
||||
bind_or_copy_input(meta->Gamy, Gamy, static_cast<size_t>(matrix_size)) &&
|
||||
bind_or_copy_input(meta->Gamz, Gamz, static_cast<size_t>(matrix_size)) &&
|
||||
bind_or_copy_input(meta->betax, betax, static_cast<size_t>(matrix_size)) &&
|
||||
bind_or_copy_input(meta->betay, betay, static_cast<size_t>(matrix_size)) &&
|
||||
bind_or_copy_input(meta->betaz, betaz, static_cast<size_t>(matrix_size)) &&
|
||||
bind_or_copy_input(meta->Lap, Lap, static_cast<size_t>(matrix_size)) &&
|
||||
bind_or_copy_input(meta->dtSfx, dtSfx, static_cast<size_t>(matrix_size)) &&
|
||||
bind_or_copy_input(meta->dtSfy, dtSfy, static_cast<size_t>(matrix_size)) &&
|
||||
bind_or_copy_input(meta->dtSfz, dtSfz, static_cast<size_t>(matrix_size))))
|
||||
{
|
||||
*meta = saved_meta;
|
||||
return 1;
|
||||
}
|
||||
|
||||
bind_or_keep_output(meta->chi_rhs, chi_rhs);
|
||||
bind_or_keep_output(meta->trK_rhs, trK_rhs);
|
||||
bind_or_keep_output(meta->gxx_rhs, gxx_rhs);
|
||||
bind_or_keep_output(meta->gxy_rhs, gxy_rhs);
|
||||
bind_or_keep_output(meta->gxz_rhs, gxz_rhs);
|
||||
bind_or_keep_output(meta->gyy_rhs, gyy_rhs);
|
||||
bind_or_keep_output(meta->gyz_rhs, gyz_rhs);
|
||||
bind_or_keep_output(meta->gzz_rhs, gzz_rhs);
|
||||
bind_or_keep_output(meta->Axx_rhs, Axx_rhs);
|
||||
bind_or_keep_output(meta->Axy_rhs, Axy_rhs);
|
||||
bind_or_keep_output(meta->Axz_rhs, Axz_rhs);
|
||||
bind_or_keep_output(meta->Ayy_rhs, Ayy_rhs);
|
||||
bind_or_keep_output(meta->Ayz_rhs, Ayz_rhs);
|
||||
bind_or_keep_output(meta->Azz_rhs, Azz_rhs);
|
||||
bind_or_keep_output(meta->Gamx_rhs, Gamx_rhs);
|
||||
bind_or_keep_output(meta->Gamy_rhs, Gamy_rhs);
|
||||
bind_or_keep_output(meta->Gamz_rhs, Gamz_rhs);
|
||||
bind_or_keep_output(meta->Lap_rhs, Lap_rhs);
|
||||
bind_or_keep_output(meta->betax_rhs, betax_rhs);
|
||||
bind_or_keep_output(meta->betay_rhs, betay_rhs);
|
||||
bind_or_keep_output(meta->betaz_rhs, betaz_rhs);
|
||||
bind_or_keep_output(meta->dtSfx_rhs, dtSfx_rhs);
|
||||
bind_or_keep_output(meta->dtSfy_rhs, dtSfy_rhs);
|
||||
bind_or_keep_output(meta->dtSfz_rhs, dtSfz_rhs);
|
||||
|
||||
cudaMemcpyToSymbol(metac, meta, sizeof(Meta));
|
||||
|
||||
const ZeroSpec zero_specs[] = {
|
||||
{Mh_ rho, static_cast<size_t>(matrix_size)},
|
||||
@@ -3068,7 +3295,10 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
|
||||
{Mh_ Sz, static_cast<size_t>(matrix_size)},
|
||||
};
|
||||
if (!zero_buffers(zero_specs, sizeof(zero_specs) / sizeof(zero_specs[0])))
|
||||
{
|
||||
*meta = saved_meta;
|
||||
return 1;
|
||||
}
|
||||
map_buffer(cache, chi, Mh_ chi);
|
||||
map_buffer(cache, trK, Mh_ trK);
|
||||
map_buffer(cache, dxx, Mh_ dxx);
|
||||
@@ -3454,12 +3684,13 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
|
||||
//-----------------------------------------------------
|
||||
//-------------------FOR GPU TEST----------------------
|
||||
//-----------------------------------------------------
|
||||
#ifdef TIMING
|
||||
cudaThreadSynchronize();
|
||||
gettimeofday(&tv2, NULL);
|
||||
cout<<"MPI rank is: "<<mpi_rank<<" GPU TIME is"<<TimeBetween(tv1, tv2)<<" (s)."<<endl;
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef TIMING
|
||||
cudaThreadSynchronize();
|
||||
gettimeofday(&tv2, NULL);
|
||||
cout<<"MPI rank is: "<<mpi_rank<<" GPU TIME is"<<TimeBetween(tv1, tv2)<<" (s)."<<endl;
|
||||
#endif
|
||||
|
||||
*meta = saved_meta;
|
||||
|
||||
return 0;//TODO return
|
||||
}
|
||||
|
||||
@@ -69,6 +69,8 @@ int bssn_gpu_bind_process_device(int mpi_rank);
|
||||
void bssn_gpu_clear_cached_device_buffers();
|
||||
const double *bssn_gpu_find_device_buffer(const double *host_ptr);
|
||||
void bssn_gpu_register_device_buffer(const double *host_ptr, const double *device_ptr);
|
||||
int bssn_gpu_stage_upload_buffer(const double *host_ptr, int count);
|
||||
int bssn_gpu_stage_zero_buffer(const double *host_ptr, int count);
|
||||
|
||||
/** Init GPU side data in GPUMeta. */
|
||||
// void init_fluid_meta_gpu(GPUMeta *gpu_meta);
|
||||
|
||||
Reference in New Issue
Block a user