Reduce staged GPU host-device copies
This commit is contained in:
@@ -728,7 +728,8 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
|
|||||||
const double SoA[3],
|
const double SoA[3],
|
||||||
int symmetry,
|
int symmetry,
|
||||||
int lev,
|
int lev,
|
||||||
int rk_stage)
|
int rk_stage,
|
||||||
|
bool download_to_host)
|
||||||
{
|
{
|
||||||
struct Rk4VarCache
|
struct Rk4VarCache
|
||||||
{
|
{
|
||||||
@@ -790,7 +791,7 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
|
|||||||
|
|
||||||
ok = ok &&
|
ok = ok &&
|
||||||
(!refresh_state0 || copy_to_device_preferring_device(cache.state0, state0, bytes)) &&
|
(!refresh_state0 || copy_to_device_preferring_device(cache.state0, state0, bytes)) &&
|
||||||
(!need_boundary_input || copy_to_device(cache.boundary, boundary_src, bytes)) &&
|
(!need_boundary_input || copy_to_device_preferring_device(cache.boundary, boundary_src, bytes)) &&
|
||||||
(!refresh_rhs || copy_to_device_preferring_device(cache.rhs, rhs_accum, bytes));
|
(!refresh_rhs || copy_to_device_preferring_device(cache.rhs, rhs_accum, bytes));
|
||||||
|
|
||||||
if (ok && need_stage_input)
|
if (ok && need_stage_input)
|
||||||
@@ -885,16 +886,18 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
|
|||||||
if (ok)
|
if (ok)
|
||||||
{
|
{
|
||||||
bssn_gpu_register_device_buffer(stage_data, stage_ptr);
|
bssn_gpu_register_device_buffer(stage_data, stage_ptr);
|
||||||
|
if (download_to_host)
|
||||||
cudaError_t err = cudaMemcpy(stage_data, stage_ptr, bytes, cudaMemcpyDeviceToHost);
|
{
|
||||||
if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) stage_data", err);
|
cudaError_t err = cudaMemcpy(stage_data, stage_ptr, bytes, cudaMemcpyDeviceToHost);
|
||||||
ok = err == cudaSuccess;
|
if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) stage_data", err);
|
||||||
|
ok = err == cudaSuccess;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return ok ? 0 : 1;
|
return ok ? 0 : 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
int bssn_cuda_lowerbound(int *ex, double *chi, double tinny)
|
int bssn_cuda_lowerbound(int *ex, double *chi, double tinny, bool download_to_host)
|
||||||
{
|
{
|
||||||
static thread_local CachedBuffer d_chi;
|
static thread_local CachedBuffer d_chi;
|
||||||
|
|
||||||
@@ -926,13 +929,32 @@ int bssn_cuda_lowerbound(int *ex, double *chi, double tinny)
|
|||||||
if (ok)
|
if (ok)
|
||||||
{
|
{
|
||||||
bssn_gpu_register_device_buffer(chi, device_chi);
|
bssn_gpu_register_device_buffer(chi, device_chi);
|
||||||
cudaError_t err = cudaMemcpy(chi, device_chi, bytes, cudaMemcpyDeviceToHost);
|
if (download_to_host)
|
||||||
if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) chi", err);
|
{
|
||||||
ok = err == cudaSuccess;
|
cudaError_t err = cudaMemcpy(chi, device_chi, bytes, cudaMemcpyDeviceToHost);
|
||||||
|
if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) chi", err);
|
||||||
|
ok = err == cudaSuccess;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return ok ? 0 : 1;
|
return ok ? 0 : 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int bssn_cuda_download_buffer(int *ex, double *host_ptr)
|
||||||
|
{
|
||||||
|
const double *device_ptr = bssn_gpu_find_device_buffer(host_ptr);
|
||||||
|
if (!device_ptr)
|
||||||
|
return 1;
|
||||||
|
|
||||||
|
const size_t bytes = static_cast<size_t>(count_points(ex)) * sizeof(double);
|
||||||
|
cudaError_t err = cudaMemcpy(host_ptr, device_ptr, bytes, cudaMemcpyDeviceToHost);
|
||||||
|
if (err != cudaSuccess)
|
||||||
|
{
|
||||||
|
report_cuda_error("cudaMemcpy(D2H) buffered download", err);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
int bssn_cuda_interp_points_batch(const int *ex,
|
int bssn_cuda_interp_points_batch(const int *ex,
|
||||||
const double *X, const double *Y, const double *Z,
|
const double *X, const double *Y, const double *Z,
|
||||||
const double *const *fields,
|
const double *const *fields,
|
||||||
|
|||||||
@@ -19,9 +19,11 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
|
|||||||
const double SoA[3],
|
const double SoA[3],
|
||||||
int symmetry,
|
int symmetry,
|
||||||
int lev,
|
int lev,
|
||||||
int rk_stage);
|
int rk_stage,
|
||||||
|
bool download_to_host = true);
|
||||||
|
|
||||||
int bssn_cuda_lowerbound(int *ex, double *chi, double tinny);
|
int bssn_cuda_lowerbound(int *ex, double *chi, double tinny, bool download_to_host = true);
|
||||||
|
int bssn_cuda_download_buffer(int *ex, double *host_ptr);
|
||||||
|
|
||||||
int bssn_cuda_prolong3_pack(int wei,
|
int bssn_cuda_prolong3_pack(int wei,
|
||||||
const double *llbc, const double *uubc, const int *extc, const double *func,
|
const double *llbc, const double *uubc, const int *extc, const double *func,
|
||||||
|
|||||||
@@ -83,7 +83,7 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN)
|
|||||||
cg->fgfs[varlr->data->sgfn],
|
cg->fgfs[varlr->data->sgfn],
|
||||||
varl0->data->propspeed,
|
varl0->data->propspeed,
|
||||||
varl0->data->SoA,
|
varl0->data->SoA,
|
||||||
Symmetry, lev, rk_stage))
|
Symmetry, lev, rk_stage, false))
|
||||||
{
|
{
|
||||||
cerr << "GPU rk4/boundary failure: lev=" << lev
|
cerr << "GPU rk4/boundary failure: lev=" << lev
|
||||||
<< " rk_stage=" << rk_stage
|
<< " rk_stage=" << rk_stage
|
||||||
@@ -101,6 +101,43 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN)
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
auto stage_download_var_list =
|
||||||
|
[&](Block *cg, MyList<var> *var_list) {
|
||||||
|
while (var_list)
|
||||||
|
{
|
||||||
|
if (bssn_cuda_download_buffer(cg->shape, cg->fgfs[var_list->data->sgfn]))
|
||||||
|
{
|
||||||
|
cerr << "GPU stage download failure: lev=" << lev
|
||||||
|
<< " var=" << var_list->data->name
|
||||||
|
<< " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << ","
|
||||||
|
<< cg->bbox[1] << ":" << cg->bbox[4] << ","
|
||||||
|
<< cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
|
||||||
|
ERROR = 1;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
var_list = var_list->next;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
auto stage_upload_var_list =
|
||||||
|
[&](Block *cg, MyList<var> *var_list) {
|
||||||
|
const int n = cg->shape[0] * cg->shape[1] * cg->shape[2];
|
||||||
|
while (var_list)
|
||||||
|
{
|
||||||
|
if (bssn_gpu_stage_upload_buffer(cg->fgfs[var_list->data->sgfn], n))
|
||||||
|
{
|
||||||
|
cerr << "GPU state upload failure: lev=" << lev
|
||||||
|
<< " var=" << var_list->data->name
|
||||||
|
<< " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << ","
|
||||||
|
<< cg->bbox[1] << ":" << cg->bbox[4] << ","
|
||||||
|
<< cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
|
||||||
|
ERROR = 1;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
var_list = var_list->next;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
MyList<Patch> *Pp = GH->PatL[lev];
|
MyList<Patch> *Pp = GH->PatL[lev];
|
||||||
while (Pp)
|
while (Pp)
|
||||||
{
|
{
|
||||||
@@ -110,12 +147,13 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN)
|
|||||||
Block *cg = BP->data;
|
Block *cg = BP->data;
|
||||||
if (myrank == cg->rank)
|
if (myrank == cg->rank)
|
||||||
{
|
{
|
||||||
|
stage_upload_var_list(cg, StateList);
|
||||||
if (gpu_rhs(CALLED_BY_STEP, myrank, RHS_PARA_CALLED_FIRST_TIME))
|
if (gpu_rhs(CALLED_BY_STEP, myrank, RHS_PARA_CALLED_FIRST_TIME))
|
||||||
ERROR = 1;
|
ERROR = 1;
|
||||||
|
|
||||||
run_stage_on_block(cg, Pp->data, StateList, StateList, SynchList_pre, RHSList, iter_count);
|
run_stage_on_block(cg, Pp->data, StateList, StateList, SynchList_pre, RHSList, iter_count);
|
||||||
|
|
||||||
if (bssn_cuda_lowerbound(cg->shape, cg->fgfs[phi->sgfn], chitiny))
|
if (bssn_cuda_lowerbound(cg->shape, cg->fgfs[phi->sgfn], chitiny, false))
|
||||||
{
|
{
|
||||||
cerr << "GPU lowerbound failure: lev=" << lev
|
cerr << "GPU lowerbound failure: lev=" << lev
|
||||||
<< " rk_stage=" << iter_count
|
<< " rk_stage=" << iter_count
|
||||||
@@ -125,6 +163,8 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN)
|
|||||||
<< cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
|
<< cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
|
||||||
ERROR = 1;
|
ERROR = 1;
|
||||||
}
|
}
|
||||||
|
if (!ERROR)
|
||||||
|
stage_download_var_list(cg, SynchList_pre);
|
||||||
}
|
}
|
||||||
if (BP == Pp->data->ble)
|
if (BP == Pp->data->ble)
|
||||||
break;
|
break;
|
||||||
@@ -194,12 +234,13 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN)
|
|||||||
Block *cg = BP->data;
|
Block *cg = BP->data;
|
||||||
if (myrank == cg->rank)
|
if (myrank == cg->rank)
|
||||||
{
|
{
|
||||||
|
stage_upload_var_list(cg, SynchList_pre);
|
||||||
if (gpu_rhs(CALLED_BY_STEP, myrank, RHS_PARA_CALLED_THEN))
|
if (gpu_rhs(CALLED_BY_STEP, myrank, RHS_PARA_CALLED_THEN))
|
||||||
ERROR = 1;
|
ERROR = 1;
|
||||||
|
|
||||||
run_stage_on_block(cg, Pp->data, StateList, SynchList_pre, SynchList_cor, RHSList, iter_count);
|
run_stage_on_block(cg, Pp->data, StateList, SynchList_pre, SynchList_cor, RHSList, iter_count);
|
||||||
|
|
||||||
if (bssn_cuda_lowerbound(cg->shape, cg->fgfs[phi1->sgfn], chitiny))
|
if (bssn_cuda_lowerbound(cg->shape, cg->fgfs[phi1->sgfn], chitiny, false))
|
||||||
{
|
{
|
||||||
cerr << "GPU lowerbound failure: lev=" << lev
|
cerr << "GPU lowerbound failure: lev=" << lev
|
||||||
<< " rk_stage=" << iter_count
|
<< " rk_stage=" << iter_count
|
||||||
@@ -209,6 +250,8 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN)
|
|||||||
<< cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
|
<< cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
|
||||||
ERROR = 1;
|
ERROR = 1;
|
||||||
}
|
}
|
||||||
|
if (!ERROR)
|
||||||
|
stage_download_var_list(cg, SynchList_cor);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (BP == Pp->data->ble)
|
if (BP == Pp->data->ble)
|
||||||
|
|||||||
@@ -149,6 +149,16 @@ struct ExternalBufferRegistry
|
|||||||
int mapped_buffer_count = 0;
|
int mapped_buffer_count = 0;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
struct OwnedBufferRegistry
|
||||||
|
{
|
||||||
|
static const int max_mapped_buffers = 256;
|
||||||
|
const double *host_buffers[max_mapped_buffers] = {nullptr};
|
||||||
|
double *device_buffers[max_mapped_buffers] = {nullptr};
|
||||||
|
size_t capacities[max_mapped_buffers] = {0};
|
||||||
|
bool valid[max_mapped_buffers] = {false};
|
||||||
|
int mapped_buffer_count = 0;
|
||||||
|
};
|
||||||
|
|
||||||
GpuRhsCache &gpu_rhs_cache()
|
GpuRhsCache &gpu_rhs_cache()
|
||||||
{
|
{
|
||||||
static GpuRhsCache cache;
|
static GpuRhsCache cache;
|
||||||
@@ -161,6 +171,12 @@ ExternalBufferRegistry &external_buffer_registry()
|
|||||||
return registry;
|
return registry;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
OwnedBufferRegistry &owned_buffer_registry()
|
||||||
|
{
|
||||||
|
static thread_local OwnedBufferRegistry registry;
|
||||||
|
return registry;
|
||||||
|
}
|
||||||
|
|
||||||
void reset_meta(Meta *meta)
|
void reset_meta(Meta *meta)
|
||||||
{
|
{
|
||||||
memset(meta, 0, sizeof(Meta));
|
memset(meta, 0, sizeof(Meta));
|
||||||
@@ -230,6 +246,114 @@ void map_external_buffer(ExternalBufferRegistry ®istry, const double *host_pt
|
|||||||
registry.mapped_buffer_count++;
|
registry.mapped_buffer_count++;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void invalidate_owned_buffer_map(OwnedBufferRegistry ®istry)
|
||||||
|
{
|
||||||
|
for (int i = 0; i < registry.mapped_buffer_count; ++i)
|
||||||
|
registry.valid[i] = false;
|
||||||
|
}
|
||||||
|
|
||||||
|
const double *find_owned_device_buffer(const OwnedBufferRegistry ®istry, const double *host_ptr)
|
||||||
|
{
|
||||||
|
if (!host_ptr)
|
||||||
|
return nullptr;
|
||||||
|
|
||||||
|
for (int i = 0; i < registry.mapped_buffer_count; ++i)
|
||||||
|
{
|
||||||
|
if (registry.valid[i] && registry.host_buffers[i] == host_ptr)
|
||||||
|
return registry.device_buffers[i];
|
||||||
|
}
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
|
|
||||||
|
int find_owned_buffer_slot(OwnedBufferRegistry ®istry, const double *host_ptr)
|
||||||
|
{
|
||||||
|
int reusable_slot = -1;
|
||||||
|
for (int i = 0; i < registry.mapped_buffer_count; ++i)
|
||||||
|
{
|
||||||
|
if (registry.host_buffers[i] == host_ptr)
|
||||||
|
return i;
|
||||||
|
if (!registry.valid[i] && reusable_slot < 0)
|
||||||
|
reusable_slot = i;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (reusable_slot >= 0)
|
||||||
|
{
|
||||||
|
registry.host_buffers[reusable_slot] = host_ptr;
|
||||||
|
return reusable_slot;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (registry.mapped_buffer_count >= OwnedBufferRegistry::max_mapped_buffers)
|
||||||
|
return -1;
|
||||||
|
|
||||||
|
const int slot = registry.mapped_buffer_count++;
|
||||||
|
registry.host_buffers[slot] = host_ptr;
|
||||||
|
registry.device_buffers[slot] = nullptr;
|
||||||
|
registry.capacities[slot] = 0;
|
||||||
|
registry.valid[slot] = false;
|
||||||
|
return slot;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool ensure_owned_buffer_capacity(OwnedBufferRegistry ®istry, int slot, size_t bytes)
|
||||||
|
{
|
||||||
|
if (slot < 0)
|
||||||
|
return false;
|
||||||
|
if (registry.device_buffers[slot] && registry.capacities[slot] >= bytes)
|
||||||
|
return true;
|
||||||
|
|
||||||
|
if (registry.device_buffers[slot])
|
||||||
|
{
|
||||||
|
cudaError_t free_err = cudaFree(registry.device_buffers[slot]);
|
||||||
|
if (free_err != cudaSuccess)
|
||||||
|
{
|
||||||
|
cerr << "cudaFree failed: " << cudaGetErrorString(free_err) << endl;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
registry.device_buffers[slot] = nullptr;
|
||||||
|
registry.capacities[slot] = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
cudaError_t err = cudaMalloc((void **)®istry.device_buffers[slot], bytes);
|
||||||
|
if (err != cudaSuccess)
|
||||||
|
{
|
||||||
|
cerr << "cudaMalloc failed: " << cudaGetErrorString(err) << endl;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
registry.capacities[slot] = bytes;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool prepare_owned_buffer(const double *host_ptr, size_t count, bool zero_fill)
|
||||||
|
{
|
||||||
|
if (!host_ptr || count == 0)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
OwnedBufferRegistry ®istry = owned_buffer_registry();
|
||||||
|
const int slot = find_owned_buffer_slot(registry, host_ptr);
|
||||||
|
if (slot < 0)
|
||||||
|
{
|
||||||
|
cerr << "owned CUDA buffer registry exhausted" << endl;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
const size_t bytes = count * sizeof(double);
|
||||||
|
if (!ensure_owned_buffer_capacity(registry, slot, bytes))
|
||||||
|
return false;
|
||||||
|
|
||||||
|
cudaError_t err = zero_fill
|
||||||
|
? cudaMemset(registry.device_buffers[slot], 0, bytes)
|
||||||
|
: cudaMemcpy(registry.device_buffers[slot], host_ptr, bytes, cudaMemcpyHostToDevice);
|
||||||
|
if (err != cudaSuccess)
|
||||||
|
{
|
||||||
|
cerr << (zero_fill ? "cudaMemset" : "cudaMemcpy(H2D)")
|
||||||
|
<< " failed: " << cudaGetErrorString(err) << endl;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
registry.valid[slot] = true;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
bool ensure_device_buffer(double **ptr, size_t count)
|
bool ensure_device_buffer(double **ptr, size_t count)
|
||||||
{
|
{
|
||||||
if (*ptr)
|
if (*ptr)
|
||||||
@@ -270,6 +394,17 @@ bool copy_buffers_to_device(const CopySpec *specs, size_t count)
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool copy_buffer_to_device(double *dst, const double *src, size_t count)
|
||||||
|
{
|
||||||
|
cudaError_t err = cudaMemcpy(dst, src, count * sizeof(double), cudaMemcpyHostToDevice);
|
||||||
|
if (err != cudaSuccess)
|
||||||
|
{
|
||||||
|
cerr << "cudaMemcpy(H2D) failed: " << cudaGetErrorString(err) << endl;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
const double *find_external_device_buffer(const ExternalBufferRegistry ®istry, const double *host_ptr)
|
const double *find_external_device_buffer(const ExternalBufferRegistry ®istry, const double *host_ptr)
|
||||||
{
|
{
|
||||||
if (!host_ptr)
|
if (!host_ptr)
|
||||||
@@ -326,8 +461,25 @@ bool zero_buffers(const ZeroSpec *specs, size_t count)
|
|||||||
void cleanup_gpu_rhs_cache()
|
void cleanup_gpu_rhs_cache()
|
||||||
{
|
{
|
||||||
GpuRhsCache &cache = gpu_rhs_cache();
|
GpuRhsCache &cache = gpu_rhs_cache();
|
||||||
|
OwnedBufferRegistry &owned = owned_buffer_registry();
|
||||||
if (!cache.allocated)
|
if (!cache.allocated)
|
||||||
|
{
|
||||||
|
for (int i = 0; i < owned.mapped_buffer_count; ++i)
|
||||||
|
{
|
||||||
|
if (owned.device_buffers[i])
|
||||||
|
{
|
||||||
|
cudaError_t free_err = cudaFree(owned.device_buffers[i]);
|
||||||
|
if (free_err != cudaSuccess)
|
||||||
|
cerr << "cudaFree failed: " << cudaGetErrorString(free_err) << endl;
|
||||||
|
}
|
||||||
|
owned.device_buffers[i] = nullptr;
|
||||||
|
owned.capacities[i] = 0;
|
||||||
|
owned.valid[i] = false;
|
||||||
|
owned.host_buffers[i] = nullptr;
|
||||||
|
}
|
||||||
|
owned.mapped_buffer_count = 0;
|
||||||
return;
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
if (cache.device >= 0)
|
if (cache.device >= 0)
|
||||||
cudaSetDevice(cache.device);
|
cudaSetDevice(cache.device);
|
||||||
@@ -342,6 +494,21 @@ void cleanup_gpu_rhs_cache()
|
|||||||
cache.last_z = nullptr;
|
cache.last_z = nullptr;
|
||||||
reset_buffer_map(cache);
|
reset_buffer_map(cache);
|
||||||
reset_external_buffer_map(external_buffer_registry());
|
reset_external_buffer_map(external_buffer_registry());
|
||||||
|
|
||||||
|
for (int i = 0; i < owned.mapped_buffer_count; ++i)
|
||||||
|
{
|
||||||
|
if (owned.device_buffers[i])
|
||||||
|
{
|
||||||
|
cudaError_t free_err = cudaFree(owned.device_buffers[i]);
|
||||||
|
if (free_err != cudaSuccess)
|
||||||
|
cerr << "cudaFree failed: " << cudaGetErrorString(free_err) << endl;
|
||||||
|
}
|
||||||
|
owned.device_buffers[i] = nullptr;
|
||||||
|
owned.capacities[i] = 0;
|
||||||
|
owned.valid[i] = false;
|
||||||
|
owned.host_buffers[i] = nullptr;
|
||||||
|
}
|
||||||
|
owned.mapped_buffer_count = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool register_gpu_rhs_cleanup()
|
bool register_gpu_rhs_cleanup()
|
||||||
@@ -634,6 +801,9 @@ const double *find_mapped_device_buffer(const GpuRhsCache &cache, const double *
|
|||||||
const double *bssn_gpu_find_device_buffer(const double *host_ptr)
|
const double *bssn_gpu_find_device_buffer(const double *host_ptr)
|
||||||
{
|
{
|
||||||
const double *device_ptr = find_external_device_buffer(external_buffer_registry(), host_ptr);
|
const double *device_ptr = find_external_device_buffer(external_buffer_registry(), host_ptr);
|
||||||
|
if (device_ptr)
|
||||||
|
return device_ptr;
|
||||||
|
device_ptr = find_owned_device_buffer(owned_buffer_registry(), host_ptr);
|
||||||
if (device_ptr)
|
if (device_ptr)
|
||||||
return device_ptr;
|
return device_ptr;
|
||||||
return find_mapped_device_buffer(gpu_rhs_cache(), host_ptr);
|
return find_mapped_device_buffer(gpu_rhs_cache(), host_ptr);
|
||||||
@@ -659,6 +829,7 @@ void bssn_gpu_clear_cached_device_buffers()
|
|||||||
{
|
{
|
||||||
reset_external_buffer_map(external_buffer_registry());
|
reset_external_buffer_map(external_buffer_registry());
|
||||||
reset_buffer_map(gpu_rhs_cache());
|
reset_buffer_map(gpu_rhs_cache());
|
||||||
|
invalidate_owned_buffer_map(owned_buffer_registry());
|
||||||
}
|
}
|
||||||
|
|
||||||
void bssn_gpu_register_device_buffer(const double *host_ptr, const double *device_ptr)
|
void bssn_gpu_register_device_buffer(const double *host_ptr, const double *device_ptr)
|
||||||
@@ -666,6 +837,16 @@ void bssn_gpu_register_device_buffer(const double *host_ptr, const double *devic
|
|||||||
map_external_buffer(external_buffer_registry(), host_ptr, device_ptr);
|
map_external_buffer(external_buffer_registry(), host_ptr, device_ptr);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int bssn_gpu_stage_upload_buffer(const double *host_ptr, int count)
|
||||||
|
{
|
||||||
|
return prepare_owned_buffer(host_ptr, static_cast<size_t>(count), false) ? 0 : 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
int bssn_gpu_stage_zero_buffer(const double *host_ptr, int count)
|
||||||
|
{
|
||||||
|
return prepare_owned_buffer(host_ptr, static_cast<size_t>(count), true) ? 0 : 1;
|
||||||
|
}
|
||||||
|
|
||||||
__global__ void test_const_address(double * testd){
|
__global__ void test_const_address(double * testd){
|
||||||
int _t = blockIdx.x*blockDim.x+threadIdx.x;
|
int _t = blockIdx.x*blockDim.x+threadIdx.x;
|
||||||
if(_t == 0)
|
if(_t == 0)
|
||||||
@@ -3009,7 +3190,7 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
//2 ----------------Copy Data to Device------------------
|
//2 ----------------Copy Data to Device------------------
|
||||||
if (cache.last_x != X || cache.last_y != Y || cache.last_z != Z)
|
if (cache.last_x != X || cache.last_y != Y || cache.last_z != Z)
|
||||||
{
|
{
|
||||||
const CopySpec coord_copies[] = {
|
const CopySpec coord_copies[] = {
|
||||||
@@ -3025,35 +3206,81 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
|
|||||||
}
|
}
|
||||||
|
|
||||||
reset_buffer_map(cache);
|
reset_buffer_map(cache);
|
||||||
|
Meta saved_meta = *meta;
|
||||||
|
|
||||||
const CopySpec state_copies[] = {
|
auto bind_or_copy_input = [&](double *&slot, const double *host_ptr, size_t count) -> bool
|
||||||
{Mh_ chi, chi, static_cast<size_t>(matrix_size)},
|
{
|
||||||
{Mh_ dxx, dxx, static_cast<size_t>(matrix_size)},
|
const double *mapped = bssn_gpu_find_device_buffer(host_ptr);
|
||||||
{Mh_ dyy, dyy, static_cast<size_t>(matrix_size)},
|
if (mapped)
|
||||||
{Mh_ dzz, dzz, static_cast<size_t>(matrix_size)},
|
{
|
||||||
{Mh_ trK, trK, static_cast<size_t>(matrix_size)},
|
slot = const_cast<double *>(mapped);
|
||||||
{Mh_ gxy, gxy, static_cast<size_t>(matrix_size)},
|
return true;
|
||||||
{Mh_ gxz, gxz, static_cast<size_t>(matrix_size)},
|
}
|
||||||
{Mh_ gyz, gyz, static_cast<size_t>(matrix_size)},
|
return copy_buffer_to_device(slot, host_ptr, count);
|
||||||
{Mh_ Axx, Axx, static_cast<size_t>(matrix_size)},
|
|
||||||
{Mh_ Axy, Axy, static_cast<size_t>(matrix_size)},
|
|
||||||
{Mh_ Axz, Axz, static_cast<size_t>(matrix_size)},
|
|
||||||
{Mh_ Ayz, Ayz, static_cast<size_t>(matrix_size)},
|
|
||||||
{Mh_ Ayy, Ayy, static_cast<size_t>(matrix_size)},
|
|
||||||
{Mh_ Azz, Azz, static_cast<size_t>(matrix_size)},
|
|
||||||
{Mh_ Gamx, Gamx, static_cast<size_t>(matrix_size)},
|
|
||||||
{Mh_ Gamy, Gamy, static_cast<size_t>(matrix_size)},
|
|
||||||
{Mh_ Gamz, Gamz, static_cast<size_t>(matrix_size)},
|
|
||||||
{Mh_ betax, betax, static_cast<size_t>(matrix_size)},
|
|
||||||
{Mh_ betay, betay, static_cast<size_t>(matrix_size)},
|
|
||||||
{Mh_ betaz, betaz, static_cast<size_t>(matrix_size)},
|
|
||||||
{Mh_ Lap, Lap, static_cast<size_t>(matrix_size)},
|
|
||||||
{Mh_ dtSfx, dtSfx, static_cast<size_t>(matrix_size)},
|
|
||||||
{Mh_ dtSfy, dtSfy, static_cast<size_t>(matrix_size)},
|
|
||||||
{Mh_ dtSfz, dtSfz, static_cast<size_t>(matrix_size)},
|
|
||||||
};
|
};
|
||||||
if (!copy_buffers_to_device_preferring_device(state_copies, sizeof(state_copies) / sizeof(state_copies[0])))
|
|
||||||
|
auto bind_or_keep_output = [&](double *&slot, const double *host_ptr)
|
||||||
|
{
|
||||||
|
const double *mapped = bssn_gpu_find_device_buffer(host_ptr);
|
||||||
|
if (mapped)
|
||||||
|
slot = const_cast<double *>(mapped);
|
||||||
|
};
|
||||||
|
|
||||||
|
if (!(bind_or_copy_input(meta->chi, chi, static_cast<size_t>(matrix_size)) &&
|
||||||
|
bind_or_copy_input(meta->dxx, dxx, static_cast<size_t>(matrix_size)) &&
|
||||||
|
bind_or_copy_input(meta->dyy, dyy, static_cast<size_t>(matrix_size)) &&
|
||||||
|
bind_or_copy_input(meta->dzz, dzz, static_cast<size_t>(matrix_size)) &&
|
||||||
|
bind_or_copy_input(meta->trK, trK, static_cast<size_t>(matrix_size)) &&
|
||||||
|
bind_or_copy_input(meta->gxy, gxy, static_cast<size_t>(matrix_size)) &&
|
||||||
|
bind_or_copy_input(meta->gxz, gxz, static_cast<size_t>(matrix_size)) &&
|
||||||
|
bind_or_copy_input(meta->gyz, gyz, static_cast<size_t>(matrix_size)) &&
|
||||||
|
bind_or_copy_input(meta->Axx, Axx, static_cast<size_t>(matrix_size)) &&
|
||||||
|
bind_or_copy_input(meta->Axy, Axy, static_cast<size_t>(matrix_size)) &&
|
||||||
|
bind_or_copy_input(meta->Axz, Axz, static_cast<size_t>(matrix_size)) &&
|
||||||
|
bind_or_copy_input(meta->Ayz, Ayz, static_cast<size_t>(matrix_size)) &&
|
||||||
|
bind_or_copy_input(meta->Ayy, Ayy, static_cast<size_t>(matrix_size)) &&
|
||||||
|
bind_or_copy_input(meta->Azz, Azz, static_cast<size_t>(matrix_size)) &&
|
||||||
|
bind_or_copy_input(meta->Gamx, Gamx, static_cast<size_t>(matrix_size)) &&
|
||||||
|
bind_or_copy_input(meta->Gamy, Gamy, static_cast<size_t>(matrix_size)) &&
|
||||||
|
bind_or_copy_input(meta->Gamz, Gamz, static_cast<size_t>(matrix_size)) &&
|
||||||
|
bind_or_copy_input(meta->betax, betax, static_cast<size_t>(matrix_size)) &&
|
||||||
|
bind_or_copy_input(meta->betay, betay, static_cast<size_t>(matrix_size)) &&
|
||||||
|
bind_or_copy_input(meta->betaz, betaz, static_cast<size_t>(matrix_size)) &&
|
||||||
|
bind_or_copy_input(meta->Lap, Lap, static_cast<size_t>(matrix_size)) &&
|
||||||
|
bind_or_copy_input(meta->dtSfx, dtSfx, static_cast<size_t>(matrix_size)) &&
|
||||||
|
bind_or_copy_input(meta->dtSfy, dtSfy, static_cast<size_t>(matrix_size)) &&
|
||||||
|
bind_or_copy_input(meta->dtSfz, dtSfz, static_cast<size_t>(matrix_size))))
|
||||||
|
{
|
||||||
|
*meta = saved_meta;
|
||||||
return 1;
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
bind_or_keep_output(meta->chi_rhs, chi_rhs);
|
||||||
|
bind_or_keep_output(meta->trK_rhs, trK_rhs);
|
||||||
|
bind_or_keep_output(meta->gxx_rhs, gxx_rhs);
|
||||||
|
bind_or_keep_output(meta->gxy_rhs, gxy_rhs);
|
||||||
|
bind_or_keep_output(meta->gxz_rhs, gxz_rhs);
|
||||||
|
bind_or_keep_output(meta->gyy_rhs, gyy_rhs);
|
||||||
|
bind_or_keep_output(meta->gyz_rhs, gyz_rhs);
|
||||||
|
bind_or_keep_output(meta->gzz_rhs, gzz_rhs);
|
||||||
|
bind_or_keep_output(meta->Axx_rhs, Axx_rhs);
|
||||||
|
bind_or_keep_output(meta->Axy_rhs, Axy_rhs);
|
||||||
|
bind_or_keep_output(meta->Axz_rhs, Axz_rhs);
|
||||||
|
bind_or_keep_output(meta->Ayy_rhs, Ayy_rhs);
|
||||||
|
bind_or_keep_output(meta->Ayz_rhs, Ayz_rhs);
|
||||||
|
bind_or_keep_output(meta->Azz_rhs, Azz_rhs);
|
||||||
|
bind_or_keep_output(meta->Gamx_rhs, Gamx_rhs);
|
||||||
|
bind_or_keep_output(meta->Gamy_rhs, Gamy_rhs);
|
||||||
|
bind_or_keep_output(meta->Gamz_rhs, Gamz_rhs);
|
||||||
|
bind_or_keep_output(meta->Lap_rhs, Lap_rhs);
|
||||||
|
bind_or_keep_output(meta->betax_rhs, betax_rhs);
|
||||||
|
bind_or_keep_output(meta->betay_rhs, betay_rhs);
|
||||||
|
bind_or_keep_output(meta->betaz_rhs, betaz_rhs);
|
||||||
|
bind_or_keep_output(meta->dtSfx_rhs, dtSfx_rhs);
|
||||||
|
bind_or_keep_output(meta->dtSfy_rhs, dtSfy_rhs);
|
||||||
|
bind_or_keep_output(meta->dtSfz_rhs, dtSfz_rhs);
|
||||||
|
|
||||||
|
cudaMemcpyToSymbol(metac, meta, sizeof(Meta));
|
||||||
|
|
||||||
const ZeroSpec zero_specs[] = {
|
const ZeroSpec zero_specs[] = {
|
||||||
{Mh_ rho, static_cast<size_t>(matrix_size)},
|
{Mh_ rho, static_cast<size_t>(matrix_size)},
|
||||||
@@ -3068,7 +3295,10 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
|
|||||||
{Mh_ Sz, static_cast<size_t>(matrix_size)},
|
{Mh_ Sz, static_cast<size_t>(matrix_size)},
|
||||||
};
|
};
|
||||||
if (!zero_buffers(zero_specs, sizeof(zero_specs) / sizeof(zero_specs[0])))
|
if (!zero_buffers(zero_specs, sizeof(zero_specs) / sizeof(zero_specs[0])))
|
||||||
|
{
|
||||||
|
*meta = saved_meta;
|
||||||
return 1;
|
return 1;
|
||||||
|
}
|
||||||
map_buffer(cache, chi, Mh_ chi);
|
map_buffer(cache, chi, Mh_ chi);
|
||||||
map_buffer(cache, trK, Mh_ trK);
|
map_buffer(cache, trK, Mh_ trK);
|
||||||
map_buffer(cache, dxx, Mh_ dxx);
|
map_buffer(cache, dxx, Mh_ dxx);
|
||||||
@@ -3454,12 +3684,13 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
|
|||||||
//-----------------------------------------------------
|
//-----------------------------------------------------
|
||||||
//-------------------FOR GPU TEST----------------------
|
//-------------------FOR GPU TEST----------------------
|
||||||
//-----------------------------------------------------
|
//-----------------------------------------------------
|
||||||
#ifdef TIMING
|
#ifdef TIMING
|
||||||
cudaThreadSynchronize();
|
cudaThreadSynchronize();
|
||||||
gettimeofday(&tv2, NULL);
|
gettimeofday(&tv2, NULL);
|
||||||
cout<<"MPI rank is: "<<mpi_rank<<" GPU TIME is"<<TimeBetween(tv1, tv2)<<" (s)."<<endl;
|
cout<<"MPI rank is: "<<mpi_rank<<" GPU TIME is"<<TimeBetween(tv1, tv2)<<" (s)."<<endl;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
*meta = saved_meta;
|
||||||
|
|
||||||
return 0;//TODO return
|
return 0;//TODO return
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -69,6 +69,8 @@ int bssn_gpu_bind_process_device(int mpi_rank);
|
|||||||
void bssn_gpu_clear_cached_device_buffers();
|
void bssn_gpu_clear_cached_device_buffers();
|
||||||
const double *bssn_gpu_find_device_buffer(const double *host_ptr);
|
const double *bssn_gpu_find_device_buffer(const double *host_ptr);
|
||||||
void bssn_gpu_register_device_buffer(const double *host_ptr, const double *device_ptr);
|
void bssn_gpu_register_device_buffer(const double *host_ptr, const double *device_ptr);
|
||||||
|
int bssn_gpu_stage_upload_buffer(const double *host_ptr, int count);
|
||||||
|
int bssn_gpu_stage_zero_buffer(const double *host_ptr, int count);
|
||||||
|
|
||||||
/** Init GPU side data in GPUMeta. */
|
/** Init GPU side data in GPUMeta. */
|
||||||
// void init_fluid_meta_gpu(GPUMeta *gpu_meta);
|
// void init_fluid_meta_gpu(GPUMeta *gpu_meta);
|
||||||
|
|||||||
Reference in New Issue
Block a user