Broaden cached CUDA sync paths

This commit is contained in:
2026-05-01 18:03:04 +08:00
parent 51f3819892
commit 35b6ceff02
6 changed files with 122 additions and 6 deletions

View File

@@ -5224,6 +5224,36 @@ static void copy_state_region_packed_batch_cuda(void *block_tag,
}
}
/*
 * Copies a rectangular sub-region of `state_count` state fields between the
 * step context's current-state storage (ctx.d_state_curr_mem) and
 * `device_buffer`, using one kernel launch for the whole batch.
 *
 *   block_tag        key handed to ensure_step_ctx() to look up the context
 *   state_count      number of fields to copy; must be in [1, BSSN_STATE_COUNT]
 *   device_buffer    packed buffer passed straight to the kernel, so it is
 *                    presumably required to be device-accessible memory
 *                    (TODO confirm against callers)
 *   ex               block extents; ex[0..2] are read
 *   i0, j0, k0       region origin, forwarded to the kernel unchanged
 *   sx, sy, sz       region extents; each must be > 0
 *   pack_not_unpack  non-zero: state -> buffer; zero: buffer -> state
 *
 * Invalid arguments make the call a silent no-op, matching the existing
 * guards. Kernels are launched without an explicit stream and are not
 * synchronized or error-checked here; callers that need launch errors must
 * query cudaGetLastError() themselves.
 *
 * NOTE(review): no check is made that the region lies inside `ex`; whether
 * the kernels guard against that is not visible from here.
 */
static void copy_state_region_packed_batch_device_cuda(void *block_tag,
                                                       int state_count,
                                                       double *device_buffer,
                                                       const int *ex,
                                                       int i0, int j0, int k0,
                                                       int sx, int sy, int sz,
                                                       int pack_not_unpack)
{
  if (state_count <= 0 || state_count > BSSN_STATE_COUNT) return;
  /* `ex` is dereferenced below, so reject a null pointer together with the
     other malformed-argument cases instead of crashing. */
  if (!ex || !device_buffer || sx <= 0 || sy <= 0 || sz <= 0) return;

  /* The kernels receive the region size as an int. Form the product in
     size_t, checking after each step so that neither the int conversion nor
     the size_t arithmetic itself can overflow (the original computed
     sx * sy * sz directly in int, which is undefined on overflow). */
  const size_t int_max = (size_t)2147483647;
  const size_t plane_cells = (size_t)sx * (size_t)sy;
  if (plane_cells > int_max) return;
  const size_t region_cells = plane_cells * (size_t)sz;
  if (region_cells > int_max) return;
  const int region_all = (int)region_cells;

  StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]);

  /* Total cells per field in the block, forwarded to the kernels as-is. */
  const int block_all = ex[0] * ex[1] * ex[2];

  /* grid.x covers the cells of the region, grid.y selects the state field. */
  dim3 launch_grid((unsigned int)grid(region_cells),
                   (unsigned int)state_count);

  if (pack_not_unpack) {
    kern_pack_state_region_batch<<<launch_grid, BLK>>>(
        ctx.d_state_curr_mem, device_buffer, ex[0], ex[1],
        i0, j0, k0, sx, sy, sz, region_all, state_count,
        block_all);
  } else {
    kern_unpack_state_region_batch<<<launch_grid, BLK>>>(
        ctx.d_state_curr_mem, device_buffer, ex[0], ex[1],
        i0, j0, k0, sx, sy, sz, region_all, state_count,
        block_all);
    /* Unpacking writes into the current-state storage, so flag it as
       populated once the kernel has been enqueued. */
    ctx.state_ready = true;
  }
}
static void download_resident_state(void *block_tag, int *ex, double **state_host_out)
{
const size_t all = (size_t)ex[0] * ex[1] * ex[2];
@@ -7451,6 +7481,36 @@ extern "C" int z4c_cuda_unpack_state_batch_from_host_buffer(void *block_tag,
return 0;
}
/*
 * C entry point: packs a rectangular region of `state_count` state fields
 * from the block identified by `block_tag` into `device_buffer`.
 *
 * Initializes the GPU dispatch state, selects g_dispatch.my_device, and
 * delegates to copy_state_region_packed_batch_device_cuda() in pack mode.
 * The kernel is only enqueued; this function does not synchronize.
 *
 * Returns 0. Arguments the helper rejects (non-positive extents, null
 * buffer, state_count out of range) make the call a no-op that still
 * returns 0. CUDA failures are routed through CUDA_CHECK, whose failure
 * behaviour is defined elsewhere in the file.
 */
extern "C" int z4c_cuda_pack_state_batch_to_device_buffer(void *block_tag,
                                                          int state_count,
                                                          double *device_buffer,
                                                          int *ex,
                                                          int i0, int j0, int k0,
                                                          int sx, int sy, int sz)
{
  using namespace z4c_cuda;
  init_gpu_dispatch();
  CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
  copy_state_region_packed_batch_device_cuda(block_tag, state_count, device_buffer, ex,
                                             i0, j0, k0, sx, sy, sz, 1);
  /* A kernel launch returns no status of its own; an invalid configuration
     is only visible through cudaGetLastError(). Without this check a
     rejected launch would be reported to the caller as success. */
  CUDA_CHECK(cudaGetLastError());
  return 0;
}
/*
 * C entry point: unpacks `device_buffer` into a rectangular region of
 * `state_count` state fields of the block identified by `block_tag`.
 *
 * Initializes the GPU dispatch state, selects g_dispatch.my_device, and
 * delegates to copy_state_region_packed_batch_device_cuda() in unpack mode
 * (which also marks the block's state as ready). The kernel is only
 * enqueued; this function does not synchronize.
 *
 * Returns 0. Arguments the helper rejects (non-positive extents, null
 * buffer, state_count out of range) make the call a no-op that still
 * returns 0. CUDA failures are routed through CUDA_CHECK, whose failure
 * behaviour is defined elsewhere in the file.
 */
extern "C" int z4c_cuda_unpack_state_batch_from_device_buffer(void *block_tag,
                                                              int state_count,
                                                              double *device_buffer,
                                                              int *ex,
                                                              int i0, int j0, int k0,
                                                              int sx, int sy, int sz)
{
  using namespace z4c_cuda;
  init_gpu_dispatch();
  CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
  copy_state_region_packed_batch_device_cuda(block_tag, state_count, device_buffer, ex,
                                             i0, j0, k0, sx, sy, sz, 0);
  /* A kernel launch returns no status of its own; an invalid configuration
     is only visible through cudaGetLastError(). Without this check a
     rejected launch would be reported to the caller as success. */
  CUDA_CHECK(cudaGetLastError());
  return 0;
}
extern "C" int z4c_cuda_download_state_subset(void *block_tag,
int *ex,
int subset_count,