Add mixed GPU RP path for EScalar

This commit is contained in:
2026-05-02 18:27:26 +08:00
parent 59a216ad93
commit f638cbc4e8
5 changed files with 280 additions and 23 deletions

View File

@@ -7565,6 +7565,78 @@ int bssn_cuda_unpack_state_batch_from_host_buffer_for_host_views(void *block_tag
return 0;
}
extern "C"
int bssn_cuda_restrict_state_batch_to_host_buffer_for_host_views(void *block_tag,
double **state_host_key,
int state_count,
double *host_buffer,
int *ex,
int sx, int sy, int sz,
int fi0, int fj0, int fk0,
const double *state_soa)
{
init_gpu_dispatch();
CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
if (state_count <= 0 || state_count > BSSN_STATE_COUNT) return 1;
if (!host_buffer || sx <= 0 || sy <= 0 || sz <= 0) return 1;
StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]);
const size_t all = (size_t)ex[0] * ex[1] * ex[2];
const int bank = active_or_keyed_bank(ctx, state_host_key, all, false);
if (bank < 0 || !ctx.resident_valid[bank]) return 1;
const int region_all = sx * sy * sz;
const size_t total_doubles = (size_t)state_count * (size_t)region_all;
double *d_comm = ensure_step_comm_buffer(ctx, total_doubles);
upload_comm_state_soa(state_soa, state_count);
dim3 launch_grid((unsigned int)grid((size_t)region_all),
(unsigned int)state_count);
kern_restrict_state_region_batch<<<launch_grid, BLK>>>(
ctx.d_resident_mem[bank], d_comm,
ex[0], ex[1], sx, sy, sz,
fi0, fj0, fk0, region_all, state_count,
ex[0] * ex[1] * ex[2]);
CUDA_CHECK(cudaMemcpy(host_buffer, d_comm,
total_doubles * sizeof(double),
cudaMemcpyDeviceToHost));
return 0;
}
extern "C"
int bssn_cuda_prolong_state_batch_to_host_buffer_for_host_views(void *block_tag,
double **state_host_key,
int state_count,
double *host_buffer,
int *ex,
int sx, int sy, int sz,
int ii0, int jj0, int kk0,
int lbc_i, int lbc_j, int lbc_k,
const double *state_soa)
{
init_gpu_dispatch();
CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
if (state_count <= 0 || state_count > BSSN_STATE_COUNT) return 1;
if (!host_buffer || sx <= 0 || sy <= 0 || sz <= 0) return 1;
StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]);
const size_t all = (size_t)ex[0] * ex[1] * ex[2];
const int bank = active_or_keyed_bank(ctx, state_host_key, all, false);
if (bank < 0 || !ctx.resident_valid[bank]) return 1;
const int region_all = sx * sy * sz;
const size_t total_doubles = (size_t)state_count * (size_t)region_all;
double *d_comm = ensure_step_comm_buffer(ctx, total_doubles);
upload_comm_state_soa(state_soa, state_count);
dim3 launch_grid((unsigned int)grid((size_t)region_all),
(unsigned int)state_count);
kern_prolong_state_region_batch<<<launch_grid, BLK>>>(
ctx.d_resident_mem[bank], d_comm,
ex[0], ex[1], sx, sy, sz,
ii0, jj0, kk0, lbc_i, lbc_j, lbc_k,
region_all, state_count,
ex[0] * ex[1] * ex[2]);
CUDA_CHECK(cudaMemcpy(host_buffer, d_comm,
total_doubles * sizeof(double),
cudaMemcpyDeviceToHost));
return 0;
}
static void copy_state_device_batch(void *block_tag,
int state_count,
double *device_buffer,