Add mixed GPU RP path for EScalar

2026-05-02 18:27:26 +08:00
parent 59a216ad93
commit f638cbc4e8
5 changed files with 280 additions and 23 deletions
--- a/AMSS_NCKU_source/bssn_rhs_cuda.cu
+++ b/AMSS_NCKU_source/bssn_rhs_cuda.cu
@@ -7565,6 +7565,78 @@ int bssn_cuda_unpack_state_batch_from_host_buffer_for_host_views(void *block_tag
    return 0;
 }

+// Runs kern_restrict_state_region_batch on the block's resident device bank and copies the packed output
+// (state_count * sx*sy*sz doubles) to host_buffer; fi0/fj0/fk0 go to the kernel and state_soa to
+// upload_comm_state_soa unchanged. Returns 0 on success, 1 on invalid arguments or no valid resident bank.
+extern "C"
+int bssn_cuda_restrict_state_batch_to_host_buffer_for_host_views(
+    void *block_tag, double **state_host_key, int state_count, double *host_buffer, int *ex,
+    int sx, int sy, int sz, int fi0, int fj0, int fk0, const double *state_soa)
+{
+    init_gpu_dispatch();
+    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
+    if (state_count <= 0 || state_count > BSSN_STATE_COUNT) return 1;
+    if (!host_buffer || sx <= 0 || sy <= 0 || sz <= 0) return 1;
+    // ex must be checked before it is dereferenced; non-positive extents cannot describe a block.
+    if (!ex || ex[0] <= 0 || ex[1] <= 0 || ex[2] <= 0) return 1;
+    // Volumes are formed in size_t to avoid int overflow; they are narrowed to int for the kernel call.
+    const size_t int_max = (size_t)(~0u >> 1);  // largest value an int can hold
+    const size_t all = (size_t)ex[0] * ex[1] * ex[2];
+    const size_t region = (size_t)sx * sy * sz;
+    if (all > int_max || region > int_max) return 1;
+    StepContext &ctx = ensure_step_ctx(block_tag, all);
+    const int bank = active_or_keyed_bank(ctx, state_host_key, all, false);
+    if (bank < 0 || !ctx.resident_valid[bank]) return 1;
+    const size_t total_doubles = (size_t)state_count * region;
+    double *d_comm = ensure_step_comm_buffer(ctx, total_doubles);
+    upload_comm_state_soa(state_soa, state_count);
+    dim3 launch_grid((unsigned int)grid(region), (unsigned int)state_count);
+    kern_restrict_state_region_batch<<<launch_grid, BLK>>>(
+        ctx.d_resident_mem[bank], d_comm, ex[0], ex[1], sx, sy, sz,
+        fi0, fj0, fk0, (int)region, state_count, (int)all);
+    // Launch-configuration failures are reported only through cudaGetLastError, so check it here.
+    CUDA_CHECK(cudaGetLastError());
+    CUDA_CHECK(cudaMemcpy(host_buffer, d_comm, total_doubles * sizeof(double), cudaMemcpyDeviceToHost));
+    return 0;
+}
+
+extern "C"
+int bssn_cuda_prolong_state_batch_to_host_buffer_for_host_views(void *block_tag,
+                                                                double **state_host_key,
+                                                                int state_count,
+                                                                double *host_buffer,
+                                                                int *ex,
+                                                                int sx, int sy, int sz,
+                                                                int ii0, int jj0, int kk0,
+                                                                int lbc_i, int lbc_j, int lbc_k,
+                                                                const double *state_soa)
+{
+    init_gpu_dispatch();
+    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
+    if (state_count <= 0 || state_count > BSSN_STATE_COUNT) return 1;
+    if (!host_buffer || sx <= 0 || sy <= 0 || sz <= 0) return 1;
+    StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]);
+    const size_t all = (size_t)ex[0] * ex[1] * ex[2];
+    const int bank = active_or_keyed_bank(ctx, state_host_key, all, false);
+    if (bank < 0 || !ctx.resident_valid[bank]) return 1;
+    const int region_all = sx * sy * sz;
+    const size_t total_doubles = (size_t)state_count * (size_t)region_all;
+    double *d_comm = ensure_step_comm_buffer(ctx, total_doubles);
+    upload_comm_state_soa(state_soa, state_count);
+    dim3 launch_grid((unsigned int)grid((size_t)region_all),
+                     (unsigned int)state_count);
+    kern_prolong_state_region_batch<<<launch_grid, BLK>>>(
+        ctx.d_resident_mem[bank], d_comm,
+        ex[0], ex[1], sx, sy, sz,
+        ii0, jj0, kk0, lbc_i, lbc_j, lbc_k,
+        region_all, state_count,
+        ex[0] * ex[1] * ex[2]);
+    CUDA_CHECK(cudaMemcpy(host_buffer, d_comm,
+                          total_doubles * sizeof(double),
+                          cudaMemcpyDeviceToHost));
+    return 0;
+}
+
 static void copy_state_device_batch(void *block_tag,
                                    int state_count,
                                    double *device_buffer,