Optimize BSSN CUDA state transfers

2026-04-29 18:34:31 +08:00
parent 22c1e7168b
commit 090d8657ae
3 changed files with 332 additions and 9 deletions
--- a/AMSS_NCKU_source/bssn_rhs_cuda.cu
+++ b/AMSS_NCKU_source/bssn_rhs_cuda.cu
@@ -74,6 +74,12 @@ struct CudaProfileStats {
    double bc_ms;
    double finalize_ms;
    double output_ms;
+    long long upload_calls;
+    long long resident_download_calls;
+    double upload_ms;
+    double resident_download_ms;
+    double upload_gb;
+    double resident_download_gb;
 };

 enum RhsStageId {
@@ -97,7 +103,10 @@ struct RhsStageProfileStats {
 };

 static CudaProfileStats &cuda_profile_stats() {
-    static CudaProfileStats stats = {0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
+    static CudaProfileStats stats = {
+        0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+        0, 0, 0.0, 0.0, 0.0, 0.0
+    };
    return stats;
 }

@@ -162,7 +171,8 @@ static void cuda_profile_maybe_log() {
    CudaProfileStats &stats = cuda_profile_stats();
    if (stats.calls <= 0 || stats.calls % cuda_profile_every() != 0) return;
    fprintf(stderr,
-            "[AMSS-CUDA][rank %d][dev %d] calls=%lld avg_total=%.3f ms avg_state=%.3f ms avg_matter=%.3f ms avg_rhs=%.3f ms avg_bc=%.3f ms avg_finalize=%.3f ms avg_output=%.3f ms\n",
+            "[AMSS-CUDA][rank %d][dev %d] calls=%lld avg_total=%.3f ms avg_state=%.3f ms avg_matter=%.3f ms avg_rhs=%.3f ms avg_bc=%.3f ms avg_finalize=%.3f ms avg_output=%.3f ms"
+            " uploads=%lld avg_upload=%.3f ms upload_GB=%.3f resident_downloads=%lld avg_resident_download=%.3f ms resident_download_GB=%.3f\n",
            g_dispatch.my_rank, g_dispatch.my_device, stats.calls,
            stats.total_ms / (double)stats.calls,
            stats.state_ms / (double)stats.calls,
@@ -170,7 +180,13 @@ static void cuda_profile_maybe_log() {
            stats.rhs_ms / (double)stats.calls,
            stats.bc_ms / (double)stats.calls,
            stats.finalize_ms / (double)stats.calls,
-            stats.output_ms / (double)stats.calls);
+            stats.output_ms / (double)stats.calls,
+            stats.upload_calls,
+            stats.upload_calls ? stats.upload_ms / (double)stats.upload_calls : 0.0,
+            stats.upload_gb,
+            stats.resident_download_calls,
+            stats.resident_download_calls ? stats.resident_download_ms / (double)stats.resident_download_calls : 0.0,
+            stats.resident_download_gb);
    fflush(stderr);
 }

@@ -542,6 +558,8 @@ struct StepAllocation {

 static std::unordered_map<void *, StepContext> g_step_ctx;
 static std::vector<StepAllocation> g_step_pool;
+static int *g_comm_segment_meta = nullptr;
+static size_t g_comm_segment_meta_cap = 0;

 static StepAllocation empty_step_allocation()
 {
@@ -760,6 +778,20 @@ static double *ensure_step_host_comm_buffer(StepContext &ctx, size_t needed_doub
    return ctx.h_comm_mem;
 }

+static int *ensure_comm_segment_meta_buffer(size_t needed_ints)
+{
+    if (needed_ints == 0) return nullptr;
+    if (g_comm_segment_meta_cap < needed_ints) {
+        if (g_comm_segment_meta) {
+            CUDA_CHECK(cudaFree(g_comm_segment_meta));
+            g_comm_segment_meta = nullptr;
+        }
+        CUDA_CHECK(cudaMalloc(&g_comm_segment_meta, needed_ints * sizeof(int)));
+        g_comm_segment_meta_cap = needed_ints;
+    }
+    return g_comm_segment_meta;
+}
+
 static void upload_grid_params_if_needed(const GridParams &gp)
 {
    if (!g_gp_host_cache_valid ||
@@ -4716,18 +4748,25 @@ static void compute_patch_boundary_flags(int *ex,
 static void upload_state_inputs(double **state_host, size_t all)
 {
    const size_t bytes = all * sizeof(double);
+    const bool profile = cuda_profile_enabled();
+    const double t0 = profile ? cuda_profile_now_ms() : 0.0;
    static int direct_upload = -1;
    if (direct_upload < 0) {
        const char *env = getenv("AMSS_CUDA_DIRECT_STATE_UPLOAD");
-        const char *pin_env = getenv("AMSS_CUDA_PIN_GRIDFUNCS");
-        direct_upload = env ? ((atoi(env) != 0) ? 1 : 0)
-                            : ((pin_env && atoi(pin_env) != 0) ? 1 : 0);
+        direct_upload = env ? ((atoi(env) != 0) ? 1 : 0) : 1;
    }
    if (direct_upload) {
        for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
            CUDA_CHECK(cudaMemcpyAsync(g_buf.slot[k_state_input_slots[i]], state_host[i],
                                       bytes, cudaMemcpyHostToDevice));
        }
+        if (profile) {
+            cuda_profile_sync();
+            CudaProfileStats &stats = cuda_profile_stats();
+            stats.upload_calls++;
+            stats.upload_ms += cuda_profile_now_ms() - t0;
+            stats.upload_gb += (double)((size_t)BSSN_STATE_COUNT * bytes) / 1.0e9;
+        }
        return;
    }
    for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
@@ -4736,6 +4775,12 @@ static void upload_state_inputs(double **state_host, size_t all)
    CUDA_CHECK(cudaMemcpy(g_buf.slot[S_chi], g_buf.h_stage,
                          (size_t)BSSN_STATE_COUNT * bytes,
                          cudaMemcpyHostToDevice));
+    if (profile) {
+        CudaProfileStats &stats = cuda_profile_stats();
+        stats.upload_calls++;
+        stats.upload_ms += cuda_profile_now_ms() - t0;
+        stats.upload_gb += (double)((size_t)BSSN_STATE_COUNT * bytes) / 1.0e9;
+    }
 }

 static void upload_matter_cache(StepContext &ctx,
@@ -5168,6 +5213,62 @@ __global__ void kern_unpack_state_region_batch(double * __restrict__ dst_mem,
    }
 }

+__global__ void kern_pack_state_segments_batch(const double * __restrict__ src_mem,
+                                               double * __restrict__ dst,
+                                               int nx, int ny,
+                                               const int * __restrict__ meta,
+                                               int state_count,
+                                               int all)
+{
+    const int segment = blockIdx.z;
+    const int state_index = blockIdx.y;
+    const int *m = meta + segment * 8;
+    const int i0 = m[0], j0 = m[1], k0 = m[2];
+    const int sx = m[3], sy = m[4];
+    const int region_all = m[6];
+    const int offset = m[7];
+    if (state_index >= state_count) return;
+    for (int local = blockIdx.x * blockDim.x + threadIdx.x;
+         local < region_all;
+         local += blockDim.x * gridDim.x)
+    {
+        const int ii = local % sx;
+        const int jj = (local / sx) % sy;
+        const int kk = local / (sx * sy);
+        const int src = (i0 + ii) + (j0 + jj) * nx + (k0 + kk) * nx * ny;
+        dst[(size_t)offset + (size_t)state_index * region_all + local] =
+            src_mem[(size_t)state_index * all + src];
+    }
+}
+
+__global__ void kern_unpack_state_segments_batch(double * __restrict__ dst_mem,
+                                                 const double * __restrict__ src,
+                                                 int nx, int ny,
+                                                 const int * __restrict__ meta,
+                                                 int state_count,
+                                                 int all)
+{
+    const int segment = blockIdx.z;
+    const int state_index = blockIdx.y;
+    const int *m = meta + segment * 8;
+    const int i0 = m[0], j0 = m[1], k0 = m[2];
+    const int sx = m[3], sy = m[4];
+    const int region_all = m[6];
+    const int offset = m[7];
+    if (state_index >= state_count) return;
+    for (int local = blockIdx.x * blockDim.x + threadIdx.x;
+         local < region_all;
+         local += blockDim.x * gridDim.x)
+    {
+        const int ii = local % sx;
+        const int jj = (local / sx) % sy;
+        const int kk = local / (sx * sy);
+        const int dst = (i0 + ii) + (j0 + jj) * nx + (k0 + kk) * nx * ny;
+        dst_mem[(size_t)state_index * all + dst] =
+            src[(size_t)offset + (size_t)state_index * region_all + local];
+    }
+}
+
 __global__ void kern_pack_state_subset(const double * __restrict__ src_mem,
                                       double * __restrict__ dst,
                                       int subset_count,
@@ -5308,12 +5409,12 @@ static void download_resident_state(void *block_tag, int *ex, double **state_hos
    const size_t all = (size_t)ex[0] * ex[1] * ex[2];
    const size_t bytes = all * sizeof(double);
    StepContext &ctx = ensure_step_ctx(block_tag, all);
+    const bool profile = cuda_profile_enabled();
+    const double t0 = profile ? cuda_profile_now_ms() : 0.0;
    static int direct_download = -1;
    if (direct_download < 0) {
        const char *env = getenv("AMSS_CUDA_DIRECT_STATE_DOWNLOAD");
-        const char *pin_env = getenv("AMSS_CUDA_PIN_GRIDFUNCS");
-        direct_download = env ? ((atoi(env) != 0) ? 1 : 0)
-                              : ((pin_env && atoi(pin_env) != 0) ? 1 : 0);
+        direct_download = env ? ((atoi(env) != 0) ? 1 : 0) : 1;
    }
    if (direct_download) {
        for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
@@ -5321,6 +5422,12 @@ static void download_resident_state(void *block_tag, int *ex, double **state_hos
                                       bytes, cudaMemcpyDeviceToHost));
        }
        CUDA_CHECK(cudaDeviceSynchronize());
+        if (profile) {
+            CudaProfileStats &stats = cuda_profile_stats();
+            stats.resident_download_calls++;
+            stats.resident_download_ms += cuda_profile_now_ms() - t0;
+            stats.resident_download_gb += (double)((size_t)BSSN_STATE_COUNT * bytes) / 1.0e9;
+        }
        return;
    }
    CUDA_CHECK(cudaMemcpy(g_buf.h_stage, ctx.d_state_curr_mem,
@@ -5329,6 +5436,12 @@ static void download_resident_state(void *block_tag, int *ex, double **state_hos
    for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
        std::memcpy(state_host_out[i], g_buf.h_stage + (size_t)i * all, bytes);
    }
+    if (profile) {
+        CudaProfileStats &stats = cuda_profile_stats();
+        stats.resident_download_calls++;
+        stats.resident_download_ms += cuda_profile_now_ms() - t0;
+        stats.resident_download_gb += (double)((size_t)BSSN_STATE_COUNT * bytes) / 1.0e9;
+    }
 }

 static void copy_state_subset(void *block_tag,
@@ -6060,6 +6173,47 @@ static void copy_state_device_batch(void *block_tag,
    }
 }

+static void copy_state_device_segments(void *block_tag,
+                                       int state_count,
+                                       double *device_buffer,
+                                       const int *ex,
+                                       int segment_count,
+                                       const int *segment_meta,
+                                       int pack_not_unpack)
+{
+    if (state_count <= 0 || state_count > BSSN_STATE_COUNT) return;
+    if (segment_count <= 0 || !segment_meta) return;
+
+    int max_region_all = 0;
+    for (int s = 0; s < segment_count; ++s) {
+        const int *m = segment_meta + s * 8;
+        if (m[3] <= 0 || m[4] <= 0 || m[5] <= 0 || m[6] <= 0) return;
+        if (m[6] > max_region_all) max_region_all = m[6];
+    }
+    if (max_region_all <= 0) return;
+
+    StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]);
+    int *d_meta = ensure_comm_segment_meta_buffer((size_t)segment_count * 8);
+    CUDA_CHECK(cudaMemcpy(d_meta, segment_meta,
+                          (size_t)segment_count * 8 * sizeof(int),
+                          cudaMemcpyHostToDevice));
+
+    dim3 launch_grid((unsigned int)grid((size_t)max_region_all),
+                     (unsigned int)state_count,
+                     (unsigned int)segment_count);
+    if (pack_not_unpack) {
+        kern_pack_state_segments_batch<<<launch_grid, BLK>>>(
+            ctx.d_state_curr_mem, device_buffer,
+            ex[0], ex[1], d_meta, state_count,
+            ex[0] * ex[1] * ex[2]);
+    } else {
+        kern_unpack_state_segments_batch<<<launch_grid, BLK>>>(
+            ctx.d_state_curr_mem, device_buffer,
+            ex[0], ex[1], d_meta, state_count,
+            ex[0] * ex[1] * ex[2]);
+    }
+}
+
 extern "C"
 int bssn_cuda_pack_state_batch_to_device_buffer(void *block_tag,
                                                 int state_count,
@@ -6090,6 +6244,36 @@ int bssn_cuda_unpack_state_batch_from_device_buffer(void *block_tag,
    return 0;
 }

+extern "C"
+int bssn_cuda_pack_state_segments_to_device_buffer(void *block_tag,
+                                                   int state_count,
+                                                   double *device_buffer,
+                                                   int *ex,
+                                                   int segment_count,
+                                                   const int *segment_meta)
+{
+    init_gpu_dispatch();
+    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
+    copy_state_device_segments(block_tag, state_count, device_buffer, ex,
+                               segment_count, segment_meta, 1);
+    return 0;
+}
+
+extern "C"
+int bssn_cuda_unpack_state_segments_from_device_buffer(void *block_tag,
+                                                       int state_count,
+                                                       double *device_buffer,
+                                                       int *ex,
+                                                       int segment_count,
+                                                       const int *segment_meta)
+{
+    init_gpu_dispatch();
+    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
+    copy_state_device_segments(block_tag, state_count, device_buffer, ex,
+                               segment_count, segment_meta, 0);
+    return 0;
+}
+
 extern "C"
 int bssn_cuda_download_state_subset(void *block_tag,
                                    int *ex,