From 74ba5feb86616c976094acede925d8e9c307a8df Mon Sep 17 00:00:00 2001 From: CGH0S7 <776459475@qq.com> Date: Sat, 2 May 2026 19:21:57 +0800 Subject: [PATCH] Pin EScalar scalar CUDA transfers --- AMSS_NCKU_source/bssn_rhs_cuda.cu | 33 +++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/AMSS_NCKU_source/bssn_rhs_cuda.cu b/AMSS_NCKU_source/bssn_rhs_cuda.cu index c3ef27d..90bad7b 100644 --- a/AMSS_NCKU_source/bssn_rhs_cuda.cu +++ b/AMSS_NCKU_source/bssn_rhs_cuda.cu @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include "macrodef.h" @@ -198,6 +199,34 @@ static int rhs_stage_timing_every() { return every; } +static bool escalar_host_pin_enabled() { + static int enabled = -1; + if (enabled < 0) { + const char *env = getenv("AMSS_CUDA_PIN_ESCALAR_TRANSFERS"); + enabled = (!env || atoi(env) != 0) ? 1 : 0; + } + return enabled != 0; +} + +static void try_pin_escalar_host_buffer(void *ptr, size_t bytes) { + if (!ptr || bytes == 0 || !escalar_host_pin_enabled()) + return; + + static std::unordered_set registered; + static std::unordered_set failed; + if (registered.find(ptr) != registered.end() || + failed.find(ptr) != failed.end()) + return; + + cudaError_t err = cudaHostRegister(ptr, bytes, cudaHostRegisterPortable); + if (err == cudaSuccess || err == cudaErrorHostMemoryAlreadyRegistered) { + registered.insert(ptr); + } else { + failed.insert(ptr); + } + cudaGetLastError(); +} + static double cuda_profile_now_ms() { using clock = std::chrono::steady_clock; return std::chrono::duration( @@ -7030,6 +7059,10 @@ int bssn_cuda_compute_escalar_matter(void *block_tag, g_buf.slot[S_Ayy], g_buf.slot[S_Ayz], g_buf.slot[S_Azz]); set_resident_host_clean(ctx, input_bank, false); } + try_pin_escalar_host_buffer(Sphi_host, bytes); + try_pin_escalar_host_buffer(Spi_host, bytes); + try_pin_escalar_host_buffer(Sphi_rhs_host, bytes); + try_pin_escalar_host_buffer(Spi_rhs_host, bytes); CUDA_CHECK(cudaMemcpyAsync(g_buf.slot[S_S_arr], Sphi_host, bytes, cudaMemcpyHostToDevice)); CUDA_CHECK(cudaMemcpyAsync(g_buf.slot[S_f_arr], Spi_host, bytes, cudaMemcpyHostToDevice));