Pin EScalar scalar CUDA transfers

This commit is contained in:
2026-05-02 19:21:57 +08:00
parent 6f28111a43
commit 74ba5feb86

View File

@@ -12,6 +12,7 @@
#include <cmath>
#include <cstring>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include <cuda_runtime.h>
#include "macrodef.h"
@@ -198,6 +199,34 @@ static int rhs_stage_timing_every() {
return every;
}
static bool escalar_host_pin_enabled() {
static int enabled = -1;
if (enabled < 0) {
const char *env = getenv("AMSS_CUDA_PIN_ESCALAR_TRANSFERS");
enabled = (!env || atoi(env) != 0) ? 1 : 0;
}
return enabled != 0;
}
static void try_pin_escalar_host_buffer(void *ptr, size_t bytes) {
if (!ptr || bytes == 0 || !escalar_host_pin_enabled())
return;
static std::unordered_set<void *> registered;
static std::unordered_set<void *> failed;
if (registered.find(ptr) != registered.end() ||
failed.find(ptr) != failed.end())
return;
cudaError_t err = cudaHostRegister(ptr, bytes, cudaHostRegisterPortable);
if (err == cudaSuccess || err == cudaErrorHostMemoryAlreadyRegistered) {
registered.insert(ptr);
} else {
failed.insert(ptr);
}
cudaGetLastError();
}
static double cuda_profile_now_ms() {
using clock = std::chrono::steady_clock;
return std::chrono::duration<double, std::milli>(
@@ -7030,6 +7059,10 @@ int bssn_cuda_compute_escalar_matter(void *block_tag,
g_buf.slot[S_Ayy], g_buf.slot[S_Ayz], g_buf.slot[S_Azz]);
set_resident_host_clean(ctx, input_bank, false);
}
try_pin_escalar_host_buffer(Sphi_host, bytes);
try_pin_escalar_host_buffer(Spi_host, bytes);
try_pin_escalar_host_buffer(Sphi_rhs_host, bytes);
try_pin_escalar_host_buffer(Spi_rhs_host, bytes);
CUDA_CHECK(cudaMemcpyAsync(g_buf.slot[S_S_arr], Sphi_host, bytes, cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpyAsync(g_buf.slot[S_f_arr], Spi_host, bytes, cudaMemcpyHostToDevice));