Pin EScalar scalar CUDA transfers
This commit is contained in:
@@ -12,6 +12,7 @@
|
||||
#include <cmath>
|
||||
#include <cstring>
|
||||
#include <unordered_map>
|
||||
#include <unordered_set>
|
||||
#include <vector>
|
||||
#include <cuda_runtime.h>
|
||||
#include "macrodef.h"
|
||||
@@ -198,6 +199,34 @@ static int rhs_stage_timing_every() {
|
||||
return every;
|
||||
}
|
||||
|
||||
static bool escalar_host_pin_enabled() {
|
||||
static int enabled = -1;
|
||||
if (enabled < 0) {
|
||||
const char *env = getenv("AMSS_CUDA_PIN_ESCALAR_TRANSFERS");
|
||||
enabled = (!env || atoi(env) != 0) ? 1 : 0;
|
||||
}
|
||||
return enabled != 0;
|
||||
}
|
||||
|
||||
static void try_pin_escalar_host_buffer(void *ptr, size_t bytes) {
|
||||
if (!ptr || bytes == 0 || !escalar_host_pin_enabled())
|
||||
return;
|
||||
|
||||
static std::unordered_set<void *> registered;
|
||||
static std::unordered_set<void *> failed;
|
||||
if (registered.find(ptr) != registered.end() ||
|
||||
failed.find(ptr) != failed.end())
|
||||
return;
|
||||
|
||||
cudaError_t err = cudaHostRegister(ptr, bytes, cudaHostRegisterPortable);
|
||||
if (err == cudaSuccess || err == cudaErrorHostMemoryAlreadyRegistered) {
|
||||
registered.insert(ptr);
|
||||
} else {
|
||||
failed.insert(ptr);
|
||||
}
|
||||
cudaGetLastError();
|
||||
}
|
||||
|
||||
static double cuda_profile_now_ms() {
|
||||
using clock = std::chrono::steady_clock;
|
||||
return std::chrono::duration<double, std::milli>(
|
||||
@@ -7030,6 +7059,10 @@ int bssn_cuda_compute_escalar_matter(void *block_tag,
|
||||
g_buf.slot[S_Ayy], g_buf.slot[S_Ayz], g_buf.slot[S_Azz]);
|
||||
set_resident_host_clean(ctx, input_bank, false);
|
||||
}
|
||||
try_pin_escalar_host_buffer(Sphi_host, bytes);
|
||||
try_pin_escalar_host_buffer(Spi_host, bytes);
|
||||
try_pin_escalar_host_buffer(Sphi_rhs_host, bytes);
|
||||
try_pin_escalar_host_buffer(Spi_rhs_host, bytes);
|
||||
CUDA_CHECK(cudaMemcpyAsync(g_buf.slot[S_S_arr], Sphi_host, bytes, cudaMemcpyHostToDevice));
|
||||
CUDA_CHECK(cudaMemcpyAsync(g_buf.slot[S_f_arr], Spi_host, bytes, cudaMemcpyHostToDevice));
|
||||
|
||||
|
||||
Reference in New Issue
Block a user