Compare commits

..

1 Commits

Author SHA1 Message Date
abnerluo
d9c7ea8085 Use cudaMemcpyAsync with dedicated transfer stream for H2D/D2H transfers
Add cudaStream_t to GpuBuffers for async H2D/D2H transfers in BSSN and
Z4C substep functions. Adds cudaStreamSynchronize(0) before D2H to
enforce kernel/transfer ordering across streams, and a sync between
state and matter H2D uploads to prevent h_stage race on RK4==0.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-28 08:23:34 +08:00
4 changed files with 97 additions and 107 deletions

View File

@@ -279,11 +279,12 @@ struct GpuBuffers {
size_t cap_fh3_size;
int prev_nx, prev_ny, prev_nz;
bool initialized;
cudaStream_t stream; /* dedicated transfer stream */
};
static GpuBuffers g_buf = {
nullptr, nullptr, nullptr, nullptr, false, {},
0, 0, 0, 0, 0, 0, false
0, 0, 0, 0, 0, 0, false, nullptr
};
/* Slot assignments — INPUT (H2D) */
@@ -565,6 +566,7 @@ static void ensure_gpu_buffers(int nx, int ny, int nz) {
|| (fh3_size > g_buf.cap_fh3_size);
if (need_grow) {
if (g_buf.stream) { cudaStreamDestroy(g_buf.stream); g_buf.stream = nullptr; }
if (g_buf.d_mem) { cudaFree(g_buf.d_mem); g_buf.d_mem = nullptr; }
if (g_buf.d_fh2) { cudaFree(g_buf.d_fh2); g_buf.d_fh2 = nullptr; }
if (g_buf.d_fh3) { cudaFree(g_buf.d_fh3); g_buf.d_fh3 = nullptr; }
@@ -592,6 +594,9 @@ static void ensure_gpu_buffers(int nx, int ny, int nz) {
}
}
if (!g_buf.stream)
CUDA_CHECK(cudaStreamCreate(&g_buf.stream));
g_buf.cap_all = all;
g_buf.cap_fh2_size = fh2_size;
g_buf.cap_fh3_size = fh3_size;
@@ -4646,9 +4651,9 @@ static void upload_state_inputs(double **state_host, size_t all)
for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
std::memcpy(g_buf.h_stage + (size_t)i * all, state_host[i], bytes);
}
CUDA_CHECK(cudaMemcpy(g_buf.slot[S_chi], g_buf.h_stage,
(size_t)BSSN_STATE_COUNT * bytes,
cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpyAsync(g_buf.slot[S_chi], g_buf.h_stage,
(size_t)BSSN_STATE_COUNT * bytes,
cudaMemcpyHostToDevice, g_buf.stream));
}
static void upload_matter_cache(StepContext &ctx,
@@ -4659,9 +4664,9 @@ static void upload_matter_cache(StepContext &ctx,
for (int i = 0; i < BSSN_MATTER_COUNT; ++i) {
std::memcpy(g_buf.h_stage + (size_t)i * all, matter_host[i], bytes);
}
CUDA_CHECK(cudaMemcpy(ctx.d_matter_mem, g_buf.h_stage,
(size_t)BSSN_MATTER_COUNT * bytes,
cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpyAsync(ctx.d_matter_mem, g_buf.h_stage,
(size_t)BSSN_MATTER_COUNT * bytes,
cudaMemcpyHostToDevice, g_buf.stream));
ctx.matter_ready = true;
}
@@ -4989,9 +4994,11 @@ static void launch_rhs_pipeline(int all, double eps, int co)
static void download_state_outputs(double **state_host_out, size_t all)
{
const size_t bytes = all * sizeof(double);
CUDA_CHECK(cudaMemcpy(g_buf.h_stage, g_buf.slot[S_chi_rhs],
(size_t)BSSN_STATE_COUNT * bytes,
cudaMemcpyDeviceToHost));
CUDA_CHECK(cudaStreamSynchronize(0));
CUDA_CHECK(cudaMemcpyAsync(g_buf.h_stage, g_buf.slot[S_chi_rhs],
(size_t)BSSN_STATE_COUNT * bytes,
cudaMemcpyDeviceToHost, g_buf.stream));
CUDA_CHECK(cudaStreamSynchronize(g_buf.stream));
for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
std::memcpy(state_host_out[i], g_buf.h_stage + (size_t)i * all, bytes);
}
@@ -5000,9 +5007,11 @@ static void download_state_outputs(double **state_host_out, size_t all)
static void download_constraint_outputs(double **constraint_host_out, size_t all)
{
const size_t bytes = all * sizeof(double);
CUDA_CHECK(cudaMemcpy(g_buf.h_stage, g_buf.slot[S_ham_Res],
(size_t)D2H_CONSTRAINT_SLOT_COUNT * bytes,
cudaMemcpyDeviceToHost));
CUDA_CHECK(cudaStreamSynchronize(0));
CUDA_CHECK(cudaMemcpyAsync(g_buf.h_stage, g_buf.slot[S_ham_Res],
(size_t)D2H_CONSTRAINT_SLOT_COUNT * bytes,
cudaMemcpyDeviceToHost, g_buf.stream));
CUDA_CHECK(cudaStreamSynchronize(g_buf.stream));
for (int i = 0; i < D2H_CONSTRAINT_SLOT_COUNT; ++i) {
std::memcpy(constraint_host_out[i], g_buf.h_stage + (size_t)i * all, bytes);
}
@@ -5708,11 +5717,12 @@ int bssn_cuda_rk4_substep(void *block_tag,
if (use_zero_matter) {
if (!ctx.matter_ready) zero_matter_cache(ctx, all);
} else {
CUDA_CHECK(cudaStreamSynchronize(g_buf.stream));
upload_matter_cache(ctx, matter_host, all);
}
CUDA_CHECK(cudaMemcpy(ctx.d_state0_mem, g_buf.slot[S_chi],
(size_t)BSSN_STATE_COUNT * bytes,
cudaMemcpyDeviceToDevice));
CUDA_CHECK(cudaMemcpyAsync(ctx.d_state0_mem, g_buf.slot[S_chi],
(size_t)BSSN_STATE_COUNT * bytes,
cudaMemcpyDeviceToDevice, g_buf.stream));
} else if (!ctx.matter_ready) {
if (use_zero_matter) zero_matter_cache(ctx, all);
else upload_matter_cache(ctx, matter_host, all);

View File

@@ -13,15 +13,12 @@ POLINT6_FLAG = -DPOLINT6_USE_BARYCENTRIC=$(POLINT6_USE_BARY)
## make PGO_MODE=instrument -> instrument (Phase 1: collect fresh profile data)
PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/default.profdata
ifeq ($(TOOLCHAIN),intel)
OMP_FLAG = -qopenmp
ifeq ($(PGO_MODE),instrument)
## Intel Phase 1: instrumentation — omit -ipo/-fp-model fast=2 for faster build and numerical stability
## Phase 1: instrumentation — omit -ipo/-fp-model fast=2 for faster build and numerical stability
CXXAPPFLAGS = -O3 -xHost -fma -fprofile-instr-generate -ipo \
-Dfortran3 -Dnewc $(MKL_INC) $(INTERP_LB_FLAGS)
-Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS)
f90appflags = -O3 -xHost -fma -fprofile-instr-generate -ipo \
-align array64byte -fpp $(MKL_INC) $(POLINT6_FLAG)
-align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG)
else
## opt (default): maximum performance with PGO profile data -fprofile-instr-use=$(PROFDATA) \
## PGO has been turned off, now tested and found to be negative optimization
@@ -29,24 +26,9 @@ else
CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
-Dfortran3 -Dnewc $(MKL_INC) $(INTERP_LB_FLAGS)
-Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS)
f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \
-align array64byte -fpp $(MKL_INC) $(POLINT6_FLAG)
endif
TP_OPTFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
-fprofile-instr-use=$(TP_PROFDATA) \
-Dfortran3 -Dnewc $(MKL_INC)
else
## NVHPC defaults: mpicc/mpicxx/mpifort wrappers
## PGO_MODE is ignored in this branch.
OMP_FLAG = -mp
CXXAPPFLAGS = -O3 -tp=host -Mcache_align -Mfma \
-Dfortran3 -Dnewc $(MKL_INC) $(INTERP_LB_FLAGS)
f90appflags = -O3 -tp=host -Mcache_align -Mfma -Mpreprocess \
$(MKL_INC) $(POLINT6_FLAG)
TP_OPTFLAGS = -O3 -tp=host -Mcache_align -Mfma \
-Dfortran3 -Dnewc $(MKL_INC)
-align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG)
endif
.SUFFIXES: .o .f90 .C .for .cu
@@ -96,11 +78,17 @@ z4c_rhs_c.o: z4c_rhs_c.C
#interp_lb_profile.o: interp_lb_profile.C interp_lb_profile.h
# ${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
## TwoPunctureABE uses fixed optimal flags with its own PGO profile, independent of CXXAPPFLAGS
TP_PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/TwoPunctureABE.profdata
TP_OPTFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
-fprofile-instr-use=$(TP_PROFDATA) \
-Dfortran3 -Dnewc -I${MKLROOT}/include
TwoPunctures.o: TwoPunctures.C
${CXX} $(TP_OPTFLAGS) $(OMP_FLAG) -c $< -o $@
${CXX} $(TP_OPTFLAGS) -qopenmp -c $< -o $@
TwoPunctureABE.o: TwoPunctureABE.C
${CXX} $(TP_OPTFLAGS) $(OMP_FLAG) -c $< -o $@
${CXX} $(TP_OPTFLAGS) -qopenmp -c $< -o $@
# Input files
@@ -254,7 +242,7 @@ ABE_CUDA: $(C++FILES) $(ABE_CUDA_CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS)
# $(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES_GPU) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) $(LDLIBS)
TwoPunctureABE: $(TwoPunctureFILES)
$(CLINKER) $(TP_OPTFLAGS) $(OMP_FLAG) -o $@ $(TwoPunctureFILES) $(LDLIBS)
$(CLINKER) $(TP_OPTFLAGS) -qopenmp -o $@ $(TwoPunctureFILES) $(LDLIBS)
clean:
rm *.o ABE ABE_CUDA ABEGPU TwoPunctureABE make.log -f

View File

@@ -1,7 +1,28 @@
## Toolchain selection
## nvhpc : NVIDIA HPC SDK + CUDA-aware MPI (default)
## intel : Intel oneAPI toolchain (legacy path)
TOOLCHAIN ?= nvhpc
## GCC version (commented out)
## filein = -I/usr/include -I/usr/lib/x86_64-linux-gnu/mpich/include -I/usr/lib/x86_64-linux-gnu/openmpi/lib/ -I/usr/lib/gcc/x86_64-linux-gnu/11/ -I/usr/include/c++/11/
## filein = -I/usr/include/ -I/usr/include/openmpi-x86_64/ -I/usr/lib/x86_64-linux-gnu/openmpi/include/ -I/usr/lib/x86_64-linux-gnu/openmpi/lib/ -I/usr/lib/gcc/x86_64-linux-gnu/11/ -I/usr/include/c++/11/
## LDLIBS = -L/usr/lib/x86_64-linux-gnu -L/usr/lib64 -L/usr/lib/gcc/x86_64-linux-gnu/11 -lgfortran -lmpi -lgfortran
## Intel oneAPI version with oneMKL (Optimized for performance)
filein = -I/usr/include/ -I${MKLROOT}/include
## Using sequential MKL (OpenMP disabled for better single-threaded performance)
## Added -lifcore for Intel Fortran runtime and -limf for Intel math library
LDLIBS = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore -limf -lpthread -lm -ldl -liomp5
## Memory allocator switch
## 1 (default) : link Intel oneTBB allocator (libtbbmalloc)
## 0 : use system default allocator (ptmalloc)
USE_TBBMALLOC ?= 1
TBBMALLOC_SO ?= /home/intel/oneapi/2025.3/lib/libtbbmalloc.so
ifneq ($(wildcard $(TBBMALLOC_SO)),)
TBBMALLOC_LIBS = -Wl,--no-as-needed $(TBBMALLOC_SO) -Wl,--as-needed
else
TBBMALLOC_LIBS = -Wl,--no-as-needed -ltbbmalloc -Wl,--as-needed
endif
ifeq ($(USE_TBBMALLOC),1)
LDLIBS := $(TBBMALLOC_LIBS) $(LDLIBS)
endif
## PGO build mode switch (ABE only; TwoPunctureABE always uses opt flags)
## opt : (default) maximum performance with PGO profile-guided optimization
@@ -22,14 +43,6 @@ else
INTERP_LB_FLAGS =
endif
MKLROOT ?= /home/intel/oneapi/mkl/latest
MKL_LIBDIR ?= $(MKLROOT)/lib/intel64
MKL_INC ?= -I$(MKLROOT)/include
NVHPC_ROOT ?= /home/nvidia/hpc_sdk/Linux_x86_64/25.11
CUDA_HOME ?= $(NVHPC_ROOT)/cuda
CUDA_ARCH ?= sm_80
## Kernel implementation switch
## 1 (default) : use C++ rewrite of bssn_rhs and helper kernels (faster)
## 0 : fall back to original Fortran kernels
@@ -45,47 +58,17 @@ USE_CXX_Z4C_KERNELS ?= 1
## 0 : use original Fortran rungekutta4_rout.o
USE_CXX_RK4 ?= 1
## Memory allocator switch
## 1 (default) : link Intel oneTBB allocator (libtbbmalloc)
## 0 : use system default allocator (ptmalloc)
USE_TBBMALLOC ?= 1
TBBMALLOC_SO ?= /home/intel/oneapi/2025.3/lib/libtbbmalloc.so
ifneq ($(wildcard $(TBBMALLOC_SO)),)
TBBMALLOC_LIBS = -Wl,--no-as-needed $(TBBMALLOC_SO) -Wl,--as-needed
else
TBBMALLOC_LIBS = -Wl,--no-as-needed -ltbbmalloc -Wl,--as-needed
endif
ifeq ($(TOOLCHAIN),intel)
f90 = ifx
f77 = ifx
CXX = icpx
CC = icx
CLINKER = mpiicpx
filein = -I/usr/include/ $(MKL_INC) -I$(CUDA_HOME)/include
LDLIBS = -L$(MKL_LIBDIR) -Wl,-rpath,$(MKL_LIBDIR) \
-lmkl_intel_lp64 -lmkl_sequential -lmkl_core \
-lifcore -limf -liomp5 -lpthread -lm -ldl \
-L$(CUDA_HOME)/lib64 -Wl,-rpath,$(CUDA_HOME)/lib64 -lcuda -lcudart
else ifeq ($(TOOLCHAIN),nvhpc)
f90 = mpifort
f77 = mpifort
CXX = mpicxx
CC = mpicc
CLINKER = mpicxx
filein = -I/usr/include/ $(MKL_INC) -I$(CUDA_HOME)/include
LDLIBS = -L$(MKL_LIBDIR) -Wl,-rpath,$(MKL_LIBDIR) \
-lmkl_intel_lp64 -lmkl_sequential -lmkl_core \
-lpthread -lm -ldl \
-L$(CUDA_HOME)/lib64 -Wl,-rpath,$(CUDA_HOME)/lib64 -lcuda -lcudart \
-fortranlibs
Cu = nvcc
CUDA_LIB_PATH = -L/usr/lib/cuda/lib64 -I/usr/include -I/usr/lib/cuda/include
#CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -arch compute_13 -code compute_13,sm_13 -Dfortran3 -Dnewc
CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -Dfortran3 -Dnewc
CUDA_ARCH ?= sm_80
ifneq ($(strip $(CUDA_ARCH)),)
CUDA_APP_FLAGS += -arch=$(CUDA_ARCH)
endif
ifeq ($(USE_TBBMALLOC),1)
LDLIBS := $(TBBMALLOC_LIBS) $(LDLIBS)
endif
Cu = $(NVHPC_ROOT)/compilers/bin/nvcc
CUDA_LIB_PATH = -L$(CUDA_HOME)/lib64 -I$(CUDA_HOME)/include
CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -Dfortran3 -Dnewc -arch=$(CUDA_ARCH)

View File

@@ -292,11 +292,12 @@ struct GpuBuffers {
size_t cap_fh3_size;
int prev_nx, prev_ny, prev_nz;
bool initialized;
cudaStream_t stream; /* dedicated transfer stream */
};
static GpuBuffers g_buf = {
nullptr, nullptr, nullptr, nullptr, false, {},
0, 0, 0, 0, 0, 0, false
0, 0, 0, 0, 0, 0, false, nullptr
};
/* Slot assignments — INPUT (H2D) */
@@ -595,6 +596,7 @@ static void ensure_gpu_buffers(int nx, int ny, int nz) {
|| (fh3_size > g_buf.cap_fh3_size);
if (need_grow) {
if (g_buf.stream) { cudaStreamDestroy(g_buf.stream); g_buf.stream = nullptr; }
if (g_buf.d_mem) { cudaFree(g_buf.d_mem); g_buf.d_mem = nullptr; }
if (g_buf.d_fh2) { cudaFree(g_buf.d_fh2); g_buf.d_fh2 = nullptr; }
if (g_buf.d_fh3) { cudaFree(g_buf.d_fh3); g_buf.d_fh3 = nullptr; }
@@ -622,6 +624,9 @@ static void ensure_gpu_buffers(int nx, int ny, int nz) {
}
}
if (!g_buf.stream)
CUDA_CHECK(cudaStreamCreate(&g_buf.stream));
g_buf.cap_all = all;
g_buf.cap_fh2_size = fh2_size;
g_buf.cap_fh3_size = fh3_size;
@@ -4679,9 +4684,9 @@ static void upload_state_inputs(double **state_host, size_t all)
for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
std::memcpy(g_buf.h_stage + (size_t)i * all, state_host[i], bytes);
}
CUDA_CHECK(cudaMemcpy(g_buf.slot[S_chi], g_buf.h_stage,
(size_t)BSSN_STATE_COUNT * bytes,
cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpyAsync(g_buf.slot[S_chi], g_buf.h_stage,
(size_t)BSSN_STATE_COUNT * bytes,
cudaMemcpyHostToDevice, g_buf.stream));
}
static void upload_matter_cache(StepContext &ctx,
@@ -4692,9 +4697,9 @@ static void upload_matter_cache(StepContext &ctx,
for (int i = 0; i < BSSN_MATTER_COUNT; ++i) {
std::memcpy(g_buf.h_stage + (size_t)i * all, matter_host[i], bytes);
}
CUDA_CHECK(cudaMemcpy(ctx.d_matter_mem, g_buf.h_stage,
(size_t)BSSN_MATTER_COUNT * bytes,
cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpyAsync(ctx.d_matter_mem, g_buf.h_stage,
(size_t)BSSN_MATTER_COUNT * bytes,
cudaMemcpyHostToDevice, g_buf.stream));
ctx.matter_ready = true;
}
@@ -5022,9 +5027,11 @@ static void launch_rhs_pipeline(int all, double eps, int co)
static void download_state_outputs(double **state_host_out, size_t all)
{
const size_t bytes = all * sizeof(double);
CUDA_CHECK(cudaMemcpy(g_buf.h_stage, g_buf.slot[S_chi_rhs],
(size_t)BSSN_STATE_COUNT * bytes,
cudaMemcpyDeviceToHost));
CUDA_CHECK(cudaStreamSynchronize(0));
CUDA_CHECK(cudaMemcpyAsync(g_buf.h_stage, g_buf.slot[S_chi_rhs],
(size_t)BSSN_STATE_COUNT * bytes,
cudaMemcpyDeviceToHost, g_buf.stream));
CUDA_CHECK(cudaStreamSynchronize(g_buf.stream));
for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
std::memcpy(state_host_out[i], g_buf.h_stage + (size_t)i * all, bytes);
}
@@ -5033,9 +5040,11 @@ static void download_state_outputs(double **state_host_out, size_t all)
static void download_constraint_outputs(double **constraint_host_out, size_t all)
{
const size_t bytes = all * sizeof(double);
CUDA_CHECK(cudaMemcpy(g_buf.h_stage, g_buf.slot[S_ham_Res],
(size_t)D2H_CONSTRAINT_SLOT_COUNT * bytes,
cudaMemcpyDeviceToHost));
CUDA_CHECK(cudaStreamSynchronize(0));
CUDA_CHECK(cudaMemcpyAsync(g_buf.h_stage, g_buf.slot[S_ham_Res],
(size_t)D2H_CONSTRAINT_SLOT_COUNT * bytes,
cudaMemcpyDeviceToHost, g_buf.stream));
CUDA_CHECK(cudaStreamSynchronize(g_buf.stream));
for (int i = 0; i < D2H_CONSTRAINT_SLOT_COUNT; ++i) {
std::memcpy(constraint_host_out[i], g_buf.h_stage + (size_t)i * all, bytes);
}
@@ -7306,9 +7315,9 @@ extern "C" int z4c_cuda_rk4_substep(void *block_tag,
g_buf.slot[S_Ayy], g_buf.slot[S_Ayz], g_buf.slot[S_Azz]);
}
if (RK4 == 0) {
CUDA_CHECK(cudaMemcpy(ctx.d_state0_mem, g_buf.slot[S_chi],
(size_t)BSSN_STATE_COUNT * bytes,
cudaMemcpyDeviceToDevice));
CUDA_CHECK(cudaMemcpyAsync(ctx.d_state0_mem, g_buf.slot[S_chi],
(size_t)BSSN_STATE_COUNT * bytes,
cudaMemcpyDeviceToDevice, g_buf.stream));
}
if (profile) {
cuda_profile_sync();