From 01ac1f92509185233b82d9229e9664668f45586a Mon Sep 17 00:00:00 2001 From: CGH0S7 <776459475@qq.com> Date: Wed, 8 Apr 2026 19:43:17 +0800 Subject: [PATCH] Cache GPU main-path device buffers --- AMSS_NCKU_source/bssn_cuda_ops.cu | 178 +++++--- AMSS_NCKU_source/bssn_gpu.cu | 735 +++++++++++++++++++++++------- AMSS_NCKU_source/bssn_gpu.h | 6 +- 3 files changed, 697 insertions(+), 222 deletions(-) diff --git a/AMSS_NCKU_source/bssn_cuda_ops.cu b/AMSS_NCKU_source/bssn_cuda_ops.cu index 6ef14d0..0f6b8f4 100644 --- a/AMSS_NCKU_source/bssn_cuda_ops.cu +++ b/AMSS_NCKU_source/bssn_cuda_ops.cu @@ -33,15 +33,43 @@ struct DeviceArrays double *d = nullptr; }; -inline bool copy_to_device(double *&dst, const double *src, size_t bytes) +struct CachedBuffer { - cudaError_t err = cudaMalloc(&dst, bytes); + double *ptr = nullptr; + size_t capacity = 0; +}; + +inline bool ensure_capacity(CachedBuffer &buffer, size_t bytes) +{ + if (bytes <= buffer.capacity && buffer.ptr) + return true; + + if (buffer.ptr) + { + cudaError_t free_err = cudaFree(buffer.ptr); + if (free_err != cudaSuccess) + report_cuda_error("cudaFree", free_err); + buffer.ptr = nullptr; + buffer.capacity = 0; + } + + cudaError_t err = cudaMalloc(&buffer.ptr, bytes); if (err != cudaSuccess) { report_cuda_error("cudaMalloc", err); return false; } - err = cudaMemcpy(dst, src, bytes, cudaMemcpyHostToDevice); + + buffer.capacity = bytes; + return true; +} + +inline bool copy_to_device(CachedBuffer &dst, const double *src, size_t bytes) +{ + if (!ensure_capacity(dst, bytes)) + return false; + + cudaError_t err = cudaMemcpy(dst.ptr, src, bytes, cudaMemcpyHostToDevice); if (err != cudaSuccess) { report_cuda_error("cudaMemcpy(H2D)", err); @@ -50,12 +78,6 @@ inline bool copy_to_device(double *&dst, const double *src, size_t bytes) return true; } -inline void free_device(double *ptr) -{ - if (ptr) - cudaFree(ptr); -} - __global__ void enforce_ga_kernel(int n, double *dxx, double *gxy, double *gxz, double *dyy, double *gyz, double *dzz, @@ -376,31 +398,37 @@ int bssn_cuda_enforce_ga(int *ex, double *Axx, double *Axy, double *Axz, double *Ayy, double *Ayz, double *Azz) { + struct EnforceGaCache + { + CachedBuffer dxx, gxy, gxz, dyy, gyz, dzz; + CachedBuffer Axx, Axy, Axz, Ayy, Ayz, Azz; + }; + static thread_local EnforceGaCache cache; + int n = count_points(ex); const size_t bytes = static_cast(n) * sizeof(double); dim3 block(256); dim3 grid(div_up(n, static_cast(block.x))); - double *d_dxx = nullptr, *d_gxy = nullptr, *d_gxz = nullptr; - double *d_dyy = nullptr, *d_gyz = nullptr, *d_dzz = nullptr; - double *d_Axx = nullptr, *d_Axy = nullptr, *d_Axz = nullptr; - double *d_Ayy = nullptr, *d_Ayz = nullptr, *d_Azz = nullptr; - - bool ok = copy_to_device(d_dxx, dxx, bytes) && - copy_to_device(d_gxy, gxy, bytes) && - copy_to_device(d_gxz, gxz, bytes) && - copy_to_device(d_dyy, dyy, bytes) && - copy_to_device(d_gyz, gyz, bytes) && - copy_to_device(d_dzz, dzz, bytes) && - copy_to_device(d_Axx, Axx, bytes) && - copy_to_device(d_Axy, Axy, bytes) && - copy_to_device(d_Axz, Axz, bytes) && - copy_to_device(d_Ayy, Ayy, bytes) && - copy_to_device(d_Ayz, Ayz, bytes) && - copy_to_device(d_Azz, Azz, bytes); + bool ok = copy_to_device(cache.dxx, dxx, bytes) && + copy_to_device(cache.gxy, gxy, bytes) && + copy_to_device(cache.gxz, gxz, bytes) && + copy_to_device(cache.dyy, dyy, bytes) && + copy_to_device(cache.gyz, gyz, bytes) && + copy_to_device(cache.dzz, dzz, bytes) && + copy_to_device(cache.Axx, Axx, bytes) && + copy_to_device(cache.Axy, Axy, bytes) && + copy_to_device(cache.Axz, Axz, bytes) && + copy_to_device(cache.Ayy, Ayy, bytes) && + copy_to_device(cache.Ayz, Ayz, bytes) && + copy_to_device(cache.Azz, Azz, bytes); if (ok) { + double *d_dxx = cache.dxx.ptr, *d_gxy = cache.gxy.ptr, *d_gxz = cache.gxz.ptr; + double *d_dyy = cache.dyy.ptr, *d_gyz = cache.gyz.ptr, *d_dzz = cache.dzz.ptr; + double *d_Axx = cache.Axx.ptr, *d_Axy = cache.Axy.ptr, *d_Axz = cache.Axz.ptr; + double *d_Ayy = cache.Ayy.ptr, *d_Ayz = cache.Ayz.ptr, *d_Azz = cache.Azz.ptr; void *args[] = {&n, &d_dxx, &d_gxy, &d_gxz, &d_dyy, &d_gyz, &d_dzz, &d_Axx, &d_Axy, &d_Axz, &d_Ayy, &d_Ayz, &d_Azz}; ok = launch_and_sync(grid, block, (const void *)enforce_ga_kernel, args); @@ -408,27 +436,22 @@ int bssn_cuda_enforce_ga(int *ex, if (ok) { - cudaError_t err = cudaMemcpy(dxx, d_dxx, bytes, cudaMemcpyDeviceToHost); + cudaError_t err = cudaMemcpy(dxx, cache.dxx.ptr, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) dxx", err); ok = err == cudaSuccess; - if (ok) { err = cudaMemcpy(gxy, d_gxy, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) gxy", err); ok = err == cudaSuccess; } - if (ok) { err = cudaMemcpy(gxz, d_gxz, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) gxz", err); ok = err == cudaSuccess; } - if (ok) { err = cudaMemcpy(dyy, d_dyy, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) dyy", err); ok = err == cudaSuccess; } - if (ok) { err = cudaMemcpy(gyz, d_gyz, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) gyz", err); ok = err == cudaSuccess; } - if (ok) { err = cudaMemcpy(dzz, d_dzz, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) dzz", err); ok = err == cudaSuccess; } - if (ok) { err = cudaMemcpy(Axx, d_Axx, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) Axx", err); ok = err == cudaSuccess; } - if (ok) { err = cudaMemcpy(Axy, d_Axy, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) Axy", err); ok = err == cudaSuccess; } - if (ok) { err = cudaMemcpy(Axz, d_Axz, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) Axz", err); ok = err == cudaSuccess; } - if (ok) { err = cudaMemcpy(Ayy, d_Ayy, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) Ayy", err); ok = err == cudaSuccess; } - if (ok) { err = cudaMemcpy(Ayz, d_Ayz, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) Ayz", err); ok = err == cudaSuccess; } - if (ok) { err = cudaMemcpy(Azz, d_Azz, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) Azz", err); ok = err == cudaSuccess; } + if (ok) { err = cudaMemcpy(gxy, cache.gxy.ptr, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) gxy", err); ok = err == cudaSuccess; } + if (ok) { err = cudaMemcpy(gxz, cache.gxz.ptr, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) gxz", err); ok = err == cudaSuccess; } + if (ok) { err = cudaMemcpy(dyy, cache.dyy.ptr, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) dyy", err); ok = err == cudaSuccess; } + if (ok) { err = cudaMemcpy(gyz, cache.gyz.ptr, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) gyz", err); ok = err == cudaSuccess; } + if (ok) { err = cudaMemcpy(dzz, cache.dzz.ptr, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) dzz", err); ok = err == cudaSuccess; } + if (ok) { err = cudaMemcpy(Axx, cache.Axx.ptr, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) Axx", err); ok = err == cudaSuccess; } + if (ok) { err = cudaMemcpy(Axy, cache.Axy.ptr, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) Axy", err); ok = err == cudaSuccess; } + if (ok) { err = cudaMemcpy(Axz, cache.Axz.ptr, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) Axz", err); ok = err == cudaSuccess; } + if (ok) { err = cudaMemcpy(Ayy, cache.Ayy.ptr, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) Ayy", err); ok = err == cudaSuccess; } + if (ok) { err = cudaMemcpy(Ayz, cache.Ayz.ptr, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) Ayz", err); ok = err == cudaSuccess; } + if (ok) { err = cudaMemcpy(Azz, cache.Azz.ptr, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) Azz", err); ok = err == cudaSuccess; } } - free_device(d_dxx); free_device(d_gxy); free_device(d_gxz); - free_device(d_dyy); free_device(d_gyz); free_device(d_dzz); - free_device(d_Axx); free_device(d_Axy); free_device(d_Axz); - free_device(d_Ayy); free_device(d_Ayz); free_device(d_Azz); - return ok ? 0 : 1; } @@ -446,6 +469,19 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT, int lev, int rk_stage) { + struct Rk4BoundaryCache + { + CachedBuffer X, Y, Z; + CachedBuffer state0, boundary, stage, rhs; + const double *host_X = nullptr; + const double *host_Y = nullptr; + const double *host_Z = nullptr; + int nx = 0; + int ny = 0; + int nz = 0; + }; + static thread_local Rk4BoundaryCache cache; + int nx = ex[0]; int ny = ex[1]; int nz = ex[2]; @@ -457,23 +493,32 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT, dim3 block(256); dim3 grid(div_up(n, static_cast(block.x))); - double *d_X = nullptr, *d_Y = nullptr, *d_Z = nullptr; - double *d_state0 = nullptr, *d_boundary = nullptr, *d_stage = nullptr, *d_rhs = nullptr; + bool ok = true; + if (cache.host_X != X || cache.host_Y != Y || cache.host_Z != Z || + cache.nx != nx || cache.ny != ny || cache.nz != nz) + { + ok = copy_to_device(cache.X, X, bytes_x) && + copy_to_device(cache.Y, Y, bytes_y) && + copy_to_device(cache.Z, Z, bytes_z); + if (ok) + { + cache.host_X = X; + cache.host_Y = Y; + cache.host_Z = Z; + cache.nx = nx; + cache.ny = ny; + cache.nz = nz; + } + } - bool ok = copy_to_device(d_X, X, bytes_x) && - copy_to_device(d_Y, Y, bytes_y) && - copy_to_device(d_Z, Z, bytes_z) && - copy_to_device(d_state0, state0, bytes) && - copy_to_device(d_boundary, boundary_src, bytes) && - copy_to_device(d_stage, stage_data, bytes) && - copy_to_device(d_rhs, rhs_accum, bytes); + ok = ok && + copy_to_device(cache.state0, state0, bytes) && + copy_to_device(cache.boundary, boundary_src, bytes) && + copy_to_device(cache.stage, stage_data, bytes) && + copy_to_device(cache.rhs, rhs_accum, bytes); if (!ok) - { - free_device(d_X); free_device(d_Y); free_device(d_Z); - free_device(d_state0); free_device(d_boundary); free_device(d_stage); free_device(d_rhs); return 1; - } double dX = X[1] - X[0]; double dY = Y[1] - Y[0]; @@ -498,6 +543,9 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT, if (symmetry > eq_symm && std::fabs(X[0]) < dX) imin = 0; if (symmetry > eq_symm && std::fabs(Y[0]) < dY) jmin = 0; + double *d_X = cache.X.ptr, *d_Y = cache.Y.ptr, *d_Z = cache.Z.ptr; + double *d_state0 = cache.state0.ptr, *d_boundary = cache.boundary.ptr; + double *d_stage = cache.stage.ptr, *d_rhs = cache.rhs.ptr; double *bam_target = (rk_stage == 0) ? d_rhs : d_stage; const double *bam_source = (rk_stage == 0) ? d_state0 : d_boundary; void *args[] = {&nx, &ny, &nz, &d_X, &d_Y, &d_Z, @@ -513,12 +561,14 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT, if (ok) { + double *d_state0 = cache.state0.ptr, *d_stage = cache.stage.ptr, *d_rhs = cache.rhs.ptr; void *args[] = {&n, &dT, &d_state0, &d_stage, &d_rhs, &rk_stage}; ok = launch_and_sync(grid, block, (const void *)rk4_kernel, args); } if (ok && lev > 0) { + double *d_state0 = cache.state0.ptr, *d_stage = cache.stage.ptr; void *args[] = {&nx, &ny, &nz, &has_xmin, &has_ymin, &has_zmin, &has_xmax, &has_ymax, &has_zmax, @@ -528,45 +578,43 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT, if (ok) { - cudaError_t err = cudaMemcpy(stage_data, d_stage, bytes, cudaMemcpyDeviceToHost); + cudaError_t err = cudaMemcpy(stage_data, cache.stage.ptr, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) stage_data", err); ok = err == cudaSuccess; if (ok) { - err = cudaMemcpy(rhs_accum, d_rhs, bytes, cudaMemcpyDeviceToHost); + err = cudaMemcpy(rhs_accum, cache.rhs.ptr, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) rhs_accum", err); ok = err == cudaSuccess; } } - free_device(d_X); free_device(d_Y); free_device(d_Z); - free_device(d_state0); free_device(d_boundary); free_device(d_stage); free_device(d_rhs); return ok ? 0 : 1; } int bssn_cuda_lowerbound(int *ex, double *chi, double tinny) { + static thread_local CachedBuffer d_chi; + int n = count_points(ex); const size_t bytes = static_cast(n) * sizeof(double); dim3 block(256); dim3 grid(div_up(n, static_cast(block.x))); - double *d_chi = nullptr; bool ok = copy_to_device(d_chi, chi, bytes); if (ok) { - void *args[] = {&n, &d_chi, &tinny}; + double *ptr = d_chi.ptr; + void *args[] = {&n, &ptr, &tinny}; ok = launch_and_sync(grid, block, (const void *)lowerbound_kernel, args); } if (ok) { - cudaError_t err = cudaMemcpy(chi, d_chi, bytes, cudaMemcpyDeviceToHost); + cudaError_t err = cudaMemcpy(chi, d_chi.ptr, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) chi", err); ok = err == cudaSuccess; } - - free_device(d_chi); return ok ? 0 : 1; } diff --git a/AMSS_NCKU_source/bssn_gpu.cu b/AMSS_NCKU_source/bssn_gpu.cu index bf6920c..b72d8ee 100644 --- a/AMSS_NCKU_source/bssn_gpu.cu +++ b/AMSS_NCKU_source/bssn_gpu.cu @@ -1,23 +1,26 @@ // includes, system #include -#include -#include -#include -#include -#include +#include +#include +#include +#include +#include +#include #include //#include "cutil.h" #include #include -using namespace std; +using namespace std; //includes, bssn #include "gpu_mem.h" -#include "bssn_gpu.h" -#ifdef RESULT_CHECK -#include -#endif - +#include "bssn_gpu.h" +#ifdef RESULT_CHECK +#include +#endif + +void destroy_meta(Meta *meta); + void compare_result_gpu(int ftag1,double * datac,int data_num){ #ifdef RESULT_CHECK double * data = (double*)malloc(sizeof(double)*data_num); @@ -30,8 +33,426 @@ void compare_result_gpu(int ftag1,double * datac,int data_num){ (void)data_num; #endif } - -__global__ void test_const_address(double * testd){ + +namespace { + +int read_local_rank_from_env() +{ + const char *keys[] = { + "AMSS_NCKU_CUDA_LOCAL_RANK", + "I_MPI_LOCAL_RANK", + "OMPI_COMM_WORLD_LOCAL_RANK", + "MPI_LOCALRANKID", + "PMI_LOCAL_RANK", + "SLURM_LOCALID" + }; + + for (size_t i = 0; i < sizeof(keys) / sizeof(keys[0]); ++i) + { + const char *value = getenv(keys[i]); + if (value && *value) + return atoi(value); + } + return -1; +} + +int read_forced_device_from_env() +{ + const char *value = getenv("AMSS_NCKU_CUDA_DEVICE"); + if (value && *value) + return atoi(value); + return -1; +} + +int select_cuda_device_for_process(int mpi_rank) +{ + static int cached_device = -2; + if (cached_device >= -1) + return cached_device; + + int device_count = 0; + cudaError_t err = cudaGetDeviceCount(&device_count); + if (err != cudaSuccess || device_count <= 0) + { + cached_device = -1; + return cached_device; + } + + int device = read_forced_device_from_env(); + if (device < 0) + { + int local_rank = read_local_rank_from_env(); + if (local_rank < 0) + local_rank = mpi_rank; + device = local_rank % device_count; + } + + if (device < 0) + device = 0; + if (device >= device_count) + device %= device_count; + + err = cudaSetDevice(device); + if (err != cudaSuccess) + { + cerr << "cudaSetDevice(" << device << ") failed: " + << cudaGetErrorString(err) << endl; + cached_device = -1; + return cached_device; + } + + cached_device = device; + return cached_device; +} + +struct BufferSpec +{ + double **slot; + size_t count; +}; + +struct CopySpec +{ + double *dst; + const double *src; + size_t count; +}; + +struct ZeroSpec +{ + double *ptr; + size_t count; +}; + +struct GpuRhsCache +{ + Meta meta{}; + int ex[3] = {0, 0, 0}; + int matrix_size = 0; + int device = -1; + bool allocated = false; + const double *last_x = nullptr; + const double *last_y = nullptr; + const double *last_z = nullptr; +}; + +GpuRhsCache &gpu_rhs_cache() +{ + static GpuRhsCache cache; + return cache; +} + +void reset_meta(Meta *meta) +{ + memset(meta, 0, sizeof(Meta)); +} + +bool ensure_device_buffer(double **ptr, size_t count) +{ + if (*ptr) + return true; + + cudaError_t err = cudaMalloc((void **)ptr, count * sizeof(double)); + if (err != cudaSuccess) + { + cerr << "cudaMalloc failed: " << cudaGetErrorString(err) << endl; + return false; + } + return true; +} + +bool allocate_buffers(const BufferSpec *specs, size_t count) +{ + for (size_t i = 0; i < count; ++i) + { + if (!ensure_device_buffer(specs[i].slot, specs[i].count)) + return false; + } + return true; +} + +bool copy_buffers_to_device(const CopySpec *specs, size_t count) +{ + for (size_t i = 0; i < count; ++i) + { + cudaError_t err = cudaMemcpy(specs[i].dst, specs[i].src, + specs[i].count * sizeof(double), + cudaMemcpyHostToDevice); + if (err != cudaSuccess) + { + cerr << "cudaMemcpy(H2D) failed: " << cudaGetErrorString(err) << endl; + return false; + } + } + return true; +} + +bool zero_buffers(const ZeroSpec *specs, size_t count) +{ + for (size_t i = 0; i < count; ++i) + { + cudaError_t err = cudaMemset(specs[i].ptr, 0, specs[i].count * sizeof(double)); + if (err != cudaSuccess) + { + cerr << "cudaMemset failed: " << cudaGetErrorString(err) << endl; + return false; + } + } + return true; +} + +void cleanup_gpu_rhs_cache() +{ + GpuRhsCache &cache = gpu_rhs_cache(); + if (!cache.allocated) + return; + + if (cache.device >= 0) + cudaSetDevice(cache.device); + destroy_meta(&cache.meta); + reset_meta(&cache.meta); + cache.ex[0] = cache.ex[1] = cache.ex[2] = 0; + cache.matrix_size = 0; + cache.device = -1; + cache.allocated = false; + cache.last_x = nullptr; + cache.last_y = nullptr; + cache.last_z = nullptr; +} + +bool register_gpu_rhs_cleanup() +{ + static bool registered = false; + if (!registered) + { + atexit(cleanup_gpu_rhs_cache); + registered = true; + } + return true; +} + +bool prepare_gpu_rhs_cache(GpuRhsCache &cache, int device, int *ex) +{ + register_gpu_rhs_cleanup(); + + const bool shape_changed = + !cache.allocated || + cache.device != device || + cache.ex[0] != ex[0] || + cache.ex[1] != ex[1] || + cache.ex[2] != ex[2]; + + if (!shape_changed) + return true; + + if (cache.allocated) + { + if (cache.device >= 0) + cudaSetDevice(cache.device); + destroy_meta(&cache.meta); + reset_meta(&cache.meta); + } + + cache.device = device; + cache.ex[0] = ex[0]; + cache.ex[1] = ex[1]; + cache.ex[2] = ex[2]; + cache.matrix_size = ex[0] * ex[1] * ex[2]; + cache.last_x = nullptr; + cache.last_y = nullptr; + cache.last_z = nullptr; + + Meta *meta = &cache.meta; + const int matrix_size = cache.matrix_size; + const size_t fh_size = static_cast(ex[0] + 2) * static_cast(ex[1] + 2) * static_cast(ex[2] + 2); + const size_t fh2_size = static_cast(ex[0] + 3) * static_cast(ex[1] + 3) * static_cast(ex[2] + 3); + + const BufferSpec buffers[] = { + {&meta->X, static_cast(ex[0])}, + {&meta->Y, static_cast(ex[1])}, + {&meta->Z, static_cast(ex[2])}, + {&meta->chi, static_cast(matrix_size)}, + {&meta->dxx, static_cast(matrix_size)}, + {&meta->dyy, static_cast(matrix_size)}, + {&meta->dzz, static_cast(matrix_size)}, + {&meta->trK, static_cast(matrix_size)}, + {&meta->gxy, static_cast(matrix_size)}, + {&meta->gxz, static_cast(matrix_size)}, + {&meta->gyz, static_cast(matrix_size)}, + {&meta->Axx, static_cast(matrix_size)}, + {&meta->Axy, static_cast(matrix_size)}, + {&meta->Axz, static_cast(matrix_size)}, + {&meta->Ayz, static_cast(matrix_size)}, + {&meta->Ayy, static_cast(matrix_size)}, + {&meta->Azz, static_cast(matrix_size)}, + {&meta->Gamx, static_cast(matrix_size)}, + {&meta->Gamy, static_cast(matrix_size)}, + {&meta->Gamz, static_cast(matrix_size)}, + {&meta->Lap, static_cast(matrix_size)}, + {&meta->betax, static_cast(matrix_size)}, + {&meta->betay, static_cast(matrix_size)}, + {&meta->betaz, static_cast(matrix_size)}, + {&meta->dtSfx, static_cast(matrix_size)}, + {&meta->dtSfy, static_cast(matrix_size)}, + {&meta->dtSfz, static_cast(matrix_size)}, + {&meta->chi_rhs, static_cast(matrix_size)}, + {&meta->trK_rhs, static_cast(matrix_size)}, + {&meta->gxx_rhs, static_cast(matrix_size)}, + {&meta->gxy_rhs, static_cast(matrix_size)}, + {&meta->gxz_rhs, static_cast(matrix_size)}, + {&meta->gyy_rhs, static_cast(matrix_size)}, + {&meta->gyz_rhs, static_cast(matrix_size)}, + {&meta->gzz_rhs, static_cast(matrix_size)}, + {&meta->Axx_rhs, static_cast(matrix_size)}, + {&meta->Axy_rhs, static_cast(matrix_size)}, + {&meta->Axz_rhs, static_cast(matrix_size)}, + {&meta->Ayy_rhs, static_cast(matrix_size)}, + {&meta->Ayz_rhs, static_cast(matrix_size)}, + {&meta->Azz_rhs, static_cast(matrix_size)}, + {&meta->Gamx_rhs, static_cast(matrix_size)}, + {&meta->Gamy_rhs, static_cast(matrix_size)}, + {&meta->Gamz_rhs, static_cast(matrix_size)}, + {&meta->Lap_rhs, static_cast(matrix_size)}, + {&meta->betax_rhs, static_cast(matrix_size)}, + {&meta->betay_rhs, static_cast(matrix_size)}, + {&meta->betaz_rhs, static_cast(matrix_size)}, + {&meta->dtSfx_rhs, static_cast(matrix_size)}, + {&meta->dtSfy_rhs, static_cast(matrix_size)}, + {&meta->dtSfz_rhs, static_cast(matrix_size)}, + {&meta->rho, static_cast(matrix_size)}, + {&meta->Sx, static_cast(matrix_size)}, + {&meta->Sy, static_cast(matrix_size)}, + {&meta->Sz, static_cast(matrix_size)}, + {&meta->Sxx, static_cast(matrix_size)}, + {&meta->Sxy, static_cast(matrix_size)}, + {&meta->Sxz, static_cast(matrix_size)}, + {&meta->Syy, static_cast(matrix_size)}, + {&meta->Syz, static_cast(matrix_size)}, + {&meta->Szz, static_cast(matrix_size)}, + {&meta->Gamxxx, static_cast(matrix_size)}, + {&meta->Gamxxy, static_cast(matrix_size)}, + {&meta->Gamxxz, static_cast(matrix_size)}, + {&meta->Gamxyy, static_cast(matrix_size)}, + {&meta->Gamxyz, static_cast(matrix_size)}, + {&meta->Gamxzz, static_cast(matrix_size)}, + {&meta->Gamyxx, static_cast(matrix_size)}, + {&meta->Gamyxy, static_cast(matrix_size)}, + {&meta->Gamyxz, static_cast(matrix_size)}, + {&meta->Gamyyy, static_cast(matrix_size)}, + {&meta->Gamyyz, static_cast(matrix_size)}, + {&meta->Gamyzz, static_cast(matrix_size)}, + {&meta->Gamzxx, static_cast(matrix_size)}, + {&meta->Gamzxy, static_cast(matrix_size)}, + {&meta->Gamzxz, static_cast(matrix_size)}, + {&meta->Gamzyy, static_cast(matrix_size)}, + {&meta->Gamzyz, static_cast(matrix_size)}, + {&meta->Gamzzz, static_cast(matrix_size)}, + {&meta->Rxx, static_cast(matrix_size)}, + {&meta->Rxy, static_cast(matrix_size)}, + {&meta->Rxz, static_cast(matrix_size)}, + {&meta->Ryy, static_cast(matrix_size)}, + {&meta->Ryz, static_cast(matrix_size)}, + {&meta->Rzz, static_cast(matrix_size)}, + {&meta->ham_Res, static_cast(matrix_size)}, + {&meta->movx_Res, static_cast(matrix_size)}, + {&meta->movy_Res, static_cast(matrix_size)}, + {&meta->movz_Res, static_cast(matrix_size)}, + {&meta->Gmx_Res, static_cast(matrix_size)}, + {&meta->Gmy_Res, static_cast(matrix_size)}, + {&meta->Gmz_Res, static_cast(matrix_size)}, + {&meta->gxx, static_cast(matrix_size)}, + {&meta->gyy, static_cast(matrix_size)}, + {&meta->gzz, static_cast(matrix_size)}, + {&meta->chix, static_cast(matrix_size)}, + {&meta->chiy, static_cast(matrix_size)}, + {&meta->chiz, static_cast(matrix_size)}, + {&meta->gxxx, static_cast(matrix_size)}, + {&meta->gxyx, static_cast(matrix_size)}, + {&meta->gxzx, static_cast(matrix_size)}, + {&meta->gyyx, static_cast(matrix_size)}, + {&meta->gyzx, static_cast(matrix_size)}, + {&meta->gzzx, static_cast(matrix_size)}, + {&meta->gxxy, static_cast(matrix_size)}, + {&meta->gxyy, static_cast(matrix_size)}, + {&meta->gxzy, static_cast(matrix_size)}, + {&meta->gyyy, static_cast(matrix_size)}, + {&meta->gyzy, static_cast(matrix_size)}, + {&meta->gzzy, static_cast(matrix_size)}, + {&meta->gxxz, static_cast(matrix_size)}, + {&meta->gxyz, static_cast(matrix_size)}, + {&meta->gxzz, static_cast(matrix_size)}, + {&meta->gyyz, static_cast(matrix_size)}, + {&meta->gyzz, static_cast(matrix_size)}, + {&meta->gzzz, static_cast(matrix_size)}, + {&meta->Lapx, static_cast(matrix_size)}, + {&meta->Lapy, static_cast(matrix_size)}, + {&meta->Lapz, static_cast(matrix_size)}, + {&meta->betaxx, static_cast(matrix_size)}, + {&meta->betaxy, static_cast(matrix_size)}, + {&meta->betaxz, static_cast(matrix_size)}, + {&meta->betayy, static_cast(matrix_size)}, + {&meta->betayz, static_cast(matrix_size)}, + {&meta->betazz, static_cast(matrix_size)}, + {&meta->betayx, static_cast(matrix_size)}, + {&meta->betazy, static_cast(matrix_size)}, + {&meta->betazx, static_cast(matrix_size)}, + {&meta->Kx, static_cast(matrix_size)}, + {&meta->Ky, static_cast(matrix_size)}, + {&meta->Kz, static_cast(matrix_size)}, + {&meta->Gamxx, static_cast(matrix_size)}, + {&meta->Gamxy, static_cast(matrix_size)}, + {&meta->Gamxz, static_cast(matrix_size)}, + {&meta->Gamyy, static_cast(matrix_size)}, + {&meta->Gamyz, static_cast(matrix_size)}, + {&meta->Gamzz, static_cast(matrix_size)}, + {&meta->Gamyx, static_cast(matrix_size)}, + {&meta->Gamzy, static_cast(matrix_size)}, + {&meta->Gamzx, static_cast(matrix_size)}, + {&meta->div_beta, static_cast(matrix_size)}, + {&meta->S, static_cast(matrix_size)}, + {&meta->f, static_cast(matrix_size)}, + {&meta->fxx, static_cast(matrix_size)}, + {&meta->fxy, static_cast(matrix_size)}, + {&meta->fxz, static_cast(matrix_size)}, + {&meta->fyy, static_cast(matrix_size)}, + {&meta->fyz, static_cast(matrix_size)}, + {&meta->fzz, static_cast(matrix_size)}, + {&meta->gupxx, static_cast(matrix_size)}, + {&meta->gupxy, static_cast(matrix_size)}, + {&meta->gupxz, static_cast(matrix_size)}, + {&meta->gupyy, static_cast(matrix_size)}, + {&meta->gupyz, static_cast(matrix_size)}, + {&meta->gupzz, static_cast(matrix_size)}, + {&meta->Gamxa, static_cast(matrix_size)}, + {&meta->Gamya, static_cast(matrix_size)}, + {&meta->Gamza, static_cast(matrix_size)}, + {&meta->alpn1, static_cast(matrix_size)}, + {&meta->chin1, static_cast(matrix_size)}, + {&meta->fh, fh_size}, + {&meta->fh2, fh2_size}, +#if (GAUGE == 2 || GAUGE == 3 || GAUGE == 4 || GAUGE == 5 || GAUGE == 6 || GAUGE == 7) + {&meta->reta, static_cast(matrix_size)}, +#endif + }; + + if (!allocate_buffers(buffers, sizeof(buffers) / sizeof(buffers[0]))) + { + destroy_meta(meta); + reset_meta(meta); + cache.allocated = false; + cache.last_x = nullptr; + cache.last_y = nullptr; + cache.last_z = nullptr; + return false; + } + + cache.allocated = true; + return true; +} + +} // namespace + +__global__ void test_const_address(double * testd){ int _t = blockIdx.x*blockDim.x+threadIdx.x; if(_t == 0) testd[0] = F1o3; @@ -2010,39 +2431,30 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y, double *ham_Res, double *movx_Res, double *movy_Res,double * movz_Res, double * Gmx_Res, double *Gmy_Res,double * Gmz_Res , int & Symmetry,int &Lev, double &eps, int &co) -{ - //#1------------init gpu meta data--------------------- - //cout<<"init GPU meta data\n"; - -#ifdef DEVICE_ID - // which device to use - cudaSetDevice(DEVICE_ID); -#endif - -#ifdef DEVICE_ID_BY_PID - pid_t pid = getpid(); - cudaSetDevice(pid % 2); - cout<<"My pid= "<(ex[0])}, + {Mh_ Y, Y, static_cast(ex[1])}, + {Mh_ Z, Z, static_cast(ex[2])}, + }; + if (!copy_buffers_to_device(coord_copies, sizeof(coord_copies) / sizeof(coord_copies[0]))) + return 1; + cache.last_x = X; + cache.last_y = Y; + cache.last_z = Z; + } + + const CopySpec state_copies[] = { + {Mh_ chi, chi, static_cast(matrix_size)}, + {Mh_ dxx, dxx, static_cast(matrix_size)}, + {Mh_ dyy, dyy, static_cast(matrix_size)}, + {Mh_ dzz, dzz, static_cast(matrix_size)}, + {Mh_ trK, trK, static_cast(matrix_size)}, + {Mh_ gxy, gxy, static_cast(matrix_size)}, + {Mh_ gxz, gxz, static_cast(matrix_size)}, + {Mh_ gyz, gyz, static_cast(matrix_size)}, + {Mh_ Axx, Axx, static_cast(matrix_size)}, + {Mh_ Axy, Axy, static_cast(matrix_size)}, + {Mh_ Axz, Axz, static_cast(matrix_size)}, + {Mh_ Ayz, Ayz, static_cast(matrix_size)}, + {Mh_ Ayy, Ayy, static_cast(matrix_size)}, + {Mh_ Azz, Azz, static_cast(matrix_size)}, + {Mh_ Gamx, Gamx, static_cast(matrix_size)}, + {Mh_ Gamy, Gamy, static_cast(matrix_size)}, + {Mh_ Gamz, Gamz, static_cast(matrix_size)}, + {Mh_ betax, betax, static_cast(matrix_size)}, + {Mh_ betay, betay, static_cast(matrix_size)}, + {Mh_ betaz, betaz, static_cast(matrix_size)}, + {Mh_ Lap, Lap, static_cast(matrix_size)}, + {Mh_ dtSfx, dtSfx, static_cast(matrix_size)}, + {Mh_ dtSfy, dtSfy, static_cast(matrix_size)}, + {Mh_ dtSfz, dtSfz, static_cast(matrix_size)}, + }; + if (!copy_buffers_to_device(state_copies, sizeof(state_copies) / sizeof(state_copies[0]))) + return 1; + + const ZeroSpec zero_specs[] = { + {Mh_ rho, static_cast(matrix_size)}, + {Mh_ Sxx, static_cast(matrix_size)}, + {Mh_ Sxy, static_cast(matrix_size)}, + {Mh_ Sxz, static_cast(matrix_size)}, + {Mh_ Syz, static_cast(matrix_size)}, + {Mh_ Syy, static_cast(matrix_size)}, + {Mh_ Szz, static_cast(matrix_size)}, + {Mh_ Sx, static_cast(matrix_size)}, + {Mh_ Sy, static_cast(matrix_size)}, + {Mh_ Sz, static_cast(matrix_size)}, + {Mh_ gxx, static_cast(matrix_size)}, + {Mh_ gyy, static_cast(matrix_size)}, + {Mh_ gzz, static_cast(matrix_size)}, + {Mh_ chix, static_cast(matrix_size)}, + {Mh_ chiy, static_cast(matrix_size)}, + {Mh_ chiz, static_cast(matrix_size)}, + {Mh_ gxxx, static_cast(matrix_size)}, + {Mh_ gxyx, static_cast(matrix_size)}, + {Mh_ gxzx, static_cast(matrix_size)}, + {Mh_ gyyx, static_cast(matrix_size)}, + {Mh_ gyzx, static_cast(matrix_size)}, + {Mh_ gzzx, static_cast(matrix_size)}, + {Mh_ gxxy, static_cast(matrix_size)}, + {Mh_ gxyy, static_cast(matrix_size)}, + {Mh_ gxzy, static_cast(matrix_size)}, + {Mh_ gyyy, static_cast(matrix_size)}, + {Mh_ gyzy, static_cast(matrix_size)}, + {Mh_ gzzy, static_cast(matrix_size)}, + {Mh_ gxxz, static_cast(matrix_size)}, + {Mh_ gxyz, static_cast(matrix_size)}, + {Mh_ gxzz, static_cast(matrix_size)}, + {Mh_ gyyz, static_cast(matrix_size)}, + {Mh_ gyzz, static_cast(matrix_size)}, + {Mh_ gzzz, static_cast(matrix_size)}, + {Mh_ Lapx, static_cast(matrix_size)}, + {Mh_ Lapy, static_cast(matrix_size)}, + {Mh_ Lapz, static_cast(matrix_size)}, + {Mh_ betaxx, static_cast(matrix_size)}, + {Mh_ betaxy, static_cast(matrix_size)}, + {Mh_ betaxz, static_cast(matrix_size)}, + {Mh_ betayy, static_cast(matrix_size)}, + {Mh_ betayz, static_cast(matrix_size)}, + {Mh_ betazz, static_cast(matrix_size)}, + {Mh_ betayx, static_cast(matrix_size)}, + {Mh_ betazy, static_cast(matrix_size)}, + {Mh_ betazx, static_cast(matrix_size)}, + {Mh_ Kx, static_cast(matrix_size)}, + {Mh_ Ky, static_cast(matrix_size)}, + {Mh_ Kz, static_cast(matrix_size)}, + {Mh_ Gamxx, static_cast(matrix_size)}, + {Mh_ Gamxy, static_cast(matrix_size)}, + {Mh_ Gamxz, static_cast(matrix_size)}, + {Mh_ Gamyy, static_cast(matrix_size)}, + {Mh_ Gamyz, static_cast(matrix_size)}, + {Mh_ Gamzz, static_cast(matrix_size)}, + {Mh_ Gamyx, static_cast(matrix_size)}, + {Mh_ Gamzy, static_cast(matrix_size)}, + {Mh_ Gamzx, static_cast(matrix_size)}, + {Mh_ div_beta, static_cast(matrix_size)}, + {Mh_ S, static_cast(matrix_size)}, + {Mh_ f, static_cast(matrix_size)}, + {Mh_ fxx, static_cast(matrix_size)}, + {Mh_ fxy, static_cast(matrix_size)}, + {Mh_ fxz, static_cast(matrix_size)}, + {Mh_ fyy, static_cast(matrix_size)}, + {Mh_ fyz, static_cast(matrix_size)}, + {Mh_ fzz, static_cast(matrix_size)}, + {Mh_ gupxx, static_cast(matrix_size)}, + {Mh_ gupxy, static_cast(matrix_size)}, + {Mh_ gupxz, static_cast(matrix_size)}, + {Mh_ gupyy, static_cast(matrix_size)}, + {Mh_ gupyz, static_cast(matrix_size)}, + {Mh_ gupzz, static_cast(matrix_size)}, + {Mh_ Gamxa, static_cast(matrix_size)}, + {Mh_ Gamya, static_cast(matrix_size)}, + {Mh_ Gamza, static_cast(matrix_size)}, + {Mh_ alpn1, static_cast(matrix_size)}, + {Mh_ chin1, static_cast(matrix_size)}, + }; + if (!zero_buffers(zero_specs, sizeof(zero_specs) / sizeof(zero_specs[0]))) + return 1; double sss[3] = {1,1,1}; @@ -2907,8 +3339,5 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y, #endif - destroy_meta(meta); - - - return 0;//TODO return -} + return 0;//TODO return +} diff --git a/AMSS_NCKU_source/bssn_gpu.h b/AMSS_NCKU_source/bssn_gpu.h index 37d91d1..a581241 100644 --- a/AMSS_NCKU_source/bssn_gpu.h +++ b/AMSS_NCKU_source/bssn_gpu.h @@ -4,10 +4,8 @@ #include "bssn_macro.h" #include "macrodef.fh" -#define DEVICE_ID 0 -// #define DEVICE_ID_BY_MPI_RANK -#define GRID_DIM 256 -#define BLOCK_DIM 128 +#define GRID_DIM 256 +#define BLOCK_DIM 128 #define _FH2_(i, j, k) fh[(i) + (j) * _1D_SIZE[2] + (k) * _2D_SIZE[2]] #define _FH3_(i, j, k) fh[(i) + (j) * _1D_SIZE[3] + (k) * _2D_SIZE[3]]